diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a87c737..c6131b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-added-large-files - id: check-ast @@ -25,13 +25,13 @@ repos: exclude_types: [jupyter] - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.24 + rev: v0.24.1 hooks: - id: validate-pyproject additional_dependencies: ["validate-pyproject-schema-store[all]"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.2 + rev: v0.13.0 hooks: - id: ruff args: [--fix] @@ -39,14 +39,14 @@ repos: # Mypy: static type checking - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.15.0" + rev: "v1.18.1" hooks: - id: mypy # Envorce only one source of configuration. args: ["--config-file", "pyproject.toml"] additional_dependencies: - cuda-core - - cuda-bindings>=12.9.1,<13 + - cuda-bindings>=12.9.2,<13 - cupy-cuda12x - mpi4py>=4.1.0 - numba @@ -56,7 +56,6 @@ repos: - scipy - torch - types-cffi - - types-pywin32 - invoke - cython>=3.0.4,!=3.1.0,!=3.1.1 - tomli @@ -78,7 +77,7 @@ repos: # Security: secrets - repo: https://github.com/gitleaks/gitleaks - rev: v8.24.0 + rev: v8.28.0 hooks: - id: gitleaks @@ -91,13 +90,13 @@ repos: # Shell script linter - repo: https://github.com/shellcheck-py/shellcheck-py - rev: "v0.10.0.1" + rev: "v0.11.0.1" hooks: - id: shellcheck # Lint: Markdown - repo: https://github.com/igorshubovych/markdownlint-cli - rev: v0.44.0 + rev: v0.45.0 hooks: - id: markdownlint # Setting up node version explicitly diff --git a/README.md b/README.md index e1c9cef..638c798 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,24 @@ # nvmath-python: NVIDIA Math Libraries for the Python Ecosystem -nvmath-python brings the power of the NVIDIA math libraries to the Python ecosystem. The -package aims to provide intuitive pythonic APIs that provide users full access to all the +nvmath-python brings the power of the NVIDIA math libraries to the Python ecosystem. +The package aims to provide intuitive pythonic APIs giving users full access to all features offered by NVIDIA's libraries in a variety of execution spaces. nvmath-python works seamlessly with existing Python array/tensor frameworks and focuses on providing functionality that is missing from those frameworks. ## Some Examples -Using the nvmath-python API allows access to all parameters of the underlying NVIDIA -cuBLASLt library. Some of these parameters are unavailable in other wrappings of NVIDIA's -C-API libraries. +Below are a few representative examples showcasing the three main categories of +features nvmath-python offers: host, device, and distributed APIs. + +### Host APIs + +Host APIs are called from host code but can execute in any supported execution +space (CPU or GPU). The following example shows how to compute a matrix multiplication +on CuPy matrices. Using the nvmath-python API allows access to *all* parameters +of the underlying NVIDIA cuBLASLt library, a distinguishing feature of nvmath-python +from other wrappings of NVIDIA's C-API libraries. ```python import cupy as cp @@ -61,6 +68,42 @@ print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") ``` +nvmath-python provides the ability to write custom prologs and epilogs for FFT functions as +Python functions and compile them to LTO-IR. 
For example, to have unitary scaling for an +FFT, we can define an epilog which rescales the output by `1/sqrt(N)`. + +```python +import cupy as cp +import nvmath +import math + +# Create the data for the batched 1-D FFT. +B, N = 256, 1024 +a = cp.random.rand(B, N, dtype=cp.float64) + 1j * cp.random.rand(B, N, dtype=cp.float64) + +# Compute the normalization factor for unitary transforms +norm_factor = 1.0 / math.sqrt(N) + +# Define the epilog function for the FFT. +def rescale(data_out, offset, data, user_info, unused): + data_out[offset] = data * norm_factor + +# Compile the epilog to LTO-IR. +with cp.cuda.Device(): + epilog = nvmath.fft.compile_epilog(rescale, "complex128", "complex128") + +# Perform the forward FFT, applying the filter as a epilog... +r = nvmath.fft.fft(a, axes=[-1], epilog={"ltoir": epilog}) + +# Finally, we can test that the fused FFT run result matches the result of separate +# calls +s = cp.fft.fftn(a, axes=[-1], norm="ortho") + +assert cp.allclose(r, s) +``` + +### Device-side APIs + nvmath-python exposes NVIDIA's device-side (Dx) APIs. This allows developers to call NVIDIA library functions inside their custom device kernels. For example, a numba jit function can call cuFFT in order to implement FFT-based convolution. @@ -91,7 +134,6 @@ def main(): ffts_per_block=ffts_per_block, elements_per_thread=2, execution="Block", - compiler="numba", ) FFT_inv = fft( fft_type="c2c", @@ -101,41 +143,35 @@ def main(): ffts_per_block=ffts_per_block, elements_per_thread=2, execution="Block", - compiler="numba", ) - value_type = FFT_fwd.value_type - storage_size = FFT_fwd.storage_size - shared_memory_size = FFT_fwd.shared_memory_size - fft_stride = FFT_fwd.stride - ept = FFT_fwd.elements_per_thread - block_dim = FFT_fwd.block_dim - # Define a numba jit function targeting CUDA devices - @cuda.jit(link=FFT_fwd.files + FFT_inv.files) + @cuda.jit def f(signal, filter): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array( + shape=(FFT_fwd.storage_size,), dtype=FFT_fwd.value_type, + ) + shared_mem = cuda.shared.array(shape=(0,), dtype=FFT_fwd.value_type) fft_id = (cuda.blockIdx.x * ffts_per_block) + cuda.threadIdx.y if(fft_id >= batch_size): return offset = cuda.threadIdx.x - for i in range(ept): - thread_data[i] = signal[fft_id, offset + i * fft_stride] + for i in range(FFT_fwd.elements_per_thread): + thread_data[i] = signal[fft_id, offset + i * FFT_fwd.stride] # Call the cuFFTDx FFT function from *inside* your custom function FFT_fwd(thread_data, shared_mem) - for i in range(ept): - thread_data[i] = thread_data[i] * filter[fft_id, offset + i * fft_stride] + for i in range(FFT_fwd.elements_per_thread): + thread_data[i] *= filter[fft_id, offset + i * FFT_fwd.stride] FFT_inv(thread_data, shared_mem) - for i in range(ept): - signal[fft_id, offset + i * fft_stride] = thread_data[i] + for i in range(FFT_fwd.elements_per_thread): + signal[fft_id, offset + i * FFT_fwd.stride] = thread_data[i] data = random_complex((ffts_per_block, size), np.float32) @@ -144,7 +180,7 @@ def main(): data_d = cuda.to_device(data) filter_d = cuda.to_device(filter) - f[1, block_dim, 0, shared_memory_size](data_d, filter_d) + f[1, FFT_fwd.block_dim, 0, FFT_fwd.shared_memory_size](data_d, filter_d) cuda.synchronize() data_test = data_d.copy_to_host() @@ -159,38 +195,79 @@ if __name__ == "__main__": main() ``` -nvmath-python provides the ability to write custom prologs and epilogs for FFT functions 
as -a Python functions and compiled them LTO-IR. For example, to have unitary scaling for an -FFT, we can define an epilog which rescales the output by 1/sqrt(N). +### Distributed APIs + +Distributed APIs are called from host code but execute on a distributed +(multi-node multi-GPU) system. The following example shows the use of the +function-form distributed FFT with CuPy ndarrays: ```python import cupy as cp -import nvmath -import math - -# Create the data for the batched 1-D FFT. -B, N = 256, 1024 -a = cp.random.rand(B, N, dtype=cp.float64) + 1j * cp.random.rand(B, N, dtype=cp.float64) - -# Compute the normalization factor for unitary transforms -norm_factor = 1.0 / math.sqrt(N) - -# Define the epilog function for the FFT. -def rescale(data_out, offset, data, user_info, unused): - data_out[offset] = data * norm_factor - -# Compile the epilog to LTO-IR. -with cp.cuda.Device(): - epilog = nvmath.fft.compile_epilog(rescale, "complex128", "complex128") - -# Perform the forward FFT, applying the filter as a epilog... -r = nvmath.fft.fft(a, axes=[-1], epilog={"ltoir": epilog}) - -# Finally, we can test that the fused FFT run result matches the result of separate -# calls -s = cp.fft.fftn(a, axes=[-1], norm="ortho") - -assert cp.allclose(r, s) +from mpi4py import MPI + +import nvmath.distributed +from nvmath.distributed.distribution import Slab + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) + +# The global 3-D FFT size is (512, 256, 512). +# In this example, the input data is distributed across processes according to +# the cuFFTMp Slab distribution on the X axis. +shape = 512 // nranks, 256, 512 + +# cuFFTMp uses the NVSHMEM PGAS model for distributed computation, which requires GPU +# operands to be on the symmetric heap. +a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=cp.complex128) +# a is a cupy ndarray and can be operated on using in-place cupy operations. +with cp.cuda.Device(device_id): + a[:] = cp.random.rand(*shape, dtype=cp.float64) + 1j * + cp.random.rand(*shape, dtype=cp.float64) + +# Forward FFT. +# In this example, the forward FFT operand is distributed according +# to Slab.X distribution. With reshape=False, the FFT result will be +# distributed according to Slab.Y distribution. +b = nvmath.distributed.fft.fft(a, distribution=Slab.X, options={"reshape": False}) + +# Distributed FFT performs computations in-place. The result is stored in the same +# buffer as operand a. Note, however, that operand b has a different shape (due +# to Slab.Y distribution). +if rank == 0: + print(f"Shape of a on rank {rank} is {a.shape}") + print(f"Shape of b on rank {rank} is {b.shape}") + +# Inverse FFT. +# Recall from previous transform that the inverse FFT operand is distributed according +# to Slab.Y. With reshape=False, the inverse FFT result will be distributed according +# to Slab.X distribution. +c = nvmath.distributed.fft.ifft(b, distribution=Slab.Y, options={"reshape": False}) + +# The shape of c is the same as a (due to Slab.X distribution). Once again, note that +# a, b and c are sharing the same symmetric memory buffer (distributed FFT operations +# are in-place). 
+if rank == 0: + print(f"Shape of c on rank {rank} is {c.shape}") + +# Synchronize the default stream +with cp.cuda.Device(device_id): + cp.cuda.get_current_stream().synchronize() + +if rank == 0: + print(f"Input type = {type(a)}, device = {a.device}") + print(f"FFT output type = {type(b)}, device = {b.device}") + print(f"IFFT output type = {type(c)}, device = {c.device}") + +# GPU operands on the symmetric heap are not garbage-collected and the user is +# responsible for freeing any that they own (this deallocation is a collective +# operation that must be called by all processes at the same point in the execution). +# All cuFFTMp operations are inplace (a, b, and c share the same memory buffer), so +# we take care to only free the buffer once. +nvmath.distributed.free_symmetric_memory(a) ``` ## License diff --git a/builder/utils.py b/builder/utils.py index c5771ca..187ca23 100644 --- a/builder/utils.py +++ b/builder/utils.py @@ -48,7 +48,21 @@ def check_path(header): def decide_lib_name(ext_name): # TODO: move the record of the supported lib list elsewhere? - for lib in ("cublas", "cusolver", "cufftMp", "cufft", "cusparse", "curand", "nvpl", "nvshmem", "mathdx", "cudss"): + for lib in ( + "cublasMp", + "cublas", + "cusolver", + "cufftMp", + "cufft", + "cusparse", + "curand", + "nvpl", + "nvshmem", + "nccl", + "mathdx", + "cudss", + "cutensor", + ): if lib in ext_name: return lib else: diff --git a/docs/sphinx/_static/switcher.json b/docs/sphinx/_static/switcher.json index 3810ea5..5421d60 100644 --- a/docs/sphinx/_static/switcher.json +++ b/docs/sphinx/_static/switcher.json @@ -3,6 +3,10 @@ "version": "latest", "url": "https://docs.nvidia.com/cuda/nvmath-python/latest" }, + { + "version": "0.7.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.7.0" + }, { "version": "0.6.0", "url": "https://docs.nvidia.com/cuda/nvmath-python/0.6.0" diff --git a/docs/sphinx/bindings/cublasMp.rst b/docs/sphinx/bindings/cublasMp.rst new file mode 100644 index 0000000..be1017b --- /dev/null +++ b/docs/sphinx/bindings/cublasMp.rst @@ -0,0 +1,45 @@ +.. module:: nvmath.bindings.cublasMp + +cuBLASMp (:mod:`nvmath.bindings.cublasMp`) +========================================== + +For detailed documentation on the original C APIs, refer to the `cuBLASMp documentation +`_. + +Enums and constants +******************* + +.. autosummary:: + :toctree: generated/ + + ComputeType + cuBLASMpError + GridLayout + MatmulAlgoType + MatmulDescriptorAttribute + MatmulEpilogue + MatmulMatrixScale + Operation + Status + +Functions +********* + +.. autosummary:: + :toctree: generated/ + + create + destroy + stream_set + get_version + grid_create + grid_destroy + matrix_descriptor_create + matrix_descriptor_destroy + matmul_descriptor_create + matmul_descriptor_destroy + matmul_descriptor_attribute_set + matmul_descriptor_attribute_get + matmul_buffer_size + matmul + numroc diff --git a/docs/sphinx/bindings/index.rst b/docs/sphinx/bindings/index.rst index 6a2e477..b83835f 100644 --- a/docs/sphinx/bindings/index.rst +++ b/docs/sphinx/bindings/index.rst @@ -31,6 +31,8 @@ follows: - :mod:`nvmath.bindings.cublas` * - cuBLASLt - :mod:`nvmath.bindings.cublasLt` + * - cuBLASMp + - :mod:`nvmath.bindings.cublasMp` * - cuDSS - :mod:`nvmath.bindings.cudss` * - cuFFT @@ -43,6 +45,10 @@ follows: - :mod:`nvmath.bindings.cusolverDn` * - cuSPARSE - :mod:`nvmath.bindings.cusparse` + * - NVPL BLAS + - :mod:`nvmath.bindings.nvpl.blas` + * - NVPL FFT + - :mod:`nvmath.bindings.nvpl.fft` Support for more libraries will be added in the future. 
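+
+As a quick orientation, the sketch below shows how the binding modules listed
+above (including the newly added cuBLASMp and NVPL modules) are imported. It is
+illustrative only and is deliberately limited to imports and enum introspection,
+so it does not assume any particular call signatures; refer to the per-module
+reference pages for the actual functions.
+
+.. code-block:: python
+
+    # Each library in the table maps to one Python module.
+    from nvmath.bindings import cublas, cublasMp
+    from nvmath.bindings.nvpl import blas as nvpl_blas
+    from nvmath.bindings.nvpl import fft as nvpl_fft
+
+    # Enums mirror the C libraries' enumerations (assuming standard Python
+    # enums); for example, the cuBLASMp enums documented on its bindings page.
+    print(list(cublasMp.Operation))
+    print(list(cublasMp.ComputeType))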
@@ -188,9 +194,12 @@ This reference describes all nvmath-python's math primitives. cublas cublasLt + cublasMp cudss cufft cusolver cusolverDn cusparse curand + nvpl.blas + nvpl.fft diff --git a/docs/sphinx/bindings/nvpl.blas.rst b/docs/sphinx/bindings/nvpl.blas.rst new file mode 100644 index 0000000..405e777 --- /dev/null +++ b/docs/sphinx/bindings/nvpl.blas.rst @@ -0,0 +1,79 @@ +.. module:: nvmath.bindings.nvpl.blas + +NVPL BLAS (:mod:`nvmath.bindings.nvpl.blas`) +============================================ + +For detailed documentation on the original C APIs, refer to the `NVPL BLAS documentation +`_. + +Enums and constants +******************* + +.. autosummary:: + :toctree: generated/ + + DIAG + ORDER + SIDE + TRANSPOSE + UPLO + + +Functions +********* + +.. autosummary:: + :toctree: generated/ + + cgemm + cgemm_batch + cgemm_batch_strided + chemm + chemm_batch_strided + cher2k + cherk + csymm + csymm_batch_strided + csyr2k + csyrk + ctrmm + ctrmm_batch_strided + ctrsm + dgemm + dgemm_batch + dgemm_batch_strided + dsymm + dsymm_batch_strided + dsyr2k + dsyrk + dtrmm + dtrmm_batch_strided + dtrsm + get_max_threads + get_version + set_num_threads + set_num_threads_local + sgemm + sgemm_batch + sgemm_batch_strided + ssymm + ssymm_batch_strided + ssyr2k + ssyrk + strmm + strmm_batch_strided + strsm + zgemm + zgemm_batch + zgemm_batch_strided + zhemm + zhemm_batch_strided + zher2k + zherk + zsymm + zsymm_batch_strided + zsyr2k + zsyrk + ztrmm + ztrmm_batch_strided + ztrsm diff --git a/docs/sphinx/bindings/nvpl.fft.rst b/docs/sphinx/bindings/nvpl.fft.rst new file mode 100644 index 0000000..777da1b --- /dev/null +++ b/docs/sphinx/bindings/nvpl.fft.rst @@ -0,0 +1,59 @@ +.. module:: nvmath.bindings.nvpl.fft + +NVPL FFT (:mod:`nvmath.bindings.nvpl.fft`) +========================================== + +For detailed documentation on the original C APIs, refer to the `NVPL FFT documentation +`_. + +Enums and constants +******************* + +.. autosummary:: + :toctree: generated/ + + FFTWError + FFTWUnaligned + Kind + Plan + PlannerFlags + Precision + Sign + + +Functions +********* + +.. autosummary:: + :toctree: generated/ + + cleanup_threads + cleanup_threads_double + cleanup_threads_float + destroy + destroy_plan_double + destroy_plan_float + execute + execute_c2c_double + execute_c2c_float + execute_c2r_double + execute_c2r_float + execute_r2c_double + execute_r2c_float + get_version + init_threads + init_threads_double + init_threads_float + plan_many + plan_many_c2c_double + plan_many_c2c_float + plan_many_c2r_double + plan_many_c2r_float + plan_many_r2c_double + plan_many_r2c_float + plan_with_nthreads + plan_with_nthreads_double + plan_with_nthreads_float + planner_nthreads + planner_nthreads_double + planner_nthreads_float diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index 6c81d1a..f4b6da5 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -180,11 +180,13 @@ # Output file base name for HTML help builder. htmlhelp_basename = "nvmath-python-doc" -# TODO: remove this once examples are published. -linkcheck_ignore = [ - "https://github.com/NVIDIA/nvmath-python/tree/main/examples/sparse/.*", - "https://github.com/NVIDIA/nvmath-python/tree/main/examples/distributed/fft/.*", -] +# If we need to generate docs that point to examples on GitHub that haven't been +# published yet, add the links that we want to ignore here (so as not to break +# docs build). 
+# Example of URL ignore pattern: +# "https://github.com/NVIDIA/nvmath-python/tree/main/examples/distributed/fft/.*" +# NOTE: remove ignore patterns once examples are published. +linkcheck_ignore = [] def autodoc_process_docstring(app, what, name, obj, options, lines): @@ -196,6 +198,7 @@ def autodoc_process_docstring(app, what, name, obj, options, lines): if isinstance(obj, np.dtype): docs = {} from nvmath.sparse._internal.cudss_data_ifc import memory_estimates_dtype + from nvmath.linalg.generic._configuration.qualifiers import matrix_qualifiers_dtype, MM_QUALIFIERS_DOCUMENTATION # TODO: find better way to declare docs in the source code. if obj == memory_estimates_dtype: @@ -208,20 +211,43 @@ def autodoc_process_docstring(app, what, name, obj, options, lines): "hybrid_max_device_memory": "(if in hybrid memory mode) maximum host memory for the hybrid memory mode", "reserved": "reserved for future use", } + elif obj == matrix_qualifiers_dtype: + docs = { + "abbreviation": MM_QUALIFIERS_DOCUMENTATION["abbreviation"], + "conjugate": MM_QUALIFIERS_DOCUMENTATION["conjugate"], + "transpose": MM_QUALIFIERS_DOCUMENTATION["transpose"], + "uplo": MM_QUALIFIERS_DOCUMENTATION["uplo"], + "diag": MM_QUALIFIERS_DOCUMENTATION["diag"], + "incx": MM_QUALIFIERS_DOCUMENTATION["incx"], + } - _, *mod, struct = name.split(".") - mod = ".".join(mod) - if mod == "bindings": - # handle bindings - struct = snake_to_camel([mod] + struct.split("_")[:-1]) - line = f"NumPy dtype object that represents the `{struct}` struct.\n" + # Generate the main description line + if obj == matrix_qualifiers_dtype: + line = "A NumPy custom dtype which describes a structured matrix.\n" else: - # handle dtype in high-level Pythonic APIs - struct = " ".join(struct.split("_")[:-1]) - line = f"NumPy dtype object that encapsulates the {struct} in {mod}.\n" + _, *mod, struct = name.split(".") + mod = ".".join(mod) + if mod == "bindings": + # handle bindings + struct = snake_to_camel([mod] + struct.split("_")[:-1]) + line = f"NumPy dtype object that represents the `{struct}` struct.\n" + else: + # handle dtype in high-level Pythonic APIs + struct = " ".join(struct.split("_")[:-1]) + line = f"NumPy dtype object that encapsulates the {struct} in {mod}.\n" + lines.clear() lines.append(line) lines.append("\n") + + # Add seealso section for matrix_qualifiers_dtype + if obj == matrix_qualifiers_dtype: + lines.append(".. 
seealso::\n") + lines.append(" :class:`GeneralMatrixQualifier`, :class:`HermitianMatrixQualifier`,\n") + lines.append(" :class:`SymmetricMatrixQualifier`, :class:`TriangularMatrixQualifier`,\n") + lines.append(" :class:`DiagonalMatrixQualifier`\n") + lines.append("\n") + for k in obj.fields: docs_value = docs.get(k, "") lines.append(f":param {k}: {docs_value}\n") @@ -345,7 +371,11 @@ def fixup_internal_alias(): autosummary_filename_map = { # avoid name clash with the fft func "nvmath.fft.FFT": "nvmath.fft.FFT-class", + "nvmath.device.FFT": "nvmath.device.FFT-class", + "nvmath.device.Matmul": "nvmath.device.Matmul-class", "nvmath.linalg.advanced.Matmul": "nvmath.linalg.advanced.Matmul-class", + "nvmath.linalg.generic.Matmul": "nvmath.linalg.generic.Matmul-class", + "nvmath.distributed.linalg.advanced.Matmul": "nvmath.distributed.linalg.advanced.Matmul-class", "nvmath.distributed.fft.FFT": "nvmath.distributed.fft.FFT-class", "nvmath.distributed.reshape.Reshape": "nvmath.distributed.reshape.Reshape-class", } @@ -357,6 +387,7 @@ def fixup_internal_alias(): "cudss": ("https://docs.nvidia.com/cuda/cudss/", None), "cufft": ("https://docs.nvidia.com/cuda/cufft/", None), "cupy": ("https://docs.cupy.dev/en/stable/", None), + "cuquantum": ("https://docs.nvidia.com/cuda/cuquantum/latest/", None), # curand is not using sphinx yet - June, 2025 # "curand": ("https://docs.nvidia.com/cuda/curand/", None), "cusolver": ("https://docs.nvidia.com/cuda/cusolver/", None), diff --git a/docs/sphinx/device-apis/cublas.rst b/docs/sphinx/device-apis/cublas.rst index 5d3f833..e60f1b7 100644 --- a/docs/sphinx/device-apis/cublas.rst +++ b/docs/sphinx/device-apis/cublas.rst @@ -10,13 +10,13 @@ Overview These APIs offer integration with the NVIDIA cuBLASDx library. Detailed documentation of cuBLASDx can be found in the -`cuBLASDx documentation `_. +`cuBLASDx documentation `_. .. note:: - The :class:`~nvmath.device.matmul` device API in module - :mod:`nvmath.device` currently supports cuBLASDx 0.1.1, also available - as part of MathDx 24.04. + The :class:`~nvmath.device.Matmul` device API in module + :mod:`nvmath.device` currently supports cuBLASDx 0.4.1, also available + as part of MathDx 25.06. .. _device-api-cublas-reference: @@ -28,8 +28,22 @@ API Reference .. autosummary:: :toctree: generated/ + Matmul matmul - BlasOptions + make_tensor + axpby + copy + copy_fragment + clear + copy_wait + + OpaqueTensor + Layout + + Partition + Partitioner + + SharedStorageCalc :template: namedtuple.rst diff --git a/docs/sphinx/device-apis/cufft.rst b/docs/sphinx/device-apis/cufft.rst index 81078fc..4cd326f 100644 --- a/docs/sphinx/device-apis/cufft.rst +++ b/docs/sphinx/device-apis/cufft.rst @@ -10,13 +10,13 @@ Overview These APIs offer integration with the NVIDIA cuFFTDx library. Detailed documentation of cuFFTDx can be found in the -`cuFFTDx documentation `_. +`cuFFTDx documentation `_. .. note:: - The :class:`~nvmath.device.fft` device APIs in module - :mod:`nvmath.device` currently support cuFFTDx 1.2.0, also available - as part of MathDx 24.04. All functionalities from the C++ library are supported with + The :class:`~nvmath.device.FFT` device APIs in module + :mod:`nvmath.device` currently support cuFFTDx 1.5.1, also available + as part of MathDx 25.06. All functionalities from the C++ library are supported with the exception of cuFFTDx C++ APIs with a workspace argument, which are currently not available in nvmath-python. 
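+
+Since this page does not otherwise include a usage snippet, the following minimal
+sketch shows how a device FFT descriptor is constructed with :func:`~nvmath.device.fft`.
+The parameter names and attributes mirror the README example updated elsewhere in
+this patch; the concrete values are placeholders and must satisfy cuFFTDx's
+size/precision constraints for the target GPU.
+
+.. code-block:: python
+
+    import numpy as np
+    from nvmath.device import fft
+
+    # Compile-time descriptor for a single-precision, 64-point C2C FFT executed
+    # cooperatively by a thread block (values chosen for illustration only).
+    FFT_block = fft(
+        fft_type="c2c",
+        size=64,
+        precision=np.float32,
+        direction="forward",
+        execution="Block",
+    )
+
+    # The descriptor exposes the attributes used when writing the kernel, e.g.
+    # the launch block dimensions and the shared/local storage requirements.
+    print(FFT_block.block_dim, FFT_block.shared_memory_size, FFT_block.storage_size)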
@@ -31,4 +31,4 @@ API Reference :toctree: generated/ fft - FFTOptions + FFT diff --git a/docs/sphinx/device-apis/index.rst b/docs/sphinx/device-apis/index.rst index bdddcc7..78c4ae8 100644 --- a/docs/sphinx/device-apis/index.rst +++ b/docs/sphinx/device-apis/index.rst @@ -10,8 +10,8 @@ nvmath-python Device APIs The device module of nvmath-python :mod:`nvmath.device` offers integration with NVIDIA's high-performance computing libraries through device APIs for cuFFTDx, cuBLASDx, and cuRAND. Detailed documentation for these libraries can be found at `cuFFTDx -`_, `cuBLASDx -`_, and `cuRAND device APIs +`_, `cuBLASDx +`_, and `cuRAND device APIs `_ respectively. Device APIs can only be called from CUDA device or kernel code, and execute on the GPU. @@ -25,9 +25,9 @@ Users may take advantage of the device module via the two approaches below: .. note:: - The :class:`~nvmath.device.fft` and :class:`~nvmath.device.matmul` device APIs in module - :mod:`nvmath.device` currently supports cuFFTDx 1.2.0 and cuBLASDx 0.1.1, also available - as part of MathDx 24.04. All functionalities from the C++ libraries are supported with + The :class:`~nvmath.device.FFT` and :class:`~nvmath.device.Matmul` device APIs in module + :mod:`nvmath.device` currently supports cuFFTDx 1.5.1 and cuBLASDx 0.4.1, also available + as part of MathDx 25.06. All functionalities from the C++ libraries are supported with the exception of cuFFTDx C++ APIs with a workspace argument, which are currently not available in nvmath-python. diff --git a/docs/sphinx/device-apis/utils.rst b/docs/sphinx/device-apis/utils.rst index dcfe70e..d834cb2 100644 --- a/docs/sphinx/device-apis/utils.rst +++ b/docs/sphinx/device-apis/utils.rst @@ -20,6 +20,15 @@ API Reference :toctree: generated/ current_device_lto + Complex + Vector + complex32 + complex64 + complex128 + np_float16x2 + np_float16x4 + half2 + half4 float16x2 float16x4 float32x2 @@ -35,5 +44,4 @@ API Reference Code CodeType ComputeCapability - CodeType Dim3 diff --git a/docs/sphinx/distributed-apis/distribution.rst b/docs/sphinx/distributed-apis/distribution.rst new file mode 100644 index 0000000..b5e0ada --- /dev/null +++ b/docs/sphinx/distributed-apis/distribution.rst @@ -0,0 +1,324 @@ +******************** +Operand distribution +******************** + +To perform distributed math operations with ``nvmath.distributed`` you must first +specify how the operands are distributed across processes. nvmath-python supports +multiple distribution types (Slab, Box, BlockCyclic, etc.) which we'll explain in +this section. + +You can use any distribution type for any distributed operation as long as nvmath-python +implements an implicit conversion to the native distribution type supported by the +operation. For example, the distributed dense linear algebra library (cuBLASMp) +supports the PBLAS 2D block-cyclic distribution and your input matrices must be +distributed in a way that conforms to this distribution type. Slab is compatible +with 2D block distribution for uniform partition sizes, so you can use Slab for +distributed matrix multiplication in such cases (see +`examples `_). + +It's also important to consider the memory layout requirements of the distributed +operation that you're performing, and the potential implications on the distribution +of the global array. See +:ref:`Distribution, memory layout and transpose ` for more +information. + +In the following we describe the available distribution types. + +.. 
_distribution-slab: + +Slab +==== + +Slab specifies the distribution of an N-D array that is partitioned across processes +along a single dimension. More formally: + +- The shape of the slab on the first :math:`s_p \mathbin{\%} P` processes is + :math:`(s_0, \ldots, \lfloor \frac{s_p}{P} \rfloor + 1, \ldots, s_{n-1})` +- The shape of the slab on the remaining processes is + :math:`(s_0, \ldots, \lfloor \frac{s_p}{P} \rfloor, \ldots, s_{n-1})` +- Process 0 owns the first slab according to the global index order, process 1 owns + the second slab and so on. + +where: + +- :math:`s_i` is the size of dimension :math:`i` of the global array +- :math:`p` is the partition dimension +- :math:`n` is the number of dimensions of the array +- :math:`P` is the number of processes + +Let's look at an example with a 2D array and four processes: + +.. figure:: ./figures/slab-example.png + :width: 40% + +Here we see a MxN 2D array partitioned on the X axis, where each number (and color) denotes +the slab of the global array owned by that process. + +If :math:`(M, N) = (40, 64)`, the shape of the slab on every process will be +:math:`(10, 64)`. For :math:`(M, N) = (39, 64)`, the shape of the slab on the first +three processes is :math:`(10, 64)` and the shape on the last process is :math:`(9, 64)`. + +Using the ``nvmath.distributed`` APIs, you can specify the above distribution like this: + +.. code-block:: python + + from nvmath.distributed.distribution import Slab + + distribution = Slab(partition_dim=0) + # or + distribution = Slab.X + +.. tip:: + We offer convenience aliases to use with 1D/2D/3D arrays: ``Slab.X``, + ``Slab.Y`` and ``Slab.Z`` (which partition on axis 0, 1 and 2, respectively). + +.. note:: + Slab is natively supported by cuFFTMp (:doc:`distributed FFT API `). + A Slab (or compatible) distribution is recommended for best performance in cuFFTMp. + + +.. _distributed-api-distributions-reference: + +.. _distribution-box: + +Box +=== + +Given a global N-D array, a N-D box can be used to describe a subsection of the global +array by indicating the lower and upper corner of the subsection. By associating boxes +to processes we can describe a data distribution where every process owns a contiguous +rectangular subsection of the global array. + +For example, consider a 8x8 2D array distributed across 3 processes using the +following boxes: + +.. figure:: ./figures/box-example.png + :width: 40% + +where each number (and color) denotes the subsection of the global N-D array owned by +that process. + +Using the ``nvmath.distributed`` APIs, you can specify the above distribution like this: + +.. code-block:: python + + from nvmath.distributed.distribution import Box + + if process_id == 0: + distribution = Box((0, 0), (4, 4)) + elif process_id == 1: + distribution = Box((4, 0), (8, 4)) + elif process_id == 2: + distribution = Box((0, 4), (8, 8)) + +.. note:: + + Box is natively supported by cuFFTMp (:doc:`distributed FFT` + and :doc:`Reshape` APIs). + For further information, refer to the `cuFFTMp documentation + `_. + +.. _distribution-block: + +Block distributions +=================== + +In the block-cyclic distribution, a global N-D array is split into blocks of a specified +shape and these blocks are distributed to a grid of processes in a cyclic pattern. As a +result, each process owns a set of typically non-contiguous blocks of the global N-D array. 
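+
+To make the cyclic assignment concrete, here is a small, library-independent
+sketch (it is not an nvmath-python API): it computes which process in a
+row-major ``Pr x Pc`` process grid owns each block of a 2-D array under the
+standard block-cyclic rule.
+
+.. code-block:: python
+
+    import numpy as np
+
+    def block_owner(i, j, pr, pc):
+        # Standard block-cyclic rule: blocks are dealt out round-robin along
+        # each dimension of the (row-major) process grid.
+        return (i % pr) * pc + (j % pc)
+
+    # An 8x8 grid of blocks distributed over a 2x2 process grid (cf. the 2D
+    # block-cyclic figure below): each process owns 16 non-contiguous blocks.
+    owners = np.array([[block_owner(i, j, 2, 2) for j in range(8)] for i in range(8)])
+    print(owners)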
+ +PBLAS uses the block-cyclic distribution to distribute dense matrices in a way that evenly +balances the computational load across processes, while at the same time optimizing +performance by being able to exploit memory locality +(`reference `_). + +nvmath-python provides two distribution types based on block-cyclic, described below. + +.. _distribution-block-cyclic: + +BlockCyclic +----------- + +BlockCyclic is specified with a process grid and a block size. The blocks assigned to +a process are typically non-contiguous owing to the cyclic distribution pattern. +Blocks can partition on any number of dimensions. + +Consider the following example: + +.. figure:: ./figures/2d-block-cyclic-example.png + :width: 60% + +Here we see an NxN matrix distributed across 4 processes using a 2D block-cyclic scheme. +Each number (and color) denotes the blocks of the global matrix belonging to that +process. Each block has BxB elements and each process has 16 blocks. + +Using the ``nvmath.distributed`` APIs, you can specify the above distribution like this: + +.. code-block:: python + + from nvmath.distributed.distribution import ProcessGrid, BlockCyclic + + process_grid = ProcessGrid( + shape=(2, 2), layout=ProcessGrid.Layout.ROW_MAJOR + ) + distribution = BlockCyclic(process_grid, (B, B)) + +Note how the partition dimensions are determined by the process grid and block shape. +Here is an example of 1D block-cyclic distribution: + +.. figure:: ./figures/1d-block-cyclic-example.png + :width: 60% + +The above distribution can be specified like this: + + +.. code-block:: python + + from nvmath.distributed.distribution import ProcessGrid, BlockCyclic + + # layout is irrelevant in this case and can be omitted + process_grid = ProcessGrid(shape=(1, 4)) + distribution = BlockCyclic(process_grid, (N, B)) + +.. note:: + Block distributions are natively supported by cuBLASMp + (:doc:`distributed matrix multiplication APIs`). + +.. _distribution-block-non-cyclic: + +BlockNonCyclic +-------------- + +BlockNonCyclic is a special case of BlockCyclic, where the block size and process grid are +such that it generates no cycles. For this distribution there is no need to specify block +sizes, as nvmath-python can infer them automatically. + +.. tip:: + BlockNonCyclic is a convenience type and you can represent the same distribution with + BlockCyclic and explicit block sizes. + +Example 1D block non-cyclic: + +.. figure:: ./figures/block-non-cyclic-example.png + :width: 60% + +The above distribution can be specified like this: + +.. code-block:: python + + from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic + + # layout is irrelevant in this case and can be omitted + process_grid = ProcessGrid(shape=(1, 4)) + distribution = BlockNonCyclic(process_grid) + + +.. note:: + Block distributions are natively supported by cuBLASMp + (:doc:`distributed matrix multiplication APIs`). + +.. tip:: + Slab and BlockNonCyclic are equivalent for uniform partition sizes. + + +Utilities +========= + +You can get the local shape of an operand according to a distribution by querying the +distribution: + +.. code-block:: python + + from nvmath.distributed.distribution import Slab + + global_shape = (64, 48, 32) + # Get the local shape on this process according to Slab.Y + shape = Slab.Y.shape(process_id, global_shape) + + +If desired, you may do explicit conversion between distribution types. For example: + +.. 
code-block:: python + + from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic, Slab + + # layout is irrelevant in this case and can be omitted + process_grid = ProcessGrid(shape=(1, 4)) + distribution = BlockNonCyclic(process_grid) + slab_distribution = distribution.to(Slab) + print(slab_distribution) # prints "Slab(partition_dim=1, ndim=2)" + +.. _distribution-mem-layout: + +Distribution, memory layout and transpose +========================================= + +Memory layout refers to the way that N-D arrays are stored in memory on each process. +The two primary layouts are C-order (row-major) and Fortran-order (column-major). +Memory layout is independent of the distribution of the global array, i.e. you can have +any combination of distribution and local memory layout. In practice, however, math +libraries have specific requirements on memory layout. For example, cuFFTMp requires +C-order while cuBLASMp requires Fortran-order. As such, you may find that you have +to convert the layout of your inputs. Two common ways to convert the layout are: + +1. **Copy** the array to a buffer with the new layout (*expensive, preserves the + distribution*). + + For example: + + .. code-block:: python + + # Get the local shape according to Slab.X + a_shape = Slab.X.shape(process_id, (m, n)) + # Allocate operand on this process (NumPy uses C-order by default). + a = np.random.rand(*a_shape) + # Convert layout to F-order by copying to a new array (distribution is preserved) + a = np.asfortranarray(a) + +2. Get a **transposed view** (*efficient, modifies the distribution*). + + Transposing a global array *transposes the distribution*, and so always results + in a different distribution. For example: + + .. code-block:: python + + # Get the local shape according to Slab.X + a_shape = Slab.X.shape(process_id, (m, n)) + # Allocate operand on this process (NumPy uses C-order by default). + a = np.random.rand(*a_shape) + # Transpose the global array (transposing on each process) + a = a.T # the distribution of a is now Slab.Y + +.. note:: + Transposing changes the global shape of the operand and will accordingly impact the + distributed operation. For example, if the global shape of the input matrix A of + distributed matrix multiplication is :math:`(k, m)`, you have to set the + ``is_transpose`` qualifier to ``True`` for A. Similarly if B is transposed. See + :doc:`Distributed Linear Algebra ` for more information. + +.. hint:: + For matrices, ``transpose(Slab.X) == Slab.Y`` and ``transpose(Slab.Y) == Slab.X``. + +.. seealso:: + See + `distributed matmul examples + `_ for more examples showing the interaction between memory layout, + transpose and distribution. + + +API Reference +============= + +.. module:: nvmath.distributed.distribution + +.. autosummary:: + :toctree: generated/ + + Slab + Box + ProcessGrid + BlockCyclic + BlockNonCyclic diff --git a/docs/sphinx/distributed-apis/fft/index.rst b/docs/sphinx/distributed-apis/fft/index.rst index 8c9bb60..f56f830 100644 --- a/docs/sphinx/distributed-apis/fft/index.rst +++ b/docs/sphinx/distributed-apis/fft/index.rst @@ -32,8 +32,8 @@ some key differences: * The operands to the API are the **local partition** of the global operands and the user specifies the **distribution** (how the data is partitioned across - processes). There are two types of distribution supported for FFT: ``Slab`` and custom - ``Box`` (these are described below). + processes). 
There are two types of distribution natively supported by FFT: + :ref:`distribution-slab` and custom :ref:`distribution-box`. * GPU operands need to be allocated on **symmetric memory**. Refer to :doc:`Distributed API Utilities <../utils>` for examples and details of how to @@ -53,16 +53,23 @@ some key differences: operation. This helper is described below. -Slab distribution ------------------ +Operand distribution +-------------------- -``Slab`` is a 1D data decomposition where the data is partitioned across processes along one -dimension (currently X or Y). +To perform a distributed FFT operation you have to specify how the operand is distributed +across processes. Distributed FFT natively supports the :ref:`distribution-slab` and +:ref:`distribution-box` distributions. The distribution provided must be compatible with +one of these. + +Slab +~~~~ .. tip:: - ``Slab`` is the most optimized distribution mode to use with distributed FFT. + :ref:`distribution-slab` (or compatible distribution) is the most optimized + distribution to use with distributed FFT. -To illustrate with a simple example: +Currently, distributed FFT supports Slab decomposition on X or Y. +Here is an example of a distributed FFT using Slab distribution: .. tip:: Reminder to initialize the distributed context first as per @@ -70,6 +77,8 @@ To illustrate with a simple example: .. code-block:: python + from nvmath.distributed.distribution import Slab + # Get number of processes from mpi4py communicator. nranks = communicator.Get_size() @@ -85,7 +94,7 @@ To illustrate with a simple example: # By default, the reshape option is True, which means that the output of the # distributed FFT will be re-distributed to retain the same distribution as # the input (in this case Slab.Y). - b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.Y) + b = nvmath.distributed.fft.fft(a, distribution=Slab.Y) For the purposes of the transform with ``reshape=False``, ``Slab.X`` and ``Slab.Y`` are considered complementary distributions. If ``reshape=False``, the @@ -94,6 +103,8 @@ this using GPU operands: .. code-block:: python + from nvmath.distributed.distribution import Slab + # The global 3-D FFT size is (512, 256, 512). # Here, the input data is distributed across processes according to the # Slab distribution on the X axis. @@ -109,11 +120,11 @@ this using GPU operands: # Forward FFT. # Here, the forward FFT operand is distributed according to Slab.X distribution. # With reshape=False, the FFT result will be distributed according to Slab.Y distribution. - b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) + b = nvmath.distributed.fft.fft(a, distribution=Slab.X, options={"reshape": False}) # Now we can perform an inverse FFT with reshape=False and get the # result in Slab.X distribution (recall that `b` has Slab.Y distribution). - c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) + c = nvmath.distributed.fft.ifft(b, distribution=Slab.Y, options={"reshape": False}) # Synchronize the default stream with cp.cuda.Device(device_id): @@ -127,17 +138,15 @@ this using GPU operands: Distributed FFT operations are in-place, which needs to be taken into account when freeing the GPU operands on symmetric memory (as shown in the above example). -Refer to :class:`nvmath.distributed.fft.Slab` for more details. 
- -Custom box distribution ------------------------ +Custom box +~~~~~~~~~~ Distributed FFT also supports arbitrary data distributions in the form of 2D/3D boxes. -Please refer to :ref:`distributed-reshape-box` for an overview. +Refer to :ref:`distribution-box` for an overview. .. tip:: - While efficient, ``Box`` distribution is less optimized than ``Slab`` - for distributed FFT. + While efficient, :ref:`distribution-box` distribution is less optimized than + :ref:`distribution-slab` for distributed FFT. To perform a distributed FFT using a custom ``Box`` distribution, each process specifies its own input and output box, which determines the distribution of the input and output @@ -150,6 +159,8 @@ Here is an example of a distributed FFT across 4 GPUs using a custom pencil dist .. code-block:: python + from nvmath.distributed.distribution import Box + # Get process rank from mpi4py communicator. rank = communicator.Get_rank() @@ -164,13 +175,13 @@ Here is an example of a distributed FFT across 4 GPUs using a custom pencil dist # Forward FFT. if rank == 0: - input_box = [(0, 0, 0), (32, 128, 128)] + input_box = Box((0, 0, 0), (32, 128, 128)) elif rank == 1: - input_box = [(0, 128, 0), (32, 256, 128)] + input_box = Box((0, 128, 0), (32, 256, 128)) elif rank == 2: - input_box = [(32, 0, 0), (64, 128, 128)] + input_box = Box((32, 0, 0), (64, 128, 128)) else: - input_box = [(32, 128, 0), (64, 256, 128)] + input_box = Box((32, 128, 0), (64, 256, 128)) # Use the same pencil distribution for the output. output_box = input_box b = nvmath.distributed.fft.fft(a, distribution=[input_box, output_box]) @@ -267,4 +278,3 @@ FFT support (:mod:`nvmath.distributed.fft`) FFTOptions FFTDirection - Slab diff --git a/docs/sphinx/distributed-apis/figures/1d-block-cyclic-example.png b/docs/sphinx/distributed-apis/figures/1d-block-cyclic-example.png new file mode 100644 index 0000000..041b9a8 Binary files /dev/null and b/docs/sphinx/distributed-apis/figures/1d-block-cyclic-example.png differ diff --git a/docs/sphinx/distributed-apis/figures/2d-block-cyclic-example.png b/docs/sphinx/distributed-apis/figures/2d-block-cyclic-example.png new file mode 100644 index 0000000..671986b Binary files /dev/null and b/docs/sphinx/distributed-apis/figures/2d-block-cyclic-example.png differ diff --git a/docs/sphinx/distributed-apis/figures/block-non-cyclic-example.png b/docs/sphinx/distributed-apis/figures/block-non-cyclic-example.png new file mode 100644 index 0000000..5f97ef0 Binary files /dev/null and b/docs/sphinx/distributed-apis/figures/block-non-cyclic-example.png differ diff --git a/docs/sphinx/distributed-apis/figures/box-example.png b/docs/sphinx/distributed-apis/figures/box-example.png new file mode 100644 index 0000000..86de4fd Binary files /dev/null and b/docs/sphinx/distributed-apis/figures/box-example.png differ diff --git a/docs/sphinx/distributed-apis/figures/slab-example.png b/docs/sphinx/distributed-apis/figures/slab-example.png new file mode 100644 index 0000000..d7c3182 Binary files /dev/null and b/docs/sphinx/distributed-apis/figures/slab-example.png differ diff --git a/docs/sphinx/distributed-apis/index.rst b/docs/sphinx/distributed-apis/index.rst index 326dc1e..0d0533d 100644 --- a/docs/sphinx/distributed-apis/index.rst +++ b/docs/sphinx/distributed-apis/index.rst @@ -8,7 +8,7 @@ called from host code but execute on a distributed (multi-node multi-GPU) system. 
Overview --------- +======== The distributed APIs look and feel similar to their CPU and GPU counterparts, with a few key differences: @@ -16,12 +16,12 @@ with a few key differences: * To use the APIs, the application is launched with multiple processes, currently using MPI (e.g. ``mpirun``). -* Each process is assigned to one GPU. +* There is one process per GPU. -* The operands to the API are the **local partition** of the global operands - (as in the Single program multiple data -SPMD- model) and the user specifies - the **distribution** (how the data is partitioned across processes). This - allows the user to partition once and compose across distributed APIs. +* The operands to the API on each process are the **local partition** of the global + operands (as in the Single program multiple data -SPMD- model) and the user specifies + the :doc:`distribution ` (how the data is partitioned across + processes). This allows the user to partition once and compose across distributed APIs. * The local operands in certain memory spaces may require **special allocation** considerations. For example, GPU operands to the distributed @@ -30,37 +30,9 @@ with a few key differences: PyTorch tensors in PGAS space (refer to :doc:`Distributed API Utilities ` for details). -.. _distributed-api-initialize: - -Initializing the distributed runtime ------------------------------------- - -To use the distributed APIs, you must first initialize the distributed runtime. -This is done by having each process provide a local CUDA device ID (referring -to a GPU on the host on which that process runs) and an MPI communicator: - -.. code-block:: python - - import nvmath.distributed - from mpi4py import MPI - comm = MPI.COMM_WORLD # can use any MPI communicator - nvmath.distributed.initialize(device_id, comm) - -.. note:: - - nvmath-python uses MPI for bootstrapping, and other bootstrapping modes - may become available in the future. - - Under the hood, the distributed math libraries use additional - communication backends, such as NVSHMEM. - - You are free to use MPI in other parts of your application. - -After initializing the distributed runtime you may use the distributed APIs. -Certain APIs such as FFT and Reshape use a PGAS model for parallelism and -require GPU operands to be allocated on the *symmetric memory heap*. Refer to -:doc:`Distributed API Utilities ` for examples and details of how to manage -GPU operands on symmetric memory. +.. important:: + To use the distributed APIs, you must first initialize the distributed runtime + (see :doc:`Distributed runtime `). ======== Contents @@ -70,6 +42,9 @@ Contents :caption: API Reference :maxdepth: 2 + Distributed runtime + Operand distribution + Linear Algebra Fast Fourier Transform Reshape Distributed API Utilities diff --git a/docs/sphinx/distributed-apis/linalg/index.rst b/docs/sphinx/distributed-apis/linalg/index.rst new file mode 100644 index 0000000..09a6ae5 --- /dev/null +++ b/docs/sphinx/distributed-apis/linalg/index.rst @@ -0,0 +1,198 @@ +************************** +Distributed Linear Algebra +************************** + +.. _distributed-linalg-overview: + +Overview +======== + +The distributed Linear Algebra module :mod:`nvmath.distributed.linalg.advanced` in +nvmath-python leverages the NVIDIA cuBLASMp library and provides a powerful suite +of APIs that can be directly called from the host to efficiently perform matrix +multiplications on multi-node multi-GPU systems at scale. Both stateless +function-form APIs and stateful class-form APIs are provided. 
+ +The distributed matrix multiplication APIs are similar to their non-distributed host +API counterparts, with some key differences: + +* The operands to the API on each process are the **local partition** of the + global operands and the user specifies the **distribution** (how the data + is partitioned across processes). The APIs natively support the block-cyclic + distribution (see :ref:`distribution-block`). + +* The APIs optionally support GPU operands on **symmetric memory**. Refer to + :doc:`Distributed API Utilities <../utils>` for examples and details of how to + manage symmetric memory GPU operands. + + +Operand distribution +-------------------- + +To perform a distributed operation, first you have to specify how the operand is +distributed across processes. Distributed matrix multiply natively supports the +block-cyclic distribution (see :ref:`distribution-block`), therefore you must +provide a distribution compatible with block-cyclic. Compatible distributions +include :ref:`distribution-block-cyclic`, :ref:`distribution-block-non-cyclic` +and :ref:`distribution-slab` (with uniform partition sizes). + +Memory layout +------------- + +cuBLASMp requires operands to use Fortran-order memory layout, while Python libraries +such as NumPy and PyTorch use C-order by default. +See :ref:`distribution-mem-layout` for guidelines on memory layout conversion +for distributed operands and potential implications on distribution. + +Matrix qualifiers +----------------- + +Matrix qualifiers are used to indicate whether an input matrix is transposed or not. + +For example, for ``A.T @ B`` you have to specify: + +.. code-block:: python + + from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype, matmul + + qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + qualifiers[0]["is_transpose"] = True # a is transposed + qualifiers[1]["is_transpose"] = False # b is not transposed (optional) + + ... + + result = matmul(a, b, distributions=distributions, qualifiers=qualifiers) + +.. caution:: + A common strategy to convert memory layout to Fortran-order (required by cuBLASMp) + is to transpose the input matrices, as explained in :ref:`distribution-mem-layout`. + Remember to set the matrix qualifiers accordingly. + + +Distributed algorithm +--------------------- + +cuBLASMp implements efficient communication-overlap algorithms that are suited for +distributed machine learning scenarios with tensor parallelism. +Algorithms include AllGather+GEMM and GEMM+ReduceScatter. +These algorithms have special requirements in terms of how each of the operands +is distributed and their transpose qualifiers. + +Currently, to be able to use these algorithms the matrices must be distributed using a +1D partitioning scheme without the cyclic distribution and the partition sizes must +be uniform (:ref:`distribution-block-non-cyclic` and :ref:`distribution-slab` +are valid distributions for this use case). + +Please refer to +`cuBLASMp documentation `_ +for full details. + +Symmetric memory +---------------- + +Operands may be allocated on the symmetric heap. If so, the result will also be +allocated on the symmetric heap. + +.. tip:: + Certain distributed matrix multiplication algorithms may perform better when the + operands are on symmetric memory. + +.. important:: + Any memory on the symmetric heap that is owned by the user (including the + distributed Matmul result) must be deleted explicitly using + :func:`~nvmath.distributed.free_symmetric_memory`. 
Refer to + :doc:`Distributed API Utilities <../utils>` for more information. + +See `example +`_. + +Example +------- + +The following example performs :math:`\alpha A @ B + \beta C` with inputs distributed +according to a :ref:`distribution-slab` distribution (partitioning on a single dimension): + +.. tip:: + Reminder to initialize the distributed context first as per + :ref:`distributed-api-initialize` and to select both NVSHMEM and + NCCL as communication backends. + +.. code-block:: python + + import cupy as cp + from nvmath.distributed.distribution import Slab + from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + + # Get my process rank from mpi4py communicator. + rank = communicator.Get_rank() + + # The global problem size m, n, k + m, n, k = 128, 512, 1024 + + # Prepare sample input data. + with cp.cuda.Device(device_id): + a = cp.random.rand(*Slab.X.shape(rank, (m, k))) + b = cp.random.rand(*Slab.X.shape(rank, (n, k))) + c = cp.random.rand(*Slab.Y.shape(rank, (n, m))) + + # Get transposed views with Fortran-order memory layout + a = a.T # a is now (k, m) with Slab.Y + b = b.T # b is now (k, n) with Slab.Y + c = c.T # c is now (m, n) with Slab.X + + distributions = [Slab.Y, Slab.Y, Slab.X] + + qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + qualifiers[0]["is_transpose"] = True # a is transposed + + alpha = 0.45 + beta = 0.67 + + # Perform the distributed GEMM. + result = nvmath.distributed.linalg.advanced.matmul( + a, + b, + c=c, + alpha=alpha, + beta=beta, + distributions=distributions, + qualifiers=qualifiers, + ) + + # Synchronize the default stream, since by default the execution + # is non-blocking for GPU operands. + cp.cuda.get_current_stream().synchronize() + + # result is distributed row-wise + assert result.shape == Slab.X.shape(rank, (m, n)) + + +You can find many more examples `here +`_. + + +.. _distributed-linalg-api-reference: + +API Reference +============= + +.. module:: nvmath.distributed.linalg.advanced + +Distributed Linear Algebra APIs (:mod:`nvmath.distributed.linalg.advanced`) +--------------------------------------------------------------------------- + +.. autosummary:: + :toctree: generated/ + + matmul + matrix_qualifiers_dtype + Matmul + MatmulComputeType + MatmulEpilog + MatmulAlgoType + + :template: dataclass.rst + + MatmulOptions diff --git a/docs/sphinx/distributed-apis/reshape/figures/box.png b/docs/sphinx/distributed-apis/reshape/figures/box.png deleted file mode 100644 index c8da419..0000000 Binary files a/docs/sphinx/distributed-apis/reshape/figures/box.png and /dev/null differ diff --git a/docs/sphinx/distributed-apis/reshape/index.rst b/docs/sphinx/distributed-apis/reshape/index.rst index 56f00d9..8b623c3 100644 --- a/docs/sphinx/distributed-apis/reshape/index.rst +++ b/docs/sphinx/distributed-apis/reshape/index.rst @@ -19,37 +19,7 @@ function-form APIs and stateful class-form APIs are provided: Reshape is a general-purpose API to change how data is distributed or partitioned across processes, by shuffling data among the processes. Distributed reshape supports arbitrary data distributions in the form of -1D/2D/3D boxes. - -.. _distributed-reshape-box: - -Box distribution ----------------- - -Consider a ``X*Y*Z`` global array. 3D boxes can be used to describe a subsection -of this global array by indicating the lower and upper corner of the subsection. -By associating boxes to processes one can then describe a data distribution where -every process owns a contiguous rectangular subsection of the global array. 
- -For instance, consider a 2D case with a global array of size ``X*Y = 4*4`` and -three boxes, described as ``box = [lower, upper]``: - -.. code-block:: python - - box_0 = [(0,0), (2,2)] # green - box_1 = [(2,0), (4,2)] # blue - box_2 = [(0,2), (4,4)] # purple - -.. figure:: ./figures/box.png - :width: 33% - -By associating box 0 to process 0, box 1 to process 1 and box 2 to process 2, this creates a -data distribution of the global ``4*4`` array across three processes. The same can be -generalized to N-D arrays and any number of processes. - -For more information, refer to the `cuFFTMp documentation -`_. +1D/2D/3D boxes (see :ref:`distribution-box` distribution). Example ------- @@ -67,6 +37,8 @@ distributed reshape: .. code-block:: python + from nvmath.distributed.distribution import Box + # The global dimensions of the matrix are 4x4. The matrix is distributed # column-wise, so each process has 4 rows and 2 columns. @@ -78,11 +50,11 @@ distributed reshape: # Reshape from column-wise to row-wise. if rank == 0: - input_box = [(0, 0), (4, 2)] - output_box = [(0, 0), (2, 4)] + input_box = Box((0, 0), (4, 2)) + output_box = Box((0, 0), (2, 4)) else: - input_box = [(0, 2), (4, 4)] - output_box = [(2, 0), (4, 4)] + input_box = Box((0, 2), (4, 4)) + output_box = Box((2, 0), (4, 4)) # Distributed reshape returns a new operand with its own buffer. B = nvmath.distributed.reshape.reshape(A, input_box, output_box) diff --git a/docs/sphinx/distributed-apis/runtime.rst b/docs/sphinx/distributed-apis/runtime.rst new file mode 100644 index 0000000..c174317 --- /dev/null +++ b/docs/sphinx/distributed-apis/runtime.rst @@ -0,0 +1,51 @@ +******************* +Distributed runtime +******************* + +.. _distributed-api-initialize: + +Initializing the distributed runtime +==================================== + +To use the distributed APIs, you must first initialize the distributed runtime. +This is done by having each process provide a local CUDA device ID (referring +to a GPU on the host on which that process runs), an MPI communicator and the +desired communication backends: + +.. code-block:: python + + import nvmath.distributed + from mpi4py import MPI + comm = MPI.COMM_WORLD # can use any MPI communicator + nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +.. note:: + + nvmath-python uses MPI for bootstrapping, and other bootstrapping modes + may become available in the future. + + Under the hood, the distributed math libraries use additional + communication backends, such as NVSHMEM and NCCL. + + You are free to use MPI in other parts of your application. + +After initializing the distributed runtime you may use the distributed APIs. +Certain APIs such as FFT and Reshape require GPU operands to be allocated on the +*symmetric memory heap*. Refer to :doc:`Distributed API Utilities ` for +examples and details of how to manage GPU operands on symmetric memory. + +API Reference +============= + +.. module:: nvmath.distributed + +.. autosummary:: + :toctree: generated/ + + initialize + finalize + get_context + + :template: dataclass.rst + + DistributedContext diff --git a/docs/sphinx/distributed-apis/utils.rst b/docs/sphinx/distributed-apis/utils.rst index 7c2ccc1..0c3298d 100644 --- a/docs/sphinx/distributed-apis/utils.rst +++ b/docs/sphinx/distributed-apis/utils.rst @@ -69,6 +69,7 @@ API Reference ============= .. 
module:: nvmath.distributed + :no-index: Symmetric Memory utilities -------------------------- diff --git a/docs/sphinx/host-apis/fft/index.rst b/docs/sphinx/host-apis/fft/index.rst index 8f219bd..2742ebd 100644 --- a/docs/sphinx/host-apis/fft/index.rst +++ b/docs/sphinx/host-apis/fft/index.rst @@ -74,7 +74,7 @@ dependencies. Pip users should run the following command: .. code-block:: bash - pip install nvmath-python[cu12,dx] + pip install nvmath-python[cu12-dx] Required dependencies @@ -82,35 +82,19 @@ Required dependencies For those who need to collect the required dependencies manually: -- LTO callbacks are supported by cuFFT 11.3 which is shipped with `CUDA Toolkit 12.6 Update - 2 and newer `_. -- Using cuFFT LTO callbacks requires nvJitLink from the same CUDA toolkit or newer (within - the same major CUDA release, for example version 12). +- LTO callbacks are supported starting with cuFFT 11.3, shipped with + `CUDA Toolkit `_ 12.6 Update 2. +- The target device must have compute capability 7.0 (7.5 for CTK 13) or higher. +- Using cuFFT LTO callbacks requires nvJitLink from the same CUDA toolkit or newer, within + the same major CUDA release. For example, if cuFFT comes from CUDA Toolkit 13.X, then + nvJitLink must come from CUDA Toolkit 13.Y, such that ``Y >= X``. - Compiling the callbacks with the :func:`nvmath.fft.compile_prolog` and - :func:`nvmath.fft.compile_epilog` helpers requires Numba 0.59+ and nvcc/nvvm from the same - CUDA toolkit as nvJitLink or older (within the same major CUDA release). The helpers - require the target device to have compute capability 7.0 or higher. + :func:`nvmath.fft.compile_epilog` helpers requires numba-cuda 0.18.1 or newer. For further details, refer to the `cuFFT LTO documentation `_. -Older CTKs -^^^^^^^^^^ - -Adventurous users who want to try callback functionality and cannot upgrade the CUDA Toolkit -to 12.6U2, can download and install the older preview release `cuFFT LTO EA version 11.1.3.0 -`_ from `here `_, which -requires at least CUDA Toolkit 12.2. When using LTO EA, setting environmental variables may -be needed for nvmath to pick the desired cuFFT version. Users should adjust the -``LD_PRELOAD`` variable, so that the right cuFFT shared library is used: - -.. code-block:: bash - - export LD_PRELOAD="/path_to_cufft_lto_ea/libcufft.so" - - .. _fft-gpu-cpu-execution: Execution space @@ -134,7 +118,7 @@ Libraries) `_ FFT to run the transform. On x8 architecture, the `MKL library `_ can be used. For pip users, the fastest way to get the required dependencies is to use ``'cu12'`` / -``'cu11'`` and ``'cpu'`` extras: +``'cu13'`` and ``'cpu'`` extras: .. code-block:: bash diff --git a/docs/sphinx/host-apis/index.rst b/docs/sphinx/host-apis/index.rst index 0e32df2..7f97387 100644 --- a/docs/sphinx/host-apis/index.rst +++ b/docs/sphinx/host-apis/index.rst @@ -2,11 +2,109 @@ Host APIs ********* -The following of modules of nvmath-python offer integration with NVIDIA's -high-performance computing libraries through host APIs for cuBLAS and cuFFT. -Host APIs are called from host code but can execute in any supported execution +The following modules of nvmath-python offer integration with NVIDIA's +high-performance computing libraries such as cuBLAS, cuDSS, cuFFT, and +cuTENSOR (and their NVPL counterparts) through host APIs. Host APIs +are called from host code but can execute in any supported execution space (CPU or GPU). +============ +Key Concepts +============ + +.. 
_matrix-tensor-qualifiers: + +---------------------------- +Matrix and Tensor Qualifiers +---------------------------- + +Recall that nvmath-python is not an array library but it +:ref:`interoperates ` with array and tensor libraries +including NumPy, CuPy, and PyTorch. Therefore we need a way to provide +additional information about the operands that are not contained in the +ndarray or tensor type to an operation (lazy conjugation or triangular matrix +structure are examples). This is done via the notion of qualifiers +on the tensor operands, which is provided as an NumPy ndarray of the same +length as the number of operands with the appropriate qualifiers dtype. +Each qualifier in the qualifiers array provides auxiliary information about +the corresponding operand. + +The following example shows a matrix multiplication between two matrices, +:math:`a` and :math:`b`, where :math:`a` should be treated as a regular +dense matrix while :math:`b` as a lower triangular matrix. +Note how the qualifier is used to inform the API of :math:`b`'s triangular structure. + +.. code-block:: python + + import numpy as np + import nvmath + + # Prepare sample input data. + m, k = 123, 789 + a = np.random.rand(m, k).astype(np.float32) + b = np.tril(np.random.rand(k, k).astype(np.float32)) + + # We can choose the execution space for the matrix multiplication using ExecutionCUDA or + # ExecutionCPU. By default, the execution space matches the operands, so in order to execute + # a matrix multiplication on NumPy arrays using CUDA we need to specify ExecutionCUDA. + # Tip: use help(nvmath.linalg.ExecutionCUDA) to see available options. + execution = nvmath.linalg.ExecutionCUDA() + + # We can use structured matrices as inputs by providing the corresponding qualifier which + # describes the matrix. By default, all inputs are assumed to be general matrices. + # MatrixQualifiers are provided as an array of custom NumPy dtype, + # nvmath.linalg.matrix_qualifiers_dtype. + qualifiers = np.full( + (2,), nvmath.linalg.GeneralMatrixQualifier.create(), + dtype=nvmath.linalg.matrix_qualifiers_dtype + ) + qualifiers[1] = nvmath.linalg.TriangularMatrixQualifier.create( + uplo=nvmath.linalg.FillMode.LOWER + ) + + result = nvmath.linalg.matmul(a, b, execution=execution, qualifiers=qualifiers) + +The following example shows how a qualifier is used to conjugate a CuPy tensor +operand as part of the contraction operation. Since complex-conjugation is a +memory-bound operation, this fusion improves performance compared to the +alternative of performing the conjugation *a priori* using CuPy. + +.. code-block:: python + + import cupy as cp + import numpy as np + import nvmath + + a = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) + b = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) + c = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) + d = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) + + # create an array of qualifiers (of length # of operands) with the default identity operator + qualifiers = np.full( + 4, nvmath.tensor.Operator.OP_IDENTITY, + dtype=nvmath.tensor.tensor_qualifiers_dtype + ) + # set the qualifier for operand b to conjugate + qualifiers[1] = nvmath.tensor.Operator.OP_CONJ + + # result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n].conj() * c[m,n,p,q] + d[i,j,p,q] + result = nvmath.tensor.ternary_contraction( + "ijkl,klmn,mnpq->ijpq", + a, b, c, d=d, qualifiers=qualifiers, beta=1 + ) + +.. 
seealso:: + + :class:`nvmath.linalg.matrix_qualifiers_dtype`, + :class:`nvmath.linalg.advanced.matrix_qualifiers_dtype`, + :class:`nvmath.distributed.linalg.advanced.matrix_qualifiers_dtype`, + :class:`nvmath.tensor.tensor_qualifiers_dtype` + +Examples using qualifers can be found in the +`examples `_ +directory on GitHub. + ======== Contents ======== @@ -18,4 +116,5 @@ Contents Linear Algebra Sparse Linear Algebra Fast Fourier Transform + Tensor Operations Host API Utilities diff --git a/docs/sphinx/host-apis/linalg/index.rst b/docs/sphinx/host-apis/linalg/index.rst index 8da23c2..d8f0f08 100644 --- a/docs/sphinx/host-apis/linalg/index.rst +++ b/docs/sphinx/host-apis/linalg/index.rst @@ -8,8 +8,14 @@ Overview ======== The Linear Algebra module :mod:`nvmath.linalg` in nvmath-python leverages various NVIDIA -math libraries to support multiple linear algebra computations. As of the initial Beta -release, we offer the specialized matrix multiplication API based on the cuBLASLt library. +math libraries to support dense [#]_ linear algebra computations. As of version 0.7.0, we +offer both a generic matrix multiplication API based on the cuBLAS and NVPL libraries and a +specialized matrix multiplication API (:mod:`nvmath.linalg.advanced`) based on the cuBLASLt +library. See :ref:`Generic and Specialized APIs ` for motivation. + +At a high-level, if your use case is predominantly GEMM and requires particular flexibility +in matrix data layouts, input and/or compute types, and also in choosing the algorithmic +implementation, look at the specialized APIs. Otherwise, look at the generic APIs. .. _linalg-api-reference: @@ -21,13 +27,39 @@ API Reference Generic Linear Algebra APIs (:mod:`nvmath.linalg`) -------------------------------------------------- -Generic APIs will be available in a later release. +The generic linear algebra module includes matrix multiplication APIs which accept +structured matrices as input, but do not allow for control over computational precision or +algorithm selection and planning. + +.. autosummary:: + :toctree: generated/ + + matmul + Matmul + matrix_qualifiers_dtype + DiagonalMatrixQualifier + GeneralMatrixQualifier + HermitianMatrixQualifier + InvalidMatmulState + SymmetricMatrixQualifier + TriangularMatrixQualifier + + :template: dataclass.rst + + ExecutionCPU + ExecutionCUDA + MatmulOptions .. module:: nvmath.linalg.advanced Specialized Linear Algebra APIs (:mod:`nvmath.linalg.advanced`) --------------------------------------------------------------- +The specialized linear algebra module includes a matrix multiplication API which only +accepts general matrices, but provides extra functionality such as epilog functions, more +options and controls over computational precision, and control over algorithm selection and +planning. + .. autosummary:: :toctree: generated/ @@ -67,3 +99,7 @@ Matmul helpers (:mod:`nvmath.linalg.advanced.helpers.matmul`) invert_mxfp8_scale apply_mxfp8_scale get_mxfp8_scale_offset + +.. rubric:: Footnotes + +.. [#] See :ref:`Sparse Linear Algebra ` for sparse operations. diff --git a/docs/sphinx/host-apis/sparse/index.rst b/docs/sphinx/host-apis/sparse/index.rst index 932c7ec..bfd1772 100644 --- a/docs/sphinx/host-apis/sparse/index.rst +++ b/docs/sphinx/host-apis/sparse/index.rst @@ -8,8 +8,8 @@ Overview ======== The sparse linear algebra module :mod:`nvmath.sparse` in nvmath-python leverages various -NVIDIA math libraries to support sparse linear algebra computations. 
As of the current Beta -release, we offer the specialized sparse direct solver API based on the `cuDSS library +NVIDIA math libraries to support sparse [#]_ linear algebra computations. As of the current +Beta release, we offer the specialized sparse direct solver API based on the `cuDSS library `_. .. _sparse-api-reference: @@ -50,3 +50,7 @@ Specialized Linear Algebra APIs (:mod:`nvmath.sparse.advanced`) ExecutionCUDA ExecutionHybrid HybridMemoryModeOptions + +.. rubric:: Footnotes + +.. [#] See :ref:`Linear Algebra ` for dense operations. diff --git a/docs/sphinx/host-apis/tensor/index.rst b/docs/sphinx/host-apis/tensor/index.rst new file mode 100644 index 0000000..ffeefbb --- /dev/null +++ b/docs/sphinx/host-apis/tensor/index.rst @@ -0,0 +1,90 @@ +***************** +Tensor Operations +***************** + +.. _tensor-overview: + +Overview +======== + +The tensor module :mod:`nvmath.tensor` in nvmath-python provides APIs for tensor +operations powered by the high-performance NVIDIA cuTENSOR library. We currently +offer binary and ternary contraction APIs supporting the CUDA execution space. + +For contracting a tensor network, refer to the `Network +`_ +API from the cuQuantum library. While network contraction can be used for +binary and ternary contraction, the focus here is on the optimal contraction of a *tensor +network* and therefore not all options pertinent to each pairwise +contraction are available to the user. The generalized binary +:math:`\alpha \; a \cdot b + \beta \; c` and ternary +:math:`\alpha \; a \cdot b \cdot c + \beta \; d` contraction operations +(where :math:`\cdot` represents tensor contraction) +in this module are fused, and support options specific to efficient +execution of these operations. + +.. code-block:: python + + import cupy as cp + from cupyx.profiler import benchmark + + import nvmath + + a = cp.random.rand(64, 8, 8, 6, 6) + b = cp.random.rand(64, 8, 8, 6, 6) + + # Create a stateful BinaryContraction object 'contraction'. + with nvmath.tensor.BinaryContraction("pijkl,pjiab->lakbp", a, b) as contraction: + # Get the handle to the plan preference object + plan_preference = contraction.plan_preference + # update the kernel rank to the third best for the underlying algorithm + plan_preference.kernel_rank = 2 + + for algo in ( + nvmath.tensor.ContractionAlgo.DEFAULT_PATIENT, + nvmath.tensor.ContractionAlgo.GETT, + nvmath.tensor.ContractionAlgo.TGETT, + nvmath.tensor.ContractionAlgo.TTGT, + nvmath.tensor.ContractionAlgo.DEFAULT, + ): + print(f"Algorithm: {algo.name}") + plan_preference.algo = algo + # Plan the Contraction to activate the updated plan preference + contraction.plan() + print(benchmark(contraction.execute, n_repeat=20)) + +More examples of tensor operations can be found on our +`GitHub `_ repository. + +.. _tensor-api-reference: + +Host API Reference +================== + +.. module:: nvmath.tensor + + +Tensor Operations (:mod:`nvmath.tensor`) +---------------------------------------- + +.. 
autosummary:: + :toctree: generated/ + + binary_contraction + ternary_contraction + tensor_qualifiers_dtype + BinaryContraction + TernaryContraction + ContractionAlgo + ContractionAutotuneMode + ContractionJitMode + ContractionCacheMode + ComputeDesc + ContractionPlanPreference + Operator + + :template: dataclass.rst + + ContractionOptions + ExecutionCUDA diff --git a/docs/sphinx/host-apis/utils.rst b/docs/sphinx/host-apis/utils.rst index 2d72f43..0c676d8 100644 --- a/docs/sphinx/host-apis/utils.rst +++ b/docs/sphinx/host-apis/utils.rst @@ -18,6 +18,7 @@ nvmath-python provides host-side APIs for managing device-side memory. :toctree: generated/ BaseCUDAMemoryManager + BaseCUDAMemoryManagerAsync MemoryPointer Data types diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index f02cf3b..a72c073 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -35,21 +35,25 @@ optional dependencies expressible in the standard "extras" bracket notation. The assumes that **CTK components are also installed via pip** (so no extra step from users is needed; the dependencies are pulled via extras). +.. important:: + Using at least one of the ``pip`` extras described below is required for all ``pip`` + installs to ensure that nvmath-python's dependencies are correctly constrained by + ``pip``. + + ``pip install nvmath-python`` (no extras) is a bare installation (very lightweight) for + system admins or expert users. This requires that the user manage of all dependencies. + .. list-table:: :widths: 25 50 :header-rows: 1 * - Command - Description - * - ``pip install nvmath-python[cu11]`` - - Install nvmath-python along with all CUDA 11 optional - dependencies (wheels for cuBLAS/cuFFT/... and CuPy) to support - nvmath host APIs. * - ``pip install nvmath-python[cu12]`` - Install nvmath-python along with all CUDA 12 optional dependencies (wheels for cuBLAS/cuFFT/... and CuPy) to support nvmath host APIs. - * - ``pip install nvmath-python[cu12,dx]`` + * - ``pip install nvmath-python[cu12-dx]`` - Install nvmath-python along with all CUDA 12 optional dependencies (wheels for cuBLAS/cuFFT/..., CuPy, Numba, ...) to support nvmath host & device APIs (which only supports CUDA 12) [8]_. @@ -65,6 +69,10 @@ needed; the dependencies are pulled via extras). provide the path to an alternate shared object which implements the FFTW3 (non-guru) API. Ensure ``LD_LIBRARY_PATH`` includes this library if it is not already in the PATH. + 3. The environment variable ``NVMATH_BLAS_CPU_LIBRARY`` may be used to + provide the path to an alternate shared object which implements the + BLAS ABI. ``LD_LIBRARY_PATH`` should be set properly to + include this library if it is not already in the PATH. * - ``pip install nvmath-python[cu12-distributed]`` - Install nvmath-python along with all MGMN optional dependencies (wheels for mpi4py, @@ -72,7 +80,7 @@ needed; the dependencies are pulled via extras). **Note**: Users must provide an MPI implementation. - * - ``pip install nvmath-python[cu12,dx] 'nvidia-cuda-nvcc-cu12==12.8.*' 'nvidia-cuda-nvrtc-cu12==12.8.*' --extra-index-url https://download.pytorch.org/whl/cu128 torch`` + * - ``pip install nvmath-python[cu12-dx] 'cuda-toolkit[nvcc,nvrtc]==12.8.*' --extra-index-url https://download.pytorch.org/whl/cu128 torch`` - Install nvmath-python along with all CUDA 12 optional dependencies to support nvmath.device APIs and a PyTorch built with CTK 12.8. @@ -93,12 +101,6 @@ themselves. The following assumes that **system CTK is installed**. 
* - Command - Description - * - ``pip install nvmath-python[sysctk11]`` - - Install nvmath-python along with CuPy for CUDA 11 to support - nvmath host APIs. - - **Note**: Set ``LD_LIBRARY_PATH`` to include the CUDA libraries. - * - ``pip install nvmath-python[sysctk12]`` - Install nvmath-python along with CuPy for CUDA 12 to support nvmath host APIs. @@ -122,9 +124,8 @@ themselves. The following assumes that **system CTK is installed**. **Note**: Users must provide an MPI implementation and the required cuMp libraries and dependencies (NVSHMEM, cuFFTMp, ...). -For system admins or expert users, ``pip install nvmath-python`` would be a bare minimal -installation (very lightweight). This allows fully explicit control of all dependencies. - +.. hint:: + To install extras for CUDA 13, use ``13`` in the extra names instead of ``12``. Install from conda ------------------ @@ -137,10 +138,6 @@ Conda packages can be installed from the `conda-forge ` * - Command - Description - * - ``conda install -c conda-forge nvmath-python cuda-version=11`` - - Install nvmath-python along with all CUDA 11 optional - dependencies (packages for cuBLAS/cuFFT/... and CuPy) to support - nvmath host APIs. * - ``conda install -c conda-forge nvmath-python cuda-version=12`` - Install nvmath-python along with all CUDA 12 optional dependencies (packages for cuBLAS/cuFFT/... and CuPy) to support @@ -167,7 +164,11 @@ Conda packages can be installed from the `conda-forge ` library may be substituted for x86 architecture. 3. The environment variable ``NVMATH_FFT_CPU_LIBRARY`` may be used to provide the path to an alternate shared object which implements the - FFTW3 (non-guru) API. ``LD_LIBRARY_PATH`` should be set properly to + FFTW3 (non-guru) API. ``LD_LIBRARY_PATH`` should be set properly to + include this library if it is not already in the PATH. + 4. The environment variable ``NVMATH_BLAS_CPU_LIBRARY`` may be used to + provide the path to an alternate shared object which implements the + BLAS ABI. ``LD_LIBRARY_PATH`` should be set properly to include this library if it is not already in the PATH. * - ``conda install -c conda-forge nvmath-python-distributed`` @@ -225,8 +226,8 @@ source. There are several ways to build it since we need some CUDA headers at bu **Notes**: -- If you add the "extras" notation after the dot ``.`` (for example ``.[cu11]``, - ``.[cu12,dx]``, ...), it has the same meaning as explained in the :ref:`previous section +- If you add the "extras" notation after the dot ``.`` (for example ``.[cpu]``, + ``.[cu12-dx]``, ...), it has the same meaning as explained in the :ref:`previous section `. - If you don't want the run-time dependencies to be automatically handled, add ``--no-deps`` after the ``pip install`` command above; in this case, however, it's your responsibility @@ -276,14 +277,14 @@ dependency is *required* unless stated otherwise. GPU connectivity: :cufftmp_hw:`requirements` * - CUDA driver [2]_ - - - | 450.80.02+ (Linux) / 450.39+ (Windows) with CUDA >=11.2 + - | 525.60.13+ (Linux) / 527.41+ (Windows) with CUDA >=12.0 | - | 525.60.13+ (Linux) / 527.41+ (Windows) with CUDA >=12.0 + | 580+ with CUDA >=13.0 | | *Optional*: needed if the execution space is GPU or for loading any CUDA library. 
- - 525.60.13+ (Linux) with CUDA 12.x - - 525.60.13+ (Linux) with CUDA 12.x - - 525.60.13+ (Linux) with CUDA 12.x + - 525.60.13+ (Linux) with CUDA >=12.0 + - 525.60.13+ (Linux) with CUDA >=12.0 + - 525.60.13+ (Linux) with CUDA >=12.0 * - Python - 3.10-3.13 - 3.10-3.13 @@ -315,15 +316,15 @@ dependency is *required* unless stated otherwise. - - * - CUDA - - | CUDA >=11.2 + - | CUDA >=12.0 | (only need headers from NVCC & CUDART [6]_) - - | CUDA >=11.2 + - | CUDA >=12.0 | | *Optional*: depending on the math operations in use - | CUDA >=12.0,!=12.4.*,!=12.5.0 [7]_ | (NVRTC, NVVM, CCCL [8]_, CUDART) - - CUDA 12.x - - CUDA 12.x + - CUDA >=12.0 + - CUDA >=12.0 * - cuda-pathfinder - - >=1.2.1 @@ -345,17 +346,17 @@ dependency is *required* unless stated otherwise. * - | CuPy | (see `CuPy installation guide `_) - - - >=10.0.0 [4]_ + - >=12.1 [4]_ - - - >=10.0.0 [4]_ - - >=10.0.0 [4]_ + - >=12.1 [4]_ + - >=12.1 [4]_ * - | PyTorch | (see `PyTorch installation guide `_) - - - >=1.10 (optional) [10]_ + - >=1.12 (optional) [10]_ - - - >=1.10 (optional) - - >=1.10 (optional) + - >=1.12 (optional) + - >=1.12 (optional) * - libmathdx (cuBLASDx, cuFFTDx, ...) - - @@ -387,16 +388,13 @@ Test Configuration nvmath-python is tested in the following environments: -.. TODO: - Update me - .. list-table:: :widths: 50 50 * - CUDA - - 11.x (latest), 12.0, 12.8 + - 12.0, 12.9, 13.0 * - Driver - - R520, R525, R570 + - R525, R575, R580 * - GPU model - H100, B200, RTX 4090, CG1 (Grace-Hopper) * - Python @@ -453,9 +451,9 @@ See the ``examples`` directory in the repo. Currently we have: Tests ..... -The ``requirements/pip/tests.txt`` file lists dependencies required for ``pip``-controlled -environments to run tests. These requirements are installed via the main -``requirements/pip-dev-.txt`` files. +The ``pyproject.toml`` file lists dependencies required for ``pip``-controlled +environments to run tests. These requirements are installed via the ``dev`` dependency +group. e.g. ``pip install --group dev`` Running functionality tests @@ -514,8 +512,8 @@ For ``pip``-users, there are known limitations (many of which are nicely capture tools. For a complex library such as nvmath-python that interacts with many native libraries, there are user-visible caveats. -1. Be sure that there are no packages with both ``-cu11`` (for CUDA 11) and ``-cu12`` (for - CUDA 12) suffices coexisting in your Python environment. For example, this is a corrupted +1. Be sure that there are no packages with both ``-cu12`` (for CUDA 12) and ``-cu13`` (for + CUDA 13) suffixes coexisting in your Python environment. For example, this is a corrupted environment: .. code-block:: bash @@ -523,8 +521,8 @@ libraries, there are user-visible caveats. $ pip list Package Version ------------------ --------- - nvidia-cublas-cu11 11.11.3.6 nvidia-cublas-cu12 12.5.2.13 + nvidia-cublas 13.0.2.14 pip 24.0 setuptools 70.0.0 wheel 0.43.0 @@ -563,45 +561,45 @@ but mix-n-match makes the detection logic impossible to get right. To help you perform an integrity check, the rule of thumb is that every single package should only come from one place (either ``pip``, or ``conda``, or local system). For -example, if both ``nvidia-cufft-cu11`` (which is from ``pip``) and ``libcufft`` (from +example, if both ``nvidia-cufft-cu12`` (which is from ``pip``) and ``libcufft`` (from ``conda``) appear in the output of ``conda list``, something is almost certainly wrong. 
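+
+One quick, non-authoritative check (``conda list`` also reports ``pip``-installed
+packages) is to grep for the CUDA math libraries coming from either source:
+
+.. code-block:: bash
+
+    # Wheel-style names (nvidia-*) and conda-forge names (libcublas, libcufft, ...)
+    # showing up together for the same library indicate a mixed environment.
+    conda list | grep -i -E 'nvidia-|libcublas|libcufft|libcusolver|libcusparse|libcurand'
+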
-Below is the package name mapping between ``pip`` and ``conda``, with ``XX={11,12}`` +Below is the package name mapping between ``pip`` and ``conda``, with ``XX=12`` denoting CUDA's major version: .. list-table:: :widths: 50 50 50 :header-rows: 1 - * - pip + * - pip (``cuda-toolkit==12``) + - pip (``cuda-toolkit>=13``) - conda (``cuda-version>=12``) - - conda (``cuda-version<12``) * - ``nvidia-cuda-nvcc-cuXX`` + - ``nvidia-cuda-nvcc`` - ``cuda-nvcc`` - - n/a * - ``nvidia-cuda-nvrtc-cuXX`` + - ``nvidia-cuda-nvrtc`` - ``cuda-nvrtc`` - - ``cudatoolkit`` * - ``nvidia-cuda-runtime-cuXX`` + - ``nvidia-cuda-runtime`` - ``cuda-cudart-dev`` - - ``cudatoolkit`` * - ``nvidia-cuda-cccl-cuXX`` + - ``nvidia-cuda-cccl`` - ``cuda-cccl`` - - n/a * - ``nvidia-cublas-cuXX`` + - ``nvidia-cublas`` - ``libcublas`` - - ``cudatoolkit`` * - ``nvidia-cusolver-cuXX`` + - ``nvidia-cusolver`` - ``libcusolver`` - - ``cudatoolkit`` * - ``nvidia-cusparse-cuXX`` + - ``nvidia-cusparse`` - ``libcusparse`` - - ``cudatoolkit`` * - ``nvidia-cufft-cuXX`` + - ``nvidia-cufft`` - ``libcufft`` - - ``cudatoolkit`` * - ``nvidia-curand-cuXX`` + - ``nvidia-curand`` - ``libcurand`` - - ``cudatoolkit`` Note that system packages (by design) do not show up in the output of ``conda list`` or ``pip list``. Linux users should check the installation list from your distro package @@ -619,11 +617,10 @@ For more information with regard to the new CUDA 12+ package layout on conda-for .. [2] nvmath-python relies on `CUDA minor version compatibility `_. -.. [4] As of Beta 6.0 (v0.6.0), CuPy is an optional run-time dependency. It is included in - cuda (cu11, cu12) and dx extras/meta-packages. In a future release it may be removed - from extras/meta-packages. -.. [5] For example, Hopper GPUs are supported starting CUDA 11.8, so they would not work - with libraries from CUDA 11.7 or below. +.. [4] As of Beta 7.0 (v0.7.0), CuPy is an optional run-time dependency. It is not included in + the extras/meta-packages, and must be installed separately if desired. +.. [5] For example, Blackwell GPUs are supported starting CUDA 12.8, so they would not work + with libraries from CUDA 12.6 or below (There is no CUDA 12.7). .. [6] While we need some CUDA headers at build time, there is no limitation in the CUDA version seen at build time. .. [7] These versions are not supported due to a known compiler bug; the ``[dx]`` extras diff --git a/docs/sphinx/overview.rst b/docs/sphinx/overview.rst index c6d4d9c..25147c6 100644 --- a/docs/sphinx/overview.rst +++ b/docs/sphinx/overview.rst @@ -4,8 +4,8 @@ Overview ******** The primary goal of nvmath-python is to bring the power of the NVIDIA math libraries to the -Python ecosystem. The package aims to provide intuitive Pythonic APIs that provide users -full access to all the features offered by our libraries in a variety of execution spaces. +Python ecosystem. The package aims to provide intuitive Pythonic APIs giving users full +access to all the features offered by our libraries in a variety of execution spaces. We hope to empower a wide range of Python users by providing easy access to high-performance core math operations such as FFT, dense and sparse linear algebra, and more. This includes @@ -28,19 +28,24 @@ The APIs provided by nvmath-python can be categorized into: The nvmath-python library is dedicated to delivering the following key features and commitments: -1. **Logical Feature Parity**: While the Pythonic API surface (the number of APIs and the +1. 
**Interoperability with array and tensor libraries**: Instead of providing a native array + or tensor data structure, nvmath-python provides seamless interoperability with + widely-used array libraries such as NumPy, CuPy, and PyTorch, through APIs compatible + with their data representations. nvmath-python should not be regarded as a replacement, + but rather as a complementary tool to these libraries. +2. **Logical Feature Parity**: While the Pythonic API surface (the number of APIs and the complexity of each) is more concise compared to that of the C libraries, it provides access to their complete functionality. -2. **Consistent Design Patterns**: Uniform design across all modules to simplify user +3. **Consistent Design Patterns**: Uniform design across all modules to simplify user experience. -3. **Transparency and Explicitness**: Avoiding implicit, costly operations such as copying +4. **Transparency and Explicitness**: Avoiding implicit, costly operations such as copying data across the same memory space, automatic type promotion, and alterations to the user environment or state (current device, current stream, etc.). This allows users to perform the required conversion once for use in all subsequent operations instead of incurring hidden costs on each call. -4. **Clear, Actionable Error Messages**: Ensuring that errors are informative and helpful in +5. **Clear, Actionable Error Messages**: Ensuring that errors are informative and helpful in resolving the problem. -5. **DRY Principle Compliance**: Automatically utilizing available information such as the +6. **DRY Principle Compliance**: Automatically utilizing available information such as the current stream and memory pool to avoid redundant specification ("don't repeat yourself"). @@ -89,8 +94,12 @@ categories: - Fast Fourier Transform in :mod:`nvmath.fft`. Refer to :doc:`Fast Fourier Transform ` for details. -- Linear Algebra in :mod:`nvmath.linalg`. Refer to :doc:`Linear Algebra +- Dense Linear Algebra in :mod:`nvmath.linalg`. Refer to :doc:`Linear Algebra ` for details. +- Sparse Linear Algebra in :mod:`nvmath.sparse`. Refer to :doc:`Sparse Linear Algebra + ` for details. +- Tensor Algebra in :mod:`nvmath.tensor`. Refer to :doc:`Tensor Operations + ` for details. .. _host api interop: @@ -377,8 +386,8 @@ We currently offer support for calling FFT, matrix multiplication, and random nu generation APIs in kernels written using `Numba`_, with plans to offer more core operations and support other compilers in the future. The design of the device APIs closely mimics that of the C++ APIs from the corresponding NVIDIA Math Libraries (MathDx libraries `cuFFTDx -`_ and `cuBLASDx -`_ for FFT and matrix multiplication, and +`_ and `cuBLASDx +`_ for FFT and matrix multiplication, and `cuRAND device APIs `_ for random number generation). @@ -394,7 +403,7 @@ considerations, we strive to meet the following commitments: 1. For the :doc:`low-level Python bindings `, * if the library to be bound is part of CUDA Toolkit, we support the library from the - most recent two CUDA major versions (currently CUDA 11/12) + most recent two CUDA major versions (currently CUDA 12/13) * otherwise, we support the library within its major version Note that all bindings are currently *experimental*. diff --git a/docs/sphinx/quickstart.rst b/docs/sphinx/quickstart.rst index 6a6dd43..82376b8 100644 --- a/docs/sphinx/quickstart.rst +++ b/docs/sphinx/quickstart.rst @@ -16,7 +16,11 @@ To quickly install nvmath-python just run the following command: .. 
code-block:: bash - pip install nvmath-python[cu12,dx] + pip install nvmath-python[cu12-dx] + +.. important:: + Using at least one of the ``pip`` extras is required for all ``pip`` installs to + ensure that nvmath-python's dependencies are correctly constrained. For more details visit the :doc:`Installation Guide`. @@ -141,6 +145,8 @@ In this example, we'll use ... def kernel(states): ... i = cuda.grid(1) ... random_values = random.normal2(states[i]) + >>> + >>> kernel[blocks, threads](states) To learn more about this and other Device APIs, visit the documentation of :mod:`nvmath.device`. diff --git a/docs/sphinx/release-notes.rst b/docs/sphinx/release-notes.rst index 14991ad..8df8d5a 100644 --- a/docs/sphinx/release-notes.rst +++ b/docs/sphinx/release-notes.rst @@ -1,6 +1,50 @@ nvmath-python Release Notes *************************** +nvmath-python v0.7.0 +==================== + +Beta7 release. + +* This release supports CUDA 12 and CUDA 13. Support for CUDA 11 has been dropped. +* New binary and ternary tensor contraction host APIs on GPU. +* New generic host Matmul APIs that support dense and structured matrices (such as + triangular and diagonal) on GPU and CPU. +* New distributed Matmul APIs to run on multi-node/multi-GPU systems. +* Support for 64-bit integer indexing for the sparse direct solver. +* The FFT and Matmul device APIs are now implicitly linked in kernels and the + ``link=`` argument to :func:`numba.cuda.jit` is no longer needed. +* The device APIs now use custom types that lower to NumPy (host) or Numba (device) + types. As a result of this, :attr:`nvmath.device.FFT.value_type` and + :attr:`nvmath.device.Matmul.value_type` return NumPy types. + +Bugs Fixed +---------- + +* `nvmath-python/#47 `_ + Fixed a "key error" bug that prevented use of complex-to-real double precision + distributed FFT. +* `cuda-python/#852 `_ + An internal symbol table used when loading symbols from libraries was made + thread-safe. + +Breaking Changes +---------------- + +* :func:`nvmath.distributed.initialize` now requires the ``backends`` argument, which + was introduced to support more than one communication backend (NVSHMEM, NCCL, ...). +* The ``code_type`` argument was replaced by the ``sm`` argument in + :class:`nvmath.device.FFT` and :class:`nvmath.device.Matmul`. + +Deprecations +------------ + +* The :func:`nvmath.device.fft` and :func:`nvmath.device.matmul` utility functions + are deprecated. Use :class:`nvmath.device.FFT` and :class:`nvmath.device.Matmul` instead. +* The Slab distribution has moved to :mod:`nvmath.distributed.distribution` and + :attr:`nvmath.distributed.fft.Slab` will be removed in the future. Use + :class:`nvmath.distributed.distribution.Slab` instead. 
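+
+A minimal migration sketch covering both deprecations (the commented-out imports are
+the deprecated spellings noted above):
+
+.. code-block:: python
+
+    # Deprecated:
+    #   from nvmath.device import fft, matmul
+    #   from nvmath.distributed.fft import Slab
+    # Preferred:
+    from nvmath.device import FFT, Matmul
+    from nvmath.distributed.distribution import Slab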
+ nvmath-python v0.6.0 ==================== diff --git a/examples/_bindings/mathdx/cublasdx_tensor.py b/examples/_bindings/mathdx/cublasdx_tensor.py index bd9541f..b5abec8 100644 --- a/examples/_bindings/mathdx/cublasdx_tensor.py +++ b/examples/_bindings/mathdx/cublasdx_tensor.py @@ -44,9 +44,9 @@ mathdx.cublasdx_set_option_str(h, mathdx.CommondxOption.SYMBOL_NAME, "matmul") # Define the input and output tensors -smem_a = mathdx.cublasdx_bind_tensor(h, mathdx.CublasdxTensorType.SUGGESTED_SMEM_A) -smem_b = mathdx.cublasdx_bind_tensor(h, mathdx.CublasdxTensorType.SUGGESTED_SMEM_B) -rmem_c = mathdx.cublasdx_bind_tensor(h, mathdx.CublasdxTensorType.SUGGESTED_RMEM_C) +smem_a = mathdx.cublasdx_create_tensor(h, mathdx.CublasdxTensorType.SUGGESTED_SMEM_A) +smem_b = mathdx.cublasdx_create_tensor(h, mathdx.CublasdxTensorType.SUGGESTED_SMEM_B) +rmem_c = mathdx.cublasdx_create_tensor(h, mathdx.CublasdxTensorType.SUGGESTED_RMEM_C) tensors = [smem_a, smem_b, rmem_c] mathdx.cublasdx_finalize_tensors(h, len(tensors), tensors) @@ -63,7 +63,7 @@ print(f"Tensor {t}: name {name}, storage size {size}B, alignment {alignment}B, uid {uid}") # Define a function operating on those input and output tensors -gemm_sa_sb_rc = mathdx.cublasdx_bind_device_function(h, mathdx.CublasdxDeviceFunctionType.EXECUTE, len(tensors), tensors) +gemm_sa_sb_rc = mathdx.cublasdx_create_device_function(h, mathdx.CublasdxDeviceFunctionType.EXECUTE, len(tensors), tensors) mangled_name_size = mathdx.cublasdx_get_device_function_trait_str_size(gemm_sa_sb_rc, mathdx.CublasdxDeviceFunctionTrait.SYMBOL) mangled_name = bytearray(mangled_name_size) mangled_name_size = mathdx.cublasdx_get_device_function_trait_str( @@ -86,6 +86,9 @@ print(f"Generated LTOIR for gemm device function, {lto_size} bytes at ..") +for t in tensors: + mathdx.cublasdx_destroy_tensor(t) + mathdx.commondx_destroy_code(code) # TODO: destroy update in original example (cpp) mathdx.cublasdx_destroy_descriptor(h) diff --git a/examples/_bindings/mathdx/cusolverdx_portf.py b/examples/_bindings/mathdx/cusolverdx_portf.py index 0d62da9..7d79c3e 100644 --- a/examples/_bindings/mathdx/cusolverdx_portf.py +++ b/examples/_bindings/mathdx/cusolverdx_portf.py @@ -4,12 +4,13 @@ from nvmath.bindings import mathdx -size = [64, 64] +size = [64] block_dim = [256, 1, 1] +arch = 800 h = mathdx.cusolverdx_create_descriptor() -mathdx.cusolverdx_set_operator_int64s(h, mathdx.CusolverdxOperatorType.SIZE, 2, size) +mathdx.cusolverdx_set_operator_int64s(h, mathdx.CusolverdxOperatorType.SIZE, len(size), size) mathdx.cusolverdx_set_operator_int64s(h, mathdx.CusolverdxOperatorType.BLOCK_DIM, 3, block_dim) mathdx.cusolverdx_set_operator_int64(h, mathdx.CusolverdxOperatorType.TYPE, mathdx.CusolverdxType.REAL) mathdx.cusolverdx_set_operator_int64(h, mathdx.CusolverdxOperatorType.API, mathdx.CusolverdxApi.SMEM) @@ -17,27 +18,26 @@ mathdx.cusolverdx_set_operator_int64(h, mathdx.CusolverdxOperatorType.EXECUTION, mathdx.CommondxExecution.BLOCK) mathdx.cusolverdx_set_operator_int64(h, mathdx.CusolverdxOperatorType.PRECISION, mathdx.CommondxPrecision.F64) mathdx.cusolverdx_set_operator_int64(h, mathdx.CusolverdxOperatorType.FILL_MODE, mathdx.CusolverdxFillMode.LOWER) -mathdx.cusolverdx_set_operator_int64(h, mathdx.CusolverdxOperatorType.SM, 800) +mathdx.cusolverdx_set_operator_int64(h, mathdx.CusolverdxOperatorType.SM, arch) mathdx.cusolverdx_set_option_str(h, mathdx.CommondxOption.SYMBOL_NAME, "my_solver") -lto_size = mathdx.cusolverdx_get_ltoir_size(h) -print(f"lto size: {lto_size}") - +# Compile the device 
function to lto_90 +code = mathdx.commondx_create_code() +mathdx.commondx_set_code_option_int64(code, mathdx.CommondxOption.TARGET_SM, arch) +mathdx.cusolverdx_finalize_code(code, h) +lto_size = mathdx.commondx_get_code_ltoir_size(code) lto = bytearray(lto_size) -mathdx.cusolverdx_get_ltoir(h, lto_size, lto) +mathdx.commondx_get_code_ltoir(code, lto_size, lto) +mathdx.commondx_destroy_code(code) -fatbin_size = mathdx.cusolverdx_get_universal_fatbin_size(h) -print(f"fatbin size: {fatbin_size}") +print(f"Generated LTOIR ({lto_size} bytes) for POTRF solver with size: {size}") +fatbin_size = mathdx.cusolverdx_get_universal_fatbin_size(h) fatbin = bytearray(fatbin_size) mathdx.cusolverdx_get_universal_fatbin(h, fatbin_size, fatbin) -print( - f"Successfully generated LTOIR, {lto_size} Bytes for POTRF of size {size}, with {fatbin_size} bytes of universal fatbin\n" -) +print(f"Successfully generated LTOIR, {lto_size} Bytes for POTRF of size {size}, with {fatbin_size} bytes of universal fatbin") shared_memory_size = mathdx.cusolverdx_get_trait_int64(h, mathdx.CusolverdxTraitType.SHARED_MEMORY_SIZE) - -print(f"Function requires {shared_memory_size} B of shared memory\n") - +print(f"Function requires {shared_memory_size} B of shared memory") mathdx.cusolverdx_destroy_descriptor(h) diff --git a/examples/device/cublasdx_batched_gemm_fp64.py b/examples/device/cublasdx_batched_gemm_fp64.py index f81d20e..58827be 100644 --- a/examples/device/cublasdx_batched_gemm_fp64.py +++ b/examples/device/cublasdx_batched_gemm_fp64.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_real from common_numba import load_to_shared_batched, store_from_shared_batched @@ -18,13 +18,12 @@ def main(): block_size = 64 batches = 2 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=np.float64, data_type="real", arrangement=("row_major", "col_major", "col_major"), execution="Block", - compiler="numba", block_size=block_size, ) @@ -34,7 +33,7 @@ def main(): b_size_batched = batches * MM.b_size c_size_batched = batches * MM.c_size - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): bid = cuda.threadIdx.y diff --git a/examples/device/cublasdx_blockdim_gemm_fp16.py b/examples/device/cublasdx_blockdim_gemm_fp16.py index 77adba5..eb01940 100644 --- a/examples/device/cublasdx_blockdim_gemm_fp16.py +++ b/examples/device/cublasdx_blockdim_gemm_fp16.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul, Dim3 +from nvmath.device import Matmul, Dim3 from common import random_real from common_numba import load_to_shared, store_from_shared @@ -31,17 +31,16 @@ def main(): for scenario, (blas_block_dim, kernel_block_dim) in enumerate(zip(blas_block_dims, kernel_block_dims, strict=True)): print(f"Scenario with BLAS dim {blas_block_dim} and kernel dim {kernel_block_dim}") - MM = matmul( + MM = Matmul( size=(m, n, k), precision=precision, data_type="real", arrangement=("row_major", "col_major", "col_major"), execution="Block", - compiler="numba", block_dim=blas_block_dim, ) - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): smem_a = cuda.shared.array(shape=MM.a_size, dtype=MM.a_value_type) smem_b = cuda.shared.array(shape=MM.b_size, dtype=MM.b_value_type) diff --git a/examples/device/cublasdx_device_gemm_performance.py b/examples/device/cublasdx_device_gemm_performance.py index a4bfd0d..6a42ad0 100644 --- 
a/examples/device/cublasdx_device_gemm_performance.py +++ b/examples/device/cublasdx_device_gemm_performance.py @@ -11,7 +11,7 @@ from numba import cuda from common_cupy import time_cupy from common_numba import time_numba -from nvmath.device import matmul +from nvmath.device import Matmul from nvmath.device.cublasdx import MAX_ALIGNMENT, SharedStorageCalc from nvmath.linalg.advanced import Matmul as CublasltMatmul from common import random_real @@ -42,7 +42,7 @@ def main(): assert n % tile_n == 0 assert k % tile_k == 0 - MM = matmul( + MM = Matmul( size=(tile_m, tile_n, tile_k), precision=(precision, precision, precision), data_type=data_type, @@ -50,11 +50,7 @@ def main(): execution="Block", block_size=block_size, alignment=alignment, - global_memory_alignment=alignment, static_block_dim=True, - compiler="numba", - execute_api="tensors", - tensor_types=("suggested_smem_a", "suggested_smem_b", "suggested_rmem_c"), ) a_size = MM.suggest_layout_smem_a().cosize @@ -66,7 +62,7 @@ def main(): assert a_size * np.dtype(precision).itemsize % alignment.a == 0 assert b_size * np.dtype(precision).itemsize % alignment.b == 0 - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): block_m = cuda.blockIdx.x block_n = cuda.blockIdx.y @@ -115,8 +111,8 @@ def f(a, b, c, alpha, beta, output): gmem_a = make_tensor(a_tile, MM.get_layout_gmem_a(m)) gmem_b = make_tensor(b_tile, MM.get_layout_gmem_b(k)) - copy(gmem_a, smem_a) - copy(gmem_b, smem_b) + copy(gmem_a, smem_a, alignment=16) + copy(gmem_b, smem_b, alignment=16) # 4. EXECUTE GEMM WITH ACCUMULATION IN REGISTERS @@ -136,8 +132,8 @@ def f(a, b, c, alpha, beta, output): gmem_a = make_tensor(a_tile, MM.get_layout_gmem_a(m)) gmem_b = make_tensor(b_tile, MM.get_layout_gmem_b(k)) - copy(gmem_a, smem_a_n) - copy(gmem_b, smem_b_n) + copy(gmem_a, smem_a_n, alignment=16) + copy(gmem_b, smem_b_n, alignment=16) # Accumulate results from this stage MM.execute(smem_a, smem_b, rmem_c) @@ -154,9 +150,9 @@ def f(a, b, c, alpha, beta, output): rmem_c_out_buff = cuda.local.array(shape=(c_size,), dtype=MM.c_value_type) rmem_c_out = make_tensor(rmem_c_out_buff, MM.suggest_layout_rmem_c()) - copy_fragment(gmem_c, rmem_c_out) + copy_fragment(gmem_c, rmem_c_out, alignment=16) axpby(alpha, rmem_c, beta, rmem_c_out) - copy_fragment(rmem_c_out, gmem_output) + copy_fragment(rmem_c_out, gmem_output, alignment=16) a = random_real((m, k), precision, order="F") b = random_real((k, n), precision, order="F") diff --git a/examples/device/cublasdx_fp64_emulation.py b/examples/device/cublasdx_fp64_emulation.py index 34c9b88..81e1150 100644 --- a/examples/device/cublasdx_fp64_emulation.py +++ b/examples/device/cublasdx_fp64_emulation.py @@ -23,12 +23,12 @@ import numpy as np from numba import int32, int8, int16, float64, int64, types from numba.types import Tuple -import cuda.cccl.cooperative.experimental as cudax +from cuda import coop from common import mm_perf_GFlops, random_real from common_numba import time_numba -from nvmath.device import matmul -from nvmath.device.cublasdx import MAX_ALIGNMENT, BlasOptionsComplete, SharedStorageCalc +from nvmath.device import Matmul +from nvmath.device.cublasdx import MAX_ALIGNMENT, SharedStorageCalc from nvmath.device.common import ( clear, copy, @@ -133,7 +133,7 @@ def build_split_kernel(k, threads, splits=7, order="C"): def op_max(a, b): return a if a > b else b - block_reduce = cudax.block.reduce(int16, threads, op_max) + block_reduce = coop.block.reduce(int16, threads, op_max) items_per_thread = (k + threads - 1) // threads @@ 
-189,8 +189,8 @@ def split_kernel(a, splits, column_exponents): return split_kernel -def matmul_specification(tile_m, tile_n, tile_k, block_size, alignment) -> BlasOptionsComplete: - return matmul( +def matmul_specification(tile_m, tile_n, tile_k, block_size, alignment) -> Matmul: + return Matmul( size=(tile_m, tile_n, tile_k), precision=(np.int8, np.int8, np.int32), data_type="real", @@ -198,15 +198,11 @@ def matmul_specification(tile_m, tile_n, tile_k, block_size, alignment) -> BlasO execution="Block", block_size=block_size, alignment=alignment, - global_memory_alignment=alignment, static_block_dim=True, - compiler="numba", - execute_api="tensors", - tensor_types=("suggested_smem_a", "suggested_smem_b", "suggested_rmem_c"), ) -def build_single_matmul(m: int, n: int, k: int, MM: BlasOptionsComplete): +def build_single_matmul(m: int, n: int, k: int, MM: Matmul): tile_m, tile_n, tile_k = MM.size assert m % tile_m == 0 @@ -215,7 +211,7 @@ def build_single_matmul(m: int, n: int, k: int, MM: BlasOptionsComplete): grid_dim = Dim3(m // tile_m, n // tile_n, 1) - @cuda.jit(link=MM.files, device=True, forceinline=True) + @cuda.jit(device=True, forceinline=True) def matmul_func(a, b, smem_a, smem_b, smem_a_n, smem_b_n, rmem_c): block_m = cuda.blockIdx.x block_n = cuda.blockIdx.y @@ -233,8 +229,8 @@ def matmul_func(a, b, smem_a, smem_b, smem_a_n, smem_b_n, rmem_c): gmem_a = make_tensor(a_tile, MM.get_layout_gmem_a(k)) gmem_b = make_tensor(b_tile, MM.get_layout_gmem_b(k)) - copy(gmem_a, smem_a) - copy(gmem_b, smem_b) + copy(gmem_a, smem_a, alignment=16) + copy(gmem_b, smem_b, alignment=16) # 3. EXECUTE GEMM WITH ACCUMULATION IN REGISTERS for stage in range(1, stages): @@ -248,8 +244,8 @@ def matmul_func(a, b, smem_a, smem_b, smem_a_n, smem_b_n, rmem_c): gmem_a = make_tensor(a_tile, MM.get_layout_gmem_a(k)) gmem_b = make_tensor(b_tile, MM.get_layout_gmem_b(k)) - copy(gmem_a, smem_a_n) - copy(gmem_b, smem_b_n) + copy(gmem_a, smem_a_n, alignment=16) + copy(gmem_b, smem_b_n, alignment=16) # Accumulate results from this stage MM.execute(smem_a, smem_b, rmem_c) @@ -290,7 +286,7 @@ def build_looped_matmul( b_size = MM.suggest_layout_smem_b().cosize c_size = MM.suggest_layout_rmem_c().cosize - @cuda.jit(link=MM.files, device=device, forceinline=device) + @cuda.jit(device=device, forceinline=device) def matmul_kernel(a_split, b_split, output): block_m = cuda.blockIdx.x block_n = cuda.blockIdx.y @@ -351,7 +347,7 @@ def matmul_kernel(a_split, b_split, output): 0, ] gmem_output = make_tensor(output_tile, MM.get_layout_gmem_c(m)) - copy_fragment(rmem_c_out1, gmem_output) + copy_fragment(rmem_c_out1, gmem_output, alignment=16) output_tile = output[ block_m * tile_m : (block_m + 1) * tile_m, @@ -359,7 +355,7 @@ def matmul_kernel(a_split, b_split, output): 1, ] gmem_output = make_tensor(output_tile, MM.get_layout_gmem_c(m)) - copy_fragment(rmem_c_out2, gmem_output) + copy_fragment(rmem_c_out2, gmem_output, alignment=16) smem_calc = SharedStorageCalc() itemsize = np.dtype(np.int8).itemsize @@ -369,7 +365,7 @@ def matmul_kernel(a_split, b_split, output): smem_calc.add(MM.alignment.b, itemsize, MM.suggest_layout_smem_b()) shared_memory_size = smem_calc.get() - return matmul_kernel, grid_dim, MM.block_dim, shared_memory_size, MM.files + return matmul_kernel, grid_dim, MM.block_dim, shared_memory_size @cuda.jit(float64(int16, int64), device=True, forceinline=True, cache=CUDA_CACHE) @@ -463,7 +459,7 @@ def main(m, n, k, tile_m, tile_n, tile_k, block_size, run_perf=True): split_a_kernel = build_split_kernel(k, 
split_block_size, splits=splits, order="C") split_b_kernel = build_split_kernel(k, split_block_size, splits=splits, order="F") - cumulative_matmul, grid_dim, block_dim, shared_memory_size, files = build_looped_matmul( + cumulative_matmul, grid_dim, block_dim, shared_memory_size = build_looped_matmul( m, n, k, @@ -479,7 +475,7 @@ def main(m, n, k, tile_m, tile_n, tile_k, block_size, run_perf=True): assert block_dim[1] == 1 and block_dim[2] == 1 compose_kernel = build_compose_kernel((tile_m, tile_n), block_dim[0], exp_shift, device=True) - @cuda.jit(link=files) + @cuda.jit def fused_kernel(alpha, a_split_d, b_split_d, m_o_d, max_e_a_d, max_e_b_d, beta, c, o_d): cumulative_matmul(a_split_d, b_split_d, m_o_d) cuda.syncthreads() diff --git a/examples/device/cublasdx_fused_gemm_performance.py b/examples/device/cublasdx_fused_gemm_performance.py index b1ec773..a0e71c3 100644 --- a/examples/device/cublasdx_fused_gemm_performance.py +++ b/examples/device/cublasdx_fused_gemm_performance.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_complex from common_numba import set_max_dynamic_shared_size_bytes, load_to_shared, store_from_shared, time_numba @@ -30,12 +30,11 @@ def main(): "arrangement": ("col_major", "col_major", "col_major"), "execution": "Block", "block_size": block_size, - "compiler": "numba", } - MM1 = matmul(size=(m1, n1, k1), **kwargs) + MM1 = Matmul(size=(m1, n1, k1), **kwargs) - MM2 = matmul(size=(m2, n2, k2), **kwargs) + MM2 = Matmul(size=(m2, n2, k2), **kwargs) value_type = MM1.a_value_type # all value types are the same @@ -62,7 +61,7 @@ def main(): assert MM1.c_size == MM2.a_size assert MM1.leading_dimension.c == MM2.leading_dimension.a - @cuda.jit(link=MM1.files) + @cuda.jit def kernel(alpha1, a, b, beta1, c, alpha2, d, beta2, f, output): smem = cuda.shared.array(shape=(0,), dtype=value_type) diff --git a/examples/device/cublasdx_gemm_fft.py b/examples/device/cublasdx_gemm_fft.py index f26b054..77d08c2 100644 --- a/examples/device/cublasdx_gemm_fft.py +++ b/examples/device/cublasdx_gemm_fft.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul, fft +from nvmath.device import Matmul, FFT from common import random_complex from common_numba import load_to_shared @@ -16,7 +16,7 @@ def main(): m, n, k = 8, 8, 8 - FFT = fft( + fft = FFT( fft_type="c2c", size=m * n, precision=np.float32, @@ -24,71 +24,63 @@ def main(): elements_per_thread=2, ffts_per_block=1, execution="Block", - compiler="numba", ) - MM = matmul( + mm = Matmul( size=(m, n, k), precision=np.float32, data_type="complex", arrangement=("col_major", "col_major", "col_major"), execution="Block", - block_dim=FFT.block_dim, - compiler="numba", + block_dim=fft.block_dim, ) - elements_per_thread = FFT.elements_per_thread - ffts_per_block = FFT.ffts_per_block - complex_type = FFT.value_type - storage_size = FFT.storage_size - stride = FFT.stride + shared_memory_size = max(mm.get_shared_storage_size(), fft.shared_memory_size) - shared_memory_size = max(MM.get_shared_storage_size(), FFT.shared_memory_size) - - @cuda.jit(link=MM.files + FFT.files) + @cuda.jit def kernel(a, b, c, alpha, beta, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=complex_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=complex_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) 
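+        # The dynamically allocated shared memory (shape=(0,)) is reused by both the
+        # cuBLASDx and cuFFTDx calls below; its size is supplied at kernel launch.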
local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x smem_a = shared_mem[0:] - smem_b = shared_mem[MM.a_size :] - smem_c = shared_mem[MM.a_size + MM.b_size :] - [lda, ldb, ldc] = MM.leading_dimension + smem_b = shared_mem[mm.a_size :] + smem_c = shared_mem[mm.a_size + mm.b_size :] + [lda, ldb, ldc] = mm.leading_dimension - load_to_shared(a, smem_a, MM.a_dim, lda) - load_to_shared(b, smem_b, MM.b_dim, ldb) - load_to_shared(c, smem_c, MM.c_dim, ldc) + load_to_shared(a, smem_a, mm.a_dim, lda) + load_to_shared(b, smem_b, mm.b_dim, ldb) + load_to_shared(c, smem_c, mm.c_dim, ldc) cuda.syncthreads() # smem_a is 8 * 8 # smem_b is 8 * 8 # -> smem_c is 8 * 8 = 64 elements - MM.execute(alpha, smem_a, smem_b, beta, smem_c) + mm.execute(alpha, smem_a, smem_b, beta, smem_c) cuda.syncthreads() - index = local_fft_id * ffts_per_block + cuda.threadIdx.x - for i in range(elements_per_thread): + index = local_fft_id * fft.ffts_per_block + cuda.threadIdx.x + for i in range(fft.elements_per_thread): thread_data[i] = smem_c[index] - index += stride + index += fft.stride cuda.syncthreads() - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) - index = local_fft_id * ffts_per_block + cuda.threadIdx.x - for i in range(elements_per_thread): + index = local_fft_id * fft.ffts_per_block + cuda.threadIdx.x + for i in range(fft.elements_per_thread): output[index] = thread_data[i] - index += stride + index += fft.stride - a = random_complex(MM.a_dim, np.float32) - b = random_complex(MM.b_dim, np.float32) - c = random_complex(MM.c_dim, np.float32) - o = np.zeros((MM.c_dim[0] * MM.c_dim[1],), dtype=np.complex64) + a = random_complex(mm.a_dim, np.float32) + b = random_complex(mm.b_dim, np.float32) + c = random_complex(mm.c_dim, np.float32) + o = np.zeros((mm.c_dim[0] * mm.c_dim[1],), dtype=np.complex64) a_d = cuda.to_device(a) b_d = cuda.to_device(b) @@ -98,7 +90,7 @@ def kernel(a, b, c, alpha, beta, output): alpha = 2.0 + 0j beta = 3.0 + 0j - kernel[1, FFT.block_dim, 0, shared_memory_size](a_d, b_d, c_d, alpha, beta, o_d) + kernel[1, fft.block_dim, 0, shared_memory_size](a_d, b_d, c_d, alpha, beta, o_d) cuda.synchronize() data_test = o_d.copy_to_host() diff --git a/examples/device/cublasdx_gemm_fft_fp16.py b/examples/device/cublasdx_gemm_fft_fp16.py index 6883ba1..1402b70 100644 --- a/examples/device/cublasdx_gemm_fft_fp16.py +++ b/examples/device/cublasdx_gemm_fft_fp16.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul, fft, float16x4, float16x2 +from nvmath.device import Matmul, FFT, float16x4, float16x2 from common import random_complex, complex64_to_fp16x2, fp16x2_to_complex64 from common_numba import load_to_shared_1d_float16x2, store_from_shared_1d_float16x2 @@ -19,7 +19,7 @@ def main(): m, n, k = 64, batch_size, 64 - FFT = fft( + fft = FFT( fft_type="c2c", size=k, precision=np.float16, @@ -27,29 +27,21 @@ def main(): elements_per_thread=2, ffts_per_block=batch_size, execution="Block", - compiler="numba", ) - assert FFT.block_dim.y == 1 - assert FFT.ffts_per_block == FFT.implicit_type_batching + assert fft.block_dim.y == 1 + assert fft.ffts_per_block == fft.implicit_type_batching - MM = matmul( + mm = Matmul( size=(m, n, k), precision=np.float16, data_type="complex", arrangement=("col_major", "col_major", "col_major"), execution="Block", - block_size=FFT.block_dim.x, - compiler="numba", + block_size=fft.block_dim.x, ) - elements_per_thread = FFT.elements_per_thread - fft_complex_type = FFT.value_type - mm_complex_type = MM.a_value_type # all value types 
are the same - storage_size = FFT.storage_size - stride = FFT.stride - - shared_memory_size = max(MM.get_shared_storage_size(), FFT.shared_memory_size) + shared_memory_size = max(mm.get_shared_storage_size(), fft.shared_memory_size) # A is m x k # B is k x n @@ -58,17 +50,17 @@ def main(): # We compute # B[:,r] = FFT(B[:,r]) # C = alpha A * B + beta C - @cuda.jit(link=MM.files + FFT.files) + @cuda.jit def kernel(a, b, c, alpha, beta, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=fft_complex_type) # dtype = float16x4 - fft_shared_mem = cuda.shared.array(shape=(0,), dtype=fft_complex_type) - mm_shared_mem = cuda.shared.array(shape=(0,), dtype=mm_complex_type) # dtype = float16x2 + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) # dtype = float16x4 + fft_shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) + mm_shared_mem = cuda.shared.array(shape=(0,), dtype=mm.a_value_type) # dtype = float16x2 smem_a = mm_shared_mem[0:] - smem_b = mm_shared_mem[MM.a_size :] - smem_c = mm_shared_mem[MM.a_size + MM.b_size :] + smem_b = mm_shared_mem[mm.a_size :] + smem_c = mm_shared_mem[mm.a_size + mm.b_size :] - lda, ldc = MM.leading_dimension.a, MM.leading_dimension.c + lda, ldc = mm.leading_dimension.a, mm.leading_dimension.c # Load B to thread_data # - B @@ -81,13 +73,13 @@ def kernel(a, b, c, alpha, beta, output): # dtype float16x4 index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): r0, i0 = b[index, 0], b[index, 1] r1, i1 = b[index, 2], b[index, 3] thread_data[i] = float16x4(r0, r1, i0, i1) - index += stride + index += fft.stride - FFT(thread_data, fft_shared_mem) + fft.execute(thread_data, fft_shared_mem) cuda.syncthreads() @@ -103,31 +95,31 @@ def kernel(a, b, c, alpha, beta, output): # dtype float16x2 index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): v = thread_data[i] r0, r1, i0, i1 = v.x, v.y, v.z, v.w smem_b[index + ldc * 0] = float16x2(r0, i0) smem_b[index + ldc * 1] = float16x2(r1, i1) - index += stride + index += fft.stride # Load A to smem_a, C to smem_c - load_to_shared_1d_float16x2(a, smem_a, MM.a_dim, lda) - load_to_shared_1d_float16x2(c, smem_c, MM.c_dim, ldc) + load_to_shared_1d_float16x2(a, smem_a, mm.a_dim, lda) + load_to_shared_1d_float16x2(c, smem_c, mm.c_dim, ldc) cuda.syncthreads() # MM - MM.execute(alpha, smem_a, smem_b, beta, smem_c) + mm.execute(alpha, smem_a, smem_b, beta, smem_c) cuda.syncthreads() # Store C - store_from_shared_1d_float16x2(smem_c, output, MM.c_dim, ldc) + store_from_shared_1d_float16x2(smem_c, output, mm.c_dim, ldc) - a = random_complex(MM.a_dim, np.float32) - b = random_complex(MM.b_dim, np.float32) - c = random_complex(MM.c_dim, np.float32) + a = random_complex(mm.a_dim, np.float32) + b = random_complex(mm.b_dim, np.float32) + c = random_complex(mm.c_dim, np.float32) o = np.zeros_like(c) a_d = cuda.to_device(complex64_to_fp16x2(a)) @@ -138,7 +130,7 @@ def kernel(a, b, c, alpha, beta, output): alpha = 2.0 + 0j beta = 3.0 + 0j - kernel[1, FFT.block_dim, 0, shared_memory_size](a_d, b_d, c_d, alpha, beta, o_d) + kernel[1, fft.block_dim, 0, shared_memory_size](a_d, b_d, c_d, alpha, beta, o_d) cuda.synchronize() data_test = fp16x2_to_complex64(o_d.copy_to_host()) diff --git a/examples/device/cublasdx_gemm_fft_performance.py b/examples/device/cublasdx_gemm_fft_performance.py index 0897d20..f47a76b 100644 --- a/examples/device/cublasdx_gemm_fft_performance.py +++ 
b/examples/device/cublasdx_gemm_fft_performance.py @@ -9,7 +9,7 @@ import numpy as np import cupy as cp from numba import cuda -from nvmath.device import matmul, fft +from nvmath.device import Matmul, FFT from common import random_complex from common_numba import load_to_shared, time_numba from common_cupy import time_cupy @@ -25,7 +25,7 @@ def main(): batch_size = 128 * 1024 m, n, k = 8, 8, 8 - FFT = fft( + fft = FFT( fft_type="c2c", size=m * n, precision=np.float32, @@ -33,31 +33,23 @@ def main(): elements_per_thread=2, ffts_per_block=1, execution="Block", - compiler="numba", ) - MM = matmul( + MM = Matmul( size=(m, n, k), precision=np.float32, data_type="complex", arrangement=("col_major", "col_major", "col_major"), execution="Block", - block_dim=FFT.block_dim, - compiler="numba", + block_dim=fft.block_dim, ) - elements_per_thread = FFT.elements_per_thread - ffts_per_block = FFT.ffts_per_block - complex_type = FFT.value_type - storage_size = FFT.storage_size - stride = FFT.stride + shared_memory_size = max(MM.get_shared_storage_size(), fft.shared_memory_size) - shared_memory_size = max(MM.get_shared_storage_size(), FFT.shared_memory_size) - - @cuda.jit(link=MM.files + FFT.files) + @cuda.jit def kernel(a, b, c, alpha, beta, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=complex_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=complex_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) batch = cuda.blockIdx.x local_fft_id = cuda.threadIdx.y @@ -85,21 +77,21 @@ def kernel(a, b, c, alpha, beta, output): cuda.syncthreads() # Load data into local array - index = local_fft_id * ffts_per_block + cuda.threadIdx.x - for i in range(elements_per_thread): + index = local_fft_id * fft.ffts_per_block + cuda.threadIdx.x + for i in range(fft.elements_per_thread): thread_data[i] = smem_c[index] - index += stride + index += fft.stride cuda.syncthreads() # Execute FFT - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) # Transform and store data - index = local_fft_id * ffts_per_block + cuda.threadIdx.x - for i in range(elements_per_thread): + index = local_fft_id * fft.ffts_per_block + cuda.threadIdx.x + for i in range(fft.elements_per_thread): output[batch, index] = nb_transform(thread_data[i]) - index += stride + index += fft.stride a = cp.array(random_complex((batch_size, *MM.a_dim), np.float32)) b = cp.array(random_complex((batch_size, *MM.b_dim), np.float32)) @@ -108,8 +100,8 @@ def kernel(a, b, c, alpha, beta, output): alpha = 2.0 + 0j beta = 3.0 + 0j - grid_dim = batch_size // ffts_per_block - block_dim = FFT.block_dim + grid_dim = batch_size // fft.ffts_per_block + block_dim = fft.block_dim kernel[grid_dim, block_dim, 0, shared_memory_size](a, b, c, alpha, beta, data_test) cuda.synchronize() diff --git a/examples/device/cublasdx_gemm_fusion.py b/examples/device/cublasdx_gemm_fusion.py index 75aebd1..18b47b7 100644 --- a/examples/device/cublasdx_gemm_fusion.py +++ b/examples/device/cublasdx_gemm_fusion.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_real from common_numba import set_max_dynamic_shared_size_bytes, load_to_shared, store_from_shared @@ -23,24 +23,22 @@ def main(): block_size = 128 - MM1 = matmul( + MM1 = Matmul( size=(m1, n1, k1), precision=np.float16, data_type="real", arrangement=("col_major", "col_major", "col_major"), 
execution="Block", block_size=block_size, - compiler="numba", ) - MM2 = matmul( + MM2 = Matmul( size=(m2, n2, k2), precision=np.float16, data_type="real", arrangement=("col_major", "col_major", "col_major"), execution="Block", block_size=block_size, - compiler="numba", ) a_size = MM1.a_size @@ -66,7 +64,7 @@ def main(): assert MM1.c_size == MM2.a_size assert MM1.leading_dimension.c == MM2.leading_dimension.a - @cuda.jit(link=MM1.files + MM2.files) + @cuda.jit def kernel(alpha1, a, b, beta1, c, alpha2, d, beta2, f, output): smem = cuda.shared.array(shape=(0,), dtype=np.float16) diff --git a/examples/device/cublasdx_simple_gemm_arrangement.py b/examples/device/cublasdx_simple_gemm_arrangement.py index 4091025..d8bddb1 100644 --- a/examples/device/cublasdx_simple_gemm_arrangement.py +++ b/examples/device/cublasdx_simple_gemm_arrangement.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_real from common_numba import load_to_shared_2d, store_from_shared_2d @@ -17,17 +17,16 @@ def main(): m, n, k = 32, 16, 64 block_size = 256 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=np.float32, data_type="real", arrangement=("row_major", "col_major", "col_major"), execution="Block", block_size=block_size, - compiler="numba", ) - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): smem_a = cuda.shared.array(shape=MM.a_dim, dtype=MM.a_value_type) smem_b = cuda.shared.array(shape=MM.b_dim[::-1], dtype=MM.b_value_type) diff --git a/examples/device/cublasdx_simple_gemm_cfp16.py b/examples/device/cublasdx_simple_gemm_cfp16.py index b9a4675..36e107f 100644 --- a/examples/device/cublasdx_simple_gemm_cfp16.py +++ b/examples/device/cublasdx_simple_gemm_cfp16.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_complex from common_numba import load_to_shared, store_from_shared @@ -16,16 +16,15 @@ def main(): m, n, k = 64, 64, 64 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=np.float16, data_type="complex", arrangement=("row_major", "col_major", "col_major"), execution="Block", - compiler="numba", ) - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): # all value types are the same smem = cuda.shared.array(shape=(0,), dtype=MM.a_value_type) diff --git a/examples/device/cublasdx_simple_gemm_fp32.py b/examples/device/cublasdx_simple_gemm_fp32.py index 6019e21..062b8ce 100644 --- a/examples/device/cublasdx_simple_gemm_fp32.py +++ b/examples/device/cublasdx_simple_gemm_fp32.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_real from common_numba import load_to_shared_2d, store_from_shared_2d @@ -17,17 +17,16 @@ def main(): m, n, k = 32, 16, 64 block_size = 256 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=np.float32, data_type="real", arrangement=("row_major", "col_major", "col_major"), execution="Block", block_size=block_size, - compiler="numba", ) - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): smem_a = cuda.shared.array(shape=MM.a_dim, dtype=MM.a_value_type) # cuBLASDx requires column-major arrays but cuda.shared.array creates row-major diff --git a/examples/device/cublasdx_simple_gemm_leading_dimensions.py b/examples/device/cublasdx_simple_gemm_leading_dimensions.py index da4f48f..8b23e6e 100644 --- 
a/examples/device/cublasdx_simple_gemm_leading_dimensions.py +++ b/examples/device/cublasdx_simple_gemm_leading_dimensions.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul, Dim3 +from nvmath.device import Matmul, Dim3 from common import random_real from common_numba import load_to_shared, store_from_shared @@ -27,11 +27,10 @@ def main(): "arrangement": ("row_major", "col_major", "col_major"), "execution": "Block", "block_dim": block_dim, - "compiler": "numba", } - MM_static_ld = matmul(**kwargs, leading_dimension=(lda, ldb, ldc)) - MM_runtime_ld = matmul(**kwargs, execute_api="dynamic_leading_dimensions") + MM_static_ld = Matmul(**kwargs, leading_dimension=(lda, ldb, ldc)) + MM_runtime_ld = Matmul(**kwargs) value_type = MM_static_ld.a_value_type # all value types are the same a_size = MM_static_ld.a_size @@ -40,7 +39,7 @@ def main(): b_dim = MM_static_ld.b_dim c_dim = MM_static_ld.c_dim - @cuda.jit(link=MM_static_ld.files) + @cuda.jit def f_static_ld(alpha, a, b, beta, c, output): smem = cuda.shared.array(shape=(0,), dtype=value_type) smem_a = smem[0:] @@ -59,7 +58,7 @@ def f_static_ld(alpha, a, b, beta, c, output): store_from_shared(smem_c, output, c_dim, ldc) - @cuda.jit(link=MM_runtime_ld.files) + @cuda.jit def f_runtime_ld(alpha, a, lda, b, ldb, beta, c, ldc, output): smem = cuda.shared.array(shape=(0,), dtype=value_type) smem_a = smem[0:] diff --git a/examples/device/cublasdx_simple_gemm_mixed_precision.py b/examples/device/cublasdx_simple_gemm_mixed_precision.py index a8b926b..97f2197 100644 --- a/examples/device/cublasdx_simple_gemm_mixed_precision.py +++ b/examples/device/cublasdx_simple_gemm_mixed_precision.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_real from common_numba import load_to_shared_2d, store_from_shared_2d @@ -17,17 +17,16 @@ def main(): m, n, k = 32, 16, 64 block_size = 256 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=(np.float16, np.float16, np.float32), data_type="real", arrangement=("row_major", "col_major", "col_major"), execution="Block", block_size=block_size, - compiler="numba", ) - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): smem_a = cuda.shared.array(shape=MM.a_dim, dtype=MM.a_value_type) # cuBLASDx requires column-major arrays but cuda.shared.array creates diff --git a/examples/device/cublasdx_simple_gemm_tensor_fp32.py b/examples/device/cublasdx_simple_gemm_tensor_fp32.py index d2f7834..dca6937 100644 --- a/examples/device/cublasdx_simple_gemm_tensor_fp32.py +++ b/examples/device/cublasdx_simple_gemm_tensor_fp32.py @@ -4,7 +4,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_real from nvmath.device.common import copy, copy_fragment, clear, copy_wait, make_tensor, axpby @@ -13,23 +13,20 @@ def main(): m, n, k = 128, 128, 32 block_size = 256 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=(np.float16, np.float16, np.float32), data_type="real", arrangement=("col_major", "row_major", "row_major"), execution="Block", block_size=block_size, - compiler="numba", - tensor_types=("suggested_smem_a", "suggested_smem_b", "suggested_rmem_c"), - execute_api="tensors", ) a_layout = MM.suggest_layout_smem_a() b_layout = MM.suggest_layout_smem_b() c_layout = MM.suggest_layout_rmem_c() - @cuda.jit(link=MM.files) + @cuda.jit def f(alpha, a, b, beta, c, output): smem = 
cuda.shared.array(shape=(0,), dtype=np.float16, alignment=16) diff --git a/examples/device/cublasdx_simple_gemm_transpose_mode.py b/examples/device/cublasdx_simple_gemm_transpose_mode.py index 164957f..b35c659 100644 --- a/examples/device/cublasdx_simple_gemm_transpose_mode.py +++ b/examples/device/cublasdx_simple_gemm_transpose_mode.py @@ -4,7 +4,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_real from common_numba import load_to_shared_2d, store_from_shared_2d @@ -13,7 +13,7 @@ def main(): m, n, k = 32, 16, 64 block_size = 256 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=np.float32, data_type="real", @@ -22,10 +22,9 @@ def main(): transpose_mode=("non_transposed", "transposed"), execution="Block", block_size=block_size, - compiler="numba", ) - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output): # cuBLASDx requires column-major arrays but cuda.shared.array creates row-major # arrays (only) so we emulate a column-major array by flipping dimensions diff --git a/examples/device/cublasdx_simple_partition.py b/examples/device/cublasdx_simple_partition.py new file mode 100644 index 0000000..3535289 --- /dev/null +++ b/examples/device/cublasdx_simple_partition.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +from numba import cuda +from nvmath.device import Matmul +from common import random_real +from nvmath.device.common import axpby, clear, copy, copy_wait, make_tensor +from nvmath.device.cublasdx_backend import MAX_ALIGNMENT + + +def main(): + m, n, k = 64, 64, 64 + block_size = 128 + alpha, beta = 1, 2 + data_type = "real" + precision = np.float16 + + MM = Matmul( + size=(m, n, k), + precision=(precision, precision, precision), + data_type=data_type, + arrangement=("row_major", "col_major", "col_major"), + execution="Block", + block_size=block_size, + alignment=MAX_ALIGNMENT, + ) + grid_dim = 1 + + a_cosize = MM.suggest_layout_smem_a().cosize + b_cosize = MM.suggest_layout_smem_b().cosize + c_cosize = MM.suggest_layout_rmem_c().cosize + c_size = MM.suggest_layout_rmem_c().size + + @cuda.jit + def f(a, b, c, alpha, beta, output): + # We have same precision for all tensors + smem = cuda.shared.array(shape=(0,), dtype=precision, alignment=16) + smem_a_buff, smem = smem[:a_cosize], smem[a_cosize:] + smem_b_buff, smem = smem[:b_cosize], smem[b_cosize:] + rmem_c_buff = cuda.local.array(shape=(c_cosize,), dtype=MM.c_value_type, alignment=16) + rmem_c_out_buff = cuda.local.array(shape=(c_cosize,), dtype=MM.c_value_type, alignment=16) + + gmem_a = make_tensor(a, MM.get_layout_gmem_a()) + gmem_b = make_tensor(b, MM.get_layout_gmem_b()) + gmem_c = make_tensor(c, MM.get_layout_gmem_c()) + gmem_output = make_tensor(output, MM.get_layout_gmem_c()) + + smem_a = make_tensor(smem_a_buff, MM.suggest_layout_smem_a()) + smem_b = make_tensor(smem_b_buff, MM.suggest_layout_smem_b()) + rmem_c = make_tensor(rmem_c_buff, MM.suggest_layout_rmem_c()) + rmem_c_out = make_tensor(rmem_c_out_buff, MM.suggest_layout_rmem_c()) + + copy(gmem_a, smem_a) + copy(gmem_b, smem_b) + + copy_wait() + + clear(rmem_c) + + partitioner = MM.suggest_partitioner() + gmem_c_partition = partitioner.partition_like_C(gmem_c) + gmem_output_partition = partitioner.partition_like_C(gmem_output) + + # Use copy_fragment(gmem_c, rmem_c_out) instead since it provides + # better performance achieved by 
vectorization. This is for functional + # demonstration purposes only. + for i in range(c_size): + rmem_c_out_buff[i] = gmem_c_partition[i] + + alpha = c.dtype.type(alpha) + beta = c.dtype.type(beta) + + MM.execute(smem_a, smem_b, rmem_c) + axpby(alpha, rmem_c, beta, rmem_c_out) + + # Use copy_fragment(rmem_c_out, gmem_output) instead since it provides + # better performance achieved by vectorization. This is for functional + # demonstration purposes only. + for i in range(c_size): + gmem_output_partition[i] = rmem_c_out_buff[i] + + a = random_real(MM.a_dim, precision, order="C") + b = random_real(MM.b_dim, precision, order="F") + c = random_real(MM.c_dim, precision, order="F") + o = np.zeros_like(c) + + a_d = cuda.to_device(a) + b_d = cuda.to_device(b) + c_d = cuda.to_device(c) + o_d = cuda.to_device(o) + + shared_memory_size = MM.get_shared_storage_size_ab( + MM.suggest_layout_smem_a(), + MM.suggest_layout_smem_b(), + ) + + f[grid_dim, MM.block_dim, 0, shared_memory_size](a_d, b_d, c_d, alpha, beta, o_d) + cuda.synchronize() + + data_test = o_d.copy_to_host() + data_ref = alpha * (a @ b) + beta * c + error = np.linalg.norm(data_test - data_ref) / np.linalg.norm(data_ref) + print(f"Relative error: {error}") + assert error < 1e-2 + + +if __name__ == "__main__": + main() diff --git a/examples/device/cublasdx_single_gemm_performance.py b/examples/device/cublasdx_single_gemm_performance.py index e17a37e..4845068 100644 --- a/examples/device/cublasdx_single_gemm_performance.py +++ b/examples/device/cublasdx_single_gemm_performance.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import random_complex, mm_perf_GFlops, fp16x2_to_complex64, complex64_to_fp16x2 from common_numba import time_numba, load_to_shared_1d_float16x2, store_from_shared_1d_float16x2 @@ -22,20 +22,19 @@ def main(): data_type = "complex" precision = np.float16 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=precision, data_type=data_type, arrangement=("row_major", "col_major", "col_major"), execution="Block", block_size=block_size, - compiler="numba", leading_dimension="suggested", ) grid_dim = 1 - @cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output, repeat): smem_a = cuda.shared.array(shape=(MM.a_size,), dtype=MM.a_value_type) smem_b = cuda.shared.array(shape=(MM.b_size,), dtype=MM.b_value_type) diff --git a/examples/device/cublasdx_single_gemm_tensor_performance.py b/examples/device/cublasdx_single_gemm_tensor_performance.py index 93ff396..7be0ff3 100644 --- a/examples/device/cublasdx_single_gemm_tensor_performance.py +++ b/examples/device/cublasdx_single_gemm_tensor_performance.py @@ -4,7 +4,7 @@ import numpy as np from numba import cuda -from nvmath.device import matmul +from nvmath.device import Matmul from common import mm_perf_GFlops, random_real from common_numba import time_numba from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor @@ -19,16 +19,13 @@ def main(): data_type = "real" precision = np.float16 - MM = matmul( + MM = Matmul( size=(m, n, k), precision=(precision, precision, precision), data_type=data_type, arrangement=("row_major", "col_major", "col_major"), execution="Block", block_size=block_size, - compiler="numba", - execute_api="tensors", - tensor_types=("suggested_smem_a", "suggested_smem_b", "suggested_rmem_c"), ) grid_dim = 1 @@ -36,7 +33,7 @@ def main(): b_size = MM.suggest_layout_smem_b().cosize c_size = MM.suggest_layout_rmem_c().cosize - 
@cuda.jit(link=MM.files) + @cuda.jit def f(a, b, c, alpha, beta, output, repeat): # We have same precision for all tensors smem = cuda.shared.array(shape=(0,), dtype=precision, alignment=16) diff --git a/examples/device/cufftdx_autotuning.py b/examples/device/cufftdx_autotuning.py index dfb581e..4105468 100644 --- a/examples/device/cufftdx_autotuning.py +++ b/examples/device/cufftdx_autotuning.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 +import functools import numpy as np from numba import cuda -from nvmath.device import current_device_lto, FFTOptions +from nvmath.device import current_device_sm, FFT from common_numba import time_numba @@ -12,64 +13,63 @@ def main(): batch = 1024 * 32 ncycles = 10 - base_FFT = FFTOptions( + BaseFFT = functools.partial( + FFT, fft_type="c2c", size=256, precision=np.float32, direction="forward", execution="Block", - code_type=current_device_lto(), + sm=current_device_sm(), ) + fft = BaseFFT() - data = np.ones((batch, base_FFT.size), dtype=np.complex64) + data = np.ones((batch, fft.size), dtype=np.complex64) data_ref = np.fft.fft(data, axis=-1) - for ept, fpb in base_FFT.valid("elements_per_thread", "ffts_per_block"): - FFT = base_FFT.create(elements_per_thread=ept, ffts_per_block=fpb, compiler="numba") + valid_ept_fpb = fft.valid("elements_per_thread", "ffts_per_block") - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - grid_dim = (batch + ffts_per_block - 1) // ffts_per_block + for ept, fpb in valid_ept_fpb: + fft = BaseFFT(elements_per_thread=ept, ffts_per_block=fpb) - assert ept == elements_per_thread - assert fpb == ffts_per_block + grid_dim = (batch + fft.ffts_per_block - 1) // fft.ffts_per_block - @cuda.jit(link=FFT.files) + assert ept == fft.elements_per_thread + assert fpb == fft.ffts_per_block + + @cuda.jit def f(input, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y - fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id + fft_id = cuda.blockIdx.x * fft.ffts_per_block + local_fft_id if fft_id >= batch: return index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): thread_data[i] = input[fft_id, index] - index += stride + index += fft.stride - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) - FFT(thread_data, shared_mem) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): output[fft_id, index] = thread_data[i] - index += stride + index += fft.stride input_d = cuda.to_device(data) output_d = cuda.to_device(data) cuda.synchronize() - time_ms = time_numba(f, grid_dim, block_dim, shared_memory_size, ncycles, input_d, output_d) + time_ms = time_numba(f, grid_dim, fft.block_dim, fft.shared_memory_size, ncycles, input_d, output_d) cuda.synchronize() data_test = output_d.copy_to_host() error = np.linalg.norm(data_test - data_ref) / np.linalg.norm(data_ref) assert error < 1e-5 - print(f"Performance (elements_per_thread={elements_per_thread}, ffts_per_block={ffts_per_block}): {time_ms} [ms.]") + print( + f"Performance (elements_per_thread={fft.elements_per_thread}, 
ffts_per_block={fft.ffts_per_block}): {time_ms} [ms.]" + ) if __name__ == "__main__": diff --git a/examples/device/cufftdx_block_fft.py b/examples/device/cufftdx_block_fft.py index 5aac1bb..6ee9679 100644 --- a/examples/device/cufftdx_block_fft.py +++ b/examples/device/cufftdx_block_fft.py @@ -8,49 +8,38 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT def main(): - size = 64 + fft = FFT(fft_type="c2c", size=64, precision=np.float32, direction="forward", execution="Block") - FFT = fft(fft_type="c2c", size=size, precision=np.float32, direction="forward", execution="Block", compiler="numba") - - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - - @cuda.jit(link=FFT.files) + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y - fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id + fft_id = cuda.blockIdx.x * fft.ffts_per_block + local_fft_id index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): thread_data[i] = data[fft_id, index] - index += stride + index += fft.stride - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) - FFT(thread_data, shared_mem) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): data[fft_id, index] = thread_data[i] - index += stride + index += fft.stride - data = np.ones((ffts_per_block, size), dtype=np.complex64) + data = np.ones((fft.ffts_per_block, fft.size), dtype=np.complex64) data_d = cuda.to_device(data) print("input [1st FFT]:", data[0, :]) - f[1, block_dim, 0, shared_memory_size](data_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](data_d) cuda.synchronize() data_test = data_d.copy_to_host() diff --git a/examples/device/cufftdx_block_fft_performance.py b/examples/device/cufftdx_block_fft_performance.py index 54aabf3..203bbed 100644 --- a/examples/device/cufftdx_block_fft_performance.py +++ b/examples/device/cufftdx_block_fft_performance.py @@ -4,7 +4,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT from common import fft_perf_GFlops, CHECK_CUDART from common_numba import time_numba, get_active_blocks_per_multiprocessor from cuda.bindings import runtime as cudart @@ -20,7 +20,7 @@ def main(): sms = out.multiProcessorCount elements_per_thread = 8 - FFT = fft( + fft = FFT( fft_type="c2c", size=fft_size, precision=np.float32, @@ -28,19 +28,12 @@ def main(): execution="Block", elements_per_thread=elements_per_thread, ffts_per_block=ffts_per_block, - compiler="numba", ) - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - - @cuda.jit(link=FFT.files) + @cuda.jit def f(data, repeat): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), 
dtype=fft.value_type) local_fft_id = cuda.threadIdx.y fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id @@ -48,18 +41,18 @@ def f(data, repeat): index = cuda.threadIdx.x for i in range(elements_per_thread): thread_data[i] = data[fft_id, index] - index += stride + index += fft.stride for r in range(repeat): - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x for i in range(elements_per_thread): data[fft_id, index] = thread_data[i] - index += stride + index += fft.stride dummy = cuda.to_device(np.ones((ffts_per_block, fft_size), dtype=np.complex64)) - blocks_per_sm = get_active_blocks_per_multiprocessor(f, block_dim, shared_memory_size, dummy, repeat) + blocks_per_sm = get_active_blocks_per_multiprocessor(f, fft.block_dim, fft.shared_memory_size, dummy, repeat) batch_size = sms * blocks_per_sm * ffts_per_block grid_dim = batch_size // ffts_per_block @@ -68,8 +61,8 @@ def f(data, repeat): data = np.ones((batch_size, fft_size), dtype=np.complex64) data_d = cuda.to_device(data) - time_ms = time_numba(f, grid_dim, block_dim, shared_memory_size, ncycles, data_d, repeat) - time_2x_ms = time_numba(f, grid_dim, block_dim, shared_memory_size, ncycles, data_d, 2 * repeat) + time_ms = time_numba(f, grid_dim, fft.block_dim, fft.shared_memory_size, ncycles, data_d, repeat) + time_2x_ms = time_numba(f, grid_dim, fft.block_dim, fft.shared_memory_size, ncycles, data_d, 2 * repeat) time_fft_ms = (time_2x_ms - time_ms) / repeat perf = fft_perf_GFlops(fft_size, batch_size, time_fft_ms) diff --git a/examples/device/cufftdx_block_fft_performance_many.py b/examples/device/cufftdx_block_fft_performance_many.py index 29faf20..b5bf549 100644 --- a/examples/device/cufftdx_block_fft_performance_many.py +++ b/examples/device/cufftdx_block_fft_performance_many.py @@ -5,7 +5,7 @@ import numpy as np from numba import cuda from cuda.bindings import runtime as cudart -from nvmath.device import fft +from nvmath.device import FFT from common import fft_perf_GFlops, CHECK_CUDART from common_numba import time_numba, get_active_blocks_per_multiprocessor @@ -17,59 +17,51 @@ def run(fft_type, fft_size, direction=None): sms = out.multiProcessorCount CHECK_CUDART(err) - FFT = fft( + fft = FFT( fft_type=fft_type, size=fft_size, precision=np.float32, direction=direction, execution="Block", - compiler="numba", ffts_per_block="suggested", ) - value_type = FFT.value_type - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim complex_size = fft_size if fft_type == "c2c" else fft_size // 2 + 1 - @cuda.jit(link=FFT.files) + @cuda.jit def f(data, repeat): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y - fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id + fft_id = cuda.blockIdx.x * fft.ffts_per_block + local_fft_id index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): if index < complex_size: thread_data[i] = data[fft_id, index] - index += stride + index += fft.stride for r in range(repeat): - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in 
range(elements_per_thread): + for i in range(fft.elements_per_thread): if index < complex_size: data[fft_id, index] = thread_data[i] - index += stride + index += fft.stride - dummy = cuda.to_device(np.ones((ffts_per_block, complex_size), dtype=np.complex64)) - blocks_per_sm = get_active_blocks_per_multiprocessor(f, block_dim, shared_memory_size, dummy, repeat) - batch_size = sms * blocks_per_sm * ffts_per_block - grid_dim = batch_size // ffts_per_block - assert batch_size % ffts_per_block == 0 + dummy = cuda.to_device(np.ones((fft.ffts_per_block, complex_size), dtype=np.complex64)) + blocks_per_sm = get_active_blocks_per_multiprocessor(f, fft.block_dim, fft.shared_memory_size, dummy, repeat) + batch_size = sms * blocks_per_sm * fft.ffts_per_block + grid_dim = batch_size // fft.ffts_per_block + assert batch_size % fft.ffts_per_block == 0 data = np.ones((batch_size, complex_size), dtype=np.complex64) data_d = cuda.to_device(data) - time_ms = time_numba(f, grid_dim, block_dim, shared_memory_size, ncycles, data_d, repeat) - time_2x_ms = time_numba(f, grid_dim, block_dim, shared_memory_size, ncycles, data_d, 2 * repeat) + time_ms = time_numba(f, grid_dim, fft.block_dim, fft.shared_memory_size, ncycles, data_d, repeat) + time_2x_ms = time_numba(f, grid_dim, fft.block_dim, fft.shared_memory_size, ncycles, data_d, 2 * repeat) time_fft_ms = (time_2x_ms - time_ms) / repeat perf = fft_perf_GFlops(fft_size, batch_size, time_fft_ms, coef=1.0 if fft_type == "c2c" else 0.5) diff --git a/examples/device/cufftdx_convolution.py b/examples/device/cufftdx_convolution.py index 52684b2..4463d70 100644 --- a/examples/device/cufftdx_convolution.py +++ b/examples/device/cufftdx_convolution.py @@ -8,67 +8,55 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT from common import random_complex import functools def main(): - size = 64 - ffts_per_block = 2 - elements_per_thread = 8 - FFT_base = functools.partial( - fft, + FFT, fft_type="c2c", - size=size, + size=64, precision=np.float32, - ffts_per_block=ffts_per_block, - elements_per_thread=ffts_per_block, + ffts_per_block=2, + elements_per_thread=8, execution="Block", - compiler="numba", ) - FFT = FFT_base(direction="forward") - IFFT = FFT_base(direction="inverse") - - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim + fft = FFT_base(direction="forward") + ifft = FFT_base(direction="inverse") - @cuda.jit(link=FFT.files + IFFT.files) + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y - fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id + fft_id = cuda.blockIdx.x * fft.ffts_per_block + local_fft_id index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): thread_data[i] = data[fft_id, index] - index += stride + index += fft.stride - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) - FFT(thread_data, shared_mem) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) + fft.execute(thread_data, shared_mem) - for i in range(elements_per_thread): - thread_data[i] = thread_data[i] / size + for i in range(fft.elements_per_thread): + thread_data[i] = thread_data[i] / fft.size - IFFT(thread_data, shared_mem) + ifft.execute(thread_data, shared_mem) index = 
cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): data[fft_id, index] = thread_data[i] - index += stride + index += fft.stride - data = random_complex((ffts_per_block, size), real_dtype=np.float32) + data = random_complex((fft.ffts_per_block, fft.size), real_dtype=np.float32) data_d = cuda.to_device(data) print("input [1st FFT]:", data[0, :]) - f[1, block_dim, 0, shared_memory_size](data_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](data_d) cuda.synchronize() data_test = data_d.copy_to_host() diff --git a/examples/device/cufftdx_convolution_performance.py b/examples/device/cufftdx_convolution_performance.py index 1045d90..fab46aa 100644 --- a/examples/device/cufftdx_convolution_performance.py +++ b/examples/device/cufftdx_convolution_performance.py @@ -5,7 +5,7 @@ import numpy as np import cupy from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT from common import random_complex import functools @@ -19,57 +19,47 @@ def main(): ncycles = 10 FFT_base = functools.partial( - fft, + FFT, fft_type="c2c", size=fft_size, precision=np.float32, ffts_per_block="suggested", execution="Block", - compiler="numba", ) - FFT = FFT_base(direction="forward") - IFFT = FFT_base(direction="inverse") + fft = FFT_base(direction="forward") + ifft = FFT_base(direction="inverse") - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread + grid_dim = (batch_size + fft.ffts_per_block - 1) // fft.ffts_per_block - grid_dim = (batch_size + ffts_per_block - 1) // ffts_per_block - - @cuda.jit(link=FFT.files + IFFT.files) + @cuda.jit def f(input, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y - fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id + fft_id = cuda.blockIdx.x * fft.ffts_per_block + local_fft_id if fft_id >= batch_size: return index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): if index < fft_size: thread_data[i] = input[fft_id, index] - index += stride + index += fft.stride - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) - for i in range(elements_per_thread): - thread_data[i] = thread_data[i] / size + for i in range(fft.elements_per_thread): + thread_data[i] = thread_data[i] / fft.size - IFFT(thread_data, shared_mem) + ifft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): if index < fft_size: output[fft_id, index] = thread_data[i] - index += stride + index += fft.stride input = random_complex((batch_size, fft_size), real_dtype=np.float32) output = np.ones((batch_size, fft_size), dtype=np.complex64) @@ -78,7 +68,7 @@ def f(input, output): cupy_ms = time_cupy(lambda input: cupy.fft.ifft(cupy.fft.fft(input, axis=-1), axis=-1), ncycles, input_d) - numba_ms = time_numba(f, grid_dim, block_dim, shared_memory_size, ncycles, input_d, output_d) + numba_ms = time_numba(f, grid_dim, fft.block_dim, fft.shared_memory_size, ncycles, input_d, output_d) output_test = cupy.asnumpy(output_d) 
output_ref = np.fft.ifft(np.fft.fft(input, axis=-1), axis=-1) diff --git a/examples/device/cufftdx_convolution_r2c_c2r.py b/examples/device/cufftdx_convolution_r2c_c2r.py index 83e2545..10e4b21 100644 --- a/examples/device/cufftdx_convolution_r2c_c2r.py +++ b/examples/device/cufftdx_convolution_r2c_c2r.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT from common import random_real @@ -22,25 +22,20 @@ def main(): "elements_per_thread": 2, "ffts_per_block": ffts_per_block, "execution": "Block", - "compiler": "numba", } - FFT_fwd = fft(**kwargs, fft_type="r2c") - FFT_inv = fft(**kwargs, fft_type="c2r") - - value_type = FFT_fwd.value_type - storage_size = max(FFT_fwd.storage_size, FFT_inv.storage_size) - shared_memory_size = max(FFT_fwd.shared_memory_size, FFT_inv.shared_memory_size) - stride = FFT_fwd.stride - block_dim = FFT_fwd.block_dim - elements_per_thread = FFT_fwd.elements_per_thread - assert FFT_inv.stride == stride - assert FFT_inv.block_dim == block_dim - assert FFT_inv.elements_per_thread == elements_per_thread - - @cuda.jit(link=FFT_fwd.files + FFT_inv.files) + fft = FFT(**kwargs, fft_type="r2c") + ifft = FFT(**kwargs, fft_type="c2r") + + storage_size = max(fft.storage_size, ifft.storage_size) + shared_memory_size = max(fft.shared_memory_size, ifft.shared_memory_size) + assert ifft.stride == fft.stride + assert ifft.block_dim == fft.block_dim + assert ifft.elements_per_thread == fft.elements_per_thread + + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) thread_data_real = thread_data.view(np.float32) local_fft_id = cuda.threadIdx.y @@ -48,37 +43,37 @@ def f(data): # Data being loaded is real, for we load fft_size real elements per batch index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): if index < fft_size: thread_data_real[i] = data[fft_id, index] - index += stride + index += fft.stride - FFT_fwd(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) # After the first transform, the data is complex, so we have fft_size//2+1 complex # elements per batch index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): if index < (fft_size // 2 + 1): thread_data[i] = thread_data[i] / fft_size - index += stride + index += fft.stride - FFT_inv(thread_data, shared_mem) + ifft.execute(thread_data, shared_mem) # After the second transform, the data is real again, so we store fft_size real # elements per batch index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): if index < fft_size: data[fft_id, index] = thread_data_real[i] - index += stride + index += fft.stride data = np.ones_like(random_real((ffts_per_block, fft_size), real_dtype=np.float32)) data_d = cuda.to_device(data) print("input [1st FFT]:", data[0, :]) - f[1, block_dim, 0, shared_memory_size](data_d) + f[1, fft.block_dim, 0, shared_memory_size](data_d) cuda.synchronize() data_test = data_d.copy_to_host() diff --git a/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py b/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py index 1dd3c29..f35aeb2 100644 --- 
a/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py +++ b/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py @@ -2,109 +2,91 @@ # # SPDX-License-Identifier: Apache-2.0 +import functools import numpy as np from numba import cuda -from nvmath.device import fft, float32x2_type +from nvmath.device import FFT +from nvmath.device.types import complex64 from common import random_real def main(): - FFT_r2c = fft( - fft_type="r2c", + FFT_base = functools.partial( + FFT, size=64, precision=np.float32, ffts_per_block=2, elements_per_thread=4, real_fft_options={"complex_layout": "packed", "real_mode": "folded"}, execution="Block", - compiler="numba", ) - - FFT_c2r = fft( - fft_type="c2r", - size=64, - precision=np.float32, - ffts_per_block=2, - elements_per_thread=4, - real_fft_options={"complex_layout": "packed", "real_mode": "folded"}, - execution="Block", - compiler="numba", - ) - - complex_type = FFT_r2c.value_type - storage_size = FFT_r2c.storage_size - shared_memory_size = FFT_r2c.shared_memory_size - ffts_per_block = FFT_r2c.ffts_per_block - stride = FFT_r2c.stride - size = FFT_r2c.size - elements_per_thread = FFT_r2c.elements_per_thread - block_dim = FFT_r2c.block_dim - - assert complex_type == float32x2_type - assert storage_size == 2 - assert ffts_per_block == 2 - assert all(file.endswith(".ltoir") for file in FFT_r2c.files) - assert stride == 16 - assert size == 64 - assert elements_per_thread == 4 - assert block_dim == (16, 2, 1) - - assert FFT_r2c.value_type == FFT_c2r.value_type - assert FFT_r2c.precision == FFT_c2r.precision - assert FFT_r2c.storage_size == FFT_c2r.storage_size - assert FFT_r2c.shared_memory_size == FFT_c2r.shared_memory_size - assert FFT_r2c.ffts_per_block == FFT_c2r.ffts_per_block - assert FFT_r2c.stride == FFT_c2r.stride - assert FFT_r2c.size == FFT_c2r.size - assert FFT_r2c.elements_per_thread == FFT_c2r.elements_per_thread - assert FFT_r2c.block_dim == FFT_c2r.block_dim - - @cuda.jit(link=FFT_r2c.files + FFT_c2r.files) + fft = FFT_base(fft_type="r2c") + ifft = FFT_base(fft_type="c2r") + + assert fft.value_type == complex64 + assert fft.storage_size == 2 + assert fft.ffts_per_block == 2 + assert fft.stride == 16 + assert fft.size == 64 + assert fft.elements_per_thread == 4 + assert fft.block_dim == (16, 2, 1) + + assert fft.value_type == ifft.value_type + assert fft.precision == ifft.precision + assert fft.storage_size == ifft.storage_size + assert fft.shared_memory_size == ifft.shared_memory_size + assert fft.ffts_per_block == ifft.ffts_per_block + assert fft.stride == ifft.stride + assert fft.size == ifft.size + assert fft.elements_per_thread == ifft.elements_per_thread + assert fft.block_dim == ifft.block_dim + + @cuda.jit def f(inout): # Registers - complex_thread_data = cuda.local.array(shape=(storage_size,), dtype=complex_type) + complex_thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) real_thread_data = complex_thread_data.view(np.float32) # Figure out fft / batch IDs local_fft_id = cuda.threadIdx.y - global_fft_id = (cuda.blockIdx.x * ffts_per_block) + local_fft_id + global_fft_id = (cuda.blockIdx.x * fft.ffts_per_block) + local_fft_id - for i in range(elements_per_thread): - idx = i * stride + cuda.threadIdx.x - if idx < size // 2: + for i in range(fft.elements_per_thread): + idx = i * fft.stride + cuda.threadIdx.x + if idx < fft.size // 2: # Fold optimized, so we load complex (ie 2 consecutive reals) instead of # reals real_thread_data[2 * i + 0] = inout[global_fft_id, 2 * idx + 0] 
real_thread_data[2 * i + 1] = inout[global_fft_id, 2 * idx + 1] # Allocate shared - shared_mem = cuda.shared.array(shape=(0,), dtype=complex_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) # R2C - FFT_r2c(complex_thread_data, shared_mem) + fft.execute(complex_thread_data, shared_mem) # Normalize # `complex_thread_data` has a packed (not natural) layout - for i in range(elements_per_thread): - if i * stride + cuda.threadIdx.x < size // 2: - complex_thread_data[i] = complex_thread_data[i] / size + for i in range(fft.elements_per_thread): + if i * fft.stride + cuda.threadIdx.x < fft.size // 2: + complex_thread_data[i] = complex_thread_data[i] / fft.size # C2R - FFT_c2r(complex_thread_data, shared_mem) + ifft.execute(complex_thread_data, shared_mem) # Save results - for i in range(elements_per_thread): - idx = i * stride + cuda.threadIdx.x - if idx < size // 2: + for i in range(fft.elements_per_thread): + idx = i * fft.stride + cuda.threadIdx.x + if idx < fft.size // 2: # Fold optimized, so we load complex (ie 2 consecutive reals) instead of # reals inout[global_fft_id, 2 * idx + 0] = real_thread_data[2 * i + 0] inout[global_fft_id, 2 * idx + 1] = real_thread_data[2 * i + 1] - input = random_real((ffts_per_block, size), real_dtype=np.float64) + input = random_real((fft.ffts_per_block, fft.size), real_dtype=np.float64) inout_d = cuda.to_device(input) - f[1, block_dim, 0, shared_memory_size](inout_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](inout_d) cuda.synchronize() output_test = inout_d.copy_to_host() diff --git a/examples/device/cufftdx_convolution_signal.py b/examples/device/cufftdx_convolution_signal.py index 4d9c6dd..026b0cd 100644 --- a/examples/device/cufftdx_convolution_signal.py +++ b/examples/device/cufftdx_convolution_signal.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 +import functools import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT from common import random_complex @@ -13,56 +14,40 @@ def main(): ffts_per_block = 1 batch_size = 1 - FFT_fwd = fft( + FFT_base = functools.partial( + FFT, fft_type="c2c", size=size, precision=np.float32, - direction="forward", ffts_per_block=ffts_per_block, elements_per_thread=2, execution="Block", - compiler="numba", ) - FFT_inv = fft( - fft_type="c2c", - size=size, - precision=np.float32, - direction="inverse", - ffts_per_block=ffts_per_block, - elements_per_thread=2, - execution="Block", - compiler="numba", - ) - - value_type = FFT_fwd.value_type - storage_size = FFT_fwd.storage_size - shared_memory_size = FFT_fwd.shared_memory_size - fft_stride = FFT_fwd.stride - ept = FFT_fwd.elements_per_thread - block_dim = FFT_fwd.block_dim + fft = FFT_base(direction="forward") + ifft = FFT_base(direction="inverse") - @cuda.jit(link=FFT_fwd.files + FFT_inv.files) + @cuda.jit def f(signal, filter): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) fft_id = (cuda.blockIdx.x * ffts_per_block) + cuda.threadIdx.y if fft_id >= batch_size: return offset = cuda.threadIdx.x - for i in range(ept): - thread_data[i] = signal[fft_id, offset + i * fft_stride] + for i in range(fft.elements_per_thread): + thread_data[i] = signal[fft_id, offset + i * fft.stride] - FFT_fwd(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) - for i 
in range(ept): - thread_data[i] = thread_data[i] * filter[fft_id, offset + i * fft_stride] + for i in range(fft.elements_per_thread): + thread_data[i] = thread_data[i] * filter[fft_id, offset + i * fft.stride] - FFT_inv(thread_data, shared_mem) + ifft.execute(thread_data, shared_mem) - for i in range(ept): - signal[fft_id, offset + i * fft_stride] = thread_data[i] + for i in range(fft.elements_per_thread): + signal[fft_id, offset + i * fft.stride] = thread_data[i] data = random_complex((ffts_per_block, size), np.float32) filter = random_complex((ffts_per_block, size), np.float32) @@ -70,7 +55,7 @@ def f(signal, filter): data_d = cuda.to_device(data) filter_d = cuda.to_device(filter) - f[1, block_dim, 0, shared_memory_size](data_d, filter_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](data_d, filter_d) cuda.synchronize() data_test = data_d.copy_to_host() diff --git a/examples/device/cufftdx_fft_2d.py b/examples/device/cufftdx_fft_2d.py index 840badb..5d9978b 100644 --- a/examples/device/cufftdx_fft_2d.py +++ b/examples/device/cufftdx_fft_2d.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft, Dim3 +from nvmath.device import FFT, Dim3 from common import random_complex import functools @@ -22,25 +22,17 @@ def main(): ept_x = 16 fpb_x = 8 - FFT_base = functools.partial( - fft, fft_type="c2c", direction="forward", precision=np.float32, execution="Block", compiler="numba" - ) - FFT_y = FFT_base(size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) - FFT_x = FFT_base(size=fft_size_x, elements_per_thread=ept_x, ffts_per_block=fpb_x) - - value_type = FFT_x.value_type - storage_size_x = FFT_x.storage_size - storage_size_y = FFT_y.storage_size - stride_x = FFT_x.stride - stride_y = FFT_y.stride + FFT_base = functools.partial(FFT, fft_type="c2c", direction="forward", precision=np.float32, execution="Block") + fft_y = FFT_base(size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) + fft_x = FFT_base(size=fft_size_x, elements_per_thread=ept_x, ffts_per_block=fpb_x) grid_dim_y = Dim3(fft_size_x // fpb_y, 1, 1) grid_dim_x = Dim3(fft_size_y // fpb_x, 1, 1) - @cuda.jit(link=FFT_y.files) + @cuda.jit def f_y(input, output): - thread_data = cuda.local.array(shape=(storage_size_y,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft_y.storage_size,), dtype=fft_y.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft_y.value_type) local_fft_id = cuda.threadIdx.y fft_id = cuda.blockIdx.x * fpb_y + local_fft_id @@ -48,19 +40,19 @@ def f_y(input, output): index = cuda.threadIdx.x for i in range(ept_y): thread_data[i] = input[fft_id, index] - index += stride_y + index += fft_y.stride - FFT_y(thread_data, shared_mem) + fft_y.execute(thread_data, shared_mem) index = cuda.threadIdx.x for i in range(ept_y): output[fft_id, index] = thread_data[i] - index += stride_y + index += fft_y.stride - @cuda.jit(link=FFT_x.files) + @cuda.jit def f_x(input, output): - thread_data = cuda.local.array(shape=(storage_size_x,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft_x.storage_size,), dtype=fft_x.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft_x.value_type) local_fft_id = cuda.threadIdx.y fft_id = cuda.blockIdx.x * fpb_x + local_fft_id @@ -68,14 +60,14 @@ def f_x(input, output): index = cuda.threadIdx.x for i in range(ept_x): thread_data[i] = input[index, fft_id] - index += stride_x + 
index += fft_x.stride - FFT_x(thread_data, shared_mem) + fft_x.execute(thread_data, shared_mem) index = cuda.threadIdx.x for i in range(ept_x): output[index, fft_id] = thread_data[i] - index += stride_x + index += fft_x.stride input = random_complex((fft_size_x, fft_size_y), real_dtype=np.float32) output = np.zeros((fft_size_x, fft_size_y), dtype=np.complex64) @@ -84,8 +76,8 @@ def f_x(input, output): print("input [:10,:10]:", input[:10, :10]) - f_y[grid_dim_y, FFT_y.block_dim, 0, FFT_y.shared_memory_size](input_d, output_d) - f_x[grid_dim_x, FFT_x.block_dim, 0, FFT_x.shared_memory_size](output_d, output_d) + f_y[grid_dim_y, fft_y.block_dim, 0, fft_y.shared_memory_size](input_d, output_d) + f_x[grid_dim_x, fft_x.block_dim, 0, fft_x.shared_memory_size](output_d, output_d) cuda.synchronize() output_test = output_d.copy_to_host() diff --git a/examples/device/cufftdx_fft_2d_r2c_c2r.py b/examples/device/cufftdx_fft_2d_r2c_c2r.py index 49d41bc..f7f418d 100644 --- a/examples/device/cufftdx_fft_2d_r2c_c2r.py +++ b/examples/device/cufftdx_fft_2d_r2c_c2r.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT from common import random_real import functools @@ -22,32 +22,32 @@ def main(): ept_x = 16 fpb_x = 8 - FFT_base = functools.partial(fft, precision=np.float32, execution="Block", compiler="numba") + FFT_base = functools.partial(FFT, precision=np.float32, execution="Block") # R2C along Y (fft_size_x batches, logical FFT size is fft_size_y, complex size is fft_size_y//2+1) # noqa: W505 - FFT_y_r2c = FFT_base(fft_type="r2c", size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) + fft_y_r2c = FFT_base(fft_type="r2c", size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) # C2Cf along X (fft_size_y//2+1 batches, logical FFT size is fft_size_x) - FFT_x_c2c_f = FFT_base( + fft_x_c2c_f = FFT_base( fft_type="c2c", direction="inverse", size=fft_size_x, elements_per_thread=ept_x, ffts_per_block=fpb_x ) # C2Ci along X (fft_size_y//2+1 batches, logical FFT size is fft_size_x) - FFT_x_c2c_i = FFT_base( + fft_x_c2c_i = FFT_base( fft_type="c2c", direction="forward", size=fft_size_x, elements_per_thread=ept_x, ffts_per_block=fpb_x ) # C2R along Y (fft_size_x batches, logical FFT size is fft_size_y, complex size is fft_size_y//2+1) # noqa: W505 - FFT_y_c2r = FFT_base(fft_type="c2r", size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) + fft_y_c2r = FFT_base(fft_type="c2r", size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) - complex_type = FFT_y_r2c.value_type + complex_type = fft_y_r2c.value_type real_type = np.float32 - storage_size_r2c = FFT_y_r2c.storage_size - storage_size_c2c = max(FFT_x_c2c_f.storage_size, FFT_x_c2c_i.storage_size) - storage_size_c2r = FFT_y_c2r.storage_size - stride_r2c = FFT_y_r2c.stride - stride_c2c = FFT_x_c2c_f.stride - stride_c2r = FFT_y_c2r.stride + storage_size_r2c = fft_y_r2c.storage_size + storage_size_c2c = max(fft_x_c2c_f.storage_size, fft_x_c2c_i.storage_size) + storage_size_c2r = fft_y_c2r.storage_size + stride_r2c = fft_y_r2c.stride + stride_c2c = fft_x_c2c_f.stride + stride_c2r = fft_y_c2r.stride - assert FFT_x_c2c_f.stride == FFT_x_c2c_i.stride - assert FFT_x_c2c_f.block_dim == FFT_x_c2c_i.block_dim - assert FFT_x_c2c_f.shared_memory_size == FFT_x_c2c_i.shared_memory_size + assert fft_x_c2c_f.stride == fft_x_c2c_i.stride + assert fft_x_c2c_f.block_dim == fft_x_c2c_i.block_dim + assert fft_x_c2c_f.shared_memory_size == fft_x_c2c_i.shared_memory_size 
grid_dim_y = (fft_size_x + fpb_y - 1) // fpb_y grid_dim_x = ((fft_size_y // 2 + 1) + fpb_x - 1) // fpb_x @@ -57,7 +57,7 @@ def main(): # stride_x = FFT_x.stride # stride_y = FFT_y.stride - @cuda.jit(link=FFT_y_r2c.files) + @cuda.jit def f_y_r2c(input, output): thread_data = cuda.local.array(shape=(storage_size_r2c,), dtype=complex_type) thread_data_real = thread_data.view(real_type) @@ -74,7 +74,7 @@ def f_y_r2c(input, output): thread_data_real[i] = input[fft_id, index] index += stride_r2c - FFT_y_r2c(thread_data, shared_mem) + fft_y_r2c.execute(thread_data, shared_mem) index = cuda.threadIdx.x for i in range(ept_y): @@ -82,7 +82,7 @@ def f_y_r2c(input, output): output[fft_id, index] = thread_data[i] index += stride_r2c - @cuda.jit(link=FFT_x_c2c_f.files + FFT_x_c2c_i.files) + @cuda.jit def f_x(input, output): thread_data = cuda.local.array(shape=(storage_size_c2c,), dtype=complex_type) shared_mem = cuda.shared.array(shape=(0,), dtype=complex_type) @@ -98,7 +98,7 @@ def f_x(input, output): thread_data[i] = input[index, fft_id] index += stride_c2c - FFT_x_c2c_f(thread_data, shared_mem) + fft_x_c2c_f.execute(thread_data, shared_mem) # Can do some elementwise operation here # index = cuda.threadIdx.x @@ -107,7 +107,7 @@ def f_x(input, output): # thread_data[i] = thread_data[i] / (fft_size_x * fft_size_y) # index += stride_c2c - FFT_x_c2c_i(thread_data, shared_mem) + fft_x_c2c_i.execute(thread_data, shared_mem) index = cuda.threadIdx.x for i in range(ept_x): @@ -115,7 +115,7 @@ def f_x(input, output): output[index, fft_id] = thread_data[i] index += stride_c2c - @cuda.jit(link=FFT_y_c2r.files) + @cuda.jit def f_y_c2r(input, output): thread_data = cuda.local.array(shape=(storage_size_c2r,), dtype=complex_type) thread_data_real = thread_data.view(real_type) @@ -132,7 +132,7 @@ def f_y_c2r(input, output): thread_data[i] = input[fft_id, index] index += stride_c2r - FFT_y_c2r(thread_data, shared_mem) + fft_y_c2r.execute(thread_data, shared_mem) index = cuda.threadIdx.x for i in range(ept_y): @@ -148,9 +148,9 @@ def f_y_c2r(input, output): print("real (input) [:10,:10]:", real[:10, :10]) - f_y_r2c[grid_dim_y, FFT_y_r2c.block_dim, 0, FFT_y_r2c.shared_memory_size](real_d, complex_d) - f_x[grid_dim_x, FFT_x_c2c_f.block_dim, 0, FFT_x_c2c_f.shared_memory_size](complex_d, complex_d) - f_y_c2r[grid_dim_y, FFT_y_c2r.block_dim, 0, FFT_y_c2r.shared_memory_size](complex_d, real_d) + f_y_r2c[grid_dim_y, fft_y_r2c.block_dim, 0, fft_y_r2c.shared_memory_size](real_d, complex_d) + f_x[grid_dim_x, fft_x_c2c_f.block_dim, 0, fft_x_c2c_f.shared_memory_size](complex_d, complex_d) + f_y_c2r[grid_dim_y, fft_y_c2r.block_dim, 0, fft_y_c2r.shared_memory_size](complex_d, real_d) cuda.synchronize() real_test = real_d.copy_to_host() diff --git a/examples/device/cufftdx_fft_2d_single_kernel.py b/examples/device/cufftdx_fft_2d_single_kernel.py index fcfd3fc..447c69e 100644 --- a/examples/device/cufftdx_fft_2d_single_kernel.py +++ b/examples/device/cufftdx_fft_2d_single_kernel.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT from common import random_complex import functools @@ -22,23 +22,19 @@ def main(): ept_x = 8 fpb_x = fpb_y - FFT_base = functools.partial( - fft, fft_type="c2c", direction="forward", precision=np.float32, execution="Block", compiler="numba" - ) - FFT_y = FFT_base(size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) - FFT_x = FFT_base(size=fft_size_x, elements_per_thread=ept_x, ffts_per_block=fpb_x) + FFT_base = 
functools.partial(FFT, fft_type="c2c", direction="forward", precision=np.float32, execution="Block") + fft_y = FFT_base(size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) + fft_x = FFT_base(size=fft_size_x, elements_per_thread=ept_x, ffts_per_block=fpb_x) - value_type = FFT_y.value_type - storage_size = max(FFT_x.storage_size, FFT_y.storage_size) - shared_memory_size = max(FFT_x.shared_memory_size, FFT_y.shared_memory_size) - stride_x = FFT_x.stride - stride_y = FFT_y.stride + value_type = fft_y.value_type + storage_size = max(fft_x.storage_size, fft_y.storage_size) + shared_memory_size = max(fft_x.shared_memory_size, fft_y.shared_memory_size) - assert FFT_x.block_dim == FFT_y.block_dim - block_dim = FFT_x.block_dim + assert fft_x.block_dim == fft_y.block_dim + block_dim = fft_x.block_dim grid_dim = (max(fft_size_y // fpb_y, fft_size_x // fpb_x), 1, 1) - @cuda.jit(link=FFT_x.files + FFT_y.files) + @cuda.jit def f(input, output): thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) @@ -52,15 +48,15 @@ def f(input, output): for i in range(ept_y): thread_data[i] = input[fft_id, index] # fast_copy(input, fft_id * fft_size_y + index, thread_data, i) - index += stride_y + index += fft_y.stride - FFT_y(thread_data, shared_mem) + fft_y.execute(thread_data, shared_mem) index = cuda.threadIdx.x for i in range(ept_y): output[fft_id, index] = thread_data[i] # fast_copy(thread_data, i, output, fft_id * fft_size_y + index) - index += stride_y + index += fft_y.stride ## Grid sync g = cuda.cg.this_grid() @@ -75,17 +71,17 @@ def f(input, output): for i in range(ept_x): thread_data[i] = output[index, fft_id] # fast_copy(output, index * fft_size_y + fft_id, thread_data, i) - index += stride_x + index += fft_x.stride # Compute - FFT_x(thread_data, shared_mem) + fft_x.execute(thread_data, shared_mem) # Store index = cuda.threadIdx.x for i in range(ept_x): output[index, fft_id] = thread_data[i] # fast_copy(thread_data, i, output, index * fft_size_y + fft_id) - index += stride_x + index += fft_x.stride input = random_complex((fft_size_x, fft_size_y), real_dtype=np.float32) output = np.zeros((fft_size_x, fft_size_y), dtype=np.complex64) diff --git a/examples/device/cufftdx_fft_3d_box_single_block.py b/examples/device/cufftdx_fft_3d_box_single_block.py index 5df13b8..52398aa 100644 --- a/examples/device/cufftdx_fft_3d_box_single_block.py +++ b/examples/device/cufftdx_fft_3d_box_single_block.py @@ -8,7 +8,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft, Dim3 +from nvmath.device import FFT, Dim3 from common import random_complex import functools @@ -18,29 +18,27 @@ def main(): fft_size_y = 15 fft_size_z = 14 - FFT_base = functools.partial( - fft, fft_type="c2c", direction="forward", precision=np.float32, execution="Thread", compiler="numba" - ) - FFT_x = FFT_base(size=fft_size_x) - FFT_y = FFT_base(size=fft_size_y) - FFT_z = FFT_base(size=fft_size_z) + FFT_base = functools.partial(FFT, fft_type="c2c", direction="forward", precision=np.float32, execution="Thread") + fft_x = FFT_base(size=fft_size_x) + fft_y = FFT_base(size=fft_size_y) + fft_z = FFT_base(size=fft_size_z) - value_type = FFT_x.value_type + value_type = fft_x.value_type max_dim = max(fft_size_x, fft_size_y, fft_size_z) block_dim = Dim3(max_dim, max_dim, 1) shared_memory_size = (fft_size_x * fft_size_y * fft_size_z) * np.complex64(1.0).itemsize - storage_size = max(FFT_x.storage_size, FFT_y.storage_size, FFT_z.storage_size) + 
storage_size = max(fft_x.storage_size, fft_y.storage_size, fft_z.storage_size) grid_dim = Dim3(1, 1, 1) - eptx = FFT_x.elements_per_thread - epty = FFT_y.elements_per_thread # codespell:ignore epty - eptz = FFT_z.elements_per_thread + eptx = fft_x.elements_per_thread + epty = fft_y.elements_per_thread # codespell:ignore epty + eptz = fft_z.elements_per_thread stride_x = fft_size_y * fft_size_z stride_y = fft_size_z stride_z = 1 - @cuda.jit(link=FFT_x.files + FFT_y.files + FFT_z.files) + @cuda.jit def f(input, output): thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) @@ -56,7 +54,7 @@ def f(input, output): # fast_copy(input, i * stride_x + tidy * stride_y + tidx * stride_z, thread_data, i) # noqa: W505 thread_data[i] = input[i, tidy, tidx] - FFT_x(thread_data) + fft_x.execute(thread_data) index = tidy * stride_y + tidx * stride_z for i in range(eptx): @@ -74,7 +72,7 @@ def f(input, output): thread_data[i] = shared_mem[index] index += stride_y - FFT_y(thread_data) + fft_y.execute(thread_data) index = tidy * stride_x + tidx for i in range(epty): # codespell:ignore epty @@ -95,7 +93,7 @@ def f(input, output): thread_data[i] = shared_mem[index] index += stride_z - FFT_z(thread_data) + fft_z(thread_data) # Reshuffle in shared index = tidy * stride_x + tidx * stride_y diff --git a/examples/device/cufftdx_fft_3d_cube_single_block.py b/examples/device/cufftdx_fft_3d_cube_single_block.py index c6d02c8..676f77a 100644 --- a/examples/device/cufftdx_fft_3d_cube_single_block.py +++ b/examples/device/cufftdx_fft_3d_cube_single_block.py @@ -8,60 +8,55 @@ import numpy as np from numba import cuda -from nvmath.device import fft, Dim3 +from nvmath.device import FFT, Dim3 from common import random_complex def main(): - fft_size = 16 + fft = FFT(fft_type="c2c", size=16, direction="forward", precision=np.float32, execution="Thread") - FFT = fft(fft_type="c2c", size=fft_size, direction="forward", precision=np.float32, execution="Thread", compiler="numba") - - block_dim = Dim3(fft_size, fft_size, 1) + block_dim = Dim3(fft.size, fft.size, 1) grid_dim = Dim3(1, 1, 1) - storage_size = FFT.storage_size - value_type = FFT.value_type - shared_memory_size = fft_size * fft_size * fft_size * np.complex64(1).itemsize - elements_per_thread = FFT.elements_per_thread + shared_memory_size = fft.size * fft.size * fft.size * np.complex64(1).itemsize - stride_x = fft_size * fft_size - stride_y = fft_size + stride_x = fft.size * fft.size + stride_y = fft.size - @cuda.jit(link=FFT.files) + @cuda.jit def f(input, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) ## Load j, k = cuda.threadIdx.y, cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): thread_data[i] = input[i, j, k] ## FFT along X - FFT(thread_data) + fft.execute(thread_data) # Exchange/transpose via shared memory - index = cuda.threadIdx.x + cuda.threadIdx.y * fft_size - for i in range(elements_per_thread): + index = cuda.threadIdx.x + cuda.threadIdx.y * fft.size + for i in range(fft.elements_per_thread): shared_mem[index] = thread_data[i] index += stride_x cuda.syncthreads() - index = cuda.threadIdx.x + cuda.threadIdx.y * fft_size * fft_size - for i in range(elements_per_thread): + index = 
cuda.threadIdx.x + cuda.threadIdx.y * fft.size * fft.size + for i in range(fft.elements_per_thread): thread_data[i] = shared_mem[index] index += stride_y # FFT along Y - FFT(thread_data) + fft.execute(thread_data) # Exchange/transpose via shared memory - index = cuda.threadIdx.x + cuda.threadIdx.y * fft_size * fft_size - for i in range(elements_per_thread): + index = cuda.threadIdx.x + cuda.threadIdx.y * fft.size * fft.size + for i in range(fft.elements_per_thread): shared_mem[index] = thread_data[i] index += stride_y cuda.syncthreads() - index = (cuda.threadIdx.x + cuda.threadIdx.y * fft_size) * fft_size - for i in range(elements_per_thread): + index = (cuda.threadIdx.x + cuda.threadIdx.y * fft.size) * fft.size + for i in range(fft.elements_per_thread): thread_data[i] = shared_mem[index] index += 1 # for i in range(0, elements_per_thread, 2): # Manually vectorized @@ -69,29 +64,29 @@ def f(input, output): # index += 2 # FFT along Z - FFT(thread_data) + fft.execute(thread_data) # Shared memory IO - exchange data to store with coalesced stores - index = (cuda.threadIdx.x + cuda.threadIdx.y * fft_size) * fft_size - for i in range(elements_per_thread): + index = (cuda.threadIdx.x + cuda.threadIdx.y * fft.size) * fft.size + for i in range(fft.elements_per_thread): shared_mem[index] = thread_data[i] index += 1 # for i in range(0, elements_per_thread, 2): # Manually vectorized # fast_copy_2x(thread_data, i, shared_mem, index) # index += 2 cuda.syncthreads() - index = cuda.threadIdx.x + cuda.threadIdx.y * fft_size - for i in range(elements_per_thread): + index = cuda.threadIdx.x + cuda.threadIdx.y * fft.size + for i in range(fft.elements_per_thread): thread_data[i] = shared_mem[index] index += stride_x # Store j, k = cuda.threadIdx.y, cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): output[i, j, k] = thread_data[i] - input = random_complex((fft_size, fft_size, fft_size), real_dtype=np.float32) - output = np.zeros((fft_size, fft_size, fft_size), dtype=np.complex64) + input = random_complex((fft.size, fft.size, fft.size), real_dtype=np.float32) + output = np.zeros((fft.size, fft.size, fft.size), dtype=np.complex64) input_d = cuda.to_device(input) output_d = cuda.to_device(output) diff --git a/examples/device/cufftdx_helloworld.py b/examples/device/cufftdx_helloworld.py index e873e48..a146765 100644 --- a/examples/device/cufftdx_helloworld.py +++ b/examples/device/cufftdx_helloworld.py @@ -8,48 +8,36 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT def main(): - size = 1024 + fft = FFT(fft_type="c2c", size=1024, precision=np.float32, direction="forward", execution="Block") - FFT = fft(fft_type="c2c", size=size, precision=np.float32, direction="forward", execution="Block", compiler="numba") - - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - files = FFT.files - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - - @cuda.jit(link=files) + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y - fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id + fft_id = cuda.blockIdx.x * fft.ffts_per_block + local_fft_id index = cuda.threadIdx.x - for i in range(elements_per_thread): + for 
i in range(fft.elements_per_thread): thread_data[i] = data[fft_id, index] - index += stride + index += fft.stride - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) - FFT(thread_data, shared_mem) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): data[fft_id, index] = thread_data[i] - index += stride + index += fft.stride - data = np.ones((ffts_per_block, size), dtype=np.complex64) + data = np.ones((fft.ffts_per_block, fft.size), dtype=np.complex64) data_d = cuda.to_device(data) - f[1, block_dim, 0, shared_memory_size](data_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](data_d) cuda.synchronize() data_test = data_d.copy_to_host() diff --git a/examples/device/cufftdx_simple_fft_block.py b/examples/device/cufftdx_simple_fft_block.py index 957eeba..d16a7ab 100644 --- a/examples/device/cufftdx_simple_fft_block.py +++ b/examples/device/cufftdx_simple_fft_block.py @@ -8,11 +8,11 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT def main(): - FFT = fft( + fft = FFT( fft_type="c2c", size=128, precision=np.float32, @@ -20,43 +20,33 @@ def main(): elements_per_thread=8, ffts_per_block=2, execution="Block", - compiler="numba", ) - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - - @cuda.jit(link=FFT.files) + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): thread_data[i] = data[local_fft_id, index] - index += stride + index += fft.stride - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): data[local_fft_id, index] = thread_data[i] - index += stride + index += fft.stride - data = np.ones((ffts_per_block, size), dtype=np.complex64) + data = np.ones((fft.ffts_per_block, fft.size), dtype=np.complex64) data_d = cuda.to_device(data) print("input [1st FFT]:", data[0, :]) - f[1, block_dim, 0, shared_memory_size](data_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](data_d) cuda.synchronize() data_test = data_d.copy_to_host() diff --git a/examples/device/cufftdx_simple_fft_block_c2r.py b/examples/device/cufftdx_simple_fft_block_c2r.py index b92444f..6cfc950 100644 --- a/examples/device/cufftdx_simple_fft_block_c2r.py +++ b/examples/device/cufftdx_simple_fft_block_c2r.py @@ -8,66 +8,56 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT def main(): - FFT = fft( + fft = FFT( fft_type="c2r", size=128, precision=np.float32, elements_per_thread=8, ffts_per_block=2, execution="Block", - compiler="numba", ) - fft_size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block 
= FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - - @cuda.jit(link=FFT.files) + @cuda.jit def f(input, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) thread_data_real = thread_data.view(np.float32) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < fft_size // 2 + 1: + for i in range(fft.elements_per_thread): + if index < fft.size // 2 + 1: thread_data[i] = input[local_fft_id, index] - index += stride + index += fft.stride - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < fft_size: + for i in range(fft.elements_per_thread): + if index < fft.size: output[local_fft_id, index] = thread_data_real[i] - index += stride + index += fft.stride - input = np.ones((ffts_per_block, fft_size // 2 + 1), dtype=np.complex64) - output = np.zeros((ffts_per_block, fft_size), dtype=np.float32) + input = np.ones((fft.ffts_per_block, fft.size // 2 + 1), dtype=np.complex64) + output = np.zeros((fft.ffts_per_block, fft.size), dtype=np.float32) input_d = cuda.to_device(input) output_d = cuda.to_device(output) print("input [1st FFT]:", input[0, :]) - f[1, block_dim, 0, shared_memory_size](input_d, output_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](input_d, output_d) cuda.synchronize() output_test = output_d.copy_to_host() print("output [1st FFT]:", output_test[0, :]) - data_ref = np.fft.irfft(input, axis=-1, n=fft_size, norm="forward") + data_ref = np.fft.irfft(input, axis=-1, n=fft.size, norm="forward") error = np.linalg.norm(output_test - data_ref) / np.linalg.norm(data_ref) assert error < 1e-5 diff --git a/examples/device/cufftdx_simple_fft_block_c2r_fp16.py b/examples/device/cufftdx_simple_fft_block_c2r_fp16.py index 9e7d6e7..9c97f4d 100644 --- a/examples/device/cufftdx_simple_fft_block_c2r_fp16.py +++ b/examples/device/cufftdx_simple_fft_block_c2r_fp16.py @@ -8,77 +8,69 @@ import numpy as np from numba import cuda -from nvmath.device import fft, float16x4, float16x2_type +from nvmath.device import FFT, float16x4, float16x2_type from common import random_real, complex64_to_fp16x2 def main(): - FFT = fft( + fft = FFT( fft_type="c2r", size=128, precision=np.float16, ffts_per_block=4, elements_per_thread=8, execution="Block", - compiler="numba", ) - size = FFT.size - stride = FFT.stride - value_type = FFT.value_type - storage_size = FFT.storage_size - elements_per_thread = FFT.elements_per_thread - implicit_type_batching = FFT.implicit_type_batching - ffts_per_block = FFT.ffts_per_block - assert implicit_type_batching == 2 - assert ffts_per_block % implicit_type_batching == 0 - - @cuda.jit(link=FFT.files) + assert fft.implicit_type_batching == 2 + assert fft.ffts_per_block % fft.implicit_type_batching == 0 + + @cuda.jit def f(input, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) thread_data_real = thread_data.view(float16x2_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < size // 2 + 
1: + for i in range(fft.elements_per_thread): + if index < fft.size // 2 + 1: r0 = input[2 * local_fft_id, 2 * index + 0] i0 = input[2 * local_fft_id, 2 * index + 1] r1 = input[2 * local_fft_id + 1, 2 * index + 0] i1 = input[2 * local_fft_id + 1, 2 * index + 1] thread_data[i] = float16x4(r0, r1, i0, i1) - index += stride + index += fft.stride - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < size: + for i in range(fft.elements_per_thread): + if index < fft.size: rr = thread_data_real[i] output[2 * local_fft_id, index] = rr.x output[2 * local_fft_id + 1, index] = rr.y - index += stride + index += fft.stride # Numpy has no FP16 complex, so we create a 2xlarger arrays of FP16 reals # Each consecutive pair of reals form one logical FP16 complex number - input = np.fft.rfft(random_real((ffts_per_block, size), real_dtype=np.float32)) - output_fp16 = np.zeros((ffts_per_block, size), dtype=np.float16) + input = np.fft.rfft(random_real((fft.ffts_per_block, fft.size), real_dtype=np.float32)) + output_fp16 = np.zeros((fft.ffts_per_block, fft.size), dtype=np.float16) input_fp16 = complex64_to_fp16x2(input) input_d = cuda.to_device(input_fp16) output_d = cuda.to_device(output_fp16) print("input [1st FFT]:", input[0, :]) - f[1, FFT.block_dim, 0, FFT.shared_memory_size](input_d, output_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](input_d, output_d) cuda.synchronize() data_test = output_d.copy_to_host() print("output [1st FFT]:", data_test[0, :]) - data_ref = np.fft.irfft(input, axis=-1, n=size, norm="forward") + data_ref = np.fft.irfft(input, axis=-1, n=fft.size, norm="forward") error = np.linalg.norm(data_test - data_ref) / np.linalg.norm(data_ref) assert error < 1e-2 diff --git a/examples/device/cufftdx_simple_fft_block_half2.py b/examples/device/cufftdx_simple_fft_block_half2.py index f2a4d3b..c7ef694 100644 --- a/examples/device/cufftdx_simple_fft_block_half2.py +++ b/examples/device/cufftdx_simple_fft_block_half2.py @@ -8,12 +8,12 @@ import numpy as np from numba import cuda -from nvmath.device import fft, float16x4 +from nvmath.device import FFT, float16x4 from common import random_complex, fp16x2_to_complex64, complex64_to_fp16x2 def main(): - FFT = fft( + fft = FFT( fft_type="c2c", size=128, precision=np.float16, @@ -21,58 +21,50 @@ def main(): elements_per_thread=8, direction="forward", execution="Block", - compiler="numba", ) - size = FFT.size - stride = FFT.stride - value_type = FFT.value_type - storage_size = FFT.storage_size - elements_per_thread = FFT.elements_per_thread - implicit_type_batching = FFT.implicit_type_batching - ffts_per_block = FFT.ffts_per_block - assert implicit_type_batching == 2 - assert ffts_per_block % implicit_type_batching == 0 - - @cuda.jit(link=FFT.files) + assert fft.implicit_type_batching == 2 + assert fft.ffts_per_block % fft.implicit_type_batching == 0 + + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < size: + for i in range(fft.elements_per_thread): + if index < fft.size: r0 = data[2 * local_fft_id, 2 * index + 0] i0 = data[2 * local_fft_id, 2 * index + 1] r1 = data[2 * local_fft_id + 1, 2 * 
index + 0] i1 = data[2 * local_fft_id + 1, 2 * index + 1] thread_data[i] = float16x4(r0, r1, i0, i1) - index += stride + index += fft.stride - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < size: + for i in range(fft.elements_per_thread): + if index < fft.size: rrii = thread_data[i] r0, r1, i0, i1 = rrii.x, rrii.y, rrii.z, rrii.w data[2 * local_fft_id, 2 * index + 0] = r0 data[2 * local_fft_id, 2 * index + 1] = i0 data[2 * local_fft_id + 1, 2 * index + 0] = r1 data[2 * local_fft_id + 1, 2 * index + 1] = i1 - index += stride + index += fft.stride # Numpy has no FP16 complex, so we create a 2xlarger arrays of FP16 reals # Each consecutive pair of reals form one logical FP16 complex number - data = random_complex((ffts_per_block, size), real_dtype=np.float32) + data = random_complex((fft.ffts_per_block, fft.size), real_dtype=np.float32) data_fp16 = complex64_to_fp16x2(data) data_d = cuda.to_device(data_fp16) print("input [1st FFT]:", data[0, :]) - f[1, FFT.block_dim, 0, FFT.shared_memory_size](data_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](data_d) cuda.synchronize() data_test = fp16x2_to_complex64(data_d.copy_to_host()) diff --git a/examples/device/cufftdx_simple_fft_block_r2c.py b/examples/device/cufftdx_simple_fft_block_r2c.py index 38232d8..aaf908b 100644 --- a/examples/device/cufftdx_simple_fft_block_r2c.py +++ b/examples/device/cufftdx_simple_fft_block_r2c.py @@ -8,59 +8,49 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT def main(): - FFT = fft( + fft = FFT( fft_type="r2c", size=128, precision=np.float32, elements_per_thread=8, ffts_per_block=2, execution="Block", - compiler="numba", ) - fft_size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - - @cuda.jit(link=FFT.files) + @cuda.jit def f(input, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) thread_data_real = thread_data.view(np.float32) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < fft_size: + for i in range(fft.elements_per_thread): + if index < fft.size: thread_data_real[i] = input[local_fft_id, index] - index += stride + index += fft.stride - FFT(thread_data, shared_mem) + fft(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < fft_size // 2 + 1: + for i in range(fft.elements_per_thread): + if index < fft.size // 2 + 1: output[local_fft_id, index] = thread_data[i] - index += stride + index += fft.stride - input = np.ones((ffts_per_block, fft_size), dtype=np.float32) - output = np.zeros((ffts_per_block, fft_size // 2 + 1), dtype=np.complex64) + input = np.ones((fft.ffts_per_block, fft.size), dtype=np.float32) + output = np.zeros((fft.ffts_per_block, fft.size // 2 + 1), dtype=np.complex64) input_d = cuda.to_device(input) output_d = cuda.to_device(output) print("input [1st FFT]:", input[0, :]) - f[1, block_dim, 0, shared_memory_size](input_d, output_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](input_d, 
output_d) cuda.synchronize() output_test = output_d.copy_to_host() diff --git a/examples/device/cufftdx_simple_fft_block_r2c_fp16.py b/examples/device/cufftdx_simple_fft_block_r2c_fp16.py index 78c15f9..0ecfcd4 100644 --- a/examples/device/cufftdx_simple_fft_block_r2c_fp16.py +++ b/examples/device/cufftdx_simple_fft_block_r2c_fp16.py @@ -8,71 +8,63 @@ import numpy as np from numba import cuda -from nvmath.device import fft, float16x2, float16x2_type +from nvmath.device import FFT, float16x2, float16x2_type from common import random_real, fp16x2_to_complex64 def main(): - FFT = fft( + fft = FFT( fft_type="r2c", size=128, precision=np.float16, ffts_per_block=4, elements_per_thread=8, execution="Block", - compiler="numba", ) - size = FFT.size - stride = FFT.stride - value_type = FFT.value_type - storage_size = FFT.storage_size - elements_per_thread = FFT.elements_per_thread - implicit_type_batching = FFT.implicit_type_batching - ffts_per_block = FFT.ffts_per_block - assert implicit_type_batching == 2 - assert ffts_per_block % implicit_type_batching == 0 - - @cuda.jit(link=FFT.files) + assert fft.implicit_type_batching == 2 + assert fft.ffts_per_block % fft.implicit_type_batching == 0 + + @cuda.jit def f(input, output): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) thread_data_real = thread_data.view(float16x2_type) - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < size: + for i in range(fft.elements_per_thread): + if index < fft.size: r0 = input[2 * local_fft_id, index] r1 = input[2 * local_fft_id + 1, index] thread_data_real[i] = float16x2(r0, r1) - index += stride + index += fft.stride - FFT(thread_data, shared_mem) + fft.execute(thread_data, shared_mem) index = cuda.threadIdx.x - for i in range(elements_per_thread): - if index < size // 2 + 1: + for i in range(fft.elements_per_thread): + if index < fft.size // 2 + 1: rrii = thread_data[i] r0, r1, i0, i1 = rrii.x, rrii.y, rrii.z, rrii.w output[2 * local_fft_id, 2 * index + 0] = r0 output[2 * local_fft_id, 2 * index + 1] = i0 output[2 * local_fft_id + 1, 2 * index + 0] = r1 output[2 * local_fft_id + 1, 2 * index + 1] = i1 - index += stride + index += fft.stride # Numpy has no FP16 complex, so we create a 2xlarger arrays of FP16 reals # Each consecutive pair of reals form one logical FP16 complex number - input = random_real((ffts_per_block, size), real_dtype=np.float32) - output = np.zeros((ffts_per_block, 2 * (size // 2 + 1)), dtype=np.float16) + input = random_real((fft.ffts_per_block, fft.size), real_dtype=np.float32) + output = np.zeros((fft.ffts_per_block, 2 * (fft.size // 2 + 1)), dtype=np.float16) input_d = cuda.to_device(input) output_d = cuda.to_device(output) print("input [1st FFT]:", input[0, :]) - f[1, FFT.block_dim, 0, FFT.shared_memory_size](input_d, output_d) + f[1, fft.block_dim, 0, fft.shared_memory_size](input_d, output_d) cuda.synchronize() data_test = fp16x2_to_complex64(output_d.copy_to_host()) diff --git a/examples/device/cufftdx_simple_fft_block_shared.py b/examples/device/cufftdx_simple_fft_block_shared.py index dc71fe9..426de76 100644 --- a/examples/device/cufftdx_simple_fft_block_shared.py +++ b/examples/device/cufftdx_simple_fft_block_shared.py @@ -8,11 +8,11 @@ import numpy as np from numba import cuda -from 
nvmath.device import fft +from nvmath.device import FFT def main(): - FFT = fft( + fft = FFT( fft_type="c2c", size=128, precision=np.float32, @@ -20,45 +20,36 @@ def main(): elements_per_thread=8, ffts_per_block=2, execution="Block", - compiler="numba", - execute_api="shared_memory", ) - size = FFT.size - value_type = FFT.value_type - shared_memory_size = max(FFT.shared_memory_size, np.complex64(1.0).itemsize * size) - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread - - @cuda.jit(link=FFT.files) + @cuda.jit def f(data): - shared_mem = cuda.shared.array(shape=(0,), dtype=value_type) + shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.y index = cuda.threadIdx.x - for _ in range(elements_per_thread): - shared_mem[local_fft_id * size + index] = data[local_fft_id, index] - index += stride + for _ in range(fft.elements_per_thread): + shared_mem[local_fft_id * fft.size + index] = data[local_fft_id, index] + index += fft.stride cuda.syncthreads() - FFT(shared_mem) + fft(shared_mem) cuda.syncthreads() index = cuda.threadIdx.x - for _ in range(elements_per_thread): - data[local_fft_id, index] = shared_mem[local_fft_id * size + index] - index += stride + for _ in range(fft.elements_per_thread): + data[local_fft_id, index] = shared_mem[local_fft_id * fft.size + index] + index += fft.stride - data = np.ones((ffts_per_block, size), dtype=np.complex64) + data = np.ones((fft.ffts_per_block, fft.size), dtype=np.complex64) data_d = cuda.to_device(data) print("input [1st FFT]:", data[0, :]) - f[1, block_dim, 0, shared_memory_size](data_d) + shared_memory_size = max(fft.shared_memory_size, np.complex64(1.0).itemsize * fft.size) + f[1, fft.block_dim, 0, shared_memory_size](data_d) cuda.synchronize() data_test = data_d.copy_to_host() diff --git a/examples/device/cufftdx_simple_fft_thread.py b/examples/device/cufftdx_simple_fft_thread.py index 2a848c3..20e3e48 100644 --- a/examples/device/cufftdx_simple_fft_thread.py +++ b/examples/device/cufftdx_simple_fft_thread.py @@ -8,34 +8,29 @@ import numpy as np from numba import cuda -from nvmath.device import fft +from nvmath.device import FFT def main(): threads_count = 4 - FFT = fft(fft_type="c2c", size=8, precision=np.float64, direction="forward", execution="Thread", compiler="numba") + fft = FFT(fft_type="c2c", size=8, precision=np.float64, direction="forward", execution="Thread") - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - elements_per_thread = FFT.elements_per_thread - - @cuda.jit(link=FFT.files) + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): thread_data[i] = data[local_fft_id, i] - FFT(thread_data) + fft.execute(thread_data) - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): data[local_fft_id, i] = thread_data[i] - data = np.ones((threads_count, size), dtype=np.complex128) + data = np.ones((threads_count, fft.size), dtype=np.complex128) data_d = cuda.to_device(data) print("input [1st FFT]:", data[0, :]) diff --git a/examples/device/cufftdx_simple_fft_thread_fp16.py b/examples/device/cufftdx_simple_fft_thread_fp16.py index 51be9cc..1f2716c 100644 --- a/examples/device/cufftdx_simple_fft_thread_fp16.py +++ 
b/examples/device/cufftdx_simple_fft_thread_fp16.py @@ -8,38 +8,33 @@ import numpy as np from numba import cuda -from nvmath.device import fft, float16x4 +from nvmath.device import FFT, float16x4 from common import random_complex, fp16x2_to_complex64, complex64_to_fp16x2 def main(): threads_count = 3 - FFT = fft(fft_type="c2c", size=8, precision=np.float16, direction="forward", execution="Thread", compiler="numba") + fft = FFT(fft_type="c2c", size=8, precision=np.float16, direction="forward", execution="Thread") - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - elements_per_thread = FFT.elements_per_thread - implicit_type_batching = FFT.implicit_type_batching - assert implicit_type_batching == 2 + assert fft.implicit_type_batching == 2 - @cuda.jit(link=FFT.files) + @cuda.jit def f(data): - thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) + thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type) local_fft_id = cuda.threadIdx.x - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): r0 = data[2 * local_fft_id, 2 * i + 0] i0 = data[2 * local_fft_id, 2 * i + 1] r1 = data[2 * local_fft_id + 1, 2 * i + 0] i1 = data[2 * local_fft_id + 1, 2 * i + 1] thread_data[i] = float16x4(r0, r1, i0, i1) - FFT(thread_data) + fft.execute(thread_data) - for i in range(elements_per_thread): + for i in range(fft.elements_per_thread): rrii = thread_data[i] r0, r1, i0, i1 = rrii.x, rrii.y, rrii.z, rrii.w data[2 * local_fft_id, 2 * i + 0] = r0 @@ -49,7 +44,7 @@ def f(data): # Numpy has no FP16 complex, so we create a 2xlarger arrays of FP16 reals # Each consecutive pair of reals form one logical FP16 complex number - data = random_complex((implicit_type_batching * threads_count, size), real_dtype=np.float32) + data = random_complex((fft.implicit_type_batching * threads_count, fft.size), real_dtype=np.float32) data_fp16 = complex64_to_fp16x2(data) data_d = cuda.to_device(data_fp16) diff --git a/examples/device/curand_cufftdx_block_fft.py b/examples/device/curand_cufftdx_block_fft.py index d2b0774..ccf655c 100644 --- a/examples/device/curand_cufftdx_block_fft.py +++ b/examples/device/curand_cufftdx_block_fft.py @@ -10,7 +10,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft, random, float32x2 +from nvmath.device import FFT, random, float32x2 # Compile the random device APIs for the current device. compiled_random_apis = random.Compile(cc=None) @@ -19,16 +19,7 @@ def main(): size = 64 - FFT = fft(fft_type="c2c", size=size, precision=np.float32, direction="forward", execution="Block", compiler="numba") - - size = FFT.size - value_type = FFT.value_type - storage_size = FFT.storage_size - shared_memory_size = FFT.shared_memory_size - stride = FFT.stride - block_dim = FFT.block_dim - ffts_per_block = FFT.ffts_per_block - elements_per_thread = FFT.elements_per_thread + fft = FFT(fft_type="c2c", size=size, precision=np.float32, direction="forward", execution="Block") # Kernel for initializing the RNG state. @cuda.jit(link=compiled_random_apis.files, extensions=compiled_random_apis.extension) @@ -37,39 +28,39 @@ def setup_random(states): random.init(1234, index_in_block, 0, states[index_in_block]) # Kernel with threads generating local data and performing an FFT. 
-    @cuda.jit(link=FFT.files + compiled_random_apis.files, extensions=compiled_random_apis.extension)
+    @cuda.jit(link=compiled_random_apis.files, extensions=compiled_random_apis.extension)
     def f(data, result, states):
-        thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type)
+        thread_data = cuda.local.array(shape=(fft.storage_size,), dtype=fft.value_type)
         local_fft_id = cuda.threadIdx.y
-        fft_id = cuda.blockIdx.x * ffts_per_block + local_fft_id
+        fft_id = cuda.blockIdx.x * fft.ffts_per_block + local_fft_id
         index = cuda.threadIdx.x
         index_in_block = cuda.threadIdx.x + cuda.blockDim.x * cuda.threadIdx.y
-        for i in range(elements_per_thread):
+        for i in range(fft.elements_per_thread):
             v1 = random.normal(states[index_in_block])
             v2 = random.normal(states[index_in_block])
             data[fft_id, index] = thread_data[i] = float32x2(v1, v2)
-            index += stride
+            index += fft.stride
-        shared_mem = cuda.shared.array(shape=(0,), dtype=value_type)
-        FFT(thread_data, shared_mem)
+        shared_mem = cuda.shared.array(shape=(0,), dtype=fft.value_type)
+        fft.execute(thread_data, shared_mem)
         index = cuda.threadIdx.x
-        for i in range(elements_per_thread):
+        for i in range(fft.elements_per_thread):
             result[fft_id, index] = thread_data[i]
-            index += stride
+            index += fft.stride
     # Create host and device buffers to hold the input data and result, respectively.
-    data = np.empty((ffts_per_block, size), dtype=np.complex64)
+    data = np.empty((fft.ffts_per_block, fft.size), dtype=np.complex64)
     data_d = cuda.to_device(data)
-    result = np.empty((ffts_per_block, size), dtype=np.complex64)
+    result = np.empty((fft.ffts_per_block, fft.size), dtype=np.complex64)
     result_d = cuda.to_device(result)
-    states = random.StatesXORWOW(block_dim.x * block_dim.y)
-    setup_random[1, block_dim](states)
+    states = random.StatesXORWOW(fft.block_dim.x * fft.block_dim.y)
+    setup_random[1, fft.block_dim](states)
-    f[1, block_dim, 0, shared_memory_size](data_d, result_d, states)
+    f[1, fft.block_dim, 0, fft.shared_memory_size](data_d, result_d, states)
     cuda.synchronize()
     data = data_d.copy_to_host()
diff --git a/examples/device/curand_gbm.py b/examples/device/curand_gbm.py
new file mode 100644
index 0000000..0095020
--- /dev/null
+++ b/examples/device/curand_gbm.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example shows how to generate Geometric Brownian Motion (GBM) paths for stock price
+simulation. The normal-distribution sample generator is used on top of the Philox4_32_10
+bit generator, and the resulting samples are turned into GBM paths inside the numba.cuda
+kernel code.
+
+Following recommended practice, the implementation is split into a state initialization
+kernel and a path generation kernel. The generation kernel exploits the fact that the
+Philox4_32_10 generator returns 4 variates at a time, which allows it to produce 4 time
+steps per loop iteration.
+
+To learn more about GBM, see: https://en.wikipedia.org/wiki/Geometric_Brownian_motion.
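+
+As a quick sketch of the math implemented below, each simulated path follows the
+per-step update
+
+    S[t] = S[t-1] * exp(mu + sigma * Z[t]),    Z[t] ~ N(0, 1),    S[0] = s0,
+
+where mu and sigma are the drift and volatility of the log-price per time step. The GPU
+kernel applies this update one step at a time, while the CPU reference accumulates the
+increments mu + sigma * Z[t] with a cumulative sum and exponentiates the result.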
+""" + +from numba import cuda +from nvmath.device import random +import math +import numpy as np +import cupy as cp +from scipy import stats + + +# Pre-compile the random number generator into IR to use alongside other device code +compiled_rng = random.Compile(cc=None) + +# GBM parameters +rng_seed = 7777 # Random seed +n_time_steps = 252 # Trading days in a year +n_paths = 8_000 # Number of simulated paths +mu = 0.003 +sigma = 0.027 +s0 = 100.0 # Initial stock price + +# Set up CUDA kernel launch configuration +threads_per_block = 32 +blocks = n_paths // threads_per_block + bool(n_paths % threads_per_block) +nthreads = threads_per_block * blocks + + +# RNG initialization kernel +@cuda.jit(link=compiled_rng.files, extensions=compiled_rng.extension) +def init_rng_gpu(states, seed): + idx = cuda.grid(1) + random.init(seed, idx, 0, states[idx]) + + +# GBM path generation kernel. Note that the random numbers are generated +# as they are needed, unlike for the CPU implementation where they are +# generated upfront and stored. +@cuda.jit(link=compiled_rng.files, extensions=compiled_rng.extension) +def generate_gbm_paths_gpu(states, paths, nsteps, mu, sigma, s0): + idx = cuda.grid(1) + if idx >= paths.shape[0]: + return + + # Each thread generates one path in the time domain + paths[idx, 0] = s0 + + # Consume 4 normal variates at a time for better throughput + for i in range(1, nsteps, 4): + v = random.normal4(states[idx]) # Returned as float32x4 type + vals = v.x, v.y, v.z, v.w # Decompose into a tuple of float32 + for j in range(i, min(i + 4, nsteps)): # Process a chunk of 4 time steps + paths[idx, j] = paths[idx, j - 1] * math.exp(mu + sigma * vals[j - i]) + + +# Reference GBM path generation on CPU for validation +def generate_gbm_paths_cpu(npaths, nsteps, mu, sigma, s0, seed): + """Generate GBM paths on the CPU and return paths_host. + + This function internally generates Brownian increments and accumulates them to + form the GBM paths (so there's no separate generate_brownian_paths function). 
+    """
+    np.random.seed(seed)
+    dBt = np.random.randn(npaths, nsteps - 1) * sigma + mu
+    dBt = np.insert(dBt, 0, 0.0, axis=1)  # The process starts at 0, so paths[:, 0] == s0
+    Bt = np.cumsum(dBt, axis=1)
+    paths = s0 * np.exp(Bt)
+    return paths
+
+
+def main():
+    # Allocate space for paths
+    paths_device = cp.empty((n_paths, n_time_steps), dtype=cp.float32, order="F")
+
+    # Allocate space for random states
+    states = random.StatesPhilox4_32_10(nthreads)
+
+    # Initialize RNG states for the GPU generator (the CPU reference seeds NumPy internally)
+    init_rng_gpu[blocks, threads_per_block](states, rng_seed)
+
+    # Generate GBM paths on GPU
+    generate_gbm_paths_gpu[blocks, threads_per_block](states, paths_device, n_time_steps, mu, sigma, s0)
+
+    mean_device = cp.mean(paths_device[:, -1])
+    stdev_device = cp.std(paths_device[:, -1])
+    print(f"Mean stock price at maturity (GPU): {mean_device:.2f}, std.dev.: {stdev_device:.2f}")
+
+    # Generate reference GBM paths on CPU
+    paths_host = generate_gbm_paths_cpu(n_paths, n_time_steps, mu, sigma, s0, rng_seed)
+    mean_host = np.mean(paths_host[:, -1])
+    stdev_host = np.std(paths_host[:, -1])
+    print(f"Mean stock price at maturity (CPU): {mean_host:.2f}, std.dev.: {stdev_host:.2f}")
+
+    # Validate results
+    _, p_value = stats.levene(paths_host[:, -1], paths_device.get()[:, -1])  # Levene's test for equal variances
+    assert p_value > 0.05, "The variances are not equal (reject H0) - the test FAILED"
+
+    _, p_value = stats.ttest_ind(paths_host[:, -1], paths_device.get()[:, -1], equal_var=False)  # Welch's t-test for equal means
+    assert p_value > 0.05, "The means are not equal (reject H0) - the test FAILED"
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/distributed/fft/example01_cupy.py b/examples/distributed/fft/example01_cupy.py
index a89421a..df90174 100644
--- a/examples/distributed/fft/example01_cupy.py
+++ b/examples/distributed/fft/example01_cupy.py
@@ -16,13 +16,14 @@
 from mpi4py import MPI
 import nvmath.distributed
+from nvmath.distributed.distribution import Slab
 # Initialize nvmath.distributed.
 comm = MPI.COMM_WORLD
 rank = comm.Get_rank()
 nranks = comm.Get_size()
 device_id = rank % cp.cuda.runtime.getDeviceCount()
-nvmath.distributed.initialize(device_id, comm)
+nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"])
 # The global 3-D FFT size is (512, 256, 512).
 # In this example, the input data is distributed across processes according to
@@ -39,7 +40,7 @@
 # Forward FFT.
 # In this example, the forward FFT operand is distributed according to Slab.X distribution.
 # With reshape=False, the FFT result will be distributed according to Slab.Y distribution.
-b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False})
+b = nvmath.distributed.fft.fft(a, distribution=Slab.X, options={"reshape": False})
 # Distributed FFT performs computations in-place. The result is stored in the same
 # buffer as operand a. Note, however, that operand b has a different shape (due
@@ -52,7 +53,7 @@
 # Recall from previous transform that the inverse FFT operand is distributed according to
 # Slab.Y. With reshape=False, the inverse FFT result will be distributed according to
 # Slab.X distribution.
-c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False})
+c = nvmath.distributed.fft.ifft(b, distribution=Slab.Y, options={"reshape": False})
 # The shape of c is the same as a (due to Slab.X distribution).
Once again, note that # a, b and c are sharing the same symmetric memory buffer (distributed FFT operations diff --git a/examples/distributed/fft/example01_cupy_r2c_c2r.py b/examples/distributed/fft/example01_cupy_r2c_c2r.py index 6dc032b..253d9c4 100644 --- a/examples/distributed/fft/example01_cupy_r2c_c2r.py +++ b/examples/distributed/fft/example01_cupy_r2c_c2r.py @@ -16,13 +16,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global *real* 3-D FFT size is (512, 256, 512). # In this example, the input data is distributed across processes according to @@ -39,7 +40,7 @@ shape, cp, input_dtype=cp.float32, - distribution=nvmath.distributed.fft.Slab.X, + distribution=Slab.X, fft_type="R2C", ) # a is a cupy ndarray and can be operated on using in-place cupy operations. @@ -49,7 +50,7 @@ # R2C (forward) FFT. # In this example, the R2C operand is distributed according to Slab.X distribution. # With reshape=False, the FFT result will be distributed according to Slab.Y distribution. -b = nvmath.distributed.fft.rfft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) +b = nvmath.distributed.fft.rfft(a, distribution=Slab.X, options={"reshape": False}) # Distributed FFT performs computations in-place. The result is stored in the same # buffer as operand a. Note, however, that operand b has a different dtype and shape @@ -62,7 +63,7 @@ # Recall from previous transform that the inverse FFT operand is distributed according to # Slab.Y. With reshape=False, the C2R result will be distributed according to # Slab.X distribution. -c = nvmath.distributed.fft.irfft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) +c = nvmath.distributed.fft.irfft(b, distribution=Slab.Y, options={"reshape": False}) # The shape of c is the same as a (due to Slab.X distribution). Once again, note that # a, b and c are sharing the same symmetric memory buffer (distributed FFT operations diff --git a/examples/distributed/fft/example01_numpy.py b/examples/distributed/fft/example01_numpy.py index a455b2a..308354d 100644 --- a/examples/distributed/fft/example01_numpy.py +++ b/examples/distributed/fft/example01_numpy.py @@ -26,7 +26,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cuda.core.experimental.system.num_devices -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (64, 256, 128). # In this example, the input data is distributed across processes according to @@ -40,7 +40,7 @@ # By default, the reshape option is True, which means that the output of the distributed # FFT will be re-distributed to retain the same distribution as the input (in this case # Slab.Y). -b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.Y) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.distribution.Slab.Y) if rank == 0: # Note the same shape of a and b (they are both using the same distribution). 
diff --git a/examples/distributed/fft/example01_numpy_uneven_4p.py b/examples/distributed/fft/example01_numpy_uneven_4p.py index 1e7af87..6a05063 100644 --- a/examples/distributed/fft/example01_numpy_uneven_4p.py +++ b/examples/distributed/fft/example01_numpy_uneven_4p.py @@ -21,13 +21,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cuda.core.experimental.system.num_devices -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) if nranks != 4: raise RuntimeError("This example requires 4 processes") @@ -49,7 +50,7 @@ shape, # local shape np, input_dtype=np.complex128, - distribution=nvmath.distributed.fft.Slab.X, + distribution=Slab.X, fft_type="C2C", ) @@ -59,7 +60,7 @@ # Forward FFT. # In this example, the forward FFT operand is distributed according to Slab.X distribution. # With reshape=False, the FFT result will be distributed according to Slab.Y distribution. -b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) +b = nvmath.distributed.fft.fft(a, distribution=Slab.X, options={"reshape": False}) # Distributed FFT performs computations in-place. The result is stored in the same # buffer as operand a. Note, however, that operand b has a different shape (due @@ -72,7 +73,7 @@ # Recall from previous transform that the inverse FFT operand is distributed according to # Slab.Y. With reshape=False, the inverse FFT result will be distributed according to # Slab.X distribution. -c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) +c = nvmath.distributed.fft.ifft(b, distribution=Slab.Y, options={"reshape": False}) # The shape of c is the same as a (due to Slab.X distribution). Once again, note that # a, b and c are sharing the same symmetric memory buffer (distributed FFT operations diff --git a/examples/distributed/fft/example01_torch.py b/examples/distributed/fft/example01_torch.py index 2f099fc..24154a5 100644 --- a/examples/distributed/fft/example01_torch.py +++ b/examples/distributed/fft/example01_torch.py @@ -13,13 +13,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % torch.cuda.device_count() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) if nranks > 8: raise RuntimeError("This example requires <= 8 processes") @@ -38,7 +39,7 @@ # Forward FFT. # In this example, the forward FFT operand is distributed according to Slab.X distribution. # With reshape=False, the FFT result will be distributed according to Slab.Y distribution. -b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) +b = nvmath.distributed.fft.fft(a, distribution=Slab.X, options={"reshape": False}) # Distributed FFT performs computations in-place. The result is stored in the same buffer # as tensor a. Note, however, that tensor b has a different shape (due to Slab.Y @@ -51,7 +52,7 @@ # Recall from the previous transform that the inverse FFT operand is distributed # according to Slab.Y. With reshape=False, the inverse FFT result will be distributed # according to Slab.X distribution. 
-c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) +c = nvmath.distributed.fft.ifft(b, distribution=Slab.Y, options={"reshape": False}) # The shape of tensor c is the same as tensor a (due to Slab.X distribution). Once again, # note that a, b and c are sharing the same symmetric memory buffer (distributed FFT diff --git a/examples/distributed/fft/example01_torch_r2c_c2r.py b/examples/distributed/fft/example01_torch_r2c_c2r.py index 0ecda02..0871f4c 100644 --- a/examples/distributed/fft/example01_torch_r2c_c2r.py +++ b/examples/distributed/fft/example01_torch_r2c_c2r.py @@ -13,13 +13,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % torch.cuda.device_count() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) if nranks > 8: raise RuntimeError("This example requires <= 8 processes") @@ -40,7 +41,7 @@ shape, torch, input_dtype=torch.float32, - distribution=nvmath.distributed.fft.Slab.X, + distribution=Slab.X, memory_space="cpu", # allocate torch tensor on CPU fft_type="R2C", ) @@ -50,7 +51,7 @@ # R2C (forward) FFT. # In this example, the R2C operand is distributed according to Slab.X distribution. # With reshape=False, the FFT result will be distributed according to Slab.Y distribution. -b = nvmath.distributed.fft.rfft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) +b = nvmath.distributed.fft.rfft(a, distribution=Slab.X, options={"reshape": False}) # Distributed FFT performs computations in-place. The result is stored in the same # buffer as tensor a. Note, however, that tensor b has a different dtype and shape @@ -65,9 +66,7 @@ # Slab.X distribution. # Note that to transform back to the original shape of the real operand (which has odd last # axis length), we use the last_axis_parity="odd" option. -c = nvmath.distributed.fft.irfft( - b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False, "last_axis_parity": "odd"} -) +c = nvmath.distributed.fft.irfft(b, distribution=Slab.Y, options={"reshape": False, "last_axis_parity": "odd"}) # The shape of tensor c is the same as tensor a (due to Slab.X distribution). Once again, # note that a, b and c are sharing the same memory buffer (distributed FFT operations are diff --git a/examples/distributed/fft/example02_custom_box_distribution_4p.py b/examples/distributed/fft/example02_custom_box_distribution_4p.py index 91cfe05..ff7aa0a 100644 --- a/examples/distributed/fft/example02_custom_box_distribution_4p.py +++ b/examples/distributed/fft/example02_custom_box_distribution_4p.py @@ -20,13 +20,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cuda.core.experimental.system.num_devices -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) if nranks != 4: raise RuntimeError("This example requires 4 processes") @@ -42,13 +43,13 @@ # Forward FFT. 
if rank == 0: - input_box = [(0, 0, 0), (32, 128, 128)] + input_box = Box((0, 0, 0), (32, 128, 128)) elif rank == 1: - input_box = [(0, 128, 0), (32, 256, 128)] + input_box = Box((0, 128, 0), (32, 256, 128)) elif rank == 2: - input_box = [(32, 0, 0), (64, 128, 128)] + input_box = Box((32, 0, 0), (64, 128, 128)) else: - input_box = [(32, 128, 0), (64, 256, 128)] + input_box = Box((32, 128, 0), (64, 256, 128)) # Use the same pencil distribution for the output. output_box = input_box b = nvmath.distributed.fft.fft(a, distribution=[input_box, output_box]) diff --git a/examples/distributed/fft/example03_stateful_cupy.py b/examples/distributed/fft/example03_stateful_cupy.py index 510e680..e9bffad 100644 --- a/examples/distributed/fft/example03_stateful_cupy.py +++ b/examples/distributed/fft/example03_stateful_cupy.py @@ -21,7 +21,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 512). # In this example, the input data is distributed across processes according to @@ -36,7 +36,7 @@ a[:] = cp.ones(shape, dtype=cp.complex64) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.distribution.Slab.Y) as f: # Plan the FFT. f.plan() diff --git a/examples/distributed/fft/example03_stateful_torch.py b/examples/distributed/fft/example03_stateful_torch.py index d9f1d84..afcaecd 100644 --- a/examples/distributed/fft/example03_stateful_torch.py +++ b/examples/distributed/fft/example03_stateful_torch.py @@ -20,7 +20,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % torch.cuda.device_count() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 512). # In this example, the input data is distributed across processes according to @@ -34,7 +34,7 @@ a[:] = torch.ones(shape, dtype=torch.complex64, device=device_id) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.distribution.Slab.Y) as f: # Plan the FFT. f.plan() diff --git a/examples/distributed/fft/example03_stateful_torch_cpu.py b/examples/distributed/fft/example03_stateful_torch_cpu.py index cf4e115..9419d16 100644 --- a/examples/distributed/fft/example03_stateful_torch_cpu.py +++ b/examples/distributed/fft/example03_stateful_torch_cpu.py @@ -23,7 +23,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % torch.cuda.device_count() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 512). # In this example, the input data is distributed across processes according to @@ -33,7 +33,7 @@ a = torch.ones(shape, dtype=torch.complex64) # cpu tensor # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.distribution.Slab.Y) as f: # Plan the FFT. 
f.plan() diff --git a/examples/distributed/fft/example04_options.py b/examples/distributed/fft/example04_options.py index d58bbfb..9633aef 100644 --- a/examples/distributed/fft/example04_options.py +++ b/examples/distributed/fft/example04_options.py @@ -19,13 +19,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cuda.core.experimental.system.num_devices -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (64, 256, 128). # In this example, the input data is distributed across processes according to @@ -47,14 +48,14 @@ # Alternative #1 for specifying options, using dataclass. options = nvmath.distributed.fft.FFTOptions(reshape=False) -b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options=options) +b = nvmath.distributed.fft.fft(a, distribution=Slab.X, options=options) if rank == 0: print(f"Does the forward FFT result share the same distribution as the input ? {b.shape == a.shape}") print(f"Input type = {type(a)}, FFT output type = {type(b)}") # Alternative #2 for specifying options, using dict. The two alternatives are entirely # equivalent. -c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) +c = nvmath.distributed.fft.ifft(b, distribution=Slab.Y, options={"reshape": False}) if rank == 0: print(f"Does the inverse FFT result share the same distribution as the forward input ? {c.shape == a.shape}") print(f"Input type = {type(a)}, FFT output type = {type(b)}") diff --git a/examples/distributed/fft/example05_logging_global.py b/examples/distributed/fft/example05_logging_global.py index e734d31..e078caf 100644 --- a/examples/distributed/fft/example05_logging_global.py +++ b/examples/distributed/fft/example05_logging_global.py @@ -20,7 +20,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 256). # In this example, the input data is distributed across processes according to @@ -38,7 +38,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) # Forward FFT. -b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.distribution.Slab.X) # Synchronize the default stream with cp.cuda.Device(device_id): diff --git a/examples/distributed/fft/example05_logging_user.py b/examples/distributed/fft/example05_logging_user.py index f606d91..f4dad3c 100644 --- a/examples/distributed/fft/example05_logging_user.py +++ b/examples/distributed/fft/example05_logging_user.py @@ -20,7 +20,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 256). # In this example, the input data is distributed across processes according to @@ -54,7 +54,7 @@ o = nvmath.distributed.fft.FFTOptions(logger=logger) # Specify the options to the FFT operation. 
-b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options=o) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.distribution.Slab.X, options=o) if rank == 0: print("---") diff --git a/examples/distributed/fft/example06_stateful_reset_box_distribution_4p.py b/examples/distributed/fft/example06_stateful_reset_box_distribution_4p.py index 0f49089..b7a2d97 100644 --- a/examples/distributed/fft/example06_stateful_reset_box_distribution_4p.py +++ b/examples/distributed/fft/example06_stateful_reset_box_distribution_4p.py @@ -16,13 +16,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) if nranks != 4: raise RuntimeError("This example requires 4 processes") @@ -43,17 +44,17 @@ # Input distribution is pencil decomposition on X and Y axes. # Output distribution is pencil decomposition on Y an Z axes. if rank == 0: - input_box = ([0, 0, 0], [64, 128, 128]) - output_box = ([0, 0, 0], [128, 128, 64]) + input_box = Box([0, 0, 0], [64, 128, 128]) + output_box = Box([0, 0, 0], [128, 128, 64]) elif rank == 1: - input_box = ([0, 128, 0], [64, 256, 128]) - output_box = ([0, 0, 64], [128, 128, 128]) + input_box = Box([0, 128, 0], [64, 256, 128]) + output_box = Box([0, 0, 64], [128, 128, 128]) elif rank == 2: - input_box = ([64, 0, 0], [128, 128, 128]) - output_box = ([0, 128, 0], [128, 256, 64]) + input_box = Box([64, 0, 0], [128, 128, 128]) + output_box = Box([0, 128, 0], [128, 256, 64]) else: - input_box = ([64, 128, 0], [128, 256, 128]) - output_box = ([0, 128, 64], [128, 256, 128]) + input_box = Box([64, 128, 0], [128, 256, 128]) + output_box = Box([0, 128, 64], [128, 256, 128]) # Create a stateful FFT object 'f'. with nvmath.distributed.fft.FFT(a, distribution=[input_box, output_box]) as f: diff --git a/examples/distributed/fft/example06_stateful_reset_inplace.py b/examples/distributed/fft/example06_stateful_reset_inplace.py index 003a0c5..325054d 100644 --- a/examples/distributed/fft/example06_stateful_reset_inplace.py +++ b/examples/distributed/fft/example06_stateful_reset_inplace.py @@ -20,7 +20,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 512). # In this example, the input data is distributed across processes according to @@ -34,7 +34,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.distribution.Slab.Y) as f: # Plan the FFT. 
f.plan() diff --git a/examples/distributed/fft/example06_stateful_reset_slab_distribution.py b/examples/distributed/fft/example06_stateful_reset_slab_distribution.py index c6d6867..0fe8093 100644 --- a/examples/distributed/fft/example06_stateful_reset_slab_distribution.py +++ b/examples/distributed/fft/example06_stateful_reset_slab_distribution.py @@ -16,13 +16,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 512). # In this example, the input data is distributed across processes according to @@ -36,7 +37,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) as f: +with nvmath.distributed.fft.FFT(a, distribution=Slab.X, options={"reshape": False}) as f: # Plan the FFT. f.plan() @@ -51,7 +52,7 @@ # original distribution was Slab.X, the reset operand is expected to have either a # Slab.X or Slab.Y distribution based on the same global shape, in this # case (512, 512, 512). - f.reset_operand(b, distribution=nvmath.distributed.fft.Slab.Y) + f.reset_operand(b, distribution=Slab.Y) # Execute the new inverse FFT. # The distribution of operand c will be Slab.X diff --git a/examples/distributed/fft/example07_streams.py b/examples/distributed/fft/example07_streams.py index 41bcf37..db1d108 100644 --- a/examples/distributed/fft/example07_streams.py +++ b/examples/distributed/fft/example07_streams.py @@ -12,13 +12,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 256, 256). # In this example, the input data is distributed across processes according to @@ -36,7 +37,7 @@ s1 = cp.cuda.Stream() # Create a stateful FFT object 'f' on stream s1. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"blocking": "auto"}, stream=s1) as f: +with nvmath.distributed.fft.FFT(a, distribution=Slab.X, options={"blocking": "auto"}, stream=s1) as f: # Plan the FFT on stream s1. f.plan(stream=s1) @@ -67,7 +68,7 @@ # Set a new operand c on stream s2. Note that operand c is distributed in the same was # as operand a. - f.reset_operand(c, distribution=nvmath.distributed.fft.Slab.X, stream=s2) + f.reset_operand(c, distribution=Slab.X, stream=s2) # Execute the new FFT on stream s2. d = f.execute(stream=s2) diff --git a/examples/distributed/fft/example08_sync_symmetric_memory.py b/examples/distributed/fft/example08_sync_symmetric_memory.py index 7ad8a88..ff2e9a3 100644 --- a/examples/distributed/fft/example08_sync_symmetric_memory.py +++ b/examples/distributed/fft/example08_sync_symmetric_memory.py @@ -15,13 +15,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. 
comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 512, 512). # In this example, the input data is distributed across processes according to @@ -35,7 +36,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) as f: +with nvmath.distributed.fft.FFT(a, distribution=Slab.X, options={"reshape": False}) as f: # Plan the FFT. f.plan() @@ -50,7 +51,7 @@ # Reset the operand to the values in the frequency domain. # Note that because the FFT object is configured with reshape=False, the # distribution of operand b is Slab.Y - f.reset_operand(b, distribution=nvmath.distributed.fft.Slab.Y) + f.reset_operand(b, distribution=Slab.Y) # Execute the new inverse FFT. # After cuFFTMp performs a transform, it issues a symmetric memory synchronization diff --git a/examples/distributed/fft/example08_sync_symmetric_memory_streams.py b/examples/distributed/fft/example08_sync_symmetric_memory_streams.py index 74e7ccd..74bff27 100644 --- a/examples/distributed/fft/example08_sync_symmetric_memory_streams.py +++ b/examples/distributed/fft/example08_sync_symmetric_memory_streams.py @@ -15,13 +15,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (512, 256, 256). # In this example, the input data is distributed across processes according to @@ -39,7 +40,7 @@ s1 = cp.cuda.Stream() # Create a stateful FFT object 'f' on stream s1. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"blocking": "auto"}, stream=s1) as f: +with nvmath.distributed.fft.FFT(a, distribution=Slab.X, options={"blocking": "auto"}, stream=s1) as f: # Plan the FFT on stream s1. f.plan(stream=s1) @@ -53,7 +54,7 @@ # We're using the output of the previous forward transform as input for the # inverse transform. - f.reset_operand(b, distribution=nvmath.distributed.fft.Slab.X) + f.reset_operand(b, distribution=Slab.X) # Execute the inverse FFT on stream s1. # Since cuFFTMp issued a symmetric memory synchronization on stream s1 after @@ -80,7 +81,7 @@ s2.wait_event(e1) # Set a new operand d on stream s2. - f.reset_operand(d, distribution=nvmath.distributed.fft.Slab.X, stream=s2) + f.reset_operand(d, distribution=Slab.X, stream=s2) # Execute the new FFT on stream s2. 
# Operand d was filled on stream s1, and the GPUs have not synchronized on these diff --git a/examples/distributed/fft/example09_resource_mgmt.py b/examples/distributed/fft/example09_resource_mgmt.py index f78e409..671f4f7 100644 --- a/examples/distributed/fft/example09_resource_mgmt.py +++ b/examples/distributed/fft/example09_resource_mgmt.py @@ -26,7 +26,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (256, 512, 512). # In this example, the input data is distributed across processes according to @@ -45,10 +45,10 @@ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") # Create and prepare two FFT objects. -f1 = nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X) +f1 = nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.distribution.Slab.X) f1.plan() -f2 = nvmath.distributed.fft.FFT(b, distribution=nvmath.distributed.fft.Slab.X) +f2 = nvmath.distributed.fft.FFT(b, distribution=nvmath.distributed.distribution.Slab.X) f2.plan() num_iter = 3 diff --git a/examples/distributed/fft/example10_cupy_fft_benchmark.py b/examples/distributed/fft/example10_cupy_fft_benchmark.py index c68bb19..28d491c 100644 --- a/examples/distributed/fft/example10_cupy_fft_benchmark.py +++ b/examples/distributed/fft/example10_cupy_fft_benchmark.py @@ -11,12 +11,13 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Slab comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cuda.core.experimental.system.num_devices -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D FFT size is (N, N, N) N = 512 @@ -32,7 +33,7 @@ print(f"[{rank}] The local operand shape = {a.shape}, with data type {dtype} running on {nranks} processes.") # Create the distributed FFT op, plan, and benchmark. -with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) as fftobj: +with nvmath.distributed.fft.FFT(a, distribution=Slab.X, options={"reshape": False}) as fftobj: fftobj.plan() b = cupyx.profiler.benchmark(fftobj.execute, n_repeat=10) print(f"[{rank}] {b}") diff --git a/examples/distributed/linalg/advanced/matmul/example01_cupy.py b/examples/distributed/linalg/advanced/matmul/example01_cupy.py new file mode 100644 index 0000000..36c2b18 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example01_cupy.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic distributed matrix multiplication of CuPy arrays, +using the function-form APIs. + +nvmath-python accepts operands from multiple frameworks. The result of each operation +is a tensor of the same framework that was used to pass the inputs, and is located +on the same device as the inputs (GPU in this example). 
+ +The global operation performed in this example is: A.T @ B + +$ mpiexec -n 4 python example01_cupy.py +""" + +import numpy as np +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import Slab +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# Prepare sample input data (CuPy matrices, on the GPU). + +# nvmath-python uses cuBLASMp for distributed matrix multiplication. +# cuBLASMp supports PBLAS 2D block-cyclic distribution of matrices. For simplicity, in this +# example we partition matrices on a single axis (distribution on a single dimension without +# cyclic property is a special case of 2D block-cyclic). + +# Slab distribution can also be used to specify partitioning on a single axis and +# nvmath-python implicitly converts to BlockCyclic format required by cuBLASMp. +row_wise_distribution = Slab.X # partitioning on rows +col_wise_distribution = Slab.Y # partitioning on columns + +with cp.cuda.Device(device_id): + # See example01_cupy_symmetric_memory.py for an example of allocating on + # symmetric memory, which may further improve performance. + a = cp.random.rand(k, m // nranks) # a is transposed and partitioned on m + b = cp.random.rand(k, n // nranks) # b is partitioned on n + +# In Python, the memory layout of ndarrays and tensors by default uses row-major or C +# ordering, while cuBLASMp requires column-major or Fortran ordering. To work with cuBLASMp, +# you can follow these guidelines: +# - The transpose of a C-ordered (row-major) matrix is a Fortran-ordered (column-major) +# matrix and vice-versa. +# - In a distributed setting, a row-wise distributed matrix A is equivalent to a column-wise +# distributed matrix A.T, and vice-versa. + +# numpy, cupy and torch also have functions to allocate tensors with Fortran order +# or to convert to Fortran order. Here we use cupy's asfortranarray function to convert +# the inputs to Fortran order (note that this copies the matrix to a new buffer with the +# same shape but different memory layout): +with cp.cuda.Device(device_id): + a = cp.asfortranarray(a) + b = cp.asfortranarray(b) + +# Specify distribution of input and output matrices. + +# Note: The choice of distribution for a, b and output as well as whether a and b are +# transposed influences the distributed algorithm used by cuBLASMp and can have a +# substantial impact on performance. +# The following configuration will run AllGather+GEMM. +# Refer to https://docs.nvidia.com/cuda/cublasmp/usage/tp.html for more information. + +# Distribution of a, b and output. +distributions = [col_wise_distribution, col_wise_distribution, row_wise_distribution] + +# Perform the distributed matrix multiplication. +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +qualifiers[0]["is_transpose"] = True # a is transposed +result = nvmath.distributed.linalg.advanced.matmul( + a, + b, + distributions=distributions, + qualifiers=qualifiers, +) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. 
+cp.cuda.get_current_stream().synchronize() + +if rank == 0: + # result has global shape (m, n) and is distributed row-wise (as specified above). + # The memory layout of result is Fortran on each process. + print(result.shape, result.flags) + assert result.shape == row_wise_distribution.shape(rank, (m, n)) + + # result.T has global shape (n, m) and is distributed column-wise, with C memory layout. + print(result.T.shape, result.T.flags) + # Transpose changes the shape and distribution. + assert result.T.shape == col_wise_distribution.shape(rank, (n, m)) + + # Check if the result is cupy array as well. + print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.") + assert isinstance(result, cp.ndarray) diff --git a/examples/distributed/linalg/advanced/matmul/example01_cupy_complex_conjugate_transpose.py b/examples/distributed/linalg/advanced/matmul/example01_cupy_complex_conjugate_transpose.py new file mode 100644 index 0000000..2461cc3 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example01_cupy_complex_conjugate_transpose.py @@ -0,0 +1,114 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates distributed matrix multiplication of complex CuPy arrays +with conjugate transpose, using the function-form APIs. + +nvmath-python accepts operands from multiple frameworks. The result of each operation +is a tensor of the same framework that was used to pass the inputs, and is located +on the same device as the inputs (GPU in this example). + +The global operation performed in this example is: A.H @ B where H is the hermitian +transpose. + +$ mpiexec -n 4 python example01_cupy_complex_conjugate_transpose.py +""" + +import numpy as np +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import Slab +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# Prepare sample input data (CuPy matrices, on the GPU). + +# nvmath-python uses cuBLASMp for distributed matrix multiplication. +# cuBLASMp supports PBLAS 2D block-cyclic distribution of matrices. For simplicity, in this +# example we partition matrices on a single axis (distribution on a single dimension without +# cyclic property is a special case of 2D block-cyclic). + +# Slab distribution can also be used to specify partitioning on a single axis and +# nvmath-python implicitly converts to BlockCyclic format required by cuBLASMp. +row_wise_distribution = Slab.X # partitioning on rows +col_wise_distribution = Slab.Y # partitioning on columns + +a_shape = col_wise_distribution.shape(rank, (k, m)) # a is transposed and partitioned on m +b_shape = col_wise_distribution.shape(rank, (k, n)) # b is partitioned on n + +with cp.cuda.Device(device_id): + # See example01_cupy_symmetric_memory.py for an example of allocating on symmetric + # memory, which may further improve performance. 
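# --- Rough shape check (illustrative, assuming 4 ranks and evenly divisible axes):
# --- Slab.Y splits the second axis and Slab.X splits the first, so with
# --- m, n, k = 128, 512, 1024 the local shapes computed above are:
assert a_shape == (k, m // nranks)  # (1024, 32): a is stored transposed, partitioned on m
assert b_shape == (k, n // nranks)  # (1024, 128): b is partitioned on n
# The row-wise result produced below then has local shape (m // nranks, n) == (32, 512).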
+ a = cp.random.rand(*a_shape, dtype=cp.float32) + 1j * cp.random.rand(*a_shape, dtype=cp.float32) + b = cp.random.rand(*b_shape, dtype=cp.float32) + 1j * cp.random.rand(*b_shape, dtype=cp.float32) + +# In Python, the memory layout of ndarrays and tensors by default uses row-major or C +# ordering, while cuBLASMp requires column-major or Fortran ordering. To work with cuBLASMp, +# you can follow these guidelines: +# - The transpose of a C-ordered (row-major) matrix is a Fortran-ordered (column-major) +# matrix and vice-versa. +# - In a distributed setting, a row-wise distributed matrix A is equivalent to a column-wise +# distributed matrix A.T, and vice-versa. + +# Here we use cupy's asfortranarray function to convert the inputs to Fortran order +# (note that this copies the matrix to a new buffer with the same shape but different +# memory layout): +with cp.cuda.Device(device_id): + a = cp.asfortranarray(a) + b = cp.asfortranarray(b) + +# Specify distribution of input and output matrices. + +# Note: The choice of distribution for a, b and output as well as whether a and b are +# transposed influences the distributed algorithm used by cuBLASMp and can have a +# substantial impact on performance. +# The following configuration will run AllGather+GEMM. +# Refer to https://docs.nvidia.com/cuda/cublasmp/usage/tp.html for more information. + +# Distribution of a, b and output. +distributions = [col_wise_distribution, col_wise_distribution, row_wise_distribution] + +# Perform the distributed matrix multiplication. +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +# Conjugate transpose on A. +qualifiers[0]["is_conjugate"] = qualifiers[0]["is_transpose"] = True +result = nvmath.distributed.linalg.advanced.matmul( + a, + b, + distributions=distributions, + qualifiers=qualifiers, +) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. +cp.cuda.get_current_stream().synchronize() + +if rank == 0: + # result has global shape (m, n) and is distributed row-wise (as specified above). + # The memory layout of result is Fortran on each process. + print(result.shape, result.flags) + assert result.shape == row_wise_distribution.shape(rank, (m, n)) + + # result.T has global shape (n, m) and is distributed column-wise, with C memory layout. + print(result.T.shape, result.T.flags) + # Transpose changes the shape and distribution. + assert result.T.shape == col_wise_distribution.shape(rank, (n, m)) + + # Check if the result is cupy array as well. + print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.") + assert isinstance(result, cp.ndarray) diff --git a/examples/distributed/linalg/advanced/matmul/example01_cupy_symmetric_memory.py b/examples/distributed/linalg/advanced/matmul/example01_cupy_symmetric_memory.py new file mode 100644 index 0000000..1175eb0 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example01_cupy_symmetric_memory.py @@ -0,0 +1,117 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic distributed matrix multiplication of CuPy arrays, +using the function-form APIs. + +nvmath-python accepts operands from multiple frameworks. The result of each operation +is a tensor of the same framework that was used to pass the inputs, and is located +on the same device as the inputs (GPU in this example). 
+ +nvmath-python also accepts operands that are on the symmetric heap, which may improve +performance for distributed matrix multiplication. If the inputs are on symmetric +memory, the result will be as well. + +The global operation performed in this example is: A.T @ B + +$ mpiexec -n 4 python example01_cupy_symmetric_memory.py +""" + +import numpy as np +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import Slab +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# Prepare sample input data (CuPy matrices, on the GPU). + +# nvmath-python uses cuBLASMp for distributed matrix multiplication. +# cuBLASMp supports PBLAS 2D block-cyclic distribution of matrices. For simplicity, in this +# example we partition matrices on a single axis (distribution on a single dimension without +# cyclic property is a special case of 2D block-cyclic). + +# Slab distribution can also be used to specify partitioning on a single axis and +# nvmath-python implicitly converts to BlockCyclic format required by cuBLASMp. +row_wise_distribution = Slab.X # partitioning on rows +col_wise_distribution = Slab.Y # partitioning on columns + +# Get the shape of inputs a and b on this rank according to this distribution. +a_shape = col_wise_distribution.shape(rank, (k, m)) # a is transposed and partitioned on m +b_shape = col_wise_distribution.shape(rank, (k, n)) # b is partitioned on n + +# In this example we allocate the matrices on symmetric memory. Some distributed algorithms +# in cuBLASMp use NVSHMEM, and the performance may improve when one or more operands +# are on symmetric memory. + +# cuBLASMp requires column-major or Fortran ordering. Here, we allocate tensors on +# symmetric memory directly using column-major memory layout. +a = nvmath.distributed.allocate_symmetric_memory(a_shape, cp, axis_order="F") +b = nvmath.distributed.allocate_symmetric_memory(b_shape, cp, axis_order="F") + +with cp.cuda.Device(device_id): + a[:] = cp.random.rand(*a_shape) + b[:] = cp.random.rand(*b_shape) + +# Specify distribution of input and output matrices. + +# Note: The choice of distribution for a, b and output as well as whether a and b are +# transposed influences the distributed algorithm used by cuBLASMp and can have a +# substantial impact on performance. +# The following configuration will run AllGather+GEMM. +# Refer to https://docs.nvidia.com/cuda/cublasmp/usage/tp.html for more information. + +# Distribution of a, b and output. +distributions = [col_wise_distribution, col_wise_distribution, row_wise_distribution] + +# Perform the distributed matrix multiplication. +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +qualifiers[0]["is_transpose"] = True # a is transposed +result = nvmath.distributed.linalg.advanced.matmul( + a, + b, + distributions=distributions, + qualifiers=qualifiers, +) + +# Note: if all of the input operands are on symmetric memory, the result is also +# on symmetric memory. + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. 
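# --- Condensed sketch (illustrative) of the symmetric-memory lifecycle used in this
# --- example: allocate collectively, fill on the owning device, free collectively.
# --- `x` is a throwaway name; the shape and CuPy package follow the code above, and the
# --- single-operand free call is assumed to behave like the multi-operand one used below.
x = nvmath.distributed.allocate_symmetric_memory(a_shape, cp, axis_order="F")
with cp.cuda.Device(device_id):
    x[:] = cp.random.rand(*a_shape)
# ... use x as a matmul operand ...
# Symmetric-heap operands are not garbage-collected; freeing is a collective call that
# every process must reach at the same point in the program.
nvmath.distributed.free_symmetric_memory(x)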
+cp.cuda.get_current_stream().synchronize() + +if rank == 0: + # result has global shape (m, n) and is distributed row-wise (as specified above). + # The memory layout of result is Fortran on each process. + print(result.shape, result.flags) + assert result.shape == row_wise_distribution.shape(rank, (m, n)) + + # result.T has global shape (n, m) and is distributed column-wise, with C memory layout. + print(result.T.shape, result.T.flags) + # Transpose changes the shape and distribution. + assert result.T.shape == col_wise_distribution.shape(rank, (n, m)) + + # Check if the result is cupy array as well. + print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.") + assert isinstance(result, cp.ndarray) + +# GPU operands on the symmetric heap are not garbage-collected and the user is +# responsible for freeing any that they own (this deallocation is a collective +# operation that must be called by all processes at the same point in the execution). +nvmath.distributed.free_symmetric_memory(a, b, result) diff --git a/examples/distributed/linalg/advanced/matmul/example01_numpy.py b/examples/distributed/linalg/advanced/matmul/example01_numpy.py new file mode 100644 index 0000000..746ea0e --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example01_numpy.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic distributed matrix multiplication of NumPy arrays, +using the function-form APIs. + +nvmath-python accepts operands from multiple frameworks. The result of each operation +is a tensor of the same framework that was used to pass the inputs, and is located +on the same device as the inputs (CPU in this example). + +The NumPy ndarrays reside in CPU memory, and are copied transparently to symmetric +GPU memory to process them with cuBLASMp. + +The global operation performed in this example is: A.T @ B.T + +$ mpiexec -n 4 python example01_numpy.py +""" + +import numpy as np +import cuda.core.experimental +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cuda.core.experimental.system.num_devices +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# Prepare sample input data (NumPy matrices, on the CPU). + +# nvmath-python uses cuBLASMp for distributed matrix multiplication. +# cuBLASMp supports PBLAS 2D block-cyclic distribution of matrices. For simplicity, in this +# example we partition matrices on a single axis (distribution on a single dimension without +# cyclic property is a special case of 2D block-cyclic). + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +a = np.random.rand(m // nranks, k) # a is partitioned on m +b = np.random.rand(k, n // nranks) # b is partitioned on n + +# In Python, the memory layout of ndarrays and tensors by default uses row-major or C +# ordering, while cuBLASMp requires column-major or Fortran ordering. 
To work with cuBLASMp, +# you can follow these guidelines: +# - The transpose of a C-ordered (row-major) matrix is a Fortran-ordered (column-major) +# matrix and vice-versa. +# - In a distributed setting, a row-wise distributed matrix A is equivalent to a column-wise +# distributed matrix A.T, and vice-versa. + +# Note that numpy, cupy and torch also have functions to allocate tensors with Fortran order +# or to convert to Fortran order (see example01_cupy.py for an example). + +# Get a transposed view (zero cost) of the matrices to obtain column-major ordering. +a = a.T # a is now (k, m) with col_wise_distribution +b = b.T # b is now (n, k) with row_wise_distribution + +# Specify distribution of input and output matrices. + +# Note: The choice of distribution for a, b and output as well as whether a and b are +# transposed influences the distributed algorithm used by cuBLASMp and can have a +# substantial impact on performance. +# The following configuration will run AllGather+GEMM. +# Refer to https://docs.nvidia.com/cuda/cublasmp/usage/tp.html for more information. + +# Distribution of a, b and output (note how transposing a and b influences their +# distribution): +distributions = [col_wise_distribution, row_wise_distribution, row_wise_distribution] + +# Perform the distributed matrix multiplication. +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +qualifiers[0]["is_transpose"] = True # a is transposed +qualifiers[1]["is_transpose"] = True # b is transposed +result = nvmath.distributed.linalg.advanced.matmul( + a, + b, + distributions=distributions, + qualifiers=qualifiers, +) + +# No synchronization is needed for CPU tensors, since the execution always blocks. + +if rank == 0: + # result has global shape (m, n) and is distributed row-wise (as specified above). + # The memory layout of result is Fortran on each process. + print(result.shape, result.flags) + assert result.shape == row_wise_distribution.shape(rank, (m, n)) + + # result.T has global shape (n, m) and is distributed column-wise, with C memory layout. + print(result.T.shape, result.T.flags) + # Transpose changes the shape and distribution. + assert result.T.shape == col_wise_distribution.shape(rank, (n, m)) + + # Check if the result is numpy array as well. + print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.") + assert isinstance(result, np.ndarray) diff --git a/examples/distributed/linalg/advanced/matmul/example01_torch.py b/examples/distributed/linalg/advanced/matmul/example01_torch.py new file mode 100644 index 0000000..511109a --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example01_torch.py @@ -0,0 +1,127 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic distributed matrix multiplication of torch tensors, +using the function-form APIs. + +nvmath-python accepts operands from multiple frameworks. The result of each operation +is a tensor of the same framework that was used to pass the inputs, and is located +on the same device as the inputs. + +Tensors residing on CPU memory are copied transparently to symmetric GPU memory to +process them with cuBLASMp. 
+ +The global operation performed in this example is: A.T @ B.T + +$ mpiexec -n 4 python example01_torch.py +""" + +import numpy as np +import torch +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % torch.cuda.device_count() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# Prepare sample input data (torch tensors on the CPU). + +# nvmath-python uses cuBLASMp for distributed matrix multiplication. +# cuBLASMp supports PBLAS 2D block-cyclic distribution of matrices. For simplicity, in this +# example we partition matrices on a single axis (distribution on a single dimension without +# cyclic property is a special case of 2D block-cyclic). + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +a = torch.rand(m, k // nranks) # a is partitioned on k +b = torch.rand(k // nranks, n) # b is partitioned on k + +# In Python, the memory layout of ndarrays and tensors by default uses row-major or C +# ordering, while cuBLASMp requires column-major or Fortran ordering. To work with cuBLASMp, +# you can follow these guidelines: +# - The transpose of a C-ordered (row-major) matrix is a Fortran-ordered (column-major) +# matrix and vice-versa. +# - In a distributed setting, a row-wise distributed matrix A is equivalent to a column-wise +# distributed matrix A.T, and vice-versa. + +# Note that numpy, cupy and torch also have functions to allocate tensors with Fortran order +# or to convert to Fortran order (see example01_cupy.py for an example). + +# Get a transposed view (zero cost) of the matrices to obtain column-major ordering. +a = a.T # a is now (k, m) with row_wise_distribution +b = b.T # b is now (n, k) with col_wise_distribution + +# Specify distribution of input and output matrices. + +# Note: The choice of distribution for a, b and output as well as whether a and b are +# transposed influences the distributed algorithm used by cuBLASMp and can have a +# substantial impact on performance. +# The following configuration will run GEMM+ReduceScatter. +# Refer to https://docs.nvidia.com/cuda/cublasmp/usage/tp.html for more information. + +# Distribution of a, b and output (note how transposing a and b influences their +# distribution): +distributions = [row_wise_distribution, col_wise_distribution, col_wise_distribution] + +# Perform the distributed matrix multiplication. +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +qualifiers[0]["is_transpose"] = True # a is transposed +qualifiers[1]["is_transpose"] = True # b is transposed +if rank == 0: + print("Running the distributed multiplication on CPU tensors...") +result = nvmath.distributed.linalg.advanced.matmul( + a, + b, + distributions=distributions, + qualifiers=qualifiers, +) + +# No synchronization is needed for CPU tensors, since the execution always blocks. + +if rank == 0: + # result has global shape (m, n) and is distributed row-wise (as specified above). + # The memory layout of result is Fortran on each process. 
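# --- Side note (illustrative): for partitioning on a single axis, the explicit
# --- BlockNonCyclic distributions used in this example and the Slab shortcuts used in
# --- other examples describe the same row-/column-wise partitioning; both are converted
# --- to the 2D block-cyclic format that cuBLASMp expects.
from nvmath.distributed.distribution import Slab, ProcessGrid, BlockNonCyclic
row_wise = BlockNonCyclic(ProcessGrid(shape=(nranks, 1)))  # partitions rows, like Slab.X
col_wise = BlockNonCyclic(ProcessGrid(shape=(1, nranks)))  # partitions columns, like Slab.Y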
+ print(f"shape={result.shape} strides={result.stride()}") + assert result.shape == (m, n // nranks) + + # result.T has global shape (n, m) and is distributed column-wise, with C memory layout. + print(f"shape={result.T.shape} strides={result.T.stride()}") + assert result.T.shape == (n // nranks, m) + + # Check if the result is torch tensor as well. + print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.") + print(f"Inputs were located on devices {a.device} and {b.device} and the result is on {result.device}") + +# Now, move the tensors to the GPU and verify that the result is on the GPU as well. +a_gpu = a.cuda(device=f"cuda:{device_id}", memory_format=torch.preserve_format) +b_gpu = b.cuda(device=f"cuda:{device_id}", memory_format=torch.preserve_format) +if rank == 0: + print("\nRunning the distributed multiplication on GPU tensors...") +result = nvmath.distributed.linalg.advanced.matmul( + a_gpu, + b_gpu, + distributions=distributions, + qualifiers=qualifiers, +) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. +torch.cuda.default_stream().synchronize() + +print(f"Inputs were of types {type(a_gpu)} and {type(b_gpu)} and the result is of type {type(result)}.") +print(f"Inputs were located on devices {a_gpu.device} and {b_gpu.device} and the result is on {result.device}") diff --git a/examples/distributed/linalg/advanced/matmul/example02_2d_block_cyclic_4p.py b/examples/distributed/linalg/advanced/matmul/example02_2d_block_cyclic_4p.py new file mode 100644 index 0000000..e17c708 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example02_2d_block_cyclic_4p.py @@ -0,0 +1,67 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates distributed matrix multiplication using 2D block-cyclic +distribution. + +The global operation performed in this example is: A @ B + +$ mpiexec -n 4 python example02_2d_block_cyclic_4p.py +""" + +import numpy as np +import cuda.core.experimental +from mpi4py import MPI + +import nvmath.distributed +from nvmath.distributed.distribution import ProcessGrid, BlockCyclic + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cuda.core.experimental.system.num_devices +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 256, 512, 1024 + +# This example uses the PBLAS 2D block-cyclic distribution. +# See example01 for details on matrix distribution and memory layout impact and +# requirements. + +assert nranks == 4, "This example requires 4 processes" + +# 2D 2x2 process grid (4 processes) with column-major layout: +# --------- +# | 0 | 2 | +# --------- +# | 1 | 3 | +# --------- +process_grid = ProcessGrid(shape=(2, 2), layout=ProcessGrid.Layout.COL_MAJOR) + +# Cyclic distribution with 4x4 block size. +distribution = BlockCyclic(process_grid, (4, 4)) + +# Get the shape of inputs a and b on this rank according to this distribution. +a_shape = distribution.shape(rank, (m, k)) +b_shape = distribution.shape(rank, (k, n)) + +# Prepare sample input data. +a = np.random.rand(*a_shape).astype(np.float32) +b = np.random.rand(*b_shape).astype(np.float32) +# cuBLASMp requires column-major (Fortran) memory layout (see example01 for details and +# alternate ways to handle). 
+a = np.asfortranarray(a) +b = np.asfortranarray(b) + +# Matrices a, b and output use the same distribution. +distributions = [distribution, distribution, distribution] + +# Perform the distributed matrix multiplication. +result = nvmath.distributed.linalg.advanced.matmul(a, b, distributions=distributions) + +# No synchronization is needed for CPU tensors, since the execution always blocks. diff --git a/examples/distributed/linalg/advanced/matmul/example03_options.py b/examples/distributed/linalg/advanced/matmul/example03_options.py new file mode 100644 index 0000000..194f0cc --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example03_options.py @@ -0,0 +1,67 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates how to specify options to a distributed matrix multiplication +operation. + +In this example, we'll use NumPy ndarrays as input, and look at two equivalent ways to +specify the compute type. + +The global operation performed in this example is: A @ B + +$ mpiexec -n 4 python example03_options.py +""" + +import numpy as np +import cuda.core.experimental +from mpi4py import MPI + +import nvmath.distributed +from nvmath.distributed.distribution import Slab + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cuda.core.experimental.system.num_devices +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# Note: see example01 for details on matrix distribution and memory layout impact and +# requirements. + +# Prepare sample input data +a = np.random.rand(k // nranks, m).astype(np.float32) # partitioned on k +b = np.random.rand(n, k // nranks).astype(np.float32) # partitioned on k +a = a.T +b = b.T + +distributions = [Slab.Y, Slab.X, Slab.Y] + +# Here we'd like to use COMPUTE_32F_FAST_TF32 for the compute type, and we show two +# alternatives for doing so. Tip: use +# help(nvmath.distributed.linalg.advanced.MatmulComputeType) to see available +# compute types. +compute_type = nvmath.distributed.linalg.advanced.MatmulComputeType.COMPUTE_32F_FAST_TF32 + +# Alternative #1 for specifying options, using a dataclass. +# Tip: use help(nvmath.distributed.linalg.advanced.MatmulOptions) to see available options. +options = nvmath.distributed.linalg.advanced.MatmulOptions(compute_type=compute_type) +result = nvmath.distributed.linalg.advanced.matmul(a, b, distributions=distributions, options=options) + +# Alternative #2 for specifying options, using dict. The two alternatives are entirely +# equivalent. +result = nvmath.distributed.linalg.advanced.matmul(a, b, distributions=distributions, options={"compute_type": compute_type}) + +# No synchronization is needed for CPU tensors, since the execution always blocks. + +# Check if the result is numpy array as well. 
+if rank == 0: + print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.") + print(f"Inputs were of data types {a.dtype} and {b.dtype} and the result is of data type {result.dtype}.") +assert isinstance(result, np.ndarray) diff --git a/examples/distributed/linalg/advanced/matmul/example04_logging_global.py b/examples/distributed/linalg/advanced/matmul/example04_logging_global.py new file mode 100644 index 0000000..41b352c --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example04_logging_global.py @@ -0,0 +1,59 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how to turn on logging using the global logger. + +The global operation performed in this example is: A @ B + +$ mpiexec -n 2 python example04_logging_global.py +""" + +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic + +# Turn on logging. Here we use the global logger, set the level to "debug", and use a custom +# format for the log. +import logging + +logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 64, 128, 256 + +# Note: see example01 for details on matrix distribution and memory layout impact and +# requirements. + +# Prepare sample input data. +with cp.cuda.Device(device_id): + # See example01_cupy_symmetric_memory.py for an example of allocating on symmetric + # memory, which may further improve performance. + a = cp.random.rand(k // nranks, m).astype(cp.float32) # partitioned on k + b = cp.random.rand(n, k // nranks).astype(cp.float32) # partitioned on k +a = a.T +b = b.T +alpha = 0.45 + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +distributions = [col_wise_distribution, row_wise_distribution, col_wise_distribution] + +# Perform the GEMM. +result = nvmath.distributed.linalg.advanced.matmul(a, b, alpha=alpha, distributions=distributions) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. +cp.cuda.get_current_stream().synchronize() diff --git a/examples/distributed/linalg/advanced/matmul/example04_logging_user.py b/examples/distributed/linalg/advanced/matmul/example04_logging_user.py new file mode 100644 index 0000000..655c908 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example04_logging_user.py @@ -0,0 +1,77 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates the use of a user-provided logger. + +The global operation performed in this example is: A @ B + +$ mpiexec -n 2 python example04_logging_user.py +""" + +import logging + +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed +from nvmath.distributed.distribution import Slab + +# Create and configure a user logger. 
+# Any of the features provided by the logging module can be used. +logger = logging.getLogger("userlogger") +logging.getLogger().setLevel(logging.NOTSET) + +# Create a console handler for the logger and set level. +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) + +# Create a formatter and associate with handler. +formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") +handler.setFormatter(formatter) + +# Associate handler with logger, resulting in a logger with the desired level, format, and +# console output. +logger.addHandler(handler) + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 64, 128, 256 + +# Note: see example01 for details on matrix distribution and memory layout impact and +# requirements. + +# Prepare sample input data. +with cp.cuda.Device(device_id): + # See example01_cupy_symmetric_memory.py for an example of allocating on symmetric + # memory, which may further improve performance. + a = cp.random.rand(k // nranks, m).astype(cp.float16) # partitioned on k + b = cp.random.rand(n, k // nranks).astype(cp.float16) # partitioned on k +a = a.T +b = b.T +alpha = 0.45 + +distributions = [Slab.Y, Slab.X, Slab.Y] + +# Specify the custom logger in the matrix multiplication options. +o = nvmath.distributed.linalg.advanced.MatmulOptions(logger=logger) +# Specify the options to the matrix multiplication operation. +result = nvmath.distributed.linalg.advanced.matmul(a, b, alpha=alpha, distributions=distributions, options=o) + +print("---") + +# Recall that the options can also be provided as a dict, so the following is an +# alternative, entirely equivalent way to specify options. +result = nvmath.distributed.linalg.advanced.matmul(a, b, alpha=alpha, distributions=distributions, options={"logger": logger}) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. +cp.cuda.get_current_stream().synchronize() diff --git a/examples/distributed/linalg/advanced/matmul/example05_stateful_cupy.py b/examples/distributed/linalg/advanced/matmul/example05_stateful_cupy.py new file mode 100644 index 0000000..bd62806 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example05_stateful_cupy.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful distributed matrix multiplication objects. +Stateful objects amortize the cost of preparation across multiple executions. + +The inputs as well as the result are CuPy ndarrays. + +The global operation performed in this example is: A @ B + +$ mpiexec -n 4 python example05_stateful_cupy.py +""" + +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. 
+nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# Note: see example01 for details on matrix distribution and memory layout impact and +# requirements. + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +a_shape = row_wise_distribution.shape(rank, (k, m)) +b_shape = col_wise_distribution.shape(rank, (n, k)) +a = nvmath.distributed.allocate_symmetric_memory(a_shape, cp) +b = nvmath.distributed.allocate_symmetric_memory(b_shape, cp) +with cp.cuda.Device(device_id): + a[:] = cp.random.rand(*a_shape) + b[:] = cp.random.rand(*b_shape) + +# Get a transposed view to obtain column-major Fortran memory layout. Note that this +# also changes the distribution of a and b (see example01 for more information). +a = a.T # a is now (m, k) with col_wise_distribution +b = b.T # b is now (k, n) with row_wise_distribution + +# Distribution of a, b and output. +distributions = [col_wise_distribution, row_wise_distribution, col_wise_distribution] + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.distributed.linalg.advanced.Matmul(a, b, distributions=distributions) as mm: + # Plan the matrix multiplication. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # Note: if all of the input operands are on symmetric memory, the result is also + # on symmetric memory. + + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. + cp.cuda.get_current_stream().synchronize() + print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") + print(f"Result type = {type(result)}, device = {result.device}") + +# GPU operands on the symmetric heap are not garbage-collected and the user is +# responsible for freeing any that they own (this deallocation is a collective +# operation that must be called by all processes at the same point in the execution). +nvmath.distributed.free_symmetric_memory(a, b, result) diff --git a/examples/distributed/linalg/advanced/matmul/example05_stateful_torch.py b/examples/distributed/linalg/advanced/matmul/example05_stateful_torch.py new file mode 100644 index 0000000..7da6ae4 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example05_stateful_torch.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful distributed matrix multiplication objects. +Stateful objects amortize the cost of preparation across multiple executions. + +The inputs as well as the result are PyTorch tensors on the GPU. + +The global operation performed in this example is: A.T @ B + +$ mpiexec -n 4 python example05_stateful_torch.py +""" + +import torch +import numpy as np +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % torch.cuda.device_count() +# cuBLASMp requires NVSHMEM and NCCL communication backends. 
+nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 256, 512, 256 + +# Note: see example01 for details on matrix distribution and memory layout impact and +# requirements. + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +a_shape = col_wise_distribution.shape(rank, (m, k)) +b_shape = col_wise_distribution.shape(rank, (n, k)) +a = nvmath.distributed.allocate_symmetric_memory(a_shape, torch) +b = nvmath.distributed.allocate_symmetric_memory(b_shape, torch) +with torch.cuda.device(device_id): + a[:] = torch.rand(*a_shape, device=f"cuda:{device_id}") + b[:] = torch.rand(*b_shape, device=f"cuda:{device_id}") + +# Get a transposed view to obtain column-major Fortran memory layout. Note that this +# also changes the distribution of a and b (see example01 for more information). +a = a.T # a is now (k, m) with row_wise_distribution +b = b.T # b is now (k, n) with row_wise_distribution + +# Distribution of a, b and output. +distributions = [row_wise_distribution, row_wise_distribution, col_wise_distribution] + +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +qualifiers[0]["is_transpose"] = True # a is transposed + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.distributed.linalg.advanced.Matmul(a, b, distributions=distributions, qualifiers=qualifiers) as mm: + # Plan the matrix multiplication. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # Note: if all of the input operands are on symmetric memory, the result is also + # on symmetric memory. + + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. + torch.cuda.default_stream().synchronize() + print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") + print(f"Result type = {type(result)}, device = {result.device}") + +# GPU operands on the symmetric heap are not garbage-collected and the user is +# responsible for freeing any that they own (this deallocation is a collective +# operation that must be called by all processes at the same point in the execution). +nvmath.distributed.free_symmetric_memory(a, b, result) diff --git a/examples/distributed/linalg/advanced/matmul/example05_stateful_torch_cpu.py b/examples/distributed/linalg/advanced/matmul/example05_stateful_torch_cpu.py new file mode 100644 index 0000000..e8ff12a --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example05_stateful_torch_cpu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful distributed matrix multiplication objects. +Stateful objects amortize the cost of preparation across multiple executions. + +The inputs as well as the result are PyTorch tensors on the CPU. + +The global operation performed in this example is: A.T @ B + +$ mpiexec -n 4 python example05_stateful_torch_cpu.py +""" + +import torch +import numpy as np +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + +# Initialize nvmath.distributed. 
+comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % torch.cuda.device_count() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 256, 512, 256 + +# Note: see example01 for details on matrix distribution and memory layout impact and +# requirements. + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +a = torch.rand(*col_wise_distribution.shape(rank, (m, k))) +b = torch.rand(*col_wise_distribution.shape(rank, (n, k))) + +# Get a transposed view to obtain column-major Fortran memory layout. Note that this +# also changes the distribution of a and b (see example01 for more information). +a = a.T # a is now (k, m) with row_wise_distribution +b = b.T # b is now (k, n) with row_wise_distribution + +# Distribution of a, b and output. +distributions = [row_wise_distribution, row_wise_distribution, col_wise_distribution] + +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +qualifiers[0]["is_transpose"] = True # a is transposed + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.distributed.linalg.advanced.Matmul(a, b, distributions=distributions, qualifiers=qualifiers) as mm: + # Plan the matrix multiplication. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # No synchronization is needed for CPU tensors, since the execution always blocks. + + print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") + print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/distributed/linalg/advanced/matmul/example06_stateful_inplace.py b/examples/distributed/linalg/advanced/matmul/example06_stateful_inplace.py new file mode 100644 index 0000000..28b6d0f --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example06_stateful_inplace.py @@ -0,0 +1,83 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of inplace update of input operands in stateful matrix +multiplication APIs. + +The inputs as well as the result are CuPy ndarrays. + +NOTE: The operands should be updated inplace only when they are in a memory space that is +accessible from the execution space. In this case, the operands reside on the GPU while the +execution also happens on the GPU. + +The global operation performed in this example is: A @ B + +$ mpiexec -n 4 python example06_stateful_inplace.py +""" + +import logging + +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# Turn on logging to see what's happening. 
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# See example01 for details on matrix distribution and memory layout impact and +# requirements. + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +with cp.cuda.Device(device_id): + # See example01_cupy_symmetric_memory.py for an example of allocating on symmetric + # memory, which may further improve performance. + a = cp.random.rand(*row_wise_distribution.shape(rank, (k, m))) + b = cp.random.rand(*col_wise_distribution.shape(rank, (n, k))) + +# Get a transposed view to obtain column-major Fortran memory layout. Note that this +# also changes the distribution of a and b (see example01 for more information). +a = a.T # a is now (m, k) with col_wise_distribution +b = b.T # b is now (k, n) with row_wise_distribution + +# Distribution of a, b and output. +distributions = [col_wise_distribution, row_wise_distribution, col_wise_distribution] + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.distributed.linalg.advanced.Matmul(a, b, distributions=distributions) as mm: + # Plan the matrix multiplication. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # Update the operand A in-place. + print("Updating 'a' in-place.") + with cp.cuda.Device(device_id): + a[:] = cp.random.rand(*col_wise_distribution.shape(rank, (m, k))) + + # Execute the new matrix multiplication. + result = mm.execute() + + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. + cp.cuda.get_current_stream().synchronize() + print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") + print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/distributed/linalg/advanced/matmul/example06_stateful_reset.py b/examples/distributed/linalg/advanced/matmul/example06_stateful_reset.py new file mode 100644 index 0000000..948093f --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example06_stateful_reset.py @@ -0,0 +1,78 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates how to reset operands in stateful matrix multiplication APIs, and +reuse the object for multiple executions. This is needed when the memory space of the +operands is not accessible from the execution space, or if it's desired to bind new +(compatible) operands to the stateful object. + +The inputs as well as the result are NumPy ndarrays. + +The global operation performed in this example is: A @ B + +$ mpiexec -n 4 python example06_stateful_reset.py +""" + +import logging + +import numpy as np +import cuda.core.experimental +from mpi4py import MPI + +import nvmath.distributed + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cuda.core.experimental.system.num_devices +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# Turn on logging to see what's happening. 
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + +# The global problem size m, n, k +m, n, k = 128, 512, 256 + +# See example01 for details on matrix distribution and memory layout impact and +# requirements. + +row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) # partitioning on rows +col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) # partitioning on columns + +a = np.random.rand(*row_wise_distribution.shape(rank, (k, m))) +b = np.random.rand(*col_wise_distribution.shape(rank, (n, k))) + +# Get a transposed view to obtain column-major Fortran memory layout. Note that this +# also changes the distribution of a and b (see example01 for more information). +a = a.T # a is now (m, k) with col_wise_distribution +b = b.T # b is now (k, n) with row_wise_distribution + +# Distribution of a, b and output. +distributions = [col_wise_distribution, row_wise_distribution, col_wise_distribution] + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.distributed.linalg.advanced.Matmul(a, b, distributions=distributions) as mm: + # Plan the matrix multiplication. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # Create new operands and bind them. + c = np.random.rand(*col_wise_distribution.shape(rank, (m, k))) + d = np.random.rand(*row_wise_distribution.shape(rank, (k, n))) + mm.reset_operands(c, d) + + # Execute the new matrix multiplication. + result = mm.execute() + + # No synchronization is needed for CPU tensors, since the execution always blocks. + + print(f"Input types = {type(c), type(d)}") + print(f"Result type = {type(result)}") diff --git a/examples/distributed/linalg/advanced/matmul/example07_gemm.py b/examples/distributed/linalg/advanced/matmul/example07_gemm.py new file mode 100644 index 0000000..e1f95e7 --- /dev/null +++ b/examples/distributed/linalg/advanced/matmul/example07_gemm.py @@ -0,0 +1,66 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates distributed GEMM on CuPy ndarrays. + +GEMM (General Matrix Multiply) is defined as: +alpha * A @ B + beta * C +where `@` denotes matrix multiplication. + +The global operation performed in this example is: alpha * A @ B + beta * C + +$ mpiexec -n 4 python example07_gemm.py +""" + +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed +from nvmath.distributed.distribution import Slab + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +# cuBLASMp requires NVSHMEM and NCCL communication backends. +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"]) + +# The global problem size m, n, k +m, n, k = 128, 512, 1024 + +# See example01 for details on matrix distribution and memory layout impact and +# requirements. + +# Prepare sample input data. +with cp.cuda.Device(device_id): + # See example01_cupy_symmetric_memory.py for an example of allocating on symmetric + # memory, which may further improve performance. 
+    a = cp.random.rand(k // nranks, m).astype(cp.float32)  # partitioned on k
+    b = cp.random.rand(n, k // nranks).astype(cp.float32)  # partitioned on k
+    c = cp.random.rand(n // nranks, m).astype(cp.float32)  # partitioned on n
+a = a.T
+b = b.T
+c = c.T
+alpha = 0.45
+beta = 0.67
+
+distributions = [Slab.Y, Slab.X, Slab.Y]
+
+# Perform the distributed GEMM.
+result = nvmath.distributed.linalg.advanced.matmul(
+    a,
+    b,
+    c=c,
+    alpha=alpha,
+    beta=beta,
+    distributions=distributions,
+)
+
+# Synchronize the default stream, since by default the execution is non-blocking for GPU
+# operands.
+cp.cuda.get_current_stream().synchronize()
+
+assert result.shape == Slab.Y.shape(rank, (m, n))  # result is distributed column-wise
diff --git a/examples/distributed/linalg/advanced/matmul/example08_epilog_allreduce.py b/examples/distributed/linalg/advanced/matmul/example08_epilog_allreduce.py
new file mode 100644
index 0000000..ffff68e
--- /dev/null
+++ b/examples/distributed/linalg/advanced/matmul/example08_epilog_allreduce.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example demonstrates usage of the AllReduce epilog.
+
+With cuBLASMp's GEMM+AllReduce algorithm, each process calculates a part of the output,
+which is then reduced (summed) using the AllReduce operation, resulting in an output
+matrix that is the same across all processes.
+
+The global operation performed in this example is: A.T @ B
+The AllReduce epilog operation is a sum reduction of the partial result of each process,
+resulting in the same output matrix of shape (m, n) on all processes.
+
+$ mpiexec -n 4 python example08_epilog_allreduce.py
+"""
+
+import numpy as np
+import cupy as cp
+from mpi4py import MPI
+
+import nvmath.distributed
+from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic
+from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype
+
+# Initialize nvmath.distributed.
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()
+nranks = comm.Get_size()
+device_id = rank % cp.cuda.runtime.getDeviceCount()
+# cuBLASMp requires NVSHMEM and NCCL communication backends.
+nvmath.distributed.initialize(device_id, comm, backends=["nvshmem", "nccl"])
+
+# The global problem size m, n, k
+m, n, k = 256, 512, 128
+
+# See example01 for details on matrix distribution and memory layout impact and
+# requirements.
+
+# As of cuBLASMp 0.6, GEMM+AllReduce requires TN format with A and B distributed row-wise
+# and C and D matrices distributed column-wise.
+
+row_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(nranks, 1)))  # partitioning on rows
+col_wise_distribution = BlockNonCyclic(ProcessGrid(shape=(1, nranks)))  # partitioning on columns
+
+with cp.cuda.Device(device_id):
+    # See example01_cupy_symmetric_memory.py for an example of allocating on symmetric
+    # memory, which may further improve performance.
+    a = cp.random.rand(*col_wise_distribution.shape(rank, (m, k)))
+    b = cp.random.rand(*col_wise_distribution.shape(rank, (n, k)))
+
+# Get a transposed view to obtain column-major Fortran memory layout. Note that this
+# also changes the distribution of a and b (see example01 for more information).
+a = a.T  # a is now (k, m) with row_wise_distribution
+b = b.T  # b is now (k, n) with row_wise_distribution
+
+# Distribution of a, b and output.
+distributions = [row_wise_distribution, row_wise_distribution, col_wise_distribution] + +qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) +qualifiers[0]["is_transpose"] = True # a is transposed + +epilog = nvmath.distributed.linalg.advanced.MatmulEpilog.ALLREDUCE +result = nvmath.distributed.linalg.advanced.matmul(a, b, distributions=distributions, epilog=epilog, qualifiers=qualifiers) + +# AllReduce results in the same output matrix of shape (m, n) on each process. +assert result.shape == (m, n) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. +cp.cuda.get_current_stream().synchronize() diff --git a/examples/distributed/reshape/example01_cupy.py b/examples/distributed/reshape/example01_cupy.py index f3bf990..43042c8 100644 --- a/examples/distributed/reshape/example01_cupy.py +++ b/examples/distributed/reshape/example01_cupy.py @@ -28,6 +28,7 @@ import cupy as cp import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. from mpi4py import MPI @@ -36,7 +37,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) assert nranks == 2, "Please run with two processes" @@ -62,11 +63,11 @@ # Reshape from column-wise to row-wise. if rank == 0: - input_box = [(0, 0), (4, 2)] - output_box = [(0, 0), (2, 4)] + input_box = Box((0, 0), (4, 2)) + output_box = Box((0, 0), (2, 4)) else: - input_box = [(0, 2), (4, 4)] - output_box = [(2, 0), (4, 4)] + input_box = Box((0, 2), (4, 4)) + output_box = Box((2, 0), (4, 4)) # Distributed reshape returns a new operand with its own memory buffer # on the symmetric heap. A_reshaped = nvmath.distributed.reshape.reshape(A, input_box, output_box) diff --git a/examples/distributed/reshape/example01_numpy.py b/examples/distributed/reshape/example01_numpy.py index 77e1e61..45b0e7d 100644 --- a/examples/distributed/reshape/example01_numpy.py +++ b/examples/distributed/reshape/example01_numpy.py @@ -32,6 +32,7 @@ import numpy as np import cuda.core.experimental import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. from mpi4py import MPI @@ -40,7 +41,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cuda.core.experimental.system.num_devices -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) assert nranks == 2, "Please run with two processes" @@ -49,11 +50,11 @@ # Reshape from column-wise to row-wise. if rank == 0: - input_box = [(0, 0), (4, 2)] - output_box = [(0, 0), (2, 4)] + input_box = Box((0, 0), (4, 2)) + output_box = Box((0, 0), (2, 4)) else: - input_box = [(0, 2), (4, 4)] - output_box = [(2, 0), (4, 4)] + input_box = Box((0, 2), (4, 4)) + output_box = Box((2, 0), (4, 4)) # Distributed reshape returns a new operand with its own buffer. A_reshaped = nvmath.distributed.reshape.reshape(A, input_box, output_box) diff --git a/examples/distributed/reshape/example01_torch.py b/examples/distributed/reshape/example01_torch.py index 6020776..986d174 100644 --- a/examples/distributed/reshape/example01_torch.py +++ b/examples/distributed/reshape/example01_torch.py @@ -28,6 +28,7 @@ import torch import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. 
from mpi4py import MPI @@ -36,7 +37,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % torch.cuda.device_count() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) assert nranks == 2, "Please run with two processes" @@ -61,11 +62,11 @@ # Reshape from column-wise to row-wise. if rank == 0: - input_box = [(0, 0), (4, 2)] - output_box = [(0, 0), (2, 4)] + input_box = Box((0, 0), (4, 2)) + output_box = Box((0, 0), (2, 4)) else: - input_box = [(0, 2), (4, 4)] - output_box = [(2, 0), (4, 4)] + input_box = Box((0, 2), (4, 4)) + output_box = Box((2, 0), (4, 4)) # Distributed reshape returns a new operand with its own memory buffer # on the symmetric heap. A_reshaped = nvmath.distributed.reshape.reshape(A, input_box, output_box) diff --git a/examples/distributed/reshape/example02_stateful_cupy.py b/examples/distributed/reshape/example02_stateful_cupy.py index 2481dea..226c647 100644 --- a/examples/distributed/reshape/example02_stateful_cupy.py +++ b/examples/distributed/reshape/example02_stateful_cupy.py @@ -15,13 +15,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The problem consists of a global 3-D array of size (512, 256, 512), that is # initially partitioned on the X axis across processes. @@ -39,10 +40,10 @@ # We can get the offset of this process on the partitioned dimension with a prefix # reduction. x_offset = comm.scan(X // nranks, op=MPI.SUM) -input_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +input_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) y_offset = comm.scan(Y // nranks, op=MPI.SUM) -output_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +output_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) # Create a stateful Reshape object 'r'. with nvmath.distributed.reshape.Reshape(a, input_box, output_box) as r: diff --git a/examples/distributed/reshape/example02_stateful_torch.py b/examples/distributed/reshape/example02_stateful_torch.py index 94f8c17..99f6ca4 100644 --- a/examples/distributed/reshape/example02_stateful_torch.py +++ b/examples/distributed/reshape/example02_stateful_torch.py @@ -15,13 +15,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % torch.cuda.device_count() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The problem consists of a global 3-D array of size (512, 256, 512), that is # initially partitioned on the Y axis across processes. @@ -38,10 +39,10 @@ # We can get the offset of this process on the partitioned dimension with a prefix # reduction. y_offset = comm.scan(Y // nranks, op=MPI.SUM) -input_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +input_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) x_offset = comm.scan(X // nranks, op=MPI.SUM) -output_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +output_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) # Create a stateful Reshape object 'r'. 
with nvmath.distributed.reshape.Reshape(a, input_box, output_box) as r: diff --git a/examples/distributed/reshape/example02_stateful_torch_cpu.py b/examples/distributed/reshape/example02_stateful_torch_cpu.py index a1974ca..ba517d5 100644 --- a/examples/distributed/reshape/example02_stateful_torch_cpu.py +++ b/examples/distributed/reshape/example02_stateful_torch_cpu.py @@ -18,13 +18,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % torch.cuda.device_count() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The problem consists of a global 3-D array of size (512, 256, 512), that is # initially partitioned on the Y axis across processes. @@ -37,10 +38,10 @@ # We can get the offset of this process on the partitioned dimension with a prefix # reduction. y_offset = comm.scan(Y // nranks, op=MPI.SUM) -input_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +input_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) x_offset = comm.scan(X // nranks, op=MPI.SUM) -output_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +output_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) # Create a stateful Reshape object 'r'. with nvmath.distributed.reshape.Reshape(a, input_box, output_box) as r: diff --git a/examples/distributed/reshape/example03_options.py b/examples/distributed/reshape/example03_options.py index 81a3c40..e6ecfc7 100644 --- a/examples/distributed/reshape/example03_options.py +++ b/examples/distributed/reshape/example03_options.py @@ -13,6 +13,7 @@ import cupy as cp import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. from mpi4py import MPI @@ -21,7 +22,7 @@ rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The problem consists of a global 3-D array of size (64, 256, 128), that is # initially partitioned on the X axis across processes. @@ -37,10 +38,10 @@ # We're going to redistribute the operand so that it is partitioned on the Y axis. x_offset = comm.scan(X // nranks, op=MPI.SUM) -input_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +input_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) y_offset = comm.scan(Y // nranks, op=MPI.SUM) -output_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +output_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) # Execute the Reshape. diff --git a/examples/distributed/reshape/example04_logging_global.py b/examples/distributed/reshape/example04_logging_global.py index bba8a30..bc81d6c 100644 --- a/examples/distributed/reshape/example04_logging_global.py +++ b/examples/distributed/reshape/example04_logging_global.py @@ -14,13 +14,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D problem size is (512, 512, 256), initially partitioned on the X axis # across processes. 
@@ -40,10 +41,10 @@ # Reshape the operand so that it is partitioned on the Y axis. x_offset = comm.scan(X // nranks, op=MPI.SUM) -input_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +input_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) y_offset = comm.scan(Y // nranks, op=MPI.SUM) -output_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +output_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) b = nvmath.distributed.reshape.reshape(a, input_box, output_box) diff --git a/examples/distributed/reshape/example04_logging_user.py b/examples/distributed/reshape/example04_logging_user.py index 8c66cb5..cf927a6 100644 --- a/examples/distributed/reshape/example04_logging_user.py +++ b/examples/distributed/reshape/example04_logging_user.py @@ -14,13 +14,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D problem size is (512, 512, 256), initially partitioned on the X axis # across processes. @@ -53,10 +54,10 @@ # Reshape the operand so that it is partitioned on the Y axis. x_offset = comm.scan(X // nranks, op=MPI.SUM) -input_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +input_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) y_offset = comm.scan(Y // nranks, op=MPI.SUM) -output_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +output_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) b = nvmath.distributed.reshape.reshape(a, input_box, output_box) diff --git a/examples/distributed/reshape/example05_stateful_reset.py b/examples/distributed/reshape/example05_stateful_reset.py index cbcb4ac..d0cf9b7 100644 --- a/examples/distributed/reshape/example05_stateful_reset.py +++ b/examples/distributed/reshape/example05_stateful_reset.py @@ -13,13 +13,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D problem size is (512, 512, 512), initially partitioned on the Y # axes across processes. @@ -34,10 +35,10 @@ # We're going to redistribute the operand so that it is partitioned on the X axis. y_offset = comm.scan(Y // nranks, op=MPI.SUM) -input_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +input_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) x_offset = comm.scan(X // nranks, op=MPI.SUM) -output_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +output_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) # Create a stateful Reshape object 'r'. 
with nvmath.distributed.reshape.Reshape(a, input_box, output_box) as r: diff --git a/examples/distributed/reshape/example05_stateful_reset_inplace.py b/examples/distributed/reshape/example05_stateful_reset_inplace.py index ccd0dc7..fd7d364 100644 --- a/examples/distributed/reshape/example05_stateful_reset_inplace.py +++ b/examples/distributed/reshape/example05_stateful_reset_inplace.py @@ -14,13 +14,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D problem size is (512, 512, 512), initially partitioned on the Y # axes across processes. @@ -35,10 +36,10 @@ # We're going to redistribute the operand so that it is partitioned on the X axis. y_offset = comm.scan(Y // nranks, op=MPI.SUM) -input_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +input_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) x_offset = comm.scan(X // nranks, op=MPI.SUM) -output_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +output_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) # Create a stateful Reshape object 'r'. with nvmath.distributed.reshape.Reshape(a, input_box, output_box) as r: diff --git a/examples/distributed/reshape/example06_streams.py b/examples/distributed/reshape/example06_streams.py index 0ba94f6..4eb9ea5 100644 --- a/examples/distributed/reshape/example06_streams.py +++ b/examples/distributed/reshape/example06_streams.py @@ -12,13 +12,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box # Initialize nvmath.distributed. comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D problem size is (512, 256, 256), initially partitioned on the X # axis across processes. @@ -33,10 +34,10 @@ # We're going to redistribute the operand so that it is partitioned on the Y axis. x_offset = comm.scan(X // nranks, op=MPI.SUM) -input_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +input_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) y_offset = comm.scan(Y // nranks, op=MPI.SUM) -output_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +output_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) # Create a CUDA stream to use for instantiating, planning, and first execution of a stateful # distributed Reshape object 'r'. diff --git a/examples/distributed/reshape/example07_sync_symmetric_memory.py b/examples/distributed/reshape/example07_sync_symmetric_memory.py index c02611d..56e9181 100644 --- a/examples/distributed/reshape/example07_sync_symmetric_memory.py +++ b/examples/distributed/reshape/example07_sync_symmetric_memory.py @@ -15,13 +15,14 @@ from mpi4py import MPI import nvmath.distributed +from nvmath.distributed.distribution import Box, Slab # Initialize nvmath.distributed. 
comm = MPI.COMM_WORLD rank = comm.Get_rank() nranks = comm.Get_size() device_id = rank % cp.cuda.runtime.getDeviceCount() -nvmath.distributed.initialize(device_id, comm) +nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) # The global 3-D problem size is (256, 512, 512), initially partitioned on the Y # axes across processes. @@ -41,10 +42,10 @@ # Operand a is distributed according to Slab.Y distribution, so we need to re-distribute it # to Slab.X. y_offset = comm.scan(Y // nranks, op=MPI.SUM) -input_box = [(0, y_offset - Y // nranks, 0), (X, y_offset, Z)] +input_box = Box((0, y_offset - Y // nranks, 0), (X, y_offset, Z)) x_offset = comm.scan(X // nranks, op=MPI.SUM) -output_box = [(x_offset - X // nranks, 0, 0), (x_offset, Y, Z)] +output_box = Box((x_offset - X // nranks, 0, 0), (x_offset, Y, Z)) # Execute the Reshape. # Before the reshape executes, the local changes to the input operand must be visible @@ -60,7 +61,7 @@ # Create a stateful FFT object 'f' with Slab.X distribution. # Note that we could also specify the permuted boxes as distribution, # i.e. distribution=[output_box, input_box] -with nvmath.distributed.fft.FFT(b, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) as f: +with nvmath.distributed.fft.FFT(b, distribution=Slab.X, options={"reshape": False}) as f: # Plan the FFT. f.plan() diff --git a/examples/fft/example19_convolution_epilog_callback.py b/examples/fft/example19_convolution_epilog_callback.py index c068092..70b4719 100644 --- a/examples/fft/example19_convolution_epilog_callback.py +++ b/examples/fft/example19_convolution_epilog_callback.py @@ -8,7 +8,7 @@ To run this example, CUDA Toolkit 12.6U2 and device API (dx) dependencies are required. The quickest way for pip users to set them up is to install -nvmath as ``pip install nvmath-python[cu12,dx]``. +nvmath as ``pip install nvmath-python[cu12-dx]``. For further details, please see :ref:`FFT callbacks `. """ diff --git a/examples/fft/example19_convolution_memory_layout_callback.py b/examples/fft/example19_convolution_memory_layout_callback.py index 3b6b48d..f32803c 100644 --- a/examples/fft/example19_convolution_memory_layout_callback.py +++ b/examples/fft/example19_convolution_memory_layout_callback.py @@ -8,7 +8,7 @@ To run this example, CUDA Toolkit 12.6U2 and device API (dx) dependencies are required. The quickest way for pip users to set them up is to install -nvmath as ``pip install nvmath-python[cu12,dx]``. +nvmath as ``pip install nvmath-python[cu12-dx]``. For further details, please see :ref:`FFT callbacks `. """ diff --git a/examples/fft/example19_convolution_prolog_callback.py b/examples/fft/example19_convolution_prolog_callback.py index 81a9052..9f6003e 100644 --- a/examples/fft/example19_convolution_prolog_callback.py +++ b/examples/fft/example19_convolution_prolog_callback.py @@ -8,7 +8,7 @@ To run this example, CUDA Toolkit 12.6U2 and device API (dx) dependencies are required. The quickest way for pip users to set them up is to install -nvmath as ``pip install nvmath-python[cu12,dx]``. +nvmath as ``pip install nvmath-python[cu12-dx]``. For further details, please see :ref:`FFT callbacks `. 
""" diff --git a/examples/linalg/advanced/matmul/example05_stateful_reset.py b/examples/linalg/advanced/matmul/example05_stateful_reset.py index 462b8b6..696b371 100644 --- a/examples/linalg/advanced/matmul/example05_stateful_reset.py +++ b/examples/linalg/advanced/matmul/example05_stateful_reset.py @@ -22,7 +22,6 @@ # Prepare sample input data m, n, k = 123, 456, 789 -m, n, k = 2, 3, 4 a = np.random.rand(m, k) b = np.random.rand(k, n) diff --git a/examples/linalg/generic/matmul/example01_cupy.py b/examples/linalg/generic/matmul/example01_cupy.py new file mode 100644 index 0000000..00b7bbc --- /dev/null +++ b/examples/linalg/generic/matmul/example01_cupy.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic matrix multiplication of CuPy arrays using the generic API. + +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the GPU like the +inputs. +""" + +import cupy as cp + +import nvmath + +# Prepare sample input data. +m, n, k = 123, 456, 789 +a = cp.random.rand(m, k) +b = cp.random.rand(k, n) + +# The execution happens on the GPU by default since the operands are on the GPU. + +# Perform the multiplication. +result = nvmath.linalg.matmul(a, b) + +print(cp.allclose(a @ b, result)) diff --git a/examples/linalg/generic/matmul/example01_numpy.py b/examples/linalg/generic/matmul/example01_numpy.py new file mode 100644 index 0000000..c833935 --- /dev/null +++ b/examples/linalg/generic/matmul/example01_numpy.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic matrix multiplication of NumPy arrays using the generic API. + +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the CPU like the +inputs. +""" + +import numpy as np + +import nvmath + +# Prepare sample input data. +m, n, k = 123, 456, 789 +a = np.random.rand(m, k) +b = np.random.rand(k, n) + +# We can choose the execution space for the matrix multiplication using ExecutionCUDA or +# ExecutionCPU. By default, the execution space matches the operands, so in order to execute +# a matrix multiplication on NumPy arrays using CUDA we need to specify ExecutionCUDA. +# Tip: use help(nvmath.linalg.ExecutionCUDA) to see available options. +execution = nvmath.linalg.ExecutionCUDA() + +# Perform the multiplication. +result = nvmath.linalg.matmul(a, b, execution=execution) + +# Alternatively, the execution space can be specified as a string "cuda", which is +# identical to providing a default-constructed ExecutionCUDA() object. +result = nvmath.linalg.matmul(a, b, execution="cuda") + +print(np.allclose(a @ b, result)) diff --git a/examples/linalg/generic/matmul/example01_torch_cpu_inputs.py b/examples/linalg/generic/matmul/example01_torch_cpu_inputs.py new file mode 100644 index 0000000..a4ee633 --- /dev/null +++ b/examples/linalg/generic/matmul/example01_torch_cpu_inputs.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic matrix multiplication of PyTorch CPU arrays using the +generic API. + +nvmath-python supports multiple frameworks. 
The result of each operation is a tensor of the
+same framework that was used to pass the inputs. It is also located on the CPU like the
+inputs.
+"""
+
+import torch
+
+import nvmath
+
+# Prepare sample input data.
+m, n, k = 123, 456, 789
+a = torch.rand(m, k)
+b = torch.rand(k, n)
+
+# We can choose the execution space for the matrix multiplication using ExecutionCUDA or
+# ExecutionCPU. By default, the execution space matches the operands, so in order to execute
+# a matrix multiplication on PyTorch CPU tensors using CUDA we need to specify ExecutionCUDA.
+# Tip: use help(nvmath.linalg.ExecutionCUDA) to see available options.
+execution = nvmath.linalg.ExecutionCUDA()
+
+# Perform the multiplication.
+result = nvmath.linalg.matmul(a, b, execution=execution)
+
+# Alternatively, the execution space can be specified as a string "cuda", which is
+# identical to providing a default-constructed ExecutionCUDA() object.
+result = nvmath.linalg.matmul(a, b, execution="cuda")
+
+print(torch.allclose(a @ b, result))
diff --git a/examples/linalg/generic/matmul/example01_torch_gpu_inputs.py b/examples/linalg/generic/matmul/example01_torch_gpu_inputs.py
new file mode 100644
index 0000000..01daa81
--- /dev/null
+++ b/examples/linalg/generic/matmul/example01_torch_gpu_inputs.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example demonstrates basic matrix multiplication of PyTorch GPU arrays using the
+generic API.
+
+nvmath-python supports multiple frameworks. The result of each operation is a tensor of the
+same framework that was used to pass the inputs. It is also located on the GPU like the
+inputs.
+"""
+
+import torch
+
+import nvmath
+
+# Prepare sample input data.
+m, n, k = 123, 456, 789
+device_id = 0
+a = torch.rand(m, k, device=device_id)
+b = torch.rand(k, n, device=device_id)
+
+# The execution happens on the GPU by default since the operands are on the GPU.
+
+# Perform the multiplication.
+result = nvmath.linalg.matmul(a, b)
+
+print(torch.allclose(a @ b, result))
diff --git a/examples/linalg/generic/matmul/example02_diag_qualifiers.py b/examples/linalg/generic/matmul/example02_diag_qualifiers.py
new file mode 100644
index 0000000..fcabe7b
--- /dev/null
+++ b/examples/linalg/generic/matmul/example02_diag_qualifiers.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates how to specify the operand structure (triangular, symmetric, ...)
+to a generic matrix multiplication operation.
+
+In this example, we will multiply a general NumPy ndarray with a diagonal one. The
+result is also a general NumPy ndarray.
+"""
+
+import numpy as np
+
+import nvmath
+
+# Prepare sample input data.
+m, k = 123, 789
+# Transpose and conjugate operations are not supported for the diagonal matmul API, so we
+# must provide a column-order matrix.
+a = np.random.rand(m, k).astype(np.float32, order="F")
+# The matmul function accepts diagonal matrices as a vector. To extract the main diagonal
+# from an existing NumPy array, see np.diag() or np.diagonal().
+b = np.random.rand(k).astype(np.float32)
+
+# We can use structured matrices as inputs by providing the corresponding qualifier which
+# describes the matrix. By default, all inputs are assumed to be general matrices.
+# MatrixQualifiers are provided as a NumPy ndarray of custom NumPy dtype,
+# nvmath.linalg.matrix_qualifiers_dtype.
+qualifiers = np.full((2,), nvmath.linalg.GeneralMatrixQualifier.create(), dtype=nvmath.linalg.matrix_qualifiers_dtype)
+qualifiers[1] = nvmath.linalg.DiagonalMatrixQualifier.create()
+
+result = nvmath.linalg.matmul(a, b, execution="cuda", qualifiers=qualifiers)
+
+# No synchronization is needed for CPU tensors, since the execution always blocks.
+print(np.allclose(a @ np.diag(b), result))
diff --git a/examples/linalg/generic/matmul/example02_options.py b/examples/linalg/generic/matmul/example02_options.py
new file mode 100644
index 0000000..7787f7f
--- /dev/null
+++ b/examples/linalg/generic/matmul/example02_options.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates how to specify options to a matrix multiplication operation.
+
+In this example, we will use NumPy ndarrays as input, select the CUDA execution space,
+and qualify one of the operands as a lower-triangular matrix.
+"""
+
+import numpy as np
+
+import nvmath
+
+# Prepare sample input data.
+m, k = 123, 789
+a = np.random.rand(m, k).astype(np.float32)
+b = np.tril(np.random.rand(k, k).astype(np.float32))
+
+# We can choose the execution space for the matrix multiplication using ExecutionCUDA or
+# ExecutionCPU. By default, the execution space matches the operands, so in order to execute
+# a matrix multiplication on NumPy arrays using CUDA we need to specify ExecutionCUDA.
+# Tip: use help(nvmath.linalg.ExecutionCUDA) to see available options.
+execution = nvmath.linalg.ExecutionCUDA()
+
+# We can use structured matrices as inputs by providing the corresponding qualifier which
+# describes the matrix. By default, all inputs are assumed to be general matrices.
+# MatrixQualifiers are provided as an array of custom NumPy dtype,
+# nvmath.linalg.matrix_qualifiers_dtype.
+qualifiers = np.full((2,), nvmath.linalg.GeneralMatrixQualifier.create(), dtype=nvmath.linalg.matrix_qualifiers_dtype)
+qualifiers[1] = nvmath.linalg.TriangularMatrixQualifier.create(uplo=nvmath.linalg.FillMode.LOWER)
+
+result = nvmath.linalg.matmul(a, b, execution=execution, qualifiers=qualifiers)
+
+# No synchronization is needed for CPU tensors, since the execution always blocks.
+
+# Check that the result is a NumPy array as well.
+print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.")
+print(f"Inputs were of data types {a.dtype} and {b.dtype} and the result is of data type {result.dtype}.")
+assert isinstance(result, np.ndarray)
diff --git a/examples/linalg/generic/matmul/example02_symm_qualifiers.py b/examples/linalg/generic/matmul/example02_symm_qualifiers.py
new file mode 100644
index 0000000..e74c6a3
--- /dev/null
+++ b/examples/linalg/generic/matmul/example02_symm_qualifiers.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates how to specify the operand structure (triangular, symmetric, ...)
+to a generic matrix multiplication operation.
+
+In this example, we will multiply a symmetric CuPy ndarray with a general one. The result
+is also a general CuPy ndarray.
+"""
+
+import cupy as cp
+import numpy as np
+
+import nvmath
+
+# Prepare sample input data.
+n, k = 123, 789
+a = cp.random.rand(k, k).astype(cp.float32)
+b = cp.random.rand(k, n).astype(cp.float32)
+
+# We can use structured matrices as inputs by providing the corresponding qualifier which
+# describes the matrix.
By default, all inputs are assumed to be general matrices.
+# MatrixQualifiers are provided as a NumPy ndarray of custom NumPy dtype,
+# nvmath.linalg.matrix_qualifiers_dtype.
+qualifiers = np.full((2,), nvmath.linalg.GeneralMatrixQualifier.create(), dtype=nvmath.linalg.matrix_qualifiers_dtype)
+qualifiers[0] = nvmath.linalg.SymmetricMatrixQualifier.create(uplo=nvmath.linalg.FillMode.LOWER)
+
+result = nvmath.linalg.matmul(a, b, qualifiers=qualifiers)
+
+# The check below runs on the same CUDA stream, so no explicit synchronization is needed.
+
+# Create the symmetric matrix from the lower-triangular part of `a`.
+s = cp.tril(a, k=-1)
+s += s.T + cp.diag(cp.diag(a))
+print(cp.allclose(s @ b, result))
diff --git a/examples/linalg/generic/matmul/example02_tril_qualifiers.py b/examples/linalg/generic/matmul/example02_tril_qualifiers.py
new file mode 100644
index 0000000..d398084
--- /dev/null
+++ b/examples/linalg/generic/matmul/example02_tril_qualifiers.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates how to specify the operand structure (triangular, symmetric, ...)
+to a generic matrix multiplication operation.
+
+In this example, we will multiply a general NumPy ndarray with a lower-triangular one. The
+result is also a general NumPy ndarray.
+"""
+
+import numpy as np
+
+import nvmath
+
+# Prepare sample input data.
+m, k = 123, 789
+a = np.random.rand(m, k).astype(np.float32)
+b = np.tril(np.random.rand(k, k).astype(np.float32))
+
+# We can use structured matrices as inputs by providing the corresponding qualifier which
+# describes the matrix. By default, all inputs are assumed to be general matrices.
+# MatrixQualifiers are provided as a NumPy ndarray of custom NumPy dtype,
+# nvmath.linalg.matrix_qualifiers_dtype.
+qualifiers = np.full((2,), nvmath.linalg.GeneralMatrixQualifier.create(), dtype=nvmath.linalg.matrix_qualifiers_dtype)
+qualifiers[1] = nvmath.linalg.TriangularMatrixQualifier.create(uplo=nvmath.linalg.FillMode.LOWER)
+
+result = nvmath.linalg.matmul(a, b, execution="cuda", qualifiers=qualifiers)
+
+# No synchronization is needed for CPU tensors, since the execution always blocks.
+print(np.allclose(a @ b, result))
diff --git a/examples/linalg/generic/matmul/example03_logging_global.py b/examples/linalg/generic/matmul/example03_logging_global.py
new file mode 100644
index 0000000..bf1516b
--- /dev/null
+++ b/examples/linalg/generic/matmul/example03_logging_global.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example demonstrates how to turn on logging using the global logger.
+"""
+
+import cupy as cp
+
+import nvmath
+
+# Turn on logging. Here we use the global logger, set the level to "debug", and use a custom
+# format for the log.
+import logging
+
+logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S")
+
+# Prepare sample input data.
+m, n, k = 64, 128, 256
+a = cp.random.rand(m, k)
+b = cp.random.rand(k, n)
+alpha = 0.45
+
+# Perform the GEMM.
+result = nvmath.linalg.matmul(a, b, alpha=alpha)
+
+# Synchronize the default stream, since by default the execution is non-blocking for GPU
+# operands.
+cp.cuda.get_current_stream().synchronize() diff --git a/examples/linalg/generic/matmul/example03_logging_user.py b/examples/linalg/generic/matmul/example03_logging_user.py new file mode 100644 index 0000000..6779fe4 --- /dev/null +++ b/examples/linalg/generic/matmul/example03_logging_user.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates the use of a user-provided logger. +""" + +import logging + +import cupy as cp + +import nvmath + +# Create and configure a user logger. +# Any of the features provided by the logging module can be used. +logger = logging.getLogger("userlogger") +logging.getLogger().setLevel(logging.NOTSET) + +# Create a console handler for the logger and set level. +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) + +# Create a formatter and associate with handler. +formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") +handler.setFormatter(formatter) + +# Associate handler with logger, resulting in a logger with the desired level, format, and +# console output. +logger.addHandler(handler) + +# Prepare sample input data. +m, n, k = 64, 128, 256 +a = cp.random.rand(m, k) +b = cp.random.rand(k, n) +alpha = 0.45 + +# Specify the custom logger in the matrix multiplication options. +o = nvmath.linalg.MatmulOptions(logger=logger) +# Specify the options to the matrix multiplication operation. +result = nvmath.linalg.matmul(a, b, alpha=alpha, options=o) + +print("---") + +# Recall that the options can also be provided as a dict, so the following is an +# alternative, entirely equivalent way to specify options. +result = nvmath.linalg.matmul(a, b, alpha=alpha, options={"logger": logger}) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. +cp.cuda.get_current_stream().synchronize() diff --git a/examples/linalg/generic/matmul/example04_stateful_cupy.py b/examples/linalg/generic/matmul/example04_stateful_cupy.py new file mode 100644 index 0000000..e748589 --- /dev/null +++ b/examples/linalg/generic/matmul/example04_stateful_cupy.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful matrix multiplication objects. Stateful objects +amortize the cost of preparation across multiple executions. + +The inputs as well as the result are CuPy ndarrays. +""" + +import cupy as cp + +import nvmath + +# Prepare sample input data. +m, n, k = 123, 456, 789 +a = cp.random.rand(m, k) +b = cp.random.rand(k, n) + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.linalg.Matmul(a, b) as mm: + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured as we'll see in a later example. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. 
+ cp.cuda.get_current_stream().synchronize() + print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") + print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/generic/matmul/example04_stateful_torch.py b/examples/linalg/generic/matmul/example04_stateful_torch.py new file mode 100644 index 0000000..189bd63 --- /dev/null +++ b/examples/linalg/generic/matmul/example04_stateful_torch.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful matrix multiplication objects. Stateful objects +amortize the cost of preparation across multiple executions. + +The inputs as well as the result are PyTorch tensors on the GPU. +""" + +import torch + +import nvmath + +# Prepare sample input data +device_id = 0 +m, n, k = 123, 456, 789 +a = torch.rand(m, k, device=device_id) +b = torch.rand(k, n, device=device_id) + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.linalg.Matmul(a, b) as mm: + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured, as we'll see in a later example. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. + torch.cuda.default_stream().synchronize() + print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") + print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/generic/matmul/example04_stateful_torch_cpu.py b/examples/linalg/generic/matmul/example04_stateful_torch_cpu.py new file mode 100644 index 0000000..d3fcc34 --- /dev/null +++ b/examples/linalg/generic/matmul/example04_stateful_torch_cpu.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful matrix multiplication objects. Stateful objects +amortize the cost of preparation across multiple executions. + +The inputs as well as the result are PyTorch tensors on the CPU. +""" + +import torch + +import nvmath + +# Prepare sample input data +m, n, k = 123, 456, 789 +a = torch.rand(m, k) +b = torch.rand(k, n) + +# Use the stateful object as a context manager to automatically release resources. +with nvmath.linalg.Matmul(a, b) as mm: + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured, as we'll see in a later example. + mm.plan() + + # Execute the matrix multiplication. + result = mm.execute() + + # No synchronization is needed for CPU tensors, since the execution always blocks. + + print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") + print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/generic/matmul/example05_stateful_inplace.py b/examples/linalg/generic/matmul/example05_stateful_inplace.py new file mode 100644 index 0000000..9c14644 --- /dev/null +++ b/examples/linalg/generic/matmul/example05_stateful_inplace.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of inplace update of input operands in stateful matrix +multiplication APIs. + +The inputs as well as the result are CuPy ndarrays. 
+
+NOTE: The operands should be updated inplace only when they are in a memory space that is
+accessible from the execution space. In this case, the operands reside on the GPU while the
+execution also happens on the GPU.
+"""
+
+import logging
+
+import cupy as cp
+
+import nvmath
+
+# Turn on logging to see what's happening.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S")
+
+# Prepare sample input data
+m, n, k = 123, 456, 789
+a = cp.random.rand(m, k)
+b = cp.random.rand(k, n)
+
+# Use the stateful object as a context manager to automatically release resources.
+with nvmath.linalg.Matmul(a, b) as mm:
+    # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be
+    # configured as we'll see in a later example.
+    mm.plan()
+
+    # Execute the matrix multiplication.
+    result = mm.execute()
+
+    # Update the operand A in-place.
+    print("Updating 'a' in-place.")
+    a[:] = cp.random.rand(m, k)
+
+    # Execute the new matrix multiplication.
+    result = mm.execute()
+
+    # Synchronize the default stream, since by default the execution is non-blocking for GPU
+    # operands.
+    cp.cuda.get_current_stream().synchronize()
+    print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}")
+    print(f"Result type = {type(result)}, device = {result.device}")
diff --git a/examples/linalg/generic/matmul/example05_stateful_reset.py b/examples/linalg/generic/matmul/example05_stateful_reset.py
new file mode 100644
index 0000000..7eaa0f9
--- /dev/null
+++ b/examples/linalg/generic/matmul/example05_stateful_reset.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates how to reset operands in stateful matrix multiplication APIs, and
+reuse the object for multiple executions. This is needed when the memory space of the
+operands is not accessible from the execution space, or if it's desired to bind new
+(compatible) operands to the stateful object.
+
+The inputs as well as the result are NumPy ndarrays.
+"""
+
+import logging
+
+import numpy as np
+
+import nvmath
+
+# Turn on logging to see what's happening.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S")
+
+# Prepare sample input data
+m, n, k = 123, 456, 789
+a = np.random.rand(m, k)
+b = np.random.rand(k, n)
+
+# Use the stateful object as a context manager to automatically release resources.
+with nvmath.linalg.Matmul(a, b, execution="cuda") as mm:
+    # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be
+    # configured as we'll see in a later example.
+    mm.plan()
+
+    # Execute the matrix multiplication.
+    result = mm.execute()
+
+    # Create new operands and bind them.
+    c = np.random.rand(m, k)
+    d = np.random.rand(k, n)
+    mm.reset_operands(c, d)
+
+    # Execute the new matrix multiplication.
+    result = mm.execute()
+
+    # No synchronization is needed for CPU tensors, since the execution always blocks.
+
+    print(f"Input types = {type(c), type(d)}")
+    print(f"Result type = {type(result)}")
diff --git a/examples/linalg/generic/matmul/example06_gemm.py b/examples/linalg/generic/matmul/example06_gemm.py
new file mode 100644
index 0000000..e2368cd
--- /dev/null
+++ b/examples/linalg/generic/matmul/example06_gemm.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates GEMM on CuPy ndarrays. + +GEMM (General Matrix Multiply) is defined as: +alpha * A @ B + beta * C +where `@` denotes matrix multiplication. +""" + +import cupy as cp + +import nvmath + +# Prepare sample input data. +m, n, k = 64, 128, 256 +a = cp.random.rand(m, k) +b = cp.random.rand(k, n) +c = cp.random.rand(m, n) +alpha = 0.45 +beta = 0.67 + +# Perform the GEMM. +result = nvmath.linalg.matmul(a, b, c=c, alpha=alpha, beta=beta) + +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. +cp.cuda.get_current_stream().synchronize() diff --git a/examples/linalg/generic/matmul/example08_sliced_cupy.py b/examples/linalg/generic/matmul/example08_sliced_cupy.py new file mode 100644 index 0000000..6774fcd --- /dev/null +++ b/examples/linalg/generic/matmul/example08_sliced_cupy.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic matrix multiplication of CuPy arrays using the generic API, +using sliced operands as input. It should be noted that not all non-dense layouts are +supported. +""" + +import cupy as cp + +import nvmath + +# Prepare sample input data. +m, n, k = 4, 6, 8 +a = cp.random.rand(m, k)[::2] +b = cp.random.rand(k, n) + +# The execution happens on the GPU by default since the operands are on the GPU. + +# Perform the multiplication. +result = nvmath.linalg.matmul(a, b) + +print(cp.allclose(a @ b, result)) diff --git a/examples/sparse/advanced/direct_solver/example01_torch.py b/examples/sparse/advanced/direct_solver/example01_torch.py index 478c0d8..4a3abec 100644 --- a/examples/sparse/advanced/direct_solver/example01_torch.py +++ b/examples/sparse/advanced/direct_solver/example01_torch.py @@ -23,12 +23,8 @@ # Prepare sample input data. # Create a diagonally-dominant random CSR matrix. -a = torch.rand(n, n) + torch.diag(torch.tensor([10] * n)) +a = torch.rand(n, n, device=device_id) + torch.diag(torch.tensor([10] * n, device=device_id)) a = a.to_sparse_csr() -# Note that torch uses int64 for index buffers, whereas cuDSS currently requires int32. -a = torch.sparse_csr_tensor( - a.crow_indices().to(dtype=torch.int32), a.col_indices().to(dtype=torch.int32), a.values(), size=a.size(), device=device_id -) # Create the RHS, which can be a matrix or vector in column-major layout. b = torch.ones(2, n, device=device_id).T diff --git a/examples/sparse/advanced/direct_solver/example04_stateful_torch.py b/examples/sparse/advanced/direct_solver/example04_stateful_torch.py index 9276593..7da34e0 100644 --- a/examples/sparse/advanced/direct_solver/example04_stateful_torch.py +++ b/examples/sparse/advanced/direct_solver/example04_stateful_torch.py @@ -20,12 +20,8 @@ # Prepare sample input data. # Create a diagonally dominant random CSR matrix. -a = torch.rand(n, n) + torch.diag(torch.tensor([10] * n)) +a = torch.rand(n, n, device=device_id) + torch.diag(torch.tensor([10] * n, device=device_id)) a = a.to_sparse_csr() -# Note that torch uses int64 for index buffers, whereas cuDSS currently requires int32. -a = torch.sparse_csr_tensor( - a.crow_indices().to(dtype=torch.int32), a.col_indices().to(dtype=torch.int32), a.values(), size=a.size(), device=device_id -) # Create the RHS, which can be a matrix or vector in column-major layout. 
b = torch.ones(2, n, device=device_id).T diff --git a/examples/sparse/advanced/direct_solver/example04_stateful_torch_cpu.py b/examples/sparse/advanced/direct_solver/example04_stateful_torch_cpu.py index 8570800..3ad07e6 100644 --- a/examples/sparse/advanced/direct_solver/example04_stateful_torch_cpu.py +++ b/examples/sparse/advanced/direct_solver/example04_stateful_torch_cpu.py @@ -21,10 +21,6 @@ # Create a diagonally dominant random CSR matrix. a = torch.rand(n, n) + torch.diag(torch.tensor([10] * n)) a = a.to_sparse_csr() -# Note that torch uses int64 for index buffers, whereas cuDSS currently requires int32. -a = torch.sparse_csr_tensor( - a.crow_indices().to(dtype=torch.int32), a.col_indices().to(dtype=torch.int32), a.values(), size=a.size() -) # Create the RHS, which can be a matrix or vector in column-major layout. b = torch.ones(2, n).T diff --git a/examples/tensor/contraction/example01_cupy_binary.py b/examples/tensor/contraction/example01_cupy_binary.py new file mode 100644 index 0000000..472619d --- /dev/null +++ b/examples/tensor/contraction/example01_cupy_binary.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic binary tensor contraction using CuPy arrays. + +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. +""" + +import cupy as cp + +import nvmath + +a = cp.random.rand(4, 4, 12, 12) +b = cp.random.rand(12, 12, 8, 8) + +# result[i,j,m,n] = \sum_{k,l} a[i,j,k,l] * b[k,l,m,n] +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b) + +assert cp.allclose(result, cp.einsum("ijkl,klmn->ijmn", a, b)) + +print(f"Input type = {type(a), type(b)}, contraction result type = {type(result)}") + +# Optionally, users may scale the contraction result with a scale factor alpha, +# and/or add an additional operand c to the contraction result with a scale factor beta + +alpha, beta = 1.3, 0.7 +c = cp.random.rand(4, 4, 8, 8) + +# result[i,j,m,n] = \sum_{k,l} alpha * a[i,j,k,l] * b[k,l,m,n] + beta * c[i,j,m,n] +# when c is specified for binary contraction, beta must be set +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b, c=c, alpha=alpha, beta=beta) + +assert cp.allclose(result, alpha * cp.einsum("ijkl,klmn->ijmn", a, b) + beta * c) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example01_cupy_ternary.py b/examples/tensor/contraction/example01_cupy_ternary.py new file mode 100644 index 0000000..46a0a16 --- /dev/null +++ b/examples/tensor/contraction/example01_cupy_ternary.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic ternary tensor contraction using CuPy arrays. + +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. 
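+
+A ternary contraction contracts three operands in a single call; the code below computes
+result[i,j,p,q] by summing a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] over k, l, m, and n.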
+""" + +import cupy as cp + +import nvmath + +a = cp.random.rand(8, 8, 8, 8) +b = cp.random.rand(8, 8, 8, 8) +c = cp.random.rand(8, 8, 8, 8) + + +# result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +# when d is specified for ternary contraction, beta must be set +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c) + +assert cp.allclose(result, cp.einsum("ijkl,klmn,mnpq->ijpq", a, b, c)) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") + +# Optionally, users may scale the contraction result with a scale factor alpha, +# and/or add an additional operand d to the contraction result with a scale factor beta + +alpha, beta = 1.3, 0.7 +d = cp.random.rand(8, 8, 8, 8) + +# result[i,j,p,q] = \sum_{k,l,m,n} alpha * a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +# + beta * d[i,j,p,q] +# when d is specified for ternary contraction, beta must be set +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c, d=d, alpha=alpha, beta=beta) + +assert cp.allclose(result, alpha * cp.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) + beta * d) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example01_numpy_binary.py b/examples/tensor/contraction/example01_numpy_binary.py new file mode 100644 index 0000000..7597b8a --- /dev/null +++ b/examples/tensor/contraction/example01_numpy_binary.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic binary tensor contraction using NumPy arrays. + +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. +""" + +import numpy as np + +import nvmath + +a = np.random.rand(4, 4, 12, 12) +b = np.random.rand(12, 12, 8, 8) + +# result[i,j,m,n] = \sum_{k,l} a[i,j,k,l] * b[k,l,m,n] +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b) + +assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", a, b)) + +print(f"Input type = {type(a), type(b)}, contraction result type = {type(result)}") + +# Optionally, users may scale the contraction result with a scale factor alpha, +# and/or add an additional operand c to the contraction result with a scale factor beta + +alpha, beta = 1.3, 0.7 +c = np.random.rand(4, 4, 8, 8) + +# result[i,j,m,n] = \sum_{k,l} alpha * a[i,j,k,l] * b[k,l,m,n] + beta * c[i,j,m,n] +# when c is specified for binary contraction, beta must be set +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b, c=c, alpha=alpha, beta=beta) + +assert np.allclose(result, alpha * np.einsum("ijkl,klmn->ijmn", a, b) + beta * c) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example01_numpy_ternary.py b/examples/tensor/contraction/example01_numpy_ternary.py new file mode 100644 index 0000000..3dc0864 --- /dev/null +++ b/examples/tensor/contraction/example01_numpy_ternary.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic ternary tensor contraction using NumPy arrays. + +nvmath-python supports multiple frameworks. 
The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. +""" + +import numpy as np + +import nvmath + +a = np.random.rand(8, 8, 8, 8) +b = np.random.rand(8, 8, 8, 8) +c = np.random.rand(8, 8, 8, 8) + +# result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +# when d is specified for ternary contraction, beta must be set +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c) + +assert np.allclose(result, np.einsum("ijkl,klmn,mnpq->ijpq", a, b, c)) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") + +# Optionally, users may scale the contraction result with a scale factor alpha, +# and/or add an additional operand d to the contraction result with a scale factor beta + +alpha, beta = 1.3, 0.7 +d = np.random.rand(8, 8, 8, 8) + + +# result[i,j,p,q] = \sum_{k,l,m,n} alpha * a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +# + beta * d[i,j,p,q] +# when d is specified for ternary contraction, beta must be set +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c, d=d, alpha=alpha, beta=beta) + +assert np.allclose(result, alpha * np.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) + beta * d) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example01_torch_binary.py b/examples/tensor/contraction/example01_torch_binary.py new file mode 100644 index 0000000..d400519 --- /dev/null +++ b/examples/tensor/contraction/example01_torch_binary.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic binary tensor contraction using Torch tensors. + + +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. +""" + +import torch + +import nvmath + +a = torch.rand(4, 4, 12, 12, device="cuda") +b = torch.rand(12, 12, 8, 8, device="cuda") + +# result[i,j,m,n] = \sum_{k,l} a[i,j,k,l] * b[k,l,m,n] +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b) + +assert torch.allclose(result, torch.einsum("ijkl,klmn->ijmn", a, b)) + +print(f"Input type = {type(a), type(b)}, contraction result type = {type(result)}") + +# Optionally, users may scale the contraction result with a scale factor alpha, +# and/or add an additional operand c to the contraction result with a scale factor beta + +alpha, beta = 1.3, 0.7 +c = torch.rand(4, 4, 8, 8, device="cuda") + +# result[i,j,m,n] = \sum_{k,l} alpha * a[i,j,k,l] * b[k,l,m,n] + beta * c[i,j,m,n] +# when c is specified for binary contraction, beta must be set +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b, c=c, alpha=alpha, beta=beta) + +assert torch.allclose(result, alpha * torch.einsum("ijkl,klmn->ijmn", a, b) + beta * c) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example01_torch_ternary.py b/examples/tensor/contraction/example01_torch_ternary.py new file mode 100644 index 0000000..2742b16 --- /dev/null +++ b/examples/tensor/contraction/example01_torch_ternary.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic ternary tensor contraction using Torch tensors. + +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. +""" + +import torch + +import nvmath + +a = torch.rand(8, 8, 8, 8, device="cuda") +b = torch.rand(8, 8, 8, 8, device="cuda") +c = torch.rand(8, 8, 8, 8, device="cuda") + + +# result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c) + +assert torch.allclose(result, torch.einsum("ijkl,klmn,mnpq->ijpq", a, b, c)) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") + +# Optionally, users may scale the contraction result with a scale factor alpha, +# and/or add an additional operand d to the contraction result with a scale factor beta + +alpha, beta = 1.3, 0.7 +d = torch.rand(8, 8, 8, 8, device="cuda") + +# result[i,j,p,q] = \sum_{k,l,m,n} alpha * a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +# + beta * d[i,j,p,q] +# when d is specified for ternary contraction, beta must be set +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c, d=d, alpha=alpha, beta=beta) + +assert torch.allclose(result, alpha * torch.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) + beta * d) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example02_stateful_cupy_binary.py b/examples/tensor/contraction/example02_stateful_cupy_binary.py new file mode 100644 index 0000000..1a198d3 --- /dev/null +++ b/examples/tensor/contraction/example02_stateful_cupy_binary.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful binary tensor contraction objects. +Stateful objects amortize the cost of preparation across multiple executions. + +The inputs as well as the result are CuPy ndarrays. +""" + +import cupy as cp + +import nvmath + +a = cp.random.rand(4, 4, 12, 12) +b = cp.random.rand(12, 12, 8, 8) + +c = cp.random.rand(4, 4, 8, 8) + +alpha, beta = 0.3, 0.9 + +# result[i,j,m,n] = \sum_{k,l} alpha * a[i,j,k,l] * b[k,l,m,n] + beta * c[i,j,m,n] + +# Create a stateful BinaryContraction object 'contraction'. +with nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", a, b, c=c) as contraction: + # Plan the Contraction. + contraction.plan() + + # Execute the Contraction. + result = contraction.execute(alpha=alpha, beta=beta) + + # Synchronize the default stream + cp.cuda.get_current_stream().synchronize() + print(f"Input type = {type(a)}, device = {a.device}") + print(f"Contraction output type = {type(result)}, device = {result.device}") diff --git a/examples/tensor/contraction/example02_stateful_cupy_ternary.py b/examples/tensor/contraction/example02_stateful_cupy_ternary.py new file mode 100644 index 0000000..5c43cc5 --- /dev/null +++ b/examples/tensor/contraction/example02_stateful_cupy_ternary.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful ternary tensor contraction objects. +Stateful objects amortize the cost of preparation across multiple executions. 
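+Here a three-operand contraction, "ijk,jkl,ln->in", is planned once and executed, and the
+result is checked against cupy.einsum.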
+
+The inputs as well as the result are CuPy ndarrays.
+"""
+
+import cupy as cp
+
+import nvmath
+
+a = cp.random.rand(4, 6, 8)
+b = cp.random.rand(6, 8, 3)
+c = cp.random.rand(3, 9)
+
+
+# result[i,n] = \sum_{j,k,l} a[i,j,k] * b[j,k,l] * c[l,n]
+
+# Create a stateful TernaryContraction object 'contraction'.
+with nvmath.tensor.TernaryContraction("ijk,jkl,ln->in", a, b, c) as contraction:
+    # Plan the Contraction.
+    contraction.plan()
+
+    # Execute the Contraction.
+    result = contraction.execute()
+
+    # Synchronize the default stream
+    cp.cuda.get_current_stream().synchronize()
+    assert cp.allclose(result, cp.einsum("ijk,jkl,ln->in", a, b, c))
+    print(f"Input type = {type(a)}, device = {a.device}")
+    print(f"Contraction output type = {type(result)}, device = {result.device}")
diff --git a/examples/tensor/contraction/example02_stateful_numpy_binary.py b/examples/tensor/contraction/example02_stateful_numpy_binary.py
new file mode 100644
index 0000000..33f5e7d
--- /dev/null
+++ b/examples/tensor/contraction/example02_stateful_numpy_binary.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates the use of stateful binary tensor contraction objects.
+Stateful objects amortize the cost of preparation across multiple executions.
+
+The inputs as well as the result are NumPy ndarrays.
+"""
+
+import numpy as np
+
+import nvmath
+
+a = np.random.rand(4, 4, 12, 12)
+b = np.random.rand(12, 12, 8, 8)
+
+c = np.random.rand(4, 4, 8, 8)
+
+alpha, beta = 0.3, 0.9
+
+# result[i,j,m,n] = \sum_{k,l} alpha * a[i,j,k,l] * b[k,l,m,n] + beta * c[i,j,m,n]
+
+# Create a stateful BinaryContraction object 'contraction'.
+with nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", a, b, c=c) as contraction:
+    # Plan the Contraction.
+    contraction.plan()
+
+    # Execute the Contraction.
+    result = contraction.execute(alpha=alpha, beta=beta)
+
+    print(f"Input type = {type(a)}, device = 'cpu'")
+    print(f"Contraction output type = {type(result)}, device = 'cpu'")
diff --git a/examples/tensor/contraction/example02_stateful_numpy_ternary.py b/examples/tensor/contraction/example02_stateful_numpy_ternary.py
new file mode 100644
index 0000000..a584b49
--- /dev/null
+++ b/examples/tensor/contraction/example02_stateful_numpy_ternary.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates the use of stateful ternary tensor contraction objects.
+Stateful objects amortize the cost of preparation across multiple executions.
+
+The inputs as well as the result are NumPy ndarrays.
+"""
+
+import numpy as np
+
+import nvmath
+
+a = np.random.rand(4, 6, 8)
+b = np.random.rand(6, 8, 3)
+c = np.random.rand(3, 9)
+
+
+# result[i,n] = \sum_{j,k,l} a[i,j,k] * b[j,k,l] * c[l,n]
+
+# Create a stateful TernaryContraction object 'contraction'.
+with nvmath.tensor.TernaryContraction("ijk,jkl,ln->in", a, b, c) as contraction:
+    # Plan the Contraction.
+    contraction.plan()
+
+    # Execute the Contraction.
+ result = contraction.execute() + + assert np.allclose(result, np.einsum("ijk,jkl,ln->in", a, b, c)) + print(f"Input type = {type(a)}, device = 'cpu'") + print(f"Contraction output type = {type(result)}, device = 'cpu'") diff --git a/examples/tensor/contraction/example02_stateful_torch_binary.py b/examples/tensor/contraction/example02_stateful_torch_binary.py new file mode 100644 index 0000000..7ca5f48 --- /dev/null +++ b/examples/tensor/contraction/example02_stateful_torch_binary.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful binary tensor contraction objects. +Stateful objects amortize the cost of preparation across multiple executions. + +The inputs as well as the result are Torch tensors. +""" + +import torch + +import nvmath + +a = torch.rand(4, 4, 12, 12, device="cuda") +b = torch.rand(12, 12, 8, 8, device="cuda") + +c = torch.rand(4, 4, 8, 8, device="cuda") + +alpha, beta = 0.3, 0.9 + +# result[i,j,m,n] = \sum_{k,l} alpha * a[i,j,k,l] * b[k,l,m,n] + beta * c[i,j,m,n] + +# Create a stateful BinaryContraction object 'contraction'. +with nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", a, b, c=c) as contraction: + # Plan the Contraction. + contraction.plan() + + # Execute the Contraction. + result = contraction.execute(alpha=alpha, beta=beta) + + # Synchronize the default stream + torch.cuda.default_stream().synchronize() + print(f"Input type = {type(a)}, device = {a.device}") + print(f"Contraction output type = {type(result)}, device = {result.device}") diff --git a/examples/tensor/contraction/example02_stateful_torch_ternary.py b/examples/tensor/contraction/example02_stateful_torch_ternary.py new file mode 100644 index 0000000..6db86bd --- /dev/null +++ b/examples/tensor/contraction/example02_stateful_torch_ternary.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful ternary tensor contraction objects. +Stateful objects amortize the cost of preparation across multiple executions. + +The inputs as well as the result are Torch tensors. +""" + +import torch + +import nvmath + +a = torch.rand(4, 6, 8, device="cuda") +b = torch.rand(6, 8, 3, device="cuda") +c = torch.rand(3, 9, device="cuda") + + +# result[i,j,m,n] = \sum_{k,l} a[i,j,k] * b[j,k,l] * c[l,n] + +# Create a stateful TernaryContraction object 'contraction'. +with nvmath.tensor.TernaryContraction("ijk,jkl,ln->in", a, b, c) as contraction: + # Plan the Contraction. + contraction.plan() + + # Execute the Contraction. + result = contraction.execute() + + # Synchronize the default stream + torch.cuda.default_stream().synchronize() + assert torch.allclose(result, torch.einsum("ijk,jkl,ln->in", a, b, c)) + print(f"Input type = {type(a)}, device = {a.device}") + print(f"Contraction output type = {type(result)}, device = {result.device}") diff --git a/examples/tensor/contraction/example03_options.py b/examples/tensor/contraction/example03_options.py new file mode 100644 index 0000000..1adb8c1 --- /dev/null +++ b/examples/tensor/contraction/example03_options.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates how to specify options to a binary tensor contraction operation. 
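+Options control settings such as the compute type and the memory limit.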
+ +In this example, we will use NumPy ndarrays as input, and we will look at two equivalent +ways to specify the compute type. +""" + +import numpy as np + +import nvmath + +np.random.seed(0) +a = np.random.rand(8, 8, 12, 12) +b = np.random.rand(12, 12, 8, 8) + +c = np.random.rand(8, 8, 8, 8) + +# Alternative #1 for specifying options, using ContractionOptions class. +options = nvmath.tensor.ContractionOptions(compute_type=nvmath.tensor.ComputeDesc.COMPUTE_32F(), memory_limit="1GB") +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b, c=c, beta=1, options=options) + +assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", a, b) + c) + +# Alternative #2 for specifying options, using dict. The two alternatives are entirely +# equivalent. +options = {"compute_type": nvmath.tensor.ComputeDesc.COMPUTE_32F(), "memory_limit": "1GB"} +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b, c=c, beta=1, options=options) + +assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", a, b) + c) diff --git a/examples/tensor/contraction/example04_logging_global.py b/examples/tensor/contraction/example04_logging_global.py new file mode 100644 index 0000000..00905c0 --- /dev/null +++ b/examples/tensor/contraction/example04_logging_global.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of the global Python logger to observe the +computational details of a binary tensor contraction operation. +""" + +import logging + +import cupy as cp + +import nvmath + +# Turn on logging. Here we use the global logger, set the level to "debug", and use a custom +# format for the log. Any of the features provided by the logging module can be used. +logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + +a = cp.random.rand(4, 4, 12, 12) +b = cp.random.rand(12, 12, 8, 8) + +c = cp.random.rand(4, 4, 8, 8) + +# result[i,j,m,n] = \sum_{k,l} a[i,j,k,l] * b[k,l,m,n] + c[i,j,m,n] +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b, c=c, beta=1) + +assert cp.allclose(result, cp.einsum("ijkl,klmn->ijmn", a, b) + c) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example04_logging_user.py b/examples/tensor/contraction/example04_logging_user.py new file mode 100644 index 0000000..60da572 --- /dev/null +++ b/examples/tensor/contraction/example04_logging_user.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of a user-provided Python logger to observe the +computational details of a binary tensor contraction operation. +""" + +import logging + +import cupy as cp + +import nvmath + +# Create and configure a user logger. +# Any of the features provided by the logging module can be used. +logger = logging.getLogger("userlogger") +logging.getLogger().setLevel(logging.NOTSET) + +# Create a console handler for the logger and set level. +handler = logging.StreamHandler() +handler.setLevel(logging.DEBUG) + +# Create a formatter and associate with handler. 
+formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") +handler.setFormatter(formatter) + +# Associate handler with logger, resulting in a logger with the desired level, format, and +# console output. +logger.addHandler(handler) + + +a = cp.random.rand(4, 4, 12, 12) +b = cp.random.rand(12, 12, 8, 8) + +c = cp.random.rand(4, 4, 8, 8) + +# result[i,j,m,n] = \sum_{k,l} a[i,j,k,l] * b[k,l,m,n] + c[i,j,m,n] +result = nvmath.tensor.binary_contraction("ijkl,klmn->ijmn", a, b, c=c, beta=1, options={"logger": logger}) + +assert cp.allclose(result, cp.einsum("ijkl,klmn->ijmn", a, b) + c) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example05_stateful_with_scalars.py b/examples/tensor/contraction/example05_stateful_with_scalars.py new file mode 100644 index 0000000..2d9ee37 --- /dev/null +++ b/examples/tensor/contraction/example05_stateful_with_scalars.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateful ternary tensor contraction +objects with scalars. + +Stateful objects allow users to reuse the same contraction object for multiple executions +with different set of scalar parameters. The inputs as well as the result are CuPy ndarrays. +""" + +import cupy as cp + +import nvmath + +a = cp.random.rand(8, 8, 8, 8) +b = cp.random.rand(8, 8, 8, 8) +c = cp.random.rand(8, 8, 8, 8) + +# d may not be needed for the contraction depending on the specific contraction, +# but here we pass it to demonstrate how to reuse the same contraction object. +d = cp.random.rand(8, 8, 8, 8) + +with nvmath.tensor.TernaryContraction("ijkl,klmn,mnpq->ijpq", a, b, c, d=d) as contraction: + # Plan the contraction. + contraction.plan() + + # Execute the contraction. + # NOTE: when d is specified for ternary contraction, beta must be set + + # Case 1: alpha = 1.0 (default) and beta = 1.0 (must be set) + # result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] + d[i,j,p,q] + result = contraction.execute(beta=1.0) + assert cp.allclose(result, cp.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) + d) + + # Case 2: alpha = 2 and beta = 0 (equivalent to d=0) + # result[i,j,p,q] = \sum_{k,l,m,n} alpha * a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] + alpha = 2 + beta = 0 + result = contraction.execute(alpha=alpha, beta=beta) + # NOTE: If d is not provided during the initialization of the contraction object, + # beta does not need to be provided. 
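+    # With beta = 0, the d term drops out, so the result is just alpha times the plain
+    # contraction of a, b, and c.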
+ assert cp.allclose(result, cp.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) * alpha) + + # Case 3: alpha = 1.0 (default) and beta = 0.2 + # result[i,j,p,q] = + # \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] + beta * d[i,j,p,q] + beta = 0.2 + result = contraction.execute(beta=beta) + assert cp.allclose(result, cp.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) + d * beta) + + # Case 4: alpha = 1.4 and beta = 0.5 + # result[i,j,p,q] = + # \sum_{k,l,m,n} alpha * a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] + beta * d[i,j,p,q] + alpha = 1.4 + beta = 0.5 + result = contraction.execute(alpha=alpha, beta=beta) + assert cp.allclose(result, cp.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) * alpha + d * beta) diff --git a/examples/tensor/contraction/example05_stateless_with_scalars.py b/examples/tensor/contraction/example05_stateless_with_scalars.py new file mode 100644 index 0000000..3081339 --- /dev/null +++ b/examples/tensor/contraction/example05_stateless_with_scalars.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of stateless ternary tensor contraction +function-form APIs with scalars. + +Stateless function-form APIs allow users to perform a single contraction with +a given set of scalar parameters. The inputs as well as the result are NumPy ndarrays. +""" + +import numpy as np + +import nvmath + +a = np.random.rand(8, 8, 8, 8) +b = np.random.rand(8, 8, 8, 8) +c = np.random.rand(8, 8, 8, 8) +d = np.random.rand(8, 8, 8, 8) + + +# Case 1: alpha = 1.0 (default) and d = None (default and beta does not need to be set) +# result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c) +assert np.allclose(result, np.einsum("ijkl,klmn,mnpq->ijpq", a, b, c)) + +# Case 2: alpha = 2 and d = None (default and beta does not need to be set) +# result[i,j,p,q] = \sum_{k,l,m,n} alpha * a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +alpha = 2 +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c, alpha=alpha) +assert np.allclose(result, np.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) * alpha) + +# Case 3: alpha = 1.0 (default) and beta = 0.2 with a non-zero d +# result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] + beta * d[i,j,p,q] +beta = 0.2 +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c, d=d, beta=beta) +assert np.allclose(result, np.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) + d * beta) + + +# Case 4: alpha = 1.4 and beta = 0.5 with a non-zero d +# result[i,j,p,q] = \sum_{k,l,m,n} alpha * a[i,j,k,l] * b[k,l,m,n] * c[m,n,p,q] +alpha = 1.4 +beta = 0.5 +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c, d=d, alpha=alpha, beta=beta) +assert np.allclose(result, np.einsum("ijkl,klmn,mnpq->ijpq", a, b, c) * alpha + d * beta) diff --git a/examples/tensor/contraction/example06_stateful_inplace.py b/examples/tensor/contraction/example06_stateful_inplace.py new file mode 100644 index 0000000..49e8f79 --- /dev/null +++ b/examples/tensor/contraction/example06_stateful_inplace.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of inplace update of input operands in stateful tensor +contraction APIs. + +The inputs as well as the result are CuPy ndarrays. 
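+In-place updates let the planned contraction be re-executed on fresh values without
+calling reset_operands().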
+
+NOTE: The operands should be updated inplace only when they are in a memory space that is
+accessible from the execution space. In this case, the operands reside on the GPU while the
+execution also happens on the GPU.
+"""
+
+import cupy as cp
+
+import nvmath
+
+a = cp.random.rand(4, 4, 12, 12)
+b = cp.random.rand(12, 12, 8, 8)
+c = cp.random.rand(4, 4, 8, 8)
+
+A = cp.random.rand(4, 4, 12, 12)
+B = cp.random.rand(12, 12, 8, 8)
+C = cp.random.rand(4, 4, 8, 8)
+
+# Create a stateful BinaryContraction object 'contraction'.
+with nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", a, b, c=c) as contraction:
+    # Plan the Contraction.
+    contraction.plan()
+
+    # Execute the Contraction.
+    result = contraction.execute(beta=1)
+    assert cp.allclose(result, cp.einsum("ijkl,klmn->ijmn", a, b) + c)
+
+    # Update the input operands 'a', 'b', and 'c' in place with new values.
+    a[:] = A
+    b[:] = B
+    c[:] = C
+
+    # Re-execute the Contraction with the updated input operands.
+    result = contraction.execute(beta=1)
+    assert cp.allclose(result, cp.einsum("ijkl,klmn->ijmn", A, B) + C)
diff --git a/examples/tensor/contraction/example06_stateful_reset.py b/examples/tensor/contraction/example06_stateful_reset.py
new file mode 100644
index 0000000..9a83ab7
--- /dev/null
+++ b/examples/tensor/contraction/example06_stateful_reset.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates how to reset operands in stateful tensor contraction APIs, and
+reuse the object for multiple executions. This is needed when the memory space of the
+operands is not accessible from the execution space, or if it's desired to bind new
+(compatible) operands to the stateful object.
+
+The inputs as well as the result are NumPy ndarrays.
+"""
+
+import numpy as np
+
+import nvmath
+
+a = np.random.rand(4, 4, 12, 12)
+b = np.random.rand(12, 12, 8, 8)
+c = np.random.rand(4, 4, 8, 8)
+
+A = np.random.rand(4, 4, 12, 12)
+B = np.random.rand(12, 12, 8, 8)
+C = np.random.rand(4, 4, 8, 8)
+
+# Create a stateful BinaryContraction object 'contraction'.
+with nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", a, b, c=c) as contraction:
+    # Plan the Contraction.
+    contraction.plan()
+
+    # Execute the Contraction.
+    result = contraction.execute(beta=1)
+    assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", a, b) + c)
+
+    # Reset the input operands to new values.
+    contraction.reset_operands(a=A, b=B, c=C)
+
+    # Re-execute the Contraction with the updated input operands.
+    result = contraction.execute(beta=1)
+    assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", A, B) + C)
diff --git a/examples/tensor/contraction/example07_plan_preference.py b/examples/tensor/contraction/example07_plan_preference.py
new file mode 100644
index 0000000..3b1c747
--- /dev/null
+++ b/examples/tensor/contraction/example07_plan_preference.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This example illustrates the use of the plan preference object to configure the
+planning phase of a binary tensor contraction operation.
+
+Different contraction algorithms are profiled to demonstrate how the choice of algorithm
+can affect the performance of a tensor contraction operation.
+ +For a detailed explanation of the contraction algorithms supported by cuTensor, please +refer to the cuTensor documentation: +https://docs.nvidia.com/cuda/cutensor/latest/api/types.html#cutensoralgo-t +""" + +import cupy as cp +from cupyx.profiler import benchmark + +import nvmath + +a = cp.random.rand(64, 8, 8, 6, 6) +b = cp.random.rand(64, 8, 8, 6, 6) + +# Create a stateful BinaryContraction object 'contraction'. +with nvmath.tensor.BinaryContraction("pijkl,pjiab->lakbp", a, b) as contraction: + # Get the handle to the plan preference object + plan_preference = contraction.plan_preference + # update the kernel rank to the third best for the underlying algorithm + plan_preference.kernel_rank = 2 + + for algo in ( + nvmath.tensor.ContractionAlgo.DEFAULT_PATIENT, + nvmath.tensor.ContractionAlgo.GETT, + nvmath.tensor.ContractionAlgo.TGETT, + nvmath.tensor.ContractionAlgo.TTGT, + nvmath.tensor.ContractionAlgo.DEFAULT, + ): + print(f"Algorithm: {algo.name}") + plan_preference.algo = algo + # Plan the Contraction to activate the updated plan preference + contraction.plan() + print(benchmark(contraction.execute, n_repeat=20)) diff --git a/examples/tensor/contraction/example08_qualifiers.py b/examples/tensor/contraction/example08_qualifiers.py new file mode 100644 index 0000000..bc3addd --- /dev/null +++ b/examples/tensor/contraction/example08_qualifiers.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of qualifiers to specify the operators for the operands +in a ternary tensor contraction operation. As of cuTensor 2.3.1, only the conjugate operator +is supported in the contraction APIs when the operands are complex. + +The inputs as well as the result are CuPy ndarrays. +""" + +import cupy as cp +import numpy as np + +import nvmath + +a = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) +b = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) +c = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) +d = cp.random.rand(8, 8, 8, 8) + 1j * cp.random.rand(8, 8, 8, 8) + +# create an array of qualifiers (of length # of operands) with the default identity operator +qualifiers = np.full(4, nvmath.tensor.Operator.OP_IDENTITY, dtype=nvmath.tensor.tensor_qualifiers_dtype) +# set the qualifier for operand b to conjugate +qualifiers[1] = nvmath.tensor.Operator.OP_CONJ + +# result[i,j,p,q] = \sum_{k,l,m,n} a[i,j,k,l] * b[k,l,m,n].conj() * c[m,n,p,q] + d[i,j,p,q] +result = nvmath.tensor.ternary_contraction("ijkl,klmn,mnpq->ijpq", a, b, c, d=d, qualifiers=qualifiers, beta=1) +reference = cp.einsum("ijkl,klmn,mnpq->ijpq", a, b.conj(), c) + d +assert cp.allclose(result, reference) diff --git a/examples/tensor/contraction/example09_streams.py b/examples/tensor/contraction/example09_streams.py new file mode 100644 index 0000000..cc8a217 --- /dev/null +++ b/examples/tensor/contraction/example09_streams.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of multiple CUDA streams with the tensor contraction APIs. +""" + +import cupy as cp + +import nvmath + +a = cp.random.rand(4, 4, 12, 12) +b = cp.random.rand(12, 12, 8, 8) + +c = cp.random.rand(4, 4, 8, 8) + +# Create a CUDA stream to use for instantiating, planning, and first execution of a stateful +# BinaryContraction object 'contraction'. 
+s1 = cp.cuda.Stream()
+
+# Create a stateful BinaryContraction object 'contraction' on stream s1.
+with nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", a, b, c=c, options={"blocking": "auto"}, stream=s1) as contraction:
+    # Plan the BinaryContraction on stream s1.
+    contraction.plan(stream=s1)
+
+    # Execute the BinaryContraction on stream s1.
+    d = contraction.execute(beta=1, stream=s1)
+
+    assert cp.allclose(d, cp.einsum("ijkl,klmn->ijmn", a, b) + c)
+
+    # Record an event on s1 for use later.
+    e1 = s1.record()
+
+    # Create a new stream on which the new operands for the second execution will be
+    # filled.
+    s2 = cp.cuda.Stream()
+
+    # Fill the new operands a1, b1, and c1 on s2.
+    with s2:
+        a1 = cp.random.rand(*a.shape)
+        b1 = cp.random.rand(*b.shape)
+        c1 = cp.random.rand(*c.shape)
+
+    # In the following blocks, we will use stream s2 to perform subsequent operations. Note
+    # that it's our responsibility as a user to ensure proper ordering, and we want to order
+    # `reset_operands` after event e1 corresponding to the execute() call above.
+    s2.wait_event(e1)
+
+    # Alternatively, if we want to use stream s1 for subsequent operations (s2 only for
+    # operand creation), we need to order `reset_operands` after the event for
+    # cupy.random.rand on s2, e.g.: e2 = s2.record(); s1.wait_event(e2)
+
+    # Bind the new operands a1, b1, and c1 on stream s2.
+    contraction.reset_operands(a=a1, b=b1, c=c1, stream=s2)
+
+    # Execute the contraction with the new operands on stream s2.
+    d = contraction.execute(beta=1, stream=s2)
+
+    # Synchronize s2 at the end.
+    s2.synchronize()
+
+    assert cp.allclose(d, cp.einsum("ijkl,klmn->ijmn", a1, b1) + c1)
diff --git a/examples/tensor/contraction/example10_execution.py b/examples/tensor/contraction/example10_execution.py
new file mode 100644
index 0000000..3bb5af9
--- /dev/null
+++ b/examples/tensor/contraction/example10_execution.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+An example illustrating how to specify execution options, such as the execution device,
+for a binary tensor contraction operation.
+
+The input execution options can be provided in the following ways:
+- As a string ('cuda')
+- As an :class:`ExecutionCUDA` object
+- As a dictionary containing the parameters for the :class:`ExecutionCUDA` constructor
+
+The inputs as well as the result are NumPy ndarrays.
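+
+The example below also loops over all visible CUDA devices and selects each one via
+ExecutionCUDA(device_id=...).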
+""" + +import numpy as np + +import cuda.core.experimental as ccx + +from nvmath.tensor import ExecutionCUDA, binary_contraction + + +a = np.random.rand(4, 4, 12, 12) +b = np.random.rand(12, 12, 8, 8) + + +# By default, the execution is set to "cuda" with device_id = 0 +result = binary_contraction("ijkl,klmn->ijmn", a, b, execution="cuda") + +assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", a, b)) + +# Execution can also be provided as an ExecutionCUDA object +num_devices = ccx.system.num_devices + +for device_id in range(num_devices): + execution = ExecutionCUDA(device_id=device_id) + result = binary_contraction("ijkl,klmn->ijmn", a, b, execution=execution) + assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", a, b)) + +# Additionally, execution can be provided as a dictionary +# and the name key must be set to 'cuda' +execution = {"name": "cuda", "device_id": num_devices - 1} +result = binary_contraction("ijkl,klmn->ijmn", a, b, execution=execution) +assert np.allclose(result, np.einsum("ijkl,klmn->ijmn", a, b)) diff --git a/examples/tensor/contraction/example11_memory_allocator.py b/examples/tensor/contraction/example11_memory_allocator.py new file mode 100644 index 0000000..2394e34 --- /dev/null +++ b/examples/tensor/contraction/example11_memory_allocator.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +An example illustrating user-provided memory allocator. +""" + +import cupy as cp + +import nvmath +from nvmath.memory import BaseCUDAMemoryManager, MemoryPointer + + +class RawCUDAMemoryManager(BaseCUDAMemoryManager): + """ + A simple allocator using cudaMalloc and cudaFree, instead of CuPy's memory pool. + """ + + def __init__(self, device_id): + self.device_id = device_id + + def memalloc(self, size): + with cp.cuda.Device(self.device_id): + device_ptr = cp.cuda.runtime.malloc(size) + print(f"Allocated memory of size {size} bytes using {type(self).__name__}.") + + def create_finalizer(): + def finalizer(): + cp.cuda.runtime.free(device_ptr) + print(f"Free'd allocated memory using {type(self).__name__}.") + + return finalizer + + return MemoryPointer(device_ptr, size, finalizer=create_finalizer()) + + +a = cp.random.rand(4, 4, 12, 12) +b = cp.random.rand(12, 12, 8, 8) + +c = cp.random.rand(4, 4, 8, 8) + +# result[i,j,m,n] = \sum_{k,l} a[i,j,k,l] * b[k,l,m,n] + c[i,j,m,n] +result = nvmath.tensor.binary_contraction( + "ijkl,klmn->ijmn", a, b, c=c, beta=1, options={"allocator": RawCUDAMemoryManager(a.device.id)} +) + +assert cp.allclose(result, cp.einsum("ijkl,klmn->ijmn", a, b) + c) + +print(f"Input type = {type(a), type(b), type(c)}, contraction result type = {type(result)}") diff --git a/examples/tensor/contraction/example12_resource_mgmt.py b/examples/tensor/contraction/example12_resource_mgmt.py new file mode 100644 index 0000000..bdb7b97 --- /dev/null +++ b/examples/tensor/contraction/example12_resource_mgmt.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example shows how to manage memory resources used by stateful objects. This is useful +when the tensor contraction operation needs a lot of memory and calls to execution method +on a stateful object are interleaved with calls to other operations +(including another tensor contraction) also requiring a lot of memory. 
+ +In this example, two tensor contraction operations are performed in a loop in an +interleaved manner. We assume that the available device memory is large enough for only +one tensor contraction at a time. +""" + +import logging + +import cupy as cp + +import nvmath + +# Turn on logging and set the level to DEBUG to print memory management messages. +logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + +a = cp.random.rand(16, 16, 32, 32) +b = cp.random.rand(32, 32, 32, 32) + +A = cp.random.rand(16, 16, 32, 32) +B = cp.random.rand(32, 32, 32, 32) + + +# Create and prepare two BinaryContraction objects. +contraction1 = nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", a, b) +contraction1.plan() + +contraction2 = nvmath.tensor.BinaryContraction("ijkl,klmn->ijmn", A, B) +contraction2.plan() + +num_iter = 3 +# Use the BinaryContraction objects as context managers so that internal library resources +# are properly cleaned up. +with contraction1, contraction2: + for i in range(num_iter): + print(f"Iteration {i}") + # Perform the first contraction, and request that the workspace be released at the + # end of the operation so that there is enough memory for the second one. + r = contraction1.execute(release_workspace=True) + + assert cp.allclose(r, cp.einsum("ijkl,klmn->ijmn", a, b)) + + # Update contraction1's operands for the next iteration. + if i < num_iter - 1: + a[:] = cp.random.rand(*a.shape) + b[:] = cp.random.rand(*b.shape) + + # Perform the second contraction, and request that the workspace be released + # at the end of the operation so that there is enough memory for the first + # contraction in the next iteration. + r = contraction2.execute(release_workspace=True) + + assert cp.allclose(r, cp.einsum("ijkl,klmn->ijmn", A, B)) + + # Update contraction2's operands for the next iteration. + if i < num_iter - 1: + A[:] = cp.random.rand(*A.shape) + B[:] = cp.random.rand(*B.shape) + + # Synchronize the default stream + cp.cuda.get_current_stream().synchronize() diff --git a/examples/tensor/contraction/example13_stateful_output.py b/examples/tensor/contraction/example13_stateful_output.py new file mode 100644 index 0000000..f1b34b4 --- /dev/null +++ b/examples/tensor/contraction/example13_stateful_output.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates how to write to a pre-allocated output array with stateful +tensor contraction APIs. 
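+
+Four cases are covered below: writing to a separate pre-allocated buffer, accumulating in
+place into the addend operand c, writing to a slice of a larger array, and retargeting the
+output buffer via reset_operands().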
+""" + +import cupy as cp + +import nvmath + +a = cp.random.rand(4, 4, 4) +b = cp.random.rand(4, 4, 4) + +# Case I: writing to a pre-allocated output array +c = cp.empty((4, 4)) +with nvmath.tensor.BinaryContraction("ijk,jkl->il", a, b, out=c) as contraction: + contraction.plan() + contraction.execute() + assert cp.allclose(c, cp.einsum("ijk,jkl->il", a, b)) + +# Case II: with in-place update, where the output tensor is the same as the addend operand c +c = cp.random.rand(4, 4) +reference = cp.einsum("ijk,jkl->il", a, b) + c +with nvmath.tensor.BinaryContraction("ijk,jkl->il", a, b, c=c, out=c) as contraction: + contraction.plan() + contraction.execute(beta=1) + assert cp.allclose(c, reference) + +# Case III: writing to a slice of a pre-allocated output array with in-place update +full_matrix = cp.random.rand(8, 8) +matrix_slice = full_matrix[2:6, 2:6] + +reference = cp.einsum("ijk,jkl->il", a, b) + matrix_slice +with nvmath.tensor.BinaryContraction("ijk,jkl->il", a, b, c=matrix_slice, out=matrix_slice) as contraction: + contraction.plan() + contraction.execute(beta=1) + assert cp.allclose(matrix_slice, reference) + +# Case IV: resetting the target output operand to different buffer +# Note that the updated tensor must be compatible with the original tensor + +c = cp.random.rand(4, 4) + +out1 = cp.empty((4, 4)) +out2 = cp.empty((4, 4)) + +with nvmath.tensor.BinaryContraction("ijk,jkl->il", a, b, c=c, out=out1) as contraction: + contraction.plan() + + alpha = 1.4 + beta = 0.7 + contraction.execute(alpha=alpha, beta=beta) + assert cp.allclose(out1, alpha * cp.einsum("ijk,jkl->il", a, b) + beta * c) + + alpha = 0.5 + beta = 0.3 + contraction.reset_operands(a=a, b=b, c=c, out=out2) + contraction.execute(alpha=alpha, beta=beta) + assert cp.allclose(out2, alpha * cp.einsum("ijk,jkl->il", a, b) + beta * c) diff --git a/examples/tensor/contraction/example13_stateless_output.py b/examples/tensor/contraction/example13_stateless_output.py new file mode 100644 index 0000000..287adff --- /dev/null +++ b/examples/tensor/contraction/example13_stateless_output.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates how to write to a pre-allocated output array with stateless +tensor contraction APIs. 
+""" + +import cupy as cp + +import nvmath + +# If CPU arrays are provided, execute on nvpl tensor by default +# user also gets to use cutensor by providing `execution='cuda'` +# If GPU arrays are provided, execute on cutensor by default +# user also gets to use nvpl tensor by providing `execution='cpu'` + +a = cp.random.rand(4, 4, 4) +b = cp.random.rand(4, 4, 4) + +# Case I: writing to a pre-allocated output array +c = cp.empty((4, 4)) +out = nvmath.tensor.binary_contraction("ijk,jkl->il", a, b, out=c) +assert out is c and cp.allclose(out, cp.einsum("ijk,jkl->il", a, b)) + + +# Case II: writing to a pre-allocated output array with in-place update +c = cp.random.rand(4, 4) +reference = cp.einsum("ijk,jkl->il", a, b) + c +out = nvmath.tensor.binary_contraction("ijk,jkl->il", a, b, c=c, out=c, beta=1) + +assert out is c and cp.allclose(out, reference) + + +# Case III: writing to a slice of a pre-allocated output array with in-place update +full_matrix = cp.random.rand(8, 8) +matrix_slice = full_matrix[2:6, 2:6] + +reference = cp.einsum("ijk,jkl->il", a, b) + matrix_slice +out = nvmath.tensor.binary_contraction("ijk,jkl->il", a, b, c=matrix_slice, out=matrix_slice, beta=1) +assert out is matrix_slice and cp.allclose(out, reference) diff --git a/nvmath/__init__.py b/nvmath/__init__.py index 0d55203..9985b9f 100644 --- a/nvmath/__init__.py +++ b/nvmath/__init__.py @@ -18,12 +18,17 @@ def _force_lib_load(): from nvmath._utils import ComputeType # noqa: E402 from nvmath._utils import CudaDataType # noqa: E402 from nvmath._utils import LibraryPropertyType # noqa: E402 +from nvmath._utils import PLATFORM_LINUX # noqa: E402 -from nvmath import fft, linalg, sparse # noqa: E402 -from nvmath.memory import BaseCUDAMemoryManager, MemoryPointer # noqa: E402 +if PLATFORM_LINUX: + from nvmath import fft, linalg, sparse, tensor # noqa: E402, F401 +else: + from nvmath import fft, linalg, sparse # noqa: E402 +from nvmath.memory import BaseCUDAMemoryManager, BaseCUDAMemoryManagerAsync, MemoryPointer # noqa: E402 __all__ = [ "BaseCUDAMemoryManager", + "BaseCUDAMemoryManagerAsync", "bindings", "ComputeType", "CudaDataType", @@ -33,5 +38,7 @@ def _force_lib_load(): "MemoryPointer", "sparse", ] +if PLATFORM_LINUX: + __all__.append("tensor") __version__ = importlib.metadata.version("nvmath-python") diff --git a/nvmath/_internal/layout.py b/nvmath/_internal/layout.py index 2e5d6c1..786c914 100644 --- a/nvmath/_internal/layout.py +++ b/nvmath/_internal/layout.py @@ -27,6 +27,8 @@ def is_contiguous_and_dense(shape: Sequence[int], strides: Sequence[int]) -> boo Check if the provided (shape, strides) result in a contiguous memory layout with no extra stride in least strided dimension. """ + if not shape and not strides: + return True sorted_strides, sorted_shape = zip(*sorted(zip(strides, shape, strict=True)), strict=True) if len(sorted_strides) > 0 and sorted_strides[0] != 1: return False @@ -45,6 +47,8 @@ def is_overlapping_layout(shape: Sequence[int], strides: Sequence[int]) -> bool: The check should return False for contiguous or contiguous and sliced tensors. 
""" + if not shape and not strides: + return False sorted_strides, sorted_shape = zip(*sorted(zip(strides, shape, strict=True)), strict=True) cur_max_offset = 0 for s in range(1, len(sorted_strides)): diff --git a/nvmath/_internal/templates.py b/nvmath/_internal/templates.py new file mode 100644 index 0000000..ad3851f --- /dev/null +++ b/nvmath/_internal/templates.py @@ -0,0 +1,369 @@ +import abc +import contextlib +import dataclasses +import logging + +from logging import Logger +from typing import Literal, ClassVar, Final, TypeVar, Generic +from collections.abc import MutableSequence + +from nvmath.internal import utils +from nvmath import memory + + +@dataclasses.dataclass(frozen=True, slots=True, kw_only=True) +class ExecutionCUDA: + """ + A data class for providing GPU execution options. + + Attributes: + device_id: CUDA device ordinal (only used if the operand resides on the CPU). The + default value is 0. + + .. seealso:: + :class:`ExecutionCPU` + """ + + name: ClassVar[Literal["cuda"]] = "cuda" + device_id: int = 0 + + +@dataclasses.dataclass(frozen=True, slots=True, kw_only=True) +class ExecutionCPU: + """ + A data class for providing CPU execution options. + + Attributes: + num_threads: The number of CPU threads used to execute the operation. + If not specified, defaults to the number of CPU cores available to the + process. + + .. seealso:: + :class:`ExecutionCUDA` + """ + + name: ClassVar[Literal["cpu"]] = "cpu" + num_threads: int | None = None + + +def copy_operand_perhaps( + internal_operand: utils.TensorHolder | None, + operand: utils.TensorHolder, + stream_holder: utils.StreamHolder | None, + execution_device_id: int | Literal["cpu"], + operands_device_id: int | Literal["cpu"], +) -> tuple[utils.TensorHolder, utils.TensorHolder | None]: + """Private implementation of memory space management for tensor operands. + + The `copy_operand_perhaps` function facilitates transitions of tensor operands between + different memory spaces, ensuring compatibility with execution requirements. Its role is + to determine whether a tensor operand needs to be copied to accommodate differing + execution and operand memory spaces, while preserving the original operand for cases + requiring in-place operations. + + Args: + internal_operand: Represents an internal tensor for in-place + memory operations, or `None` if not applicable. + + operand: Tensor to possibly copied to the execution memory space. + + stream_holder: Manages the CUDA stream for device operations. + + execution_device_id: Specifies the target execution space. + + operands_device_id: Specifies the current operand memory space. + + Returns: + A tuple containing: + - The operand copied to the execution space, or the original operand if + no copy is necessary. + - The original operand, or `None` if no copy occurred. + + """ + if execution_device_id == operands_device_id: + return operand, None + else: + # Copy the `operand` to memory that matches the exec space + # and keep the original `operand` to handle `options.inplace=True` + if internal_operand is None: + exec_space_copy = operand.to(execution_device_id, stream_holder) + return exec_space_copy, operand + else: + # In-place copy to existing pointer + internal_operand.copy_(src=operand, stream_holder=stream_holder) + return internal_operand, operand + + +@dataclasses.dataclass(frozen=True, slots=True, kw_only=True) +class StatefulAPIOptions: + """A dataclass for providing options to a :class:`StatefulAPI` object. 
+
+    Attributes:
+        allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used
+            to draw device memory. If an allocator is not provided, a memory allocator from
+            the library package will be used (:func:`torch.cuda.caching_allocator_alloc` for
+            PyTorch operands, :func:`cupy.cuda.alloc` otherwise).
+
+        blocking: A flag specifying the behavior of the stream-ordered functions and
+            methods. When ``blocking`` is `True`, the stream-ordered methods do not return
+            until the operation is complete. When ``blocking`` is ``"auto"``, the methods
+            return immediately when the inputs are on the GPU. The stream-ordered methods
+            always block when the operands are on the CPU to ensure that the user doesn't
+            inadvertently use the result before it becomes available. The default is
+            ``"auto"``.
+
+        logger: Python Logger object. The root logger will be used if a
+            logger object is not provided.
+
+    .. seealso::
+        :class:`StatefulAPI`
+    """
+
+    allocator: memory.BaseCUDAMemoryManager | memory.BaseCUDAMemoryManagerAsync | None = None
+    blocking: Literal[True, "auto"] = "auto"
+    logger: Logger = dataclasses.field(default_factory=logging.getLogger)
+
+    def __post_init__(self):
+        if self.blocking not in (True, "auto"):
+            raise ValueError("The value specified for blocking must be either True or 'auto'.")
+
+        if self.allocator is not None and not isinstance(
+            self.allocator, memory.BaseCUDAMemoryManager | memory.BaseCUDAMemoryManagerAsync
+        ):
+            raise TypeError("The allocator must be an object of type that fulfills the BaseCUDAMemoryManager protocol.")
+
+
+OptionsPlaceholder = TypeVar("OptionsPlaceholder", bound=StatefulAPIOptions)
+
+
+class StatefulAPI(contextlib.AbstractContextManager, Generic[OptionsPlaceholder]):
+    """A base class for APIs which amortize setup costs across multiple executions.
+
+    StatefulAPIs separate planning (``plan()``) and setup (``__init__()``) actions from
+    execution (``_execute()``), so that plans may be reused with different operands. The
+    ``reset_operands()`` method allows changing the operands of the API without replanning
+    when the input and execution space do not match (in that case the user does not have a
+    reference to the execution space buffers). If the execution and input space match, we
+    expect the user to be able to update the operands by overwriting their buffers in-place.
+    """
+
+    # options is declared Final in __init__() because mypy issue #8982 is not fixed yet.
+ # options: Final[OptionsPlaceholder] + # See docstring for StatefulAPIOptions + _allocator: Final[memory.BaseCUDAMemoryManager | memory.BaseCUDAMemoryManagerAsync | None] + _blocking: Final[bool] + _logger: Final[logging.Logger] + + # Metadata related to execution space + execution: Final[ExecutionCPU | ExecutionCUDA] + """A class which describes the execution space parameters.""" + _internal_op_package: Final[str] + """The package of the operands in the execution space.""" + _operands: MutableSequence[utils.TensorHolder] + """A copy of the operands in execution space.""" + _result_class: Final[type[utils.TensorHolder]] + """The type of TensorHolder to use for the execution space result.""" + + # Metadata about the input/output tensors + _operands_backup: MutableSequence[utils.TensorHolder | None] + """A reference to original operands in their input space.""" + _operands_device_id: Final[int | Literal["cpu"]] + """The device_id of the input space.""" + _operands_package: Final[str] + """The package of the operands in the input space.""" + + _call_prologue: Final[str] + """Stores a message for logging about blocking behavior""" + + _has_plan: bool + """True if plan has been called.""" + + @property + def options(self) -> OptionsPlaceholder: + """The options object used to construct this class.""" + # This is a workaround for mypy issue #8982, where we cannot declare options as + # Final in the class definition, but we still want it to appear in the docs as an + # attribute. + return self._options + + def __init__( + self, + operands: MutableSequence[utils.TensorHolder], + *, + options: OptionsPlaceholder, + execution: ExecutionCPU | ExecutionCUDA | None | Literal["cuda", "cpu"] = None, + stream: utils.AnyStream | int | None = None, + ) -> None: + """Copy operands to the execution space and setup options. + + When inheriting from this class, you must create valid operands and options in + the child class before calling StatefulAPI.__init__( ... ). + """ + self._options: Final[OptionsPlaceholder] = options + self._logger = self._options.logger + + self._logger.info("= SPECIFICATION PHASE =") + + operands_device_id = utils.get_operands_device_id(operands) + + match execution, operands_device_id: + case (None | "cuda", int()): + execution = ExecutionCUDA(device_id=operands_device_id) + case ("cuda", "cpu"): + execution = ExecutionCUDA() + case (None, "cpu") | ("cpu", _): + execution = ExecutionCPU() + case (ExecutionCUDA(), int()): + # If operands are on a CUDA device, use the same device for execution. + execution = dataclasses.replace(execution, device_id=operands_device_id) + case (ExecutionCPU(), _) | (ExecutionCUDA(), "cpu"): + pass + case _: + raise ValueError( + f"{self.__class__.__name__}.execution must be one of ExecutionCUDA, ExecutionCPU, None, 'cuda', or 'cpu'." + ) + assert isinstance(execution, (ExecutionCPU, ExecutionCUDA)) + self.execution = execution + + self._operands_device_id = operands_device_id + self._operands_package = utils.get_operands_package(operands) + self._internal_op_package = self._internal_operand_package(self._operands_package) + exec_stream_holder, operand_stream_holder = self._get_or_create_stream_maybe(stream) + + self._logger.info( + f"The input tensors are located on device {operands_device_id}, and the execution space " + f"is {self.execution.name}, with device {getattr(self.execution, 'device_id', 'cpu')}." 
+        )
+
+        self._logger.info(
+            f"The specified stream for the {self.__class__.__name__} constructor is "
+            f"{(exec_stream_holder or operand_stream_holder) and getattr(exec_stream_holder or operand_stream_holder, 'obj', None)}."  # noqa: E501
+        )
+
+        operands_backup: list[utils.TensorHolder | None] = [None] * len(operands)
+        for i in range(len(operands)):
+            # Copy the operand to the execution space's device if needed.
+            operands[i], operands_backup[i] = copy_operand_perhaps(
+                None,
+                operands[i],
+                operand_stream_holder,
+                getattr(self.execution, "device_id", "cpu"),
+                self._operands_device_id,
+            )
+        self._operands = operands
+        self._operands_backup = operands_backup
+
+        # The TensorHolder type used for the execution space result.
+        self._result_class = self._operands[0].__class__
+
+        # Set blocking or non-blocking behavior.
+        self._blocking = self._options.blocking != "auto" or self._operands_device_id == "cpu" or self.execution.name == "cpu"
+        if self._blocking:
+            call_prologue = "This call is blocking and will return only after the operation is complete."
+        else:
+            call_prologue = (
+                "This call is non-blocking and will return immediately after the operation is launched on the device."
+            )
+        self._call_prologue = call_prologue
+
+        # Set memory allocator.
+        allocator: memory.BaseCUDAMemoryManager | memory.BaseCUDAMemoryManagerAsync | None
+        match self.execution:
+            case ExecutionCUDA():
+                allocator = (
+                    memory._MEMORY_MANAGER[self._internal_op_package](self.execution.device_id, self._logger)
+                    if self._options.allocator is None
+                    else self._options.allocator
+                )
+            case ExecutionCPU() | _:
+                allocator = None  # currently, nvpl/fftw does not support custom workspace allocation
+        self._allocator = allocator
+
+        self._has_plan = False
+
+    def _internal_operand_package(self, package_name: str) -> str:
+        if self.execution.name == "cuda":
+            return package_name if package_name != "numpy" else "cuda"
+        else:
+            return package_name if package_name != "cupy" else "cupy_host"
+
+    def _get_or_create_stream_maybe(
+        self, stream: utils.AnyStream
+    ) -> tuple[utils.StreamHolder | None, utils.StreamHolder | None]:
+        """Return a 2-tuple of Stream | None: one for execution space, one for input space.
+
+        The first stream should be used for everything in the execution space: doing work,
+        allocating workspace, allocating input/output buffers.
+
+        The second stream should be used whenever data moves between the input/output space
+        and the execution space: copying data to/from the input/output tensors.
+
+        NOTE: If two streams are returned, they will be the same stream.
+        """
+        if self.execution.name == "cuda":
+            stream_holder = utils.get_or_create_stream(self.execution.device_id, stream, self._internal_op_package)
+            return stream_holder, stream_holder
+        elif isinstance(self._operands_device_id, int):
+            operand_device_stream = utils.get_or_create_stream(self._operands_device_id, stream, self._operands_package)
+            return None, operand_device_stream
+        else:
+            return None, None
+
+    # input checks
+
+    def _check_valid_operands(self, *args, **kwargs):
+        """
+        Check if the operands are available for the operation.
+        """
+        what = kwargs["what"]
+        if self._operands is None:
+            raise RuntimeError(
+                f"{what} cannot be performed if the operands have been set to None. Use reset_operands() to set the "
+                f"desired input before performing the {what.lower()}."
+            )
+
+    def _check_planned(self, *args, **kwargs):
+        what = kwargs["what"]
+        if not self._has_plan:
+            raise RuntimeError(f"{what} cannot be performed before plan() has been called.")
+
+    # execution
+
+    @abc.abstractmethod
+    def _execute(self):
+        """Perform the main functionality of this :class:`StatefulAPI` instance without
+        safety checks."""
+        msg = f"{self.__class__.__name__}._execute() is not implemented."
+        raise NotImplementedError(msg)
+
+    @abc.abstractmethod
+    def plan(self):
+        """Plan the main functionality of this :class:`StatefulAPI` instance."""
+        msg = f"{self.__class__.__name__}.plan() is not implemented."
+        raise NotImplementedError(msg)
+
+    @abc.abstractmethod
+    def reset_operands(self):
+        """Reset the operands held by this :class:`StatefulAPI` instance."""
+        msg = f"{self.__class__.__name__}.reset_operands() is not implemented."
+        raise NotImplementedError(msg)
+
+
+class HasWorkspaceMemory(contextlib.AbstractContextManager):
+    """A base class for APIs which need to allocate a working buffer in memory."""
+
+    def _allocate_workspace_memory_perhaps(self, stream_holder: utils.StreamHolder):
+        """
+        Allocate workspace memory using the specified allocator, if it hasn't already been
+        done.
+        """
+        raise NotImplementedError
+
+    def _release_workspace_memory_perhaps(self, release_workspace: bool):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def free(self):
+        """Free the resources of this :class:`HasWorkspaceMemory` instance."""
+        msg = f"{self.__class__.__name__}.free() is not implemented."
+        raise NotImplementedError(msg)
diff --git a/nvmath/_utils.py b/nvmath/_utils.py
index b174cf4..488e7bd 100644
--- a/nvmath/_utils.py
+++ b/nvmath/_utils.py
@@ -93,7 +93,13 @@ def module_init_force_cupy_lib_load():
     """
     from nvmath.bindings import _internal
 
-    for lib in ("cublas", "cufft", "curand", "cusolverDn", "cusparse"):
+    # cutensor Windows binding is not available for nvmath-python beta7.0.
+    libs = (
+        ("cublas", "cufft", "curand", "cusolverDn", "cusparse", "cutensor")
+        if PLATFORM_LINUX
+        else ("cublas", "cufft", "curand", "cusolverDn", "cusparse")
+    )
+    for lib in libs:
         try:
             mod = getattr(_internal, lib)
             mod._inspect_function_pointers()
diff --git a/nvmath/bindings/__init__.py b/nvmath/bindings/__init__.py
index 7086381..92e81e4 100644
--- a/nvmath/bindings/__init__.py
+++ b/nvmath/bindings/__init__.py
@@ -5,6 +5,7 @@
 # type: ignore
 
 from nvmath.bindings import cublas
+from nvmath.bindings import cublasLt
 from nvmath.bindings import cudss
 from nvmath.bindings import cufft
 from nvmath.bindings import curand
@@ -12,12 +13,6 @@
 from nvmath.bindings import cusolverDn
 from nvmath.bindings import cusparse
 
-try:
-    # nvpl is Linux-only.
-    from nvmath.bindings import nvpl
-except ImportError:
-    nvpl = None
-
 try:
     # cufftMp is Linux-only.
     from nvmath.bindings import cufftMp
@@ -30,8 +25,28 @@
 except ImportError:
     nvshmem = None
 
+try:
+    # NCCL is Linux-only.
+    from nvmath.bindings import nccl
+except ImportError:
+    nccl = None
+
+try:
+    # cublasMp is Linux-only.
+    from nvmath.bindings import cublasMp
+except ImportError:
+    cublasMp = None
+
+try:
+    # cutensor binding is Linux-only for nvmath-python beta7.0.
+ from nvmath.bindings import cutensor +except ImportError: + cutensor = None + __all__ = [ "cublas", + "cublasLt", + "cublasMp", "cudss", "cufft", "cufftMp", @@ -39,6 +54,8 @@ "cusolver", "cusolverDn", "cusparse", + "cutensor", + "nccl", "nvpl", "nvshmem", ] diff --git a/nvmath/bindings/_internal/cublasLt_linux.pyx b/nvmath/bindings/_internal/cublasLt_linux.pyx index 65c4a0a..91bf408 100644 --- a/nvmath/bindings/_internal/cublasLt_linux.pyx +++ b/nvmath/bindings/_internal/cublasLt_linux.pyx @@ -6,10 +6,13 @@ from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### @@ -28,13 +31,31 @@ cdef extern from "" nogil: const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_cublasLt_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __cublasLtCreate = NULL cdef void* __cublasLtDestroy = NULL @@ -89,317 +110,301 @@ cdef int _check_or_init_cublasLt() except -1 nogil: if __py_cublasLt_init: return 0 - # Load driver to check version cdef void* handle = NULL - handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) - if handle == NULL: - with gil: - err_msg = dlerror() - raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") - if __cuDriverGetVersion == NULL: - with gil: - raise RuntimeError('something went wrong') - cdef int err, driver_ver - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - with gil: - raise RuntimeError('something went wrong') - #dlclose(handle) - handle = NULL - - # Load function - global __cublasLtCreate - __cublasLtCreate = dlsym(RTLD_DEFAULT, 'cublasLtCreate') - if __cublasLtCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtCreate = dlsym(handle, 'cublasLtCreate') - - global __cublasLtDestroy - __cublasLtDestroy = dlsym(RTLD_DEFAULT, 'cublasLtDestroy') - if __cublasLtDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtDestroy = dlsym(handle, 'cublasLtDestroy') - - global __cublasLtGetVersion - __cublasLtGetVersion = dlsym(RTLD_DEFAULT, 'cublasLtGetVersion') - if __cublasLtGetVersion == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtGetVersion = dlsym(handle, 'cublasLtGetVersion') - - global __cublasLtGetCudartVersion - __cublasLtGetCudartVersion = dlsym(RTLD_DEFAULT, 'cublasLtGetCudartVersion') - if 
__cublasLtGetCudartVersion == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtGetCudartVersion = dlsym(handle, 'cublasLtGetCudartVersion') - - global __cublasLtGetProperty - __cublasLtGetProperty = dlsym(RTLD_DEFAULT, 'cublasLtGetProperty') - if __cublasLtGetProperty == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtGetProperty = dlsym(handle, 'cublasLtGetProperty') - - global __cublasLtMatmul - __cublasLtMatmul = dlsym(RTLD_DEFAULT, 'cublasLtMatmul') - if __cublasLtMatmul == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmul = dlsym(handle, 'cublasLtMatmul') - - global __cublasLtMatrixTransform - __cublasLtMatrixTransform = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransform') - if __cublasLtMatrixTransform == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixTransform = dlsym(handle, 'cublasLtMatrixTransform') - - global __cublasLtMatrixLayoutCreate - __cublasLtMatrixLayoutCreate = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutCreate') - if __cublasLtMatrixLayoutCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixLayoutCreate = dlsym(handle, 'cublasLtMatrixLayoutCreate') - - global __cublasLtMatrixLayoutDestroy - __cublasLtMatrixLayoutDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutDestroy') - if __cublasLtMatrixLayoutDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixLayoutDestroy = dlsym(handle, 'cublasLtMatrixLayoutDestroy') - - global __cublasLtMatrixLayoutSetAttribute - __cublasLtMatrixLayoutSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutSetAttribute') - if __cublasLtMatrixLayoutSetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixLayoutSetAttribute = dlsym(handle, 'cublasLtMatrixLayoutSetAttribute') - - global __cublasLtMatrixLayoutGetAttribute - __cublasLtMatrixLayoutGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutGetAttribute') - if __cublasLtMatrixLayoutGetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixLayoutGetAttribute = dlsym(handle, 'cublasLtMatrixLayoutGetAttribute') - - global __cublasLtMatmulDescCreate - __cublasLtMatmulDescCreate = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescCreate') - if __cublasLtMatmulDescCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulDescCreate = dlsym(handle, 'cublasLtMatmulDescCreate') - - global __cublasLtMatmulDescDestroy - __cublasLtMatmulDescDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescDestroy') - if __cublasLtMatmulDescDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulDescDestroy = dlsym(handle, 'cublasLtMatmulDescDestroy') - - global __cublasLtMatmulDescSetAttribute - __cublasLtMatmulDescSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescSetAttribute') - if __cublasLtMatmulDescSetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulDescSetAttribute = dlsym(handle, 'cublasLtMatmulDescSetAttribute') - - global __cublasLtMatmulDescGetAttribute - __cublasLtMatmulDescGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescGetAttribute') - if __cublasLtMatmulDescGetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulDescGetAttribute = dlsym(handle, 'cublasLtMatmulDescGetAttribute') - - global __cublasLtMatrixTransformDescCreate - __cublasLtMatrixTransformDescCreate = dlsym(RTLD_DEFAULT, 
'cublasLtMatrixTransformDescCreate') - if __cublasLtMatrixTransformDescCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixTransformDescCreate = dlsym(handle, 'cublasLtMatrixTransformDescCreate') - - global __cublasLtMatrixTransformDescDestroy - __cublasLtMatrixTransformDescDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransformDescDestroy') - if __cublasLtMatrixTransformDescDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixTransformDescDestroy = dlsym(handle, 'cublasLtMatrixTransformDescDestroy') - - global __cublasLtMatrixTransformDescSetAttribute - __cublasLtMatrixTransformDescSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransformDescSetAttribute') - if __cublasLtMatrixTransformDescSetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixTransformDescSetAttribute = dlsym(handle, 'cublasLtMatrixTransformDescSetAttribute') - global __cublasLtMatrixTransformDescGetAttribute - __cublasLtMatrixTransformDescGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransformDescGetAttribute') - if __cublasLtMatrixTransformDescGetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatrixTransformDescGetAttribute = dlsym(handle, 'cublasLtMatrixTransformDescGetAttribute') - - global __cublasLtMatmulPreferenceCreate - __cublasLtMatmulPreferenceCreate = dlsym(RTLD_DEFAULT, 'cublasLtMatmulPreferenceCreate') - if __cublasLtMatmulPreferenceCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulPreferenceCreate = dlsym(handle, 'cublasLtMatmulPreferenceCreate') - - global __cublasLtMatmulPreferenceDestroy - __cublasLtMatmulPreferenceDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatmulPreferenceDestroy') - if __cublasLtMatmulPreferenceDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulPreferenceDestroy = dlsym(handle, 'cublasLtMatmulPreferenceDestroy') - - global __cublasLtMatmulPreferenceSetAttribute - __cublasLtMatmulPreferenceSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulPreferenceSetAttribute') - if __cublasLtMatmulPreferenceSetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulPreferenceSetAttribute = dlsym(handle, 'cublasLtMatmulPreferenceSetAttribute') - - global __cublasLtMatmulPreferenceGetAttribute - __cublasLtMatmulPreferenceGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulPreferenceGetAttribute') - if __cublasLtMatmulPreferenceGetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulPreferenceGetAttribute = dlsym(handle, 'cublasLtMatmulPreferenceGetAttribute') - - global __cublasLtMatmulAlgoGetHeuristic - __cublasLtMatmulAlgoGetHeuristic = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoGetHeuristic') - if __cublasLtMatmulAlgoGetHeuristic == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulAlgoGetHeuristic = dlsym(handle, 'cublasLtMatmulAlgoGetHeuristic') - - global __cublasLtMatmulAlgoGetIds - __cublasLtMatmulAlgoGetIds = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoGetIds') - if __cublasLtMatmulAlgoGetIds == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulAlgoGetIds = dlsym(handle, 'cublasLtMatmulAlgoGetIds') - - global __cublasLtMatmulAlgoInit - __cublasLtMatmulAlgoInit = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoInit') - if __cublasLtMatmulAlgoInit == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cublasLtMatmulAlgoInit = dlsym(handle, 'cublasLtMatmulAlgoInit') - - global __cublasLtMatmulAlgoCheck - __cublasLtMatmulAlgoCheck = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoCheck') - if __cublasLtMatmulAlgoCheck == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulAlgoCheck = dlsym(handle, 'cublasLtMatmulAlgoCheck') - - global __cublasLtMatmulAlgoCapGetAttribute - __cublasLtMatmulAlgoCapGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoCapGetAttribute') - if __cublasLtMatmulAlgoCapGetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulAlgoCapGetAttribute = dlsym(handle, 'cublasLtMatmulAlgoCapGetAttribute') - - global __cublasLtMatmulAlgoConfigSetAttribute - __cublasLtMatmulAlgoConfigSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoConfigSetAttribute') - if __cublasLtMatmulAlgoConfigSetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulAlgoConfigSetAttribute = dlsym(handle, 'cublasLtMatmulAlgoConfigSetAttribute') - - global __cublasLtMatmulAlgoConfigGetAttribute - __cublasLtMatmulAlgoConfigGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoConfigGetAttribute') - if __cublasLtMatmulAlgoConfigGetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtMatmulAlgoConfigGetAttribute = dlsym(handle, 'cublasLtMatmulAlgoConfigGetAttribute') - - global __cublasLtLoggerSetCallback - __cublasLtLoggerSetCallback = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetCallback') - if __cublasLtLoggerSetCallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtLoggerSetCallback = dlsym(handle, 'cublasLtLoggerSetCallback') - - global __cublasLtLoggerSetFile - __cublasLtLoggerSetFile = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetFile') - if __cublasLtLoggerSetFile == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtLoggerSetFile = dlsym(handle, 'cublasLtLoggerSetFile') - - global __cublasLtLoggerOpenFile - __cublasLtLoggerOpenFile = dlsym(RTLD_DEFAULT, 'cublasLtLoggerOpenFile') - if __cublasLtLoggerOpenFile == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtLoggerOpenFile = dlsym(handle, 'cublasLtLoggerOpenFile') - - global __cublasLtLoggerSetLevel - __cublasLtLoggerSetLevel = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetLevel') - if __cublasLtLoggerSetLevel == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtLoggerSetLevel = dlsym(handle, 'cublasLtLoggerSetLevel') - - global __cublasLtLoggerSetMask - __cublasLtLoggerSetMask = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetMask') - if __cublasLtLoggerSetMask == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtLoggerSetMask = dlsym(handle, 'cublasLtLoggerSetMask') - - global __cublasLtLoggerForceDisable - __cublasLtLoggerForceDisable = dlsym(RTLD_DEFAULT, 'cublasLtLoggerForceDisable') - if __cublasLtLoggerForceDisable == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtLoggerForceDisable = dlsym(handle, 'cublasLtLoggerForceDisable') - - global __cublasLtGetStatusName - __cublasLtGetStatusName = dlsym(RTLD_DEFAULT, 'cublasLtGetStatusName') - if __cublasLtGetStatusName == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtGetStatusName = dlsym(handle, 'cublasLtGetStatusName') - - global __cublasLtGetStatusString - __cublasLtGetStatusString = dlsym(RTLD_DEFAULT, 'cublasLtGetStatusString') - if __cublasLtGetStatusString == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cublasLtGetStatusString = dlsym(handle, 'cublasLtGetStatusString') - - global __cublasLtHeuristicsCacheGetCapacity - __cublasLtHeuristicsCacheGetCapacity = dlsym(RTLD_DEFAULT, 'cublasLtHeuristicsCacheGetCapacity') - if __cublasLtHeuristicsCacheGetCapacity == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtHeuristicsCacheGetCapacity = dlsym(handle, 'cublasLtHeuristicsCacheGetCapacity') - - global __cublasLtHeuristicsCacheSetCapacity - __cublasLtHeuristicsCacheSetCapacity = dlsym(RTLD_DEFAULT, 'cublasLtHeuristicsCacheSetCapacity') - if __cublasLtHeuristicsCacheSetCapacity == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtHeuristicsCacheSetCapacity = dlsym(handle, 'cublasLtHeuristicsCacheSetCapacity') - - global __cublasLtDisableCpuInstructionsSetMask - __cublasLtDisableCpuInstructionsSetMask = dlsym(RTLD_DEFAULT, 'cublasLtDisableCpuInstructionsSetMask') - if __cublasLtDisableCpuInstructionsSetMask == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLtDisableCpuInstructionsSetMask = dlsym(handle, 'cublasLtDisableCpuInstructionsSetMask') - - __py_cublasLt_init = True - return 0 + with gil, __symbol_lock: + driver_ver = get_cuda_version() + + # Load function + global __cublasLtCreate + __cublasLtCreate = dlsym(RTLD_DEFAULT, 'cublasLtCreate') + if __cublasLtCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtCreate = dlsym(handle, 'cublasLtCreate') + + global __cublasLtDestroy + __cublasLtDestroy = dlsym(RTLD_DEFAULT, 'cublasLtDestroy') + if __cublasLtDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtDestroy = dlsym(handle, 'cublasLtDestroy') + + global __cublasLtGetVersion + __cublasLtGetVersion = dlsym(RTLD_DEFAULT, 'cublasLtGetVersion') + if __cublasLtGetVersion == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtGetVersion = dlsym(handle, 'cublasLtGetVersion') + + global __cublasLtGetCudartVersion + __cublasLtGetCudartVersion = dlsym(RTLD_DEFAULT, 'cublasLtGetCudartVersion') + if __cublasLtGetCudartVersion == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtGetCudartVersion = dlsym(handle, 'cublasLtGetCudartVersion') + + global __cublasLtGetProperty + __cublasLtGetProperty = dlsym(RTLD_DEFAULT, 'cublasLtGetProperty') + if __cublasLtGetProperty == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtGetProperty = dlsym(handle, 'cublasLtGetProperty') + + global __cublasLtMatmul + __cublasLtMatmul = dlsym(RTLD_DEFAULT, 'cublasLtMatmul') + if __cublasLtMatmul == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmul = dlsym(handle, 'cublasLtMatmul') + + global __cublasLtMatrixTransform + __cublasLtMatrixTransform = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransform') + if __cublasLtMatrixTransform == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixTransform = dlsym(handle, 'cublasLtMatrixTransform') + + global __cublasLtMatrixLayoutCreate + __cublasLtMatrixLayoutCreate = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutCreate') + if __cublasLtMatrixLayoutCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixLayoutCreate = dlsym(handle, 'cublasLtMatrixLayoutCreate') + + global __cublasLtMatrixLayoutDestroy + __cublasLtMatrixLayoutDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutDestroy') + if __cublasLtMatrixLayoutDestroy == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cublasLtMatrixLayoutDestroy = dlsym(handle, 'cublasLtMatrixLayoutDestroy') + + global __cublasLtMatrixLayoutSetAttribute + __cublasLtMatrixLayoutSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutSetAttribute') + if __cublasLtMatrixLayoutSetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixLayoutSetAttribute = dlsym(handle, 'cublasLtMatrixLayoutSetAttribute') + + global __cublasLtMatrixLayoutGetAttribute + __cublasLtMatrixLayoutGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixLayoutGetAttribute') + if __cublasLtMatrixLayoutGetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixLayoutGetAttribute = dlsym(handle, 'cublasLtMatrixLayoutGetAttribute') + + global __cublasLtMatmulDescCreate + __cublasLtMatmulDescCreate = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescCreate') + if __cublasLtMatmulDescCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulDescCreate = dlsym(handle, 'cublasLtMatmulDescCreate') + + global __cublasLtMatmulDescDestroy + __cublasLtMatmulDescDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescDestroy') + if __cublasLtMatmulDescDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulDescDestroy = dlsym(handle, 'cublasLtMatmulDescDestroy') + + global __cublasLtMatmulDescSetAttribute + __cublasLtMatmulDescSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescSetAttribute') + if __cublasLtMatmulDescSetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulDescSetAttribute = dlsym(handle, 'cublasLtMatmulDescSetAttribute') + + global __cublasLtMatmulDescGetAttribute + __cublasLtMatmulDescGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulDescGetAttribute') + if __cublasLtMatmulDescGetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulDescGetAttribute = dlsym(handle, 'cublasLtMatmulDescGetAttribute') + + global __cublasLtMatrixTransformDescCreate + __cublasLtMatrixTransformDescCreate = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransformDescCreate') + if __cublasLtMatrixTransformDescCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixTransformDescCreate = dlsym(handle, 'cublasLtMatrixTransformDescCreate') + + global __cublasLtMatrixTransformDescDestroy + __cublasLtMatrixTransformDescDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransformDescDestroy') + if __cublasLtMatrixTransformDescDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixTransformDescDestroy = dlsym(handle, 'cublasLtMatrixTransformDescDestroy') + + global __cublasLtMatrixTransformDescSetAttribute + __cublasLtMatrixTransformDescSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransformDescSetAttribute') + if __cublasLtMatrixTransformDescSetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixTransformDescSetAttribute = dlsym(handle, 'cublasLtMatrixTransformDescSetAttribute') + + global __cublasLtMatrixTransformDescGetAttribute + __cublasLtMatrixTransformDescGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatrixTransformDescGetAttribute') + if __cublasLtMatrixTransformDescGetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatrixTransformDescGetAttribute = dlsym(handle, 'cublasLtMatrixTransformDescGetAttribute') + + global __cublasLtMatmulPreferenceCreate + __cublasLtMatmulPreferenceCreate = dlsym(RTLD_DEFAULT, 
'cublasLtMatmulPreferenceCreate') + if __cublasLtMatmulPreferenceCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulPreferenceCreate = dlsym(handle, 'cublasLtMatmulPreferenceCreate') + + global __cublasLtMatmulPreferenceDestroy + __cublasLtMatmulPreferenceDestroy = dlsym(RTLD_DEFAULT, 'cublasLtMatmulPreferenceDestroy') + if __cublasLtMatmulPreferenceDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulPreferenceDestroy = dlsym(handle, 'cublasLtMatmulPreferenceDestroy') + + global __cublasLtMatmulPreferenceSetAttribute + __cublasLtMatmulPreferenceSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulPreferenceSetAttribute') + if __cublasLtMatmulPreferenceSetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulPreferenceSetAttribute = dlsym(handle, 'cublasLtMatmulPreferenceSetAttribute') + + global __cublasLtMatmulPreferenceGetAttribute + __cublasLtMatmulPreferenceGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulPreferenceGetAttribute') + if __cublasLtMatmulPreferenceGetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulPreferenceGetAttribute = dlsym(handle, 'cublasLtMatmulPreferenceGetAttribute') + + global __cublasLtMatmulAlgoGetHeuristic + __cublasLtMatmulAlgoGetHeuristic = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoGetHeuristic') + if __cublasLtMatmulAlgoGetHeuristic == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulAlgoGetHeuristic = dlsym(handle, 'cublasLtMatmulAlgoGetHeuristic') + + global __cublasLtMatmulAlgoGetIds + __cublasLtMatmulAlgoGetIds = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoGetIds') + if __cublasLtMatmulAlgoGetIds == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulAlgoGetIds = dlsym(handle, 'cublasLtMatmulAlgoGetIds') + + global __cublasLtMatmulAlgoInit + __cublasLtMatmulAlgoInit = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoInit') + if __cublasLtMatmulAlgoInit == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulAlgoInit = dlsym(handle, 'cublasLtMatmulAlgoInit') + + global __cublasLtMatmulAlgoCheck + __cublasLtMatmulAlgoCheck = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoCheck') + if __cublasLtMatmulAlgoCheck == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulAlgoCheck = dlsym(handle, 'cublasLtMatmulAlgoCheck') + + global __cublasLtMatmulAlgoCapGetAttribute + __cublasLtMatmulAlgoCapGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoCapGetAttribute') + if __cublasLtMatmulAlgoCapGetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulAlgoCapGetAttribute = dlsym(handle, 'cublasLtMatmulAlgoCapGetAttribute') + + global __cublasLtMatmulAlgoConfigSetAttribute + __cublasLtMatmulAlgoConfigSetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoConfigSetAttribute') + if __cublasLtMatmulAlgoConfigSetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulAlgoConfigSetAttribute = dlsym(handle, 'cublasLtMatmulAlgoConfigSetAttribute') + + global __cublasLtMatmulAlgoConfigGetAttribute + __cublasLtMatmulAlgoConfigGetAttribute = dlsym(RTLD_DEFAULT, 'cublasLtMatmulAlgoConfigGetAttribute') + if __cublasLtMatmulAlgoConfigGetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtMatmulAlgoConfigGetAttribute = dlsym(handle, 'cublasLtMatmulAlgoConfigGetAttribute') + + global __cublasLtLoggerSetCallback + 
__cublasLtLoggerSetCallback = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetCallback') + if __cublasLtLoggerSetCallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtLoggerSetCallback = dlsym(handle, 'cublasLtLoggerSetCallback') + + global __cublasLtLoggerSetFile + __cublasLtLoggerSetFile = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetFile') + if __cublasLtLoggerSetFile == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtLoggerSetFile = dlsym(handle, 'cublasLtLoggerSetFile') + + global __cublasLtLoggerOpenFile + __cublasLtLoggerOpenFile = dlsym(RTLD_DEFAULT, 'cublasLtLoggerOpenFile') + if __cublasLtLoggerOpenFile == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtLoggerOpenFile = dlsym(handle, 'cublasLtLoggerOpenFile') + + global __cublasLtLoggerSetLevel + __cublasLtLoggerSetLevel = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetLevel') + if __cublasLtLoggerSetLevel == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtLoggerSetLevel = dlsym(handle, 'cublasLtLoggerSetLevel') + + global __cublasLtLoggerSetMask + __cublasLtLoggerSetMask = dlsym(RTLD_DEFAULT, 'cublasLtLoggerSetMask') + if __cublasLtLoggerSetMask == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtLoggerSetMask = dlsym(handle, 'cublasLtLoggerSetMask') + + global __cublasLtLoggerForceDisable + __cublasLtLoggerForceDisable = dlsym(RTLD_DEFAULT, 'cublasLtLoggerForceDisable') + if __cublasLtLoggerForceDisable == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtLoggerForceDisable = dlsym(handle, 'cublasLtLoggerForceDisable') + + global __cublasLtGetStatusName + __cublasLtGetStatusName = dlsym(RTLD_DEFAULT, 'cublasLtGetStatusName') + if __cublasLtGetStatusName == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtGetStatusName = dlsym(handle, 'cublasLtGetStatusName') + + global __cublasLtGetStatusString + __cublasLtGetStatusString = dlsym(RTLD_DEFAULT, 'cublasLtGetStatusString') + if __cublasLtGetStatusString == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtGetStatusString = dlsym(handle, 'cublasLtGetStatusString') + + global __cublasLtHeuristicsCacheGetCapacity + __cublasLtHeuristicsCacheGetCapacity = dlsym(RTLD_DEFAULT, 'cublasLtHeuristicsCacheGetCapacity') + if __cublasLtHeuristicsCacheGetCapacity == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtHeuristicsCacheGetCapacity = dlsym(handle, 'cublasLtHeuristicsCacheGetCapacity') + + global __cublasLtHeuristicsCacheSetCapacity + __cublasLtHeuristicsCacheSetCapacity = dlsym(RTLD_DEFAULT, 'cublasLtHeuristicsCacheSetCapacity') + if __cublasLtHeuristicsCacheSetCapacity == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtHeuristicsCacheSetCapacity = dlsym(handle, 'cublasLtHeuristicsCacheSetCapacity') + + global __cublasLtDisableCpuInstructionsSetMask + __cublasLtDisableCpuInstructionsSetMask = dlsym(RTLD_DEFAULT, 'cublasLtDisableCpuInstructionsSetMask') + if __cublasLtDisableCpuInstructionsSetMask == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLtDisableCpuInstructionsSetMask = dlsym(handle, 'cublasLtDisableCpuInstructionsSetMask') + + __py_cublasLt_init = True + return 0 cdef dict func_ptrs = None diff --git a/nvmath/bindings/_internal/cublasLt_windows.pyx b/nvmath/bindings/_internal/cublasLt_windows.pyx index 52b992f..aabb58a 100644 --- a/nvmath/bindings/_internal/cublasLt_windows.pyx +++ 
b/nvmath/bindings/_internal/cublasLt_windows.pyx @@ -9,20 +9,77 @@ from libc.stdint cimport intptr_t, uintptr_t import os import site - -import win32api +import threading from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib +from libc.stddef cimport wchar_t +from libc.stdint cimport uintptr_t +from cpython cimport PyUnicode_AsWideCharString, PyMem_Free + +from .utils import NotSupportedError + +cdef extern from "windows.h" nogil: + ctypedef void* HMODULE + ctypedef void* HANDLE + ctypedef void* FARPROC + ctypedef unsigned long DWORD + ctypedef const wchar_t *LPCWSTR + ctypedef const char *LPCSTR + + cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 + cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 + + HMODULE _LoadLibraryExW "LoadLibraryExW"( + LPCWSTR lpLibFileName, + HANDLE hFile, + DWORD dwFlags + ) + + FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName) + +cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags): + cdef uintptr_t result + cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL) + with nogil: + result = _LoadLibraryExW( + wpath, + hFile, + dwFlags + ) + PyMem_Free(wpath) + return result + +cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil: + return _GetProcAddress(hModule, lpProcName) + +cdef int get_cuda_version(): + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32) + if handle == 0: + raise NotSupportedError('CUDA driver is not found') + cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion') + if cuDriverGetVersion == NULL: + raise RuntimeError('something went wrong') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError('something went wrong') + + return driver_ver + + + ############################################################################### # Wrapper init ############################################################################### -LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 +cdef object __symbol_lock = threading.Lock() cdef bint __py_cublasLt_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __cublasLtCreate = NULL cdef void* __cublasLtDestroy = NULL @@ -80,274 +137,138 @@ cdef int _check_or_init_cublasLt() except -1 nogil: if __py_cublasLt_init: return 0 - cdef int err, driver_ver - with gil: - # Load driver to check version - try: - handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32) - except Exception as e: - raise NotSupportedError(f'CUDA driver is not found ({e})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion') - if __cuDriverGetVersion == NULL: - raise RuntimeError('something went wrong') - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - raise RuntimeError('something went wrong') + with gil, __symbol_lock: + driver_ver = get_cuda_version() # Load library handle = load_library(driver_ver) # Load function global __cublasLtCreate - try: - __cublasLtCreate = win32api.GetProcAddress(handle, 'cublasLtCreate') - except: - pass + __cublasLtCreate = GetProcAddress(handle, 'cublasLtCreate') global __cublasLtDestroy - try: - __cublasLtDestroy = win32api.GetProcAddress(handle, 'cublasLtDestroy') - except: - pass + __cublasLtDestroy = GetProcAddress(handle, 'cublasLtDestroy') global 
__cublasLtGetVersion - try: - __cublasLtGetVersion = win32api.GetProcAddress(handle, 'cublasLtGetVersion') - except: - pass + __cublasLtGetVersion = GetProcAddress(handle, 'cublasLtGetVersion') global __cublasLtGetCudartVersion - try: - __cublasLtGetCudartVersion = win32api.GetProcAddress(handle, 'cublasLtGetCudartVersion') - except: - pass + __cublasLtGetCudartVersion = GetProcAddress(handle, 'cublasLtGetCudartVersion') global __cublasLtGetProperty - try: - __cublasLtGetProperty = win32api.GetProcAddress(handle, 'cublasLtGetProperty') - except: - pass + __cublasLtGetProperty = GetProcAddress(handle, 'cublasLtGetProperty') global __cublasLtMatmul - try: - __cublasLtMatmul = win32api.GetProcAddress(handle, 'cublasLtMatmul') - except: - pass + __cublasLtMatmul = GetProcAddress(handle, 'cublasLtMatmul') global __cublasLtMatrixTransform - try: - __cublasLtMatrixTransform = win32api.GetProcAddress(handle, 'cublasLtMatrixTransform') - except: - pass + __cublasLtMatrixTransform = GetProcAddress(handle, 'cublasLtMatrixTransform') global __cublasLtMatrixLayoutCreate - try: - __cublasLtMatrixLayoutCreate = win32api.GetProcAddress(handle, 'cublasLtMatrixLayoutCreate') - except: - pass + __cublasLtMatrixLayoutCreate = GetProcAddress(handle, 'cublasLtMatrixLayoutCreate') global __cublasLtMatrixLayoutDestroy - try: - __cublasLtMatrixLayoutDestroy = win32api.GetProcAddress(handle, 'cublasLtMatrixLayoutDestroy') - except: - pass + __cublasLtMatrixLayoutDestroy = GetProcAddress(handle, 'cublasLtMatrixLayoutDestroy') global __cublasLtMatrixLayoutSetAttribute - try: - __cublasLtMatrixLayoutSetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatrixLayoutSetAttribute') - except: - pass + __cublasLtMatrixLayoutSetAttribute = GetProcAddress(handle, 'cublasLtMatrixLayoutSetAttribute') global __cublasLtMatrixLayoutGetAttribute - try: - __cublasLtMatrixLayoutGetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatrixLayoutGetAttribute') - except: - pass + __cublasLtMatrixLayoutGetAttribute = GetProcAddress(handle, 'cublasLtMatrixLayoutGetAttribute') global __cublasLtMatmulDescCreate - try: - __cublasLtMatmulDescCreate = win32api.GetProcAddress(handle, 'cublasLtMatmulDescCreate') - except: - pass + __cublasLtMatmulDescCreate = GetProcAddress(handle, 'cublasLtMatmulDescCreate') global __cublasLtMatmulDescDestroy - try: - __cublasLtMatmulDescDestroy = win32api.GetProcAddress(handle, 'cublasLtMatmulDescDestroy') - except: - pass + __cublasLtMatmulDescDestroy = GetProcAddress(handle, 'cublasLtMatmulDescDestroy') global __cublasLtMatmulDescSetAttribute - try: - __cublasLtMatmulDescSetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatmulDescSetAttribute') - except: - pass + __cublasLtMatmulDescSetAttribute = GetProcAddress(handle, 'cublasLtMatmulDescSetAttribute') global __cublasLtMatmulDescGetAttribute - try: - __cublasLtMatmulDescGetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatmulDescGetAttribute') - except: - pass + __cublasLtMatmulDescGetAttribute = GetProcAddress(handle, 'cublasLtMatmulDescGetAttribute') global __cublasLtMatrixTransformDescCreate - try: - __cublasLtMatrixTransformDescCreate = win32api.GetProcAddress(handle, 'cublasLtMatrixTransformDescCreate') - except: - pass + __cublasLtMatrixTransformDescCreate = GetProcAddress(handle, 'cublasLtMatrixTransformDescCreate') global __cublasLtMatrixTransformDescDestroy - try: - __cublasLtMatrixTransformDescDestroy = win32api.GetProcAddress(handle, 'cublasLtMatrixTransformDescDestroy') - except: - pass + __cublasLtMatrixTransformDescDestroy = 
GetProcAddress(handle, 'cublasLtMatrixTransformDescDestroy') global __cublasLtMatrixTransformDescSetAttribute - try: - __cublasLtMatrixTransformDescSetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatrixTransformDescSetAttribute') - except: - pass + __cublasLtMatrixTransformDescSetAttribute = GetProcAddress(handle, 'cublasLtMatrixTransformDescSetAttribute') global __cublasLtMatrixTransformDescGetAttribute - try: - __cublasLtMatrixTransformDescGetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatrixTransformDescGetAttribute') - except: - pass + __cublasLtMatrixTransformDescGetAttribute = GetProcAddress(handle, 'cublasLtMatrixTransformDescGetAttribute') global __cublasLtMatmulPreferenceCreate - try: - __cublasLtMatmulPreferenceCreate = win32api.GetProcAddress(handle, 'cublasLtMatmulPreferenceCreate') - except: - pass + __cublasLtMatmulPreferenceCreate = GetProcAddress(handle, 'cublasLtMatmulPreferenceCreate') global __cublasLtMatmulPreferenceDestroy - try: - __cublasLtMatmulPreferenceDestroy = win32api.GetProcAddress(handle, 'cublasLtMatmulPreferenceDestroy') - except: - pass + __cublasLtMatmulPreferenceDestroy = GetProcAddress(handle, 'cublasLtMatmulPreferenceDestroy') global __cublasLtMatmulPreferenceSetAttribute - try: - __cublasLtMatmulPreferenceSetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatmulPreferenceSetAttribute') - except: - pass + __cublasLtMatmulPreferenceSetAttribute = GetProcAddress(handle, 'cublasLtMatmulPreferenceSetAttribute') global __cublasLtMatmulPreferenceGetAttribute - try: - __cublasLtMatmulPreferenceGetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatmulPreferenceGetAttribute') - except: - pass + __cublasLtMatmulPreferenceGetAttribute = GetProcAddress(handle, 'cublasLtMatmulPreferenceGetAttribute') global __cublasLtMatmulAlgoGetHeuristic - try: - __cublasLtMatmulAlgoGetHeuristic = win32api.GetProcAddress(handle, 'cublasLtMatmulAlgoGetHeuristic') - except: - pass + __cublasLtMatmulAlgoGetHeuristic = GetProcAddress(handle, 'cublasLtMatmulAlgoGetHeuristic') global __cublasLtMatmulAlgoGetIds - try: - __cublasLtMatmulAlgoGetIds = win32api.GetProcAddress(handle, 'cublasLtMatmulAlgoGetIds') - except: - pass + __cublasLtMatmulAlgoGetIds = GetProcAddress(handle, 'cublasLtMatmulAlgoGetIds') global __cublasLtMatmulAlgoInit - try: - __cublasLtMatmulAlgoInit = win32api.GetProcAddress(handle, 'cublasLtMatmulAlgoInit') - except: - pass + __cublasLtMatmulAlgoInit = GetProcAddress(handle, 'cublasLtMatmulAlgoInit') global __cublasLtMatmulAlgoCheck - try: - __cublasLtMatmulAlgoCheck = win32api.GetProcAddress(handle, 'cublasLtMatmulAlgoCheck') - except: - pass + __cublasLtMatmulAlgoCheck = GetProcAddress(handle, 'cublasLtMatmulAlgoCheck') global __cublasLtMatmulAlgoCapGetAttribute - try: - __cublasLtMatmulAlgoCapGetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatmulAlgoCapGetAttribute') - except: - pass + __cublasLtMatmulAlgoCapGetAttribute = GetProcAddress(handle, 'cublasLtMatmulAlgoCapGetAttribute') global __cublasLtMatmulAlgoConfigSetAttribute - try: - __cublasLtMatmulAlgoConfigSetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatmulAlgoConfigSetAttribute') - except: - pass + __cublasLtMatmulAlgoConfigSetAttribute = GetProcAddress(handle, 'cublasLtMatmulAlgoConfigSetAttribute') global __cublasLtMatmulAlgoConfigGetAttribute - try: - __cublasLtMatmulAlgoConfigGetAttribute = win32api.GetProcAddress(handle, 'cublasLtMatmulAlgoConfigGetAttribute') - except: - pass + __cublasLtMatmulAlgoConfigGetAttribute = GetProcAddress(handle, 
'cublasLtMatmulAlgoConfigGetAttribute') global __cublasLtLoggerSetCallback - try: - __cublasLtLoggerSetCallback = win32api.GetProcAddress(handle, 'cublasLtLoggerSetCallback') - except: - pass + __cublasLtLoggerSetCallback = GetProcAddress(handle, 'cublasLtLoggerSetCallback') global __cublasLtLoggerSetFile - try: - __cublasLtLoggerSetFile = win32api.GetProcAddress(handle, 'cublasLtLoggerSetFile') - except: - pass + __cublasLtLoggerSetFile = GetProcAddress(handle, 'cublasLtLoggerSetFile') global __cublasLtLoggerOpenFile - try: - __cublasLtLoggerOpenFile = win32api.GetProcAddress(handle, 'cublasLtLoggerOpenFile') - except: - pass + __cublasLtLoggerOpenFile = GetProcAddress(handle, 'cublasLtLoggerOpenFile') global __cublasLtLoggerSetLevel - try: - __cublasLtLoggerSetLevel = win32api.GetProcAddress(handle, 'cublasLtLoggerSetLevel') - except: - pass + __cublasLtLoggerSetLevel = GetProcAddress(handle, 'cublasLtLoggerSetLevel') global __cublasLtLoggerSetMask - try: - __cublasLtLoggerSetMask = win32api.GetProcAddress(handle, 'cublasLtLoggerSetMask') - except: - pass + __cublasLtLoggerSetMask = GetProcAddress(handle, 'cublasLtLoggerSetMask') global __cublasLtLoggerForceDisable - try: - __cublasLtLoggerForceDisable = win32api.GetProcAddress(handle, 'cublasLtLoggerForceDisable') - except: - pass + __cublasLtLoggerForceDisable = GetProcAddress(handle, 'cublasLtLoggerForceDisable') global __cublasLtGetStatusName - try: - __cublasLtGetStatusName = win32api.GetProcAddress(handle, 'cublasLtGetStatusName') - except: - pass + __cublasLtGetStatusName = GetProcAddress(handle, 'cublasLtGetStatusName') global __cublasLtGetStatusString - try: - __cublasLtGetStatusString = win32api.GetProcAddress(handle, 'cublasLtGetStatusString') - except: - pass + __cublasLtGetStatusString = GetProcAddress(handle, 'cublasLtGetStatusString') global __cublasLtHeuristicsCacheGetCapacity - try: - __cublasLtHeuristicsCacheGetCapacity = win32api.GetProcAddress(handle, 'cublasLtHeuristicsCacheGetCapacity') - except: - pass + __cublasLtHeuristicsCacheGetCapacity = GetProcAddress(handle, 'cublasLtHeuristicsCacheGetCapacity') global __cublasLtHeuristicsCacheSetCapacity - try: - __cublasLtHeuristicsCacheSetCapacity = win32api.GetProcAddress(handle, 'cublasLtHeuristicsCacheSetCapacity') - except: - pass + __cublasLtHeuristicsCacheSetCapacity = GetProcAddress(handle, 'cublasLtHeuristicsCacheSetCapacity') global __cublasLtDisableCpuInstructionsSetMask - try: - __cublasLtDisableCpuInstructionsSetMask = win32api.GetProcAddress(handle, 'cublasLtDisableCpuInstructionsSetMask') - except: - pass + __cublasLtDisableCpuInstructionsSetMask = GetProcAddress(handle, 'cublasLtDisableCpuInstructionsSetMask') - __py_cublasLt_init = True - return 0 + __py_cublasLt_init = True + return 0 cdef dict func_ptrs = None diff --git a/nvmath/bindings/_internal/cublasMp.pxd b/nvmath/bindings/_internal/cublasMp.pxd new file mode 100644 index 0000000..cc1d9ec --- /dev/null +++ b/nvmath/bindings/_internal/cublasMp.pxd @@ -0,0 +1,28 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 0.5.0 to 0.6.0. Do not modify it directly. 
+ +from ..cycublasMp cimport * + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cublasMpStatus_t _cublasMpCreate(cublasMpHandle_t* handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpDestroy(cublasMpHandle_t handle) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpStreamSet(cublasMpHandle_t handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpGetVersion(int* version) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpGridCreate(int64_t nprow, int64_t npcol, cublasMpGridLayout_t layout, ncclComm_t comm, cublasMpGrid_t* grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpGridDestroy(cublasMpGrid_t grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatrixDescriptorCreate(int64_t m, int64_t n, int64_t mb, int64_t nb, int64_t rsrc, int64_t csrc, int64_t lld, cudaDataType_t type, cublasMpGrid_t grid, cublasMpMatrixDescriptor_t* desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatrixDescriptorDestroy(cublasMpMatrixDescriptor_t desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatmulDescriptorCreate(cublasMpMatmulDescriptor_t* matmulDesc, cublasComputeType_t computeType) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatmulDescriptorDestroy(cublasMpMatmulDescriptor_t matmulDesc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatmulDescriptorAttributeSet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatmulDescriptorAttributeGet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, void* buf, size_t sizeInBytes, size_t* sizeWritten) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatmul_bufferSize(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, size_t* workspaceSizeInBytesOnDevice, size_t* workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t _cublasMpMatmul(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, void* d_work, size_t workspaceSizeInBytesOnDevice, void* h_work, size_t workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef int64_t _cublasMpNumroc(int64_t n, int64_t nb, uint32_t iproc, 
uint32_t isrcproc, uint32_t nprocs) except?-42 nogil diff --git a/nvmath/bindings/_internal/cublasMp_linux.pyx b/nvmath/bindings/_internal/cublasMp_linux.pyx new file mode 100644 index 0000000..347202c --- /dev/null +++ b/nvmath/bindings/_internal/cublasMp_linux.pyx @@ -0,0 +1,420 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 0.5.0 to 0.6.0. Do not modify it directly. + +from libc.stdint cimport intptr_t, uintptr_t + +import threading + +from .utils import FunctionNotFoundError, NotSupportedError + +from cuda.pathfinder import load_nvidia_dynamic_lib + + +############################################################################### +# Extern +############################################################################### + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "" nogil: + void* dlopen(const char*, int) + char* dlerror() + void* dlsym(void*, const char*) + int dlclose(void*) + + enum: + RTLD_LAZY + RTLD_NOW + RTLD_GLOBAL + RTLD_LOCAL + + const void* RTLD_DEFAULT 'RTLD_DEFAULT' + +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + + + +############################################################################### +# Wrapper init +############################################################################### + +cdef object __symbol_lock = threading.Lock() +cdef bint __py_cublasMp_init = False + +cdef void* __cublasMpCreate = NULL +cdef void* __cublasMpDestroy = NULL +cdef void* __cublasMpStreamSet = NULL +cdef void* __cublasMpGetVersion = NULL +cdef void* __cublasMpGridCreate = NULL +cdef void* __cublasMpGridDestroy = NULL +cdef void* __cublasMpMatrixDescriptorCreate = NULL +cdef void* __cublasMpMatrixDescriptorDestroy = NULL +cdef void* __cublasMpMatmulDescriptorCreate = NULL +cdef void* __cublasMpMatmulDescriptorDestroy = NULL +cdef void* __cublasMpMatmulDescriptorAttributeSet = NULL +cdef void* __cublasMpMatmulDescriptorAttributeGet = NULL +cdef void* __cublasMpMatmul_bufferSize = NULL +cdef void* __cublasMpMatmul = NULL +cdef void* __cublasMpNumroc = NULL + + +cdef void* load_library() except* with gil: + cdef uintptr_t handle = load_nvidia_dynamic_lib("cublasmp")._handle_uint + return handle + + +cdef int _check_or_init_cublasMp() except -1 nogil: + global __py_cublasMp_init + if __py_cublasMp_init: + return 0 + + cdef void* handle = NULL + + with gil, __symbol_lock: + # Load function + global __cublasMpCreate + __cublasMpCreate = dlsym(RTLD_DEFAULT, 'cublasMpCreate') + if __cublasMpCreate == NULL: + if handle == NULL: + handle = load_library() + __cublasMpCreate = dlsym(handle, 'cublasMpCreate') + + global __cublasMpDestroy + __cublasMpDestroy = dlsym(RTLD_DEFAULT, 'cublasMpDestroy') + if __cublasMpDestroy == NULL: + if handle == NULL: + handle = load_library() + __cublasMpDestroy = dlsym(handle, 'cublasMpDestroy') + + global 
__cublasMpStreamSet + __cublasMpStreamSet = dlsym(RTLD_DEFAULT, 'cublasMpStreamSet') + if __cublasMpStreamSet == NULL: + if handle == NULL: + handle = load_library() + __cublasMpStreamSet = dlsym(handle, 'cublasMpStreamSet') + + global __cublasMpGetVersion + __cublasMpGetVersion = dlsym(RTLD_DEFAULT, 'cublasMpGetVersion') + if __cublasMpGetVersion == NULL: + if handle == NULL: + handle = load_library() + __cublasMpGetVersion = dlsym(handle, 'cublasMpGetVersion') + + global __cublasMpGridCreate + __cublasMpGridCreate = dlsym(RTLD_DEFAULT, 'cublasMpGridCreate') + if __cublasMpGridCreate == NULL: + if handle == NULL: + handle = load_library() + __cublasMpGridCreate = dlsym(handle, 'cublasMpGridCreate') + + global __cublasMpGridDestroy + __cublasMpGridDestroy = dlsym(RTLD_DEFAULT, 'cublasMpGridDestroy') + if __cublasMpGridDestroy == NULL: + if handle == NULL: + handle = load_library() + __cublasMpGridDestroy = dlsym(handle, 'cublasMpGridDestroy') + + global __cublasMpMatrixDescriptorCreate + __cublasMpMatrixDescriptorCreate = dlsym(RTLD_DEFAULT, 'cublasMpMatrixDescriptorCreate') + if __cublasMpMatrixDescriptorCreate == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatrixDescriptorCreate = dlsym(handle, 'cublasMpMatrixDescriptorCreate') + + global __cublasMpMatrixDescriptorDestroy + __cublasMpMatrixDescriptorDestroy = dlsym(RTLD_DEFAULT, 'cublasMpMatrixDescriptorDestroy') + if __cublasMpMatrixDescriptorDestroy == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatrixDescriptorDestroy = dlsym(handle, 'cublasMpMatrixDescriptorDestroy') + + global __cublasMpMatmulDescriptorCreate + __cublasMpMatmulDescriptorCreate = dlsym(RTLD_DEFAULT, 'cublasMpMatmulDescriptorCreate') + if __cublasMpMatmulDescriptorCreate == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatmulDescriptorCreate = dlsym(handle, 'cublasMpMatmulDescriptorCreate') + + global __cublasMpMatmulDescriptorDestroy + __cublasMpMatmulDescriptorDestroy = dlsym(RTLD_DEFAULT, 'cublasMpMatmulDescriptorDestroy') + if __cublasMpMatmulDescriptorDestroy == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatmulDescriptorDestroy = dlsym(handle, 'cublasMpMatmulDescriptorDestroy') + + global __cublasMpMatmulDescriptorAttributeSet + __cublasMpMatmulDescriptorAttributeSet = dlsym(RTLD_DEFAULT, 'cublasMpMatmulDescriptorAttributeSet') + if __cublasMpMatmulDescriptorAttributeSet == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatmulDescriptorAttributeSet = dlsym(handle, 'cublasMpMatmulDescriptorAttributeSet') + + global __cublasMpMatmulDescriptorAttributeGet + __cublasMpMatmulDescriptorAttributeGet = dlsym(RTLD_DEFAULT, 'cublasMpMatmulDescriptorAttributeGet') + if __cublasMpMatmulDescriptorAttributeGet == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatmulDescriptorAttributeGet = dlsym(handle, 'cublasMpMatmulDescriptorAttributeGet') + + global __cublasMpMatmul_bufferSize + __cublasMpMatmul_bufferSize = dlsym(RTLD_DEFAULT, 'cublasMpMatmul_bufferSize') + if __cublasMpMatmul_bufferSize == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatmul_bufferSize = dlsym(handle, 'cublasMpMatmul_bufferSize') + + global __cublasMpMatmul + __cublasMpMatmul = dlsym(RTLD_DEFAULT, 'cublasMpMatmul') + if __cublasMpMatmul == NULL: + if handle == NULL: + handle = load_library() + __cublasMpMatmul = dlsym(handle, 'cublasMpMatmul') + + global __cublasMpNumroc + __cublasMpNumroc = dlsym(RTLD_DEFAULT, 'cublasMpNumroc') + if __cublasMpNumroc == NULL: + if 
handle == NULL: + handle = load_library() + __cublasMpNumroc = dlsym(handle, 'cublasMpNumroc') + __py_cublasMp_init = True + return 0 + + +cdef dict func_ptrs = None + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_cublasMp() + cdef dict data = {} + + global __cublasMpCreate + data["__cublasMpCreate"] = __cublasMpCreate + + global __cublasMpDestroy + data["__cublasMpDestroy"] = __cublasMpDestroy + + global __cublasMpStreamSet + data["__cublasMpStreamSet"] = __cublasMpStreamSet + + global __cublasMpGetVersion + data["__cublasMpGetVersion"] = __cublasMpGetVersion + + global __cublasMpGridCreate + data["__cublasMpGridCreate"] = __cublasMpGridCreate + + global __cublasMpGridDestroy + data["__cublasMpGridDestroy"] = __cublasMpGridDestroy + + global __cublasMpMatrixDescriptorCreate + data["__cublasMpMatrixDescriptorCreate"] = __cublasMpMatrixDescriptorCreate + + global __cublasMpMatrixDescriptorDestroy + data["__cublasMpMatrixDescriptorDestroy"] = __cublasMpMatrixDescriptorDestroy + + global __cublasMpMatmulDescriptorCreate + data["__cublasMpMatmulDescriptorCreate"] = __cublasMpMatmulDescriptorCreate + + global __cublasMpMatmulDescriptorDestroy + data["__cublasMpMatmulDescriptorDestroy"] = __cublasMpMatmulDescriptorDestroy + + global __cublasMpMatmulDescriptorAttributeSet + data["__cublasMpMatmulDescriptorAttributeSet"] = __cublasMpMatmulDescriptorAttributeSet + + global __cublasMpMatmulDescriptorAttributeGet + data["__cublasMpMatmulDescriptorAttributeGet"] = __cublasMpMatmulDescriptorAttributeGet + + global __cublasMpMatmul_bufferSize + data["__cublasMpMatmul_bufferSize"] = __cublasMpMatmul_bufferSize + + global __cublasMpMatmul + data["__cublasMpMatmul"] = __cublasMpMatmul + + global __cublasMpNumroc + data["__cublasMpNumroc"] = __cublasMpNumroc + + func_ptrs = data + return data + + +cpdef _inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cublasMpStatus_t _cublasMpCreate(cublasMpHandle_t* handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpCreate + _check_or_init_cublasMp() + if __cublasMpCreate == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpCreate is not found") + return (__cublasMpCreate)( + handle, stream) + + +cdef cublasMpStatus_t _cublasMpDestroy(cublasMpHandle_t handle) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpDestroy + _check_or_init_cublasMp() + if __cublasMpDestroy == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpDestroy is not found") + return (__cublasMpDestroy)( + handle) + + +cdef cublasMpStatus_t _cublasMpStreamSet(cublasMpHandle_t handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpStreamSet + _check_or_init_cublasMp() + if __cublasMpStreamSet == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpStreamSet is not found") + return (__cublasMpStreamSet)( + handle, stream) + + +cdef cublasMpStatus_t _cublasMpGetVersion(int* version) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpGetVersion + _check_or_init_cublasMp() + if __cublasMpGetVersion == NULL: + with gil: + raise 
FunctionNotFoundError("function cublasMpGetVersion is not found") + return (__cublasMpGetVersion)( + version) + + +cdef cublasMpStatus_t _cublasMpGridCreate(int64_t nprow, int64_t npcol, cublasMpGridLayout_t layout, ncclComm_t comm, cublasMpGrid_t* grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpGridCreate + _check_or_init_cublasMp() + if __cublasMpGridCreate == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpGridCreate is not found") + return (__cublasMpGridCreate)( + nprow, npcol, layout, comm, grid) + + +cdef cublasMpStatus_t _cublasMpGridDestroy(cublasMpGrid_t grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpGridDestroy + _check_or_init_cublasMp() + if __cublasMpGridDestroy == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpGridDestroy is not found") + return (__cublasMpGridDestroy)( + grid) + + +cdef cublasMpStatus_t _cublasMpMatrixDescriptorCreate(int64_t m, int64_t n, int64_t mb, int64_t nb, int64_t rsrc, int64_t csrc, int64_t lld, cudaDataType_t type, cublasMpGrid_t grid, cublasMpMatrixDescriptor_t* desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatrixDescriptorCreate + _check_or_init_cublasMp() + if __cublasMpMatrixDescriptorCreate == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatrixDescriptorCreate is not found") + return (__cublasMpMatrixDescriptorCreate)( + m, n, mb, nb, rsrc, csrc, lld, type, grid, desc) + + +cdef cublasMpStatus_t _cublasMpMatrixDescriptorDestroy(cublasMpMatrixDescriptor_t desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatrixDescriptorDestroy + _check_or_init_cublasMp() + if __cublasMpMatrixDescriptorDestroy == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatrixDescriptorDestroy is not found") + return (__cublasMpMatrixDescriptorDestroy)( + desc) + + +cdef cublasMpStatus_t _cublasMpMatmulDescriptorCreate(cublasMpMatmulDescriptor_t* matmulDesc, cublasComputeType_t computeType) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatmulDescriptorCreate + _check_or_init_cublasMp() + if __cublasMpMatmulDescriptorCreate == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatmulDescriptorCreate is not found") + return (__cublasMpMatmulDescriptorCreate)( + matmulDesc, computeType) + + +cdef cublasMpStatus_t _cublasMpMatmulDescriptorDestroy(cublasMpMatmulDescriptor_t matmulDesc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatmulDescriptorDestroy + _check_or_init_cublasMp() + if __cublasMpMatmulDescriptorDestroy == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatmulDescriptorDestroy is not found") + return (__cublasMpMatmulDescriptorDestroy)( + matmulDesc) + + +cdef cublasMpStatus_t _cublasMpMatmulDescriptorAttributeSet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatmulDescriptorAttributeSet + _check_or_init_cublasMp() + if __cublasMpMatmulDescriptorAttributeSet == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatmulDescriptorAttributeSet is not found") + return (__cublasMpMatmulDescriptorAttributeSet)( + matmulDesc, attr, buf, sizeInBytes) + + +cdef cublasMpStatus_t _cublasMpMatmulDescriptorAttributeGet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, void* buf, size_t 
sizeInBytes, size_t* sizeWritten) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatmulDescriptorAttributeGet + _check_or_init_cublasMp() + if __cublasMpMatmulDescriptorAttributeGet == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatmulDescriptorAttributeGet is not found") + return (__cublasMpMatmulDescriptorAttributeGet)( + matmulDesc, attr, buf, sizeInBytes, sizeWritten) + + +cdef cublasMpStatus_t _cublasMpMatmul_bufferSize(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, size_t* workspaceSizeInBytesOnDevice, size_t* workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatmul_bufferSize + _check_or_init_cublasMp() + if __cublasMpMatmul_bufferSize == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatmul_bufferSize is not found") + return (__cublasMpMatmul_bufferSize)( + handle, matmulDesc, m, n, k, alpha, a, ia, ja, descA, b, ib, jb, descB, beta, c, ic, jc, descC, d, id, jd, descD, workspaceSizeInBytesOnDevice, workspaceSizeInBytesOnHost) + + +cdef cublasMpStatus_t _cublasMpMatmul(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, void* d_work, size_t workspaceSizeInBytesOnDevice, void* h_work, size_t workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasMpMatmul + _check_or_init_cublasMp() + if __cublasMpMatmul == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpMatmul is not found") + return (__cublasMpMatmul)( + handle, matmulDesc, m, n, k, alpha, a, ia, ja, descA, b, ib, jb, descB, beta, c, ic, jc, descC, d, id, jd, descD, d_work, workspaceSizeInBytesOnDevice, h_work, workspaceSizeInBytesOnHost) + + +cdef int64_t _cublasMpNumroc(int64_t n, int64_t nb, uint32_t iproc, uint32_t isrcproc, uint32_t nprocs) except?-42 nogil: + global __cublasMpNumroc + _check_or_init_cublasMp() + if __cublasMpNumroc == NULL: + with gil: + raise FunctionNotFoundError("function cublasMpNumroc is not found") + return (__cublasMpNumroc)( + n, nb, iproc, isrcproc, nprocs) diff --git a/nvmath/bindings/_internal/cublas_linux.pyx b/nvmath/bindings/_internal/cublas_linux.pyx index 31ddf6d..0ed540f 100644 --- a/nvmath/bindings/_internal/cublas_linux.pyx +++ b/nvmath/bindings/_internal/cublas_linux.pyx @@ -6,11 +6,14 @@ from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### @@ -29,13 +32,31 @@ cdef extern from "" nogil: const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int 
err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_cublas_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __cublasCreate_v2 = NULL cdef void* __cublasDestroy_v2 = NULL @@ -555,3572 +576,3556 @@ cdef int _check_or_init_cublas() except -1 nogil: if __py_cublas_init: return 0 - # Load driver to check version cdef void* handle = NULL - handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) - if handle == NULL: - with gil: - err_msg = dlerror() - raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") - if __cuDriverGetVersion == NULL: - with gil: - raise RuntimeError('something went wrong') - cdef int err, driver_ver - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - with gil: - raise RuntimeError('something went wrong') - #dlclose(handle) - handle = NULL - - # Load function - global __cublasCreate_v2 - __cublasCreate_v2 = dlsym(RTLD_DEFAULT, 'cublasCreate_v2') - if __cublasCreate_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCreate_v2 = dlsym(handle, 'cublasCreate_v2') - - global __cublasDestroy_v2 - __cublasDestroy_v2 = dlsym(RTLD_DEFAULT, 'cublasDestroy_v2') - if __cublasDestroy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDestroy_v2 = dlsym(handle, 'cublasDestroy_v2') - - global __cublasGetVersion_v2 - __cublasGetVersion_v2 = dlsym(RTLD_DEFAULT, 'cublasGetVersion_v2') - if __cublasGetVersion_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetVersion_v2 = dlsym(handle, 'cublasGetVersion_v2') - - global __cublasGetProperty - __cublasGetProperty = dlsym(RTLD_DEFAULT, 'cublasGetProperty') - if __cublasGetProperty == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetProperty = dlsym(handle, 'cublasGetProperty') - - global __cublasGetCudartVersion - __cublasGetCudartVersion = dlsym(RTLD_DEFAULT, 'cublasGetCudartVersion') - if __cublasGetCudartVersion == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetCudartVersion = dlsym(handle, 'cublasGetCudartVersion') - - global __cublasSetWorkspace_v2 - __cublasSetWorkspace_v2 = dlsym(RTLD_DEFAULT, 'cublasSetWorkspace_v2') - if __cublasSetWorkspace_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetWorkspace_v2 = dlsym(handle, 'cublasSetWorkspace_v2') - - global __cublasSetStream_v2 - __cublasSetStream_v2 = dlsym(RTLD_DEFAULT, 'cublasSetStream_v2') - if __cublasSetStream_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetStream_v2 = dlsym(handle, 'cublasSetStream_v2') - - global __cublasGetStream_v2 - __cublasGetStream_v2 = dlsym(RTLD_DEFAULT, 
'cublasGetStream_v2') - if __cublasGetStream_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetStream_v2 = dlsym(handle, 'cublasGetStream_v2') - - global __cublasGetPointerMode_v2 - __cublasGetPointerMode_v2 = dlsym(RTLD_DEFAULT, 'cublasGetPointerMode_v2') - if __cublasGetPointerMode_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetPointerMode_v2 = dlsym(handle, 'cublasGetPointerMode_v2') - - global __cublasSetPointerMode_v2 - __cublasSetPointerMode_v2 = dlsym(RTLD_DEFAULT, 'cublasSetPointerMode_v2') - if __cublasSetPointerMode_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetPointerMode_v2 = dlsym(handle, 'cublasSetPointerMode_v2') - - global __cublasGetAtomicsMode - __cublasGetAtomicsMode = dlsym(RTLD_DEFAULT, 'cublasGetAtomicsMode') - if __cublasGetAtomicsMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetAtomicsMode = dlsym(handle, 'cublasGetAtomicsMode') - - global __cublasSetAtomicsMode - __cublasSetAtomicsMode = dlsym(RTLD_DEFAULT, 'cublasSetAtomicsMode') - if __cublasSetAtomicsMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetAtomicsMode = dlsym(handle, 'cublasSetAtomicsMode') - - global __cublasGetMathMode - __cublasGetMathMode = dlsym(RTLD_DEFAULT, 'cublasGetMathMode') - if __cublasGetMathMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetMathMode = dlsym(handle, 'cublasGetMathMode') - - global __cublasSetMathMode - __cublasSetMathMode = dlsym(RTLD_DEFAULT, 'cublasSetMathMode') - if __cublasSetMathMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetMathMode = dlsym(handle, 'cublasSetMathMode') - - global __cublasLoggerConfigure - __cublasLoggerConfigure = dlsym(RTLD_DEFAULT, 'cublasLoggerConfigure') - if __cublasLoggerConfigure == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasLoggerConfigure = dlsym(handle, 'cublasLoggerConfigure') - - global __cublasSetLoggerCallback - __cublasSetLoggerCallback = dlsym(RTLD_DEFAULT, 'cublasSetLoggerCallback') - if __cublasSetLoggerCallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetLoggerCallback = dlsym(handle, 'cublasSetLoggerCallback') - - global __cublasGetLoggerCallback - __cublasGetLoggerCallback = dlsym(RTLD_DEFAULT, 'cublasGetLoggerCallback') - if __cublasGetLoggerCallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetLoggerCallback = dlsym(handle, 'cublasGetLoggerCallback') - - global __cublasSetVector - __cublasSetVector = dlsym(RTLD_DEFAULT, 'cublasSetVector') - if __cublasSetVector == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetVector = dlsym(handle, 'cublasSetVector') - - global __cublasGetVector - __cublasGetVector = dlsym(RTLD_DEFAULT, 'cublasGetVector') - if __cublasGetVector == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetVector = dlsym(handle, 'cublasGetVector') - - global __cublasSetMatrix - __cublasSetMatrix = dlsym(RTLD_DEFAULT, 'cublasSetMatrix') - if __cublasSetMatrix == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetMatrix = dlsym(handle, 'cublasSetMatrix') - - global __cublasGetMatrix - __cublasGetMatrix = dlsym(RTLD_DEFAULT, 'cublasGetMatrix') - if __cublasGetMatrix == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetMatrix = dlsym(handle, 'cublasGetMatrix') - - global 
__cublasSetVectorAsync - __cublasSetVectorAsync = dlsym(RTLD_DEFAULT, 'cublasSetVectorAsync') - if __cublasSetVectorAsync == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetVectorAsync = dlsym(handle, 'cublasSetVectorAsync') - - global __cublasGetVectorAsync - __cublasGetVectorAsync = dlsym(RTLD_DEFAULT, 'cublasGetVectorAsync') - if __cublasGetVectorAsync == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetVectorAsync = dlsym(handle, 'cublasGetVectorAsync') - - global __cublasSetMatrixAsync - __cublasSetMatrixAsync = dlsym(RTLD_DEFAULT, 'cublasSetMatrixAsync') - if __cublasSetMatrixAsync == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetMatrixAsync = dlsym(handle, 'cublasSetMatrixAsync') - - global __cublasGetMatrixAsync - __cublasGetMatrixAsync = dlsym(RTLD_DEFAULT, 'cublasGetMatrixAsync') - if __cublasGetMatrixAsync == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetMatrixAsync = dlsym(handle, 'cublasGetMatrixAsync') - - global __cublasNrm2Ex - __cublasNrm2Ex = dlsym(RTLD_DEFAULT, 'cublasNrm2Ex') - if __cublasNrm2Ex == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasNrm2Ex = dlsym(handle, 'cublasNrm2Ex') - - global __cublasSnrm2_v2 - __cublasSnrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasSnrm2_v2') - if __cublasSnrm2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSnrm2_v2 = dlsym(handle, 'cublasSnrm2_v2') - - global __cublasDnrm2_v2 - __cublasDnrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasDnrm2_v2') - if __cublasDnrm2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDnrm2_v2 = dlsym(handle, 'cublasDnrm2_v2') - - global __cublasScnrm2_v2 - __cublasScnrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasScnrm2_v2') - if __cublasScnrm2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasScnrm2_v2 = dlsym(handle, 'cublasScnrm2_v2') - - global __cublasDznrm2_v2 - __cublasDznrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasDznrm2_v2') - if __cublasDznrm2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDznrm2_v2 = dlsym(handle, 'cublasDznrm2_v2') - - global __cublasDotEx - __cublasDotEx = dlsym(RTLD_DEFAULT, 'cublasDotEx') - if __cublasDotEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDotEx = dlsym(handle, 'cublasDotEx') - - global __cublasDotcEx - __cublasDotcEx = dlsym(RTLD_DEFAULT, 'cublasDotcEx') - if __cublasDotcEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDotcEx = dlsym(handle, 'cublasDotcEx') - - global __cublasSdot_v2 - __cublasSdot_v2 = dlsym(RTLD_DEFAULT, 'cublasSdot_v2') - if __cublasSdot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSdot_v2 = dlsym(handle, 'cublasSdot_v2') - - global __cublasDdot_v2 - __cublasDdot_v2 = dlsym(RTLD_DEFAULT, 'cublasDdot_v2') - if __cublasDdot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDdot_v2 = dlsym(handle, 'cublasDdot_v2') - - global __cublasCdotu_v2 - __cublasCdotu_v2 = dlsym(RTLD_DEFAULT, 'cublasCdotu_v2') - if __cublasCdotu_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCdotu_v2 = dlsym(handle, 'cublasCdotu_v2') - - global __cublasCdotc_v2 - __cublasCdotc_v2 = dlsym(RTLD_DEFAULT, 'cublasCdotc_v2') - if __cublasCdotc_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCdotc_v2 = dlsym(handle, 'cublasCdotc_v2') - - global __cublasZdotu_v2 - 
__cublasZdotu_v2 = dlsym(RTLD_DEFAULT, 'cublasZdotu_v2') - if __cublasZdotu_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdotu_v2 = dlsym(handle, 'cublasZdotu_v2') - global __cublasZdotc_v2 - __cublasZdotc_v2 = dlsym(RTLD_DEFAULT, 'cublasZdotc_v2') - if __cublasZdotc_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdotc_v2 = dlsym(handle, 'cublasZdotc_v2') - - global __cublasScalEx - __cublasScalEx = dlsym(RTLD_DEFAULT, 'cublasScalEx') - if __cublasScalEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasScalEx = dlsym(handle, 'cublasScalEx') - - global __cublasSscal_v2 - __cublasSscal_v2 = dlsym(RTLD_DEFAULT, 'cublasSscal_v2') - if __cublasSscal_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSscal_v2 = dlsym(handle, 'cublasSscal_v2') - - global __cublasDscal_v2 - __cublasDscal_v2 = dlsym(RTLD_DEFAULT, 'cublasDscal_v2') - if __cublasDscal_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDscal_v2 = dlsym(handle, 'cublasDscal_v2') - - global __cublasCscal_v2 - __cublasCscal_v2 = dlsym(RTLD_DEFAULT, 'cublasCscal_v2') - if __cublasCscal_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCscal_v2 = dlsym(handle, 'cublasCscal_v2') - - global __cublasCsscal_v2 - __cublasCsscal_v2 = dlsym(RTLD_DEFAULT, 'cublasCsscal_v2') - if __cublasCsscal_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsscal_v2 = dlsym(handle, 'cublasCsscal_v2') - - global __cublasZscal_v2 - __cublasZscal_v2 = dlsym(RTLD_DEFAULT, 'cublasZscal_v2') - if __cublasZscal_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZscal_v2 = dlsym(handle, 'cublasZscal_v2') - - global __cublasZdscal_v2 - __cublasZdscal_v2 = dlsym(RTLD_DEFAULT, 'cublasZdscal_v2') - if __cublasZdscal_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdscal_v2 = dlsym(handle, 'cublasZdscal_v2') - - global __cublasAxpyEx - __cublasAxpyEx = dlsym(RTLD_DEFAULT, 'cublasAxpyEx') - if __cublasAxpyEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasAxpyEx = dlsym(handle, 'cublasAxpyEx') - - global __cublasSaxpy_v2 - __cublasSaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasSaxpy_v2') - if __cublasSaxpy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSaxpy_v2 = dlsym(handle, 'cublasSaxpy_v2') - - global __cublasDaxpy_v2 - __cublasDaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasDaxpy_v2') - if __cublasDaxpy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDaxpy_v2 = dlsym(handle, 'cublasDaxpy_v2') - - global __cublasCaxpy_v2 - __cublasCaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasCaxpy_v2') - if __cublasCaxpy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCaxpy_v2 = dlsym(handle, 'cublasCaxpy_v2') - - global __cublasZaxpy_v2 - __cublasZaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasZaxpy_v2') - if __cublasZaxpy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZaxpy_v2 = dlsym(handle, 'cublasZaxpy_v2') - - global __cublasCopyEx - __cublasCopyEx = dlsym(RTLD_DEFAULT, 'cublasCopyEx') - if __cublasCopyEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCopyEx = dlsym(handle, 'cublasCopyEx') - - global __cublasScopy_v2 - __cublasScopy_v2 = dlsym(RTLD_DEFAULT, 'cublasScopy_v2') - if __cublasScopy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cublasScopy_v2 = dlsym(handle, 'cublasScopy_v2') - - global __cublasDcopy_v2 - __cublasDcopy_v2 = dlsym(RTLD_DEFAULT, 'cublasDcopy_v2') - if __cublasDcopy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDcopy_v2 = dlsym(handle, 'cublasDcopy_v2') - - global __cublasCcopy_v2 - __cublasCcopy_v2 = dlsym(RTLD_DEFAULT, 'cublasCcopy_v2') - if __cublasCcopy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCcopy_v2 = dlsym(handle, 'cublasCcopy_v2') - - global __cublasZcopy_v2 - __cublasZcopy_v2 = dlsym(RTLD_DEFAULT, 'cublasZcopy_v2') - if __cublasZcopy_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZcopy_v2 = dlsym(handle, 'cublasZcopy_v2') - - global __cublasSswap_v2 - __cublasSswap_v2 = dlsym(RTLD_DEFAULT, 'cublasSswap_v2') - if __cublasSswap_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSswap_v2 = dlsym(handle, 'cublasSswap_v2') - - global __cublasDswap_v2 - __cublasDswap_v2 = dlsym(RTLD_DEFAULT, 'cublasDswap_v2') - if __cublasDswap_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDswap_v2 = dlsym(handle, 'cublasDswap_v2') - - global __cublasCswap_v2 - __cublasCswap_v2 = dlsym(RTLD_DEFAULT, 'cublasCswap_v2') - if __cublasCswap_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCswap_v2 = dlsym(handle, 'cublasCswap_v2') - - global __cublasZswap_v2 - __cublasZswap_v2 = dlsym(RTLD_DEFAULT, 'cublasZswap_v2') - if __cublasZswap_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZswap_v2 = dlsym(handle, 'cublasZswap_v2') - - global __cublasSwapEx - __cublasSwapEx = dlsym(RTLD_DEFAULT, 'cublasSwapEx') - if __cublasSwapEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSwapEx = dlsym(handle, 'cublasSwapEx') - - global __cublasIsamax_v2 - __cublasIsamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIsamax_v2') - if __cublasIsamax_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIsamax_v2 = dlsym(handle, 'cublasIsamax_v2') - - global __cublasIdamax_v2 - __cublasIdamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIdamax_v2') - if __cublasIdamax_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIdamax_v2 = dlsym(handle, 'cublasIdamax_v2') - - global __cublasIcamax_v2 - __cublasIcamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIcamax_v2') - if __cublasIcamax_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIcamax_v2 = dlsym(handle, 'cublasIcamax_v2') - - global __cublasIzamax_v2 - __cublasIzamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIzamax_v2') - if __cublasIzamax_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIzamax_v2 = dlsym(handle, 'cublasIzamax_v2') - - global __cublasIamaxEx - __cublasIamaxEx = dlsym(RTLD_DEFAULT, 'cublasIamaxEx') - if __cublasIamaxEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIamaxEx = dlsym(handle, 'cublasIamaxEx') - - global __cublasIsamin_v2 - __cublasIsamin_v2 = dlsym(RTLD_DEFAULT, 'cublasIsamin_v2') - if __cublasIsamin_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIsamin_v2 = dlsym(handle, 'cublasIsamin_v2') - - global __cublasIdamin_v2 - __cublasIdamin_v2 = dlsym(RTLD_DEFAULT, 'cublasIdamin_v2') - if __cublasIdamin_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIdamin_v2 = dlsym(handle, 'cublasIdamin_v2') - - global __cublasIcamin_v2 - __cublasIcamin_v2 = 
dlsym(RTLD_DEFAULT, 'cublasIcamin_v2') - if __cublasIcamin_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIcamin_v2 = dlsym(handle, 'cublasIcamin_v2') - - global __cublasIzamin_v2 - __cublasIzamin_v2 = dlsym(RTLD_DEFAULT, 'cublasIzamin_v2') - if __cublasIzamin_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIzamin_v2 = dlsym(handle, 'cublasIzamin_v2') - - global __cublasIaminEx - __cublasIaminEx = dlsym(RTLD_DEFAULT, 'cublasIaminEx') - if __cublasIaminEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIaminEx = dlsym(handle, 'cublasIaminEx') - - global __cublasAsumEx - __cublasAsumEx = dlsym(RTLD_DEFAULT, 'cublasAsumEx') - if __cublasAsumEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasAsumEx = dlsym(handle, 'cublasAsumEx') - - global __cublasSasum_v2 - __cublasSasum_v2 = dlsym(RTLD_DEFAULT, 'cublasSasum_v2') - if __cublasSasum_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSasum_v2 = dlsym(handle, 'cublasSasum_v2') - - global __cublasDasum_v2 - __cublasDasum_v2 = dlsym(RTLD_DEFAULT, 'cublasDasum_v2') - if __cublasDasum_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDasum_v2 = dlsym(handle, 'cublasDasum_v2') - - global __cublasScasum_v2 - __cublasScasum_v2 = dlsym(RTLD_DEFAULT, 'cublasScasum_v2') - if __cublasScasum_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasScasum_v2 = dlsym(handle, 'cublasScasum_v2') - - global __cublasDzasum_v2 - __cublasDzasum_v2 = dlsym(RTLD_DEFAULT, 'cublasDzasum_v2') - if __cublasDzasum_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDzasum_v2 = dlsym(handle, 'cublasDzasum_v2') - - global __cublasSrot_v2 - __cublasSrot_v2 = dlsym(RTLD_DEFAULT, 'cublasSrot_v2') - if __cublasSrot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSrot_v2 = dlsym(handle, 'cublasSrot_v2') - - global __cublasDrot_v2 - __cublasDrot_v2 = dlsym(RTLD_DEFAULT, 'cublasDrot_v2') - if __cublasDrot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDrot_v2 = dlsym(handle, 'cublasDrot_v2') - - global __cublasCrot_v2 - __cublasCrot_v2 = dlsym(RTLD_DEFAULT, 'cublasCrot_v2') - if __cublasCrot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCrot_v2 = dlsym(handle, 'cublasCrot_v2') - - global __cublasCsrot_v2 - __cublasCsrot_v2 = dlsym(RTLD_DEFAULT, 'cublasCsrot_v2') - if __cublasCsrot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsrot_v2 = dlsym(handle, 'cublasCsrot_v2') - - global __cublasZrot_v2 - __cublasZrot_v2 = dlsym(RTLD_DEFAULT, 'cublasZrot_v2') - if __cublasZrot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZrot_v2 = dlsym(handle, 'cublasZrot_v2') - - global __cublasZdrot_v2 - __cublasZdrot_v2 = dlsym(RTLD_DEFAULT, 'cublasZdrot_v2') - if __cublasZdrot_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdrot_v2 = dlsym(handle, 'cublasZdrot_v2') - - global __cublasRotEx - __cublasRotEx = dlsym(RTLD_DEFAULT, 'cublasRotEx') - if __cublasRotEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasRotEx = dlsym(handle, 'cublasRotEx') - - global __cublasSrotg_v2 - __cublasSrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasSrotg_v2') - if __cublasSrotg_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSrotg_v2 = dlsym(handle, 
'cublasSrotg_v2') - - global __cublasDrotg_v2 - __cublasDrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasDrotg_v2') - if __cublasDrotg_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDrotg_v2 = dlsym(handle, 'cublasDrotg_v2') - - global __cublasCrotg_v2 - __cublasCrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasCrotg_v2') - if __cublasCrotg_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCrotg_v2 = dlsym(handle, 'cublasCrotg_v2') - - global __cublasZrotg_v2 - __cublasZrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasZrotg_v2') - if __cublasZrotg_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZrotg_v2 = dlsym(handle, 'cublasZrotg_v2') - - global __cublasRotgEx - __cublasRotgEx = dlsym(RTLD_DEFAULT, 'cublasRotgEx') - if __cublasRotgEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasRotgEx = dlsym(handle, 'cublasRotgEx') - - global __cublasSrotm_v2 - __cublasSrotm_v2 = dlsym(RTLD_DEFAULT, 'cublasSrotm_v2') - if __cublasSrotm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSrotm_v2 = dlsym(handle, 'cublasSrotm_v2') - - global __cublasDrotm_v2 - __cublasDrotm_v2 = dlsym(RTLD_DEFAULT, 'cublasDrotm_v2') - if __cublasDrotm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDrotm_v2 = dlsym(handle, 'cublasDrotm_v2') - - global __cublasRotmEx - __cublasRotmEx = dlsym(RTLD_DEFAULT, 'cublasRotmEx') - if __cublasRotmEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasRotmEx = dlsym(handle, 'cublasRotmEx') - - global __cublasSrotmg_v2 - __cublasSrotmg_v2 = dlsym(RTLD_DEFAULT, 'cublasSrotmg_v2') - if __cublasSrotmg_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSrotmg_v2 = dlsym(handle, 'cublasSrotmg_v2') - - global __cublasDrotmg_v2 - __cublasDrotmg_v2 = dlsym(RTLD_DEFAULT, 'cublasDrotmg_v2') - if __cublasDrotmg_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDrotmg_v2 = dlsym(handle, 'cublasDrotmg_v2') - - global __cublasRotmgEx - __cublasRotmgEx = dlsym(RTLD_DEFAULT, 'cublasRotmgEx') - if __cublasRotmgEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasRotmgEx = dlsym(handle, 'cublasRotmgEx') - - global __cublasSgemv_v2 - __cublasSgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasSgemv_v2') - if __cublasSgemv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemv_v2 = dlsym(handle, 'cublasSgemv_v2') - - global __cublasDgemv_v2 - __cublasDgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasDgemv_v2') - if __cublasDgemv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemv_v2 = dlsym(handle, 'cublasDgemv_v2') - - global __cublasCgemv_v2 - __cublasCgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasCgemv_v2') - if __cublasCgemv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemv_v2 = dlsym(handle, 'cublasCgemv_v2') - - global __cublasZgemv_v2 - __cublasZgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasZgemv_v2') - if __cublasZgemv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemv_v2 = dlsym(handle, 'cublasZgemv_v2') - - global __cublasSgbmv_v2 - __cublasSgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasSgbmv_v2') - if __cublasSgbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgbmv_v2 = dlsym(handle, 'cublasSgbmv_v2') - - global __cublasDgbmv_v2 - __cublasDgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDgbmv_v2') - if __cublasDgbmv_v2 == NULL: - if handle == 
NULL: - handle = load_library(driver_ver) - __cublasDgbmv_v2 = dlsym(handle, 'cublasDgbmv_v2') - - global __cublasCgbmv_v2 - __cublasCgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasCgbmv_v2') - if __cublasCgbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgbmv_v2 = dlsym(handle, 'cublasCgbmv_v2') - - global __cublasZgbmv_v2 - __cublasZgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZgbmv_v2') - if __cublasZgbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgbmv_v2 = dlsym(handle, 'cublasZgbmv_v2') - - global __cublasStrmv_v2 - __cublasStrmv_v2 = dlsym(RTLD_DEFAULT, 'cublasStrmv_v2') - if __cublasStrmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrmv_v2 = dlsym(handle, 'cublasStrmv_v2') - - global __cublasDtrmv_v2 - __cublasDtrmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrmv_v2') - if __cublasDtrmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrmv_v2 = dlsym(handle, 'cublasDtrmv_v2') - - global __cublasCtrmv_v2 - __cublasCtrmv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtrmv_v2') - if __cublasCtrmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrmv_v2 = dlsym(handle, 'cublasCtrmv_v2') - - global __cublasZtrmv_v2 - __cublasZtrmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrmv_v2') - if __cublasZtrmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrmv_v2 = dlsym(handle, 'cublasZtrmv_v2') - - global __cublasStbmv_v2 - __cublasStbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasStbmv_v2') - if __cublasStbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStbmv_v2 = dlsym(handle, 'cublasStbmv_v2') - - global __cublasDtbmv_v2 - __cublasDtbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtbmv_v2') - if __cublasDtbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtbmv_v2 = dlsym(handle, 'cublasDtbmv_v2') - - global __cublasCtbmv_v2 - __cublasCtbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtbmv_v2') - if __cublasCtbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtbmv_v2 = dlsym(handle, 'cublasCtbmv_v2') - - global __cublasZtbmv_v2 - __cublasZtbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtbmv_v2') - if __cublasZtbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtbmv_v2 = dlsym(handle, 'cublasZtbmv_v2') - - global __cublasStpmv_v2 - __cublasStpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasStpmv_v2') - if __cublasStpmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStpmv_v2 = dlsym(handle, 'cublasStpmv_v2') - - global __cublasDtpmv_v2 - __cublasDtpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtpmv_v2') - if __cublasDtpmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtpmv_v2 = dlsym(handle, 'cublasDtpmv_v2') - - global __cublasCtpmv_v2 - __cublasCtpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtpmv_v2') - if __cublasCtpmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtpmv_v2 = dlsym(handle, 'cublasCtpmv_v2') - - global __cublasZtpmv_v2 - __cublasZtpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtpmv_v2') - if __cublasZtpmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtpmv_v2 = dlsym(handle, 'cublasZtpmv_v2') - - global __cublasStrsv_v2 - __cublasStrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasStrsv_v2') - if __cublasStrsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrsv_v2 = dlsym(handle, 'cublasStrsv_v2') - - global __cublasDtrsv_v2 - 
__cublasDtrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrsv_v2') - if __cublasDtrsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrsv_v2 = dlsym(handle, 'cublasDtrsv_v2') - - global __cublasCtrsv_v2 - __cublasCtrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtrsv_v2') - if __cublasCtrsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrsv_v2 = dlsym(handle, 'cublasCtrsv_v2') - - global __cublasZtrsv_v2 - __cublasZtrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrsv_v2') - if __cublasZtrsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrsv_v2 = dlsym(handle, 'cublasZtrsv_v2') - - global __cublasStpsv_v2 - __cublasStpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasStpsv_v2') - if __cublasStpsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStpsv_v2 = dlsym(handle, 'cublasStpsv_v2') - - global __cublasDtpsv_v2 - __cublasDtpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtpsv_v2') - if __cublasDtpsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtpsv_v2 = dlsym(handle, 'cublasDtpsv_v2') - - global __cublasCtpsv_v2 - __cublasCtpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtpsv_v2') - if __cublasCtpsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtpsv_v2 = dlsym(handle, 'cublasCtpsv_v2') - - global __cublasZtpsv_v2 - __cublasZtpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtpsv_v2') - if __cublasZtpsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtpsv_v2 = dlsym(handle, 'cublasZtpsv_v2') - - global __cublasStbsv_v2 - __cublasStbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasStbsv_v2') - if __cublasStbsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStbsv_v2 = dlsym(handle, 'cublasStbsv_v2') - - global __cublasDtbsv_v2 - __cublasDtbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtbsv_v2') - if __cublasDtbsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtbsv_v2 = dlsym(handle, 'cublasDtbsv_v2') - - global __cublasCtbsv_v2 - __cublasCtbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtbsv_v2') - if __cublasCtbsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtbsv_v2 = dlsym(handle, 'cublasCtbsv_v2') - - global __cublasZtbsv_v2 - __cublasZtbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtbsv_v2') - if __cublasZtbsv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtbsv_v2 = dlsym(handle, 'cublasZtbsv_v2') - - global __cublasSsymv_v2 - __cublasSsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasSsymv_v2') - if __cublasSsymv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsymv_v2 = dlsym(handle, 'cublasSsymv_v2') - - global __cublasDsymv_v2 - __cublasDsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasDsymv_v2') - if __cublasDsymv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsymv_v2 = dlsym(handle, 'cublasDsymv_v2') - - global __cublasCsymv_v2 - __cublasCsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasCsymv_v2') - if __cublasCsymv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsymv_v2 = dlsym(handle, 'cublasCsymv_v2') - - global __cublasZsymv_v2 - __cublasZsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasZsymv_v2') - if __cublasZsymv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsymv_v2 = dlsym(handle, 'cublasZsymv_v2') - - global __cublasChemv_v2 - __cublasChemv_v2 = dlsym(RTLD_DEFAULT, 'cublasChemv_v2') - if __cublasChemv_v2 == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cublasChemv_v2 = dlsym(handle, 'cublasChemv_v2') - - global __cublasZhemv_v2 - __cublasZhemv_v2 = dlsym(RTLD_DEFAULT, 'cublasZhemv_v2') - if __cublasZhemv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhemv_v2 = dlsym(handle, 'cublasZhemv_v2') - - global __cublasSsbmv_v2 - __cublasSsbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasSsbmv_v2') - if __cublasSsbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsbmv_v2 = dlsym(handle, 'cublasSsbmv_v2') - - global __cublasDsbmv_v2 - __cublasDsbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDsbmv_v2') - if __cublasDsbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsbmv_v2 = dlsym(handle, 'cublasDsbmv_v2') - - global __cublasChbmv_v2 - __cublasChbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasChbmv_v2') - if __cublasChbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChbmv_v2 = dlsym(handle, 'cublasChbmv_v2') - - global __cublasZhbmv_v2 - __cublasZhbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZhbmv_v2') - if __cublasZhbmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhbmv_v2 = dlsym(handle, 'cublasZhbmv_v2') - - global __cublasSspmv_v2 - __cublasSspmv_v2 = dlsym(RTLD_DEFAULT, 'cublasSspmv_v2') - if __cublasSspmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSspmv_v2 = dlsym(handle, 'cublasSspmv_v2') - - global __cublasDspmv_v2 - __cublasDspmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDspmv_v2') - if __cublasDspmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDspmv_v2 = dlsym(handle, 'cublasDspmv_v2') - - global __cublasChpmv_v2 - __cublasChpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasChpmv_v2') - if __cublasChpmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChpmv_v2 = dlsym(handle, 'cublasChpmv_v2') - - global __cublasZhpmv_v2 - __cublasZhpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZhpmv_v2') - if __cublasZhpmv_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhpmv_v2 = dlsym(handle, 'cublasZhpmv_v2') - - global __cublasSger_v2 - __cublasSger_v2 = dlsym(RTLD_DEFAULT, 'cublasSger_v2') - if __cublasSger_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSger_v2 = dlsym(handle, 'cublasSger_v2') - - global __cublasDger_v2 - __cublasDger_v2 = dlsym(RTLD_DEFAULT, 'cublasDger_v2') - if __cublasDger_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDger_v2 = dlsym(handle, 'cublasDger_v2') - - global __cublasCgeru_v2 - __cublasCgeru_v2 = dlsym(RTLD_DEFAULT, 'cublasCgeru_v2') - if __cublasCgeru_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgeru_v2 = dlsym(handle, 'cublasCgeru_v2') - - global __cublasCgerc_v2 - __cublasCgerc_v2 = dlsym(RTLD_DEFAULT, 'cublasCgerc_v2') - if __cublasCgerc_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgerc_v2 = dlsym(handle, 'cublasCgerc_v2') - - global __cublasZgeru_v2 - __cublasZgeru_v2 = dlsym(RTLD_DEFAULT, 'cublasZgeru_v2') - if __cublasZgeru_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgeru_v2 = dlsym(handle, 'cublasZgeru_v2') - - global __cublasZgerc_v2 - __cublasZgerc_v2 = dlsym(RTLD_DEFAULT, 'cublasZgerc_v2') - if __cublasZgerc_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgerc_v2 = dlsym(handle, 'cublasZgerc_v2') - - global __cublasSsyr_v2 - __cublasSsyr_v2 = dlsym(RTLD_DEFAULT, 
'cublasSsyr_v2') - if __cublasSsyr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyr_v2 = dlsym(handle, 'cublasSsyr_v2') - - global __cublasDsyr_v2 - __cublasDsyr_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyr_v2') - if __cublasDsyr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyr_v2 = dlsym(handle, 'cublasDsyr_v2') - - global __cublasCsyr_v2 - __cublasCsyr_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyr_v2') - if __cublasCsyr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyr_v2 = dlsym(handle, 'cublasCsyr_v2') - - global __cublasZsyr_v2 - __cublasZsyr_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyr_v2') - if __cublasZsyr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyr_v2 = dlsym(handle, 'cublasZsyr_v2') - - global __cublasCher_v2 - __cublasCher_v2 = dlsym(RTLD_DEFAULT, 'cublasCher_v2') - if __cublasCher_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCher_v2 = dlsym(handle, 'cublasCher_v2') - - global __cublasZher_v2 - __cublasZher_v2 = dlsym(RTLD_DEFAULT, 'cublasZher_v2') - if __cublasZher_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZher_v2 = dlsym(handle, 'cublasZher_v2') - - global __cublasSspr_v2 - __cublasSspr_v2 = dlsym(RTLD_DEFAULT, 'cublasSspr_v2') - if __cublasSspr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSspr_v2 = dlsym(handle, 'cublasSspr_v2') - - global __cublasDspr_v2 - __cublasDspr_v2 = dlsym(RTLD_DEFAULT, 'cublasDspr_v2') - if __cublasDspr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDspr_v2 = dlsym(handle, 'cublasDspr_v2') - - global __cublasChpr_v2 - __cublasChpr_v2 = dlsym(RTLD_DEFAULT, 'cublasChpr_v2') - if __cublasChpr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChpr_v2 = dlsym(handle, 'cublasChpr_v2') - - global __cublasZhpr_v2 - __cublasZhpr_v2 = dlsym(RTLD_DEFAULT, 'cublasZhpr_v2') - if __cublasZhpr_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhpr_v2 = dlsym(handle, 'cublasZhpr_v2') - - global __cublasSsyr2_v2 - __cublasSsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasSsyr2_v2') - if __cublasSsyr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyr2_v2 = dlsym(handle, 'cublasSsyr2_v2') - - global __cublasDsyr2_v2 - __cublasDsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyr2_v2') - if __cublasDsyr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyr2_v2 = dlsym(handle, 'cublasDsyr2_v2') - - global __cublasCsyr2_v2 - __cublasCsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyr2_v2') - if __cublasCsyr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyr2_v2 = dlsym(handle, 'cublasCsyr2_v2') - - global __cublasZsyr2_v2 - __cublasZsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyr2_v2') - if __cublasZsyr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyr2_v2 = dlsym(handle, 'cublasZsyr2_v2') - - global __cublasCher2_v2 - __cublasCher2_v2 = dlsym(RTLD_DEFAULT, 'cublasCher2_v2') - if __cublasCher2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCher2_v2 = dlsym(handle, 'cublasCher2_v2') - - global __cublasZher2_v2 - __cublasZher2_v2 = dlsym(RTLD_DEFAULT, 'cublasZher2_v2') - if __cublasZher2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZher2_v2 = dlsym(handle, 'cublasZher2_v2') - - global __cublasSspr2_v2 - 
__cublasSspr2_v2 = dlsym(RTLD_DEFAULT, 'cublasSspr2_v2') - if __cublasSspr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSspr2_v2 = dlsym(handle, 'cublasSspr2_v2') - - global __cublasDspr2_v2 - __cublasDspr2_v2 = dlsym(RTLD_DEFAULT, 'cublasDspr2_v2') - if __cublasDspr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDspr2_v2 = dlsym(handle, 'cublasDspr2_v2') - - global __cublasChpr2_v2 - __cublasChpr2_v2 = dlsym(RTLD_DEFAULT, 'cublasChpr2_v2') - if __cublasChpr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChpr2_v2 = dlsym(handle, 'cublasChpr2_v2') - - global __cublasZhpr2_v2 - __cublasZhpr2_v2 = dlsym(RTLD_DEFAULT, 'cublasZhpr2_v2') - if __cublasZhpr2_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhpr2_v2 = dlsym(handle, 'cublasZhpr2_v2') - - global __cublasSgemm_v2 - __cublasSgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasSgemm_v2') - if __cublasSgemm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemm_v2 = dlsym(handle, 'cublasSgemm_v2') - - global __cublasDgemm_v2 - __cublasDgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasDgemm_v2') - if __cublasDgemm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemm_v2 = dlsym(handle, 'cublasDgemm_v2') - - global __cublasCgemm_v2 - __cublasCgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasCgemm_v2') - if __cublasCgemm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm_v2 = dlsym(handle, 'cublasCgemm_v2') - - global __cublasCgemm3m - __cublasCgemm3m = dlsym(RTLD_DEFAULT, 'cublasCgemm3m') - if __cublasCgemm3m == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3m = dlsym(handle, 'cublasCgemm3m') - - global __cublasCgemm3mEx - __cublasCgemm3mEx = dlsym(RTLD_DEFAULT, 'cublasCgemm3mEx') - if __cublasCgemm3mEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3mEx = dlsym(handle, 'cublasCgemm3mEx') - - global __cublasZgemm_v2 - __cublasZgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasZgemm_v2') - if __cublasZgemm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemm_v2 = dlsym(handle, 'cublasZgemm_v2') - - global __cublasZgemm3m - __cublasZgemm3m = dlsym(RTLD_DEFAULT, 'cublasZgemm3m') - if __cublasZgemm3m == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemm3m = dlsym(handle, 'cublasZgemm3m') - - global __cublasSgemmEx - __cublasSgemmEx = dlsym(RTLD_DEFAULT, 'cublasSgemmEx') - if __cublasSgemmEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmEx = dlsym(handle, 'cublasSgemmEx') - - global __cublasGemmEx - __cublasGemmEx = dlsym(RTLD_DEFAULT, 'cublasGemmEx') - if __cublasGemmEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmEx = dlsym(handle, 'cublasGemmEx') - - global __cublasCgemmEx - __cublasCgemmEx = dlsym(RTLD_DEFAULT, 'cublasCgemmEx') - if __cublasCgemmEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemmEx = dlsym(handle, 'cublasCgemmEx') - - global __cublasUint8gemmBias - __cublasUint8gemmBias = dlsym(RTLD_DEFAULT, 'cublasUint8gemmBias') - if __cublasUint8gemmBias == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasUint8gemmBias = dlsym(handle, 'cublasUint8gemmBias') - - global __cublasSsyrk_v2 - __cublasSsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasSsyrk_v2') - if __cublasSsyrk_v2 == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cublasSsyrk_v2 = dlsym(handle, 'cublasSsyrk_v2') - - global __cublasDsyrk_v2 - __cublasDsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyrk_v2') - if __cublasDsyrk_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyrk_v2 = dlsym(handle, 'cublasDsyrk_v2') - - global __cublasCsyrk_v2 - __cublasCsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyrk_v2') - if __cublasCsyrk_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrk_v2 = dlsym(handle, 'cublasCsyrk_v2') - - global __cublasZsyrk_v2 - __cublasZsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyrk_v2') - if __cublasZsyrk_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyrk_v2 = dlsym(handle, 'cublasZsyrk_v2') - - global __cublasCsyrkEx - __cublasCsyrkEx = dlsym(RTLD_DEFAULT, 'cublasCsyrkEx') - if __cublasCsyrkEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrkEx = dlsym(handle, 'cublasCsyrkEx') - - global __cublasCsyrk3mEx - __cublasCsyrk3mEx = dlsym(RTLD_DEFAULT, 'cublasCsyrk3mEx') - if __cublasCsyrk3mEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrk3mEx = dlsym(handle, 'cublasCsyrk3mEx') - - global __cublasCherk_v2 - __cublasCherk_v2 = dlsym(RTLD_DEFAULT, 'cublasCherk_v2') - if __cublasCherk_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherk_v2 = dlsym(handle, 'cublasCherk_v2') - - global __cublasZherk_v2 - __cublasZherk_v2 = dlsym(RTLD_DEFAULT, 'cublasZherk_v2') - if __cublasZherk_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZherk_v2 = dlsym(handle, 'cublasZherk_v2') - - global __cublasCherkEx - __cublasCherkEx = dlsym(RTLD_DEFAULT, 'cublasCherkEx') - if __cublasCherkEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherkEx = dlsym(handle, 'cublasCherkEx') - - global __cublasCherk3mEx - __cublasCherk3mEx = dlsym(RTLD_DEFAULT, 'cublasCherk3mEx') - if __cublasCherk3mEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherk3mEx = dlsym(handle, 'cublasCherk3mEx') - - global __cublasSsyr2k_v2 - __cublasSsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasSsyr2k_v2') - if __cublasSsyr2k_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyr2k_v2 = dlsym(handle, 'cublasSsyr2k_v2') - - global __cublasDsyr2k_v2 - __cublasDsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyr2k_v2') - if __cublasDsyr2k_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyr2k_v2 = dlsym(handle, 'cublasDsyr2k_v2') - - global __cublasCsyr2k_v2 - __cublasCsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyr2k_v2') - if __cublasCsyr2k_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyr2k_v2 = dlsym(handle, 'cublasCsyr2k_v2') - - global __cublasZsyr2k_v2 - __cublasZsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyr2k_v2') - if __cublasZsyr2k_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyr2k_v2 = dlsym(handle, 'cublasZsyr2k_v2') - - global __cublasCher2k_v2 - __cublasCher2k_v2 = dlsym(RTLD_DEFAULT, 'cublasCher2k_v2') - if __cublasCher2k_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCher2k_v2 = dlsym(handle, 'cublasCher2k_v2') - - global __cublasZher2k_v2 - __cublasZher2k_v2 = dlsym(RTLD_DEFAULT, 'cublasZher2k_v2') - if __cublasZher2k_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZher2k_v2 = dlsym(handle, 'cublasZher2k_v2') - - global 
__cublasSsyrkx - __cublasSsyrkx = dlsym(RTLD_DEFAULT, 'cublasSsyrkx') - if __cublasSsyrkx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyrkx = dlsym(handle, 'cublasSsyrkx') - - global __cublasDsyrkx - __cublasDsyrkx = dlsym(RTLD_DEFAULT, 'cublasDsyrkx') - if __cublasDsyrkx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyrkx = dlsym(handle, 'cublasDsyrkx') - - global __cublasCsyrkx - __cublasCsyrkx = dlsym(RTLD_DEFAULT, 'cublasCsyrkx') - if __cublasCsyrkx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrkx = dlsym(handle, 'cublasCsyrkx') - - global __cublasZsyrkx - __cublasZsyrkx = dlsym(RTLD_DEFAULT, 'cublasZsyrkx') - if __cublasZsyrkx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyrkx = dlsym(handle, 'cublasZsyrkx') - - global __cublasCherkx - __cublasCherkx = dlsym(RTLD_DEFAULT, 'cublasCherkx') - if __cublasCherkx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherkx = dlsym(handle, 'cublasCherkx') - - global __cublasZherkx - __cublasZherkx = dlsym(RTLD_DEFAULT, 'cublasZherkx') - if __cublasZherkx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZherkx = dlsym(handle, 'cublasZherkx') - - global __cublasSsymm_v2 - __cublasSsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasSsymm_v2') - if __cublasSsymm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsymm_v2 = dlsym(handle, 'cublasSsymm_v2') - - global __cublasDsymm_v2 - __cublasDsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasDsymm_v2') - if __cublasDsymm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsymm_v2 = dlsym(handle, 'cublasDsymm_v2') - - global __cublasCsymm_v2 - __cublasCsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasCsymm_v2') - if __cublasCsymm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsymm_v2 = dlsym(handle, 'cublasCsymm_v2') - - global __cublasZsymm_v2 - __cublasZsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasZsymm_v2') - if __cublasZsymm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsymm_v2 = dlsym(handle, 'cublasZsymm_v2') - - global __cublasChemm_v2 - __cublasChemm_v2 = dlsym(RTLD_DEFAULT, 'cublasChemm_v2') - if __cublasChemm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChemm_v2 = dlsym(handle, 'cublasChemm_v2') - - global __cublasZhemm_v2 - __cublasZhemm_v2 = dlsym(RTLD_DEFAULT, 'cublasZhemm_v2') - if __cublasZhemm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhemm_v2 = dlsym(handle, 'cublasZhemm_v2') - - global __cublasStrsm_v2 - __cublasStrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasStrsm_v2') - if __cublasStrsm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrsm_v2 = dlsym(handle, 'cublasStrsm_v2') - - global __cublasDtrsm_v2 - __cublasDtrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrsm_v2') - if __cublasDtrsm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrsm_v2 = dlsym(handle, 'cublasDtrsm_v2') - - global __cublasCtrsm_v2 - __cublasCtrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasCtrsm_v2') - if __cublasCtrsm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrsm_v2 = dlsym(handle, 'cublasCtrsm_v2') - - global __cublasZtrsm_v2 - __cublasZtrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrsm_v2') - if __cublasZtrsm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrsm_v2 = dlsym(handle, 
'cublasZtrsm_v2') - - global __cublasStrmm_v2 - __cublasStrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasStrmm_v2') - if __cublasStrmm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrmm_v2 = dlsym(handle, 'cublasStrmm_v2') - - global __cublasDtrmm_v2 - __cublasDtrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrmm_v2') - if __cublasDtrmm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrmm_v2 = dlsym(handle, 'cublasDtrmm_v2') - - global __cublasCtrmm_v2 - __cublasCtrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasCtrmm_v2') - if __cublasCtrmm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrmm_v2 = dlsym(handle, 'cublasCtrmm_v2') - - global __cublasZtrmm_v2 - __cublasZtrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrmm_v2') - if __cublasZtrmm_v2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrmm_v2 = dlsym(handle, 'cublasZtrmm_v2') - - global __cublasSgemmBatched - __cublasSgemmBatched = dlsym(RTLD_DEFAULT, 'cublasSgemmBatched') - if __cublasSgemmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmBatched = dlsym(handle, 'cublasSgemmBatched') - - global __cublasDgemmBatched - __cublasDgemmBatched = dlsym(RTLD_DEFAULT, 'cublasDgemmBatched') - if __cublasDgemmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemmBatched = dlsym(handle, 'cublasDgemmBatched') - - global __cublasCgemmBatched - __cublasCgemmBatched = dlsym(RTLD_DEFAULT, 'cublasCgemmBatched') - if __cublasCgemmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemmBatched = dlsym(handle, 'cublasCgemmBatched') - - global __cublasCgemm3mBatched - __cublasCgemm3mBatched = dlsym(RTLD_DEFAULT, 'cublasCgemm3mBatched') - if __cublasCgemm3mBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3mBatched = dlsym(handle, 'cublasCgemm3mBatched') - - global __cublasZgemmBatched - __cublasZgemmBatched = dlsym(RTLD_DEFAULT, 'cublasZgemmBatched') - if __cublasZgemmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemmBatched = dlsym(handle, 'cublasZgemmBatched') - - global __cublasGemmBatchedEx - __cublasGemmBatchedEx = dlsym(RTLD_DEFAULT, 'cublasGemmBatchedEx') - if __cublasGemmBatchedEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmBatchedEx = dlsym(handle, 'cublasGemmBatchedEx') - - global __cublasGemmStridedBatchedEx - __cublasGemmStridedBatchedEx = dlsym(RTLD_DEFAULT, 'cublasGemmStridedBatchedEx') - if __cublasGemmStridedBatchedEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmStridedBatchedEx = dlsym(handle, 'cublasGemmStridedBatchedEx') - - global __cublasSgemmStridedBatched - __cublasSgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasSgemmStridedBatched') - if __cublasSgemmStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmStridedBatched = dlsym(handle, 'cublasSgemmStridedBatched') - - global __cublasDgemmStridedBatched - __cublasDgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasDgemmStridedBatched') - if __cublasDgemmStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemmStridedBatched = dlsym(handle, 'cublasDgemmStridedBatched') - - global __cublasCgemmStridedBatched - __cublasCgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasCgemmStridedBatched') - if __cublasCgemmStridedBatched == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cublasCgemmStridedBatched = dlsym(handle, 'cublasCgemmStridedBatched') - - global __cublasCgemm3mStridedBatched - __cublasCgemm3mStridedBatched = dlsym(RTLD_DEFAULT, 'cublasCgemm3mStridedBatched') - if __cublasCgemm3mStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3mStridedBatched = dlsym(handle, 'cublasCgemm3mStridedBatched') - - global __cublasZgemmStridedBatched - __cublasZgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasZgemmStridedBatched') - if __cublasZgemmStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemmStridedBatched = dlsym(handle, 'cublasZgemmStridedBatched') - - global __cublasSgeam - __cublasSgeam = dlsym(RTLD_DEFAULT, 'cublasSgeam') - if __cublasSgeam == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgeam = dlsym(handle, 'cublasSgeam') - - global __cublasDgeam - __cublasDgeam = dlsym(RTLD_DEFAULT, 'cublasDgeam') - if __cublasDgeam == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgeam = dlsym(handle, 'cublasDgeam') - - global __cublasCgeam - __cublasCgeam = dlsym(RTLD_DEFAULT, 'cublasCgeam') - if __cublasCgeam == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgeam = dlsym(handle, 'cublasCgeam') - - global __cublasZgeam - __cublasZgeam = dlsym(RTLD_DEFAULT, 'cublasZgeam') - if __cublasZgeam == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgeam = dlsym(handle, 'cublasZgeam') - - global __cublasSgetrfBatched - __cublasSgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasSgetrfBatched') - if __cublasSgetrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgetrfBatched = dlsym(handle, 'cublasSgetrfBatched') - - global __cublasDgetrfBatched - __cublasDgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasDgetrfBatched') - if __cublasDgetrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgetrfBatched = dlsym(handle, 'cublasDgetrfBatched') - - global __cublasCgetrfBatched - __cublasCgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasCgetrfBatched') - if __cublasCgetrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgetrfBatched = dlsym(handle, 'cublasCgetrfBatched') - - global __cublasZgetrfBatched - __cublasZgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasZgetrfBatched') - if __cublasZgetrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgetrfBatched = dlsym(handle, 'cublasZgetrfBatched') - - global __cublasSgetriBatched - __cublasSgetriBatched = dlsym(RTLD_DEFAULT, 'cublasSgetriBatched') - if __cublasSgetriBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgetriBatched = dlsym(handle, 'cublasSgetriBatched') - - global __cublasDgetriBatched - __cublasDgetriBatched = dlsym(RTLD_DEFAULT, 'cublasDgetriBatched') - if __cublasDgetriBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgetriBatched = dlsym(handle, 'cublasDgetriBatched') - - global __cublasCgetriBatched - __cublasCgetriBatched = dlsym(RTLD_DEFAULT, 'cublasCgetriBatched') - if __cublasCgetriBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgetriBatched = dlsym(handle, 'cublasCgetriBatched') - - global __cublasZgetriBatched - __cublasZgetriBatched = dlsym(RTLD_DEFAULT, 'cublasZgetriBatched') - if __cublasZgetriBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cublasZgetriBatched = dlsym(handle, 'cublasZgetriBatched') - - global __cublasSgetrsBatched - __cublasSgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasSgetrsBatched') - if __cublasSgetrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgetrsBatched = dlsym(handle, 'cublasSgetrsBatched') - - global __cublasDgetrsBatched - __cublasDgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasDgetrsBatched') - if __cublasDgetrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgetrsBatched = dlsym(handle, 'cublasDgetrsBatched') - - global __cublasCgetrsBatched - __cublasCgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasCgetrsBatched') - if __cublasCgetrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgetrsBatched = dlsym(handle, 'cublasCgetrsBatched') - - global __cublasZgetrsBatched - __cublasZgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasZgetrsBatched') - if __cublasZgetrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgetrsBatched = dlsym(handle, 'cublasZgetrsBatched') - - global __cublasStrsmBatched - __cublasStrsmBatched = dlsym(RTLD_DEFAULT, 'cublasStrsmBatched') - if __cublasStrsmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrsmBatched = dlsym(handle, 'cublasStrsmBatched') - - global __cublasDtrsmBatched - __cublasDtrsmBatched = dlsym(RTLD_DEFAULT, 'cublasDtrsmBatched') - if __cublasDtrsmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrsmBatched = dlsym(handle, 'cublasDtrsmBatched') - - global __cublasCtrsmBatched - __cublasCtrsmBatched = dlsym(RTLD_DEFAULT, 'cublasCtrsmBatched') - if __cublasCtrsmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrsmBatched = dlsym(handle, 'cublasCtrsmBatched') - - global __cublasZtrsmBatched - __cublasZtrsmBatched = dlsym(RTLD_DEFAULT, 'cublasZtrsmBatched') - if __cublasZtrsmBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrsmBatched = dlsym(handle, 'cublasZtrsmBatched') - - global __cublasSmatinvBatched - __cublasSmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasSmatinvBatched') - if __cublasSmatinvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSmatinvBatched = dlsym(handle, 'cublasSmatinvBatched') - - global __cublasDmatinvBatched - __cublasDmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasDmatinvBatched') - if __cublasDmatinvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDmatinvBatched = dlsym(handle, 'cublasDmatinvBatched') - - global __cublasCmatinvBatched - __cublasCmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasCmatinvBatched') - if __cublasCmatinvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCmatinvBatched = dlsym(handle, 'cublasCmatinvBatched') - - global __cublasZmatinvBatched - __cublasZmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasZmatinvBatched') - if __cublasZmatinvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZmatinvBatched = dlsym(handle, 'cublasZmatinvBatched') - - global __cublasSgeqrfBatched - __cublasSgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasSgeqrfBatched') - if __cublasSgeqrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgeqrfBatched = dlsym(handle, 'cublasSgeqrfBatched') - - global __cublasDgeqrfBatched - __cublasDgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasDgeqrfBatched') - if __cublasDgeqrfBatched == NULL: - 
if handle == NULL: - handle = load_library(driver_ver) - __cublasDgeqrfBatched = dlsym(handle, 'cublasDgeqrfBatched') - - global __cublasCgeqrfBatched - __cublasCgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasCgeqrfBatched') - if __cublasCgeqrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgeqrfBatched = dlsym(handle, 'cublasCgeqrfBatched') - - global __cublasZgeqrfBatched - __cublasZgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasZgeqrfBatched') - if __cublasZgeqrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgeqrfBatched = dlsym(handle, 'cublasZgeqrfBatched') - - global __cublasSgelsBatched - __cublasSgelsBatched = dlsym(RTLD_DEFAULT, 'cublasSgelsBatched') - if __cublasSgelsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgelsBatched = dlsym(handle, 'cublasSgelsBatched') - - global __cublasDgelsBatched - __cublasDgelsBatched = dlsym(RTLD_DEFAULT, 'cublasDgelsBatched') - if __cublasDgelsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgelsBatched = dlsym(handle, 'cublasDgelsBatched') - - global __cublasCgelsBatched - __cublasCgelsBatched = dlsym(RTLD_DEFAULT, 'cublasCgelsBatched') - if __cublasCgelsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgelsBatched = dlsym(handle, 'cublasCgelsBatched') - - global __cublasZgelsBatched - __cublasZgelsBatched = dlsym(RTLD_DEFAULT, 'cublasZgelsBatched') - if __cublasZgelsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgelsBatched = dlsym(handle, 'cublasZgelsBatched') - - global __cublasSdgmm - __cublasSdgmm = dlsym(RTLD_DEFAULT, 'cublasSdgmm') - if __cublasSdgmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSdgmm = dlsym(handle, 'cublasSdgmm') - - global __cublasDdgmm - __cublasDdgmm = dlsym(RTLD_DEFAULT, 'cublasDdgmm') - if __cublasDdgmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDdgmm = dlsym(handle, 'cublasDdgmm') - - global __cublasCdgmm - __cublasCdgmm = dlsym(RTLD_DEFAULT, 'cublasCdgmm') - if __cublasCdgmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCdgmm = dlsym(handle, 'cublasCdgmm') - - global __cublasZdgmm - __cublasZdgmm = dlsym(RTLD_DEFAULT, 'cublasZdgmm') - if __cublasZdgmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdgmm = dlsym(handle, 'cublasZdgmm') - - global __cublasStpttr - __cublasStpttr = dlsym(RTLD_DEFAULT, 'cublasStpttr') - if __cublasStpttr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStpttr = dlsym(handle, 'cublasStpttr') - - global __cublasDtpttr - __cublasDtpttr = dlsym(RTLD_DEFAULT, 'cublasDtpttr') - if __cublasDtpttr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtpttr = dlsym(handle, 'cublasDtpttr') - - global __cublasCtpttr - __cublasCtpttr = dlsym(RTLD_DEFAULT, 'cublasCtpttr') - if __cublasCtpttr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtpttr = dlsym(handle, 'cublasCtpttr') - - global __cublasZtpttr - __cublasZtpttr = dlsym(RTLD_DEFAULT, 'cublasZtpttr') - if __cublasZtpttr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtpttr = dlsym(handle, 'cublasZtpttr') - - global __cublasStrttp - __cublasStrttp = dlsym(RTLD_DEFAULT, 'cublasStrttp') - if __cublasStrttp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrttp = dlsym(handle, 
'cublasStrttp') - - global __cublasDtrttp - __cublasDtrttp = dlsym(RTLD_DEFAULT, 'cublasDtrttp') - if __cublasDtrttp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrttp = dlsym(handle, 'cublasDtrttp') - - global __cublasCtrttp - __cublasCtrttp = dlsym(RTLD_DEFAULT, 'cublasCtrttp') - if __cublasCtrttp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrttp = dlsym(handle, 'cublasCtrttp') - - global __cublasZtrttp - __cublasZtrttp = dlsym(RTLD_DEFAULT, 'cublasZtrttp') - if __cublasZtrttp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrttp = dlsym(handle, 'cublasZtrttp') - - global __cublasGetSmCountTarget - __cublasGetSmCountTarget = dlsym(RTLD_DEFAULT, 'cublasGetSmCountTarget') - if __cublasGetSmCountTarget == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetSmCountTarget = dlsym(handle, 'cublasGetSmCountTarget') - - global __cublasSetSmCountTarget - __cublasSetSmCountTarget = dlsym(RTLD_DEFAULT, 'cublasSetSmCountTarget') - if __cublasSetSmCountTarget == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetSmCountTarget = dlsym(handle, 'cublasSetSmCountTarget') - - global __cublasGetStatusName - __cublasGetStatusName = dlsym(RTLD_DEFAULT, 'cublasGetStatusName') - if __cublasGetStatusName == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetStatusName = dlsym(handle, 'cublasGetStatusName') - - global __cublasGetStatusString - __cublasGetStatusString = dlsym(RTLD_DEFAULT, 'cublasGetStatusString') - if __cublasGetStatusString == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetStatusString = dlsym(handle, 'cublasGetStatusString') - - global __cublasSgemvBatched - __cublasSgemvBatched = dlsym(RTLD_DEFAULT, 'cublasSgemvBatched') - if __cublasSgemvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemvBatched = dlsym(handle, 'cublasSgemvBatched') - - global __cublasDgemvBatched - __cublasDgemvBatched = dlsym(RTLD_DEFAULT, 'cublasDgemvBatched') - if __cublasDgemvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemvBatched = dlsym(handle, 'cublasDgemvBatched') - - global __cublasCgemvBatched - __cublasCgemvBatched = dlsym(RTLD_DEFAULT, 'cublasCgemvBatched') - if __cublasCgemvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemvBatched = dlsym(handle, 'cublasCgemvBatched') - - global __cublasZgemvBatched - __cublasZgemvBatched = dlsym(RTLD_DEFAULT, 'cublasZgemvBatched') - if __cublasZgemvBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemvBatched = dlsym(handle, 'cublasZgemvBatched') - - global __cublasSgemvStridedBatched - __cublasSgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasSgemvStridedBatched') - if __cublasSgemvStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemvStridedBatched = dlsym(handle, 'cublasSgemvStridedBatched') - - global __cublasDgemvStridedBatched - __cublasDgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasDgemvStridedBatched') - if __cublasDgemvStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemvStridedBatched = dlsym(handle, 'cublasDgemvStridedBatched') - - global __cublasCgemvStridedBatched - __cublasCgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasCgemvStridedBatched') - if __cublasCgemvStridedBatched == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cublasCgemvStridedBatched = dlsym(handle, 'cublasCgemvStridedBatched') - - global __cublasZgemvStridedBatched - __cublasZgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasZgemvStridedBatched') - if __cublasZgemvStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemvStridedBatched = dlsym(handle, 'cublasZgemvStridedBatched') - - global __cublasSetVector_64 - __cublasSetVector_64 = dlsym(RTLD_DEFAULT, 'cublasSetVector_64') - if __cublasSetVector_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetVector_64 = dlsym(handle, 'cublasSetVector_64') - - global __cublasGetVector_64 - __cublasGetVector_64 = dlsym(RTLD_DEFAULT, 'cublasGetVector_64') - if __cublasGetVector_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetVector_64 = dlsym(handle, 'cublasGetVector_64') - - global __cublasSetMatrix_64 - __cublasSetMatrix_64 = dlsym(RTLD_DEFAULT, 'cublasSetMatrix_64') - if __cublasSetMatrix_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetMatrix_64 = dlsym(handle, 'cublasSetMatrix_64') - - global __cublasGetMatrix_64 - __cublasGetMatrix_64 = dlsym(RTLD_DEFAULT, 'cublasGetMatrix_64') - if __cublasGetMatrix_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetMatrix_64 = dlsym(handle, 'cublasGetMatrix_64') - - global __cublasSetVectorAsync_64 - __cublasSetVectorAsync_64 = dlsym(RTLD_DEFAULT, 'cublasSetVectorAsync_64') - if __cublasSetVectorAsync_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetVectorAsync_64 = dlsym(handle, 'cublasSetVectorAsync_64') - - global __cublasGetVectorAsync_64 - __cublasGetVectorAsync_64 = dlsym(RTLD_DEFAULT, 'cublasGetVectorAsync_64') - if __cublasGetVectorAsync_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetVectorAsync_64 = dlsym(handle, 'cublasGetVectorAsync_64') - - global __cublasSetMatrixAsync_64 - __cublasSetMatrixAsync_64 = dlsym(RTLD_DEFAULT, 'cublasSetMatrixAsync_64') - if __cublasSetMatrixAsync_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetMatrixAsync_64 = dlsym(handle, 'cublasSetMatrixAsync_64') - - global __cublasGetMatrixAsync_64 - __cublasGetMatrixAsync_64 = dlsym(RTLD_DEFAULT, 'cublasGetMatrixAsync_64') - if __cublasGetMatrixAsync_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetMatrixAsync_64 = dlsym(handle, 'cublasGetMatrixAsync_64') - - global __cublasNrm2Ex_64 - __cublasNrm2Ex_64 = dlsym(RTLD_DEFAULT, 'cublasNrm2Ex_64') - if __cublasNrm2Ex_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasNrm2Ex_64 = dlsym(handle, 'cublasNrm2Ex_64') - - global __cublasSnrm2_v2_64 - __cublasSnrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSnrm2_v2_64') - if __cublasSnrm2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSnrm2_v2_64 = dlsym(handle, 'cublasSnrm2_v2_64') - - global __cublasDnrm2_v2_64 - __cublasDnrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDnrm2_v2_64') - if __cublasDnrm2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDnrm2_v2_64 = dlsym(handle, 'cublasDnrm2_v2_64') - - global __cublasScnrm2_v2_64 - __cublasScnrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasScnrm2_v2_64') - if __cublasScnrm2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasScnrm2_v2_64 = dlsym(handle, 'cublasScnrm2_v2_64') - - global __cublasDznrm2_v2_64 - 
__cublasDznrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDznrm2_v2_64') - if __cublasDznrm2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDznrm2_v2_64 = dlsym(handle, 'cublasDznrm2_v2_64') - - global __cublasDotEx_64 - __cublasDotEx_64 = dlsym(RTLD_DEFAULT, 'cublasDotEx_64') - if __cublasDotEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDotEx_64 = dlsym(handle, 'cublasDotEx_64') - - global __cublasDotcEx_64 - __cublasDotcEx_64 = dlsym(RTLD_DEFAULT, 'cublasDotcEx_64') - if __cublasDotcEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDotcEx_64 = dlsym(handle, 'cublasDotcEx_64') - - global __cublasSdot_v2_64 - __cublasSdot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSdot_v2_64') - if __cublasSdot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSdot_v2_64 = dlsym(handle, 'cublasSdot_v2_64') - - global __cublasDdot_v2_64 - __cublasDdot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDdot_v2_64') - if __cublasDdot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDdot_v2_64 = dlsym(handle, 'cublasDdot_v2_64') - - global __cublasCdotu_v2_64 - __cublasCdotu_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCdotu_v2_64') - if __cublasCdotu_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCdotu_v2_64 = dlsym(handle, 'cublasCdotu_v2_64') - - global __cublasCdotc_v2_64 - __cublasCdotc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCdotc_v2_64') - if __cublasCdotc_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCdotc_v2_64 = dlsym(handle, 'cublasCdotc_v2_64') - - global __cublasZdotu_v2_64 - __cublasZdotu_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdotu_v2_64') - if __cublasZdotu_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdotu_v2_64 = dlsym(handle, 'cublasZdotu_v2_64') - - global __cublasZdotc_v2_64 - __cublasZdotc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdotc_v2_64') - if __cublasZdotc_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdotc_v2_64 = dlsym(handle, 'cublasZdotc_v2_64') - - global __cublasScalEx_64 - __cublasScalEx_64 = dlsym(RTLD_DEFAULT, 'cublasScalEx_64') - if __cublasScalEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasScalEx_64 = dlsym(handle, 'cublasScalEx_64') - - global __cublasSscal_v2_64 - __cublasSscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSscal_v2_64') - if __cublasSscal_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSscal_v2_64 = dlsym(handle, 'cublasSscal_v2_64') - - global __cublasDscal_v2_64 - __cublasDscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDscal_v2_64') - if __cublasDscal_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDscal_v2_64 = dlsym(handle, 'cublasDscal_v2_64') - - global __cublasCscal_v2_64 - __cublasCscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCscal_v2_64') - if __cublasCscal_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCscal_v2_64 = dlsym(handle, 'cublasCscal_v2_64') - - global __cublasCsscal_v2_64 - __cublasCsscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsscal_v2_64') - if __cublasCsscal_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsscal_v2_64 = dlsym(handle, 'cublasCsscal_v2_64') - - global __cublasZscal_v2_64 - __cublasZscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZscal_v2_64') - if __cublasZscal_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) 
- __cublasZscal_v2_64 = dlsym(handle, 'cublasZscal_v2_64') - - global __cublasZdscal_v2_64 - __cublasZdscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdscal_v2_64') - if __cublasZdscal_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdscal_v2_64 = dlsym(handle, 'cublasZdscal_v2_64') - - global __cublasAxpyEx_64 - __cublasAxpyEx_64 = dlsym(RTLD_DEFAULT, 'cublasAxpyEx_64') - if __cublasAxpyEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasAxpyEx_64 = dlsym(handle, 'cublasAxpyEx_64') - - global __cublasSaxpy_v2_64 - __cublasSaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSaxpy_v2_64') - if __cublasSaxpy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSaxpy_v2_64 = dlsym(handle, 'cublasSaxpy_v2_64') - - global __cublasDaxpy_v2_64 - __cublasDaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDaxpy_v2_64') - if __cublasDaxpy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDaxpy_v2_64 = dlsym(handle, 'cublasDaxpy_v2_64') - - global __cublasCaxpy_v2_64 - __cublasCaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCaxpy_v2_64') - if __cublasCaxpy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCaxpy_v2_64 = dlsym(handle, 'cublasCaxpy_v2_64') - - global __cublasZaxpy_v2_64 - __cublasZaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZaxpy_v2_64') - if __cublasZaxpy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZaxpy_v2_64 = dlsym(handle, 'cublasZaxpy_v2_64') - - global __cublasCopyEx_64 - __cublasCopyEx_64 = dlsym(RTLD_DEFAULT, 'cublasCopyEx_64') - if __cublasCopyEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCopyEx_64 = dlsym(handle, 'cublasCopyEx_64') - - global __cublasScopy_v2_64 - __cublasScopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasScopy_v2_64') - if __cublasScopy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasScopy_v2_64 = dlsym(handle, 'cublasScopy_v2_64') - - global __cublasDcopy_v2_64 - __cublasDcopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDcopy_v2_64') - if __cublasDcopy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDcopy_v2_64 = dlsym(handle, 'cublasDcopy_v2_64') - - global __cublasCcopy_v2_64 - __cublasCcopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCcopy_v2_64') - if __cublasCcopy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCcopy_v2_64 = dlsym(handle, 'cublasCcopy_v2_64') - - global __cublasZcopy_v2_64 - __cublasZcopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZcopy_v2_64') - if __cublasZcopy_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZcopy_v2_64 = dlsym(handle, 'cublasZcopy_v2_64') - - global __cublasSswap_v2_64 - __cublasSswap_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSswap_v2_64') - if __cublasSswap_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSswap_v2_64 = dlsym(handle, 'cublasSswap_v2_64') - - global __cublasDswap_v2_64 - __cublasDswap_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDswap_v2_64') - if __cublasDswap_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDswap_v2_64 = dlsym(handle, 'cublasDswap_v2_64') - - global __cublasCswap_v2_64 - __cublasCswap_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCswap_v2_64') - if __cublasCswap_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCswap_v2_64 = dlsym(handle, 'cublasCswap_v2_64') - - global __cublasZswap_v2_64 - __cublasZswap_v2_64 = 
dlsym(RTLD_DEFAULT, 'cublasZswap_v2_64') - if __cublasZswap_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZswap_v2_64 = dlsym(handle, 'cublasZswap_v2_64') - - global __cublasSwapEx_64 - __cublasSwapEx_64 = dlsym(RTLD_DEFAULT, 'cublasSwapEx_64') - if __cublasSwapEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSwapEx_64 = dlsym(handle, 'cublasSwapEx_64') - - global __cublasIsamax_v2_64 - __cublasIsamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIsamax_v2_64') - if __cublasIsamax_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIsamax_v2_64 = dlsym(handle, 'cublasIsamax_v2_64') - - global __cublasIdamax_v2_64 - __cublasIdamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIdamax_v2_64') - if __cublasIdamax_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIdamax_v2_64 = dlsym(handle, 'cublasIdamax_v2_64') - - global __cublasIcamax_v2_64 - __cublasIcamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIcamax_v2_64') - if __cublasIcamax_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIcamax_v2_64 = dlsym(handle, 'cublasIcamax_v2_64') - - global __cublasIzamax_v2_64 - __cublasIzamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIzamax_v2_64') - if __cublasIzamax_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIzamax_v2_64 = dlsym(handle, 'cublasIzamax_v2_64') - - global __cublasIamaxEx_64 - __cublasIamaxEx_64 = dlsym(RTLD_DEFAULT, 'cublasIamaxEx_64') - if __cublasIamaxEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIamaxEx_64 = dlsym(handle, 'cublasIamaxEx_64') - - global __cublasIsamin_v2_64 - __cublasIsamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIsamin_v2_64') - if __cublasIsamin_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIsamin_v2_64 = dlsym(handle, 'cublasIsamin_v2_64') - - global __cublasIdamin_v2_64 - __cublasIdamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIdamin_v2_64') - if __cublasIdamin_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIdamin_v2_64 = dlsym(handle, 'cublasIdamin_v2_64') - - global __cublasIcamin_v2_64 - __cublasIcamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIcamin_v2_64') - if __cublasIcamin_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIcamin_v2_64 = dlsym(handle, 'cublasIcamin_v2_64') - - global __cublasIzamin_v2_64 - __cublasIzamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIzamin_v2_64') - if __cublasIzamin_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIzamin_v2_64 = dlsym(handle, 'cublasIzamin_v2_64') - - global __cublasIaminEx_64 - __cublasIaminEx_64 = dlsym(RTLD_DEFAULT, 'cublasIaminEx_64') - if __cublasIaminEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasIaminEx_64 = dlsym(handle, 'cublasIaminEx_64') - - global __cublasAsumEx_64 - __cublasAsumEx_64 = dlsym(RTLD_DEFAULT, 'cublasAsumEx_64') - if __cublasAsumEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasAsumEx_64 = dlsym(handle, 'cublasAsumEx_64') - - global __cublasSasum_v2_64 - __cublasSasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSasum_v2_64') - if __cublasSasum_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSasum_v2_64 = dlsym(handle, 'cublasSasum_v2_64') - - global __cublasDasum_v2_64 - __cublasDasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDasum_v2_64') - if __cublasDasum_v2_64 == NULL: - if handle == NULL: - 
handle = load_library(driver_ver) - __cublasDasum_v2_64 = dlsym(handle, 'cublasDasum_v2_64') - - global __cublasScasum_v2_64 - __cublasScasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasScasum_v2_64') - if __cublasScasum_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasScasum_v2_64 = dlsym(handle, 'cublasScasum_v2_64') - - global __cublasDzasum_v2_64 - __cublasDzasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDzasum_v2_64') - if __cublasDzasum_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDzasum_v2_64 = dlsym(handle, 'cublasDzasum_v2_64') - - global __cublasSrot_v2_64 - __cublasSrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSrot_v2_64') - if __cublasSrot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSrot_v2_64 = dlsym(handle, 'cublasSrot_v2_64') - - global __cublasDrot_v2_64 - __cublasDrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDrot_v2_64') - if __cublasDrot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDrot_v2_64 = dlsym(handle, 'cublasDrot_v2_64') - - global __cublasCrot_v2_64 - __cublasCrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCrot_v2_64') - if __cublasCrot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCrot_v2_64 = dlsym(handle, 'cublasCrot_v2_64') - - global __cublasCsrot_v2_64 - __cublasCsrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsrot_v2_64') - if __cublasCsrot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsrot_v2_64 = dlsym(handle, 'cublasCsrot_v2_64') - - global __cublasZrot_v2_64 - __cublasZrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZrot_v2_64') - if __cublasZrot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZrot_v2_64 = dlsym(handle, 'cublasZrot_v2_64') - - global __cublasZdrot_v2_64 - __cublasZdrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdrot_v2_64') - if __cublasZdrot_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdrot_v2_64 = dlsym(handle, 'cublasZdrot_v2_64') - - global __cublasRotEx_64 - __cublasRotEx_64 = dlsym(RTLD_DEFAULT, 'cublasRotEx_64') - if __cublasRotEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasRotEx_64 = dlsym(handle, 'cublasRotEx_64') - - global __cublasSrotm_v2_64 - __cublasSrotm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSrotm_v2_64') - if __cublasSrotm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSrotm_v2_64 = dlsym(handle, 'cublasSrotm_v2_64') - - global __cublasDrotm_v2_64 - __cublasDrotm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDrotm_v2_64') - if __cublasDrotm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDrotm_v2_64 = dlsym(handle, 'cublasDrotm_v2_64') - - global __cublasRotmEx_64 - __cublasRotmEx_64 = dlsym(RTLD_DEFAULT, 'cublasRotmEx_64') - if __cublasRotmEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasRotmEx_64 = dlsym(handle, 'cublasRotmEx_64') - - global __cublasSgemv_v2_64 - __cublasSgemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSgemv_v2_64') - if __cublasSgemv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemv_v2_64 = dlsym(handle, 'cublasSgemv_v2_64') - - global __cublasDgemv_v2_64 - __cublasDgemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDgemv_v2_64') - if __cublasDgemv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemv_v2_64 = dlsym(handle, 'cublasDgemv_v2_64') - - global __cublasCgemv_v2_64 - __cublasCgemv_v2_64 = 
dlsym(RTLD_DEFAULT, 'cublasCgemv_v2_64') - if __cublasCgemv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemv_v2_64 = dlsym(handle, 'cublasCgemv_v2_64') - - global __cublasZgemv_v2_64 - __cublasZgemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgemv_v2_64') - if __cublasZgemv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemv_v2_64 = dlsym(handle, 'cublasZgemv_v2_64') - - global __cublasSgbmv_v2_64 - __cublasSgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSgbmv_v2_64') - if __cublasSgbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgbmv_v2_64 = dlsym(handle, 'cublasSgbmv_v2_64') - - global __cublasDgbmv_v2_64 - __cublasDgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDgbmv_v2_64') - if __cublasDgbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgbmv_v2_64 = dlsym(handle, 'cublasDgbmv_v2_64') - - global __cublasCgbmv_v2_64 - __cublasCgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgbmv_v2_64') - if __cublasCgbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgbmv_v2_64 = dlsym(handle, 'cublasCgbmv_v2_64') - - global __cublasZgbmv_v2_64 - __cublasZgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgbmv_v2_64') - if __cublasZgbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgbmv_v2_64 = dlsym(handle, 'cublasZgbmv_v2_64') - - global __cublasStrmv_v2_64 - __cublasStrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrmv_v2_64') - if __cublasStrmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrmv_v2_64 = dlsym(handle, 'cublasStrmv_v2_64') - - global __cublasDtrmv_v2_64 - __cublasDtrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrmv_v2_64') - if __cublasDtrmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrmv_v2_64 = dlsym(handle, 'cublasDtrmv_v2_64') - - global __cublasCtrmv_v2_64 - __cublasCtrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrmv_v2_64') - if __cublasCtrmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrmv_v2_64 = dlsym(handle, 'cublasCtrmv_v2_64') - - global __cublasZtrmv_v2_64 - __cublasZtrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrmv_v2_64') - if __cublasZtrmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrmv_v2_64 = dlsym(handle, 'cublasZtrmv_v2_64') - - global __cublasStbmv_v2_64 - __cublasStbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStbmv_v2_64') - if __cublasStbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStbmv_v2_64 = dlsym(handle, 'cublasStbmv_v2_64') - - global __cublasDtbmv_v2_64 - __cublasDtbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtbmv_v2_64') - if __cublasDtbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtbmv_v2_64 = dlsym(handle, 'cublasDtbmv_v2_64') - - global __cublasCtbmv_v2_64 - __cublasCtbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtbmv_v2_64') - if __cublasCtbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtbmv_v2_64 = dlsym(handle, 'cublasCtbmv_v2_64') - - global __cublasZtbmv_v2_64 - __cublasZtbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtbmv_v2_64') - if __cublasZtbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtbmv_v2_64 = dlsym(handle, 'cublasZtbmv_v2_64') - - global __cublasStpmv_v2_64 - __cublasStpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStpmv_v2_64') - if __cublasStpmv_v2_64 == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cublasStpmv_v2_64 = dlsym(handle, 'cublasStpmv_v2_64') - - global __cublasDtpmv_v2_64 - __cublasDtpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtpmv_v2_64') - if __cublasDtpmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtpmv_v2_64 = dlsym(handle, 'cublasDtpmv_v2_64') - - global __cublasCtpmv_v2_64 - __cublasCtpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtpmv_v2_64') - if __cublasCtpmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtpmv_v2_64 = dlsym(handle, 'cublasCtpmv_v2_64') - - global __cublasZtpmv_v2_64 - __cublasZtpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtpmv_v2_64') - if __cublasZtpmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtpmv_v2_64 = dlsym(handle, 'cublasZtpmv_v2_64') - - global __cublasStrsv_v2_64 - __cublasStrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrsv_v2_64') - if __cublasStrsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrsv_v2_64 = dlsym(handle, 'cublasStrsv_v2_64') - - global __cublasDtrsv_v2_64 - __cublasDtrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrsv_v2_64') - if __cublasDtrsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrsv_v2_64 = dlsym(handle, 'cublasDtrsv_v2_64') - - global __cublasCtrsv_v2_64 - __cublasCtrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrsv_v2_64') - if __cublasCtrsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrsv_v2_64 = dlsym(handle, 'cublasCtrsv_v2_64') - - global __cublasZtrsv_v2_64 - __cublasZtrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrsv_v2_64') - if __cublasZtrsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrsv_v2_64 = dlsym(handle, 'cublasZtrsv_v2_64') - - global __cublasStpsv_v2_64 - __cublasStpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStpsv_v2_64') - if __cublasStpsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStpsv_v2_64 = dlsym(handle, 'cublasStpsv_v2_64') - - global __cublasDtpsv_v2_64 - __cublasDtpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtpsv_v2_64') - if __cublasDtpsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtpsv_v2_64 = dlsym(handle, 'cublasDtpsv_v2_64') - - global __cublasCtpsv_v2_64 - __cublasCtpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtpsv_v2_64') - if __cublasCtpsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtpsv_v2_64 = dlsym(handle, 'cublasCtpsv_v2_64') - - global __cublasZtpsv_v2_64 - __cublasZtpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtpsv_v2_64') - if __cublasZtpsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtpsv_v2_64 = dlsym(handle, 'cublasZtpsv_v2_64') - - global __cublasStbsv_v2_64 - __cublasStbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStbsv_v2_64') - if __cublasStbsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStbsv_v2_64 = dlsym(handle, 'cublasStbsv_v2_64') - - global __cublasDtbsv_v2_64 - __cublasDtbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtbsv_v2_64') - if __cublasDtbsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtbsv_v2_64 = dlsym(handle, 'cublasDtbsv_v2_64') - - global __cublasCtbsv_v2_64 - __cublasCtbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtbsv_v2_64') - if __cublasCtbsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtbsv_v2_64 = dlsym(handle, 'cublasCtbsv_v2_64') - - global 
__cublasZtbsv_v2_64 - __cublasZtbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtbsv_v2_64') - if __cublasZtbsv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtbsv_v2_64 = dlsym(handle, 'cublasZtbsv_v2_64') - - global __cublasSsymv_v2_64 - __cublasSsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsymv_v2_64') - if __cublasSsymv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsymv_v2_64 = dlsym(handle, 'cublasSsymv_v2_64') - - global __cublasDsymv_v2_64 - __cublasDsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsymv_v2_64') - if __cublasDsymv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsymv_v2_64 = dlsym(handle, 'cublasDsymv_v2_64') - - global __cublasCsymv_v2_64 - __cublasCsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsymv_v2_64') - if __cublasCsymv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsymv_v2_64 = dlsym(handle, 'cublasCsymv_v2_64') - - global __cublasZsymv_v2_64 - __cublasZsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsymv_v2_64') - if __cublasZsymv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsymv_v2_64 = dlsym(handle, 'cublasZsymv_v2_64') - - global __cublasChemv_v2_64 - __cublasChemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChemv_v2_64') - if __cublasChemv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChemv_v2_64 = dlsym(handle, 'cublasChemv_v2_64') - - global __cublasZhemv_v2_64 - __cublasZhemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhemv_v2_64') - if __cublasZhemv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhemv_v2_64 = dlsym(handle, 'cublasZhemv_v2_64') - - global __cublasSsbmv_v2_64 - __cublasSsbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsbmv_v2_64') - if __cublasSsbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsbmv_v2_64 = dlsym(handle, 'cublasSsbmv_v2_64') - - global __cublasDsbmv_v2_64 - __cublasDsbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsbmv_v2_64') - if __cublasDsbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsbmv_v2_64 = dlsym(handle, 'cublasDsbmv_v2_64') - - global __cublasChbmv_v2_64 - __cublasChbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChbmv_v2_64') - if __cublasChbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChbmv_v2_64 = dlsym(handle, 'cublasChbmv_v2_64') - - global __cublasZhbmv_v2_64 - __cublasZhbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhbmv_v2_64') - if __cublasZhbmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhbmv_v2_64 = dlsym(handle, 'cublasZhbmv_v2_64') - - global __cublasSspmv_v2_64 - __cublasSspmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSspmv_v2_64') - if __cublasSspmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSspmv_v2_64 = dlsym(handle, 'cublasSspmv_v2_64') - - global __cublasDspmv_v2_64 - __cublasDspmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDspmv_v2_64') - if __cublasDspmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDspmv_v2_64 = dlsym(handle, 'cublasDspmv_v2_64') - - global __cublasChpmv_v2_64 - __cublasChpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChpmv_v2_64') - if __cublasChpmv_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChpmv_v2_64 = dlsym(handle, 'cublasChpmv_v2_64') - - global __cublasZhpmv_v2_64 - __cublasZhpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhpmv_v2_64') - if __cublasZhpmv_v2_64 
== NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhpmv_v2_64 = dlsym(handle, 'cublasZhpmv_v2_64') - - global __cublasSger_v2_64 - __cublasSger_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSger_v2_64') - if __cublasSger_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSger_v2_64 = dlsym(handle, 'cublasSger_v2_64') - - global __cublasDger_v2_64 - __cublasDger_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDger_v2_64') - if __cublasDger_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDger_v2_64 = dlsym(handle, 'cublasDger_v2_64') - - global __cublasCgeru_v2_64 - __cublasCgeru_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgeru_v2_64') - if __cublasCgeru_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgeru_v2_64 = dlsym(handle, 'cublasCgeru_v2_64') - - global __cublasCgerc_v2_64 - __cublasCgerc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgerc_v2_64') - if __cublasCgerc_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgerc_v2_64 = dlsym(handle, 'cublasCgerc_v2_64') - - global __cublasZgeru_v2_64 - __cublasZgeru_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgeru_v2_64') - if __cublasZgeru_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgeru_v2_64 = dlsym(handle, 'cublasZgeru_v2_64') - - global __cublasZgerc_v2_64 - __cublasZgerc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgerc_v2_64') - if __cublasZgerc_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgerc_v2_64 = dlsym(handle, 'cublasZgerc_v2_64') - - global __cublasSsyr_v2_64 - __cublasSsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyr_v2_64') - if __cublasSsyr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyr_v2_64 = dlsym(handle, 'cublasSsyr_v2_64') - - global __cublasDsyr_v2_64 - __cublasDsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyr_v2_64') - if __cublasDsyr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyr_v2_64 = dlsym(handle, 'cublasDsyr_v2_64') - - global __cublasCsyr_v2_64 - __cublasCsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyr_v2_64') - if __cublasCsyr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyr_v2_64 = dlsym(handle, 'cublasCsyr_v2_64') - - global __cublasZsyr_v2_64 - __cublasZsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyr_v2_64') - if __cublasZsyr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyr_v2_64 = dlsym(handle, 'cublasZsyr_v2_64') - - global __cublasCher_v2_64 - __cublasCher_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCher_v2_64') - if __cublasCher_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCher_v2_64 = dlsym(handle, 'cublasCher_v2_64') - - global __cublasZher_v2_64 - __cublasZher_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZher_v2_64') - if __cublasZher_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZher_v2_64 = dlsym(handle, 'cublasZher_v2_64') - - global __cublasSspr_v2_64 - __cublasSspr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSspr_v2_64') - if __cublasSspr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSspr_v2_64 = dlsym(handle, 'cublasSspr_v2_64') - - global __cublasDspr_v2_64 - __cublasDspr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDspr_v2_64') - if __cublasDspr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDspr_v2_64 = dlsym(handle, 'cublasDspr_v2_64') - - global __cublasChpr_v2_64 - 
__cublasChpr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChpr_v2_64') - if __cublasChpr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChpr_v2_64 = dlsym(handle, 'cublasChpr_v2_64') - - global __cublasZhpr_v2_64 - __cublasZhpr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhpr_v2_64') - if __cublasZhpr_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhpr_v2_64 = dlsym(handle, 'cublasZhpr_v2_64') - - global __cublasSsyr2_v2_64 - __cublasSsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyr2_v2_64') - if __cublasSsyr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyr2_v2_64 = dlsym(handle, 'cublasSsyr2_v2_64') - - global __cublasDsyr2_v2_64 - __cublasDsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyr2_v2_64') - if __cublasDsyr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyr2_v2_64 = dlsym(handle, 'cublasDsyr2_v2_64') - - global __cublasCsyr2_v2_64 - __cublasCsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyr2_v2_64') - if __cublasCsyr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyr2_v2_64 = dlsym(handle, 'cublasCsyr2_v2_64') - - global __cublasZsyr2_v2_64 - __cublasZsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyr2_v2_64') - if __cublasZsyr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyr2_v2_64 = dlsym(handle, 'cublasZsyr2_v2_64') - - global __cublasCher2_v2_64 - __cublasCher2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCher2_v2_64') - if __cublasCher2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCher2_v2_64 = dlsym(handle, 'cublasCher2_v2_64') - - global __cublasZher2_v2_64 - __cublasZher2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZher2_v2_64') - if __cublasZher2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZher2_v2_64 = dlsym(handle, 'cublasZher2_v2_64') - - global __cublasSspr2_v2_64 - __cublasSspr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSspr2_v2_64') - if __cublasSspr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSspr2_v2_64 = dlsym(handle, 'cublasSspr2_v2_64') - - global __cublasDspr2_v2_64 - __cublasDspr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDspr2_v2_64') - if __cublasDspr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDspr2_v2_64 = dlsym(handle, 'cublasDspr2_v2_64') - - global __cublasChpr2_v2_64 - __cublasChpr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChpr2_v2_64') - if __cublasChpr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasChpr2_v2_64 = dlsym(handle, 'cublasChpr2_v2_64') - - global __cublasZhpr2_v2_64 - __cublasZhpr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhpr2_v2_64') - if __cublasZhpr2_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhpr2_v2_64 = dlsym(handle, 'cublasZhpr2_v2_64') - - global __cublasSgemvBatched_64 - __cublasSgemvBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemvBatched_64') - if __cublasSgemvBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemvBatched_64 = dlsym(handle, 'cublasSgemvBatched_64') - - global __cublasDgemvBatched_64 - __cublasDgemvBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemvBatched_64') - if __cublasDgemvBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemvBatched_64 = dlsym(handle, 'cublasDgemvBatched_64') - - global __cublasCgemvBatched_64 - __cublasCgemvBatched_64 = dlsym(RTLD_DEFAULT, 
'cublasCgemvBatched_64') - if __cublasCgemvBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemvBatched_64 = dlsym(handle, 'cublasCgemvBatched_64') - - global __cublasZgemvBatched_64 - __cublasZgemvBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemvBatched_64') - if __cublasZgemvBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemvBatched_64 = dlsym(handle, 'cublasZgemvBatched_64') - - global __cublasSgemvStridedBatched_64 - __cublasSgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemvStridedBatched_64') - if __cublasSgemvStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemvStridedBatched_64 = dlsym(handle, 'cublasSgemvStridedBatched_64') - - global __cublasDgemvStridedBatched_64 - __cublasDgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemvStridedBatched_64') - if __cublasDgemvStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemvStridedBatched_64 = dlsym(handle, 'cublasDgemvStridedBatched_64') - - global __cublasCgemvStridedBatched_64 - __cublasCgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemvStridedBatched_64') - if __cublasCgemvStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemvStridedBatched_64 = dlsym(handle, 'cublasCgemvStridedBatched_64') - - global __cublasZgemvStridedBatched_64 - __cublasZgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemvStridedBatched_64') - if __cublasZgemvStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemvStridedBatched_64 = dlsym(handle, 'cublasZgemvStridedBatched_64') - - global __cublasSgemm_v2_64 - __cublasSgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSgemm_v2_64') - if __cublasSgemm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemm_v2_64 = dlsym(handle, 'cublasSgemm_v2_64') - - global __cublasDgemm_v2_64 - __cublasDgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDgemm_v2_64') - if __cublasDgemm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemm_v2_64 = dlsym(handle, 'cublasDgemm_v2_64') - - global __cublasCgemm_v2_64 - __cublasCgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm_v2_64') - if __cublasCgemm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm_v2_64 = dlsym(handle, 'cublasCgemm_v2_64') - - global __cublasCgemm3m_64 - __cublasCgemm3m_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3m_64') - if __cublasCgemm3m_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3m_64 = dlsym(handle, 'cublasCgemm3m_64') - - global __cublasCgemm3mEx_64 - __cublasCgemm3mEx_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3mEx_64') - if __cublasCgemm3mEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3mEx_64 = dlsym(handle, 'cublasCgemm3mEx_64') - - global __cublasZgemm_v2_64 - __cublasZgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgemm_v2_64') - if __cublasZgemm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemm_v2_64 = dlsym(handle, 'cublasZgemm_v2_64') - - global __cublasZgemm3m_64 - __cublasZgemm3m_64 = dlsym(RTLD_DEFAULT, 'cublasZgemm3m_64') - if __cublasZgemm3m_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemm3m_64 = dlsym(handle, 'cublasZgemm3m_64') - - global __cublasSgemmEx_64 - __cublasSgemmEx_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmEx_64') - if __cublasSgemmEx_64 == NULL: - if 
handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmEx_64 = dlsym(handle, 'cublasSgemmEx_64') - - global __cublasGemmEx_64 - __cublasGemmEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmEx_64') - if __cublasGemmEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmEx_64 = dlsym(handle, 'cublasGemmEx_64') - - global __cublasCgemmEx_64 - __cublasCgemmEx_64 = dlsym(RTLD_DEFAULT, 'cublasCgemmEx_64') - if __cublasCgemmEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemmEx_64 = dlsym(handle, 'cublasCgemmEx_64') - - global __cublasSsyrk_v2_64 - __cublasSsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyrk_v2_64') - if __cublasSsyrk_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyrk_v2_64 = dlsym(handle, 'cublasSsyrk_v2_64') - - global __cublasDsyrk_v2_64 - __cublasDsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyrk_v2_64') - if __cublasDsyrk_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyrk_v2_64 = dlsym(handle, 'cublasDsyrk_v2_64') - - global __cublasCsyrk_v2_64 - __cublasCsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrk_v2_64') - if __cublasCsyrk_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrk_v2_64 = dlsym(handle, 'cublasCsyrk_v2_64') - - global __cublasZsyrk_v2_64 - __cublasZsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyrk_v2_64') - if __cublasZsyrk_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyrk_v2_64 = dlsym(handle, 'cublasZsyrk_v2_64') - - global __cublasCsyrkEx_64 - __cublasCsyrkEx_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrkEx_64') - if __cublasCsyrkEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrkEx_64 = dlsym(handle, 'cublasCsyrkEx_64') - - global __cublasCsyrk3mEx_64 - __cublasCsyrk3mEx_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrk3mEx_64') - if __cublasCsyrk3mEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrk3mEx_64 = dlsym(handle, 'cublasCsyrk3mEx_64') - - global __cublasCherk_v2_64 - __cublasCherk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCherk_v2_64') - if __cublasCherk_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherk_v2_64 = dlsym(handle, 'cublasCherk_v2_64') - - global __cublasZherk_v2_64 - __cublasZherk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZherk_v2_64') - if __cublasZherk_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZherk_v2_64 = dlsym(handle, 'cublasZherk_v2_64') - - global __cublasCherkEx_64 - __cublasCherkEx_64 = dlsym(RTLD_DEFAULT, 'cublasCherkEx_64') - if __cublasCherkEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherkEx_64 = dlsym(handle, 'cublasCherkEx_64') - - global __cublasCherk3mEx_64 - __cublasCherk3mEx_64 = dlsym(RTLD_DEFAULT, 'cublasCherk3mEx_64') - if __cublasCherk3mEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherk3mEx_64 = dlsym(handle, 'cublasCherk3mEx_64') - - global __cublasSsyr2k_v2_64 - __cublasSsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyr2k_v2_64') - if __cublasSsyr2k_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyr2k_v2_64 = dlsym(handle, 'cublasSsyr2k_v2_64') - - global __cublasDsyr2k_v2_64 - __cublasDsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyr2k_v2_64') - if __cublasDsyr2k_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyr2k_v2_64 = dlsym(handle, 'cublasDsyr2k_v2_64') - - 
global __cublasCsyr2k_v2_64 - __cublasCsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyr2k_v2_64') - if __cublasCsyr2k_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyr2k_v2_64 = dlsym(handle, 'cublasCsyr2k_v2_64') - - global __cublasZsyr2k_v2_64 - __cublasZsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyr2k_v2_64') - if __cublasZsyr2k_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyr2k_v2_64 = dlsym(handle, 'cublasZsyr2k_v2_64') - - global __cublasCher2k_v2_64 - __cublasCher2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCher2k_v2_64') - if __cublasCher2k_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCher2k_v2_64 = dlsym(handle, 'cublasCher2k_v2_64') - - global __cublasZher2k_v2_64 - __cublasZher2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZher2k_v2_64') - if __cublasZher2k_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZher2k_v2_64 = dlsym(handle, 'cublasZher2k_v2_64') - - global __cublasSsyrkx_64 - __cublasSsyrkx_64 = dlsym(RTLD_DEFAULT, 'cublasSsyrkx_64') - if __cublasSsyrkx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsyrkx_64 = dlsym(handle, 'cublasSsyrkx_64') - - global __cublasDsyrkx_64 - __cublasDsyrkx_64 = dlsym(RTLD_DEFAULT, 'cublasDsyrkx_64') - if __cublasDsyrkx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsyrkx_64 = dlsym(handle, 'cublasDsyrkx_64') - - global __cublasCsyrkx_64 - __cublasCsyrkx_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrkx_64') - if __cublasCsyrkx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsyrkx_64 = dlsym(handle, 'cublasCsyrkx_64') - - global __cublasZsyrkx_64 - __cublasZsyrkx_64 = dlsym(RTLD_DEFAULT, 'cublasZsyrkx_64') - if __cublasZsyrkx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsyrkx_64 = dlsym(handle, 'cublasZsyrkx_64') - - global __cublasCherkx_64 - __cublasCherkx_64 = dlsym(RTLD_DEFAULT, 'cublasCherkx_64') - if __cublasCherkx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCherkx_64 = dlsym(handle, 'cublasCherkx_64') - - global __cublasZherkx_64 - __cublasZherkx_64 = dlsym(RTLD_DEFAULT, 'cublasZherkx_64') - if __cublasZherkx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZherkx_64 = dlsym(handle, 'cublasZherkx_64') - - global __cublasSsymm_v2_64 - __cublasSsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsymm_v2_64') - if __cublasSsymm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSsymm_v2_64 = dlsym(handle, 'cublasSsymm_v2_64') - - global __cublasDsymm_v2_64 - __cublasDsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsymm_v2_64') - if __cublasDsymm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDsymm_v2_64 = dlsym(handle, 'cublasDsymm_v2_64') - - global __cublasCsymm_v2_64 - __cublasCsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsymm_v2_64') - if __cublasCsymm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCsymm_v2_64 = dlsym(handle, 'cublasCsymm_v2_64') - - global __cublasZsymm_v2_64 - __cublasZsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsymm_v2_64') - if __cublasZsymm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZsymm_v2_64 = dlsym(handle, 'cublasZsymm_v2_64') - - global __cublasChemm_v2_64 - __cublasChemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChemm_v2_64') - if __cublasChemm_v2_64 == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cublasChemm_v2_64 = dlsym(handle, 'cublasChemm_v2_64') - - global __cublasZhemm_v2_64 - __cublasZhemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhemm_v2_64') - if __cublasZhemm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZhemm_v2_64 = dlsym(handle, 'cublasZhemm_v2_64') - - global __cublasStrsm_v2_64 - __cublasStrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrsm_v2_64') - if __cublasStrsm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrsm_v2_64 = dlsym(handle, 'cublasStrsm_v2_64') - - global __cublasDtrsm_v2_64 - __cublasDtrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrsm_v2_64') - if __cublasDtrsm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrsm_v2_64 = dlsym(handle, 'cublasDtrsm_v2_64') - - global __cublasCtrsm_v2_64 - __cublasCtrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrsm_v2_64') - if __cublasCtrsm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrsm_v2_64 = dlsym(handle, 'cublasCtrsm_v2_64') - - global __cublasZtrsm_v2_64 - __cublasZtrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrsm_v2_64') - if __cublasZtrsm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrsm_v2_64 = dlsym(handle, 'cublasZtrsm_v2_64') - - global __cublasStrmm_v2_64 - __cublasStrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrmm_v2_64') - if __cublasStrmm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrmm_v2_64 = dlsym(handle, 'cublasStrmm_v2_64') - - global __cublasDtrmm_v2_64 - __cublasDtrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrmm_v2_64') - if __cublasDtrmm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrmm_v2_64 = dlsym(handle, 'cublasDtrmm_v2_64') - - global __cublasCtrmm_v2_64 - __cublasCtrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrmm_v2_64') - if __cublasCtrmm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrmm_v2_64 = dlsym(handle, 'cublasCtrmm_v2_64') - - global __cublasZtrmm_v2_64 - __cublasZtrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrmm_v2_64') - if __cublasZtrmm_v2_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrmm_v2_64 = dlsym(handle, 'cublasZtrmm_v2_64') - - global __cublasSgemmBatched_64 - __cublasSgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmBatched_64') - if __cublasSgemmBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmBatched_64 = dlsym(handle, 'cublasSgemmBatched_64') - - global __cublasDgemmBatched_64 - __cublasDgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemmBatched_64') - if __cublasDgemmBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemmBatched_64 = dlsym(handle, 'cublasDgemmBatched_64') - - global __cublasCgemmBatched_64 - __cublasCgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemmBatched_64') - if __cublasCgemmBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemmBatched_64 = dlsym(handle, 'cublasCgemmBatched_64') - - global __cublasCgemm3mBatched_64 - __cublasCgemm3mBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3mBatched_64') - if __cublasCgemm3mBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3mBatched_64 = dlsym(handle, 'cublasCgemm3mBatched_64') - - global __cublasZgemmBatched_64 - __cublasZgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemmBatched_64') - if __cublasZgemmBatched_64 == NULL: - if handle == 
NULL: - handle = load_library(driver_ver) - __cublasZgemmBatched_64 = dlsym(handle, 'cublasZgemmBatched_64') - - global __cublasSgemmStridedBatched_64 - __cublasSgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmStridedBatched_64') - if __cublasSgemmStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmStridedBatched_64 = dlsym(handle, 'cublasSgemmStridedBatched_64') - - global __cublasDgemmStridedBatched_64 - __cublasDgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemmStridedBatched_64') - if __cublasDgemmStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemmStridedBatched_64 = dlsym(handle, 'cublasDgemmStridedBatched_64') - - global __cublasCgemmStridedBatched_64 - __cublasCgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemmStridedBatched_64') - if __cublasCgemmStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemmStridedBatched_64 = dlsym(handle, 'cublasCgemmStridedBatched_64') - - global __cublasCgemm3mStridedBatched_64 - __cublasCgemm3mStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3mStridedBatched_64') - if __cublasCgemm3mStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgemm3mStridedBatched_64 = dlsym(handle, 'cublasCgemm3mStridedBatched_64') - - global __cublasZgemmStridedBatched_64 - __cublasZgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemmStridedBatched_64') - if __cublasZgemmStridedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgemmStridedBatched_64 = dlsym(handle, 'cublasZgemmStridedBatched_64') - - global __cublasGemmBatchedEx_64 - __cublasGemmBatchedEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmBatchedEx_64') - if __cublasGemmBatchedEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmBatchedEx_64 = dlsym(handle, 'cublasGemmBatchedEx_64') - - global __cublasGemmStridedBatchedEx_64 - __cublasGemmStridedBatchedEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmStridedBatchedEx_64') - if __cublasGemmStridedBatchedEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmStridedBatchedEx_64 = dlsym(handle, 'cublasGemmStridedBatchedEx_64') - - global __cublasSgeam_64 - __cublasSgeam_64 = dlsym(RTLD_DEFAULT, 'cublasSgeam_64') - if __cublasSgeam_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgeam_64 = dlsym(handle, 'cublasSgeam_64') - - global __cublasDgeam_64 - __cublasDgeam_64 = dlsym(RTLD_DEFAULT, 'cublasDgeam_64') - if __cublasDgeam_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgeam_64 = dlsym(handle, 'cublasDgeam_64') - - global __cublasCgeam_64 - __cublasCgeam_64 = dlsym(RTLD_DEFAULT, 'cublasCgeam_64') - if __cublasCgeam_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCgeam_64 = dlsym(handle, 'cublasCgeam_64') - - global __cublasZgeam_64 - __cublasZgeam_64 = dlsym(RTLD_DEFAULT, 'cublasZgeam_64') - if __cublasZgeam_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZgeam_64 = dlsym(handle, 'cublasZgeam_64') - - global __cublasStrsmBatched_64 - __cublasStrsmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasStrsmBatched_64') - if __cublasStrsmBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasStrsmBatched_64 = dlsym(handle, 'cublasStrsmBatched_64') - - global __cublasDtrsmBatched_64 - __cublasDtrsmBatched_64 = dlsym(RTLD_DEFAULT, 
'cublasDtrsmBatched_64') - if __cublasDtrsmBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDtrsmBatched_64 = dlsym(handle, 'cublasDtrsmBatched_64') - - global __cublasCtrsmBatched_64 - __cublasCtrsmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCtrsmBatched_64') - if __cublasCtrsmBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCtrsmBatched_64 = dlsym(handle, 'cublasCtrsmBatched_64') - - global __cublasZtrsmBatched_64 - __cublasZtrsmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZtrsmBatched_64') - if __cublasZtrsmBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZtrsmBatched_64 = dlsym(handle, 'cublasZtrsmBatched_64') - - global __cublasSdgmm_64 - __cublasSdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasSdgmm_64') - if __cublasSdgmm_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSdgmm_64 = dlsym(handle, 'cublasSdgmm_64') - - global __cublasDdgmm_64 - __cublasDdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasDdgmm_64') - if __cublasDdgmm_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDdgmm_64 = dlsym(handle, 'cublasDdgmm_64') - - global __cublasCdgmm_64 - __cublasCdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasCdgmm_64') - if __cublasCdgmm_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasCdgmm_64 = dlsym(handle, 'cublasCdgmm_64') - - global __cublasZdgmm_64 - __cublasZdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasZdgmm_64') - if __cublasZdgmm_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasZdgmm_64 = dlsym(handle, 'cublasZdgmm_64') - - global __cublasSgemmGroupedBatched - __cublasSgemmGroupedBatched = dlsym(RTLD_DEFAULT, 'cublasSgemmGroupedBatched') - if __cublasSgemmGroupedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmGroupedBatched = dlsym(handle, 'cublasSgemmGroupedBatched') - - global __cublasSgemmGroupedBatched_64 - __cublasSgemmGroupedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmGroupedBatched_64') - if __cublasSgemmGroupedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSgemmGroupedBatched_64 = dlsym(handle, 'cublasSgemmGroupedBatched_64') - - global __cublasDgemmGroupedBatched - __cublasDgemmGroupedBatched = dlsym(RTLD_DEFAULT, 'cublasDgemmGroupedBatched') - if __cublasDgemmGroupedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemmGroupedBatched = dlsym(handle, 'cublasDgemmGroupedBatched') - - global __cublasDgemmGroupedBatched_64 - __cublasDgemmGroupedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemmGroupedBatched_64') - if __cublasDgemmGroupedBatched_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasDgemmGroupedBatched_64 = dlsym(handle, 'cublasDgemmGroupedBatched_64') - - global __cublasGemmGroupedBatchedEx - __cublasGemmGroupedBatchedEx = dlsym(RTLD_DEFAULT, 'cublasGemmGroupedBatchedEx') - if __cublasGemmGroupedBatchedEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmGroupedBatchedEx = dlsym(handle, 'cublasGemmGroupedBatchedEx') - - global __cublasGemmGroupedBatchedEx_64 - __cublasGemmGroupedBatchedEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmGroupedBatchedEx_64') - if __cublasGemmGroupedBatchedEx_64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGemmGroupedBatchedEx_64 = dlsym(handle, 'cublasGemmGroupedBatchedEx_64') - - global __cublasGetEmulationStrategy - __cublasGetEmulationStrategy = 
dlsym(RTLD_DEFAULT, 'cublasGetEmulationStrategy') - if __cublasGetEmulationStrategy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasGetEmulationStrategy = dlsym(handle, 'cublasGetEmulationStrategy') - - global __cublasSetEmulationStrategy - __cublasSetEmulationStrategy = dlsym(RTLD_DEFAULT, 'cublasSetEmulationStrategy') - if __cublasSetEmulationStrategy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasSetEmulationStrategy = dlsym(handle, 'cublasSetEmulationStrategy') - - __py_cublas_init = True - return 0 + with gil, __symbol_lock: + driver_ver = get_cuda_version() + + # Load function + global __cublasCreate_v2 + __cublasCreate_v2 = dlsym(RTLD_DEFAULT, 'cublasCreate_v2') + if __cublasCreate_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCreate_v2 = dlsym(handle, 'cublasCreate_v2') + + global __cublasDestroy_v2 + __cublasDestroy_v2 = dlsym(RTLD_DEFAULT, 'cublasDestroy_v2') + if __cublasDestroy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDestroy_v2 = dlsym(handle, 'cublasDestroy_v2') + + global __cublasGetVersion_v2 + __cublasGetVersion_v2 = dlsym(RTLD_DEFAULT, 'cublasGetVersion_v2') + if __cublasGetVersion_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetVersion_v2 = dlsym(handle, 'cublasGetVersion_v2') + + global __cublasGetProperty + __cublasGetProperty = dlsym(RTLD_DEFAULT, 'cublasGetProperty') + if __cublasGetProperty == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetProperty = dlsym(handle, 'cublasGetProperty') + + global __cublasGetCudartVersion + __cublasGetCudartVersion = dlsym(RTLD_DEFAULT, 'cublasGetCudartVersion') + if __cublasGetCudartVersion == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetCudartVersion = dlsym(handle, 'cublasGetCudartVersion') + + global __cublasSetWorkspace_v2 + __cublasSetWorkspace_v2 = dlsym(RTLD_DEFAULT, 'cublasSetWorkspace_v2') + if __cublasSetWorkspace_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetWorkspace_v2 = dlsym(handle, 'cublasSetWorkspace_v2') + + global __cublasSetStream_v2 + __cublasSetStream_v2 = dlsym(RTLD_DEFAULT, 'cublasSetStream_v2') + if __cublasSetStream_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetStream_v2 = dlsym(handle, 'cublasSetStream_v2') + + global __cublasGetStream_v2 + __cublasGetStream_v2 = dlsym(RTLD_DEFAULT, 'cublasGetStream_v2') + if __cublasGetStream_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetStream_v2 = dlsym(handle, 'cublasGetStream_v2') + + global __cublasGetPointerMode_v2 + __cublasGetPointerMode_v2 = dlsym(RTLD_DEFAULT, 'cublasGetPointerMode_v2') + if __cublasGetPointerMode_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetPointerMode_v2 = dlsym(handle, 'cublasGetPointerMode_v2') + + global __cublasSetPointerMode_v2 + __cublasSetPointerMode_v2 = dlsym(RTLD_DEFAULT, 'cublasSetPointerMode_v2') + if __cublasSetPointerMode_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetPointerMode_v2 = dlsym(handle, 'cublasSetPointerMode_v2') + + global __cublasGetAtomicsMode + __cublasGetAtomicsMode = dlsym(RTLD_DEFAULT, 'cublasGetAtomicsMode') + if __cublasGetAtomicsMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetAtomicsMode = dlsym(handle, 'cublasGetAtomicsMode') + + global __cublasSetAtomicsMode + 
__cublasSetAtomicsMode = dlsym(RTLD_DEFAULT, 'cublasSetAtomicsMode') + if __cublasSetAtomicsMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetAtomicsMode = dlsym(handle, 'cublasSetAtomicsMode') + + global __cublasGetMathMode + __cublasGetMathMode = dlsym(RTLD_DEFAULT, 'cublasGetMathMode') + if __cublasGetMathMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetMathMode = dlsym(handle, 'cublasGetMathMode') + + global __cublasSetMathMode + __cublasSetMathMode = dlsym(RTLD_DEFAULT, 'cublasSetMathMode') + if __cublasSetMathMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetMathMode = dlsym(handle, 'cublasSetMathMode') + + global __cublasLoggerConfigure + __cublasLoggerConfigure = dlsym(RTLD_DEFAULT, 'cublasLoggerConfigure') + if __cublasLoggerConfigure == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasLoggerConfigure = dlsym(handle, 'cublasLoggerConfigure') + + global __cublasSetLoggerCallback + __cublasSetLoggerCallback = dlsym(RTLD_DEFAULT, 'cublasSetLoggerCallback') + if __cublasSetLoggerCallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetLoggerCallback = dlsym(handle, 'cublasSetLoggerCallback') + + global __cublasGetLoggerCallback + __cublasGetLoggerCallback = dlsym(RTLD_DEFAULT, 'cublasGetLoggerCallback') + if __cublasGetLoggerCallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetLoggerCallback = dlsym(handle, 'cublasGetLoggerCallback') + + global __cublasSetVector + __cublasSetVector = dlsym(RTLD_DEFAULT, 'cublasSetVector') + if __cublasSetVector == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetVector = dlsym(handle, 'cublasSetVector') + + global __cublasGetVector + __cublasGetVector = dlsym(RTLD_DEFAULT, 'cublasGetVector') + if __cublasGetVector == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetVector = dlsym(handle, 'cublasGetVector') + + global __cublasSetMatrix + __cublasSetMatrix = dlsym(RTLD_DEFAULT, 'cublasSetMatrix') + if __cublasSetMatrix == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetMatrix = dlsym(handle, 'cublasSetMatrix') + + global __cublasGetMatrix + __cublasGetMatrix = dlsym(RTLD_DEFAULT, 'cublasGetMatrix') + if __cublasGetMatrix == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetMatrix = dlsym(handle, 'cublasGetMatrix') + + global __cublasSetVectorAsync + __cublasSetVectorAsync = dlsym(RTLD_DEFAULT, 'cublasSetVectorAsync') + if __cublasSetVectorAsync == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetVectorAsync = dlsym(handle, 'cublasSetVectorAsync') + + global __cublasGetVectorAsync + __cublasGetVectorAsync = dlsym(RTLD_DEFAULT, 'cublasGetVectorAsync') + if __cublasGetVectorAsync == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetVectorAsync = dlsym(handle, 'cublasGetVectorAsync') + + global __cublasSetMatrixAsync + __cublasSetMatrixAsync = dlsym(RTLD_DEFAULT, 'cublasSetMatrixAsync') + if __cublasSetMatrixAsync == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetMatrixAsync = dlsym(handle, 'cublasSetMatrixAsync') + + global __cublasGetMatrixAsync + __cublasGetMatrixAsync = dlsym(RTLD_DEFAULT, 'cublasGetMatrixAsync') + if __cublasGetMatrixAsync == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetMatrixAsync = dlsym(handle, 'cublasGetMatrixAsync') + + 
global __cublasNrm2Ex + __cublasNrm2Ex = dlsym(RTLD_DEFAULT, 'cublasNrm2Ex') + if __cublasNrm2Ex == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasNrm2Ex = dlsym(handle, 'cublasNrm2Ex') + + global __cublasSnrm2_v2 + __cublasSnrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasSnrm2_v2') + if __cublasSnrm2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSnrm2_v2 = dlsym(handle, 'cublasSnrm2_v2') + + global __cublasDnrm2_v2 + __cublasDnrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasDnrm2_v2') + if __cublasDnrm2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDnrm2_v2 = dlsym(handle, 'cublasDnrm2_v2') + + global __cublasScnrm2_v2 + __cublasScnrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasScnrm2_v2') + if __cublasScnrm2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScnrm2_v2 = dlsym(handle, 'cublasScnrm2_v2') + + global __cublasDznrm2_v2 + __cublasDznrm2_v2 = dlsym(RTLD_DEFAULT, 'cublasDznrm2_v2') + if __cublasDznrm2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDznrm2_v2 = dlsym(handle, 'cublasDznrm2_v2') + + global __cublasDotEx + __cublasDotEx = dlsym(RTLD_DEFAULT, 'cublasDotEx') + if __cublasDotEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDotEx = dlsym(handle, 'cublasDotEx') + + global __cublasDotcEx + __cublasDotcEx = dlsym(RTLD_DEFAULT, 'cublasDotcEx') + if __cublasDotcEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDotcEx = dlsym(handle, 'cublasDotcEx') + + global __cublasSdot_v2 + __cublasSdot_v2 = dlsym(RTLD_DEFAULT, 'cublasSdot_v2') + if __cublasSdot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSdot_v2 = dlsym(handle, 'cublasSdot_v2') + + global __cublasDdot_v2 + __cublasDdot_v2 = dlsym(RTLD_DEFAULT, 'cublasDdot_v2') + if __cublasDdot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDdot_v2 = dlsym(handle, 'cublasDdot_v2') + + global __cublasCdotu_v2 + __cublasCdotu_v2 = dlsym(RTLD_DEFAULT, 'cublasCdotu_v2') + if __cublasCdotu_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCdotu_v2 = dlsym(handle, 'cublasCdotu_v2') + + global __cublasCdotc_v2 + __cublasCdotc_v2 = dlsym(RTLD_DEFAULT, 'cublasCdotc_v2') + if __cublasCdotc_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCdotc_v2 = dlsym(handle, 'cublasCdotc_v2') + + global __cublasZdotu_v2 + __cublasZdotu_v2 = dlsym(RTLD_DEFAULT, 'cublasZdotu_v2') + if __cublasZdotu_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdotu_v2 = dlsym(handle, 'cublasZdotu_v2') + + global __cublasZdotc_v2 + __cublasZdotc_v2 = dlsym(RTLD_DEFAULT, 'cublasZdotc_v2') + if __cublasZdotc_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdotc_v2 = dlsym(handle, 'cublasZdotc_v2') + + global __cublasScalEx + __cublasScalEx = dlsym(RTLD_DEFAULT, 'cublasScalEx') + if __cublasScalEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScalEx = dlsym(handle, 'cublasScalEx') + + global __cublasSscal_v2 + __cublasSscal_v2 = dlsym(RTLD_DEFAULT, 'cublasSscal_v2') + if __cublasSscal_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSscal_v2 = dlsym(handle, 'cublasSscal_v2') + + global __cublasDscal_v2 + __cublasDscal_v2 = dlsym(RTLD_DEFAULT, 'cublasDscal_v2') + if __cublasDscal_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__cublasDscal_v2 = dlsym(handle, 'cublasDscal_v2') + + global __cublasCscal_v2 + __cublasCscal_v2 = dlsym(RTLD_DEFAULT, 'cublasCscal_v2') + if __cublasCscal_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCscal_v2 = dlsym(handle, 'cublasCscal_v2') + + global __cublasCsscal_v2 + __cublasCsscal_v2 = dlsym(RTLD_DEFAULT, 'cublasCsscal_v2') + if __cublasCsscal_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsscal_v2 = dlsym(handle, 'cublasCsscal_v2') + + global __cublasZscal_v2 + __cublasZscal_v2 = dlsym(RTLD_DEFAULT, 'cublasZscal_v2') + if __cublasZscal_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZscal_v2 = dlsym(handle, 'cublasZscal_v2') + + global __cublasZdscal_v2 + __cublasZdscal_v2 = dlsym(RTLD_DEFAULT, 'cublasZdscal_v2') + if __cublasZdscal_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdscal_v2 = dlsym(handle, 'cublasZdscal_v2') + + global __cublasAxpyEx + __cublasAxpyEx = dlsym(RTLD_DEFAULT, 'cublasAxpyEx') + if __cublasAxpyEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasAxpyEx = dlsym(handle, 'cublasAxpyEx') + + global __cublasSaxpy_v2 + __cublasSaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasSaxpy_v2') + if __cublasSaxpy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSaxpy_v2 = dlsym(handle, 'cublasSaxpy_v2') + + global __cublasDaxpy_v2 + __cublasDaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasDaxpy_v2') + if __cublasDaxpy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDaxpy_v2 = dlsym(handle, 'cublasDaxpy_v2') + + global __cublasCaxpy_v2 + __cublasCaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasCaxpy_v2') + if __cublasCaxpy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCaxpy_v2 = dlsym(handle, 'cublasCaxpy_v2') + + global __cublasZaxpy_v2 + __cublasZaxpy_v2 = dlsym(RTLD_DEFAULT, 'cublasZaxpy_v2') + if __cublasZaxpy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZaxpy_v2 = dlsym(handle, 'cublasZaxpy_v2') + + global __cublasCopyEx + __cublasCopyEx = dlsym(RTLD_DEFAULT, 'cublasCopyEx') + if __cublasCopyEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCopyEx = dlsym(handle, 'cublasCopyEx') + + global __cublasScopy_v2 + __cublasScopy_v2 = dlsym(RTLD_DEFAULT, 'cublasScopy_v2') + if __cublasScopy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScopy_v2 = dlsym(handle, 'cublasScopy_v2') + + global __cublasDcopy_v2 + __cublasDcopy_v2 = dlsym(RTLD_DEFAULT, 'cublasDcopy_v2') + if __cublasDcopy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDcopy_v2 = dlsym(handle, 'cublasDcopy_v2') + + global __cublasCcopy_v2 + __cublasCcopy_v2 = dlsym(RTLD_DEFAULT, 'cublasCcopy_v2') + if __cublasCcopy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCcopy_v2 = dlsym(handle, 'cublasCcopy_v2') + + global __cublasZcopy_v2 + __cublasZcopy_v2 = dlsym(RTLD_DEFAULT, 'cublasZcopy_v2') + if __cublasZcopy_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZcopy_v2 = dlsym(handle, 'cublasZcopy_v2') + + global __cublasSswap_v2 + __cublasSswap_v2 = dlsym(RTLD_DEFAULT, 'cublasSswap_v2') + if __cublasSswap_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSswap_v2 = dlsym(handle, 'cublasSswap_v2') + + global __cublasDswap_v2 + __cublasDswap_v2 = dlsym(RTLD_DEFAULT, 'cublasDswap_v2') + if 
__cublasDswap_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDswap_v2 = dlsym(handle, 'cublasDswap_v2') + + global __cublasCswap_v2 + __cublasCswap_v2 = dlsym(RTLD_DEFAULT, 'cublasCswap_v2') + if __cublasCswap_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCswap_v2 = dlsym(handle, 'cublasCswap_v2') + + global __cublasZswap_v2 + __cublasZswap_v2 = dlsym(RTLD_DEFAULT, 'cublasZswap_v2') + if __cublasZswap_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZswap_v2 = dlsym(handle, 'cublasZswap_v2') + + global __cublasSwapEx + __cublasSwapEx = dlsym(RTLD_DEFAULT, 'cublasSwapEx') + if __cublasSwapEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSwapEx = dlsym(handle, 'cublasSwapEx') + + global __cublasIsamax_v2 + __cublasIsamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIsamax_v2') + if __cublasIsamax_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIsamax_v2 = dlsym(handle, 'cublasIsamax_v2') + + global __cublasIdamax_v2 + __cublasIdamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIdamax_v2') + if __cublasIdamax_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIdamax_v2 = dlsym(handle, 'cublasIdamax_v2') + + global __cublasIcamax_v2 + __cublasIcamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIcamax_v2') + if __cublasIcamax_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIcamax_v2 = dlsym(handle, 'cublasIcamax_v2') + + global __cublasIzamax_v2 + __cublasIzamax_v2 = dlsym(RTLD_DEFAULT, 'cublasIzamax_v2') + if __cublasIzamax_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIzamax_v2 = dlsym(handle, 'cublasIzamax_v2') + + global __cublasIamaxEx + __cublasIamaxEx = dlsym(RTLD_DEFAULT, 'cublasIamaxEx') + if __cublasIamaxEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIamaxEx = dlsym(handle, 'cublasIamaxEx') + + global __cublasIsamin_v2 + __cublasIsamin_v2 = dlsym(RTLD_DEFAULT, 'cublasIsamin_v2') + if __cublasIsamin_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIsamin_v2 = dlsym(handle, 'cublasIsamin_v2') + + global __cublasIdamin_v2 + __cublasIdamin_v2 = dlsym(RTLD_DEFAULT, 'cublasIdamin_v2') + if __cublasIdamin_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIdamin_v2 = dlsym(handle, 'cublasIdamin_v2') + + global __cublasIcamin_v2 + __cublasIcamin_v2 = dlsym(RTLD_DEFAULT, 'cublasIcamin_v2') + if __cublasIcamin_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIcamin_v2 = dlsym(handle, 'cublasIcamin_v2') + + global __cublasIzamin_v2 + __cublasIzamin_v2 = dlsym(RTLD_DEFAULT, 'cublasIzamin_v2') + if __cublasIzamin_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIzamin_v2 = dlsym(handle, 'cublasIzamin_v2') + + global __cublasIaminEx + __cublasIaminEx = dlsym(RTLD_DEFAULT, 'cublasIaminEx') + if __cublasIaminEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIaminEx = dlsym(handle, 'cublasIaminEx') + + global __cublasAsumEx + __cublasAsumEx = dlsym(RTLD_DEFAULT, 'cublasAsumEx') + if __cublasAsumEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasAsumEx = dlsym(handle, 'cublasAsumEx') + + global __cublasSasum_v2 + __cublasSasum_v2 = dlsym(RTLD_DEFAULT, 'cublasSasum_v2') + if __cublasSasum_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSasum_v2 = dlsym(handle, 
'cublasSasum_v2') + + global __cublasDasum_v2 + __cublasDasum_v2 = dlsym(RTLD_DEFAULT, 'cublasDasum_v2') + if __cublasDasum_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDasum_v2 = dlsym(handle, 'cublasDasum_v2') + + global __cublasScasum_v2 + __cublasScasum_v2 = dlsym(RTLD_DEFAULT, 'cublasScasum_v2') + if __cublasScasum_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScasum_v2 = dlsym(handle, 'cublasScasum_v2') + + global __cublasDzasum_v2 + __cublasDzasum_v2 = dlsym(RTLD_DEFAULT, 'cublasDzasum_v2') + if __cublasDzasum_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDzasum_v2 = dlsym(handle, 'cublasDzasum_v2') + + global __cublasSrot_v2 + __cublasSrot_v2 = dlsym(RTLD_DEFAULT, 'cublasSrot_v2') + if __cublasSrot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSrot_v2 = dlsym(handle, 'cublasSrot_v2') + + global __cublasDrot_v2 + __cublasDrot_v2 = dlsym(RTLD_DEFAULT, 'cublasDrot_v2') + if __cublasDrot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDrot_v2 = dlsym(handle, 'cublasDrot_v2') + + global __cublasCrot_v2 + __cublasCrot_v2 = dlsym(RTLD_DEFAULT, 'cublasCrot_v2') + if __cublasCrot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCrot_v2 = dlsym(handle, 'cublasCrot_v2') + + global __cublasCsrot_v2 + __cublasCsrot_v2 = dlsym(RTLD_DEFAULT, 'cublasCsrot_v2') + if __cublasCsrot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsrot_v2 = dlsym(handle, 'cublasCsrot_v2') + + global __cublasZrot_v2 + __cublasZrot_v2 = dlsym(RTLD_DEFAULT, 'cublasZrot_v2') + if __cublasZrot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZrot_v2 = dlsym(handle, 'cublasZrot_v2') + + global __cublasZdrot_v2 + __cublasZdrot_v2 = dlsym(RTLD_DEFAULT, 'cublasZdrot_v2') + if __cublasZdrot_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdrot_v2 = dlsym(handle, 'cublasZdrot_v2') + + global __cublasRotEx + __cublasRotEx = dlsym(RTLD_DEFAULT, 'cublasRotEx') + if __cublasRotEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasRotEx = dlsym(handle, 'cublasRotEx') + + global __cublasSrotg_v2 + __cublasSrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasSrotg_v2') + if __cublasSrotg_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSrotg_v2 = dlsym(handle, 'cublasSrotg_v2') + + global __cublasDrotg_v2 + __cublasDrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasDrotg_v2') + if __cublasDrotg_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDrotg_v2 = dlsym(handle, 'cublasDrotg_v2') + + global __cublasCrotg_v2 + __cublasCrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasCrotg_v2') + if __cublasCrotg_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCrotg_v2 = dlsym(handle, 'cublasCrotg_v2') + + global __cublasZrotg_v2 + __cublasZrotg_v2 = dlsym(RTLD_DEFAULT, 'cublasZrotg_v2') + if __cublasZrotg_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZrotg_v2 = dlsym(handle, 'cublasZrotg_v2') + + global __cublasRotgEx + __cublasRotgEx = dlsym(RTLD_DEFAULT, 'cublasRotgEx') + if __cublasRotgEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasRotgEx = dlsym(handle, 'cublasRotgEx') + + global __cublasSrotm_v2 + __cublasSrotm_v2 = dlsym(RTLD_DEFAULT, 'cublasSrotm_v2') + if __cublasSrotm_v2 == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cublasSrotm_v2 = dlsym(handle, 'cublasSrotm_v2') + + global __cublasDrotm_v2 + __cublasDrotm_v2 = dlsym(RTLD_DEFAULT, 'cublasDrotm_v2') + if __cublasDrotm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDrotm_v2 = dlsym(handle, 'cublasDrotm_v2') + + global __cublasRotmEx + __cublasRotmEx = dlsym(RTLD_DEFAULT, 'cublasRotmEx') + if __cublasRotmEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasRotmEx = dlsym(handle, 'cublasRotmEx') + + global __cublasSrotmg_v2 + __cublasSrotmg_v2 = dlsym(RTLD_DEFAULT, 'cublasSrotmg_v2') + if __cublasSrotmg_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSrotmg_v2 = dlsym(handle, 'cublasSrotmg_v2') + + global __cublasDrotmg_v2 + __cublasDrotmg_v2 = dlsym(RTLD_DEFAULT, 'cublasDrotmg_v2') + if __cublasDrotmg_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDrotmg_v2 = dlsym(handle, 'cublasDrotmg_v2') + + global __cublasRotmgEx + __cublasRotmgEx = dlsym(RTLD_DEFAULT, 'cublasRotmgEx') + if __cublasRotmgEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasRotmgEx = dlsym(handle, 'cublasRotmgEx') + + global __cublasSgemv_v2 + __cublasSgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasSgemv_v2') + if __cublasSgemv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemv_v2 = dlsym(handle, 'cublasSgemv_v2') + + global __cublasDgemv_v2 + __cublasDgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasDgemv_v2') + if __cublasDgemv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemv_v2 = dlsym(handle, 'cublasDgemv_v2') + + global __cublasCgemv_v2 + __cublasCgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasCgemv_v2') + if __cublasCgemv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemv_v2 = dlsym(handle, 'cublasCgemv_v2') + + global __cublasZgemv_v2 + __cublasZgemv_v2 = dlsym(RTLD_DEFAULT, 'cublasZgemv_v2') + if __cublasZgemv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemv_v2 = dlsym(handle, 'cublasZgemv_v2') + + global __cublasSgbmv_v2 + __cublasSgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasSgbmv_v2') + if __cublasSgbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgbmv_v2 = dlsym(handle, 'cublasSgbmv_v2') + + global __cublasDgbmv_v2 + __cublasDgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDgbmv_v2') + if __cublasDgbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgbmv_v2 = dlsym(handle, 'cublasDgbmv_v2') + + global __cublasCgbmv_v2 + __cublasCgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasCgbmv_v2') + if __cublasCgbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgbmv_v2 = dlsym(handle, 'cublasCgbmv_v2') + + global __cublasZgbmv_v2 + __cublasZgbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZgbmv_v2') + if __cublasZgbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgbmv_v2 = dlsym(handle, 'cublasZgbmv_v2') + + global __cublasStrmv_v2 + __cublasStrmv_v2 = dlsym(RTLD_DEFAULT, 'cublasStrmv_v2') + if __cublasStrmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrmv_v2 = dlsym(handle, 'cublasStrmv_v2') + + global __cublasDtrmv_v2 + __cublasDtrmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrmv_v2') + if __cublasDtrmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrmv_v2 = dlsym(handle, 'cublasDtrmv_v2') + + global __cublasCtrmv_v2 + __cublasCtrmv_v2 = 
dlsym(RTLD_DEFAULT, 'cublasCtrmv_v2') + if __cublasCtrmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrmv_v2 = dlsym(handle, 'cublasCtrmv_v2') + + global __cublasZtrmv_v2 + __cublasZtrmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrmv_v2') + if __cublasZtrmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrmv_v2 = dlsym(handle, 'cublasZtrmv_v2') + + global __cublasStbmv_v2 + __cublasStbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasStbmv_v2') + if __cublasStbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStbmv_v2 = dlsym(handle, 'cublasStbmv_v2') + + global __cublasDtbmv_v2 + __cublasDtbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtbmv_v2') + if __cublasDtbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtbmv_v2 = dlsym(handle, 'cublasDtbmv_v2') + + global __cublasCtbmv_v2 + __cublasCtbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtbmv_v2') + if __cublasCtbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtbmv_v2 = dlsym(handle, 'cublasCtbmv_v2') + + global __cublasZtbmv_v2 + __cublasZtbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtbmv_v2') + if __cublasZtbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtbmv_v2 = dlsym(handle, 'cublasZtbmv_v2') + + global __cublasStpmv_v2 + __cublasStpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasStpmv_v2') + if __cublasStpmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStpmv_v2 = dlsym(handle, 'cublasStpmv_v2') + + global __cublasDtpmv_v2 + __cublasDtpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtpmv_v2') + if __cublasDtpmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtpmv_v2 = dlsym(handle, 'cublasDtpmv_v2') + + global __cublasCtpmv_v2 + __cublasCtpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtpmv_v2') + if __cublasCtpmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtpmv_v2 = dlsym(handle, 'cublasCtpmv_v2') + + global __cublasZtpmv_v2 + __cublasZtpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtpmv_v2') + if __cublasZtpmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtpmv_v2 = dlsym(handle, 'cublasZtpmv_v2') + + global __cublasStrsv_v2 + __cublasStrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasStrsv_v2') + if __cublasStrsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrsv_v2 = dlsym(handle, 'cublasStrsv_v2') + + global __cublasDtrsv_v2 + __cublasDtrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrsv_v2') + if __cublasDtrsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrsv_v2 = dlsym(handle, 'cublasDtrsv_v2') + + global __cublasCtrsv_v2 + __cublasCtrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtrsv_v2') + if __cublasCtrsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrsv_v2 = dlsym(handle, 'cublasCtrsv_v2') + + global __cublasZtrsv_v2 + __cublasZtrsv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrsv_v2') + if __cublasZtrsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrsv_v2 = dlsym(handle, 'cublasZtrsv_v2') + + global __cublasStpsv_v2 + __cublasStpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasStpsv_v2') + if __cublasStpsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStpsv_v2 = dlsym(handle, 'cublasStpsv_v2') + + global __cublasDtpsv_v2 + __cublasDtpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtpsv_v2') + if __cublasDtpsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__cublasDtpsv_v2 = dlsym(handle, 'cublasDtpsv_v2') + + global __cublasCtpsv_v2 + __cublasCtpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtpsv_v2') + if __cublasCtpsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtpsv_v2 = dlsym(handle, 'cublasCtpsv_v2') + + global __cublasZtpsv_v2 + __cublasZtpsv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtpsv_v2') + if __cublasZtpsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtpsv_v2 = dlsym(handle, 'cublasZtpsv_v2') + + global __cublasStbsv_v2 + __cublasStbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasStbsv_v2') + if __cublasStbsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStbsv_v2 = dlsym(handle, 'cublasStbsv_v2') + + global __cublasDtbsv_v2 + __cublasDtbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasDtbsv_v2') + if __cublasDtbsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtbsv_v2 = dlsym(handle, 'cublasDtbsv_v2') + + global __cublasCtbsv_v2 + __cublasCtbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasCtbsv_v2') + if __cublasCtbsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtbsv_v2 = dlsym(handle, 'cublasCtbsv_v2') + + global __cublasZtbsv_v2 + __cublasZtbsv_v2 = dlsym(RTLD_DEFAULT, 'cublasZtbsv_v2') + if __cublasZtbsv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtbsv_v2 = dlsym(handle, 'cublasZtbsv_v2') + + global __cublasSsymv_v2 + __cublasSsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasSsymv_v2') + if __cublasSsymv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsymv_v2 = dlsym(handle, 'cublasSsymv_v2') + + global __cublasDsymv_v2 + __cublasDsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasDsymv_v2') + if __cublasDsymv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsymv_v2 = dlsym(handle, 'cublasDsymv_v2') + + global __cublasCsymv_v2 + __cublasCsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasCsymv_v2') + if __cublasCsymv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsymv_v2 = dlsym(handle, 'cublasCsymv_v2') + + global __cublasZsymv_v2 + __cublasZsymv_v2 = dlsym(RTLD_DEFAULT, 'cublasZsymv_v2') + if __cublasZsymv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsymv_v2 = dlsym(handle, 'cublasZsymv_v2') + + global __cublasChemv_v2 + __cublasChemv_v2 = dlsym(RTLD_DEFAULT, 'cublasChemv_v2') + if __cublasChemv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChemv_v2 = dlsym(handle, 'cublasChemv_v2') + + global __cublasZhemv_v2 + __cublasZhemv_v2 = dlsym(RTLD_DEFAULT, 'cublasZhemv_v2') + if __cublasZhemv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhemv_v2 = dlsym(handle, 'cublasZhemv_v2') + + global __cublasSsbmv_v2 + __cublasSsbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasSsbmv_v2') + if __cublasSsbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsbmv_v2 = dlsym(handle, 'cublasSsbmv_v2') + + global __cublasDsbmv_v2 + __cublasDsbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDsbmv_v2') + if __cublasDsbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsbmv_v2 = dlsym(handle, 'cublasDsbmv_v2') + + global __cublasChbmv_v2 + __cublasChbmv_v2 = dlsym(RTLD_DEFAULT, 'cublasChbmv_v2') + if __cublasChbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChbmv_v2 = dlsym(handle, 'cublasChbmv_v2') + + global __cublasZhbmv_v2 + __cublasZhbmv_v2 = dlsym(RTLD_DEFAULT, 
'cublasZhbmv_v2') + if __cublasZhbmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhbmv_v2 = dlsym(handle, 'cublasZhbmv_v2') + + global __cublasSspmv_v2 + __cublasSspmv_v2 = dlsym(RTLD_DEFAULT, 'cublasSspmv_v2') + if __cublasSspmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSspmv_v2 = dlsym(handle, 'cublasSspmv_v2') + + global __cublasDspmv_v2 + __cublasDspmv_v2 = dlsym(RTLD_DEFAULT, 'cublasDspmv_v2') + if __cublasDspmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDspmv_v2 = dlsym(handle, 'cublasDspmv_v2') + + global __cublasChpmv_v2 + __cublasChpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasChpmv_v2') + if __cublasChpmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChpmv_v2 = dlsym(handle, 'cublasChpmv_v2') + + global __cublasZhpmv_v2 + __cublasZhpmv_v2 = dlsym(RTLD_DEFAULT, 'cublasZhpmv_v2') + if __cublasZhpmv_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhpmv_v2 = dlsym(handle, 'cublasZhpmv_v2') + + global __cublasSger_v2 + __cublasSger_v2 = dlsym(RTLD_DEFAULT, 'cublasSger_v2') + if __cublasSger_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSger_v2 = dlsym(handle, 'cublasSger_v2') + + global __cublasDger_v2 + __cublasDger_v2 = dlsym(RTLD_DEFAULT, 'cublasDger_v2') + if __cublasDger_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDger_v2 = dlsym(handle, 'cublasDger_v2') + + global __cublasCgeru_v2 + __cublasCgeru_v2 = dlsym(RTLD_DEFAULT, 'cublasCgeru_v2') + if __cublasCgeru_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgeru_v2 = dlsym(handle, 'cublasCgeru_v2') + + global __cublasCgerc_v2 + __cublasCgerc_v2 = dlsym(RTLD_DEFAULT, 'cublasCgerc_v2') + if __cublasCgerc_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgerc_v2 = dlsym(handle, 'cublasCgerc_v2') + + global __cublasZgeru_v2 + __cublasZgeru_v2 = dlsym(RTLD_DEFAULT, 'cublasZgeru_v2') + if __cublasZgeru_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgeru_v2 = dlsym(handle, 'cublasZgeru_v2') + + global __cublasZgerc_v2 + __cublasZgerc_v2 = dlsym(RTLD_DEFAULT, 'cublasZgerc_v2') + if __cublasZgerc_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgerc_v2 = dlsym(handle, 'cublasZgerc_v2') + + global __cublasSsyr_v2 + __cublasSsyr_v2 = dlsym(RTLD_DEFAULT, 'cublasSsyr_v2') + if __cublasSsyr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyr_v2 = dlsym(handle, 'cublasSsyr_v2') + + global __cublasDsyr_v2 + __cublasDsyr_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyr_v2') + if __cublasDsyr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyr_v2 = dlsym(handle, 'cublasDsyr_v2') + + global __cublasCsyr_v2 + __cublasCsyr_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyr_v2') + if __cublasCsyr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyr_v2 = dlsym(handle, 'cublasCsyr_v2') + + global __cublasZsyr_v2 + __cublasZsyr_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyr_v2') + if __cublasZsyr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyr_v2 = dlsym(handle, 'cublasZsyr_v2') + + global __cublasCher_v2 + __cublasCher_v2 = dlsym(RTLD_DEFAULT, 'cublasCher_v2') + if __cublasCher_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCher_v2 = dlsym(handle, 'cublasCher_v2') + + global 
__cublasZher_v2 + __cublasZher_v2 = dlsym(RTLD_DEFAULT, 'cublasZher_v2') + if __cublasZher_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZher_v2 = dlsym(handle, 'cublasZher_v2') + + global __cublasSspr_v2 + __cublasSspr_v2 = dlsym(RTLD_DEFAULT, 'cublasSspr_v2') + if __cublasSspr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSspr_v2 = dlsym(handle, 'cublasSspr_v2') + + global __cublasDspr_v2 + __cublasDspr_v2 = dlsym(RTLD_DEFAULT, 'cublasDspr_v2') + if __cublasDspr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDspr_v2 = dlsym(handle, 'cublasDspr_v2') + + global __cublasChpr_v2 + __cublasChpr_v2 = dlsym(RTLD_DEFAULT, 'cublasChpr_v2') + if __cublasChpr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChpr_v2 = dlsym(handle, 'cublasChpr_v2') + + global __cublasZhpr_v2 + __cublasZhpr_v2 = dlsym(RTLD_DEFAULT, 'cublasZhpr_v2') + if __cublasZhpr_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhpr_v2 = dlsym(handle, 'cublasZhpr_v2') + + global __cublasSsyr2_v2 + __cublasSsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasSsyr2_v2') + if __cublasSsyr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyr2_v2 = dlsym(handle, 'cublasSsyr2_v2') + + global __cublasDsyr2_v2 + __cublasDsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyr2_v2') + if __cublasDsyr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyr2_v2 = dlsym(handle, 'cublasDsyr2_v2') + + global __cublasCsyr2_v2 + __cublasCsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyr2_v2') + if __cublasCsyr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyr2_v2 = dlsym(handle, 'cublasCsyr2_v2') + + global __cublasZsyr2_v2 + __cublasZsyr2_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyr2_v2') + if __cublasZsyr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyr2_v2 = dlsym(handle, 'cublasZsyr2_v2') + + global __cublasCher2_v2 + __cublasCher2_v2 = dlsym(RTLD_DEFAULT, 'cublasCher2_v2') + if __cublasCher2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCher2_v2 = dlsym(handle, 'cublasCher2_v2') + + global __cublasZher2_v2 + __cublasZher2_v2 = dlsym(RTLD_DEFAULT, 'cublasZher2_v2') + if __cublasZher2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZher2_v2 = dlsym(handle, 'cublasZher2_v2') + + global __cublasSspr2_v2 + __cublasSspr2_v2 = dlsym(RTLD_DEFAULT, 'cublasSspr2_v2') + if __cublasSspr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSspr2_v2 = dlsym(handle, 'cublasSspr2_v2') + + global __cublasDspr2_v2 + __cublasDspr2_v2 = dlsym(RTLD_DEFAULT, 'cublasDspr2_v2') + if __cublasDspr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDspr2_v2 = dlsym(handle, 'cublasDspr2_v2') + + global __cublasChpr2_v2 + __cublasChpr2_v2 = dlsym(RTLD_DEFAULT, 'cublasChpr2_v2') + if __cublasChpr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChpr2_v2 = dlsym(handle, 'cublasChpr2_v2') + + global __cublasZhpr2_v2 + __cublasZhpr2_v2 = dlsym(RTLD_DEFAULT, 'cublasZhpr2_v2') + if __cublasZhpr2_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhpr2_v2 = dlsym(handle, 'cublasZhpr2_v2') + + global __cublasSgemm_v2 + __cublasSgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasSgemm_v2') + if __cublasSgemm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) 
+ __cublasSgemm_v2 = dlsym(handle, 'cublasSgemm_v2') + + global __cublasDgemm_v2 + __cublasDgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasDgemm_v2') + if __cublasDgemm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemm_v2 = dlsym(handle, 'cublasDgemm_v2') + + global __cublasCgemm_v2 + __cublasCgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasCgemm_v2') + if __cublasCgemm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm_v2 = dlsym(handle, 'cublasCgemm_v2') + + global __cublasCgemm3m + __cublasCgemm3m = dlsym(RTLD_DEFAULT, 'cublasCgemm3m') + if __cublasCgemm3m == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3m = dlsym(handle, 'cublasCgemm3m') + + global __cublasCgemm3mEx + __cublasCgemm3mEx = dlsym(RTLD_DEFAULT, 'cublasCgemm3mEx') + if __cublasCgemm3mEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3mEx = dlsym(handle, 'cublasCgemm3mEx') + + global __cublasZgemm_v2 + __cublasZgemm_v2 = dlsym(RTLD_DEFAULT, 'cublasZgemm_v2') + if __cublasZgemm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemm_v2 = dlsym(handle, 'cublasZgemm_v2') + + global __cublasZgemm3m + __cublasZgemm3m = dlsym(RTLD_DEFAULT, 'cublasZgemm3m') + if __cublasZgemm3m == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemm3m = dlsym(handle, 'cublasZgemm3m') + + global __cublasSgemmEx + __cublasSgemmEx = dlsym(RTLD_DEFAULT, 'cublasSgemmEx') + if __cublasSgemmEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmEx = dlsym(handle, 'cublasSgemmEx') + + global __cublasGemmEx + __cublasGemmEx = dlsym(RTLD_DEFAULT, 'cublasGemmEx') + if __cublasGemmEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmEx = dlsym(handle, 'cublasGemmEx') + + global __cublasCgemmEx + __cublasCgemmEx = dlsym(RTLD_DEFAULT, 'cublasCgemmEx') + if __cublasCgemmEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemmEx = dlsym(handle, 'cublasCgemmEx') + + global __cublasUint8gemmBias + __cublasUint8gemmBias = dlsym(RTLD_DEFAULT, 'cublasUint8gemmBias') + if __cublasUint8gemmBias == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasUint8gemmBias = dlsym(handle, 'cublasUint8gemmBias') + + global __cublasSsyrk_v2 + __cublasSsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasSsyrk_v2') + if __cublasSsyrk_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyrk_v2 = dlsym(handle, 'cublasSsyrk_v2') + + global __cublasDsyrk_v2 + __cublasDsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyrk_v2') + if __cublasDsyrk_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyrk_v2 = dlsym(handle, 'cublasDsyrk_v2') + + global __cublasCsyrk_v2 + __cublasCsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyrk_v2') + if __cublasCsyrk_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrk_v2 = dlsym(handle, 'cublasCsyrk_v2') + + global __cublasZsyrk_v2 + __cublasZsyrk_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyrk_v2') + if __cublasZsyrk_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyrk_v2 = dlsym(handle, 'cublasZsyrk_v2') + + global __cublasCsyrkEx + __cublasCsyrkEx = dlsym(RTLD_DEFAULT, 'cublasCsyrkEx') + if __cublasCsyrkEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrkEx = dlsym(handle, 'cublasCsyrkEx') + + global __cublasCsyrk3mEx + __cublasCsyrk3mEx = dlsym(RTLD_DEFAULT, 
'cublasCsyrk3mEx') + if __cublasCsyrk3mEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrk3mEx = dlsym(handle, 'cublasCsyrk3mEx') + + global __cublasCherk_v2 + __cublasCherk_v2 = dlsym(RTLD_DEFAULT, 'cublasCherk_v2') + if __cublasCherk_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherk_v2 = dlsym(handle, 'cublasCherk_v2') + + global __cublasZherk_v2 + __cublasZherk_v2 = dlsym(RTLD_DEFAULT, 'cublasZherk_v2') + if __cublasZherk_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZherk_v2 = dlsym(handle, 'cublasZherk_v2') + + global __cublasCherkEx + __cublasCherkEx = dlsym(RTLD_DEFAULT, 'cublasCherkEx') + if __cublasCherkEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherkEx = dlsym(handle, 'cublasCherkEx') + + global __cublasCherk3mEx + __cublasCherk3mEx = dlsym(RTLD_DEFAULT, 'cublasCherk3mEx') + if __cublasCherk3mEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherk3mEx = dlsym(handle, 'cublasCherk3mEx') + + global __cublasSsyr2k_v2 + __cublasSsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasSsyr2k_v2') + if __cublasSsyr2k_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyr2k_v2 = dlsym(handle, 'cublasSsyr2k_v2') + + global __cublasDsyr2k_v2 + __cublasDsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasDsyr2k_v2') + if __cublasDsyr2k_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyr2k_v2 = dlsym(handle, 'cublasDsyr2k_v2') + + global __cublasCsyr2k_v2 + __cublasCsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasCsyr2k_v2') + if __cublasCsyr2k_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyr2k_v2 = dlsym(handle, 'cublasCsyr2k_v2') + + global __cublasZsyr2k_v2 + __cublasZsyr2k_v2 = dlsym(RTLD_DEFAULT, 'cublasZsyr2k_v2') + if __cublasZsyr2k_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyr2k_v2 = dlsym(handle, 'cublasZsyr2k_v2') + + global __cublasCher2k_v2 + __cublasCher2k_v2 = dlsym(RTLD_DEFAULT, 'cublasCher2k_v2') + if __cublasCher2k_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCher2k_v2 = dlsym(handle, 'cublasCher2k_v2') + + global __cublasZher2k_v2 + __cublasZher2k_v2 = dlsym(RTLD_DEFAULT, 'cublasZher2k_v2') + if __cublasZher2k_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZher2k_v2 = dlsym(handle, 'cublasZher2k_v2') + + global __cublasSsyrkx + __cublasSsyrkx = dlsym(RTLD_DEFAULT, 'cublasSsyrkx') + if __cublasSsyrkx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyrkx = dlsym(handle, 'cublasSsyrkx') + + global __cublasDsyrkx + __cublasDsyrkx = dlsym(RTLD_DEFAULT, 'cublasDsyrkx') + if __cublasDsyrkx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyrkx = dlsym(handle, 'cublasDsyrkx') + + global __cublasCsyrkx + __cublasCsyrkx = dlsym(RTLD_DEFAULT, 'cublasCsyrkx') + if __cublasCsyrkx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrkx = dlsym(handle, 'cublasCsyrkx') + + global __cublasZsyrkx + __cublasZsyrkx = dlsym(RTLD_DEFAULT, 'cublasZsyrkx') + if __cublasZsyrkx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyrkx = dlsym(handle, 'cublasZsyrkx') + + global __cublasCherkx + __cublasCherkx = dlsym(RTLD_DEFAULT, 'cublasCherkx') + if __cublasCherkx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherkx = dlsym(handle, 
'cublasCherkx') + + global __cublasZherkx + __cublasZherkx = dlsym(RTLD_DEFAULT, 'cublasZherkx') + if __cublasZherkx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZherkx = dlsym(handle, 'cublasZherkx') + + global __cublasSsymm_v2 + __cublasSsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasSsymm_v2') + if __cublasSsymm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsymm_v2 = dlsym(handle, 'cublasSsymm_v2') + + global __cublasDsymm_v2 + __cublasDsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasDsymm_v2') + if __cublasDsymm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsymm_v2 = dlsym(handle, 'cublasDsymm_v2') + + global __cublasCsymm_v2 + __cublasCsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasCsymm_v2') + if __cublasCsymm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsymm_v2 = dlsym(handle, 'cublasCsymm_v2') + + global __cublasZsymm_v2 + __cublasZsymm_v2 = dlsym(RTLD_DEFAULT, 'cublasZsymm_v2') + if __cublasZsymm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsymm_v2 = dlsym(handle, 'cublasZsymm_v2') + + global __cublasChemm_v2 + __cublasChemm_v2 = dlsym(RTLD_DEFAULT, 'cublasChemm_v2') + if __cublasChemm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChemm_v2 = dlsym(handle, 'cublasChemm_v2') + + global __cublasZhemm_v2 + __cublasZhemm_v2 = dlsym(RTLD_DEFAULT, 'cublasZhemm_v2') + if __cublasZhemm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhemm_v2 = dlsym(handle, 'cublasZhemm_v2') + + global __cublasStrsm_v2 + __cublasStrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasStrsm_v2') + if __cublasStrsm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrsm_v2 = dlsym(handle, 'cublasStrsm_v2') + + global __cublasDtrsm_v2 + __cublasDtrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrsm_v2') + if __cublasDtrsm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrsm_v2 = dlsym(handle, 'cublasDtrsm_v2') + + global __cublasCtrsm_v2 + __cublasCtrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasCtrsm_v2') + if __cublasCtrsm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrsm_v2 = dlsym(handle, 'cublasCtrsm_v2') + + global __cublasZtrsm_v2 + __cublasZtrsm_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrsm_v2') + if __cublasZtrsm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrsm_v2 = dlsym(handle, 'cublasZtrsm_v2') + + global __cublasStrmm_v2 + __cublasStrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasStrmm_v2') + if __cublasStrmm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrmm_v2 = dlsym(handle, 'cublasStrmm_v2') + + global __cublasDtrmm_v2 + __cublasDtrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasDtrmm_v2') + if __cublasDtrmm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrmm_v2 = dlsym(handle, 'cublasDtrmm_v2') + + global __cublasCtrmm_v2 + __cublasCtrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasCtrmm_v2') + if __cublasCtrmm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrmm_v2 = dlsym(handle, 'cublasCtrmm_v2') + + global __cublasZtrmm_v2 + __cublasZtrmm_v2 = dlsym(RTLD_DEFAULT, 'cublasZtrmm_v2') + if __cublasZtrmm_v2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrmm_v2 = dlsym(handle, 'cublasZtrmm_v2') + + global __cublasSgemmBatched + __cublasSgemmBatched = dlsym(RTLD_DEFAULT, 'cublasSgemmBatched') + if __cublasSgemmBatched == 
NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmBatched = dlsym(handle, 'cublasSgemmBatched') + + global __cublasDgemmBatched + __cublasDgemmBatched = dlsym(RTLD_DEFAULT, 'cublasDgemmBatched') + if __cublasDgemmBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemmBatched = dlsym(handle, 'cublasDgemmBatched') + + global __cublasCgemmBatched + __cublasCgemmBatched = dlsym(RTLD_DEFAULT, 'cublasCgemmBatched') + if __cublasCgemmBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemmBatched = dlsym(handle, 'cublasCgemmBatched') + + global __cublasCgemm3mBatched + __cublasCgemm3mBatched = dlsym(RTLD_DEFAULT, 'cublasCgemm3mBatched') + if __cublasCgemm3mBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3mBatched = dlsym(handle, 'cublasCgemm3mBatched') + + global __cublasZgemmBatched + __cublasZgemmBatched = dlsym(RTLD_DEFAULT, 'cublasZgemmBatched') + if __cublasZgemmBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemmBatched = dlsym(handle, 'cublasZgemmBatched') + + global __cublasGemmBatchedEx + __cublasGemmBatchedEx = dlsym(RTLD_DEFAULT, 'cublasGemmBatchedEx') + if __cublasGemmBatchedEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmBatchedEx = dlsym(handle, 'cublasGemmBatchedEx') + + global __cublasGemmStridedBatchedEx + __cublasGemmStridedBatchedEx = dlsym(RTLD_DEFAULT, 'cublasGemmStridedBatchedEx') + if __cublasGemmStridedBatchedEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmStridedBatchedEx = dlsym(handle, 'cublasGemmStridedBatchedEx') + + global __cublasSgemmStridedBatched + __cublasSgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasSgemmStridedBatched') + if __cublasSgemmStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmStridedBatched = dlsym(handle, 'cublasSgemmStridedBatched') + + global __cublasDgemmStridedBatched + __cublasDgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasDgemmStridedBatched') + if __cublasDgemmStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemmStridedBatched = dlsym(handle, 'cublasDgemmStridedBatched') + + global __cublasCgemmStridedBatched + __cublasCgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasCgemmStridedBatched') + if __cublasCgemmStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemmStridedBatched = dlsym(handle, 'cublasCgemmStridedBatched') + + global __cublasCgemm3mStridedBatched + __cublasCgemm3mStridedBatched = dlsym(RTLD_DEFAULT, 'cublasCgemm3mStridedBatched') + if __cublasCgemm3mStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3mStridedBatched = dlsym(handle, 'cublasCgemm3mStridedBatched') + + global __cublasZgemmStridedBatched + __cublasZgemmStridedBatched = dlsym(RTLD_DEFAULT, 'cublasZgemmStridedBatched') + if __cublasZgemmStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemmStridedBatched = dlsym(handle, 'cublasZgemmStridedBatched') + + global __cublasSgeam + __cublasSgeam = dlsym(RTLD_DEFAULT, 'cublasSgeam') + if __cublasSgeam == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgeam = dlsym(handle, 'cublasSgeam') + + global __cublasDgeam + __cublasDgeam = dlsym(RTLD_DEFAULT, 'cublasDgeam') + if __cublasDgeam == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cublasDgeam = dlsym(handle, 'cublasDgeam') + + global __cublasCgeam + __cublasCgeam = dlsym(RTLD_DEFAULT, 'cublasCgeam') + if __cublasCgeam == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgeam = dlsym(handle, 'cublasCgeam') + + global __cublasZgeam + __cublasZgeam = dlsym(RTLD_DEFAULT, 'cublasZgeam') + if __cublasZgeam == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgeam = dlsym(handle, 'cublasZgeam') + + global __cublasSgetrfBatched + __cublasSgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasSgetrfBatched') + if __cublasSgetrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgetrfBatched = dlsym(handle, 'cublasSgetrfBatched') + + global __cublasDgetrfBatched + __cublasDgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasDgetrfBatched') + if __cublasDgetrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgetrfBatched = dlsym(handle, 'cublasDgetrfBatched') + + global __cublasCgetrfBatched + __cublasCgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasCgetrfBatched') + if __cublasCgetrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgetrfBatched = dlsym(handle, 'cublasCgetrfBatched') + + global __cublasZgetrfBatched + __cublasZgetrfBatched = dlsym(RTLD_DEFAULT, 'cublasZgetrfBatched') + if __cublasZgetrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgetrfBatched = dlsym(handle, 'cublasZgetrfBatched') + + global __cublasSgetriBatched + __cublasSgetriBatched = dlsym(RTLD_DEFAULT, 'cublasSgetriBatched') + if __cublasSgetriBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgetriBatched = dlsym(handle, 'cublasSgetriBatched') + + global __cublasDgetriBatched + __cublasDgetriBatched = dlsym(RTLD_DEFAULT, 'cublasDgetriBatched') + if __cublasDgetriBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgetriBatched = dlsym(handle, 'cublasDgetriBatched') + + global __cublasCgetriBatched + __cublasCgetriBatched = dlsym(RTLD_DEFAULT, 'cublasCgetriBatched') + if __cublasCgetriBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgetriBatched = dlsym(handle, 'cublasCgetriBatched') + + global __cublasZgetriBatched + __cublasZgetriBatched = dlsym(RTLD_DEFAULT, 'cublasZgetriBatched') + if __cublasZgetriBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgetriBatched = dlsym(handle, 'cublasZgetriBatched') + + global __cublasSgetrsBatched + __cublasSgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasSgetrsBatched') + if __cublasSgetrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgetrsBatched = dlsym(handle, 'cublasSgetrsBatched') + + global __cublasDgetrsBatched + __cublasDgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasDgetrsBatched') + if __cublasDgetrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgetrsBatched = dlsym(handle, 'cublasDgetrsBatched') + + global __cublasCgetrsBatched + __cublasCgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasCgetrsBatched') + if __cublasCgetrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgetrsBatched = dlsym(handle, 'cublasCgetrsBatched') + + global __cublasZgetrsBatched + __cublasZgetrsBatched = dlsym(RTLD_DEFAULT, 'cublasZgetrsBatched') + if __cublasZgetrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgetrsBatched = 
dlsym(handle, 'cublasZgetrsBatched') + + global __cublasStrsmBatched + __cublasStrsmBatched = dlsym(RTLD_DEFAULT, 'cublasStrsmBatched') + if __cublasStrsmBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrsmBatched = dlsym(handle, 'cublasStrsmBatched') + + global __cublasDtrsmBatched + __cublasDtrsmBatched = dlsym(RTLD_DEFAULT, 'cublasDtrsmBatched') + if __cublasDtrsmBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrsmBatched = dlsym(handle, 'cublasDtrsmBatched') + + global __cublasCtrsmBatched + __cublasCtrsmBatched = dlsym(RTLD_DEFAULT, 'cublasCtrsmBatched') + if __cublasCtrsmBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrsmBatched = dlsym(handle, 'cublasCtrsmBatched') + + global __cublasZtrsmBatched + __cublasZtrsmBatched = dlsym(RTLD_DEFAULT, 'cublasZtrsmBatched') + if __cublasZtrsmBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrsmBatched = dlsym(handle, 'cublasZtrsmBatched') + + global __cublasSmatinvBatched + __cublasSmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasSmatinvBatched') + if __cublasSmatinvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSmatinvBatched = dlsym(handle, 'cublasSmatinvBatched') + + global __cublasDmatinvBatched + __cublasDmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasDmatinvBatched') + if __cublasDmatinvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDmatinvBatched = dlsym(handle, 'cublasDmatinvBatched') + + global __cublasCmatinvBatched + __cublasCmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasCmatinvBatched') + if __cublasCmatinvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCmatinvBatched = dlsym(handle, 'cublasCmatinvBatched') + + global __cublasZmatinvBatched + __cublasZmatinvBatched = dlsym(RTLD_DEFAULT, 'cublasZmatinvBatched') + if __cublasZmatinvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZmatinvBatched = dlsym(handle, 'cublasZmatinvBatched') + + global __cublasSgeqrfBatched + __cublasSgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasSgeqrfBatched') + if __cublasSgeqrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgeqrfBatched = dlsym(handle, 'cublasSgeqrfBatched') + + global __cublasDgeqrfBatched + __cublasDgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasDgeqrfBatched') + if __cublasDgeqrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgeqrfBatched = dlsym(handle, 'cublasDgeqrfBatched') + + global __cublasCgeqrfBatched + __cublasCgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasCgeqrfBatched') + if __cublasCgeqrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgeqrfBatched = dlsym(handle, 'cublasCgeqrfBatched') + + global __cublasZgeqrfBatched + __cublasZgeqrfBatched = dlsym(RTLD_DEFAULT, 'cublasZgeqrfBatched') + if __cublasZgeqrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgeqrfBatched = dlsym(handle, 'cublasZgeqrfBatched') + + global __cublasSgelsBatched + __cublasSgelsBatched = dlsym(RTLD_DEFAULT, 'cublasSgelsBatched') + if __cublasSgelsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgelsBatched = dlsym(handle, 'cublasSgelsBatched') + + global __cublasDgelsBatched + __cublasDgelsBatched = dlsym(RTLD_DEFAULT, 'cublasDgelsBatched') + if __cublasDgelsBatched == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cublasDgelsBatched = dlsym(handle, 'cublasDgelsBatched') + + global __cublasCgelsBatched + __cublasCgelsBatched = dlsym(RTLD_DEFAULT, 'cublasCgelsBatched') + if __cublasCgelsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgelsBatched = dlsym(handle, 'cublasCgelsBatched') + + global __cublasZgelsBatched + __cublasZgelsBatched = dlsym(RTLD_DEFAULT, 'cublasZgelsBatched') + if __cublasZgelsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgelsBatched = dlsym(handle, 'cublasZgelsBatched') + + global __cublasSdgmm + __cublasSdgmm = dlsym(RTLD_DEFAULT, 'cublasSdgmm') + if __cublasSdgmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSdgmm = dlsym(handle, 'cublasSdgmm') + + global __cublasDdgmm + __cublasDdgmm = dlsym(RTLD_DEFAULT, 'cublasDdgmm') + if __cublasDdgmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDdgmm = dlsym(handle, 'cublasDdgmm') + + global __cublasCdgmm + __cublasCdgmm = dlsym(RTLD_DEFAULT, 'cublasCdgmm') + if __cublasCdgmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCdgmm = dlsym(handle, 'cublasCdgmm') + + global __cublasZdgmm + __cublasZdgmm = dlsym(RTLD_DEFAULT, 'cublasZdgmm') + if __cublasZdgmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdgmm = dlsym(handle, 'cublasZdgmm') + + global __cublasStpttr + __cublasStpttr = dlsym(RTLD_DEFAULT, 'cublasStpttr') + if __cublasStpttr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStpttr = dlsym(handle, 'cublasStpttr') + + global __cublasDtpttr + __cublasDtpttr = dlsym(RTLD_DEFAULT, 'cublasDtpttr') + if __cublasDtpttr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtpttr = dlsym(handle, 'cublasDtpttr') + + global __cublasCtpttr + __cublasCtpttr = dlsym(RTLD_DEFAULT, 'cublasCtpttr') + if __cublasCtpttr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtpttr = dlsym(handle, 'cublasCtpttr') + + global __cublasZtpttr + __cublasZtpttr = dlsym(RTLD_DEFAULT, 'cublasZtpttr') + if __cublasZtpttr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtpttr = dlsym(handle, 'cublasZtpttr') + + global __cublasStrttp + __cublasStrttp = dlsym(RTLD_DEFAULT, 'cublasStrttp') + if __cublasStrttp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrttp = dlsym(handle, 'cublasStrttp') + + global __cublasDtrttp + __cublasDtrttp = dlsym(RTLD_DEFAULT, 'cublasDtrttp') + if __cublasDtrttp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrttp = dlsym(handle, 'cublasDtrttp') + + global __cublasCtrttp + __cublasCtrttp = dlsym(RTLD_DEFAULT, 'cublasCtrttp') + if __cublasCtrttp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrttp = dlsym(handle, 'cublasCtrttp') + + global __cublasZtrttp + __cublasZtrttp = dlsym(RTLD_DEFAULT, 'cublasZtrttp') + if __cublasZtrttp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrttp = dlsym(handle, 'cublasZtrttp') + + global __cublasGetSmCountTarget + __cublasGetSmCountTarget = dlsym(RTLD_DEFAULT, 'cublasGetSmCountTarget') + if __cublasGetSmCountTarget == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetSmCountTarget = dlsym(handle, 'cublasGetSmCountTarget') + + global __cublasSetSmCountTarget + __cublasSetSmCountTarget = dlsym(RTLD_DEFAULT, 'cublasSetSmCountTarget') + if 
__cublasSetSmCountTarget == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetSmCountTarget = dlsym(handle, 'cublasSetSmCountTarget') + + global __cublasGetStatusName + __cublasGetStatusName = dlsym(RTLD_DEFAULT, 'cublasGetStatusName') + if __cublasGetStatusName == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetStatusName = dlsym(handle, 'cublasGetStatusName') + + global __cublasGetStatusString + __cublasGetStatusString = dlsym(RTLD_DEFAULT, 'cublasGetStatusString') + if __cublasGetStatusString == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetStatusString = dlsym(handle, 'cublasGetStatusString') + + global __cublasSgemvBatched + __cublasSgemvBatched = dlsym(RTLD_DEFAULT, 'cublasSgemvBatched') + if __cublasSgemvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemvBatched = dlsym(handle, 'cublasSgemvBatched') + + global __cublasDgemvBatched + __cublasDgemvBatched = dlsym(RTLD_DEFAULT, 'cublasDgemvBatched') + if __cublasDgemvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemvBatched = dlsym(handle, 'cublasDgemvBatched') + + global __cublasCgemvBatched + __cublasCgemvBatched = dlsym(RTLD_DEFAULT, 'cublasCgemvBatched') + if __cublasCgemvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemvBatched = dlsym(handle, 'cublasCgemvBatched') + + global __cublasZgemvBatched + __cublasZgemvBatched = dlsym(RTLD_DEFAULT, 'cublasZgemvBatched') + if __cublasZgemvBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemvBatched = dlsym(handle, 'cublasZgemvBatched') + + global __cublasSgemvStridedBatched + __cublasSgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasSgemvStridedBatched') + if __cublasSgemvStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemvStridedBatched = dlsym(handle, 'cublasSgemvStridedBatched') + + global __cublasDgemvStridedBatched + __cublasDgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasDgemvStridedBatched') + if __cublasDgemvStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemvStridedBatched = dlsym(handle, 'cublasDgemvStridedBatched') + + global __cublasCgemvStridedBatched + __cublasCgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasCgemvStridedBatched') + if __cublasCgemvStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemvStridedBatched = dlsym(handle, 'cublasCgemvStridedBatched') + + global __cublasZgemvStridedBatched + __cublasZgemvStridedBatched = dlsym(RTLD_DEFAULT, 'cublasZgemvStridedBatched') + if __cublasZgemvStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemvStridedBatched = dlsym(handle, 'cublasZgemvStridedBatched') + + global __cublasSetVector_64 + __cublasSetVector_64 = dlsym(RTLD_DEFAULT, 'cublasSetVector_64') + if __cublasSetVector_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetVector_64 = dlsym(handle, 'cublasSetVector_64') + + global __cublasGetVector_64 + __cublasGetVector_64 = dlsym(RTLD_DEFAULT, 'cublasGetVector_64') + if __cublasGetVector_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetVector_64 = dlsym(handle, 'cublasGetVector_64') + + global __cublasSetMatrix_64 + __cublasSetMatrix_64 = dlsym(RTLD_DEFAULT, 'cublasSetMatrix_64') + if __cublasSetMatrix_64 == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cublasSetMatrix_64 = dlsym(handle, 'cublasSetMatrix_64') + + global __cublasGetMatrix_64 + __cublasGetMatrix_64 = dlsym(RTLD_DEFAULT, 'cublasGetMatrix_64') + if __cublasGetMatrix_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetMatrix_64 = dlsym(handle, 'cublasGetMatrix_64') + + global __cublasSetVectorAsync_64 + __cublasSetVectorAsync_64 = dlsym(RTLD_DEFAULT, 'cublasSetVectorAsync_64') + if __cublasSetVectorAsync_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetVectorAsync_64 = dlsym(handle, 'cublasSetVectorAsync_64') + + global __cublasGetVectorAsync_64 + __cublasGetVectorAsync_64 = dlsym(RTLD_DEFAULT, 'cublasGetVectorAsync_64') + if __cublasGetVectorAsync_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetVectorAsync_64 = dlsym(handle, 'cublasGetVectorAsync_64') + + global __cublasSetMatrixAsync_64 + __cublasSetMatrixAsync_64 = dlsym(RTLD_DEFAULT, 'cublasSetMatrixAsync_64') + if __cublasSetMatrixAsync_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetMatrixAsync_64 = dlsym(handle, 'cublasSetMatrixAsync_64') + + global __cublasGetMatrixAsync_64 + __cublasGetMatrixAsync_64 = dlsym(RTLD_DEFAULT, 'cublasGetMatrixAsync_64') + if __cublasGetMatrixAsync_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetMatrixAsync_64 = dlsym(handle, 'cublasGetMatrixAsync_64') + + global __cublasNrm2Ex_64 + __cublasNrm2Ex_64 = dlsym(RTLD_DEFAULT, 'cublasNrm2Ex_64') + if __cublasNrm2Ex_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasNrm2Ex_64 = dlsym(handle, 'cublasNrm2Ex_64') + + global __cublasSnrm2_v2_64 + __cublasSnrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSnrm2_v2_64') + if __cublasSnrm2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSnrm2_v2_64 = dlsym(handle, 'cublasSnrm2_v2_64') + + global __cublasDnrm2_v2_64 + __cublasDnrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDnrm2_v2_64') + if __cublasDnrm2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDnrm2_v2_64 = dlsym(handle, 'cublasDnrm2_v2_64') + + global __cublasScnrm2_v2_64 + __cublasScnrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasScnrm2_v2_64') + if __cublasScnrm2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScnrm2_v2_64 = dlsym(handle, 'cublasScnrm2_v2_64') + + global __cublasDznrm2_v2_64 + __cublasDznrm2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDznrm2_v2_64') + if __cublasDznrm2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDznrm2_v2_64 = dlsym(handle, 'cublasDznrm2_v2_64') + + global __cublasDotEx_64 + __cublasDotEx_64 = dlsym(RTLD_DEFAULT, 'cublasDotEx_64') + if __cublasDotEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDotEx_64 = dlsym(handle, 'cublasDotEx_64') + + global __cublasDotcEx_64 + __cublasDotcEx_64 = dlsym(RTLD_DEFAULT, 'cublasDotcEx_64') + if __cublasDotcEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDotcEx_64 = dlsym(handle, 'cublasDotcEx_64') + + global __cublasSdot_v2_64 + __cublasSdot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSdot_v2_64') + if __cublasSdot_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSdot_v2_64 = dlsym(handle, 'cublasSdot_v2_64') + + global __cublasDdot_v2_64 + __cublasDdot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDdot_v2_64') + if __cublasDdot_v2_64 == NULL: + if handle == NULL: + handle 
= load_library(driver_ver) + __cublasDdot_v2_64 = dlsym(handle, 'cublasDdot_v2_64') + + global __cublasCdotu_v2_64 + __cublasCdotu_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCdotu_v2_64') + if __cublasCdotu_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCdotu_v2_64 = dlsym(handle, 'cublasCdotu_v2_64') + + global __cublasCdotc_v2_64 + __cublasCdotc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCdotc_v2_64') + if __cublasCdotc_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCdotc_v2_64 = dlsym(handle, 'cublasCdotc_v2_64') + + global __cublasZdotu_v2_64 + __cublasZdotu_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdotu_v2_64') + if __cublasZdotu_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdotu_v2_64 = dlsym(handle, 'cublasZdotu_v2_64') + + global __cublasZdotc_v2_64 + __cublasZdotc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdotc_v2_64') + if __cublasZdotc_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdotc_v2_64 = dlsym(handle, 'cublasZdotc_v2_64') + + global __cublasScalEx_64 + __cublasScalEx_64 = dlsym(RTLD_DEFAULT, 'cublasScalEx_64') + if __cublasScalEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScalEx_64 = dlsym(handle, 'cublasScalEx_64') + + global __cublasSscal_v2_64 + __cublasSscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSscal_v2_64') + if __cublasSscal_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSscal_v2_64 = dlsym(handle, 'cublasSscal_v2_64') + + global __cublasDscal_v2_64 + __cublasDscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDscal_v2_64') + if __cublasDscal_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDscal_v2_64 = dlsym(handle, 'cublasDscal_v2_64') + + global __cublasCscal_v2_64 + __cublasCscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCscal_v2_64') + if __cublasCscal_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCscal_v2_64 = dlsym(handle, 'cublasCscal_v2_64') + + global __cublasCsscal_v2_64 + __cublasCsscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsscal_v2_64') + if __cublasCsscal_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsscal_v2_64 = dlsym(handle, 'cublasCsscal_v2_64') + + global __cublasZscal_v2_64 + __cublasZscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZscal_v2_64') + if __cublasZscal_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZscal_v2_64 = dlsym(handle, 'cublasZscal_v2_64') + + global __cublasZdscal_v2_64 + __cublasZdscal_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdscal_v2_64') + if __cublasZdscal_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdscal_v2_64 = dlsym(handle, 'cublasZdscal_v2_64') + + global __cublasAxpyEx_64 + __cublasAxpyEx_64 = dlsym(RTLD_DEFAULT, 'cublasAxpyEx_64') + if __cublasAxpyEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasAxpyEx_64 = dlsym(handle, 'cublasAxpyEx_64') + + global __cublasSaxpy_v2_64 + __cublasSaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSaxpy_v2_64') + if __cublasSaxpy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSaxpy_v2_64 = dlsym(handle, 'cublasSaxpy_v2_64') + + global __cublasDaxpy_v2_64 + __cublasDaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDaxpy_v2_64') + if __cublasDaxpy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDaxpy_v2_64 = dlsym(handle, 'cublasDaxpy_v2_64') + + global __cublasCaxpy_v2_64 + 
__cublasCaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCaxpy_v2_64') + if __cublasCaxpy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCaxpy_v2_64 = dlsym(handle, 'cublasCaxpy_v2_64') + + global __cublasZaxpy_v2_64 + __cublasZaxpy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZaxpy_v2_64') + if __cublasZaxpy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZaxpy_v2_64 = dlsym(handle, 'cublasZaxpy_v2_64') + + global __cublasCopyEx_64 + __cublasCopyEx_64 = dlsym(RTLD_DEFAULT, 'cublasCopyEx_64') + if __cublasCopyEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCopyEx_64 = dlsym(handle, 'cublasCopyEx_64') + + global __cublasScopy_v2_64 + __cublasScopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasScopy_v2_64') + if __cublasScopy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScopy_v2_64 = dlsym(handle, 'cublasScopy_v2_64') + + global __cublasDcopy_v2_64 + __cublasDcopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDcopy_v2_64') + if __cublasDcopy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDcopy_v2_64 = dlsym(handle, 'cublasDcopy_v2_64') + + global __cublasCcopy_v2_64 + __cublasCcopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCcopy_v2_64') + if __cublasCcopy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCcopy_v2_64 = dlsym(handle, 'cublasCcopy_v2_64') + + global __cublasZcopy_v2_64 + __cublasZcopy_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZcopy_v2_64') + if __cublasZcopy_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZcopy_v2_64 = dlsym(handle, 'cublasZcopy_v2_64') + + global __cublasSswap_v2_64 + __cublasSswap_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSswap_v2_64') + if __cublasSswap_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSswap_v2_64 = dlsym(handle, 'cublasSswap_v2_64') + + global __cublasDswap_v2_64 + __cublasDswap_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDswap_v2_64') + if __cublasDswap_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDswap_v2_64 = dlsym(handle, 'cublasDswap_v2_64') + + global __cublasCswap_v2_64 + __cublasCswap_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCswap_v2_64') + if __cublasCswap_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCswap_v2_64 = dlsym(handle, 'cublasCswap_v2_64') + + global __cublasZswap_v2_64 + __cublasZswap_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZswap_v2_64') + if __cublasZswap_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZswap_v2_64 = dlsym(handle, 'cublasZswap_v2_64') + + global __cublasSwapEx_64 + __cublasSwapEx_64 = dlsym(RTLD_DEFAULT, 'cublasSwapEx_64') + if __cublasSwapEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSwapEx_64 = dlsym(handle, 'cublasSwapEx_64') + + global __cublasIsamax_v2_64 + __cublasIsamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIsamax_v2_64') + if __cublasIsamax_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIsamax_v2_64 = dlsym(handle, 'cublasIsamax_v2_64') + + global __cublasIdamax_v2_64 + __cublasIdamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIdamax_v2_64') + if __cublasIdamax_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIdamax_v2_64 = dlsym(handle, 'cublasIdamax_v2_64') + + global __cublasIcamax_v2_64 + __cublasIcamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIcamax_v2_64') + if __cublasIcamax_v2_64 == NULL: + if handle == NULL: 
+ handle = load_library(driver_ver) + __cublasIcamax_v2_64 = dlsym(handle, 'cublasIcamax_v2_64') + + global __cublasIzamax_v2_64 + __cublasIzamax_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIzamax_v2_64') + if __cublasIzamax_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIzamax_v2_64 = dlsym(handle, 'cublasIzamax_v2_64') + + global __cublasIamaxEx_64 + __cublasIamaxEx_64 = dlsym(RTLD_DEFAULT, 'cublasIamaxEx_64') + if __cublasIamaxEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIamaxEx_64 = dlsym(handle, 'cublasIamaxEx_64') + + global __cublasIsamin_v2_64 + __cublasIsamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIsamin_v2_64') + if __cublasIsamin_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIsamin_v2_64 = dlsym(handle, 'cublasIsamin_v2_64') + + global __cublasIdamin_v2_64 + __cublasIdamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIdamin_v2_64') + if __cublasIdamin_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIdamin_v2_64 = dlsym(handle, 'cublasIdamin_v2_64') + + global __cublasIcamin_v2_64 + __cublasIcamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIcamin_v2_64') + if __cublasIcamin_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIcamin_v2_64 = dlsym(handle, 'cublasIcamin_v2_64') + + global __cublasIzamin_v2_64 + __cublasIzamin_v2_64 = dlsym(RTLD_DEFAULT, 'cublasIzamin_v2_64') + if __cublasIzamin_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIzamin_v2_64 = dlsym(handle, 'cublasIzamin_v2_64') + + global __cublasIaminEx_64 + __cublasIaminEx_64 = dlsym(RTLD_DEFAULT, 'cublasIaminEx_64') + if __cublasIaminEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasIaminEx_64 = dlsym(handle, 'cublasIaminEx_64') + + global __cublasAsumEx_64 + __cublasAsumEx_64 = dlsym(RTLD_DEFAULT, 'cublasAsumEx_64') + if __cublasAsumEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasAsumEx_64 = dlsym(handle, 'cublasAsumEx_64') + + global __cublasSasum_v2_64 + __cublasSasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSasum_v2_64') + if __cublasSasum_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSasum_v2_64 = dlsym(handle, 'cublasSasum_v2_64') + + global __cublasDasum_v2_64 + __cublasDasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDasum_v2_64') + if __cublasDasum_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDasum_v2_64 = dlsym(handle, 'cublasDasum_v2_64') + + global __cublasScasum_v2_64 + __cublasScasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasScasum_v2_64') + if __cublasScasum_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasScasum_v2_64 = dlsym(handle, 'cublasScasum_v2_64') + + global __cublasDzasum_v2_64 + __cublasDzasum_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDzasum_v2_64') + if __cublasDzasum_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDzasum_v2_64 = dlsym(handle, 'cublasDzasum_v2_64') + + global __cublasSrot_v2_64 + __cublasSrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSrot_v2_64') + if __cublasSrot_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSrot_v2_64 = dlsym(handle, 'cublasSrot_v2_64') + + global __cublasDrot_v2_64 + __cublasDrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDrot_v2_64') + if __cublasDrot_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDrot_v2_64 = dlsym(handle, 'cublasDrot_v2_64') + + 
global __cublasCrot_v2_64 + __cublasCrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCrot_v2_64') + if __cublasCrot_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCrot_v2_64 = dlsym(handle, 'cublasCrot_v2_64') + + global __cublasCsrot_v2_64 + __cublasCsrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsrot_v2_64') + if __cublasCsrot_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsrot_v2_64 = dlsym(handle, 'cublasCsrot_v2_64') + + global __cublasZrot_v2_64 + __cublasZrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZrot_v2_64') + if __cublasZrot_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZrot_v2_64 = dlsym(handle, 'cublasZrot_v2_64') + + global __cublasZdrot_v2_64 + __cublasZdrot_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZdrot_v2_64') + if __cublasZdrot_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZdrot_v2_64 = dlsym(handle, 'cublasZdrot_v2_64') + + global __cublasRotEx_64 + __cublasRotEx_64 = dlsym(RTLD_DEFAULT, 'cublasRotEx_64') + if __cublasRotEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasRotEx_64 = dlsym(handle, 'cublasRotEx_64') + + global __cublasSrotm_v2_64 + __cublasSrotm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSrotm_v2_64') + if __cublasSrotm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSrotm_v2_64 = dlsym(handle, 'cublasSrotm_v2_64') + + global __cublasDrotm_v2_64 + __cublasDrotm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDrotm_v2_64') + if __cublasDrotm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDrotm_v2_64 = dlsym(handle, 'cublasDrotm_v2_64') + + global __cublasRotmEx_64 + __cublasRotmEx_64 = dlsym(RTLD_DEFAULT, 'cublasRotmEx_64') + if __cublasRotmEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasRotmEx_64 = dlsym(handle, 'cublasRotmEx_64') + + global __cublasSgemv_v2_64 + __cublasSgemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSgemv_v2_64') + if __cublasSgemv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemv_v2_64 = dlsym(handle, 'cublasSgemv_v2_64') + + global __cublasDgemv_v2_64 + __cublasDgemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDgemv_v2_64') + if __cublasDgemv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemv_v2_64 = dlsym(handle, 'cublasDgemv_v2_64') + + global __cublasCgemv_v2_64 + __cublasCgemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgemv_v2_64') + if __cublasCgemv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemv_v2_64 = dlsym(handle, 'cublasCgemv_v2_64') + + global __cublasZgemv_v2_64 + __cublasZgemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgemv_v2_64') + if __cublasZgemv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemv_v2_64 = dlsym(handle, 'cublasZgemv_v2_64') + + global __cublasSgbmv_v2_64 + __cublasSgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSgbmv_v2_64') + if __cublasSgbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgbmv_v2_64 = dlsym(handle, 'cublasSgbmv_v2_64') + + global __cublasDgbmv_v2_64 + __cublasDgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDgbmv_v2_64') + if __cublasDgbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgbmv_v2_64 = dlsym(handle, 'cublasDgbmv_v2_64') + + global __cublasCgbmv_v2_64 + __cublasCgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgbmv_v2_64') + if __cublasCgbmv_v2_64 == NULL: + if handle == NULL: + 
handle = load_library(driver_ver) + __cublasCgbmv_v2_64 = dlsym(handle, 'cublasCgbmv_v2_64') + + global __cublasZgbmv_v2_64 + __cublasZgbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgbmv_v2_64') + if __cublasZgbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgbmv_v2_64 = dlsym(handle, 'cublasZgbmv_v2_64') + + global __cublasStrmv_v2_64 + __cublasStrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrmv_v2_64') + if __cublasStrmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrmv_v2_64 = dlsym(handle, 'cublasStrmv_v2_64') + + global __cublasDtrmv_v2_64 + __cublasDtrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrmv_v2_64') + if __cublasDtrmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrmv_v2_64 = dlsym(handle, 'cublasDtrmv_v2_64') + + global __cublasCtrmv_v2_64 + __cublasCtrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrmv_v2_64') + if __cublasCtrmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrmv_v2_64 = dlsym(handle, 'cublasCtrmv_v2_64') + + global __cublasZtrmv_v2_64 + __cublasZtrmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrmv_v2_64') + if __cublasZtrmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrmv_v2_64 = dlsym(handle, 'cublasZtrmv_v2_64') + + global __cublasStbmv_v2_64 + __cublasStbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStbmv_v2_64') + if __cublasStbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStbmv_v2_64 = dlsym(handle, 'cublasStbmv_v2_64') + + global __cublasDtbmv_v2_64 + __cublasDtbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtbmv_v2_64') + if __cublasDtbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtbmv_v2_64 = dlsym(handle, 'cublasDtbmv_v2_64') + + global __cublasCtbmv_v2_64 + __cublasCtbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtbmv_v2_64') + if __cublasCtbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtbmv_v2_64 = dlsym(handle, 'cublasCtbmv_v2_64') + + global __cublasZtbmv_v2_64 + __cublasZtbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtbmv_v2_64') + if __cublasZtbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtbmv_v2_64 = dlsym(handle, 'cublasZtbmv_v2_64') + + global __cublasStpmv_v2_64 + __cublasStpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStpmv_v2_64') + if __cublasStpmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStpmv_v2_64 = dlsym(handle, 'cublasStpmv_v2_64') + + global __cublasDtpmv_v2_64 + __cublasDtpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtpmv_v2_64') + if __cublasDtpmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtpmv_v2_64 = dlsym(handle, 'cublasDtpmv_v2_64') + + global __cublasCtpmv_v2_64 + __cublasCtpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtpmv_v2_64') + if __cublasCtpmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtpmv_v2_64 = dlsym(handle, 'cublasCtpmv_v2_64') + + global __cublasZtpmv_v2_64 + __cublasZtpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtpmv_v2_64') + if __cublasZtpmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtpmv_v2_64 = dlsym(handle, 'cublasZtpmv_v2_64') + + global __cublasStrsv_v2_64 + __cublasStrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrsv_v2_64') + if __cublasStrsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrsv_v2_64 = dlsym(handle, 'cublasStrsv_v2_64') + + global 
__cublasDtrsv_v2_64 + __cublasDtrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrsv_v2_64') + if __cublasDtrsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrsv_v2_64 = dlsym(handle, 'cublasDtrsv_v2_64') + + global __cublasCtrsv_v2_64 + __cublasCtrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrsv_v2_64') + if __cublasCtrsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrsv_v2_64 = dlsym(handle, 'cublasCtrsv_v2_64') + + global __cublasZtrsv_v2_64 + __cublasZtrsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrsv_v2_64') + if __cublasZtrsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrsv_v2_64 = dlsym(handle, 'cublasZtrsv_v2_64') + + global __cublasStpsv_v2_64 + __cublasStpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStpsv_v2_64') + if __cublasStpsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStpsv_v2_64 = dlsym(handle, 'cublasStpsv_v2_64') + + global __cublasDtpsv_v2_64 + __cublasDtpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtpsv_v2_64') + if __cublasDtpsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtpsv_v2_64 = dlsym(handle, 'cublasDtpsv_v2_64') + + global __cublasCtpsv_v2_64 + __cublasCtpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtpsv_v2_64') + if __cublasCtpsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtpsv_v2_64 = dlsym(handle, 'cublasCtpsv_v2_64') + + global __cublasZtpsv_v2_64 + __cublasZtpsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtpsv_v2_64') + if __cublasZtpsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtpsv_v2_64 = dlsym(handle, 'cublasZtpsv_v2_64') + + global __cublasStbsv_v2_64 + __cublasStbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStbsv_v2_64') + if __cublasStbsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStbsv_v2_64 = dlsym(handle, 'cublasStbsv_v2_64') + + global __cublasDtbsv_v2_64 + __cublasDtbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtbsv_v2_64') + if __cublasDtbsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtbsv_v2_64 = dlsym(handle, 'cublasDtbsv_v2_64') + + global __cublasCtbsv_v2_64 + __cublasCtbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtbsv_v2_64') + if __cublasCtbsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtbsv_v2_64 = dlsym(handle, 'cublasCtbsv_v2_64') + + global __cublasZtbsv_v2_64 + __cublasZtbsv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtbsv_v2_64') + if __cublasZtbsv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtbsv_v2_64 = dlsym(handle, 'cublasZtbsv_v2_64') + + global __cublasSsymv_v2_64 + __cublasSsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsymv_v2_64') + if __cublasSsymv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsymv_v2_64 = dlsym(handle, 'cublasSsymv_v2_64') + + global __cublasDsymv_v2_64 + __cublasDsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsymv_v2_64') + if __cublasDsymv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsymv_v2_64 = dlsym(handle, 'cublasDsymv_v2_64') + + global __cublasCsymv_v2_64 + __cublasCsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsymv_v2_64') + if __cublasCsymv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsymv_v2_64 = dlsym(handle, 'cublasCsymv_v2_64') + + global __cublasZsymv_v2_64 + __cublasZsymv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsymv_v2_64') + if __cublasZsymv_v2_64 
== NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsymv_v2_64 = dlsym(handle, 'cublasZsymv_v2_64') + + global __cublasChemv_v2_64 + __cublasChemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChemv_v2_64') + if __cublasChemv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChemv_v2_64 = dlsym(handle, 'cublasChemv_v2_64') + + global __cublasZhemv_v2_64 + __cublasZhemv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhemv_v2_64') + if __cublasZhemv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhemv_v2_64 = dlsym(handle, 'cublasZhemv_v2_64') + + global __cublasSsbmv_v2_64 + __cublasSsbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsbmv_v2_64') + if __cublasSsbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsbmv_v2_64 = dlsym(handle, 'cublasSsbmv_v2_64') + + global __cublasDsbmv_v2_64 + __cublasDsbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsbmv_v2_64') + if __cublasDsbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsbmv_v2_64 = dlsym(handle, 'cublasDsbmv_v2_64') + + global __cublasChbmv_v2_64 + __cublasChbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChbmv_v2_64') + if __cublasChbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChbmv_v2_64 = dlsym(handle, 'cublasChbmv_v2_64') + + global __cublasZhbmv_v2_64 + __cublasZhbmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhbmv_v2_64') + if __cublasZhbmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhbmv_v2_64 = dlsym(handle, 'cublasZhbmv_v2_64') + + global __cublasSspmv_v2_64 + __cublasSspmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSspmv_v2_64') + if __cublasSspmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSspmv_v2_64 = dlsym(handle, 'cublasSspmv_v2_64') + + global __cublasDspmv_v2_64 + __cublasDspmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDspmv_v2_64') + if __cublasDspmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDspmv_v2_64 = dlsym(handle, 'cublasDspmv_v2_64') + + global __cublasChpmv_v2_64 + __cublasChpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChpmv_v2_64') + if __cublasChpmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChpmv_v2_64 = dlsym(handle, 'cublasChpmv_v2_64') + + global __cublasZhpmv_v2_64 + __cublasZhpmv_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhpmv_v2_64') + if __cublasZhpmv_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhpmv_v2_64 = dlsym(handle, 'cublasZhpmv_v2_64') + + global __cublasSger_v2_64 + __cublasSger_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSger_v2_64') + if __cublasSger_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSger_v2_64 = dlsym(handle, 'cublasSger_v2_64') + + global __cublasDger_v2_64 + __cublasDger_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDger_v2_64') + if __cublasDger_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDger_v2_64 = dlsym(handle, 'cublasDger_v2_64') + + global __cublasCgeru_v2_64 + __cublasCgeru_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgeru_v2_64') + if __cublasCgeru_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgeru_v2_64 = dlsym(handle, 'cublasCgeru_v2_64') + + global __cublasCgerc_v2_64 + __cublasCgerc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgerc_v2_64') + if __cublasCgerc_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgerc_v2_64 = dlsym(handle, 
'cublasCgerc_v2_64') + + global __cublasZgeru_v2_64 + __cublasZgeru_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgeru_v2_64') + if __cublasZgeru_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgeru_v2_64 = dlsym(handle, 'cublasZgeru_v2_64') + + global __cublasZgerc_v2_64 + __cublasZgerc_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgerc_v2_64') + if __cublasZgerc_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgerc_v2_64 = dlsym(handle, 'cublasZgerc_v2_64') + + global __cublasSsyr_v2_64 + __cublasSsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyr_v2_64') + if __cublasSsyr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyr_v2_64 = dlsym(handle, 'cublasSsyr_v2_64') + + global __cublasDsyr_v2_64 + __cublasDsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyr_v2_64') + if __cublasDsyr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyr_v2_64 = dlsym(handle, 'cublasDsyr_v2_64') + + global __cublasCsyr_v2_64 + __cublasCsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyr_v2_64') + if __cublasCsyr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyr_v2_64 = dlsym(handle, 'cublasCsyr_v2_64') + + global __cublasZsyr_v2_64 + __cublasZsyr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyr_v2_64') + if __cublasZsyr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyr_v2_64 = dlsym(handle, 'cublasZsyr_v2_64') + + global __cublasCher_v2_64 + __cublasCher_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCher_v2_64') + if __cublasCher_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCher_v2_64 = dlsym(handle, 'cublasCher_v2_64') + + global __cublasZher_v2_64 + __cublasZher_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZher_v2_64') + if __cublasZher_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZher_v2_64 = dlsym(handle, 'cublasZher_v2_64') + + global __cublasSspr_v2_64 + __cublasSspr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSspr_v2_64') + if __cublasSspr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSspr_v2_64 = dlsym(handle, 'cublasSspr_v2_64') + + global __cublasDspr_v2_64 + __cublasDspr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDspr_v2_64') + if __cublasDspr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDspr_v2_64 = dlsym(handle, 'cublasDspr_v2_64') + + global __cublasChpr_v2_64 + __cublasChpr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChpr_v2_64') + if __cublasChpr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChpr_v2_64 = dlsym(handle, 'cublasChpr_v2_64') + + global __cublasZhpr_v2_64 + __cublasZhpr_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhpr_v2_64') + if __cublasZhpr_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhpr_v2_64 = dlsym(handle, 'cublasZhpr_v2_64') + + global __cublasSsyr2_v2_64 + __cublasSsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyr2_v2_64') + if __cublasSsyr2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyr2_v2_64 = dlsym(handle, 'cublasSsyr2_v2_64') + + global __cublasDsyr2_v2_64 + __cublasDsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyr2_v2_64') + if __cublasDsyr2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyr2_v2_64 = dlsym(handle, 'cublasDsyr2_v2_64') + + global __cublasCsyr2_v2_64 + __cublasCsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyr2_v2_64') + if __cublasCsyr2_v2_64 == NULL: + if handle == 
NULL: + handle = load_library(driver_ver) + __cublasCsyr2_v2_64 = dlsym(handle, 'cublasCsyr2_v2_64') + + global __cublasZsyr2_v2_64 + __cublasZsyr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyr2_v2_64') + if __cublasZsyr2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyr2_v2_64 = dlsym(handle, 'cublasZsyr2_v2_64') + + global __cublasCher2_v2_64 + __cublasCher2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCher2_v2_64') + if __cublasCher2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCher2_v2_64 = dlsym(handle, 'cublasCher2_v2_64') + + global __cublasZher2_v2_64 + __cublasZher2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZher2_v2_64') + if __cublasZher2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZher2_v2_64 = dlsym(handle, 'cublasZher2_v2_64') + + global __cublasSspr2_v2_64 + __cublasSspr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSspr2_v2_64') + if __cublasSspr2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSspr2_v2_64 = dlsym(handle, 'cublasSspr2_v2_64') + + global __cublasDspr2_v2_64 + __cublasDspr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDspr2_v2_64') + if __cublasDspr2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDspr2_v2_64 = dlsym(handle, 'cublasDspr2_v2_64') + + global __cublasChpr2_v2_64 + __cublasChpr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChpr2_v2_64') + if __cublasChpr2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChpr2_v2_64 = dlsym(handle, 'cublasChpr2_v2_64') + + global __cublasZhpr2_v2_64 + __cublasZhpr2_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhpr2_v2_64') + if __cublasZhpr2_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhpr2_v2_64 = dlsym(handle, 'cublasZhpr2_v2_64') + + global __cublasSgemvBatched_64 + __cublasSgemvBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemvBatched_64') + if __cublasSgemvBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemvBatched_64 = dlsym(handle, 'cublasSgemvBatched_64') + + global __cublasDgemvBatched_64 + __cublasDgemvBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemvBatched_64') + if __cublasDgemvBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemvBatched_64 = dlsym(handle, 'cublasDgemvBatched_64') + + global __cublasCgemvBatched_64 + __cublasCgemvBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemvBatched_64') + if __cublasCgemvBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemvBatched_64 = dlsym(handle, 'cublasCgemvBatched_64') + + global __cublasZgemvBatched_64 + __cublasZgemvBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemvBatched_64') + if __cublasZgemvBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemvBatched_64 = dlsym(handle, 'cublasZgemvBatched_64') + + global __cublasSgemvStridedBatched_64 + __cublasSgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemvStridedBatched_64') + if __cublasSgemvStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemvStridedBatched_64 = dlsym(handle, 'cublasSgemvStridedBatched_64') + + global __cublasDgemvStridedBatched_64 + __cublasDgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemvStridedBatched_64') + if __cublasDgemvStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemvStridedBatched_64 = dlsym(handle, 'cublasDgemvStridedBatched_64') + + global 
__cublasCgemvStridedBatched_64 + __cublasCgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemvStridedBatched_64') + if __cublasCgemvStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemvStridedBatched_64 = dlsym(handle, 'cublasCgemvStridedBatched_64') + + global __cublasZgemvStridedBatched_64 + __cublasZgemvStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemvStridedBatched_64') + if __cublasZgemvStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemvStridedBatched_64 = dlsym(handle, 'cublasZgemvStridedBatched_64') + + global __cublasSgemm_v2_64 + __cublasSgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSgemm_v2_64') + if __cublasSgemm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemm_v2_64 = dlsym(handle, 'cublasSgemm_v2_64') + + global __cublasDgemm_v2_64 + __cublasDgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDgemm_v2_64') + if __cublasDgemm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemm_v2_64 = dlsym(handle, 'cublasDgemm_v2_64') + + global __cublasCgemm_v2_64 + __cublasCgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm_v2_64') + if __cublasCgemm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm_v2_64 = dlsym(handle, 'cublasCgemm_v2_64') + + global __cublasCgemm3m_64 + __cublasCgemm3m_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3m_64') + if __cublasCgemm3m_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3m_64 = dlsym(handle, 'cublasCgemm3m_64') + + global __cublasCgemm3mEx_64 + __cublasCgemm3mEx_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3mEx_64') + if __cublasCgemm3mEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3mEx_64 = dlsym(handle, 'cublasCgemm3mEx_64') + + global __cublasZgemm_v2_64 + __cublasZgemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZgemm_v2_64') + if __cublasZgemm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemm_v2_64 = dlsym(handle, 'cublasZgemm_v2_64') + + global __cublasZgemm3m_64 + __cublasZgemm3m_64 = dlsym(RTLD_DEFAULT, 'cublasZgemm3m_64') + if __cublasZgemm3m_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemm3m_64 = dlsym(handle, 'cublasZgemm3m_64') + + global __cublasSgemmEx_64 + __cublasSgemmEx_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmEx_64') + if __cublasSgemmEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmEx_64 = dlsym(handle, 'cublasSgemmEx_64') + + global __cublasGemmEx_64 + __cublasGemmEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmEx_64') + if __cublasGemmEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmEx_64 = dlsym(handle, 'cublasGemmEx_64') + + global __cublasCgemmEx_64 + __cublasCgemmEx_64 = dlsym(RTLD_DEFAULT, 'cublasCgemmEx_64') + if __cublasCgemmEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemmEx_64 = dlsym(handle, 'cublasCgemmEx_64') + + global __cublasSsyrk_v2_64 + __cublasSsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyrk_v2_64') + if __cublasSsyrk_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyrk_v2_64 = dlsym(handle, 'cublasSsyrk_v2_64') + + global __cublasDsyrk_v2_64 + __cublasDsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyrk_v2_64') + if __cublasDsyrk_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyrk_v2_64 = dlsym(handle, 'cublasDsyrk_v2_64') + + global 
__cublasCsyrk_v2_64 + __cublasCsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrk_v2_64') + if __cublasCsyrk_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrk_v2_64 = dlsym(handle, 'cublasCsyrk_v2_64') + + global __cublasZsyrk_v2_64 + __cublasZsyrk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyrk_v2_64') + if __cublasZsyrk_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyrk_v2_64 = dlsym(handle, 'cublasZsyrk_v2_64') + + global __cublasCsyrkEx_64 + __cublasCsyrkEx_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrkEx_64') + if __cublasCsyrkEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrkEx_64 = dlsym(handle, 'cublasCsyrkEx_64') + + global __cublasCsyrk3mEx_64 + __cublasCsyrk3mEx_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrk3mEx_64') + if __cublasCsyrk3mEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrk3mEx_64 = dlsym(handle, 'cublasCsyrk3mEx_64') + + global __cublasCherk_v2_64 + __cublasCherk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCherk_v2_64') + if __cublasCherk_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherk_v2_64 = dlsym(handle, 'cublasCherk_v2_64') + + global __cublasZherk_v2_64 + __cublasZherk_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZherk_v2_64') + if __cublasZherk_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZherk_v2_64 = dlsym(handle, 'cublasZherk_v2_64') + + global __cublasCherkEx_64 + __cublasCherkEx_64 = dlsym(RTLD_DEFAULT, 'cublasCherkEx_64') + if __cublasCherkEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherkEx_64 = dlsym(handle, 'cublasCherkEx_64') + + global __cublasCherk3mEx_64 + __cublasCherk3mEx_64 = dlsym(RTLD_DEFAULT, 'cublasCherk3mEx_64') + if __cublasCherk3mEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherk3mEx_64 = dlsym(handle, 'cublasCherk3mEx_64') + + global __cublasSsyr2k_v2_64 + __cublasSsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsyr2k_v2_64') + if __cublasSsyr2k_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyr2k_v2_64 = dlsym(handle, 'cublasSsyr2k_v2_64') + + global __cublasDsyr2k_v2_64 + __cublasDsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsyr2k_v2_64') + if __cublasDsyr2k_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyr2k_v2_64 = dlsym(handle, 'cublasDsyr2k_v2_64') + + global __cublasCsyr2k_v2_64 + __cublasCsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsyr2k_v2_64') + if __cublasCsyr2k_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyr2k_v2_64 = dlsym(handle, 'cublasCsyr2k_v2_64') + + global __cublasZsyr2k_v2_64 + __cublasZsyr2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsyr2k_v2_64') + if __cublasZsyr2k_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyr2k_v2_64 = dlsym(handle, 'cublasZsyr2k_v2_64') + + global __cublasCher2k_v2_64 + __cublasCher2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCher2k_v2_64') + if __cublasCher2k_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCher2k_v2_64 = dlsym(handle, 'cublasCher2k_v2_64') + + global __cublasZher2k_v2_64 + __cublasZher2k_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZher2k_v2_64') + if __cublasZher2k_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZher2k_v2_64 = dlsym(handle, 'cublasZher2k_v2_64') + + global __cublasSsyrkx_64 + __cublasSsyrkx_64 = dlsym(RTLD_DEFAULT, 
'cublasSsyrkx_64') + if __cublasSsyrkx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsyrkx_64 = dlsym(handle, 'cublasSsyrkx_64') + + global __cublasDsyrkx_64 + __cublasDsyrkx_64 = dlsym(RTLD_DEFAULT, 'cublasDsyrkx_64') + if __cublasDsyrkx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsyrkx_64 = dlsym(handle, 'cublasDsyrkx_64') + + global __cublasCsyrkx_64 + __cublasCsyrkx_64 = dlsym(RTLD_DEFAULT, 'cublasCsyrkx_64') + if __cublasCsyrkx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsyrkx_64 = dlsym(handle, 'cublasCsyrkx_64') + + global __cublasZsyrkx_64 + __cublasZsyrkx_64 = dlsym(RTLD_DEFAULT, 'cublasZsyrkx_64') + if __cublasZsyrkx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsyrkx_64 = dlsym(handle, 'cublasZsyrkx_64') + + global __cublasCherkx_64 + __cublasCherkx_64 = dlsym(RTLD_DEFAULT, 'cublasCherkx_64') + if __cublasCherkx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCherkx_64 = dlsym(handle, 'cublasCherkx_64') + + global __cublasZherkx_64 + __cublasZherkx_64 = dlsym(RTLD_DEFAULT, 'cublasZherkx_64') + if __cublasZherkx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZherkx_64 = dlsym(handle, 'cublasZherkx_64') + + global __cublasSsymm_v2_64 + __cublasSsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasSsymm_v2_64') + if __cublasSsymm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSsymm_v2_64 = dlsym(handle, 'cublasSsymm_v2_64') + + global __cublasDsymm_v2_64 + __cublasDsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDsymm_v2_64') + if __cublasDsymm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDsymm_v2_64 = dlsym(handle, 'cublasDsymm_v2_64') + + global __cublasCsymm_v2_64 + __cublasCsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCsymm_v2_64') + if __cublasCsymm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCsymm_v2_64 = dlsym(handle, 'cublasCsymm_v2_64') + + global __cublasZsymm_v2_64 + __cublasZsymm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZsymm_v2_64') + if __cublasZsymm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZsymm_v2_64 = dlsym(handle, 'cublasZsymm_v2_64') + + global __cublasChemm_v2_64 + __cublasChemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasChemm_v2_64') + if __cublasChemm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasChemm_v2_64 = dlsym(handle, 'cublasChemm_v2_64') + + global __cublasZhemm_v2_64 + __cublasZhemm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZhemm_v2_64') + if __cublasZhemm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZhemm_v2_64 = dlsym(handle, 'cublasZhemm_v2_64') + + global __cublasStrsm_v2_64 + __cublasStrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrsm_v2_64') + if __cublasStrsm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrsm_v2_64 = dlsym(handle, 'cublasStrsm_v2_64') + + global __cublasDtrsm_v2_64 + __cublasDtrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrsm_v2_64') + if __cublasDtrsm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrsm_v2_64 = dlsym(handle, 'cublasDtrsm_v2_64') + + global __cublasCtrsm_v2_64 + __cublasCtrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrsm_v2_64') + if __cublasCtrsm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrsm_v2_64 = dlsym(handle, 'cublasCtrsm_v2_64') + + 
global __cublasZtrsm_v2_64 + __cublasZtrsm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrsm_v2_64') + if __cublasZtrsm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrsm_v2_64 = dlsym(handle, 'cublasZtrsm_v2_64') + + global __cublasStrmm_v2_64 + __cublasStrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasStrmm_v2_64') + if __cublasStrmm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrmm_v2_64 = dlsym(handle, 'cublasStrmm_v2_64') + + global __cublasDtrmm_v2_64 + __cublasDtrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasDtrmm_v2_64') + if __cublasDtrmm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrmm_v2_64 = dlsym(handle, 'cublasDtrmm_v2_64') + + global __cublasCtrmm_v2_64 + __cublasCtrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasCtrmm_v2_64') + if __cublasCtrmm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrmm_v2_64 = dlsym(handle, 'cublasCtrmm_v2_64') + + global __cublasZtrmm_v2_64 + __cublasZtrmm_v2_64 = dlsym(RTLD_DEFAULT, 'cublasZtrmm_v2_64') + if __cublasZtrmm_v2_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrmm_v2_64 = dlsym(handle, 'cublasZtrmm_v2_64') + + global __cublasSgemmBatched_64 + __cublasSgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmBatched_64') + if __cublasSgemmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmBatched_64 = dlsym(handle, 'cublasSgemmBatched_64') + + global __cublasDgemmBatched_64 + __cublasDgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemmBatched_64') + if __cublasDgemmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemmBatched_64 = dlsym(handle, 'cublasDgemmBatched_64') + + global __cublasCgemmBatched_64 + __cublasCgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemmBatched_64') + if __cublasCgemmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemmBatched_64 = dlsym(handle, 'cublasCgemmBatched_64') + + global __cublasCgemm3mBatched_64 + __cublasCgemm3mBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3mBatched_64') + if __cublasCgemm3mBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3mBatched_64 = dlsym(handle, 'cublasCgemm3mBatched_64') + + global __cublasZgemmBatched_64 + __cublasZgemmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemmBatched_64') + if __cublasZgemmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemmBatched_64 = dlsym(handle, 'cublasZgemmBatched_64') + + global __cublasSgemmStridedBatched_64 + __cublasSgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmStridedBatched_64') + if __cublasSgemmStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmStridedBatched_64 = dlsym(handle, 'cublasSgemmStridedBatched_64') + + global __cublasDgemmStridedBatched_64 + __cublasDgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemmStridedBatched_64') + if __cublasDgemmStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemmStridedBatched_64 = dlsym(handle, 'cublasDgemmStridedBatched_64') + + global __cublasCgemmStridedBatched_64 + __cublasCgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemmStridedBatched_64') + if __cublasCgemmStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemmStridedBatched_64 = dlsym(handle, 'cublasCgemmStridedBatched_64') + + global 
__cublasCgemm3mStridedBatched_64 + __cublasCgemm3mStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCgemm3mStridedBatched_64') + if __cublasCgemm3mStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgemm3mStridedBatched_64 = dlsym(handle, 'cublasCgemm3mStridedBatched_64') + + global __cublasZgemmStridedBatched_64 + __cublasZgemmStridedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZgemmStridedBatched_64') + if __cublasZgemmStridedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgemmStridedBatched_64 = dlsym(handle, 'cublasZgemmStridedBatched_64') + + global __cublasGemmBatchedEx_64 + __cublasGemmBatchedEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmBatchedEx_64') + if __cublasGemmBatchedEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmBatchedEx_64 = dlsym(handle, 'cublasGemmBatchedEx_64') + + global __cublasGemmStridedBatchedEx_64 + __cublasGemmStridedBatchedEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmStridedBatchedEx_64') + if __cublasGemmStridedBatchedEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmStridedBatchedEx_64 = dlsym(handle, 'cublasGemmStridedBatchedEx_64') + + global __cublasSgeam_64 + __cublasSgeam_64 = dlsym(RTLD_DEFAULT, 'cublasSgeam_64') + if __cublasSgeam_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgeam_64 = dlsym(handle, 'cublasSgeam_64') + + global __cublasDgeam_64 + __cublasDgeam_64 = dlsym(RTLD_DEFAULT, 'cublasDgeam_64') + if __cublasDgeam_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgeam_64 = dlsym(handle, 'cublasDgeam_64') + + global __cublasCgeam_64 + __cublasCgeam_64 = dlsym(RTLD_DEFAULT, 'cublasCgeam_64') + if __cublasCgeam_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCgeam_64 = dlsym(handle, 'cublasCgeam_64') + + global __cublasZgeam_64 + __cublasZgeam_64 = dlsym(RTLD_DEFAULT, 'cublasZgeam_64') + if __cublasZgeam_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZgeam_64 = dlsym(handle, 'cublasZgeam_64') + + global __cublasStrsmBatched_64 + __cublasStrsmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasStrsmBatched_64') + if __cublasStrsmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasStrsmBatched_64 = dlsym(handle, 'cublasStrsmBatched_64') + + global __cublasDtrsmBatched_64 + __cublasDtrsmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDtrsmBatched_64') + if __cublasDtrsmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDtrsmBatched_64 = dlsym(handle, 'cublasDtrsmBatched_64') + + global __cublasCtrsmBatched_64 + __cublasCtrsmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasCtrsmBatched_64') + if __cublasCtrsmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasCtrsmBatched_64 = dlsym(handle, 'cublasCtrsmBatched_64') + + global __cublasZtrsmBatched_64 + __cublasZtrsmBatched_64 = dlsym(RTLD_DEFAULT, 'cublasZtrsmBatched_64') + if __cublasZtrsmBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasZtrsmBatched_64 = dlsym(handle, 'cublasZtrsmBatched_64') + + global __cublasSdgmm_64 + __cublasSdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasSdgmm_64') + if __cublasSdgmm_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSdgmm_64 = dlsym(handle, 'cublasSdgmm_64') + + global __cublasDdgmm_64 + __cublasDdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasDdgmm_64') + if __cublasDdgmm_64 == 
NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasDdgmm_64 = dlsym(handle, 'cublasDdgmm_64')
+
+    global __cublasCdgmm_64
+    __cublasCdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasCdgmm_64')
+    if __cublasCdgmm_64 == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasCdgmm_64 = dlsym(handle, 'cublasCdgmm_64')
+
+    global __cublasZdgmm_64
+    __cublasZdgmm_64 = dlsym(RTLD_DEFAULT, 'cublasZdgmm_64')
+    if __cublasZdgmm_64 == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasZdgmm_64 = dlsym(handle, 'cublasZdgmm_64')
+
+    global __cublasSgemmGroupedBatched
+    __cublasSgemmGroupedBatched = dlsym(RTLD_DEFAULT, 'cublasSgemmGroupedBatched')
+    if __cublasSgemmGroupedBatched == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasSgemmGroupedBatched = dlsym(handle, 'cublasSgemmGroupedBatched')
+
+    global __cublasSgemmGroupedBatched_64
+    __cublasSgemmGroupedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmGroupedBatched_64')
+    if __cublasSgemmGroupedBatched_64 == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasSgemmGroupedBatched_64 = dlsym(handle, 'cublasSgemmGroupedBatched_64')
+
+    global __cublasDgemmGroupedBatched
+    __cublasDgemmGroupedBatched = dlsym(RTLD_DEFAULT, 'cublasDgemmGroupedBatched')
+    if __cublasDgemmGroupedBatched == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasDgemmGroupedBatched = dlsym(handle, 'cublasDgemmGroupedBatched')
+
+    global __cublasDgemmGroupedBatched_64
+    __cublasDgemmGroupedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemmGroupedBatched_64')
+    if __cublasDgemmGroupedBatched_64 == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasDgemmGroupedBatched_64 = dlsym(handle, 'cublasDgemmGroupedBatched_64')
+
+    global __cublasGemmGroupedBatchedEx
+    __cublasGemmGroupedBatchedEx = dlsym(RTLD_DEFAULT, 'cublasGemmGroupedBatchedEx')
+    if __cublasGemmGroupedBatchedEx == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasGemmGroupedBatchedEx = dlsym(handle, 'cublasGemmGroupedBatchedEx')
+
+    global __cublasGemmGroupedBatchedEx_64
+    __cublasGemmGroupedBatchedEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmGroupedBatchedEx_64')
+    if __cublasGemmGroupedBatchedEx_64 == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasGemmGroupedBatchedEx_64 = dlsym(handle, 'cublasGemmGroupedBatchedEx_64')
+
+    global __cublasGetEmulationStrategy
+    __cublasGetEmulationStrategy = dlsym(RTLD_DEFAULT, 'cublasGetEmulationStrategy')
+    if __cublasGetEmulationStrategy == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasGetEmulationStrategy = dlsym(handle, 'cublasGetEmulationStrategy')
+
+    global __cublasSetEmulationStrategy
+    __cublasSetEmulationStrategy = dlsym(RTLD_DEFAULT, 'cublasSetEmulationStrategy')
+    if __cublasSetEmulationStrategy == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cublasSetEmulationStrategy = dlsym(handle, 'cublasSetEmulationStrategy')
+
+    __py_cublas_init = True
+    return 0
 
 
 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/cublas_windows.pyx b/nvmath/bindings/_internal/cublas_windows.pyx
index 23de3ba..5154054 100644
--- a/nvmath/bindings/_internal/cublas_windows.pyx
+++ b/nvmath/bindings/_internal/cublas_windows.pyx
@@ -8,20 +8,76 @@
 from libc.stdint cimport intptr_t, uintptr_t
 
 import os
 import site
-
-import win32api
+import threading
 
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+from libc.stddef cimport wchar_t
+from libc.stdint cimport uintptr_t
+from cpython cimport PyUnicode_AsWideCharString, PyMem_Free
+
+from .utils import NotSupportedError
+
+cdef extern from "windows.h" nogil:
+    ctypedef void* HMODULE
+    ctypedef void* HANDLE
+    ctypedef void* FARPROC
+    ctypedef unsigned long DWORD
+    ctypedef const wchar_t *LPCWSTR
+    ctypedef const char *LPCSTR
+
+    cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+    cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+    cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+
+    HMODULE _LoadLibraryExW "LoadLibraryExW"(
+        LPCWSTR lpLibFileName,
+        HANDLE hFile,
+        DWORD dwFlags
+    )
+
+    FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName)
+
+cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags):
+    cdef uintptr_t result
+    cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL)
+    with nogil:
+        result = <uintptr_t>_LoadLibraryExW(
+            wpath,
+            hFile,
+            dwFlags
+        )
+    PyMem_Free(wpath)
+    return result
+
+cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil:
+    return _GetProcAddress(<HMODULE>hModule, lpProcName)
+
+cdef int get_cuda_version():
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32)
+    if handle == 0:
+        raise NotSupportedError('CUDA driver is not found')
+    cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion')
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('something went wrong')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError('something went wrong')
+
+    return driver_ver
+
+
 ###############################################################################
 # Wrapper init
 ###############################################################################
 
-LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+cdef object __symbol_lock = threading.Lock()
 
 cdef bint __py_cublas_init = False
-cdef void* __cuDriverGetVersion = NULL
 cdef void* __cublasCreate_v2 = NULL
 cdef void* __cublasDestroy_v2 = NULL
@@ -544,3064 +600,1533 @@ cdef int _check_or_init_cublas() except -1 nogil:
     if __py_cublas_init:
         return 0
 
-    cdef int err, driver_ver
-    with gil:
-        # Load driver to check version
-        try:
-            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
-        except Exception as e:
-            raise NotSupportedError(f'CUDA driver is not found ({e})')
-        global __cuDriverGetVersion
-        if __cuDriverGetVersion == NULL:
-            __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion')
-        if __cuDriverGetVersion == NULL:
-            raise RuntimeError('something went wrong')
-        err = (<int (*)(int*) noexcept nogil>__cuDriverGetVersion)(&driver_ver)
-        if err != 0:
-            raise RuntimeError('something went wrong')
+    with gil, __symbol_lock:
+        driver_ver = get_cuda_version()
 
         # Load library
         handle = load_library(driver_ver)
 
         # Load function
         global __cublasCreate_v2
-        try:
-            __cublasCreate_v2 = win32api.GetProcAddress(handle, 'cublasCreate_v2')
-        except:
-            pass
+        __cublasCreate_v2 = GetProcAddress(handle, 'cublasCreate_v2')
 
         global __cublasDestroy_v2
-        try:
-            __cublasDestroy_v2 = win32api.GetProcAddress(handle, 'cublasDestroy_v2')
-        except:
-            pass
+        __cublasDestroy_v2 = GetProcAddress(handle, 'cublasDestroy_v2')
 
         global __cublasGetVersion_v2
-        try:
-            __cublasGetVersion_v2 = win32api.GetProcAddress(handle, 'cublasGetVersion_v2')
-        except:
-            pass
+        __cublasGetVersion_v2 = GetProcAddress(handle, 'cublasGetVersion_v2')
 
         global __cublasGetProperty
-        try:
-            __cublasGetProperty =
win32api.GetProcAddress(handle, 'cublasGetProperty') - except: - pass + __cublasGetProperty = GetProcAddress(handle, 'cublasGetProperty') global __cublasGetCudartVersion - try: - __cublasGetCudartVersion = win32api.GetProcAddress(handle, 'cublasGetCudartVersion') - except: - pass + __cublasGetCudartVersion = GetProcAddress(handle, 'cublasGetCudartVersion') global __cublasSetWorkspace_v2 - try: - __cublasSetWorkspace_v2 = win32api.GetProcAddress(handle, 'cublasSetWorkspace_v2') - except: - pass + __cublasSetWorkspace_v2 = GetProcAddress(handle, 'cublasSetWorkspace_v2') global __cublasSetStream_v2 - try: - __cublasSetStream_v2 = win32api.GetProcAddress(handle, 'cublasSetStream_v2') - except: - pass + __cublasSetStream_v2 = GetProcAddress(handle, 'cublasSetStream_v2') global __cublasGetStream_v2 - try: - __cublasGetStream_v2 = win32api.GetProcAddress(handle, 'cublasGetStream_v2') - except: - pass + __cublasGetStream_v2 = GetProcAddress(handle, 'cublasGetStream_v2') global __cublasGetPointerMode_v2 - try: - __cublasGetPointerMode_v2 = win32api.GetProcAddress(handle, 'cublasGetPointerMode_v2') - except: - pass + __cublasGetPointerMode_v2 = GetProcAddress(handle, 'cublasGetPointerMode_v2') global __cublasSetPointerMode_v2 - try: - __cublasSetPointerMode_v2 = win32api.GetProcAddress(handle, 'cublasSetPointerMode_v2') - except: - pass + __cublasSetPointerMode_v2 = GetProcAddress(handle, 'cublasSetPointerMode_v2') global __cublasGetAtomicsMode - try: - __cublasGetAtomicsMode = win32api.GetProcAddress(handle, 'cublasGetAtomicsMode') - except: - pass + __cublasGetAtomicsMode = GetProcAddress(handle, 'cublasGetAtomicsMode') global __cublasSetAtomicsMode - try: - __cublasSetAtomicsMode = win32api.GetProcAddress(handle, 'cublasSetAtomicsMode') - except: - pass + __cublasSetAtomicsMode = GetProcAddress(handle, 'cublasSetAtomicsMode') global __cublasGetMathMode - try: - __cublasGetMathMode = win32api.GetProcAddress(handle, 'cublasGetMathMode') - except: - pass + __cublasGetMathMode = GetProcAddress(handle, 'cublasGetMathMode') global __cublasSetMathMode - try: - __cublasSetMathMode = win32api.GetProcAddress(handle, 'cublasSetMathMode') - except: - pass + __cublasSetMathMode = GetProcAddress(handle, 'cublasSetMathMode') global __cublasLoggerConfigure - try: - __cublasLoggerConfigure = win32api.GetProcAddress(handle, 'cublasLoggerConfigure') - except: - pass + __cublasLoggerConfigure = GetProcAddress(handle, 'cublasLoggerConfigure') global __cublasSetLoggerCallback - try: - __cublasSetLoggerCallback = win32api.GetProcAddress(handle, 'cublasSetLoggerCallback') - except: - pass + __cublasSetLoggerCallback = GetProcAddress(handle, 'cublasSetLoggerCallback') global __cublasGetLoggerCallback - try: - __cublasGetLoggerCallback = win32api.GetProcAddress(handle, 'cublasGetLoggerCallback') - except: - pass + __cublasGetLoggerCallback = GetProcAddress(handle, 'cublasGetLoggerCallback') global __cublasSetVector - try: - __cublasSetVector = win32api.GetProcAddress(handle, 'cublasSetVector') - except: - pass + __cublasSetVector = GetProcAddress(handle, 'cublasSetVector') global __cublasGetVector - try: - __cublasGetVector = win32api.GetProcAddress(handle, 'cublasGetVector') - except: - pass + __cublasGetVector = GetProcAddress(handle, 'cublasGetVector') global __cublasSetMatrix - try: - __cublasSetMatrix = win32api.GetProcAddress(handle, 'cublasSetMatrix') - except: - pass + __cublasSetMatrix = GetProcAddress(handle, 'cublasSetMatrix') global __cublasGetMatrix - try: - __cublasGetMatrix = win32api.GetProcAddress(handle, 
'cublasGetMatrix') - except: - pass + __cublasGetMatrix = GetProcAddress(handle, 'cublasGetMatrix') global __cublasSetVectorAsync - try: - __cublasSetVectorAsync = win32api.GetProcAddress(handle, 'cublasSetVectorAsync') - except: - pass + __cublasSetVectorAsync = GetProcAddress(handle, 'cublasSetVectorAsync') global __cublasGetVectorAsync - try: - __cublasGetVectorAsync = win32api.GetProcAddress(handle, 'cublasGetVectorAsync') - except: - pass + __cublasGetVectorAsync = GetProcAddress(handle, 'cublasGetVectorAsync') global __cublasSetMatrixAsync - try: - __cublasSetMatrixAsync = win32api.GetProcAddress(handle, 'cublasSetMatrixAsync') - except: - pass + __cublasSetMatrixAsync = GetProcAddress(handle, 'cublasSetMatrixAsync') global __cublasGetMatrixAsync - try: - __cublasGetMatrixAsync = win32api.GetProcAddress(handle, 'cublasGetMatrixAsync') - except: - pass + __cublasGetMatrixAsync = GetProcAddress(handle, 'cublasGetMatrixAsync') global __cublasNrm2Ex - try: - __cublasNrm2Ex = win32api.GetProcAddress(handle, 'cublasNrm2Ex') - except: - pass + __cublasNrm2Ex = GetProcAddress(handle, 'cublasNrm2Ex') global __cublasSnrm2_v2 - try: - __cublasSnrm2_v2 = win32api.GetProcAddress(handle, 'cublasSnrm2_v2') - except: - pass + __cublasSnrm2_v2 = GetProcAddress(handle, 'cublasSnrm2_v2') global __cublasDnrm2_v2 - try: - __cublasDnrm2_v2 = win32api.GetProcAddress(handle, 'cublasDnrm2_v2') - except: - pass + __cublasDnrm2_v2 = GetProcAddress(handle, 'cublasDnrm2_v2') global __cublasScnrm2_v2 - try: - __cublasScnrm2_v2 = win32api.GetProcAddress(handle, 'cublasScnrm2_v2') - except: - pass + __cublasScnrm2_v2 = GetProcAddress(handle, 'cublasScnrm2_v2') global __cublasDznrm2_v2 - try: - __cublasDznrm2_v2 = win32api.GetProcAddress(handle, 'cublasDznrm2_v2') - except: - pass + __cublasDznrm2_v2 = GetProcAddress(handle, 'cublasDznrm2_v2') global __cublasDotEx - try: - __cublasDotEx = win32api.GetProcAddress(handle, 'cublasDotEx') - except: - pass + __cublasDotEx = GetProcAddress(handle, 'cublasDotEx') global __cublasDotcEx - try: - __cublasDotcEx = win32api.GetProcAddress(handle, 'cublasDotcEx') - except: - pass + __cublasDotcEx = GetProcAddress(handle, 'cublasDotcEx') global __cublasSdot_v2 - try: - __cublasSdot_v2 = win32api.GetProcAddress(handle, 'cublasSdot_v2') - except: - pass + __cublasSdot_v2 = GetProcAddress(handle, 'cublasSdot_v2') global __cublasDdot_v2 - try: - __cublasDdot_v2 = win32api.GetProcAddress(handle, 'cublasDdot_v2') - except: - pass + __cublasDdot_v2 = GetProcAddress(handle, 'cublasDdot_v2') global __cublasCdotu_v2 - try: - __cublasCdotu_v2 = win32api.GetProcAddress(handle, 'cublasCdotu_v2') - except: - pass + __cublasCdotu_v2 = GetProcAddress(handle, 'cublasCdotu_v2') global __cublasCdotc_v2 - try: - __cublasCdotc_v2 = win32api.GetProcAddress(handle, 'cublasCdotc_v2') - except: - pass + __cublasCdotc_v2 = GetProcAddress(handle, 'cublasCdotc_v2') global __cublasZdotu_v2 - try: - __cublasZdotu_v2 = win32api.GetProcAddress(handle, 'cublasZdotu_v2') - except: - pass + __cublasZdotu_v2 = GetProcAddress(handle, 'cublasZdotu_v2') global __cublasZdotc_v2 - try: - __cublasZdotc_v2 = win32api.GetProcAddress(handle, 'cublasZdotc_v2') - except: - pass + __cublasZdotc_v2 = GetProcAddress(handle, 'cublasZdotc_v2') global __cublasScalEx - try: - __cublasScalEx = win32api.GetProcAddress(handle, 'cublasScalEx') - except: - pass + __cublasScalEx = GetProcAddress(handle, 'cublasScalEx') global __cublasSscal_v2 - try: - __cublasSscal_v2 = win32api.GetProcAddress(handle, 'cublasSscal_v2') - except: - pass + 
__cublasSscal_v2 = GetProcAddress(handle, 'cublasSscal_v2') global __cublasDscal_v2 - try: - __cublasDscal_v2 = win32api.GetProcAddress(handle, 'cublasDscal_v2') - except: - pass + __cublasDscal_v2 = GetProcAddress(handle, 'cublasDscal_v2') global __cublasCscal_v2 - try: - __cublasCscal_v2 = win32api.GetProcAddress(handle, 'cublasCscal_v2') - except: - pass + __cublasCscal_v2 = GetProcAddress(handle, 'cublasCscal_v2') global __cublasCsscal_v2 - try: - __cublasCsscal_v2 = win32api.GetProcAddress(handle, 'cublasCsscal_v2') - except: - pass + __cublasCsscal_v2 = GetProcAddress(handle, 'cublasCsscal_v2') global __cublasZscal_v2 - try: - __cublasZscal_v2 = win32api.GetProcAddress(handle, 'cublasZscal_v2') - except: - pass + __cublasZscal_v2 = GetProcAddress(handle, 'cublasZscal_v2') global __cublasZdscal_v2 - try: - __cublasZdscal_v2 = win32api.GetProcAddress(handle, 'cublasZdscal_v2') - except: - pass + __cublasZdscal_v2 = GetProcAddress(handle, 'cublasZdscal_v2') global __cublasAxpyEx - try: - __cublasAxpyEx = win32api.GetProcAddress(handle, 'cublasAxpyEx') - except: - pass + __cublasAxpyEx = GetProcAddress(handle, 'cublasAxpyEx') global __cublasSaxpy_v2 - try: - __cublasSaxpy_v2 = win32api.GetProcAddress(handle, 'cublasSaxpy_v2') - except: - pass + __cublasSaxpy_v2 = GetProcAddress(handle, 'cublasSaxpy_v2') global __cublasDaxpy_v2 - try: - __cublasDaxpy_v2 = win32api.GetProcAddress(handle, 'cublasDaxpy_v2') - except: - pass + __cublasDaxpy_v2 = GetProcAddress(handle, 'cublasDaxpy_v2') global __cublasCaxpy_v2 - try: - __cublasCaxpy_v2 = win32api.GetProcAddress(handle, 'cublasCaxpy_v2') - except: - pass + __cublasCaxpy_v2 = GetProcAddress(handle, 'cublasCaxpy_v2') global __cublasZaxpy_v2 - try: - __cublasZaxpy_v2 = win32api.GetProcAddress(handle, 'cublasZaxpy_v2') - except: - pass + __cublasZaxpy_v2 = GetProcAddress(handle, 'cublasZaxpy_v2') global __cublasCopyEx - try: - __cublasCopyEx = win32api.GetProcAddress(handle, 'cublasCopyEx') - except: - pass + __cublasCopyEx = GetProcAddress(handle, 'cublasCopyEx') global __cublasScopy_v2 - try: - __cublasScopy_v2 = win32api.GetProcAddress(handle, 'cublasScopy_v2') - except: - pass + __cublasScopy_v2 = GetProcAddress(handle, 'cublasScopy_v2') global __cublasDcopy_v2 - try: - __cublasDcopy_v2 = win32api.GetProcAddress(handle, 'cublasDcopy_v2') - except: - pass + __cublasDcopy_v2 = GetProcAddress(handle, 'cublasDcopy_v2') global __cublasCcopy_v2 - try: - __cublasCcopy_v2 = win32api.GetProcAddress(handle, 'cublasCcopy_v2') - except: - pass + __cublasCcopy_v2 = GetProcAddress(handle, 'cublasCcopy_v2') global __cublasZcopy_v2 - try: - __cublasZcopy_v2 = win32api.GetProcAddress(handle, 'cublasZcopy_v2') - except: - pass + __cublasZcopy_v2 = GetProcAddress(handle, 'cublasZcopy_v2') global __cublasSswap_v2 - try: - __cublasSswap_v2 = win32api.GetProcAddress(handle, 'cublasSswap_v2') - except: - pass + __cublasSswap_v2 = GetProcAddress(handle, 'cublasSswap_v2') global __cublasDswap_v2 - try: - __cublasDswap_v2 = win32api.GetProcAddress(handle, 'cublasDswap_v2') - except: - pass + __cublasDswap_v2 = GetProcAddress(handle, 'cublasDswap_v2') global __cublasCswap_v2 - try: - __cublasCswap_v2 = win32api.GetProcAddress(handle, 'cublasCswap_v2') - except: - pass + __cublasCswap_v2 = GetProcAddress(handle, 'cublasCswap_v2') global __cublasZswap_v2 - try: - __cublasZswap_v2 = win32api.GetProcAddress(handle, 'cublasZswap_v2') - except: - pass + __cublasZswap_v2 = GetProcAddress(handle, 'cublasZswap_v2') global __cublasSwapEx - try: - __cublasSwapEx = 
win32api.GetProcAddress(handle, 'cublasSwapEx') - except: - pass + __cublasSwapEx = GetProcAddress(handle, 'cublasSwapEx') global __cublasIsamax_v2 - try: - __cublasIsamax_v2 = win32api.GetProcAddress(handle, 'cublasIsamax_v2') - except: - pass + __cublasIsamax_v2 = GetProcAddress(handle, 'cublasIsamax_v2') global __cublasIdamax_v2 - try: - __cublasIdamax_v2 = win32api.GetProcAddress(handle, 'cublasIdamax_v2') - except: - pass + __cublasIdamax_v2 = GetProcAddress(handle, 'cublasIdamax_v2') global __cublasIcamax_v2 - try: - __cublasIcamax_v2 = win32api.GetProcAddress(handle, 'cublasIcamax_v2') - except: - pass + __cublasIcamax_v2 = GetProcAddress(handle, 'cublasIcamax_v2') global __cublasIzamax_v2 - try: - __cublasIzamax_v2 = win32api.GetProcAddress(handle, 'cublasIzamax_v2') - except: - pass + __cublasIzamax_v2 = GetProcAddress(handle, 'cublasIzamax_v2') global __cublasIamaxEx - try: - __cublasIamaxEx = win32api.GetProcAddress(handle, 'cublasIamaxEx') - except: - pass + __cublasIamaxEx = GetProcAddress(handle, 'cublasIamaxEx') global __cublasIsamin_v2 - try: - __cublasIsamin_v2 = win32api.GetProcAddress(handle, 'cublasIsamin_v2') - except: - pass + __cublasIsamin_v2 = GetProcAddress(handle, 'cublasIsamin_v2') global __cublasIdamin_v2 - try: - __cublasIdamin_v2 = win32api.GetProcAddress(handle, 'cublasIdamin_v2') - except: - pass + __cublasIdamin_v2 = GetProcAddress(handle, 'cublasIdamin_v2') global __cublasIcamin_v2 - try: - __cublasIcamin_v2 = win32api.GetProcAddress(handle, 'cublasIcamin_v2') - except: - pass + __cublasIcamin_v2 = GetProcAddress(handle, 'cublasIcamin_v2') global __cublasIzamin_v2 - try: - __cublasIzamin_v2 = win32api.GetProcAddress(handle, 'cublasIzamin_v2') - except: - pass + __cublasIzamin_v2 = GetProcAddress(handle, 'cublasIzamin_v2') global __cublasIaminEx - try: - __cublasIaminEx = win32api.GetProcAddress(handle, 'cublasIaminEx') - except: - pass + __cublasIaminEx = GetProcAddress(handle, 'cublasIaminEx') global __cublasAsumEx - try: - __cublasAsumEx = win32api.GetProcAddress(handle, 'cublasAsumEx') - except: - pass + __cublasAsumEx = GetProcAddress(handle, 'cublasAsumEx') global __cublasSasum_v2 - try: - __cublasSasum_v2 = win32api.GetProcAddress(handle, 'cublasSasum_v2') - except: - pass + __cublasSasum_v2 = GetProcAddress(handle, 'cublasSasum_v2') global __cublasDasum_v2 - try: - __cublasDasum_v2 = win32api.GetProcAddress(handle, 'cublasDasum_v2') - except: - pass + __cublasDasum_v2 = GetProcAddress(handle, 'cublasDasum_v2') global __cublasScasum_v2 - try: - __cublasScasum_v2 = win32api.GetProcAddress(handle, 'cublasScasum_v2') - except: - pass + __cublasScasum_v2 = GetProcAddress(handle, 'cublasScasum_v2') global __cublasDzasum_v2 - try: - __cublasDzasum_v2 = win32api.GetProcAddress(handle, 'cublasDzasum_v2') - except: - pass + __cublasDzasum_v2 = GetProcAddress(handle, 'cublasDzasum_v2') global __cublasSrot_v2 - try: - __cublasSrot_v2 = win32api.GetProcAddress(handle, 'cublasSrot_v2') - except: - pass + __cublasSrot_v2 = GetProcAddress(handle, 'cublasSrot_v2') global __cublasDrot_v2 - try: - __cublasDrot_v2 = win32api.GetProcAddress(handle, 'cublasDrot_v2') - except: - pass + __cublasDrot_v2 = GetProcAddress(handle, 'cublasDrot_v2') global __cublasCrot_v2 - try: - __cublasCrot_v2 = win32api.GetProcAddress(handle, 'cublasCrot_v2') - except: - pass + __cublasCrot_v2 = GetProcAddress(handle, 'cublasCrot_v2') global __cublasCsrot_v2 - try: - __cublasCsrot_v2 = win32api.GetProcAddress(handle, 'cublasCsrot_v2') - except: - pass + __cublasCsrot_v2 = 
GetProcAddress(handle, 'cublasCsrot_v2') global __cublasZrot_v2 - try: - __cublasZrot_v2 = win32api.GetProcAddress(handle, 'cublasZrot_v2') - except: - pass + __cublasZrot_v2 = GetProcAddress(handle, 'cublasZrot_v2') global __cublasZdrot_v2 - try: - __cublasZdrot_v2 = win32api.GetProcAddress(handle, 'cublasZdrot_v2') - except: - pass + __cublasZdrot_v2 = GetProcAddress(handle, 'cublasZdrot_v2') global __cublasRotEx - try: - __cublasRotEx = win32api.GetProcAddress(handle, 'cublasRotEx') - except: - pass + __cublasRotEx = GetProcAddress(handle, 'cublasRotEx') global __cublasSrotg_v2 - try: - __cublasSrotg_v2 = win32api.GetProcAddress(handle, 'cublasSrotg_v2') - except: - pass + __cublasSrotg_v2 = GetProcAddress(handle, 'cublasSrotg_v2') global __cublasDrotg_v2 - try: - __cublasDrotg_v2 = win32api.GetProcAddress(handle, 'cublasDrotg_v2') - except: - pass + __cublasDrotg_v2 = GetProcAddress(handle, 'cublasDrotg_v2') global __cublasCrotg_v2 - try: - __cublasCrotg_v2 = win32api.GetProcAddress(handle, 'cublasCrotg_v2') - except: - pass + __cublasCrotg_v2 = GetProcAddress(handle, 'cublasCrotg_v2') global __cublasZrotg_v2 - try: - __cublasZrotg_v2 = win32api.GetProcAddress(handle, 'cublasZrotg_v2') - except: - pass + __cublasZrotg_v2 = GetProcAddress(handle, 'cublasZrotg_v2') global __cublasRotgEx - try: - __cublasRotgEx = win32api.GetProcAddress(handle, 'cublasRotgEx') - except: - pass + __cublasRotgEx = GetProcAddress(handle, 'cublasRotgEx') global __cublasSrotm_v2 - try: - __cublasSrotm_v2 = win32api.GetProcAddress(handle, 'cublasSrotm_v2') - except: - pass + __cublasSrotm_v2 = GetProcAddress(handle, 'cublasSrotm_v2') global __cublasDrotm_v2 - try: - __cublasDrotm_v2 = win32api.GetProcAddress(handle, 'cublasDrotm_v2') - except: - pass + __cublasDrotm_v2 = GetProcAddress(handle, 'cublasDrotm_v2') global __cublasRotmEx - try: - __cublasRotmEx = win32api.GetProcAddress(handle, 'cublasRotmEx') - except: - pass + __cublasRotmEx = GetProcAddress(handle, 'cublasRotmEx') global __cublasSrotmg_v2 - try: - __cublasSrotmg_v2 = win32api.GetProcAddress(handle, 'cublasSrotmg_v2') - except: - pass + __cublasSrotmg_v2 = GetProcAddress(handle, 'cublasSrotmg_v2') global __cublasDrotmg_v2 - try: - __cublasDrotmg_v2 = win32api.GetProcAddress(handle, 'cublasDrotmg_v2') - except: - pass + __cublasDrotmg_v2 = GetProcAddress(handle, 'cublasDrotmg_v2') global __cublasRotmgEx - try: - __cublasRotmgEx = win32api.GetProcAddress(handle, 'cublasRotmgEx') - except: - pass + __cublasRotmgEx = GetProcAddress(handle, 'cublasRotmgEx') global __cublasSgemv_v2 - try: - __cublasSgemv_v2 = win32api.GetProcAddress(handle, 'cublasSgemv_v2') - except: - pass + __cublasSgemv_v2 = GetProcAddress(handle, 'cublasSgemv_v2') global __cublasDgemv_v2 - try: - __cublasDgemv_v2 = win32api.GetProcAddress(handle, 'cublasDgemv_v2') - except: - pass + __cublasDgemv_v2 = GetProcAddress(handle, 'cublasDgemv_v2') global __cublasCgemv_v2 - try: - __cublasCgemv_v2 = win32api.GetProcAddress(handle, 'cublasCgemv_v2') - except: - pass + __cublasCgemv_v2 = GetProcAddress(handle, 'cublasCgemv_v2') global __cublasZgemv_v2 - try: - __cublasZgemv_v2 = win32api.GetProcAddress(handle, 'cublasZgemv_v2') - except: - pass + __cublasZgemv_v2 = GetProcAddress(handle, 'cublasZgemv_v2') global __cublasSgbmv_v2 - try: - __cublasSgbmv_v2 = win32api.GetProcAddress(handle, 'cublasSgbmv_v2') - except: - pass + __cublasSgbmv_v2 = GetProcAddress(handle, 'cublasSgbmv_v2') global __cublasDgbmv_v2 - try: - __cublasDgbmv_v2 = win32api.GetProcAddress(handle, 'cublasDgbmv_v2') - 
except: - pass + __cublasDgbmv_v2 = GetProcAddress(handle, 'cublasDgbmv_v2') global __cublasCgbmv_v2 - try: - __cublasCgbmv_v2 = win32api.GetProcAddress(handle, 'cublasCgbmv_v2') - except: - pass + __cublasCgbmv_v2 = GetProcAddress(handle, 'cublasCgbmv_v2') global __cublasZgbmv_v2 - try: - __cublasZgbmv_v2 = win32api.GetProcAddress(handle, 'cublasZgbmv_v2') - except: - pass + __cublasZgbmv_v2 = GetProcAddress(handle, 'cublasZgbmv_v2') global __cublasStrmv_v2 - try: - __cublasStrmv_v2 = win32api.GetProcAddress(handle, 'cublasStrmv_v2') - except: - pass + __cublasStrmv_v2 = GetProcAddress(handle, 'cublasStrmv_v2') global __cublasDtrmv_v2 - try: - __cublasDtrmv_v2 = win32api.GetProcAddress(handle, 'cublasDtrmv_v2') - except: - pass + __cublasDtrmv_v2 = GetProcAddress(handle, 'cublasDtrmv_v2') global __cublasCtrmv_v2 - try: - __cublasCtrmv_v2 = win32api.GetProcAddress(handle, 'cublasCtrmv_v2') - except: - pass + __cublasCtrmv_v2 = GetProcAddress(handle, 'cublasCtrmv_v2') global __cublasZtrmv_v2 - try: - __cublasZtrmv_v2 = win32api.GetProcAddress(handle, 'cublasZtrmv_v2') - except: - pass + __cublasZtrmv_v2 = GetProcAddress(handle, 'cublasZtrmv_v2') global __cublasStbmv_v2 - try: - __cublasStbmv_v2 = win32api.GetProcAddress(handle, 'cublasStbmv_v2') - except: - pass + __cublasStbmv_v2 = GetProcAddress(handle, 'cublasStbmv_v2') global __cublasDtbmv_v2 - try: - __cublasDtbmv_v2 = win32api.GetProcAddress(handle, 'cublasDtbmv_v2') - except: - pass + __cublasDtbmv_v2 = GetProcAddress(handle, 'cublasDtbmv_v2') global __cublasCtbmv_v2 - try: - __cublasCtbmv_v2 = win32api.GetProcAddress(handle, 'cublasCtbmv_v2') - except: - pass + __cublasCtbmv_v2 = GetProcAddress(handle, 'cublasCtbmv_v2') global __cublasZtbmv_v2 - try: - __cublasZtbmv_v2 = win32api.GetProcAddress(handle, 'cublasZtbmv_v2') - except: - pass + __cublasZtbmv_v2 = GetProcAddress(handle, 'cublasZtbmv_v2') global __cublasStpmv_v2 - try: - __cublasStpmv_v2 = win32api.GetProcAddress(handle, 'cublasStpmv_v2') - except: - pass + __cublasStpmv_v2 = GetProcAddress(handle, 'cublasStpmv_v2') global __cublasDtpmv_v2 - try: - __cublasDtpmv_v2 = win32api.GetProcAddress(handle, 'cublasDtpmv_v2') - except: - pass + __cublasDtpmv_v2 = GetProcAddress(handle, 'cublasDtpmv_v2') global __cublasCtpmv_v2 - try: - __cublasCtpmv_v2 = win32api.GetProcAddress(handle, 'cublasCtpmv_v2') - except: - pass + __cublasCtpmv_v2 = GetProcAddress(handle, 'cublasCtpmv_v2') global __cublasZtpmv_v2 - try: - __cublasZtpmv_v2 = win32api.GetProcAddress(handle, 'cublasZtpmv_v2') - except: - pass + __cublasZtpmv_v2 = GetProcAddress(handle, 'cublasZtpmv_v2') global __cublasStrsv_v2 - try: - __cublasStrsv_v2 = win32api.GetProcAddress(handle, 'cublasStrsv_v2') - except: - pass + __cublasStrsv_v2 = GetProcAddress(handle, 'cublasStrsv_v2') global __cublasDtrsv_v2 - try: - __cublasDtrsv_v2 = win32api.GetProcAddress(handle, 'cublasDtrsv_v2') - except: - pass + __cublasDtrsv_v2 = GetProcAddress(handle, 'cublasDtrsv_v2') global __cublasCtrsv_v2 - try: - __cublasCtrsv_v2 = win32api.GetProcAddress(handle, 'cublasCtrsv_v2') - except: - pass + __cublasCtrsv_v2 = GetProcAddress(handle, 'cublasCtrsv_v2') global __cublasZtrsv_v2 - try: - __cublasZtrsv_v2 = win32api.GetProcAddress(handle, 'cublasZtrsv_v2') - except: - pass + __cublasZtrsv_v2 = GetProcAddress(handle, 'cublasZtrsv_v2') global __cublasStpsv_v2 - try: - __cublasStpsv_v2 = win32api.GetProcAddress(handle, 'cublasStpsv_v2') - except: - pass + __cublasStpsv_v2 = GetProcAddress(handle, 'cublasStpsv_v2') global __cublasDtpsv_v2 - try: - 
__cublasDtpsv_v2 = win32api.GetProcAddress(handle, 'cublasDtpsv_v2') - except: - pass + __cublasDtpsv_v2 = GetProcAddress(handle, 'cublasDtpsv_v2') global __cublasCtpsv_v2 - try: - __cublasCtpsv_v2 = win32api.GetProcAddress(handle, 'cublasCtpsv_v2') - except: - pass + __cublasCtpsv_v2 = GetProcAddress(handle, 'cublasCtpsv_v2') global __cublasZtpsv_v2 - try: - __cublasZtpsv_v2 = win32api.GetProcAddress(handle, 'cublasZtpsv_v2') - except: - pass + __cublasZtpsv_v2 = GetProcAddress(handle, 'cublasZtpsv_v2') global __cublasStbsv_v2 - try: - __cublasStbsv_v2 = win32api.GetProcAddress(handle, 'cublasStbsv_v2') - except: - pass + __cublasStbsv_v2 = GetProcAddress(handle, 'cublasStbsv_v2') global __cublasDtbsv_v2 - try: - __cublasDtbsv_v2 = win32api.GetProcAddress(handle, 'cublasDtbsv_v2') - except: - pass + __cublasDtbsv_v2 = GetProcAddress(handle, 'cublasDtbsv_v2') global __cublasCtbsv_v2 - try: - __cublasCtbsv_v2 = win32api.GetProcAddress(handle, 'cublasCtbsv_v2') - except: - pass + __cublasCtbsv_v2 = GetProcAddress(handle, 'cublasCtbsv_v2') global __cublasZtbsv_v2 - try: - __cublasZtbsv_v2 = win32api.GetProcAddress(handle, 'cublasZtbsv_v2') - except: - pass + __cublasZtbsv_v2 = GetProcAddress(handle, 'cublasZtbsv_v2') global __cublasSsymv_v2 - try: - __cublasSsymv_v2 = win32api.GetProcAddress(handle, 'cublasSsymv_v2') - except: - pass + __cublasSsymv_v2 = GetProcAddress(handle, 'cublasSsymv_v2') global __cublasDsymv_v2 - try: - __cublasDsymv_v2 = win32api.GetProcAddress(handle, 'cublasDsymv_v2') - except: - pass + __cublasDsymv_v2 = GetProcAddress(handle, 'cublasDsymv_v2') global __cublasCsymv_v2 - try: - __cublasCsymv_v2 = win32api.GetProcAddress(handle, 'cublasCsymv_v2') - except: - pass + __cublasCsymv_v2 = GetProcAddress(handle, 'cublasCsymv_v2') global __cublasZsymv_v2 - try: - __cublasZsymv_v2 = win32api.GetProcAddress(handle, 'cublasZsymv_v2') - except: - pass + __cublasZsymv_v2 = GetProcAddress(handle, 'cublasZsymv_v2') global __cublasChemv_v2 - try: - __cublasChemv_v2 = win32api.GetProcAddress(handle, 'cublasChemv_v2') - except: - pass + __cublasChemv_v2 = GetProcAddress(handle, 'cublasChemv_v2') global __cublasZhemv_v2 - try: - __cublasZhemv_v2 = win32api.GetProcAddress(handle, 'cublasZhemv_v2') - except: - pass + __cublasZhemv_v2 = GetProcAddress(handle, 'cublasZhemv_v2') global __cublasSsbmv_v2 - try: - __cublasSsbmv_v2 = win32api.GetProcAddress(handle, 'cublasSsbmv_v2') - except: - pass + __cublasSsbmv_v2 = GetProcAddress(handle, 'cublasSsbmv_v2') global __cublasDsbmv_v2 - try: - __cublasDsbmv_v2 = win32api.GetProcAddress(handle, 'cublasDsbmv_v2') - except: - pass + __cublasDsbmv_v2 = GetProcAddress(handle, 'cublasDsbmv_v2') global __cublasChbmv_v2 - try: - __cublasChbmv_v2 = win32api.GetProcAddress(handle, 'cublasChbmv_v2') - except: - pass + __cublasChbmv_v2 = GetProcAddress(handle, 'cublasChbmv_v2') global __cublasZhbmv_v2 - try: - __cublasZhbmv_v2 = win32api.GetProcAddress(handle, 'cublasZhbmv_v2') - except: - pass + __cublasZhbmv_v2 = GetProcAddress(handle, 'cublasZhbmv_v2') global __cublasSspmv_v2 - try: - __cublasSspmv_v2 = win32api.GetProcAddress(handle, 'cublasSspmv_v2') - except: - pass + __cublasSspmv_v2 = GetProcAddress(handle, 'cublasSspmv_v2') global __cublasDspmv_v2 - try: - __cublasDspmv_v2 = win32api.GetProcAddress(handle, 'cublasDspmv_v2') - except: - pass + __cublasDspmv_v2 = GetProcAddress(handle, 'cublasDspmv_v2') global __cublasChpmv_v2 - try: - __cublasChpmv_v2 = win32api.GetProcAddress(handle, 'cublasChpmv_v2') - except: - pass + __cublasChpmv_v2 = 
GetProcAddress(handle, 'cublasChpmv_v2')
     global __cublasZhpmv_v2
-    try:
-        __cublasZhpmv_v2 = win32api.GetProcAddress(handle, 'cublasZhpmv_v2')
-    except:
-        pass
+    __cublasZhpmv_v2 = GetProcAddress(handle, 'cublasZhpmv_v2')

[Identical hunks follow for every remaining cuBLAS entry point loaded here: in each one the
`try:` / `except: pass` block around `win32api.GetProcAddress(handle, '<symbol>')` is removed
and replaced by a direct `GetProcAddress(handle, '<symbol>')` assignment. The affected symbols,
in document order, are:
cublasSger_v2, cublasDger_v2, cublasCgeru_v2, cublasCgerc_v2, cublasZgeru_v2, cublasZgerc_v2, cublasSsyr_v2, cublasDsyr_v2,
cublasCsyr_v2, cublasZsyr_v2, cublasCher_v2, cublasZher_v2, cublasSspr_v2, cublasDspr_v2, cublasChpr_v2, cublasZhpr_v2,
cublasSsyr2_v2, cublasDsyr2_v2, cublasCsyr2_v2, cublasZsyr2_v2, cublasCher2_v2, cublasZher2_v2, cublasSspr2_v2, cublasDspr2_v2,
cublasChpr2_v2, cublasZhpr2_v2, cublasSgemm_v2, cublasDgemm_v2, cublasCgemm_v2, cublasCgemm3m, cublasCgemm3mEx, cublasZgemm_v2,
cublasZgemm3m, cublasSgemmEx, cublasGemmEx, cublasCgemmEx, cublasUint8gemmBias, cublasSsyrk_v2, cublasDsyrk_v2, cublasCsyrk_v2,
cublasZsyrk_v2, cublasCsyrkEx, cublasCsyrk3mEx, cublasCherk_v2, cublasZherk_v2, cublasCherkEx, cublasCherk3mEx, cublasSsyr2k_v2,
cublasDsyr2k_v2, cublasCsyr2k_v2, cublasZsyr2k_v2, cublasCher2k_v2, cublasZher2k_v2, cublasSsyrkx, cublasDsyrkx, cublasCsyrkx,
cublasZsyrkx, cublasCherkx, cublasZherkx, cublasSsymm_v2, cublasDsymm_v2, cublasCsymm_v2, cublasZsymm_v2, cublasChemm_v2,
cublasZhemm_v2, cublasStrsm_v2, cublasDtrsm_v2, cublasCtrsm_v2, cublasZtrsm_v2, cublasStrmm_v2, cublasDtrmm_v2, cublasCtrmm_v2,
cublasZtrmm_v2, cublasSgemmBatched, cublasDgemmBatched, cublasCgemmBatched, cublasCgemm3mBatched, cublasZgemmBatched, cublasGemmBatchedEx, cublasGemmStridedBatchedEx,
cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched, cublasCgemm3mStridedBatched, cublasZgemmStridedBatched, cublasSgeam, cublasDgeam, cublasCgeam,
cublasZgeam, cublasSgetrfBatched, cublasDgetrfBatched, cublasCgetrfBatched, cublasZgetrfBatched, cublasSgetriBatched, cublasDgetriBatched, cublasCgetriBatched,
cublasZgetriBatched, cublasSgetrsBatched, cublasDgetrsBatched, cublasCgetrsBatched, cublasZgetrsBatched, cublasStrsmBatched, cublasDtrsmBatched, cublasCtrsmBatched,
cublasZtrsmBatched, cublasSmatinvBatched, cublasDmatinvBatched, cublasCmatinvBatched, cublasZmatinvBatched, cublasSgeqrfBatched, cublasDgeqrfBatched, cublasCgeqrfBatched,
cublasZgeqrfBatched, cublasSgelsBatched, cublasDgelsBatched, cublasCgelsBatched, cublasZgelsBatched, cublasSdgmm, cublasDdgmm, cublasCdgmm,
cublasZdgmm, cublasStpttr, cublasDtpttr, cublasCtpttr, cublasZtpttr, cublasStrttp, cublasDtrttp, cublasCtrttp,
cublasZtrttp, cublasGetSmCountTarget, cublasSetSmCountTarget, cublasGetStatusName, cublasGetStatusString, cublasSgemvBatched, cublasDgemvBatched, cublasCgemvBatched,
cublasZgemvBatched, cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched, cublasZgemvStridedBatched, cublasSetVector_64, cublasGetVector_64, cublasSetMatrix_64,
cublasGetMatrix_64, cublasSetVectorAsync_64, cublasGetVectorAsync_64, cublasSetMatrixAsync_64, cublasGetMatrixAsync_64, cublasNrm2Ex_64, cublasSnrm2_v2_64, cublasDnrm2_v2_64,
cublasScnrm2_v2_64, cublasDznrm2_v2_64, cublasDotEx_64, cublasDotcEx_64, cublasSdot_v2_64, cublasDdot_v2_64, cublasCdotu_v2_64, cublasCdotc_v2_64,
cublasZdotu_v2_64, cublasZdotc_v2_64, cublasScalEx_64, cublasSscal_v2_64, cublasDscal_v2_64, cublasCscal_v2_64, cublasCsscal_v2_64, cublasZscal_v2_64,
cublasZdscal_v2_64, cublasAxpyEx_64, cublasSaxpy_v2_64, cublasDaxpy_v2_64, cublasCaxpy_v2_64, cublasZaxpy_v2_64, cublasCopyEx_64, cublasScopy_v2_64,
cublasDcopy_v2_64, cublasCcopy_v2_64, cublasZcopy_v2_64, cublasSswap_v2_64, cublasDswap_v2_64, cublasCswap_v2_64, cublasZswap_v2_64, cublasSwapEx_64,
cublasIsamax_v2_64, cublasIdamax_v2_64, cublasIcamax_v2_64, cublasIzamax_v2_64, cublasIamaxEx_64, cublasIsamin_v2_64, cublasIdamin_v2_64, cublasIcamin_v2_64,
cublasIzamin_v2_64, cublasIaminEx_64, cublasAsumEx_64, cublasSasum_v2_64, cublasDasum_v2_64, cublasScasum_v2_64, cublasDzasum_v2_64, cublasSrot_v2_64,
cublasDrot_v2_64, cublasCrot_v2_64, cublasCsrot_v2_64, cublasZrot_v2_64, cublasZdrot_v2_64, cublasRotEx_64, cublasSrotm_v2_64, cublasDrotm_v2_64,
cublasRotmEx_64, cublasSgemv_v2_64, cublasDgemv_v2_64, cublasCgemv_v2_64, cublasZgemv_v2_64, cublasSgbmv_v2_64, cublasDgbmv_v2_64, cublasCgbmv_v2_64,
cublasZgbmv_v2_64, cublasStrmv_v2_64, cublasDtrmv_v2_64, cublasCtrmv_v2_64, cublasZtrmv_v2_64, cublasStbmv_v2_64, cublasDtbmv_v2_64, cublasCtbmv_v2_64,
cublasZtbmv_v2_64, cublasStpmv_v2_64, cublasDtpmv_v2_64, cublasCtpmv_v2_64, cublasZtpmv_v2_64, cublasStrsv_v2_64, cublasDtrsv_v2_64, cublasCtrsv_v2_64,
cublasZtrsv_v2_64, cublasStpsv_v2_64, cublasDtpsv_v2_64, cublasCtpsv_v2_64, cublasZtpsv_v2_64, cublasStbsv_v2_64, cublasDtbsv_v2_64, cublasCtbsv_v2_64,
cublasZtbsv_v2_64, cublasSsymv_v2_64, cublasDsymv_v2_64, cublasCsymv_v2_64, cublasZsymv_v2_64, cublasChemv_v2_64, cublasZhemv_v2_64, cublasSsbmv_v2_64,
cublasDsbmv_v2_64, cublasChbmv_v2_64, cublasZhbmv_v2_64, cublasSspmv_v2_64, cublasDspmv_v2_64, cublasChpmv_v2_64, cublasZhpmv_v2_64, cublasSger_v2_64,
cublasDger_v2_64, cublasCgeru_v2_64, cublasCgerc_v2_64, cublasZgeru_v2_64, cublasZgerc_v2_64, cublasSsyr_v2_64, cublasDsyr_v2_64, cublasCsyr_v2_64,
cublasZsyr_v2_64, cublasCher_v2_64, cublasZher_v2_64, cublasSspr_v2_64, cublasDspr_v2_64, cublasChpr_v2_64, cublasZhpr_v2_64, cublasSsyr2_v2_64,
cublasDsyr2_v2_64, cublasCsyr2_v2_64, cublasZsyr2_v2_64, cublasCher2_v2_64, cublasZher2_v2_64, cublasSspr2_v2_64, cublasDspr2_v2_64, cublasChpr2_v2_64,
cublasZhpr2_v2_64, cublasSgemvBatched_64, cublasDgemvBatched_64, cublasCgemvBatched_64, cublasZgemvBatched_64, cublasSgemvStridedBatched_64, cublasDgemvStridedBatched_64, cublasCgemvStridedBatched_64,
cublasZgemvStridedBatched_64, cublasSgemm_v2_64, cublasDgemm_v2_64, cublasCgemm_v2_64, cublasCgemm3m_64, cublasCgemm3mEx_64, cublasZgemm_v2_64, cublasZgemm3m_64,
cublasSgemmEx_64, cublasGemmEx_64, cublasCgemmEx_64, cublasSsyrk_v2_64, cublasDsyrk_v2_64, cublasCsyrk_v2_64, cublasZsyrk_v2_64, cublasCsyrkEx_64,
cublasCsyrk3mEx_64, cublasCherk_v2_64, cublasZherk_v2_64, cublasCherkEx_64, cublasCherk3mEx_64, cublasSsyr2k_v2_64, cublasDsyr2k_v2_64, cublasCsyr2k_v2_64,
cublasZsyr2k_v2_64, cublasCher2k_v2_64, cublasZher2k_v2_64, cublasSsyrkx_64, cublasDsyrkx_64, cublasCsyrkx_64, cublasZsyrkx_64, cublasCherkx_64,
cublasZherkx_64, cublasSsymm_v2_64, cublasDsymm_v2_64, cublasCsymm_v2_64, cublasZsymm_v2_64, cublasChemm_v2_64, cublasZhemm_v2_64, cublasStrsm_v2_64,
cublasDtrsm_v2_64, cublasCtrsm_v2_64, cublasZtrsm_v2_64, cublasStrmm_v2_64, cublasDtrmm_v2_64, cublasCtrmm_v2_64, cublasZtrmm_v2_64, cublasSgemmBatched_64,
cublasDgemmBatched_64, cublasCgemmBatched_64, cublasCgemm3mBatched_64, cublasZgemmBatched_64, cublasSgemmStridedBatched_64, cublasDgemmStridedBatched_64, cublasCgemmStridedBatched_64, cublasCgemm3mStridedBatched_64,
cublasZgemmStridedBatched_64, cublasGemmBatchedEx_64, cublasGemmStridedBatchedEx_64, cublasSgeam_64, cublasDgeam_64, cublasCgeam_64, cublasZgeam_64, cublasStrsmBatched_64,
cublasDtrsmBatched_64, cublasCtrsmBatched_64, cublasZtrsmBatched_64, cublasSdgmm_64, cublasDdgmm_64, cublasCdgmm_64, cublasZdgmm_64, cublasSgemmGroupedBatched.]

     global __cublasSgemmGroupedBatched_64
-    try:
-        __cublasSgemmGroupedBatched_64 = win32api.GetProcAddress(handle, 'cublasSgemmGroupedBatched_64')
-    except:
-        pass
+    __cublasSgemmGroupedBatched_64 = GetProcAddress(handle,
'cublasSgemmGroupedBatched_64') global __cublasDgemmGroupedBatched - try: - __cublasDgemmGroupedBatched = win32api.GetProcAddress(handle, 'cublasDgemmGroupedBatched') - except: - pass + __cublasDgemmGroupedBatched = GetProcAddress(handle, 'cublasDgemmGroupedBatched') global __cublasDgemmGroupedBatched_64 - try: - __cublasDgemmGroupedBatched_64 = win32api.GetProcAddress(handle, 'cublasDgemmGroupedBatched_64') - except: - pass + __cublasDgemmGroupedBatched_64 = GetProcAddress(handle, 'cublasDgemmGroupedBatched_64') global __cublasGemmGroupedBatchedEx - try: - __cublasGemmGroupedBatchedEx = win32api.GetProcAddress(handle, 'cublasGemmGroupedBatchedEx') - except: - pass + __cublasGemmGroupedBatchedEx = GetProcAddress(handle, 'cublasGemmGroupedBatchedEx') global __cublasGemmGroupedBatchedEx_64 - try: - __cublasGemmGroupedBatchedEx_64 = win32api.GetProcAddress(handle, 'cublasGemmGroupedBatchedEx_64') - except: - pass + __cublasGemmGroupedBatchedEx_64 = GetProcAddress(handle, 'cublasGemmGroupedBatchedEx_64') global __cublasGetEmulationStrategy - try: - __cublasGetEmulationStrategy = win32api.GetProcAddress(handle, 'cublasGetEmulationStrategy') - except: - pass + __cublasGetEmulationStrategy = GetProcAddress(handle, 'cublasGetEmulationStrategy') global __cublasSetEmulationStrategy - try: - __cublasSetEmulationStrategy = win32api.GetProcAddress(handle, 'cublasSetEmulationStrategy') - except: - pass + __cublasSetEmulationStrategy = GetProcAddress(handle, 'cublasSetEmulationStrategy') - __py_cublas_init = True - return 0 + __py_cublas_init = True + return 0 cdef dict func_ptrs = None diff --git a/nvmath/bindings/_internal/cudss.pxd b/nvmath/bindings/_internal/cudss.pxd index 2f366ab..9c7a410 100644 --- a/nvmath/bindings/_internal/cudss.pxd +++ b/nvmath/bindings/_internal/cudss.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.5.0. Do not modify it directly. +# This code was automatically generated with version 0.7.0. Do not modify it directly. 
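The cublas_windows.pyx hunks above replace the per-symbol `try: win32api.GetProcAddress(...) except: pass` blocks with a single lock-guarded pass that resolves every symbol through a cdef-extern `GetProcAddress`, leaving absent entries NULL so a lookup failure only surfaces when the corresponding function is actually called. The snippet below is an illustrative ctypes sketch of that lazy-resolution pattern, not the bindings' actual loader; the library handle and symbol names are placeholders, and only `FunctionNotFoundError` mirrors the exception class the generated code imports from `.utils`.

```python
# Illustrative only: a ctypes sketch of the lazy, lock-guarded symbol
# resolution pattern used by the generated loaders. Library and symbol
# names are placeholders.
import ctypes
import threading


class FunctionNotFoundError(RuntimeError):
    """Mirrors the exception the generated bindings raise for missing symbols."""


_lock = threading.Lock()
_cache = {}


def _symbol(lib, name):
    # Resolve each symbol once; absent exports are cached as None rather
    # than raising, just as the Cython loaders keep NULL pointers.
    with _lock:
        if name not in _cache:
            _cache[name] = getattr(lib, name, None)
    return _cache[name]


def call(lib, name, *args):
    fn = _symbol(lib, name)
    if fn is None:
        # The error surfaces only when the missing function is used.
        raise FunctionNotFoundError(f"function {name} is not found")
    return fn(*args)
```

Deferring the error this way keeps import of the bindings cheap and lets newer wrappers coexist with older library builds: a symbol the installed DLL does not export is only a problem for code paths that actually need it.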
from ..cycudss cimport * @@ -15,7 +15,7 @@ cdef cudssStatus_t _cudssConfigSet(cudssConfig_t config, cudssConfigParam_t para cdef cudssStatus_t _cudssConfigGet(cudssConfig_t config, cudssConfigParam_t param, void* value, size_t sizeInBytes, size_t* sizeWritten) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssDataSet(cudssHandle_t handle, cudssData_t data, cudssDataParam_t param, void* value, size_t sizeInBytes) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssDataGet(cudssHandle_t handle, cudssData_t data, cudssDataParam_t param, void* value, size_t sizeInBytes, size_t* sizeWritten) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil -cdef cudssStatus_t _cudssExecute(cudssHandle_t handle, cudssPhase_t phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t _cudssExecute(cudssHandle_t handle, int phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssSetStream(cudssHandle_t handle, cudaStream_t stream) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssSetCommLayer(cudssHandle_t handle, const char* commLibFileName) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssSetThreadingLayer(cudssHandle_t handle, const char* thrLibFileName) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil @@ -24,6 +24,7 @@ cdef cudssStatus_t _cudssConfigDestroy(cudssConfig_t solverConfig) except?_CUDSS cdef cudssStatus_t _cudssDataCreate(cudssHandle_t handle, cudssData_t* solverData) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssDataDestroy(cudssHandle_t handle, cudssData_t solverData) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssCreate(cudssHandle_t* handle) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t _cudssCreateMg(cudssHandle_t* handle_pt, int device_count, int* device_indices) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssDestroy(cudssHandle_t handle) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssGetProperty(libraryPropertyType propertyType, int* value) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssMatrixCreateDn(cudssMatrix_t* matrix, int64_t nrows, int64_t ncols, int64_t ld, void* values, cudaDataType_t valueType, cudssLayout_t layout) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil @@ -40,5 +41,7 @@ cdef cudssStatus_t _cudssMatrixGetBatchCsr(cudssMatrix_t matrix, int64_t* batchC cdef cudssStatus_t _cudssMatrixSetBatchValues(cudssMatrix_t matrix, void** values) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssMatrixSetBatchCsrPointers(cudssMatrix_t matrix, void** rowOffsets, void** rowEnd, void** colIndices, void** values) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssMatrixGetFormat(cudssMatrix_t matrix, int* format) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t _cudssMatrixSetDistributionRow1d(cudssMatrix_t matrix, int64_t first_row, int64_t last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t _cudssMatrixGetDistributionRow1d(cudssMatrix_t matrix, int64_t* first_row, int64_t* last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR 
nogil cdef cudssStatus_t _cudssGetDeviceMemHandler(cudssHandle_t handle, cudssDeviceMemHandler_t* handler) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t _cudssSetDeviceMemHandler(cudssHandle_t handle, const cudssDeviceMemHandler_t* handler) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/_internal/cudss_linux.pyx b/nvmath/bindings/_internal/cudss_linux.pyx index 94f2a41..a69c30c 100644 --- a/nvmath/bindings/_internal/cudss_linux.pyx +++ b/nvmath/bindings/_internal/cudss_linux.pyx @@ -2,18 +2,23 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.5.0. Do not modify it directly. +# This code was automatically generated with version 0.7.0. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### +# You must 'from .utils import NotSupportedError' before using this template + cdef extern from "" nogil: void* dlopen(const char*, int) char* dlerror() @@ -28,11 +33,31 @@ cdef extern from "" nogil: const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_cudss_init = False cdef void* __cudssConfigSet = NULL @@ -48,6 +73,7 @@ cdef void* __cudssConfigDestroy = NULL cdef void* __cudssDataCreate = NULL cdef void* __cudssDataDestroy = NULL cdef void* __cudssCreate = NULL +cdef void* __cudssCreateMg = NULL cdef void* __cudssDestroy = NULL cdef void* __cudssGetProperty = NULL cdef void* __cudssMatrixCreateDn = NULL @@ -64,6 +90,8 @@ cdef void* __cudssMatrixGetBatchCsr = NULL cdef void* __cudssMatrixSetBatchValues = NULL cdef void* __cudssMatrixSetBatchCsrPointers = NULL cdef void* __cudssMatrixGetFormat = NULL +cdef void* __cudssMatrixSetDistributionRow1d = NULL +cdef void* __cudssMatrixGetDistributionRow1d = NULL cdef void* __cudssGetDeviceMemHandler = NULL cdef void* __cudssSetDeviceMemHandler = NULL @@ -79,226 +107,248 @@ cdef int _check_or_init_cudss() except -1 nogil: return 0 cdef void* handle = NULL - # Load function - global __cudssConfigSet - __cudssConfigSet = dlsym(RTLD_DEFAULT, 'cudssConfigSet') - if __cudssConfigSet == NULL: - if handle == NULL: - handle = load_library() - __cudssConfigSet = dlsym(handle, 'cudssConfigSet') - - global __cudssConfigGet - __cudssConfigGet = dlsym(RTLD_DEFAULT, 'cudssConfigGet') - if __cudssConfigGet == NULL: - if handle == NULL: - handle = load_library() - __cudssConfigGet = dlsym(handle, 'cudssConfigGet') - - global 
__cudssDataSet - __cudssDataSet = dlsym(RTLD_DEFAULT, 'cudssDataSet') - if __cudssDataSet == NULL: - if handle == NULL: - handle = load_library() - __cudssDataSet = dlsym(handle, 'cudssDataSet') - - global __cudssDataGet - __cudssDataGet = dlsym(RTLD_DEFAULT, 'cudssDataGet') - if __cudssDataGet == NULL: - if handle == NULL: - handle = load_library() - __cudssDataGet = dlsym(handle, 'cudssDataGet') - - global __cudssExecute - __cudssExecute = dlsym(RTLD_DEFAULT, 'cudssExecute') - if __cudssExecute == NULL: - if handle == NULL: - handle = load_library() - __cudssExecute = dlsym(handle, 'cudssExecute') - - global __cudssSetStream - __cudssSetStream = dlsym(RTLD_DEFAULT, 'cudssSetStream') - if __cudssSetStream == NULL: - if handle == NULL: - handle = load_library() - __cudssSetStream = dlsym(handle, 'cudssSetStream') - - global __cudssSetCommLayer - __cudssSetCommLayer = dlsym(RTLD_DEFAULT, 'cudssSetCommLayer') - if __cudssSetCommLayer == NULL: - if handle == NULL: - handle = load_library() - __cudssSetCommLayer = dlsym(handle, 'cudssSetCommLayer') - - global __cudssSetThreadingLayer - __cudssSetThreadingLayer = dlsym(RTLD_DEFAULT, 'cudssSetThreadingLayer') - if __cudssSetThreadingLayer == NULL: - if handle == NULL: - handle = load_library() - __cudssSetThreadingLayer = dlsym(handle, 'cudssSetThreadingLayer') - - global __cudssConfigCreate - __cudssConfigCreate = dlsym(RTLD_DEFAULT, 'cudssConfigCreate') - if __cudssConfigCreate == NULL: - if handle == NULL: - handle = load_library() - __cudssConfigCreate = dlsym(handle, 'cudssConfigCreate') - - global __cudssConfigDestroy - __cudssConfigDestroy = dlsym(RTLD_DEFAULT, 'cudssConfigDestroy') - if __cudssConfigDestroy == NULL: - if handle == NULL: - handle = load_library() - __cudssConfigDestroy = dlsym(handle, 'cudssConfigDestroy') - - global __cudssDataCreate - __cudssDataCreate = dlsym(RTLD_DEFAULT, 'cudssDataCreate') - if __cudssDataCreate == NULL: - if handle == NULL: - handle = load_library() - __cudssDataCreate = dlsym(handle, 'cudssDataCreate') - - global __cudssDataDestroy - __cudssDataDestroy = dlsym(RTLD_DEFAULT, 'cudssDataDestroy') - if __cudssDataDestroy == NULL: - if handle == NULL: - handle = load_library() - __cudssDataDestroy = dlsym(handle, 'cudssDataDestroy') - - global __cudssCreate - __cudssCreate = dlsym(RTLD_DEFAULT, 'cudssCreate') - if __cudssCreate == NULL: - if handle == NULL: - handle = load_library() - __cudssCreate = dlsym(handle, 'cudssCreate') - - global __cudssDestroy - __cudssDestroy = dlsym(RTLD_DEFAULT, 'cudssDestroy') - if __cudssDestroy == NULL: - if handle == NULL: - handle = load_library() - __cudssDestroy = dlsym(handle, 'cudssDestroy') - - global __cudssGetProperty - __cudssGetProperty = dlsym(RTLD_DEFAULT, 'cudssGetProperty') - if __cudssGetProperty == NULL: - if handle == NULL: - handle = load_library() - __cudssGetProperty = dlsym(handle, 'cudssGetProperty') - - global __cudssMatrixCreateDn - __cudssMatrixCreateDn = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateDn') - if __cudssMatrixCreateDn == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixCreateDn = dlsym(handle, 'cudssMatrixCreateDn') - - global __cudssMatrixCreateCsr - __cudssMatrixCreateCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateCsr') - if __cudssMatrixCreateCsr == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixCreateCsr = dlsym(handle, 'cudssMatrixCreateCsr') - global __cudssMatrixCreateBatchDn - __cudssMatrixCreateBatchDn = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateBatchDn') - if __cudssMatrixCreateBatchDn == NULL: - 
if handle == NULL: - handle = load_library() - __cudssMatrixCreateBatchDn = dlsym(handle, 'cudssMatrixCreateBatchDn') - - global __cudssMatrixCreateBatchCsr - __cudssMatrixCreateBatchCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateBatchCsr') - if __cudssMatrixCreateBatchCsr == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixCreateBatchCsr = dlsym(handle, 'cudssMatrixCreateBatchCsr') - - global __cudssMatrixDestroy - __cudssMatrixDestroy = dlsym(RTLD_DEFAULT, 'cudssMatrixDestroy') - if __cudssMatrixDestroy == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixDestroy = dlsym(handle, 'cudssMatrixDestroy') - - global __cudssMatrixGetDn - __cudssMatrixGetDn = dlsym(RTLD_DEFAULT, 'cudssMatrixGetDn') - if __cudssMatrixGetDn == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixGetDn = dlsym(handle, 'cudssMatrixGetDn') - - global __cudssMatrixGetCsr - __cudssMatrixGetCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixGetCsr') - if __cudssMatrixGetCsr == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixGetCsr = dlsym(handle, 'cudssMatrixGetCsr') - - global __cudssMatrixSetValues - __cudssMatrixSetValues = dlsym(RTLD_DEFAULT, 'cudssMatrixSetValues') - if __cudssMatrixSetValues == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixSetValues = dlsym(handle, 'cudssMatrixSetValues') - - global __cudssMatrixSetCsrPointers - __cudssMatrixSetCsrPointers = dlsym(RTLD_DEFAULT, 'cudssMatrixSetCsrPointers') - if __cudssMatrixSetCsrPointers == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixSetCsrPointers = dlsym(handle, 'cudssMatrixSetCsrPointers') - - global __cudssMatrixGetBatchDn - __cudssMatrixGetBatchDn = dlsym(RTLD_DEFAULT, 'cudssMatrixGetBatchDn') - if __cudssMatrixGetBatchDn == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixGetBatchDn = dlsym(handle, 'cudssMatrixGetBatchDn') - - global __cudssMatrixGetBatchCsr - __cudssMatrixGetBatchCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixGetBatchCsr') - if __cudssMatrixGetBatchCsr == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixGetBatchCsr = dlsym(handle, 'cudssMatrixGetBatchCsr') - - global __cudssMatrixSetBatchValues - __cudssMatrixSetBatchValues = dlsym(RTLD_DEFAULT, 'cudssMatrixSetBatchValues') - if __cudssMatrixSetBatchValues == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixSetBatchValues = dlsym(handle, 'cudssMatrixSetBatchValues') - - global __cudssMatrixSetBatchCsrPointers - __cudssMatrixSetBatchCsrPointers = dlsym(RTLD_DEFAULT, 'cudssMatrixSetBatchCsrPointers') - if __cudssMatrixSetBatchCsrPointers == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixSetBatchCsrPointers = dlsym(handle, 'cudssMatrixSetBatchCsrPointers') - - global __cudssMatrixGetFormat - __cudssMatrixGetFormat = dlsym(RTLD_DEFAULT, 'cudssMatrixGetFormat') - if __cudssMatrixGetFormat == NULL: - if handle == NULL: - handle = load_library() - __cudssMatrixGetFormat = dlsym(handle, 'cudssMatrixGetFormat') - - global __cudssGetDeviceMemHandler - __cudssGetDeviceMemHandler = dlsym(RTLD_DEFAULT, 'cudssGetDeviceMemHandler') - if __cudssGetDeviceMemHandler == NULL: - if handle == NULL: - handle = load_library() - __cudssGetDeviceMemHandler = dlsym(handle, 'cudssGetDeviceMemHandler') - - global __cudssSetDeviceMemHandler - __cudssSetDeviceMemHandler = dlsym(RTLD_DEFAULT, 'cudssSetDeviceMemHandler') - if __cudssSetDeviceMemHandler == NULL: - if handle == NULL: - handle = load_library() - __cudssSetDeviceMemHandler = dlsym(handle, 
'cudssSetDeviceMemHandler') - - __py_cudss_init = True - return 0 + with gil, __symbol_lock: + # Load function + global __cudssConfigSet + __cudssConfigSet = dlsym(RTLD_DEFAULT, 'cudssConfigSet') + if __cudssConfigSet == NULL: + if handle == NULL: + handle = load_library() + __cudssConfigSet = dlsym(handle, 'cudssConfigSet') + + global __cudssConfigGet + __cudssConfigGet = dlsym(RTLD_DEFAULT, 'cudssConfigGet') + if __cudssConfigGet == NULL: + if handle == NULL: + handle = load_library() + __cudssConfigGet = dlsym(handle, 'cudssConfigGet') + + global __cudssDataSet + __cudssDataSet = dlsym(RTLD_DEFAULT, 'cudssDataSet') + if __cudssDataSet == NULL: + if handle == NULL: + handle = load_library() + __cudssDataSet = dlsym(handle, 'cudssDataSet') + + global __cudssDataGet + __cudssDataGet = dlsym(RTLD_DEFAULT, 'cudssDataGet') + if __cudssDataGet == NULL: + if handle == NULL: + handle = load_library() + __cudssDataGet = dlsym(handle, 'cudssDataGet') + + global __cudssExecute + __cudssExecute = dlsym(RTLD_DEFAULT, 'cudssExecute') + if __cudssExecute == NULL: + if handle == NULL: + handle = load_library() + __cudssExecute = dlsym(handle, 'cudssExecute') + + global __cudssSetStream + __cudssSetStream = dlsym(RTLD_DEFAULT, 'cudssSetStream') + if __cudssSetStream == NULL: + if handle == NULL: + handle = load_library() + __cudssSetStream = dlsym(handle, 'cudssSetStream') + + global __cudssSetCommLayer + __cudssSetCommLayer = dlsym(RTLD_DEFAULT, 'cudssSetCommLayer') + if __cudssSetCommLayer == NULL: + if handle == NULL: + handle = load_library() + __cudssSetCommLayer = dlsym(handle, 'cudssSetCommLayer') + + global __cudssSetThreadingLayer + __cudssSetThreadingLayer = dlsym(RTLD_DEFAULT, 'cudssSetThreadingLayer') + if __cudssSetThreadingLayer == NULL: + if handle == NULL: + handle = load_library() + __cudssSetThreadingLayer = dlsym(handle, 'cudssSetThreadingLayer') + + global __cudssConfigCreate + __cudssConfigCreate = dlsym(RTLD_DEFAULT, 'cudssConfigCreate') + if __cudssConfigCreate == NULL: + if handle == NULL: + handle = load_library() + __cudssConfigCreate = dlsym(handle, 'cudssConfigCreate') + + global __cudssConfigDestroy + __cudssConfigDestroy = dlsym(RTLD_DEFAULT, 'cudssConfigDestroy') + if __cudssConfigDestroy == NULL: + if handle == NULL: + handle = load_library() + __cudssConfigDestroy = dlsym(handle, 'cudssConfigDestroy') + + global __cudssDataCreate + __cudssDataCreate = dlsym(RTLD_DEFAULT, 'cudssDataCreate') + if __cudssDataCreate == NULL: + if handle == NULL: + handle = load_library() + __cudssDataCreate = dlsym(handle, 'cudssDataCreate') + + global __cudssDataDestroy + __cudssDataDestroy = dlsym(RTLD_DEFAULT, 'cudssDataDestroy') + if __cudssDataDestroy == NULL: + if handle == NULL: + handle = load_library() + __cudssDataDestroy = dlsym(handle, 'cudssDataDestroy') + + global __cudssCreate + __cudssCreate = dlsym(RTLD_DEFAULT, 'cudssCreate') + if __cudssCreate == NULL: + if handle == NULL: + handle = load_library() + __cudssCreate = dlsym(handle, 'cudssCreate') + + global __cudssCreateMg + __cudssCreateMg = dlsym(RTLD_DEFAULT, 'cudssCreateMg') + if __cudssCreateMg == NULL: + if handle == NULL: + handle = load_library() + __cudssCreateMg = dlsym(handle, 'cudssCreateMg') + + global __cudssDestroy + __cudssDestroy = dlsym(RTLD_DEFAULT, 'cudssDestroy') + if __cudssDestroy == NULL: + if handle == NULL: + handle = load_library() + __cudssDestroy = dlsym(handle, 'cudssDestroy') + + global __cudssGetProperty + __cudssGetProperty = dlsym(RTLD_DEFAULT, 'cudssGetProperty') + if __cudssGetProperty == 
NULL: + if handle == NULL: + handle = load_library() + __cudssGetProperty = dlsym(handle, 'cudssGetProperty') + + global __cudssMatrixCreateDn + __cudssMatrixCreateDn = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateDn') + if __cudssMatrixCreateDn == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixCreateDn = dlsym(handle, 'cudssMatrixCreateDn') + + global __cudssMatrixCreateCsr + __cudssMatrixCreateCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateCsr') + if __cudssMatrixCreateCsr == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixCreateCsr = dlsym(handle, 'cudssMatrixCreateCsr') + + global __cudssMatrixCreateBatchDn + __cudssMatrixCreateBatchDn = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateBatchDn') + if __cudssMatrixCreateBatchDn == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixCreateBatchDn = dlsym(handle, 'cudssMatrixCreateBatchDn') + + global __cudssMatrixCreateBatchCsr + __cudssMatrixCreateBatchCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixCreateBatchCsr') + if __cudssMatrixCreateBatchCsr == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixCreateBatchCsr = dlsym(handle, 'cudssMatrixCreateBatchCsr') + + global __cudssMatrixDestroy + __cudssMatrixDestroy = dlsym(RTLD_DEFAULT, 'cudssMatrixDestroy') + if __cudssMatrixDestroy == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixDestroy = dlsym(handle, 'cudssMatrixDestroy') + + global __cudssMatrixGetDn + __cudssMatrixGetDn = dlsym(RTLD_DEFAULT, 'cudssMatrixGetDn') + if __cudssMatrixGetDn == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixGetDn = dlsym(handle, 'cudssMatrixGetDn') + + global __cudssMatrixGetCsr + __cudssMatrixGetCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixGetCsr') + if __cudssMatrixGetCsr == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixGetCsr = dlsym(handle, 'cudssMatrixGetCsr') + + global __cudssMatrixSetValues + __cudssMatrixSetValues = dlsym(RTLD_DEFAULT, 'cudssMatrixSetValues') + if __cudssMatrixSetValues == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixSetValues = dlsym(handle, 'cudssMatrixSetValues') + + global __cudssMatrixSetCsrPointers + __cudssMatrixSetCsrPointers = dlsym(RTLD_DEFAULT, 'cudssMatrixSetCsrPointers') + if __cudssMatrixSetCsrPointers == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixSetCsrPointers = dlsym(handle, 'cudssMatrixSetCsrPointers') + + global __cudssMatrixGetBatchDn + __cudssMatrixGetBatchDn = dlsym(RTLD_DEFAULT, 'cudssMatrixGetBatchDn') + if __cudssMatrixGetBatchDn == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixGetBatchDn = dlsym(handle, 'cudssMatrixGetBatchDn') + + global __cudssMatrixGetBatchCsr + __cudssMatrixGetBatchCsr = dlsym(RTLD_DEFAULT, 'cudssMatrixGetBatchCsr') + if __cudssMatrixGetBatchCsr == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixGetBatchCsr = dlsym(handle, 'cudssMatrixGetBatchCsr') + + global __cudssMatrixSetBatchValues + __cudssMatrixSetBatchValues = dlsym(RTLD_DEFAULT, 'cudssMatrixSetBatchValues') + if __cudssMatrixSetBatchValues == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixSetBatchValues = dlsym(handle, 'cudssMatrixSetBatchValues') + + global __cudssMatrixSetBatchCsrPointers + __cudssMatrixSetBatchCsrPointers = dlsym(RTLD_DEFAULT, 'cudssMatrixSetBatchCsrPointers') + if __cudssMatrixSetBatchCsrPointers == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixSetBatchCsrPointers = dlsym(handle, 'cudssMatrixSetBatchCsrPointers') + + global 
__cudssMatrixGetFormat + __cudssMatrixGetFormat = dlsym(RTLD_DEFAULT, 'cudssMatrixGetFormat') + if __cudssMatrixGetFormat == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixGetFormat = dlsym(handle, 'cudssMatrixGetFormat') + + global __cudssMatrixSetDistributionRow1d + __cudssMatrixSetDistributionRow1d = dlsym(RTLD_DEFAULT, 'cudssMatrixSetDistributionRow1d') + if __cudssMatrixSetDistributionRow1d == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixSetDistributionRow1d = dlsym(handle, 'cudssMatrixSetDistributionRow1d') + + global __cudssMatrixGetDistributionRow1d + __cudssMatrixGetDistributionRow1d = dlsym(RTLD_DEFAULT, 'cudssMatrixGetDistributionRow1d') + if __cudssMatrixGetDistributionRow1d == NULL: + if handle == NULL: + handle = load_library() + __cudssMatrixGetDistributionRow1d = dlsym(handle, 'cudssMatrixGetDistributionRow1d') + + global __cudssGetDeviceMemHandler + __cudssGetDeviceMemHandler = dlsym(RTLD_DEFAULT, 'cudssGetDeviceMemHandler') + if __cudssGetDeviceMemHandler == NULL: + if handle == NULL: + handle = load_library() + __cudssGetDeviceMemHandler = dlsym(handle, 'cudssGetDeviceMemHandler') + + global __cudssSetDeviceMemHandler + __cudssSetDeviceMemHandler = dlsym(RTLD_DEFAULT, 'cudssSetDeviceMemHandler') + if __cudssSetDeviceMemHandler == NULL: + if handle == NULL: + handle = load_library() + __cudssSetDeviceMemHandler = dlsym(handle, 'cudssSetDeviceMemHandler') + __py_cudss_init = True + return 0 cdef dict func_ptrs = None @@ -351,6 +401,9 @@ cpdef dict _inspect_function_pointers(): global __cudssCreate data["__cudssCreate"] = __cudssCreate + global __cudssCreateMg + data["__cudssCreateMg"] = __cudssCreateMg + global __cudssDestroy data["__cudssDestroy"] = __cudssDestroy @@ -399,6 +452,12 @@ cpdef dict _inspect_function_pointers(): global __cudssMatrixGetFormat data["__cudssMatrixGetFormat"] = __cudssMatrixGetFormat + global __cudssMatrixSetDistributionRow1d + data["__cudssMatrixSetDistributionRow1d"] = __cudssMatrixSetDistributionRow1d + + global __cudssMatrixGetDistributionRow1d + data["__cudssMatrixGetDistributionRow1d"] = __cudssMatrixGetDistributionRow1d + global __cudssGetDeviceMemHandler data["__cudssGetDeviceMemHandler"] = __cudssGetDeviceMemHandler @@ -460,13 +519,13 @@ cdef cudssStatus_t _cudssDataGet(cudssHandle_t handle, cudssData_t data, cudssDa handle, data, param, value, sizeInBytes, sizeWritten) -cdef cudssStatus_t _cudssExecute(cudssHandle_t handle, cudssPhase_t phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: +cdef cudssStatus_t _cudssExecute(cudssHandle_t handle, int phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: global __cudssExecute _check_or_init_cudss() if __cudssExecute == NULL: with gil: raise FunctionNotFoundError("function cudssExecute is not found") - return (__cudssExecute)( + return (__cudssExecute)( handle, phase, solverConfig, solverData, inputMatrix, solution, rhs) @@ -550,6 +609,16 @@ cdef cudssStatus_t _cudssCreate(cudssHandle_t* handle) except?_CUDSSSTATUS_T_INT handle) +cdef cudssStatus_t _cudssCreateMg(cudssHandle_t* handle_pt, int device_count, int* device_indices) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cudssCreateMg + _check_or_init_cudss() + if __cudssCreateMg == NULL: + with gil: + raise 
FunctionNotFoundError("function cudssCreateMg is not found") + return (__cudssCreateMg)( + handle_pt, device_count, device_indices) + + cdef cudssStatus_t _cudssDestroy(cudssHandle_t handle) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: global __cudssDestroy _check_or_init_cudss() @@ -710,6 +779,26 @@ cdef cudssStatus_t _cudssMatrixGetFormat(cudssMatrix_t matrix, int* format) exce matrix, format) +cdef cudssStatus_t _cudssMatrixSetDistributionRow1d(cudssMatrix_t matrix, int64_t first_row, int64_t last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cudssMatrixSetDistributionRow1d + _check_or_init_cudss() + if __cudssMatrixSetDistributionRow1d == NULL: + with gil: + raise FunctionNotFoundError("function cudssMatrixSetDistributionRow1d is not found") + return (__cudssMatrixSetDistributionRow1d)( + matrix, first_row, last_row) + + +cdef cudssStatus_t _cudssMatrixGetDistributionRow1d(cudssMatrix_t matrix, int64_t* first_row, int64_t* last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cudssMatrixGetDistributionRow1d + _check_or_init_cudss() + if __cudssMatrixGetDistributionRow1d == NULL: + with gil: + raise FunctionNotFoundError("function cudssMatrixGetDistributionRow1d is not found") + return (__cudssMatrixGetDistributionRow1d)( + matrix, first_row, last_row) + + cdef cudssStatus_t _cudssGetDeviceMemHandler(cudssHandle_t handle, cudssDeviceMemHandler_t* handler) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: global __cudssGetDeviceMemHandler _check_or_init_cudss() diff --git a/nvmath/bindings/_internal/cudss_windows.pyx b/nvmath/bindings/_internal/cudss_windows.pyx index d64ce3f..07daec7 100644 --- a/nvmath/bindings/_internal/cudss_windows.pyx +++ b/nvmath/bindings/_internal/cudss_windows.pyx @@ -2,27 +2,83 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.5.0. Do not modify it directly. +# This code was automatically generated with version 0.7.0. Do not modify it directly. 
from libc.stdint cimport intptr_t, uintptr_t import os import site - -import win32api +import threading from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib +from libc.stddef cimport wchar_t +from libc.stdint cimport uintptr_t +from cpython cimport PyUnicode_AsWideCharString, PyMem_Free + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "windows.h" nogil: + ctypedef void* HMODULE + ctypedef void* HANDLE + ctypedef void* FARPROC + ctypedef unsigned long DWORD + ctypedef const wchar_t *LPCWSTR + ctypedef const char *LPCSTR + + cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 + cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 + + HMODULE _LoadLibraryExW "LoadLibraryExW"( + LPCWSTR lpLibFileName, + HANDLE hFile, + DWORD dwFlags + ) + + FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName) + +cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags): + cdef uintptr_t result + cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL) + with nogil: + result = _LoadLibraryExW( + wpath, + hFile, + dwFlags + ) + PyMem_Free(wpath) + return result + +cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil: + return _GetProcAddress(hModule, lpProcName) + +cdef int get_cuda_version(): + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32) + if handle == 0: + raise NotSupportedError('CUDA driver is not found') + cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion') + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in nvcuda.dll') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + + ############################################################################### # Wrapper init ############################################################################### -LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 +cdef object __symbol_lock = threading.Lock() cdef bint __py_cudss_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __cudssConfigSet = NULL cdef void* __cudssConfigGet = NULL @@ -37,6 +93,7 @@ cdef void* __cudssConfigDestroy = NULL cdef void* __cudssDataCreate = NULL cdef void* __cudssDataDestroy = NULL cdef void* __cudssCreate = NULL +cdef void* __cudssCreateMg = NULL cdef void* __cudssDestroy = NULL cdef void* __cudssGetProperty = NULL cdef void* __cudssMatrixCreateDn = NULL @@ -53,6 +110,8 @@ cdef void* __cudssMatrixGetBatchCsr = NULL cdef void* __cudssMatrixSetBatchValues = NULL cdef void* __cudssMatrixSetBatchCsrPointers = NULL cdef void* __cudssMatrixGetFormat = NULL +cdef void* __cudssMatrixSetDistributionRow1d = NULL +cdef void* __cudssMatrixGetDistributionRow1d = NULL cdef void* __cudssGetDeviceMemHandler = NULL cdef void* __cudssSetDeviceMemHandler = NULL @@ -71,199 +130,115 @@ cdef int _check_or_init_cudss() except -1 nogil: if __py_cudss_init: return 0 - with gil: + with gil, __symbol_lock: # Load library handle = load_library() # Load function global __cudssConfigSet - try: - __cudssConfigSet = win32api.GetProcAddress(handle, 'cudssConfigSet') - except: - pass + __cudssConfigSet = GetProcAddress(handle, 'cudssConfigSet') global __cudssConfigGet - try: - __cudssConfigGet = win32api.GetProcAddress(handle, 
'cudssConfigGet') - except: - pass + __cudssConfigGet = GetProcAddress(handle, 'cudssConfigGet') global __cudssDataSet - try: - __cudssDataSet = win32api.GetProcAddress(handle, 'cudssDataSet') - except: - pass + __cudssDataSet = GetProcAddress(handle, 'cudssDataSet') global __cudssDataGet - try: - __cudssDataGet = win32api.GetProcAddress(handle, 'cudssDataGet') - except: - pass + __cudssDataGet = GetProcAddress(handle, 'cudssDataGet') global __cudssExecute - try: - __cudssExecute = win32api.GetProcAddress(handle, 'cudssExecute') - except: - pass + __cudssExecute = GetProcAddress(handle, 'cudssExecute') global __cudssSetStream - try: - __cudssSetStream = win32api.GetProcAddress(handle, 'cudssSetStream') - except: - pass + __cudssSetStream = GetProcAddress(handle, 'cudssSetStream') global __cudssSetCommLayer - try: - __cudssSetCommLayer = win32api.GetProcAddress(handle, 'cudssSetCommLayer') - except: - pass + __cudssSetCommLayer = GetProcAddress(handle, 'cudssSetCommLayer') global __cudssSetThreadingLayer - try: - __cudssSetThreadingLayer = win32api.GetProcAddress(handle, 'cudssSetThreadingLayer') - except: - pass + __cudssSetThreadingLayer = GetProcAddress(handle, 'cudssSetThreadingLayer') global __cudssConfigCreate - try: - __cudssConfigCreate = win32api.GetProcAddress(handle, 'cudssConfigCreate') - except: - pass + __cudssConfigCreate = GetProcAddress(handle, 'cudssConfigCreate') global __cudssConfigDestroy - try: - __cudssConfigDestroy = win32api.GetProcAddress(handle, 'cudssConfigDestroy') - except: - pass + __cudssConfigDestroy = GetProcAddress(handle, 'cudssConfigDestroy') global __cudssDataCreate - try: - __cudssDataCreate = win32api.GetProcAddress(handle, 'cudssDataCreate') - except: - pass + __cudssDataCreate = GetProcAddress(handle, 'cudssDataCreate') global __cudssDataDestroy - try: - __cudssDataDestroy = win32api.GetProcAddress(handle, 'cudssDataDestroy') - except: - pass + __cudssDataDestroy = GetProcAddress(handle, 'cudssDataDestroy') global __cudssCreate - try: - __cudssCreate = win32api.GetProcAddress(handle, 'cudssCreate') - except: - pass + __cudssCreate = GetProcAddress(handle, 'cudssCreate') + + global __cudssCreateMg + __cudssCreateMg = GetProcAddress(handle, 'cudssCreateMg') global __cudssDestroy - try: - __cudssDestroy = win32api.GetProcAddress(handle, 'cudssDestroy') - except: - pass + __cudssDestroy = GetProcAddress(handle, 'cudssDestroy') global __cudssGetProperty - try: - __cudssGetProperty = win32api.GetProcAddress(handle, 'cudssGetProperty') - except: - pass + __cudssGetProperty = GetProcAddress(handle, 'cudssGetProperty') global __cudssMatrixCreateDn - try: - __cudssMatrixCreateDn = win32api.GetProcAddress(handle, 'cudssMatrixCreateDn') - except: - pass + __cudssMatrixCreateDn = GetProcAddress(handle, 'cudssMatrixCreateDn') global __cudssMatrixCreateCsr - try: - __cudssMatrixCreateCsr = win32api.GetProcAddress(handle, 'cudssMatrixCreateCsr') - except: - pass + __cudssMatrixCreateCsr = GetProcAddress(handle, 'cudssMatrixCreateCsr') global __cudssMatrixCreateBatchDn - try: - __cudssMatrixCreateBatchDn = win32api.GetProcAddress(handle, 'cudssMatrixCreateBatchDn') - except: - pass + __cudssMatrixCreateBatchDn = GetProcAddress(handle, 'cudssMatrixCreateBatchDn') global __cudssMatrixCreateBatchCsr - try: - __cudssMatrixCreateBatchCsr = win32api.GetProcAddress(handle, 'cudssMatrixCreateBatchCsr') - except: - pass + __cudssMatrixCreateBatchCsr = GetProcAddress(handle, 'cudssMatrixCreateBatchCsr') global __cudssMatrixDestroy - try: - __cudssMatrixDestroy = 
win32api.GetProcAddress(handle, 'cudssMatrixDestroy') - except: - pass + __cudssMatrixDestroy = GetProcAddress(handle, 'cudssMatrixDestroy') global __cudssMatrixGetDn - try: - __cudssMatrixGetDn = win32api.GetProcAddress(handle, 'cudssMatrixGetDn') - except: - pass + __cudssMatrixGetDn = GetProcAddress(handle, 'cudssMatrixGetDn') global __cudssMatrixGetCsr - try: - __cudssMatrixGetCsr = win32api.GetProcAddress(handle, 'cudssMatrixGetCsr') - except: - pass + __cudssMatrixGetCsr = GetProcAddress(handle, 'cudssMatrixGetCsr') global __cudssMatrixSetValues - try: - __cudssMatrixSetValues = win32api.GetProcAddress(handle, 'cudssMatrixSetValues') - except: - pass + __cudssMatrixSetValues = GetProcAddress(handle, 'cudssMatrixSetValues') global __cudssMatrixSetCsrPointers - try: - __cudssMatrixSetCsrPointers = win32api.GetProcAddress(handle, 'cudssMatrixSetCsrPointers') - except: - pass + __cudssMatrixSetCsrPointers = GetProcAddress(handle, 'cudssMatrixSetCsrPointers') global __cudssMatrixGetBatchDn - try: - __cudssMatrixGetBatchDn = win32api.GetProcAddress(handle, 'cudssMatrixGetBatchDn') - except: - pass + __cudssMatrixGetBatchDn = GetProcAddress(handle, 'cudssMatrixGetBatchDn') global __cudssMatrixGetBatchCsr - try: - __cudssMatrixGetBatchCsr = win32api.GetProcAddress(handle, 'cudssMatrixGetBatchCsr') - except: - pass + __cudssMatrixGetBatchCsr = GetProcAddress(handle, 'cudssMatrixGetBatchCsr') global __cudssMatrixSetBatchValues - try: - __cudssMatrixSetBatchValues = win32api.GetProcAddress(handle, 'cudssMatrixSetBatchValues') - except: - pass + __cudssMatrixSetBatchValues = GetProcAddress(handle, 'cudssMatrixSetBatchValues') global __cudssMatrixSetBatchCsrPointers - try: - __cudssMatrixSetBatchCsrPointers = win32api.GetProcAddress(handle, 'cudssMatrixSetBatchCsrPointers') - except: - pass + __cudssMatrixSetBatchCsrPointers = GetProcAddress(handle, 'cudssMatrixSetBatchCsrPointers') global __cudssMatrixGetFormat - try: - __cudssMatrixGetFormat = win32api.GetProcAddress(handle, 'cudssMatrixGetFormat') - except: - pass + __cudssMatrixGetFormat = GetProcAddress(handle, 'cudssMatrixGetFormat') + + global __cudssMatrixSetDistributionRow1d + __cudssMatrixSetDistributionRow1d = GetProcAddress(handle, 'cudssMatrixSetDistributionRow1d') + + global __cudssMatrixGetDistributionRow1d + __cudssMatrixGetDistributionRow1d = GetProcAddress(handle, 'cudssMatrixGetDistributionRow1d') global __cudssGetDeviceMemHandler - try: - __cudssGetDeviceMemHandler = win32api.GetProcAddress(handle, 'cudssGetDeviceMemHandler') - except: - pass + __cudssGetDeviceMemHandler = GetProcAddress(handle, 'cudssGetDeviceMemHandler') global __cudssSetDeviceMemHandler - try: - __cudssSetDeviceMemHandler = win32api.GetProcAddress(handle, 'cudssSetDeviceMemHandler') - except: - pass + __cudssSetDeviceMemHandler = GetProcAddress(handle, 'cudssSetDeviceMemHandler') - __py_cudss_init = True - return 0 + __py_cudss_init = True + return 0 cdef dict func_ptrs = None @@ -316,6 +291,9 @@ cpdef dict _inspect_function_pointers(): global __cudssCreate data["__cudssCreate"] = __cudssCreate + global __cudssCreateMg + data["__cudssCreateMg"] = __cudssCreateMg + global __cudssDestroy data["__cudssDestroy"] = __cudssDestroy @@ -364,6 +342,12 @@ cpdef dict _inspect_function_pointers(): global __cudssMatrixGetFormat data["__cudssMatrixGetFormat"] = __cudssMatrixGetFormat + global __cudssMatrixSetDistributionRow1d + data["__cudssMatrixSetDistributionRow1d"] = __cudssMatrixSetDistributionRow1d + + global __cudssMatrixGetDistributionRow1d + 
data["__cudssMatrixGetDistributionRow1d"] = __cudssMatrixGetDistributionRow1d + global __cudssGetDeviceMemHandler data["__cudssGetDeviceMemHandler"] = __cudssGetDeviceMemHandler @@ -425,13 +409,13 @@ cdef cudssStatus_t _cudssDataGet(cudssHandle_t handle, cudssData_t data, cudssDa handle, data, param, value, sizeInBytes, sizeWritten) -cdef cudssStatus_t _cudssExecute(cudssHandle_t handle, cudssPhase_t phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: +cdef cudssStatus_t _cudssExecute(cudssHandle_t handle, int phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: global __cudssExecute _check_or_init_cudss() if __cudssExecute == NULL: with gil: raise FunctionNotFoundError("function cudssExecute is not found") - return (__cudssExecute)( + return (__cudssExecute)( handle, phase, solverConfig, solverData, inputMatrix, solution, rhs) @@ -515,6 +499,16 @@ cdef cudssStatus_t _cudssCreate(cudssHandle_t* handle) except?_CUDSSSTATUS_T_INT handle) +cdef cudssStatus_t _cudssCreateMg(cudssHandle_t* handle_pt, int device_count, int* device_indices) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cudssCreateMg + _check_or_init_cudss() + if __cudssCreateMg == NULL: + with gil: + raise FunctionNotFoundError("function cudssCreateMg is not found") + return (__cudssCreateMg)( + handle_pt, device_count, device_indices) + + cdef cudssStatus_t _cudssDestroy(cudssHandle_t handle) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: global __cudssDestroy _check_or_init_cudss() @@ -675,6 +669,26 @@ cdef cudssStatus_t _cudssMatrixGetFormat(cudssMatrix_t matrix, int* format) exce matrix, format) +cdef cudssStatus_t _cudssMatrixSetDistributionRow1d(cudssMatrix_t matrix, int64_t first_row, int64_t last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cudssMatrixSetDistributionRow1d + _check_or_init_cudss() + if __cudssMatrixSetDistributionRow1d == NULL: + with gil: + raise FunctionNotFoundError("function cudssMatrixSetDistributionRow1d is not found") + return (__cudssMatrixSetDistributionRow1d)( + matrix, first_row, last_row) + + +cdef cudssStatus_t _cudssMatrixGetDistributionRow1d(cudssMatrix_t matrix, int64_t* first_row, int64_t* last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cudssMatrixGetDistributionRow1d + _check_or_init_cudss() + if __cudssMatrixGetDistributionRow1d == NULL: + with gil: + raise FunctionNotFoundError("function cudssMatrixGetDistributionRow1d is not found") + return (__cudssMatrixGetDistributionRow1d)( + matrix, first_row, last_row) + + cdef cudssStatus_t _cudssGetDeviceMemHandler(cudssHandle_t handle, cudssDeviceMemHandler_t* handler) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: global __cudssGetDeviceMemHandler _check_or_init_cudss() diff --git a/nvmath/bindings/_internal/cufft.pxd b/nvmath/bindings/_internal/cufft.pxd index 28f8cd4..0c1136a 100644 --- a/nvmath/bindings/_internal/cufft.pxd +++ b/nvmath/bindings/_internal/cufft.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.1. Do not modify it directly. 
from ..cycufft cimport * @@ -62,8 +62,9 @@ cdef cufftResult _cufftXtGetSizeMany(cufftHandle plan, int rank, long long int* cdef cufftResult _cufftXtExec(cufftHandle plan, void* input, void* output, int direction) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult _cufftXtExecDescriptor(cufftHandle plan, cudaLibXtDesc* input, cudaLibXtDesc* output, int direction) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult _cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t* workSize) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil -cdef cufftResult _cufftXtSetJITCallback(cufftHandle plan, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil +cdef cufftResult _cufftXtSetJITCallback(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult _cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult _cufftSetPlanPropertyInt64(cufftHandle plan, cufftProperty property, const long long int inputValueInt) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult _cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty property, long long int* returnPtrValue) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult _cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil +cdef cufftResult ___cufftXtSetJITCallback_12_7(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/_internal/cufftMp_linux.pyx b/nvmath/bindings/_internal/cufftMp_linux.pyx index d9829c4..902bd43 100644 --- a/nvmath/bindings/_internal/cufftMp_linux.pyx +++ b/nvmath/bindings/_internal/cufftMp_linux.pyx @@ -6,10 +6,13 @@ from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### @@ -25,15 +28,33 @@ cdef extern from "" nogil: RTLD_NOW RTLD_GLOBAL RTLD_LOCAL - RTLD_DEEPBIND const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = 
threading.Lock() cdef bint __py_cufftMp_init = False cdef void* __cufftPlan1d = NULL @@ -115,394 +136,395 @@ cdef int _check_or_init_cufftMp() except -1 nogil: if __py_cufftMp_init: return 0 - # Load function cdef void* handle = NULL - global __cufftPlan1d - if __cufftPlan1d == NULL: - if handle == NULL: - handle = load_library() - __cufftPlan1d = dlsym(handle, 'cufftPlan1d') - - global __cufftPlan2d - if __cufftPlan2d == NULL: - if handle == NULL: - handle = load_library() - __cufftPlan2d = dlsym(handle, 'cufftPlan2d') - - global __cufftPlan3d - if __cufftPlan3d == NULL: - if handle == NULL: - handle = load_library() - __cufftPlan3d = dlsym(handle, 'cufftPlan3d') - - global __cufftPlanMany - if __cufftPlanMany == NULL: - if handle == NULL: - handle = load_library() - __cufftPlanMany = dlsym(handle, 'cufftPlanMany') - - global __cufftMakePlan1d - if __cufftMakePlan1d == NULL: - if handle == NULL: - handle = load_library() - __cufftMakePlan1d = dlsym(handle, 'cufftMakePlan1d') - - global __cufftMakePlan2d - if __cufftMakePlan2d == NULL: - if handle == NULL: - handle = load_library() - __cufftMakePlan2d = dlsym(handle, 'cufftMakePlan2d') - - global __cufftMakePlan3d - if __cufftMakePlan3d == NULL: - if handle == NULL: - handle = load_library() - __cufftMakePlan3d = dlsym(handle, 'cufftMakePlan3d') - - global __cufftMakePlanMany - if __cufftMakePlanMany == NULL: - if handle == NULL: - handle = load_library() - __cufftMakePlanMany = dlsym(handle, 'cufftMakePlanMany') - - global __cufftMakePlanMany64 - if __cufftMakePlanMany64 == NULL: - if handle == NULL: - handle = load_library() - __cufftMakePlanMany64 = dlsym(handle, 'cufftMakePlanMany64') - - global __cufftGetSizeMany64 - if __cufftGetSizeMany64 == NULL: - if handle == NULL: - handle = load_library() - __cufftGetSizeMany64 = dlsym(handle, 'cufftGetSizeMany64') - - global __cufftEstimate1d - if __cufftEstimate1d == NULL: - if handle == NULL: - handle = load_library() - __cufftEstimate1d = dlsym(handle, 'cufftEstimate1d') - - global __cufftEstimate2d - if __cufftEstimate2d == NULL: - if handle == NULL: - handle = load_library() - __cufftEstimate2d = dlsym(handle, 'cufftEstimate2d') - - global __cufftEstimate3d - if __cufftEstimate3d == NULL: - if handle == NULL: - handle = load_library() - __cufftEstimate3d = dlsym(handle, 'cufftEstimate3d') - - global __cufftEstimateMany - if __cufftEstimateMany == NULL: - if handle == NULL: - handle = load_library() - __cufftEstimateMany = dlsym(handle, 'cufftEstimateMany') - - global __cufftCreate - if __cufftCreate == NULL: - if handle == NULL: - handle = load_library() - __cufftCreate = dlsym(handle, 'cufftCreate') - - global __cufftGetSize1d - if __cufftGetSize1d == NULL: - if handle == NULL: - handle = load_library() - __cufftGetSize1d = dlsym(handle, 'cufftGetSize1d') - - global __cufftGetSize2d - if __cufftGetSize2d == NULL: - if handle == NULL: - handle = load_library() - __cufftGetSize2d = dlsym(handle, 'cufftGetSize2d') - - global __cufftGetSize3d - if __cufftGetSize3d == NULL: - if handle == NULL: - handle = load_library() - __cufftGetSize3d = dlsym(handle, 'cufftGetSize3d') - - global __cufftGetSizeMany - if __cufftGetSizeMany == NULL: - if handle == NULL: - handle = load_library() - __cufftGetSizeMany = dlsym(handle, 'cufftGetSizeMany') - - global __cufftGetSize - if __cufftGetSize == NULL: - if handle == NULL: - handle = load_library() - __cufftGetSize = dlsym(handle, 'cufftGetSize') - - global __cufftSetWorkArea - if __cufftSetWorkArea == NULL: - if handle == NULL: - handle = load_library() 
- __cufftSetWorkArea = dlsym(handle, 'cufftSetWorkArea') - - global __cufftSetAutoAllocation - if __cufftSetAutoAllocation == NULL: - if handle == NULL: - handle = load_library() - __cufftSetAutoAllocation = dlsym(handle, 'cufftSetAutoAllocation') - - global __cufftExecC2C - if __cufftExecC2C == NULL: - if handle == NULL: - handle = load_library() - __cufftExecC2C = dlsym(handle, 'cufftExecC2C') - - global __cufftExecR2C - if __cufftExecR2C == NULL: - if handle == NULL: - handle = load_library() - __cufftExecR2C = dlsym(handle, 'cufftExecR2C') - - global __cufftExecC2R - if __cufftExecC2R == NULL: - if handle == NULL: - handle = load_library() - __cufftExecC2R = dlsym(handle, 'cufftExecC2R') - - global __cufftExecZ2Z - if __cufftExecZ2Z == NULL: - if handle == NULL: - handle = load_library() - __cufftExecZ2Z = dlsym(handle, 'cufftExecZ2Z') - - global __cufftExecD2Z - if __cufftExecD2Z == NULL: - if handle == NULL: - handle = load_library() - __cufftExecD2Z = dlsym(handle, 'cufftExecD2Z') - - global __cufftExecZ2D - if __cufftExecZ2D == NULL: - if handle == NULL: - handle = load_library() - __cufftExecZ2D = dlsym(handle, 'cufftExecZ2D') - - global __cufftSetStream - if __cufftSetStream == NULL: - if handle == NULL: - handle = load_library() - __cufftSetStream = dlsym(handle, 'cufftSetStream') - - global __cufftDestroy - if __cufftDestroy == NULL: - if handle == NULL: - handle = load_library() - __cufftDestroy = dlsym(handle, 'cufftDestroy') - - global __cufftGetVersion - if __cufftGetVersion == NULL: - if handle == NULL: - handle = load_library() - __cufftGetVersion = dlsym(handle, 'cufftGetVersion') - - global __cufftGetProperty - if __cufftGetProperty == NULL: - if handle == NULL: - handle = load_library() - __cufftGetProperty = dlsym(handle, 'cufftGetProperty') - - global __cufftSetPlanPropertyInt64 - if __cufftSetPlanPropertyInt64 == NULL: - if handle == NULL: - handle = load_library() - __cufftSetPlanPropertyInt64 = dlsym(handle, 'cufftSetPlanPropertyInt64') - - global __cufftGetPlanPropertyInt64 - if __cufftGetPlanPropertyInt64 == NULL: - if handle == NULL: - handle = load_library() - __cufftGetPlanPropertyInt64 = dlsym(handle, 'cufftGetPlanPropertyInt64') - - global __cufftResetPlanProperty - if __cufftResetPlanProperty == NULL: - if handle == NULL: - handle = load_library() - __cufftResetPlanProperty = dlsym(handle, 'cufftResetPlanProperty') - - global __cufftXtSetGPUs - if __cufftXtSetGPUs == NULL: - if handle == NULL: - handle = load_library() - __cufftXtSetGPUs = dlsym(handle, 'cufftXtSetGPUs') - - global __cufftXtMalloc - if __cufftXtMalloc == NULL: - if handle == NULL: - handle = load_library() - __cufftXtMalloc = dlsym(handle, 'cufftXtMalloc') - - global __cufftXtMemcpy - if __cufftXtMemcpy == NULL: - if handle == NULL: - handle = load_library() - __cufftXtMemcpy = dlsym(handle, 'cufftXtMemcpy') - - global __cufftXtFree - if __cufftXtFree == NULL: - if handle == NULL: - handle = load_library() - __cufftXtFree = dlsym(handle, 'cufftXtFree') - - global __cufftXtSetWorkArea - if __cufftXtSetWorkArea == NULL: - if handle == NULL: - handle = load_library() - __cufftXtSetWorkArea = dlsym(handle, 'cufftXtSetWorkArea') - - global __cufftXtExecDescriptorC2C - if __cufftXtExecDescriptorC2C == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExecDescriptorC2C = dlsym(handle, 'cufftXtExecDescriptorC2C') - - global __cufftXtExecDescriptorR2C - if __cufftXtExecDescriptorR2C == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExecDescriptorR2C = dlsym(handle, 
'cufftXtExecDescriptorR2C') - - global __cufftXtExecDescriptorC2R - if __cufftXtExecDescriptorC2R == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExecDescriptorC2R = dlsym(handle, 'cufftXtExecDescriptorC2R') - - global __cufftXtExecDescriptorZ2Z - if __cufftXtExecDescriptorZ2Z == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExecDescriptorZ2Z = dlsym(handle, 'cufftXtExecDescriptorZ2Z') - - global __cufftXtExecDescriptorD2Z - if __cufftXtExecDescriptorD2Z == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExecDescriptorD2Z = dlsym(handle, 'cufftXtExecDescriptorD2Z') - - global __cufftXtExecDescriptorZ2D - if __cufftXtExecDescriptorZ2D == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExecDescriptorZ2D = dlsym(handle, 'cufftXtExecDescriptorZ2D') - - global __cufftXtQueryPlan - if __cufftXtQueryPlan == NULL: - if handle == NULL: - handle = load_library() - __cufftXtQueryPlan = dlsym(handle, 'cufftXtQueryPlan') - - global __cufftXtClearCallback - if __cufftXtClearCallback == NULL: - if handle == NULL: - handle = load_library() - __cufftXtClearCallback = dlsym(handle, 'cufftXtClearCallback') - global __cufftXtSetCallbackSharedSize - if __cufftXtSetCallbackSharedSize == NULL: - if handle == NULL: - handle = load_library() - __cufftXtSetCallbackSharedSize = dlsym(handle, 'cufftXtSetCallbackSharedSize') - - global __cufftXtMakePlanMany - if __cufftXtMakePlanMany == NULL: - if handle == NULL: - handle = load_library() - __cufftXtMakePlanMany = dlsym(handle, 'cufftXtMakePlanMany') - - global __cufftXtGetSizeMany - if __cufftXtGetSizeMany == NULL: - if handle == NULL: - handle = load_library() - __cufftXtGetSizeMany = dlsym(handle, 'cufftXtGetSizeMany') - - global __cufftXtExec - if __cufftXtExec == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExec = dlsym(handle, 'cufftXtExec') - - global __cufftXtExecDescriptor - if __cufftXtExecDescriptor == NULL: - if handle == NULL: - handle = load_library() - __cufftXtExecDescriptor = dlsym(handle, 'cufftXtExecDescriptor') - - global __cufftXtSetWorkAreaPolicy - if __cufftXtSetWorkAreaPolicy == NULL: - if handle == NULL: - handle = load_library() - __cufftXtSetWorkAreaPolicy = dlsym(handle, 'cufftXtSetWorkAreaPolicy') - - global __cufftMpAttachComm - if __cufftMpAttachComm == NULL: - if handle == NULL: - handle = load_library() - __cufftMpAttachComm = dlsym(handle, 'cufftMpAttachComm') - - global __cufftXtSetDistribution - if __cufftXtSetDistribution == NULL: - if handle == NULL: - handle = load_library() - __cufftXtSetDistribution = dlsym(handle, 'cufftXtSetDistribution') - - global __cufftXtSetSubformatDefault - if __cufftXtSetSubformatDefault == NULL: - if handle == NULL: - handle = load_library() - __cufftXtSetSubformatDefault = dlsym(handle, 'cufftXtSetSubformatDefault') - - global __cufftMpCreateReshape - if __cufftMpCreateReshape == NULL: - if handle == NULL: - handle = load_library() - __cufftMpCreateReshape = dlsym(handle, 'cufftMpCreateReshape') - - global __cufftMpAttachReshapeComm - if __cufftMpAttachReshapeComm == NULL: - if handle == NULL: - handle = load_library() - __cufftMpAttachReshapeComm = dlsym(handle, 'cufftMpAttachReshapeComm') - - global __cufftMpGetReshapeSize - if __cufftMpGetReshapeSize == NULL: - if handle == NULL: - handle = load_library() - __cufftMpGetReshapeSize = dlsym(handle, 'cufftMpGetReshapeSize') - - global __cufftMpMakeReshape - if __cufftMpMakeReshape == NULL: - if handle == NULL: - handle = load_library() - __cufftMpMakeReshape = 
dlsym(handle, 'cufftMpMakeReshape') - - global __cufftMpExecReshapeAsync - if __cufftMpExecReshapeAsync == NULL: - if handle == NULL: - handle = load_library() - __cufftMpExecReshapeAsync = dlsym(handle, 'cufftMpExecReshapeAsync') - - global __cufftMpDestroyReshape - if __cufftMpDestroyReshape == NULL: - if handle == NULL: - handle = load_library() - __cufftMpDestroyReshape = dlsym(handle, 'cufftMpDestroyReshape') - - global ____cufftMpMakeReshape_11_4 - if ____cufftMpMakeReshape_11_4 == NULL: - if handle == NULL: - handle = load_library() - ____cufftMpMakeReshape_11_4 = dlsym(handle, '__cufftMpMakeReshape_11_4') - - __py_cufftMp_init = True - return 0 + with gil, __symbol_lock: + # Load function + global __cufftPlan1d + if __cufftPlan1d == NULL: + if handle == NULL: + handle = load_library() + __cufftPlan1d = dlsym(handle, 'cufftPlan1d') + + global __cufftPlan2d + if __cufftPlan2d == NULL: + if handle == NULL: + handle = load_library() + __cufftPlan2d = dlsym(handle, 'cufftPlan2d') + + global __cufftPlan3d + if __cufftPlan3d == NULL: + if handle == NULL: + handle = load_library() + __cufftPlan3d = dlsym(handle, 'cufftPlan3d') + + global __cufftPlanMany + if __cufftPlanMany == NULL: + if handle == NULL: + handle = load_library() + __cufftPlanMany = dlsym(handle, 'cufftPlanMany') + + global __cufftMakePlan1d + if __cufftMakePlan1d == NULL: + if handle == NULL: + handle = load_library() + __cufftMakePlan1d = dlsym(handle, 'cufftMakePlan1d') + + global __cufftMakePlan2d + if __cufftMakePlan2d == NULL: + if handle == NULL: + handle = load_library() + __cufftMakePlan2d = dlsym(handle, 'cufftMakePlan2d') + + global __cufftMakePlan3d + if __cufftMakePlan3d == NULL: + if handle == NULL: + handle = load_library() + __cufftMakePlan3d = dlsym(handle, 'cufftMakePlan3d') + + global __cufftMakePlanMany + if __cufftMakePlanMany == NULL: + if handle == NULL: + handle = load_library() + __cufftMakePlanMany = dlsym(handle, 'cufftMakePlanMany') + + global __cufftMakePlanMany64 + if __cufftMakePlanMany64 == NULL: + if handle == NULL: + handle = load_library() + __cufftMakePlanMany64 = dlsym(handle, 'cufftMakePlanMany64') + + global __cufftGetSizeMany64 + if __cufftGetSizeMany64 == NULL: + if handle == NULL: + handle = load_library() + __cufftGetSizeMany64 = dlsym(handle, 'cufftGetSizeMany64') + + global __cufftEstimate1d + if __cufftEstimate1d == NULL: + if handle == NULL: + handle = load_library() + __cufftEstimate1d = dlsym(handle, 'cufftEstimate1d') + + global __cufftEstimate2d + if __cufftEstimate2d == NULL: + if handle == NULL: + handle = load_library() + __cufftEstimate2d = dlsym(handle, 'cufftEstimate2d') + + global __cufftEstimate3d + if __cufftEstimate3d == NULL: + if handle == NULL: + handle = load_library() + __cufftEstimate3d = dlsym(handle, 'cufftEstimate3d') + + global __cufftEstimateMany + if __cufftEstimateMany == NULL: + if handle == NULL: + handle = load_library() + __cufftEstimateMany = dlsym(handle, 'cufftEstimateMany') + + global __cufftCreate + if __cufftCreate == NULL: + if handle == NULL: + handle = load_library() + __cufftCreate = dlsym(handle, 'cufftCreate') + + global __cufftGetSize1d + if __cufftGetSize1d == NULL: + if handle == NULL: + handle = load_library() + __cufftGetSize1d = dlsym(handle, 'cufftGetSize1d') + + global __cufftGetSize2d + if __cufftGetSize2d == NULL: + if handle == NULL: + handle = load_library() + __cufftGetSize2d = dlsym(handle, 'cufftGetSize2d') + + global __cufftGetSize3d + if __cufftGetSize3d == NULL: + if handle == NULL: + handle = load_library() + 
__cufftGetSize3d = dlsym(handle, 'cufftGetSize3d') + + global __cufftGetSizeMany + if __cufftGetSizeMany == NULL: + if handle == NULL: + handle = load_library() + __cufftGetSizeMany = dlsym(handle, 'cufftGetSizeMany') + + global __cufftGetSize + if __cufftGetSize == NULL: + if handle == NULL: + handle = load_library() + __cufftGetSize = dlsym(handle, 'cufftGetSize') + + global __cufftSetWorkArea + if __cufftSetWorkArea == NULL: + if handle == NULL: + handle = load_library() + __cufftSetWorkArea = dlsym(handle, 'cufftSetWorkArea') + + global __cufftSetAutoAllocation + if __cufftSetAutoAllocation == NULL: + if handle == NULL: + handle = load_library() + __cufftSetAutoAllocation = dlsym(handle, 'cufftSetAutoAllocation') + + global __cufftExecC2C + if __cufftExecC2C == NULL: + if handle == NULL: + handle = load_library() + __cufftExecC2C = dlsym(handle, 'cufftExecC2C') + + global __cufftExecR2C + if __cufftExecR2C == NULL: + if handle == NULL: + handle = load_library() + __cufftExecR2C = dlsym(handle, 'cufftExecR2C') + + global __cufftExecC2R + if __cufftExecC2R == NULL: + if handle == NULL: + handle = load_library() + __cufftExecC2R = dlsym(handle, 'cufftExecC2R') + + global __cufftExecZ2Z + if __cufftExecZ2Z == NULL: + if handle == NULL: + handle = load_library() + __cufftExecZ2Z = dlsym(handle, 'cufftExecZ2Z') + + global __cufftExecD2Z + if __cufftExecD2Z == NULL: + if handle == NULL: + handle = load_library() + __cufftExecD2Z = dlsym(handle, 'cufftExecD2Z') + + global __cufftExecZ2D + if __cufftExecZ2D == NULL: + if handle == NULL: + handle = load_library() + __cufftExecZ2D = dlsym(handle, 'cufftExecZ2D') + + global __cufftSetStream + if __cufftSetStream == NULL: + if handle == NULL: + handle = load_library() + __cufftSetStream = dlsym(handle, 'cufftSetStream') + + global __cufftDestroy + if __cufftDestroy == NULL: + if handle == NULL: + handle = load_library() + __cufftDestroy = dlsym(handle, 'cufftDestroy') + + global __cufftGetVersion + if __cufftGetVersion == NULL: + if handle == NULL: + handle = load_library() + __cufftGetVersion = dlsym(handle, 'cufftGetVersion') + + global __cufftGetProperty + if __cufftGetProperty == NULL: + if handle == NULL: + handle = load_library() + __cufftGetProperty = dlsym(handle, 'cufftGetProperty') + + global __cufftSetPlanPropertyInt64 + if __cufftSetPlanPropertyInt64 == NULL: + if handle == NULL: + handle = load_library() + __cufftSetPlanPropertyInt64 = dlsym(handle, 'cufftSetPlanPropertyInt64') + + global __cufftGetPlanPropertyInt64 + if __cufftGetPlanPropertyInt64 == NULL: + if handle == NULL: + handle = load_library() + __cufftGetPlanPropertyInt64 = dlsym(handle, 'cufftGetPlanPropertyInt64') + + global __cufftResetPlanProperty + if __cufftResetPlanProperty == NULL: + if handle == NULL: + handle = load_library() + __cufftResetPlanProperty = dlsym(handle, 'cufftResetPlanProperty') + + global __cufftXtSetGPUs + if __cufftXtSetGPUs == NULL: + if handle == NULL: + handle = load_library() + __cufftXtSetGPUs = dlsym(handle, 'cufftXtSetGPUs') + + global __cufftXtMalloc + if __cufftXtMalloc == NULL: + if handle == NULL: + handle = load_library() + __cufftXtMalloc = dlsym(handle, 'cufftXtMalloc') + + global __cufftXtMemcpy + if __cufftXtMemcpy == NULL: + if handle == NULL: + handle = load_library() + __cufftXtMemcpy = dlsym(handle, 'cufftXtMemcpy') + + global __cufftXtFree + if __cufftXtFree == NULL: + if handle == NULL: + handle = load_library() + __cufftXtFree = dlsym(handle, 'cufftXtFree') + + global __cufftXtSetWorkArea + if __cufftXtSetWorkArea == NULL: + 
if handle == NULL: + handle = load_library() + __cufftXtSetWorkArea = dlsym(handle, 'cufftXtSetWorkArea') + + global __cufftXtExecDescriptorC2C + if __cufftXtExecDescriptorC2C == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExecDescriptorC2C = dlsym(handle, 'cufftXtExecDescriptorC2C') + + global __cufftXtExecDescriptorR2C + if __cufftXtExecDescriptorR2C == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExecDescriptorR2C = dlsym(handle, 'cufftXtExecDescriptorR2C') + + global __cufftXtExecDescriptorC2R + if __cufftXtExecDescriptorC2R == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExecDescriptorC2R = dlsym(handle, 'cufftXtExecDescriptorC2R') + + global __cufftXtExecDescriptorZ2Z + if __cufftXtExecDescriptorZ2Z == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExecDescriptorZ2Z = dlsym(handle, 'cufftXtExecDescriptorZ2Z') + + global __cufftXtExecDescriptorD2Z + if __cufftXtExecDescriptorD2Z == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExecDescriptorD2Z = dlsym(handle, 'cufftXtExecDescriptorD2Z') + + global __cufftXtExecDescriptorZ2D + if __cufftXtExecDescriptorZ2D == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExecDescriptorZ2D = dlsym(handle, 'cufftXtExecDescriptorZ2D') + + global __cufftXtQueryPlan + if __cufftXtQueryPlan == NULL: + if handle == NULL: + handle = load_library() + __cufftXtQueryPlan = dlsym(handle, 'cufftXtQueryPlan') + + global __cufftXtClearCallback + if __cufftXtClearCallback == NULL: + if handle == NULL: + handle = load_library() + __cufftXtClearCallback = dlsym(handle, 'cufftXtClearCallback') + + global __cufftXtSetCallbackSharedSize + if __cufftXtSetCallbackSharedSize == NULL: + if handle == NULL: + handle = load_library() + __cufftXtSetCallbackSharedSize = dlsym(handle, 'cufftXtSetCallbackSharedSize') + + global __cufftXtMakePlanMany + if __cufftXtMakePlanMany == NULL: + if handle == NULL: + handle = load_library() + __cufftXtMakePlanMany = dlsym(handle, 'cufftXtMakePlanMany') + + global __cufftXtGetSizeMany + if __cufftXtGetSizeMany == NULL: + if handle == NULL: + handle = load_library() + __cufftXtGetSizeMany = dlsym(handle, 'cufftXtGetSizeMany') + + global __cufftXtExec + if __cufftXtExec == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExec = dlsym(handle, 'cufftXtExec') + + global __cufftXtExecDescriptor + if __cufftXtExecDescriptor == NULL: + if handle == NULL: + handle = load_library() + __cufftXtExecDescriptor = dlsym(handle, 'cufftXtExecDescriptor') + + global __cufftXtSetWorkAreaPolicy + if __cufftXtSetWorkAreaPolicy == NULL: + if handle == NULL: + handle = load_library() + __cufftXtSetWorkAreaPolicy = dlsym(handle, 'cufftXtSetWorkAreaPolicy') + + global __cufftMpAttachComm + if __cufftMpAttachComm == NULL: + if handle == NULL: + handle = load_library() + __cufftMpAttachComm = dlsym(handle, 'cufftMpAttachComm') + + global __cufftXtSetDistribution + if __cufftXtSetDistribution == NULL: + if handle == NULL: + handle = load_library() + __cufftXtSetDistribution = dlsym(handle, 'cufftXtSetDistribution') + + global __cufftXtSetSubformatDefault + if __cufftXtSetSubformatDefault == NULL: + if handle == NULL: + handle = load_library() + __cufftXtSetSubformatDefault = dlsym(handle, 'cufftXtSetSubformatDefault') + + global __cufftMpCreateReshape + if __cufftMpCreateReshape == NULL: + if handle == NULL: + handle = load_library() + __cufftMpCreateReshape = dlsym(handle, 'cufftMpCreateReshape') + + global __cufftMpAttachReshapeComm + if 
__cufftMpAttachReshapeComm == NULL: + if handle == NULL: + handle = load_library() + __cufftMpAttachReshapeComm = dlsym(handle, 'cufftMpAttachReshapeComm') + + global __cufftMpGetReshapeSize + if __cufftMpGetReshapeSize == NULL: + if handle == NULL: + handle = load_library() + __cufftMpGetReshapeSize = dlsym(handle, 'cufftMpGetReshapeSize') + + global __cufftMpMakeReshape + if __cufftMpMakeReshape == NULL: + if handle == NULL: + handle = load_library() + __cufftMpMakeReshape = dlsym(handle, 'cufftMpMakeReshape') + + global __cufftMpExecReshapeAsync + if __cufftMpExecReshapeAsync == NULL: + if handle == NULL: + handle = load_library() + __cufftMpExecReshapeAsync = dlsym(handle, 'cufftMpExecReshapeAsync') + + global __cufftMpDestroyReshape + if __cufftMpDestroyReshape == NULL: + if handle == NULL: + handle = load_library() + __cufftMpDestroyReshape = dlsym(handle, 'cufftMpDestroyReshape') + + global ____cufftMpMakeReshape_11_4 + if ____cufftMpMakeReshape_11_4 == NULL: + if handle == NULL: + handle = load_library() + ____cufftMpMakeReshape_11_4 = dlsym(handle, '__cufftMpMakeReshape_11_4') + __py_cufftMp_init = True + return 0 cdef dict func_ptrs = None diff --git a/nvmath/bindings/_internal/cufft_linux.pyx b/nvmath/bindings/_internal/cufft_linux.pyx index 475ce71..1a37b3e 100644 --- a/nvmath/bindings/_internal/cufft_linux.pyx +++ b/nvmath/bindings/_internal/cufft_linux.pyx @@ -2,14 +2,17 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.1. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### @@ -28,13 +31,31 @@ cdef extern from "" nogil: const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_cufft_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __cufftPlan1d = NULL cdef void* __cufftPlan2d = NULL @@ -92,6 +113,7 @@ cdef void* __cufftXtSetSubformatDefault = NULL cdef void* __cufftSetPlanPropertyInt64 = NULL cdef void* __cufftGetPlanPropertyInt64 = NULL cdef void* __cufftResetPlanProperty = NULL +cdef void* ____cufftXtSetJITCallback_12_7 = NULL cdef void* load_library(const int driver_ver) except* with gil: @@ -104,422 +126,413 @@ cdef int _check_or_init_cufft() except -1 nogil: if __py_cufft_init: return 0 - # Load driver to check version cdef void* handle = 
NULL - handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) - if handle == NULL: - with gil: - err_msg = dlerror() - raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") - if __cuDriverGetVersion == NULL: - with gil: - raise RuntimeError('something went wrong') - cdef int err, driver_ver - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - with gil: - raise RuntimeError('something went wrong') - #dlclose(handle) - handle = NULL - - # Load function - global __cufftPlan1d - __cufftPlan1d = dlsym(RTLD_DEFAULT, 'cufftPlan1d') - if __cufftPlan1d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftPlan1d = dlsym(handle, 'cufftPlan1d') - - global __cufftPlan2d - __cufftPlan2d = dlsym(RTLD_DEFAULT, 'cufftPlan2d') - if __cufftPlan2d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftPlan2d = dlsym(handle, 'cufftPlan2d') - - global __cufftPlan3d - __cufftPlan3d = dlsym(RTLD_DEFAULT, 'cufftPlan3d') - if __cufftPlan3d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftPlan3d = dlsym(handle, 'cufftPlan3d') - - global __cufftPlanMany - __cufftPlanMany = dlsym(RTLD_DEFAULT, 'cufftPlanMany') - if __cufftPlanMany == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftPlanMany = dlsym(handle, 'cufftPlanMany') - - global __cufftMakePlan1d - __cufftMakePlan1d = dlsym(RTLD_DEFAULT, 'cufftMakePlan1d') - if __cufftMakePlan1d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftMakePlan1d = dlsym(handle, 'cufftMakePlan1d') - - global __cufftMakePlan2d - __cufftMakePlan2d = dlsym(RTLD_DEFAULT, 'cufftMakePlan2d') - if __cufftMakePlan2d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftMakePlan2d = dlsym(handle, 'cufftMakePlan2d') - - global __cufftMakePlan3d - __cufftMakePlan3d = dlsym(RTLD_DEFAULT, 'cufftMakePlan3d') - if __cufftMakePlan3d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftMakePlan3d = dlsym(handle, 'cufftMakePlan3d') - - global __cufftMakePlanMany - __cufftMakePlanMany = dlsym(RTLD_DEFAULT, 'cufftMakePlanMany') - if __cufftMakePlanMany == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftMakePlanMany = dlsym(handle, 'cufftMakePlanMany') - - global __cufftMakePlanMany64 - __cufftMakePlanMany64 = dlsym(RTLD_DEFAULT, 'cufftMakePlanMany64') - if __cufftMakePlanMany64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftMakePlanMany64 = dlsym(handle, 'cufftMakePlanMany64') - - global __cufftGetSizeMany64 - __cufftGetSizeMany64 = dlsym(RTLD_DEFAULT, 'cufftGetSizeMany64') - if __cufftGetSizeMany64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetSizeMany64 = dlsym(handle, 'cufftGetSizeMany64') - - global __cufftEstimate1d - __cufftEstimate1d = dlsym(RTLD_DEFAULT, 'cufftEstimate1d') - if __cufftEstimate1d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftEstimate1d = dlsym(handle, 'cufftEstimate1d') - - global __cufftEstimate2d - __cufftEstimate2d = dlsym(RTLD_DEFAULT, 'cufftEstimate2d') - if __cufftEstimate2d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftEstimate2d = dlsym(handle, 'cufftEstimate2d') - - global __cufftEstimate3d - __cufftEstimate3d = dlsym(RTLD_DEFAULT, 'cufftEstimate3d') - if __cufftEstimate3d == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cufftEstimate3d = dlsym(handle, 'cufftEstimate3d') - - global __cufftEstimateMany - __cufftEstimateMany = dlsym(RTLD_DEFAULT, 'cufftEstimateMany') - if __cufftEstimateMany == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftEstimateMany = dlsym(handle, 'cufftEstimateMany') - - global __cufftCreate - __cufftCreate = dlsym(RTLD_DEFAULT, 'cufftCreate') - if __cufftCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftCreate = dlsym(handle, 'cufftCreate') - - global __cufftGetSize1d - __cufftGetSize1d = dlsym(RTLD_DEFAULT, 'cufftGetSize1d') - if __cufftGetSize1d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetSize1d = dlsym(handle, 'cufftGetSize1d') - - global __cufftGetSize2d - __cufftGetSize2d = dlsym(RTLD_DEFAULT, 'cufftGetSize2d') - if __cufftGetSize2d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetSize2d = dlsym(handle, 'cufftGetSize2d') - - global __cufftGetSize3d - __cufftGetSize3d = dlsym(RTLD_DEFAULT, 'cufftGetSize3d') - if __cufftGetSize3d == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetSize3d = dlsym(handle, 'cufftGetSize3d') - - global __cufftGetSizeMany - __cufftGetSizeMany = dlsym(RTLD_DEFAULT, 'cufftGetSizeMany') - if __cufftGetSizeMany == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetSizeMany = dlsym(handle, 'cufftGetSizeMany') - - global __cufftGetSize - __cufftGetSize = dlsym(RTLD_DEFAULT, 'cufftGetSize') - if __cufftGetSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetSize = dlsym(handle, 'cufftGetSize') - - global __cufftSetWorkArea - __cufftSetWorkArea = dlsym(RTLD_DEFAULT, 'cufftSetWorkArea') - if __cufftSetWorkArea == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftSetWorkArea = dlsym(handle, 'cufftSetWorkArea') - - global __cufftSetAutoAllocation - __cufftSetAutoAllocation = dlsym(RTLD_DEFAULT, 'cufftSetAutoAllocation') - if __cufftSetAutoAllocation == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftSetAutoAllocation = dlsym(handle, 'cufftSetAutoAllocation') - - global __cufftExecC2C - __cufftExecC2C = dlsym(RTLD_DEFAULT, 'cufftExecC2C') - if __cufftExecC2C == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftExecC2C = dlsym(handle, 'cufftExecC2C') - - global __cufftExecR2C - __cufftExecR2C = dlsym(RTLD_DEFAULT, 'cufftExecR2C') - if __cufftExecR2C == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftExecR2C = dlsym(handle, 'cufftExecR2C') - - global __cufftExecC2R - __cufftExecC2R = dlsym(RTLD_DEFAULT, 'cufftExecC2R') - if __cufftExecC2R == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftExecC2R = dlsym(handle, 'cufftExecC2R') - - global __cufftExecZ2Z - __cufftExecZ2Z = dlsym(RTLD_DEFAULT, 'cufftExecZ2Z') - if __cufftExecZ2Z == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftExecZ2Z = dlsym(handle, 'cufftExecZ2Z') - global __cufftExecD2Z - __cufftExecD2Z = dlsym(RTLD_DEFAULT, 'cufftExecD2Z') - if __cufftExecD2Z == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftExecD2Z = dlsym(handle, 'cufftExecD2Z') - - global __cufftExecZ2D - __cufftExecZ2D = dlsym(RTLD_DEFAULT, 'cufftExecZ2D') - if __cufftExecZ2D == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftExecZ2D = dlsym(handle, 'cufftExecZ2D') - - global __cufftSetStream - __cufftSetStream = dlsym(RTLD_DEFAULT, 
'cufftSetStream') - if __cufftSetStream == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftSetStream = dlsym(handle, 'cufftSetStream') - - global __cufftDestroy - __cufftDestroy = dlsym(RTLD_DEFAULT, 'cufftDestroy') - if __cufftDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftDestroy = dlsym(handle, 'cufftDestroy') - - global __cufftGetVersion - __cufftGetVersion = dlsym(RTLD_DEFAULT, 'cufftGetVersion') - if __cufftGetVersion == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetVersion = dlsym(handle, 'cufftGetVersion') - - global __cufftGetProperty - __cufftGetProperty = dlsym(RTLD_DEFAULT, 'cufftGetProperty') - if __cufftGetProperty == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetProperty = dlsym(handle, 'cufftGetProperty') - - global __cufftXtSetGPUs - __cufftXtSetGPUs = dlsym(RTLD_DEFAULT, 'cufftXtSetGPUs') - if __cufftXtSetGPUs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtSetGPUs = dlsym(handle, 'cufftXtSetGPUs') - - global __cufftXtMalloc - __cufftXtMalloc = dlsym(RTLD_DEFAULT, 'cufftXtMalloc') - if __cufftXtMalloc == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtMalloc = dlsym(handle, 'cufftXtMalloc') - - global __cufftXtMemcpy - __cufftXtMemcpy = dlsym(RTLD_DEFAULT, 'cufftXtMemcpy') - if __cufftXtMemcpy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtMemcpy = dlsym(handle, 'cufftXtMemcpy') - - global __cufftXtFree - __cufftXtFree = dlsym(RTLD_DEFAULT, 'cufftXtFree') - if __cufftXtFree == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtFree = dlsym(handle, 'cufftXtFree') - - global __cufftXtSetWorkArea - __cufftXtSetWorkArea = dlsym(RTLD_DEFAULT, 'cufftXtSetWorkArea') - if __cufftXtSetWorkArea == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtSetWorkArea = dlsym(handle, 'cufftXtSetWorkArea') - - global __cufftXtExecDescriptorC2C - __cufftXtExecDescriptorC2C = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorC2C') - if __cufftXtExecDescriptorC2C == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExecDescriptorC2C = dlsym(handle, 'cufftXtExecDescriptorC2C') - - global __cufftXtExecDescriptorR2C - __cufftXtExecDescriptorR2C = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorR2C') - if __cufftXtExecDescriptorR2C == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExecDescriptorR2C = dlsym(handle, 'cufftXtExecDescriptorR2C') - - global __cufftXtExecDescriptorC2R - __cufftXtExecDescriptorC2R = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorC2R') - if __cufftXtExecDescriptorC2R == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExecDescriptorC2R = dlsym(handle, 'cufftXtExecDescriptorC2R') - - global __cufftXtExecDescriptorZ2Z - __cufftXtExecDescriptorZ2Z = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorZ2Z') - if __cufftXtExecDescriptorZ2Z == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExecDescriptorZ2Z = dlsym(handle, 'cufftXtExecDescriptorZ2Z') - - global __cufftXtExecDescriptorD2Z - __cufftXtExecDescriptorD2Z = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorD2Z') - if __cufftXtExecDescriptorD2Z == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExecDescriptorD2Z = dlsym(handle, 'cufftXtExecDescriptorD2Z') - - global __cufftXtExecDescriptorZ2D - __cufftXtExecDescriptorZ2D = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorZ2D') - if 
__cufftXtExecDescriptorZ2D == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExecDescriptorZ2D = dlsym(handle, 'cufftXtExecDescriptorZ2D') - - global __cufftXtQueryPlan - __cufftXtQueryPlan = dlsym(RTLD_DEFAULT, 'cufftXtQueryPlan') - if __cufftXtQueryPlan == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtQueryPlan = dlsym(handle, 'cufftXtQueryPlan') - - global __cufftXtClearCallback - __cufftXtClearCallback = dlsym(RTLD_DEFAULT, 'cufftXtClearCallback') - if __cufftXtClearCallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtClearCallback = dlsym(handle, 'cufftXtClearCallback') - - global __cufftXtSetCallbackSharedSize - __cufftXtSetCallbackSharedSize = dlsym(RTLD_DEFAULT, 'cufftXtSetCallbackSharedSize') - if __cufftXtSetCallbackSharedSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtSetCallbackSharedSize = dlsym(handle, 'cufftXtSetCallbackSharedSize') - - global __cufftXtMakePlanMany - __cufftXtMakePlanMany = dlsym(RTLD_DEFAULT, 'cufftXtMakePlanMany') - if __cufftXtMakePlanMany == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtMakePlanMany = dlsym(handle, 'cufftXtMakePlanMany') - - global __cufftXtGetSizeMany - __cufftXtGetSizeMany = dlsym(RTLD_DEFAULT, 'cufftXtGetSizeMany') - if __cufftXtGetSizeMany == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtGetSizeMany = dlsym(handle, 'cufftXtGetSizeMany') - - global __cufftXtExec - __cufftXtExec = dlsym(RTLD_DEFAULT, 'cufftXtExec') - if __cufftXtExec == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExec = dlsym(handle, 'cufftXtExec') - - global __cufftXtExecDescriptor - __cufftXtExecDescriptor = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptor') - if __cufftXtExecDescriptor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtExecDescriptor = dlsym(handle, 'cufftXtExecDescriptor') - - global __cufftXtSetWorkAreaPolicy - __cufftXtSetWorkAreaPolicy = dlsym(RTLD_DEFAULT, 'cufftXtSetWorkAreaPolicy') - if __cufftXtSetWorkAreaPolicy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtSetWorkAreaPolicy = dlsym(handle, 'cufftXtSetWorkAreaPolicy') - - global __cufftXtSetJITCallback - __cufftXtSetJITCallback = dlsym(RTLD_DEFAULT, 'cufftXtSetJITCallback') - if __cufftXtSetJITCallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtSetJITCallback = dlsym(handle, 'cufftXtSetJITCallback') - - global __cufftXtSetSubformatDefault - __cufftXtSetSubformatDefault = dlsym(RTLD_DEFAULT, 'cufftXtSetSubformatDefault') - if __cufftXtSetSubformatDefault == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftXtSetSubformatDefault = dlsym(handle, 'cufftXtSetSubformatDefault') - - global __cufftSetPlanPropertyInt64 - __cufftSetPlanPropertyInt64 = dlsym(RTLD_DEFAULT, 'cufftSetPlanPropertyInt64') - if __cufftSetPlanPropertyInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftSetPlanPropertyInt64 = dlsym(handle, 'cufftSetPlanPropertyInt64') - - global __cufftGetPlanPropertyInt64 - __cufftGetPlanPropertyInt64 = dlsym(RTLD_DEFAULT, 'cufftGetPlanPropertyInt64') - if __cufftGetPlanPropertyInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftGetPlanPropertyInt64 = dlsym(handle, 'cufftGetPlanPropertyInt64') - - global __cufftResetPlanProperty - __cufftResetPlanProperty = dlsym(RTLD_DEFAULT, 'cufftResetPlanProperty') - if 
__cufftResetPlanProperty == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftResetPlanProperty = dlsym(handle, 'cufftResetPlanProperty') - - __py_cufft_init = True - return 0 + with gil, __symbol_lock: + driver_ver = get_cuda_version() + + # Load function + global __cufftPlan1d + __cufftPlan1d = dlsym(RTLD_DEFAULT, 'cufftPlan1d') + if __cufftPlan1d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftPlan1d = dlsym(handle, 'cufftPlan1d') + + global __cufftPlan2d + __cufftPlan2d = dlsym(RTLD_DEFAULT, 'cufftPlan2d') + if __cufftPlan2d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftPlan2d = dlsym(handle, 'cufftPlan2d') + + global __cufftPlan3d + __cufftPlan3d = dlsym(RTLD_DEFAULT, 'cufftPlan3d') + if __cufftPlan3d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftPlan3d = dlsym(handle, 'cufftPlan3d') + + global __cufftPlanMany + __cufftPlanMany = dlsym(RTLD_DEFAULT, 'cufftPlanMany') + if __cufftPlanMany == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftPlanMany = dlsym(handle, 'cufftPlanMany') + + global __cufftMakePlan1d + __cufftMakePlan1d = dlsym(RTLD_DEFAULT, 'cufftMakePlan1d') + if __cufftMakePlan1d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftMakePlan1d = dlsym(handle, 'cufftMakePlan1d') + + global __cufftMakePlan2d + __cufftMakePlan2d = dlsym(RTLD_DEFAULT, 'cufftMakePlan2d') + if __cufftMakePlan2d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftMakePlan2d = dlsym(handle, 'cufftMakePlan2d') + + global __cufftMakePlan3d + __cufftMakePlan3d = dlsym(RTLD_DEFAULT, 'cufftMakePlan3d') + if __cufftMakePlan3d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftMakePlan3d = dlsym(handle, 'cufftMakePlan3d') + + global __cufftMakePlanMany + __cufftMakePlanMany = dlsym(RTLD_DEFAULT, 'cufftMakePlanMany') + if __cufftMakePlanMany == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftMakePlanMany = dlsym(handle, 'cufftMakePlanMany') + + global __cufftMakePlanMany64 + __cufftMakePlanMany64 = dlsym(RTLD_DEFAULT, 'cufftMakePlanMany64') + if __cufftMakePlanMany64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftMakePlanMany64 = dlsym(handle, 'cufftMakePlanMany64') + + global __cufftGetSizeMany64 + __cufftGetSizeMany64 = dlsym(RTLD_DEFAULT, 'cufftGetSizeMany64') + if __cufftGetSizeMany64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetSizeMany64 = dlsym(handle, 'cufftGetSizeMany64') + + global __cufftEstimate1d + __cufftEstimate1d = dlsym(RTLD_DEFAULT, 'cufftEstimate1d') + if __cufftEstimate1d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftEstimate1d = dlsym(handle, 'cufftEstimate1d') + + global __cufftEstimate2d + __cufftEstimate2d = dlsym(RTLD_DEFAULT, 'cufftEstimate2d') + if __cufftEstimate2d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftEstimate2d = dlsym(handle, 'cufftEstimate2d') + + global __cufftEstimate3d + __cufftEstimate3d = dlsym(RTLD_DEFAULT, 'cufftEstimate3d') + if __cufftEstimate3d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftEstimate3d = dlsym(handle, 'cufftEstimate3d') + + global __cufftEstimateMany + __cufftEstimateMany = dlsym(RTLD_DEFAULT, 'cufftEstimateMany') + if __cufftEstimateMany == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftEstimateMany = dlsym(handle, 'cufftEstimateMany') + + global 
__cufftCreate + __cufftCreate = dlsym(RTLD_DEFAULT, 'cufftCreate') + if __cufftCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftCreate = dlsym(handle, 'cufftCreate') + + global __cufftGetSize1d + __cufftGetSize1d = dlsym(RTLD_DEFAULT, 'cufftGetSize1d') + if __cufftGetSize1d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetSize1d = dlsym(handle, 'cufftGetSize1d') + + global __cufftGetSize2d + __cufftGetSize2d = dlsym(RTLD_DEFAULT, 'cufftGetSize2d') + if __cufftGetSize2d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetSize2d = dlsym(handle, 'cufftGetSize2d') + + global __cufftGetSize3d + __cufftGetSize3d = dlsym(RTLD_DEFAULT, 'cufftGetSize3d') + if __cufftGetSize3d == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetSize3d = dlsym(handle, 'cufftGetSize3d') + + global __cufftGetSizeMany + __cufftGetSizeMany = dlsym(RTLD_DEFAULT, 'cufftGetSizeMany') + if __cufftGetSizeMany == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetSizeMany = dlsym(handle, 'cufftGetSizeMany') + + global __cufftGetSize + __cufftGetSize = dlsym(RTLD_DEFAULT, 'cufftGetSize') + if __cufftGetSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetSize = dlsym(handle, 'cufftGetSize') + + global __cufftSetWorkArea + __cufftSetWorkArea = dlsym(RTLD_DEFAULT, 'cufftSetWorkArea') + if __cufftSetWorkArea == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftSetWorkArea = dlsym(handle, 'cufftSetWorkArea') + + global __cufftSetAutoAllocation + __cufftSetAutoAllocation = dlsym(RTLD_DEFAULT, 'cufftSetAutoAllocation') + if __cufftSetAutoAllocation == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftSetAutoAllocation = dlsym(handle, 'cufftSetAutoAllocation') + + global __cufftExecC2C + __cufftExecC2C = dlsym(RTLD_DEFAULT, 'cufftExecC2C') + if __cufftExecC2C == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftExecC2C = dlsym(handle, 'cufftExecC2C') + + global __cufftExecR2C + __cufftExecR2C = dlsym(RTLD_DEFAULT, 'cufftExecR2C') + if __cufftExecR2C == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftExecR2C = dlsym(handle, 'cufftExecR2C') + + global __cufftExecC2R + __cufftExecC2R = dlsym(RTLD_DEFAULT, 'cufftExecC2R') + if __cufftExecC2R == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftExecC2R = dlsym(handle, 'cufftExecC2R') + + global __cufftExecZ2Z + __cufftExecZ2Z = dlsym(RTLD_DEFAULT, 'cufftExecZ2Z') + if __cufftExecZ2Z == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftExecZ2Z = dlsym(handle, 'cufftExecZ2Z') + + global __cufftExecD2Z + __cufftExecD2Z = dlsym(RTLD_DEFAULT, 'cufftExecD2Z') + if __cufftExecD2Z == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftExecD2Z = dlsym(handle, 'cufftExecD2Z') + + global __cufftExecZ2D + __cufftExecZ2D = dlsym(RTLD_DEFAULT, 'cufftExecZ2D') + if __cufftExecZ2D == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftExecZ2D = dlsym(handle, 'cufftExecZ2D') + + global __cufftSetStream + __cufftSetStream = dlsym(RTLD_DEFAULT, 'cufftSetStream') + if __cufftSetStream == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftSetStream = dlsym(handle, 'cufftSetStream') + + global __cufftDestroy + __cufftDestroy = dlsym(RTLD_DEFAULT, 'cufftDestroy') + if __cufftDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__cufftDestroy = dlsym(handle, 'cufftDestroy') + + global __cufftGetVersion + __cufftGetVersion = dlsym(RTLD_DEFAULT, 'cufftGetVersion') + if __cufftGetVersion == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetVersion = dlsym(handle, 'cufftGetVersion') + + global __cufftGetProperty + __cufftGetProperty = dlsym(RTLD_DEFAULT, 'cufftGetProperty') + if __cufftGetProperty == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetProperty = dlsym(handle, 'cufftGetProperty') + + global __cufftXtSetGPUs + __cufftXtSetGPUs = dlsym(RTLD_DEFAULT, 'cufftXtSetGPUs') + if __cufftXtSetGPUs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtSetGPUs = dlsym(handle, 'cufftXtSetGPUs') + + global __cufftXtMalloc + __cufftXtMalloc = dlsym(RTLD_DEFAULT, 'cufftXtMalloc') + if __cufftXtMalloc == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtMalloc = dlsym(handle, 'cufftXtMalloc') + + global __cufftXtMemcpy + __cufftXtMemcpy = dlsym(RTLD_DEFAULT, 'cufftXtMemcpy') + if __cufftXtMemcpy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtMemcpy = dlsym(handle, 'cufftXtMemcpy') + + global __cufftXtFree + __cufftXtFree = dlsym(RTLD_DEFAULT, 'cufftXtFree') + if __cufftXtFree == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtFree = dlsym(handle, 'cufftXtFree') + + global __cufftXtSetWorkArea + __cufftXtSetWorkArea = dlsym(RTLD_DEFAULT, 'cufftXtSetWorkArea') + if __cufftXtSetWorkArea == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtSetWorkArea = dlsym(handle, 'cufftXtSetWorkArea') + + global __cufftXtExecDescriptorC2C + __cufftXtExecDescriptorC2C = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorC2C') + if __cufftXtExecDescriptorC2C == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExecDescriptorC2C = dlsym(handle, 'cufftXtExecDescriptorC2C') + + global __cufftXtExecDescriptorR2C + __cufftXtExecDescriptorR2C = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorR2C') + if __cufftXtExecDescriptorR2C == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExecDescriptorR2C = dlsym(handle, 'cufftXtExecDescriptorR2C') + + global __cufftXtExecDescriptorC2R + __cufftXtExecDescriptorC2R = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorC2R') + if __cufftXtExecDescriptorC2R == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExecDescriptorC2R = dlsym(handle, 'cufftXtExecDescriptorC2R') + + global __cufftXtExecDescriptorZ2Z + __cufftXtExecDescriptorZ2Z = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorZ2Z') + if __cufftXtExecDescriptorZ2Z == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExecDescriptorZ2Z = dlsym(handle, 'cufftXtExecDescriptorZ2Z') + + global __cufftXtExecDescriptorD2Z + __cufftXtExecDescriptorD2Z = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorD2Z') + if __cufftXtExecDescriptorD2Z == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExecDescriptorD2Z = dlsym(handle, 'cufftXtExecDescriptorD2Z') + + global __cufftXtExecDescriptorZ2D + __cufftXtExecDescriptorZ2D = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptorZ2D') + if __cufftXtExecDescriptorZ2D == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExecDescriptorZ2D = dlsym(handle, 'cufftXtExecDescriptorZ2D') + + global __cufftXtQueryPlan + __cufftXtQueryPlan = dlsym(RTLD_DEFAULT, 'cufftXtQueryPlan') + if __cufftXtQueryPlan == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cufftXtQueryPlan = dlsym(handle, 'cufftXtQueryPlan') + + global __cufftXtClearCallback + __cufftXtClearCallback = dlsym(RTLD_DEFAULT, 'cufftXtClearCallback') + if __cufftXtClearCallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtClearCallback = dlsym(handle, 'cufftXtClearCallback') + + global __cufftXtSetCallbackSharedSize + __cufftXtSetCallbackSharedSize = dlsym(RTLD_DEFAULT, 'cufftXtSetCallbackSharedSize') + if __cufftXtSetCallbackSharedSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtSetCallbackSharedSize = dlsym(handle, 'cufftXtSetCallbackSharedSize') + + global __cufftXtMakePlanMany + __cufftXtMakePlanMany = dlsym(RTLD_DEFAULT, 'cufftXtMakePlanMany') + if __cufftXtMakePlanMany == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtMakePlanMany = dlsym(handle, 'cufftXtMakePlanMany') + + global __cufftXtGetSizeMany + __cufftXtGetSizeMany = dlsym(RTLD_DEFAULT, 'cufftXtGetSizeMany') + if __cufftXtGetSizeMany == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtGetSizeMany = dlsym(handle, 'cufftXtGetSizeMany') + + global __cufftXtExec + __cufftXtExec = dlsym(RTLD_DEFAULT, 'cufftXtExec') + if __cufftXtExec == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExec = dlsym(handle, 'cufftXtExec') + + global __cufftXtExecDescriptor + __cufftXtExecDescriptor = dlsym(RTLD_DEFAULT, 'cufftXtExecDescriptor') + if __cufftXtExecDescriptor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtExecDescriptor = dlsym(handle, 'cufftXtExecDescriptor') + + global __cufftXtSetWorkAreaPolicy + __cufftXtSetWorkAreaPolicy = dlsym(RTLD_DEFAULT, 'cufftXtSetWorkAreaPolicy') + if __cufftXtSetWorkAreaPolicy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtSetWorkAreaPolicy = dlsym(handle, 'cufftXtSetWorkAreaPolicy') + + global __cufftXtSetJITCallback + __cufftXtSetJITCallback = dlsym(RTLD_DEFAULT, 'cufftXtSetJITCallback') + if __cufftXtSetJITCallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtSetJITCallback = dlsym(handle, 'cufftXtSetJITCallback') + + global __cufftXtSetSubformatDefault + __cufftXtSetSubformatDefault = dlsym(RTLD_DEFAULT, 'cufftXtSetSubformatDefault') + if __cufftXtSetSubformatDefault == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftXtSetSubformatDefault = dlsym(handle, 'cufftXtSetSubformatDefault') + + global __cufftSetPlanPropertyInt64 + __cufftSetPlanPropertyInt64 = dlsym(RTLD_DEFAULT, 'cufftSetPlanPropertyInt64') + if __cufftSetPlanPropertyInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftSetPlanPropertyInt64 = dlsym(handle, 'cufftSetPlanPropertyInt64') + + global __cufftGetPlanPropertyInt64 + __cufftGetPlanPropertyInt64 = dlsym(RTLD_DEFAULT, 'cufftGetPlanPropertyInt64') + if __cufftGetPlanPropertyInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetPlanPropertyInt64 = dlsym(handle, 'cufftGetPlanPropertyInt64') + + global __cufftResetPlanProperty + __cufftResetPlanProperty = dlsym(RTLD_DEFAULT, 'cufftResetPlanProperty') + if __cufftResetPlanProperty == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftResetPlanProperty = dlsym(handle, 'cufftResetPlanProperty') + + global ____cufftXtSetJITCallback_12_7 + ____cufftXtSetJITCallback_12_7 = dlsym(RTLD_DEFAULT, '__cufftXtSetJITCallback_12_7') + if ____cufftXtSetJITCallback_12_7 == NULL: + if 
handle == NULL: + handle = load_library(driver_ver) + ____cufftXtSetJITCallback_12_7 = dlsym(handle, '__cufftXtSetJITCallback_12_7') + + __py_cufft_init = True + return 0 cdef dict func_ptrs = None @@ -701,6 +714,9 @@ cpdef dict _inspect_function_pointers(): global __cufftResetPlanProperty data["__cufftResetPlanProperty"] = __cufftResetPlanProperty + global ____cufftXtSetJITCallback_12_7 + data["____cufftXtSetJITCallback_12_7"] = ____cufftXtSetJITCallback_12_7 + func_ptrs = data return data @@ -1226,14 +1242,14 @@ cdef cufftResult _cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPoli plan, policy, workSize) -cdef cufftResult _cufftXtSetJITCallback(cufftHandle plan, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: +cdef cufftResult _cufftXtSetJITCallback(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: global __cufftXtSetJITCallback _check_or_init_cufft() if __cufftXtSetJITCallback == NULL: with gil: raise FunctionNotFoundError("function cufftXtSetJITCallback is not found") - return (__cufftXtSetJITCallback)( - plan, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) + return (__cufftXtSetJITCallback)( + plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) cdef cufftResult _cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: @@ -1274,3 +1290,13 @@ cdef cufftResult _cufftResetPlanProperty(cufftHandle plan, cufftProperty propert raise FunctionNotFoundError("function cufftResetPlanProperty is not found") return (__cufftResetPlanProperty)( plan, property) + + +cdef cufftResult ___cufftXtSetJITCallback_12_7(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: + global ____cufftXtSetJITCallback_12_7 + _check_or_init_cufft() + if ____cufftXtSetJITCallback_12_7 == NULL: + with gil: + raise FunctionNotFoundError("function __cufftXtSetJITCallback_12_7 is not found") + return (____cufftXtSetJITCallback_12_7)( + plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) diff --git a/nvmath/bindings/_internal/cufft_windows.pyx b/nvmath/bindings/_internal/cufft_windows.pyx index 7e930be..126935f 100644 --- a/nvmath/bindings/_internal/cufft_windows.pyx +++ b/nvmath/bindings/_internal/cufft_windows.pyx @@ -2,26 +2,82 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.1. Do not modify it directly. 
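For context on the Windows loader below: it replaces the pywin32-based lookup with direct `LoadLibraryExW`/`GetProcAddress` calls and first probes the driver for its version via `cuDriverGetVersion`. The following is a minimal, illustrative Python-level sketch of that probe using `ctypes` (the helper name `get_cuda_driver_version` is ours, not part of nvmath-python, and the real bindings do this in Cython without ctypes):

```python
import ctypes

def get_cuda_driver_version() -> int:
    """Return the CUDA driver version as an int, e.g. 12080 for CUDA 12.8 (sketch only)."""
    # ctypes.WinDLL loads the DLL via LoadLibraryExW and resolves symbols with
    # GetProcAddress internally, mirroring the Cython helper in this file.
    nvcuda = ctypes.WinDLL("nvcuda.dll")  # raises OSError if no CUDA driver is installed
    version = ctypes.c_int(0)
    err = nvcuda.cuDriverGetVersion(ctypes.byref(version))
    if err != 0:
        raise RuntimeError(f"cuDriverGetVersion returned error code {err}")
    return version.value
```

In the generated Cython loaders themselves, the resolved function pointers are cached in module-level globals, and the whole resolution step now runs under `__symbol_lock` while holding the GIL, so concurrent first calls cannot race on those globals.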
from libc.stdint cimport intptr_t, uintptr_t import os import site - -import win32api +import threading from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib +from libc.stddef cimport wchar_t +from libc.stdint cimport uintptr_t +from cpython cimport PyUnicode_AsWideCharString, PyMem_Free + +from .utils import NotSupportedError + +cdef extern from "windows.h" nogil: + ctypedef void* HMODULE + ctypedef void* HANDLE + ctypedef void* FARPROC + ctypedef unsigned long DWORD + ctypedef const wchar_t *LPCWSTR + ctypedef const char *LPCSTR + + cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 + cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 + + HMODULE _LoadLibraryExW "LoadLibraryExW"( + LPCWSTR lpLibFileName, + HANDLE hFile, + DWORD dwFlags + ) + + FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName) + +cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags): + cdef uintptr_t result + cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL) + with nogil: + result = _LoadLibraryExW( + wpath, + hFile, + dwFlags + ) + PyMem_Free(wpath) + return result + +cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil: + return _GetProcAddress(hModule, lpProcName) + +cdef int get_cuda_version(): + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32) + if handle == 0: + raise NotSupportedError('CUDA driver is not found') + cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion') + if cuDriverGetVersion == NULL: + raise RuntimeError('something went wrong') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError('something went wrong') + + return driver_ver + + ############################################################################### # Wrapper init ############################################################################### -LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 +cdef object __symbol_lock = threading.Lock() cdef bint __py_cufft_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __cufftPlan1d = NULL cdef void* __cufftPlan2d = NULL @@ -79,6 +135,7 @@ cdef void* __cufftXtSetSubformatDefault = NULL cdef void* __cufftSetPlanPropertyInt64 = NULL cdef void* __cufftGetPlanPropertyInt64 = NULL cdef void* __cufftResetPlanProperty = NULL +cdef void* ____cufftXtSetJITCallback_12_7 = NULL cdef inline list get_site_packages(): @@ -93,364 +150,186 @@ cdef int _check_or_init_cufft() except -1 nogil: if __py_cufft_init: return 0 - cdef int err, driver_ver - with gil: - # Load driver to check version - try: - handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32) - except Exception as e: - raise NotSupportedError(f'CUDA driver is not found ({e})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion') - if __cuDriverGetVersion == NULL: - raise RuntimeError('something went wrong') - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - raise RuntimeError('something went wrong') + with gil, __symbol_lock: + driver_ver = get_cuda_version() # Load library handle = load_library(driver_ver) # Load function global __cufftPlan1d - try: - __cufftPlan1d = win32api.GetProcAddress(handle, 'cufftPlan1d') - except: - pass + __cufftPlan1d = GetProcAddress(handle, 'cufftPlan1d') global 
__cufftPlan2d - try: - __cufftPlan2d = win32api.GetProcAddress(handle, 'cufftPlan2d') - except: - pass + __cufftPlan2d = GetProcAddress(handle, 'cufftPlan2d') global __cufftPlan3d - try: - __cufftPlan3d = win32api.GetProcAddress(handle, 'cufftPlan3d') - except: - pass + __cufftPlan3d = GetProcAddress(handle, 'cufftPlan3d') global __cufftPlanMany - try: - __cufftPlanMany = win32api.GetProcAddress(handle, 'cufftPlanMany') - except: - pass + __cufftPlanMany = GetProcAddress(handle, 'cufftPlanMany') global __cufftMakePlan1d - try: - __cufftMakePlan1d = win32api.GetProcAddress(handle, 'cufftMakePlan1d') - except: - pass + __cufftMakePlan1d = GetProcAddress(handle, 'cufftMakePlan1d') global __cufftMakePlan2d - try: - __cufftMakePlan2d = win32api.GetProcAddress(handle, 'cufftMakePlan2d') - except: - pass + __cufftMakePlan2d = GetProcAddress(handle, 'cufftMakePlan2d') global __cufftMakePlan3d - try: - __cufftMakePlan3d = win32api.GetProcAddress(handle, 'cufftMakePlan3d') - except: - pass + __cufftMakePlan3d = GetProcAddress(handle, 'cufftMakePlan3d') global __cufftMakePlanMany - try: - __cufftMakePlanMany = win32api.GetProcAddress(handle, 'cufftMakePlanMany') - except: - pass + __cufftMakePlanMany = GetProcAddress(handle, 'cufftMakePlanMany') global __cufftMakePlanMany64 - try: - __cufftMakePlanMany64 = win32api.GetProcAddress(handle, 'cufftMakePlanMany64') - except: - pass + __cufftMakePlanMany64 = GetProcAddress(handle, 'cufftMakePlanMany64') global __cufftGetSizeMany64 - try: - __cufftGetSizeMany64 = win32api.GetProcAddress(handle, 'cufftGetSizeMany64') - except: - pass + __cufftGetSizeMany64 = GetProcAddress(handle, 'cufftGetSizeMany64') global __cufftEstimate1d - try: - __cufftEstimate1d = win32api.GetProcAddress(handle, 'cufftEstimate1d') - except: - pass + __cufftEstimate1d = GetProcAddress(handle, 'cufftEstimate1d') global __cufftEstimate2d - try: - __cufftEstimate2d = win32api.GetProcAddress(handle, 'cufftEstimate2d') - except: - pass + __cufftEstimate2d = GetProcAddress(handle, 'cufftEstimate2d') global __cufftEstimate3d - try: - __cufftEstimate3d = win32api.GetProcAddress(handle, 'cufftEstimate3d') - except: - pass + __cufftEstimate3d = GetProcAddress(handle, 'cufftEstimate3d') global __cufftEstimateMany - try: - __cufftEstimateMany = win32api.GetProcAddress(handle, 'cufftEstimateMany') - except: - pass + __cufftEstimateMany = GetProcAddress(handle, 'cufftEstimateMany') global __cufftCreate - try: - __cufftCreate = win32api.GetProcAddress(handle, 'cufftCreate') - except: - pass + __cufftCreate = GetProcAddress(handle, 'cufftCreate') global __cufftGetSize1d - try: - __cufftGetSize1d = win32api.GetProcAddress(handle, 'cufftGetSize1d') - except: - pass + __cufftGetSize1d = GetProcAddress(handle, 'cufftGetSize1d') global __cufftGetSize2d - try: - __cufftGetSize2d = win32api.GetProcAddress(handle, 'cufftGetSize2d') - except: - pass + __cufftGetSize2d = GetProcAddress(handle, 'cufftGetSize2d') global __cufftGetSize3d - try: - __cufftGetSize3d = win32api.GetProcAddress(handle, 'cufftGetSize3d') - except: - pass + __cufftGetSize3d = GetProcAddress(handle, 'cufftGetSize3d') global __cufftGetSizeMany - try: - __cufftGetSizeMany = win32api.GetProcAddress(handle, 'cufftGetSizeMany') - except: - pass + __cufftGetSizeMany = GetProcAddress(handle, 'cufftGetSizeMany') global __cufftGetSize - try: - __cufftGetSize = win32api.GetProcAddress(handle, 'cufftGetSize') - except: - pass + __cufftGetSize = GetProcAddress(handle, 'cufftGetSize') global __cufftSetWorkArea - try: - __cufftSetWorkArea = 
win32api.GetProcAddress(handle, 'cufftSetWorkArea') - except: - pass + __cufftSetWorkArea = GetProcAddress(handle, 'cufftSetWorkArea') global __cufftSetAutoAllocation - try: - __cufftSetAutoAllocation = win32api.GetProcAddress(handle, 'cufftSetAutoAllocation') - except: - pass + __cufftSetAutoAllocation = GetProcAddress(handle, 'cufftSetAutoAllocation') global __cufftExecC2C - try: - __cufftExecC2C = win32api.GetProcAddress(handle, 'cufftExecC2C') - except: - pass + __cufftExecC2C = GetProcAddress(handle, 'cufftExecC2C') global __cufftExecR2C - try: - __cufftExecR2C = win32api.GetProcAddress(handle, 'cufftExecR2C') - except: - pass + __cufftExecR2C = GetProcAddress(handle, 'cufftExecR2C') global __cufftExecC2R - try: - __cufftExecC2R = win32api.GetProcAddress(handle, 'cufftExecC2R') - except: - pass + __cufftExecC2R = GetProcAddress(handle, 'cufftExecC2R') global __cufftExecZ2Z - try: - __cufftExecZ2Z = win32api.GetProcAddress(handle, 'cufftExecZ2Z') - except: - pass + __cufftExecZ2Z = GetProcAddress(handle, 'cufftExecZ2Z') global __cufftExecD2Z - try: - __cufftExecD2Z = win32api.GetProcAddress(handle, 'cufftExecD2Z') - except: - pass + __cufftExecD2Z = GetProcAddress(handle, 'cufftExecD2Z') global __cufftExecZ2D - try: - __cufftExecZ2D = win32api.GetProcAddress(handle, 'cufftExecZ2D') - except: - pass + __cufftExecZ2D = GetProcAddress(handle, 'cufftExecZ2D') global __cufftSetStream - try: - __cufftSetStream = win32api.GetProcAddress(handle, 'cufftSetStream') - except: - pass + __cufftSetStream = GetProcAddress(handle, 'cufftSetStream') global __cufftDestroy - try: - __cufftDestroy = win32api.GetProcAddress(handle, 'cufftDestroy') - except: - pass + __cufftDestroy = GetProcAddress(handle, 'cufftDestroy') global __cufftGetVersion - try: - __cufftGetVersion = win32api.GetProcAddress(handle, 'cufftGetVersion') - except: - pass + __cufftGetVersion = GetProcAddress(handle, 'cufftGetVersion') global __cufftGetProperty - try: - __cufftGetProperty = win32api.GetProcAddress(handle, 'cufftGetProperty') - except: - pass + __cufftGetProperty = GetProcAddress(handle, 'cufftGetProperty') global __cufftXtSetGPUs - try: - __cufftXtSetGPUs = win32api.GetProcAddress(handle, 'cufftXtSetGPUs') - except: - pass + __cufftXtSetGPUs = GetProcAddress(handle, 'cufftXtSetGPUs') global __cufftXtMalloc - try: - __cufftXtMalloc = win32api.GetProcAddress(handle, 'cufftXtMalloc') - except: - pass + __cufftXtMalloc = GetProcAddress(handle, 'cufftXtMalloc') global __cufftXtMemcpy - try: - __cufftXtMemcpy = win32api.GetProcAddress(handle, 'cufftXtMemcpy') - except: - pass + __cufftXtMemcpy = GetProcAddress(handle, 'cufftXtMemcpy') global __cufftXtFree - try: - __cufftXtFree = win32api.GetProcAddress(handle, 'cufftXtFree') - except: - pass + __cufftXtFree = GetProcAddress(handle, 'cufftXtFree') global __cufftXtSetWorkArea - try: - __cufftXtSetWorkArea = win32api.GetProcAddress(handle, 'cufftXtSetWorkArea') - except: - pass + __cufftXtSetWorkArea = GetProcAddress(handle, 'cufftXtSetWorkArea') global __cufftXtExecDescriptorC2C - try: - __cufftXtExecDescriptorC2C = win32api.GetProcAddress(handle, 'cufftXtExecDescriptorC2C') - except: - pass + __cufftXtExecDescriptorC2C = GetProcAddress(handle, 'cufftXtExecDescriptorC2C') global __cufftXtExecDescriptorR2C - try: - __cufftXtExecDescriptorR2C = win32api.GetProcAddress(handle, 'cufftXtExecDescriptorR2C') - except: - pass + __cufftXtExecDescriptorR2C = GetProcAddress(handle, 'cufftXtExecDescriptorR2C') global __cufftXtExecDescriptorC2R - try: - __cufftXtExecDescriptorC2R = 
win32api.GetProcAddress(handle, 'cufftXtExecDescriptorC2R') - except: - pass + __cufftXtExecDescriptorC2R = GetProcAddress(handle, 'cufftXtExecDescriptorC2R') global __cufftXtExecDescriptorZ2Z - try: - __cufftXtExecDescriptorZ2Z = win32api.GetProcAddress(handle, 'cufftXtExecDescriptorZ2Z') - except: - pass + __cufftXtExecDescriptorZ2Z = GetProcAddress(handle, 'cufftXtExecDescriptorZ2Z') global __cufftXtExecDescriptorD2Z - try: - __cufftXtExecDescriptorD2Z = win32api.GetProcAddress(handle, 'cufftXtExecDescriptorD2Z') - except: - pass + __cufftXtExecDescriptorD2Z = GetProcAddress(handle, 'cufftXtExecDescriptorD2Z') global __cufftXtExecDescriptorZ2D - try: - __cufftXtExecDescriptorZ2D = win32api.GetProcAddress(handle, 'cufftXtExecDescriptorZ2D') - except: - pass + __cufftXtExecDescriptorZ2D = GetProcAddress(handle, 'cufftXtExecDescriptorZ2D') global __cufftXtQueryPlan - try: - __cufftXtQueryPlan = win32api.GetProcAddress(handle, 'cufftXtQueryPlan') - except: - pass + __cufftXtQueryPlan = GetProcAddress(handle, 'cufftXtQueryPlan') global __cufftXtClearCallback - try: - __cufftXtClearCallback = win32api.GetProcAddress(handle, 'cufftXtClearCallback') - except: - pass + __cufftXtClearCallback = GetProcAddress(handle, 'cufftXtClearCallback') global __cufftXtSetCallbackSharedSize - try: - __cufftXtSetCallbackSharedSize = win32api.GetProcAddress(handle, 'cufftXtSetCallbackSharedSize') - except: - pass + __cufftXtSetCallbackSharedSize = GetProcAddress(handle, 'cufftXtSetCallbackSharedSize') global __cufftXtMakePlanMany - try: - __cufftXtMakePlanMany = win32api.GetProcAddress(handle, 'cufftXtMakePlanMany') - except: - pass + __cufftXtMakePlanMany = GetProcAddress(handle, 'cufftXtMakePlanMany') global __cufftXtGetSizeMany - try: - __cufftXtGetSizeMany = win32api.GetProcAddress(handle, 'cufftXtGetSizeMany') - except: - pass + __cufftXtGetSizeMany = GetProcAddress(handle, 'cufftXtGetSizeMany') global __cufftXtExec - try: - __cufftXtExec = win32api.GetProcAddress(handle, 'cufftXtExec') - except: - pass + __cufftXtExec = GetProcAddress(handle, 'cufftXtExec') global __cufftXtExecDescriptor - try: - __cufftXtExecDescriptor = win32api.GetProcAddress(handle, 'cufftXtExecDescriptor') - except: - pass + __cufftXtExecDescriptor = GetProcAddress(handle, 'cufftXtExecDescriptor') global __cufftXtSetWorkAreaPolicy - try: - __cufftXtSetWorkAreaPolicy = win32api.GetProcAddress(handle, 'cufftXtSetWorkAreaPolicy') - except: - pass + __cufftXtSetWorkAreaPolicy = GetProcAddress(handle, 'cufftXtSetWorkAreaPolicy') global __cufftXtSetJITCallback - try: - __cufftXtSetJITCallback = win32api.GetProcAddress(handle, 'cufftXtSetJITCallback') - except: - pass + __cufftXtSetJITCallback = GetProcAddress(handle, 'cufftXtSetJITCallback') global __cufftXtSetSubformatDefault - try: - __cufftXtSetSubformatDefault = win32api.GetProcAddress(handle, 'cufftXtSetSubformatDefault') - except: - pass + __cufftXtSetSubformatDefault = GetProcAddress(handle, 'cufftXtSetSubformatDefault') global __cufftSetPlanPropertyInt64 - try: - __cufftSetPlanPropertyInt64 = win32api.GetProcAddress(handle, 'cufftSetPlanPropertyInt64') - except: - pass + __cufftSetPlanPropertyInt64 = GetProcAddress(handle, 'cufftSetPlanPropertyInt64') global __cufftGetPlanPropertyInt64 - try: - __cufftGetPlanPropertyInt64 = win32api.GetProcAddress(handle, 'cufftGetPlanPropertyInt64') - except: - pass + __cufftGetPlanPropertyInt64 = GetProcAddress(handle, 'cufftGetPlanPropertyInt64') global __cufftResetPlanProperty - try: - __cufftResetPlanProperty = win32api.GetProcAddress(handle, 
'cufftResetPlanProperty')
-    except:
-        pass
+    __cufftResetPlanProperty = GetProcAddress(handle, 'cufftResetPlanProperty')
 
-    __py_cufft_init = True
-    return 0
+    global ____cufftXtSetJITCallback_12_7
+    ____cufftXtSetJITCallback_12_7 = GetProcAddress(handle, '__cufftXtSetJITCallback_12_7')
+
+    __py_cufft_init = True
+    return 0
 
 
 cdef dict func_ptrs = None
@@ -632,6 +511,9 @@ cpdef dict _inspect_function_pointers():
     global __cufftResetPlanProperty
     data["__cufftResetPlanProperty"] = __cufftResetPlanProperty
 
+    global ____cufftXtSetJITCallback_12_7
+    data["____cufftXtSetJITCallback_12_7"] = ____cufftXtSetJITCallback_12_7
+
     func_ptrs = data
     return data
 
@@ -1157,14 +1039,14 @@ cdef cufftResult _cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPoli
         plan, policy, workSize)
 
 
-cdef cufftResult _cufftXtSetJITCallback(cufftHandle plan, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil:
+cdef cufftResult _cufftXtSetJITCallback(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil:
     global __cufftXtSetJITCallback
     _check_or_init_cufft()
     if __cufftXtSetJITCallback == NULL:
         with gil:
             raise FunctionNotFoundError("function cufftXtSetJITCallback is not found")
-    return (__cufftXtSetJITCallback)(
-        plan, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info)
+    return (__cufftXtSetJITCallback)(
+        plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info)
 
 
 cdef cufftResult _cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil:
@@ -1205,3 +1087,13 @@ cdef cufftResult _cufftResetPlanProperty(cufftHandle plan, cufftProperty propert
             raise FunctionNotFoundError("function cufftResetPlanProperty is not found")
     return (__cufftResetPlanProperty)(
         plan, property)
+
+
+cdef cufftResult ___cufftXtSetJITCallback_12_7(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil:
+    global ____cufftXtSetJITCallback_12_7
+    _check_or_init_cufft()
+    if ____cufftXtSetJITCallback_12_7 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function __cufftXtSetJITCallback_12_7 is not found")
+    return (____cufftXtSetJITCallback_12_7)(
+        plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info)
diff --git a/nvmath/bindings/_internal/curand_linux.pyx b/nvmath/bindings/_internal/curand_linux.pyx
index 489db0f..fb18e36 100644
--- a/nvmath/bindings/_internal/curand_linux.pyx
+++ b/nvmath/bindings/_internal/curand_linux.pyx
@@ -6,10 +6,13 @@
 
 from libc.stdint cimport intptr_t, uintptr_t
 
+import threading
+
 from .utils import FunctionNotFoundError, NotSupportedError
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+
 ###############################################################################
 # Extern
 ###############################################################################
@@ -28,13 +31,31 @@ cdef extern from "<dlfcn.h>" nogil:
 
     const void* RTLD_DEFAULT 'RTLD_DEFAULT'
 
 
+cdef int get_cuda_version():
+    cdef void* handle = NULL
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = dlopen('libcuda.so.1', 
RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_curand_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __curandCreateGenerator = NULL cdef void* __curandCreateGeneratorHost = NULL @@ -77,233 +98,217 @@ cdef int _check_or_init_curand() except -1 nogil: if __py_curand_init: return 0 - # Load driver to check version cdef void* handle = NULL - handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) - if handle == NULL: - with gil: - err_msg = dlerror() - raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") - if __cuDriverGetVersion == NULL: - with gil: - raise RuntimeError('something went wrong') - cdef int err, driver_ver - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - with gil: - raise RuntimeError('something went wrong') - #dlclose(handle) - handle = NULL - - # Load function - global __curandCreateGenerator - __curandCreateGenerator = dlsym(RTLD_DEFAULT, 'curandCreateGenerator') - if __curandCreateGenerator == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandCreateGenerator = dlsym(handle, 'curandCreateGenerator') - - global __curandCreateGeneratorHost - __curandCreateGeneratorHost = dlsym(RTLD_DEFAULT, 'curandCreateGeneratorHost') - if __curandCreateGeneratorHost == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandCreateGeneratorHost = dlsym(handle, 'curandCreateGeneratorHost') - - global __curandDestroyGenerator - __curandDestroyGenerator = dlsym(RTLD_DEFAULT, 'curandDestroyGenerator') - if __curandDestroyGenerator == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandDestroyGenerator = dlsym(handle, 'curandDestroyGenerator') - - global __curandGetVersion - __curandGetVersion = dlsym(RTLD_DEFAULT, 'curandGetVersion') - if __curandGetVersion == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGetVersion = dlsym(handle, 'curandGetVersion') - - global __curandGetProperty - __curandGetProperty = dlsym(RTLD_DEFAULT, 'curandGetProperty') - if __curandGetProperty == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGetProperty = dlsym(handle, 'curandGetProperty') - - global __curandSetStream - __curandSetStream = dlsym(RTLD_DEFAULT, 'curandSetStream') - if __curandSetStream == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandSetStream = dlsym(handle, 'curandSetStream') - - global __curandSetPseudoRandomGeneratorSeed - __curandSetPseudoRandomGeneratorSeed = dlsym(RTLD_DEFAULT, 'curandSetPseudoRandomGeneratorSeed') - if __curandSetPseudoRandomGeneratorSeed == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandSetPseudoRandomGeneratorSeed = dlsym(handle, 'curandSetPseudoRandomGeneratorSeed') - - global 
__curandSetGeneratorOffset - __curandSetGeneratorOffset = dlsym(RTLD_DEFAULT, 'curandSetGeneratorOffset') - if __curandSetGeneratorOffset == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandSetGeneratorOffset = dlsym(handle, 'curandSetGeneratorOffset') - - global __curandSetGeneratorOrdering - __curandSetGeneratorOrdering = dlsym(RTLD_DEFAULT, 'curandSetGeneratorOrdering') - if __curandSetGeneratorOrdering == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandSetGeneratorOrdering = dlsym(handle, 'curandSetGeneratorOrdering') - - global __curandSetQuasiRandomGeneratorDimensions - __curandSetQuasiRandomGeneratorDimensions = dlsym(RTLD_DEFAULT, 'curandSetQuasiRandomGeneratorDimensions') - if __curandSetQuasiRandomGeneratorDimensions == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandSetQuasiRandomGeneratorDimensions = dlsym(handle, 'curandSetQuasiRandomGeneratorDimensions') - - global __curandGenerate - __curandGenerate = dlsym(RTLD_DEFAULT, 'curandGenerate') - if __curandGenerate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerate = dlsym(handle, 'curandGenerate') - - global __curandGenerateLongLong - __curandGenerateLongLong = dlsym(RTLD_DEFAULT, 'curandGenerateLongLong') - if __curandGenerateLongLong == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateLongLong = dlsym(handle, 'curandGenerateLongLong') - global __curandGenerateUniform - __curandGenerateUniform = dlsym(RTLD_DEFAULT, 'curandGenerateUniform') - if __curandGenerateUniform == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateUniform = dlsym(handle, 'curandGenerateUniform') - - global __curandGenerateUniformDouble - __curandGenerateUniformDouble = dlsym(RTLD_DEFAULT, 'curandGenerateUniformDouble') - if __curandGenerateUniformDouble == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateUniformDouble = dlsym(handle, 'curandGenerateUniformDouble') - - global __curandGenerateNormal - __curandGenerateNormal = dlsym(RTLD_DEFAULT, 'curandGenerateNormal') - if __curandGenerateNormal == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateNormal = dlsym(handle, 'curandGenerateNormal') - - global __curandGenerateNormalDouble - __curandGenerateNormalDouble = dlsym(RTLD_DEFAULT, 'curandGenerateNormalDouble') - if __curandGenerateNormalDouble == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateNormalDouble = dlsym(handle, 'curandGenerateNormalDouble') - - global __curandGenerateLogNormal - __curandGenerateLogNormal = dlsym(RTLD_DEFAULT, 'curandGenerateLogNormal') - if __curandGenerateLogNormal == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateLogNormal = dlsym(handle, 'curandGenerateLogNormal') - - global __curandGenerateLogNormalDouble - __curandGenerateLogNormalDouble = dlsym(RTLD_DEFAULT, 'curandGenerateLogNormalDouble') - if __curandGenerateLogNormalDouble == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateLogNormalDouble = dlsym(handle, 'curandGenerateLogNormalDouble') - - global __curandCreatePoissonDistribution - __curandCreatePoissonDistribution = dlsym(RTLD_DEFAULT, 'curandCreatePoissonDistribution') - if __curandCreatePoissonDistribution == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandCreatePoissonDistribution = dlsym(handle, 'curandCreatePoissonDistribution') - - global 
__curandDestroyDistribution - __curandDestroyDistribution = dlsym(RTLD_DEFAULT, 'curandDestroyDistribution') - if __curandDestroyDistribution == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandDestroyDistribution = dlsym(handle, 'curandDestroyDistribution') - - global __curandGeneratePoisson - __curandGeneratePoisson = dlsym(RTLD_DEFAULT, 'curandGeneratePoisson') - if __curandGeneratePoisson == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGeneratePoisson = dlsym(handle, 'curandGeneratePoisson') - - global __curandGeneratePoissonMethod - __curandGeneratePoissonMethod = dlsym(RTLD_DEFAULT, 'curandGeneratePoissonMethod') - if __curandGeneratePoissonMethod == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGeneratePoissonMethod = dlsym(handle, 'curandGeneratePoissonMethod') - - global __curandGenerateBinomial - __curandGenerateBinomial = dlsym(RTLD_DEFAULT, 'curandGenerateBinomial') - if __curandGenerateBinomial == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateBinomial = dlsym(handle, 'curandGenerateBinomial') - - global __curandGenerateBinomialMethod - __curandGenerateBinomialMethod = dlsym(RTLD_DEFAULT, 'curandGenerateBinomialMethod') - if __curandGenerateBinomialMethod == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateBinomialMethod = dlsym(handle, 'curandGenerateBinomialMethod') - - global __curandGenerateSeeds - __curandGenerateSeeds = dlsym(RTLD_DEFAULT, 'curandGenerateSeeds') - if __curandGenerateSeeds == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGenerateSeeds = dlsym(handle, 'curandGenerateSeeds') - - global __curandGetDirectionVectors32 - __curandGetDirectionVectors32 = dlsym(RTLD_DEFAULT, 'curandGetDirectionVectors32') - if __curandGetDirectionVectors32 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGetDirectionVectors32 = dlsym(handle, 'curandGetDirectionVectors32') - - global __curandGetScrambleConstants32 - __curandGetScrambleConstants32 = dlsym(RTLD_DEFAULT, 'curandGetScrambleConstants32') - if __curandGetScrambleConstants32 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGetScrambleConstants32 = dlsym(handle, 'curandGetScrambleConstants32') - - global __curandGetDirectionVectors64 - __curandGetDirectionVectors64 = dlsym(RTLD_DEFAULT, 'curandGetDirectionVectors64') - if __curandGetDirectionVectors64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGetDirectionVectors64 = dlsym(handle, 'curandGetDirectionVectors64') - - global __curandGetScrambleConstants64 - __curandGetScrambleConstants64 = dlsym(RTLD_DEFAULT, 'curandGetScrambleConstants64') - if __curandGetScrambleConstants64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __curandGetScrambleConstants64 = dlsym(handle, 'curandGetScrambleConstants64') - - __py_curand_init = True - return 0 + with gil, __symbol_lock: + driver_ver = get_cuda_version() + + # Load function + global __curandCreateGenerator + __curandCreateGenerator = dlsym(RTLD_DEFAULT, 'curandCreateGenerator') + if __curandCreateGenerator == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandCreateGenerator = dlsym(handle, 'curandCreateGenerator') + + global __curandCreateGeneratorHost + __curandCreateGeneratorHost = dlsym(RTLD_DEFAULT, 'curandCreateGeneratorHost') + if __curandCreateGeneratorHost == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__curandCreateGeneratorHost = dlsym(handle, 'curandCreateGeneratorHost') + + global __curandDestroyGenerator + __curandDestroyGenerator = dlsym(RTLD_DEFAULT, 'curandDestroyGenerator') + if __curandDestroyGenerator == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandDestroyGenerator = dlsym(handle, 'curandDestroyGenerator') + + global __curandGetVersion + __curandGetVersion = dlsym(RTLD_DEFAULT, 'curandGetVersion') + if __curandGetVersion == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGetVersion = dlsym(handle, 'curandGetVersion') + + global __curandGetProperty + __curandGetProperty = dlsym(RTLD_DEFAULT, 'curandGetProperty') + if __curandGetProperty == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGetProperty = dlsym(handle, 'curandGetProperty') + + global __curandSetStream + __curandSetStream = dlsym(RTLD_DEFAULT, 'curandSetStream') + if __curandSetStream == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandSetStream = dlsym(handle, 'curandSetStream') + + global __curandSetPseudoRandomGeneratorSeed + __curandSetPseudoRandomGeneratorSeed = dlsym(RTLD_DEFAULT, 'curandSetPseudoRandomGeneratorSeed') + if __curandSetPseudoRandomGeneratorSeed == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandSetPseudoRandomGeneratorSeed = dlsym(handle, 'curandSetPseudoRandomGeneratorSeed') + + global __curandSetGeneratorOffset + __curandSetGeneratorOffset = dlsym(RTLD_DEFAULT, 'curandSetGeneratorOffset') + if __curandSetGeneratorOffset == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandSetGeneratorOffset = dlsym(handle, 'curandSetGeneratorOffset') + + global __curandSetGeneratorOrdering + __curandSetGeneratorOrdering = dlsym(RTLD_DEFAULT, 'curandSetGeneratorOrdering') + if __curandSetGeneratorOrdering == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandSetGeneratorOrdering = dlsym(handle, 'curandSetGeneratorOrdering') + + global __curandSetQuasiRandomGeneratorDimensions + __curandSetQuasiRandomGeneratorDimensions = dlsym(RTLD_DEFAULT, 'curandSetQuasiRandomGeneratorDimensions') + if __curandSetQuasiRandomGeneratorDimensions == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandSetQuasiRandomGeneratorDimensions = dlsym(handle, 'curandSetQuasiRandomGeneratorDimensions') + + global __curandGenerate + __curandGenerate = dlsym(RTLD_DEFAULT, 'curandGenerate') + if __curandGenerate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerate = dlsym(handle, 'curandGenerate') + + global __curandGenerateLongLong + __curandGenerateLongLong = dlsym(RTLD_DEFAULT, 'curandGenerateLongLong') + if __curandGenerateLongLong == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateLongLong = dlsym(handle, 'curandGenerateLongLong') + + global __curandGenerateUniform + __curandGenerateUniform = dlsym(RTLD_DEFAULT, 'curandGenerateUniform') + if __curandGenerateUniform == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateUniform = dlsym(handle, 'curandGenerateUniform') + + global __curandGenerateUniformDouble + __curandGenerateUniformDouble = dlsym(RTLD_DEFAULT, 'curandGenerateUniformDouble') + if __curandGenerateUniformDouble == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateUniformDouble = dlsym(handle, 'curandGenerateUniformDouble') + + global __curandGenerateNormal + __curandGenerateNormal = dlsym(RTLD_DEFAULT, 
'curandGenerateNormal') + if __curandGenerateNormal == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateNormal = dlsym(handle, 'curandGenerateNormal') + + global __curandGenerateNormalDouble + __curandGenerateNormalDouble = dlsym(RTLD_DEFAULT, 'curandGenerateNormalDouble') + if __curandGenerateNormalDouble == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateNormalDouble = dlsym(handle, 'curandGenerateNormalDouble') + + global __curandGenerateLogNormal + __curandGenerateLogNormal = dlsym(RTLD_DEFAULT, 'curandGenerateLogNormal') + if __curandGenerateLogNormal == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateLogNormal = dlsym(handle, 'curandGenerateLogNormal') + + global __curandGenerateLogNormalDouble + __curandGenerateLogNormalDouble = dlsym(RTLD_DEFAULT, 'curandGenerateLogNormalDouble') + if __curandGenerateLogNormalDouble == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateLogNormalDouble = dlsym(handle, 'curandGenerateLogNormalDouble') + + global __curandCreatePoissonDistribution + __curandCreatePoissonDistribution = dlsym(RTLD_DEFAULT, 'curandCreatePoissonDistribution') + if __curandCreatePoissonDistribution == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandCreatePoissonDistribution = dlsym(handle, 'curandCreatePoissonDistribution') + + global __curandDestroyDistribution + __curandDestroyDistribution = dlsym(RTLD_DEFAULT, 'curandDestroyDistribution') + if __curandDestroyDistribution == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandDestroyDistribution = dlsym(handle, 'curandDestroyDistribution') + + global __curandGeneratePoisson + __curandGeneratePoisson = dlsym(RTLD_DEFAULT, 'curandGeneratePoisson') + if __curandGeneratePoisson == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGeneratePoisson = dlsym(handle, 'curandGeneratePoisson') + + global __curandGeneratePoissonMethod + __curandGeneratePoissonMethod = dlsym(RTLD_DEFAULT, 'curandGeneratePoissonMethod') + if __curandGeneratePoissonMethod == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGeneratePoissonMethod = dlsym(handle, 'curandGeneratePoissonMethod') + + global __curandGenerateBinomial + __curandGenerateBinomial = dlsym(RTLD_DEFAULT, 'curandGenerateBinomial') + if __curandGenerateBinomial == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateBinomial = dlsym(handle, 'curandGenerateBinomial') + + global __curandGenerateBinomialMethod + __curandGenerateBinomialMethod = dlsym(RTLD_DEFAULT, 'curandGenerateBinomialMethod') + if __curandGenerateBinomialMethod == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateBinomialMethod = dlsym(handle, 'curandGenerateBinomialMethod') + + global __curandGenerateSeeds + __curandGenerateSeeds = dlsym(RTLD_DEFAULT, 'curandGenerateSeeds') + if __curandGenerateSeeds == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGenerateSeeds = dlsym(handle, 'curandGenerateSeeds') + + global __curandGetDirectionVectors32 + __curandGetDirectionVectors32 = dlsym(RTLD_DEFAULT, 'curandGetDirectionVectors32') + if __curandGetDirectionVectors32 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __curandGetDirectionVectors32 = dlsym(handle, 'curandGetDirectionVectors32') + + global __curandGetScrambleConstants32 + __curandGetScrambleConstants32 = dlsym(RTLD_DEFAULT, 
'curandGetScrambleConstants32')
+        if __curandGetScrambleConstants32 == NULL:
+            if handle == NULL:
+                handle = load_library(driver_ver)
+            __curandGetScrambleConstants32 = dlsym(handle, 'curandGetScrambleConstants32')
+
+        global __curandGetDirectionVectors64
+        __curandGetDirectionVectors64 = dlsym(RTLD_DEFAULT, 'curandGetDirectionVectors64')
+        if __curandGetDirectionVectors64 == NULL:
+            if handle == NULL:
+                handle = load_library(driver_ver)
+            __curandGetDirectionVectors64 = dlsym(handle, 'curandGetDirectionVectors64')
+
+        global __curandGetScrambleConstants64
+        __curandGetScrambleConstants64 = dlsym(RTLD_DEFAULT, 'curandGetScrambleConstants64')
+        if __curandGetScrambleConstants64 == NULL:
+            if handle == NULL:
+                handle = load_library(driver_ver)
+            __curandGetScrambleConstants64 = dlsym(handle, 'curandGetScrambleConstants64')
+
+        __py_curand_init = True
+        return 0
 
 
 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/curand_windows.pyx b/nvmath/bindings/_internal/curand_windows.pyx
index 2fb9595..30d1f8e 100644
--- a/nvmath/bindings/_internal/curand_windows.pyx
+++ b/nvmath/bindings/_internal/curand_windows.pyx
@@ -8,20 +8,77 @@
 
 from libc.stdint cimport intptr_t, uintptr_t
 
 import os
 import site
-
-import win32api
+import threading
 
 from .utils import FunctionNotFoundError, NotSupportedError
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+from libc.stddef cimport wchar_t
+from libc.stdint cimport uintptr_t
+from cpython cimport PyUnicode_AsWideCharString, PyMem_Free
+
+from .utils import NotSupportedError
+
+cdef extern from "windows.h" nogil:
+    ctypedef void* HMODULE
+    ctypedef void* HANDLE
+    ctypedef void* FARPROC
+    ctypedef unsigned long DWORD
+    ctypedef const wchar_t *LPCWSTR
+    ctypedef const char *LPCSTR
+
+    cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+    cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+    cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+
+    HMODULE _LoadLibraryExW "LoadLibraryExW"(
+        LPCWSTR lpLibFileName,
+        HANDLE hFile,
+        DWORD dwFlags
+    )
+
+    FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName)
+
+cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags):
+    cdef uintptr_t result
+    cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL)
+    with nogil:
+        result = _LoadLibraryExW(
+            wpath,
+            hFile,
+            dwFlags
+        )
+    PyMem_Free(wpath)
+    return result
+
+cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil:
+    return _GetProcAddress(hModule, lpProcName)
+
+cdef int get_cuda_version():
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32)
+    if handle == 0:
+        raise NotSupportedError('CUDA driver is not found')
+    cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion')
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('something went wrong')
+    err = (cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError('something went wrong')
+
+    return driver_ver
+
+
+
 ###############################################################################
 # Wrapper init
 ###############################################################################
 
-LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+cdef object __symbol_lock = threading.Lock()
 cdef bint __py_curand_init = False
-cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __curandCreateGenerator = NULL
 cdef void* __curandCreateGeneratorHost = NULL
@@ -65,202 +122,102 @@ cdef int _check_or_init_curand() except -1 nogil:
     if 
__py_curand_init: return 0 - cdef int err, driver_ver - with gil: - # Load driver to check version - try: - handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32) - except Exception as e: - raise NotSupportedError(f'CUDA driver is not found ({e})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion') - if __cuDriverGetVersion == NULL: - raise RuntimeError('something went wrong') - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - raise RuntimeError('something went wrong') + with gil, __symbol_lock: + driver_ver = get_cuda_version() # Load library handle = load_library(driver_ver) # Load function global __curandCreateGenerator - try: - __curandCreateGenerator = win32api.GetProcAddress(handle, 'curandCreateGenerator') - except: - pass + __curandCreateGenerator = GetProcAddress(handle, 'curandCreateGenerator') global __curandCreateGeneratorHost - try: - __curandCreateGeneratorHost = win32api.GetProcAddress(handle, 'curandCreateGeneratorHost') - except: - pass + __curandCreateGeneratorHost = GetProcAddress(handle, 'curandCreateGeneratorHost') global __curandDestroyGenerator - try: - __curandDestroyGenerator = win32api.GetProcAddress(handle, 'curandDestroyGenerator') - except: - pass + __curandDestroyGenerator = GetProcAddress(handle, 'curandDestroyGenerator') global __curandGetVersion - try: - __curandGetVersion = win32api.GetProcAddress(handle, 'curandGetVersion') - except: - pass + __curandGetVersion = GetProcAddress(handle, 'curandGetVersion') global __curandGetProperty - try: - __curandGetProperty = win32api.GetProcAddress(handle, 'curandGetProperty') - except: - pass + __curandGetProperty = GetProcAddress(handle, 'curandGetProperty') global __curandSetStream - try: - __curandSetStream = win32api.GetProcAddress(handle, 'curandSetStream') - except: - pass + __curandSetStream = GetProcAddress(handle, 'curandSetStream') global __curandSetPseudoRandomGeneratorSeed - try: - __curandSetPseudoRandomGeneratorSeed = win32api.GetProcAddress(handle, 'curandSetPseudoRandomGeneratorSeed') - except: - pass + __curandSetPseudoRandomGeneratorSeed = GetProcAddress(handle, 'curandSetPseudoRandomGeneratorSeed') global __curandSetGeneratorOffset - try: - __curandSetGeneratorOffset = win32api.GetProcAddress(handle, 'curandSetGeneratorOffset') - except: - pass + __curandSetGeneratorOffset = GetProcAddress(handle, 'curandSetGeneratorOffset') global __curandSetGeneratorOrdering - try: - __curandSetGeneratorOrdering = win32api.GetProcAddress(handle, 'curandSetGeneratorOrdering') - except: - pass + __curandSetGeneratorOrdering = GetProcAddress(handle, 'curandSetGeneratorOrdering') global __curandSetQuasiRandomGeneratorDimensions - try: - __curandSetQuasiRandomGeneratorDimensions = win32api.GetProcAddress(handle, 'curandSetQuasiRandomGeneratorDimensions') - except: - pass + __curandSetQuasiRandomGeneratorDimensions = GetProcAddress(handle, 'curandSetQuasiRandomGeneratorDimensions') global __curandGenerate - try: - __curandGenerate = win32api.GetProcAddress(handle, 'curandGenerate') - except: - pass + __curandGenerate = GetProcAddress(handle, 'curandGenerate') global __curandGenerateLongLong - try: - __curandGenerateLongLong = win32api.GetProcAddress(handle, 'curandGenerateLongLong') - except: - pass + __curandGenerateLongLong = GetProcAddress(handle, 'curandGenerateLongLong') global __curandGenerateUniform - try: - __curandGenerateUniform = win32api.GetProcAddress(handle, 
'curandGenerateUniform') - except: - pass + __curandGenerateUniform = GetProcAddress(handle, 'curandGenerateUniform') global __curandGenerateUniformDouble - try: - __curandGenerateUniformDouble = win32api.GetProcAddress(handle, 'curandGenerateUniformDouble') - except: - pass + __curandGenerateUniformDouble = GetProcAddress(handle, 'curandGenerateUniformDouble') global __curandGenerateNormal - try: - __curandGenerateNormal = win32api.GetProcAddress(handle, 'curandGenerateNormal') - except: - pass + __curandGenerateNormal = GetProcAddress(handle, 'curandGenerateNormal') global __curandGenerateNormalDouble - try: - __curandGenerateNormalDouble = win32api.GetProcAddress(handle, 'curandGenerateNormalDouble') - except: - pass + __curandGenerateNormalDouble = GetProcAddress(handle, 'curandGenerateNormalDouble') global __curandGenerateLogNormal - try: - __curandGenerateLogNormal = win32api.GetProcAddress(handle, 'curandGenerateLogNormal') - except: - pass + __curandGenerateLogNormal = GetProcAddress(handle, 'curandGenerateLogNormal') global __curandGenerateLogNormalDouble - try: - __curandGenerateLogNormalDouble = win32api.GetProcAddress(handle, 'curandGenerateLogNormalDouble') - except: - pass + __curandGenerateLogNormalDouble = GetProcAddress(handle, 'curandGenerateLogNormalDouble') global __curandCreatePoissonDistribution - try: - __curandCreatePoissonDistribution = win32api.GetProcAddress(handle, 'curandCreatePoissonDistribution') - except: - pass + __curandCreatePoissonDistribution = GetProcAddress(handle, 'curandCreatePoissonDistribution') global __curandDestroyDistribution - try: - __curandDestroyDistribution = win32api.GetProcAddress(handle, 'curandDestroyDistribution') - except: - pass + __curandDestroyDistribution = GetProcAddress(handle, 'curandDestroyDistribution') global __curandGeneratePoisson - try: - __curandGeneratePoisson = win32api.GetProcAddress(handle, 'curandGeneratePoisson') - except: - pass + __curandGeneratePoisson = GetProcAddress(handle, 'curandGeneratePoisson') global __curandGeneratePoissonMethod - try: - __curandGeneratePoissonMethod = win32api.GetProcAddress(handle, 'curandGeneratePoissonMethod') - except: - pass + __curandGeneratePoissonMethod = GetProcAddress(handle, 'curandGeneratePoissonMethod') global __curandGenerateBinomial - try: - __curandGenerateBinomial = win32api.GetProcAddress(handle, 'curandGenerateBinomial') - except: - pass + __curandGenerateBinomial = GetProcAddress(handle, 'curandGenerateBinomial') global __curandGenerateBinomialMethod - try: - __curandGenerateBinomialMethod = win32api.GetProcAddress(handle, 'curandGenerateBinomialMethod') - except: - pass + __curandGenerateBinomialMethod = GetProcAddress(handle, 'curandGenerateBinomialMethod') global __curandGenerateSeeds - try: - __curandGenerateSeeds = win32api.GetProcAddress(handle, 'curandGenerateSeeds') - except: - pass + __curandGenerateSeeds = GetProcAddress(handle, 'curandGenerateSeeds') global __curandGetDirectionVectors32 - try: - __curandGetDirectionVectors32 = win32api.GetProcAddress(handle, 'curandGetDirectionVectors32') - except: - pass + __curandGetDirectionVectors32 = GetProcAddress(handle, 'curandGetDirectionVectors32') global __curandGetScrambleConstants32 - try: - __curandGetScrambleConstants32 = win32api.GetProcAddress(handle, 'curandGetScrambleConstants32') - except: - pass + __curandGetScrambleConstants32 = GetProcAddress(handle, 'curandGetScrambleConstants32') global __curandGetDirectionVectors64 - try: - __curandGetDirectionVectors64 = win32api.GetProcAddress(handle, 
'curandGetDirectionVectors64') - except: - pass + __curandGetDirectionVectors64 = GetProcAddress(handle, 'curandGetDirectionVectors64') global __curandGetScrambleConstants64 - try: - __curandGetScrambleConstants64 = win32api.GetProcAddress(handle, 'curandGetScrambleConstants64') - except: - pass + __curandGetScrambleConstants64 = GetProcAddress(handle, 'curandGetScrambleConstants64') - __py_curand_init = True - return 0 + __py_curand_init = True + return 0 cdef dict func_ptrs = None diff --git a/nvmath/bindings/_internal/cusolverDn_linux.pyx b/nvmath/bindings/_internal/cusolverDn_linux.pyx index 5c5bcba..1ead531 100644 --- a/nvmath/bindings/_internal/cusolverDn_linux.pyx +++ b/nvmath/bindings/_internal/cusolverDn_linux.pyx @@ -6,10 +6,13 @@ from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### @@ -28,13 +31,31 @@ cdef extern from "" nogil: const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_cusolverDn_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __cusolverDnCreate = NULL cdef void* __cusolverDnDestroy = NULL @@ -421,2641 +442,2625 @@ cdef int _check_or_init_cusolverDn() except -1 nogil: if __py_cusolverDn_init: return 0 - # Load driver to check version cdef void* handle = NULL - handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) - if handle == NULL: - with gil: - err_msg = dlerror() - raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") - if __cuDriverGetVersion == NULL: - with gil: - raise RuntimeError('something went wrong') - cdef int err, driver_ver - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - with gil: - raise RuntimeError('something went wrong') - #dlclose(handle) - handle = NULL - - # Load function - global __cusolverDnCreate - __cusolverDnCreate = dlsym(RTLD_DEFAULT, 'cusolverDnCreate') - if __cusolverDnCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCreate = dlsym(handle, 'cusolverDnCreate') - - global __cusolverDnDestroy - __cusolverDnDestroy = dlsym(RTLD_DEFAULT, 'cusolverDnDestroy') - if __cusolverDnDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDestroy = dlsym(handle, 'cusolverDnDestroy') - - global __cusolverDnSetStream - __cusolverDnSetStream = dlsym(RTLD_DEFAULT, 'cusolverDnSetStream') - if 
__cusolverDnSetStream == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSetStream = dlsym(handle, 'cusolverDnSetStream') - - global __cusolverDnGetStream - __cusolverDnGetStream = dlsym(RTLD_DEFAULT, 'cusolverDnGetStream') - if __cusolverDnGetStream == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnGetStream = dlsym(handle, 'cusolverDnGetStream') - - global __cusolverDnIRSParamsCreate - __cusolverDnIRSParamsCreate = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsCreate') - if __cusolverDnIRSParamsCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsCreate = dlsym(handle, 'cusolverDnIRSParamsCreate') - - global __cusolverDnIRSParamsDestroy - __cusolverDnIRSParamsDestroy = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsDestroy') - if __cusolverDnIRSParamsDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsDestroy = dlsym(handle, 'cusolverDnIRSParamsDestroy') - - global __cusolverDnIRSParamsSetRefinementSolver - __cusolverDnIRSParamsSetRefinementSolver = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetRefinementSolver') - if __cusolverDnIRSParamsSetRefinementSolver == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetRefinementSolver = dlsym(handle, 'cusolverDnIRSParamsSetRefinementSolver') - - global __cusolverDnIRSParamsSetSolverMainPrecision - __cusolverDnIRSParamsSetSolverMainPrecision = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetSolverMainPrecision') - if __cusolverDnIRSParamsSetSolverMainPrecision == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetSolverMainPrecision = dlsym(handle, 'cusolverDnIRSParamsSetSolverMainPrecision') - - global __cusolverDnIRSParamsSetSolverLowestPrecision - __cusolverDnIRSParamsSetSolverLowestPrecision = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetSolverLowestPrecision') - if __cusolverDnIRSParamsSetSolverLowestPrecision == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetSolverLowestPrecision = dlsym(handle, 'cusolverDnIRSParamsSetSolverLowestPrecision') - - global __cusolverDnIRSParamsSetSolverPrecisions - __cusolverDnIRSParamsSetSolverPrecisions = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetSolverPrecisions') - if __cusolverDnIRSParamsSetSolverPrecisions == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetSolverPrecisions = dlsym(handle, 'cusolverDnIRSParamsSetSolverPrecisions') - - global __cusolverDnIRSParamsSetTol - __cusolverDnIRSParamsSetTol = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetTol') - if __cusolverDnIRSParamsSetTol == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetTol = dlsym(handle, 'cusolverDnIRSParamsSetTol') - - global __cusolverDnIRSParamsSetTolInner - __cusolverDnIRSParamsSetTolInner = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetTolInner') - if __cusolverDnIRSParamsSetTolInner == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetTolInner = dlsym(handle, 'cusolverDnIRSParamsSetTolInner') - - global __cusolverDnIRSParamsSetMaxIters - __cusolverDnIRSParamsSetMaxIters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetMaxIters') - if __cusolverDnIRSParamsSetMaxIters == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetMaxIters = dlsym(handle, 'cusolverDnIRSParamsSetMaxIters') - - global __cusolverDnIRSParamsSetMaxItersInner - 
__cusolverDnIRSParamsSetMaxItersInner = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetMaxItersInner') - if __cusolverDnIRSParamsSetMaxItersInner == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsSetMaxItersInner = dlsym(handle, 'cusolverDnIRSParamsSetMaxItersInner') - - global __cusolverDnIRSParamsGetMaxIters - __cusolverDnIRSParamsGetMaxIters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsGetMaxIters') - if __cusolverDnIRSParamsGetMaxIters == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsGetMaxIters = dlsym(handle, 'cusolverDnIRSParamsGetMaxIters') - - global __cusolverDnIRSParamsEnableFallback - __cusolverDnIRSParamsEnableFallback = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsEnableFallback') - if __cusolverDnIRSParamsEnableFallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsEnableFallback = dlsym(handle, 'cusolverDnIRSParamsEnableFallback') - - global __cusolverDnIRSParamsDisableFallback - __cusolverDnIRSParamsDisableFallback = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsDisableFallback') - if __cusolverDnIRSParamsDisableFallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSParamsDisableFallback = dlsym(handle, 'cusolverDnIRSParamsDisableFallback') - - global __cusolverDnIRSInfosDestroy - __cusolverDnIRSInfosDestroy = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosDestroy') - if __cusolverDnIRSInfosDestroy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSInfosDestroy = dlsym(handle, 'cusolverDnIRSInfosDestroy') - - global __cusolverDnIRSInfosCreate - __cusolverDnIRSInfosCreate = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosCreate') - if __cusolverDnIRSInfosCreate == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSInfosCreate = dlsym(handle, 'cusolverDnIRSInfosCreate') - - global __cusolverDnIRSInfosGetNiters - __cusolverDnIRSInfosGetNiters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetNiters') - if __cusolverDnIRSInfosGetNiters == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSInfosGetNiters = dlsym(handle, 'cusolverDnIRSInfosGetNiters') - - global __cusolverDnIRSInfosGetOuterNiters - __cusolverDnIRSInfosGetOuterNiters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetOuterNiters') - if __cusolverDnIRSInfosGetOuterNiters == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSInfosGetOuterNiters = dlsym(handle, 'cusolverDnIRSInfosGetOuterNiters') - - global __cusolverDnIRSInfosRequestResidual - __cusolverDnIRSInfosRequestResidual = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosRequestResidual') - if __cusolverDnIRSInfosRequestResidual == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSInfosRequestResidual = dlsym(handle, 'cusolverDnIRSInfosRequestResidual') - - global __cusolverDnIRSInfosGetResidualHistory - __cusolverDnIRSInfosGetResidualHistory = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetResidualHistory') - if __cusolverDnIRSInfosGetResidualHistory == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSInfosGetResidualHistory = dlsym(handle, 'cusolverDnIRSInfosGetResidualHistory') - - global __cusolverDnIRSInfosGetMaxIters - __cusolverDnIRSInfosGetMaxIters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetMaxIters') - if __cusolverDnIRSInfosGetMaxIters == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSInfosGetMaxIters = dlsym(handle, 
'cusolverDnIRSInfosGetMaxIters') - - global __cusolverDnZZgesv - __cusolverDnZZgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZZgesv') - if __cusolverDnZZgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZZgesv = dlsym(handle, 'cusolverDnZZgesv') - - global __cusolverDnZCgesv - __cusolverDnZCgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZCgesv') - if __cusolverDnZCgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZCgesv = dlsym(handle, 'cusolverDnZCgesv') - - global __cusolverDnZKgesv - __cusolverDnZKgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZKgesv') - if __cusolverDnZKgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZKgesv = dlsym(handle, 'cusolverDnZKgesv') - - global __cusolverDnZEgesv - __cusolverDnZEgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZEgesv') - if __cusolverDnZEgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZEgesv = dlsym(handle, 'cusolverDnZEgesv') - - global __cusolverDnZYgesv - __cusolverDnZYgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZYgesv') - if __cusolverDnZYgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZYgesv = dlsym(handle, 'cusolverDnZYgesv') - - global __cusolverDnCCgesv - __cusolverDnCCgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCCgesv') - if __cusolverDnCCgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCCgesv = dlsym(handle, 'cusolverDnCCgesv') - - global __cusolverDnCEgesv - __cusolverDnCEgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCEgesv') - if __cusolverDnCEgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCEgesv = dlsym(handle, 'cusolverDnCEgesv') - - global __cusolverDnCKgesv - __cusolverDnCKgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCKgesv') - if __cusolverDnCKgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCKgesv = dlsym(handle, 'cusolverDnCKgesv') - - global __cusolverDnCYgesv - __cusolverDnCYgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCYgesv') - if __cusolverDnCYgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCYgesv = dlsym(handle, 'cusolverDnCYgesv') - - global __cusolverDnDDgesv - __cusolverDnDDgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDDgesv') - if __cusolverDnDDgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDDgesv = dlsym(handle, 'cusolverDnDDgesv') - - global __cusolverDnDSgesv - __cusolverDnDSgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDSgesv') - if __cusolverDnDSgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDSgesv = dlsym(handle, 'cusolverDnDSgesv') - - global __cusolverDnDHgesv - __cusolverDnDHgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDHgesv') - if __cusolverDnDHgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDHgesv = dlsym(handle, 'cusolverDnDHgesv') - - global __cusolverDnDBgesv - __cusolverDnDBgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDBgesv') - if __cusolverDnDBgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDBgesv = dlsym(handle, 'cusolverDnDBgesv') - global __cusolverDnDXgesv - __cusolverDnDXgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDXgesv') - if __cusolverDnDXgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDXgesv = dlsym(handle, 'cusolverDnDXgesv') - - global __cusolverDnSSgesv - __cusolverDnSSgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSSgesv') - if __cusolverDnSSgesv == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cusolverDnSSgesv = dlsym(handle, 'cusolverDnSSgesv') - - global __cusolverDnSHgesv - __cusolverDnSHgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSHgesv') - if __cusolverDnSHgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSHgesv = dlsym(handle, 'cusolverDnSHgesv') - - global __cusolverDnSBgesv - __cusolverDnSBgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSBgesv') - if __cusolverDnSBgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSBgesv = dlsym(handle, 'cusolverDnSBgesv') - - global __cusolverDnSXgesv - __cusolverDnSXgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSXgesv') - if __cusolverDnSXgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSXgesv = dlsym(handle, 'cusolverDnSXgesv') - - global __cusolverDnZZgesv_bufferSize - __cusolverDnZZgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZZgesv_bufferSize') - if __cusolverDnZZgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZZgesv_bufferSize = dlsym(handle, 'cusolverDnZZgesv_bufferSize') - - global __cusolverDnZCgesv_bufferSize - __cusolverDnZCgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZCgesv_bufferSize') - if __cusolverDnZCgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZCgesv_bufferSize = dlsym(handle, 'cusolverDnZCgesv_bufferSize') - - global __cusolverDnZKgesv_bufferSize - __cusolverDnZKgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZKgesv_bufferSize') - if __cusolverDnZKgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZKgesv_bufferSize = dlsym(handle, 'cusolverDnZKgesv_bufferSize') - - global __cusolverDnZEgesv_bufferSize - __cusolverDnZEgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZEgesv_bufferSize') - if __cusolverDnZEgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZEgesv_bufferSize = dlsym(handle, 'cusolverDnZEgesv_bufferSize') - - global __cusolverDnZYgesv_bufferSize - __cusolverDnZYgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZYgesv_bufferSize') - if __cusolverDnZYgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZYgesv_bufferSize = dlsym(handle, 'cusolverDnZYgesv_bufferSize') - - global __cusolverDnCCgesv_bufferSize - __cusolverDnCCgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCCgesv_bufferSize') - if __cusolverDnCCgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCCgesv_bufferSize = dlsym(handle, 'cusolverDnCCgesv_bufferSize') - - global __cusolverDnCKgesv_bufferSize - __cusolverDnCKgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCKgesv_bufferSize') - if __cusolverDnCKgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCKgesv_bufferSize = dlsym(handle, 'cusolverDnCKgesv_bufferSize') - - global __cusolverDnCEgesv_bufferSize - __cusolverDnCEgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCEgesv_bufferSize') - if __cusolverDnCEgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCEgesv_bufferSize = dlsym(handle, 'cusolverDnCEgesv_bufferSize') - - global __cusolverDnCYgesv_bufferSize - __cusolverDnCYgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCYgesv_bufferSize') - if __cusolverDnCYgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCYgesv_bufferSize = dlsym(handle, 
'cusolverDnCYgesv_bufferSize') - - global __cusolverDnDDgesv_bufferSize - __cusolverDnDDgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDDgesv_bufferSize') - if __cusolverDnDDgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDDgesv_bufferSize = dlsym(handle, 'cusolverDnDDgesv_bufferSize') - - global __cusolverDnDSgesv_bufferSize - __cusolverDnDSgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDSgesv_bufferSize') - if __cusolverDnDSgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDSgesv_bufferSize = dlsym(handle, 'cusolverDnDSgesv_bufferSize') - - global __cusolverDnDHgesv_bufferSize - __cusolverDnDHgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDHgesv_bufferSize') - if __cusolverDnDHgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDHgesv_bufferSize = dlsym(handle, 'cusolverDnDHgesv_bufferSize') - - global __cusolverDnDBgesv_bufferSize - __cusolverDnDBgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDBgesv_bufferSize') - if __cusolverDnDBgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDBgesv_bufferSize = dlsym(handle, 'cusolverDnDBgesv_bufferSize') - - global __cusolverDnDXgesv_bufferSize - __cusolverDnDXgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDXgesv_bufferSize') - if __cusolverDnDXgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDXgesv_bufferSize = dlsym(handle, 'cusolverDnDXgesv_bufferSize') - - global __cusolverDnSSgesv_bufferSize - __cusolverDnSSgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSSgesv_bufferSize') - if __cusolverDnSSgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSSgesv_bufferSize = dlsym(handle, 'cusolverDnSSgesv_bufferSize') - - global __cusolverDnSHgesv_bufferSize - __cusolverDnSHgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSHgesv_bufferSize') - if __cusolverDnSHgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSHgesv_bufferSize = dlsym(handle, 'cusolverDnSHgesv_bufferSize') - - global __cusolverDnSBgesv_bufferSize - __cusolverDnSBgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSBgesv_bufferSize') - if __cusolverDnSBgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSBgesv_bufferSize = dlsym(handle, 'cusolverDnSBgesv_bufferSize') - - global __cusolverDnSXgesv_bufferSize - __cusolverDnSXgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSXgesv_bufferSize') - if __cusolverDnSXgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSXgesv_bufferSize = dlsym(handle, 'cusolverDnSXgesv_bufferSize') - - global __cusolverDnZZgels - __cusolverDnZZgels = dlsym(RTLD_DEFAULT, 'cusolverDnZZgels') - if __cusolverDnZZgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZZgels = dlsym(handle, 'cusolverDnZZgels') - - global __cusolverDnZCgels - __cusolverDnZCgels = dlsym(RTLD_DEFAULT, 'cusolverDnZCgels') - if __cusolverDnZCgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZCgels = dlsym(handle, 'cusolverDnZCgels') - - global __cusolverDnZKgels - __cusolverDnZKgels = dlsym(RTLD_DEFAULT, 'cusolverDnZKgels') - if __cusolverDnZKgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZKgels = dlsym(handle, 'cusolverDnZKgels') - - global __cusolverDnZEgels - 
__cusolverDnZEgels = dlsym(RTLD_DEFAULT, 'cusolverDnZEgels') - if __cusolverDnZEgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZEgels = dlsym(handle, 'cusolverDnZEgels') - - global __cusolverDnZYgels - __cusolverDnZYgels = dlsym(RTLD_DEFAULT, 'cusolverDnZYgels') - if __cusolverDnZYgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZYgels = dlsym(handle, 'cusolverDnZYgels') - - global __cusolverDnCCgels - __cusolverDnCCgels = dlsym(RTLD_DEFAULT, 'cusolverDnCCgels') - if __cusolverDnCCgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCCgels = dlsym(handle, 'cusolverDnCCgels') - - global __cusolverDnCKgels - __cusolverDnCKgels = dlsym(RTLD_DEFAULT, 'cusolverDnCKgels') - if __cusolverDnCKgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCKgels = dlsym(handle, 'cusolverDnCKgels') - - global __cusolverDnCEgels - __cusolverDnCEgels = dlsym(RTLD_DEFAULT, 'cusolverDnCEgels') - if __cusolverDnCEgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCEgels = dlsym(handle, 'cusolverDnCEgels') - - global __cusolverDnCYgels - __cusolverDnCYgels = dlsym(RTLD_DEFAULT, 'cusolverDnCYgels') - if __cusolverDnCYgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCYgels = dlsym(handle, 'cusolverDnCYgels') - - global __cusolverDnDDgels - __cusolverDnDDgels = dlsym(RTLD_DEFAULT, 'cusolverDnDDgels') - if __cusolverDnDDgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDDgels = dlsym(handle, 'cusolverDnDDgels') - - global __cusolverDnDSgels - __cusolverDnDSgels = dlsym(RTLD_DEFAULT, 'cusolverDnDSgels') - if __cusolverDnDSgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDSgels = dlsym(handle, 'cusolverDnDSgels') - - global __cusolverDnDHgels - __cusolverDnDHgels = dlsym(RTLD_DEFAULT, 'cusolverDnDHgels') - if __cusolverDnDHgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDHgels = dlsym(handle, 'cusolverDnDHgels') - - global __cusolverDnDBgels - __cusolverDnDBgels = dlsym(RTLD_DEFAULT, 'cusolverDnDBgels') - if __cusolverDnDBgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDBgels = dlsym(handle, 'cusolverDnDBgels') - - global __cusolverDnDXgels - __cusolverDnDXgels = dlsym(RTLD_DEFAULT, 'cusolverDnDXgels') - if __cusolverDnDXgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDXgels = dlsym(handle, 'cusolverDnDXgels') - - global __cusolverDnSSgels - __cusolverDnSSgels = dlsym(RTLD_DEFAULT, 'cusolverDnSSgels') - if __cusolverDnSSgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSSgels = dlsym(handle, 'cusolverDnSSgels') - - global __cusolverDnSHgels - __cusolverDnSHgels = dlsym(RTLD_DEFAULT, 'cusolverDnSHgels') - if __cusolverDnSHgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSHgels = dlsym(handle, 'cusolverDnSHgels') - - global __cusolverDnSBgels - __cusolverDnSBgels = dlsym(RTLD_DEFAULT, 'cusolverDnSBgels') - if __cusolverDnSBgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSBgels = dlsym(handle, 'cusolverDnSBgels') - - global __cusolverDnSXgels - __cusolverDnSXgels = dlsym(RTLD_DEFAULT, 'cusolverDnSXgels') - if __cusolverDnSXgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSXgels = dlsym(handle, 
'cusolverDnSXgels') - - global __cusolverDnZZgels_bufferSize - __cusolverDnZZgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZZgels_bufferSize') - if __cusolverDnZZgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZZgels_bufferSize = dlsym(handle, 'cusolverDnZZgels_bufferSize') - - global __cusolverDnZCgels_bufferSize - __cusolverDnZCgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZCgels_bufferSize') - if __cusolverDnZCgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZCgels_bufferSize = dlsym(handle, 'cusolverDnZCgels_bufferSize') - - global __cusolverDnZKgels_bufferSize - __cusolverDnZKgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZKgels_bufferSize') - if __cusolverDnZKgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZKgels_bufferSize = dlsym(handle, 'cusolverDnZKgels_bufferSize') - - global __cusolverDnZEgels_bufferSize - __cusolverDnZEgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZEgels_bufferSize') - if __cusolverDnZEgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZEgels_bufferSize = dlsym(handle, 'cusolverDnZEgels_bufferSize') - - global __cusolverDnZYgels_bufferSize - __cusolverDnZYgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZYgels_bufferSize') - if __cusolverDnZYgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZYgels_bufferSize = dlsym(handle, 'cusolverDnZYgels_bufferSize') - - global __cusolverDnCCgels_bufferSize - __cusolverDnCCgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCCgels_bufferSize') - if __cusolverDnCCgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCCgels_bufferSize = dlsym(handle, 'cusolverDnCCgels_bufferSize') - - global __cusolverDnCKgels_bufferSize - __cusolverDnCKgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCKgels_bufferSize') - if __cusolverDnCKgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCKgels_bufferSize = dlsym(handle, 'cusolverDnCKgels_bufferSize') - - global __cusolverDnCEgels_bufferSize - __cusolverDnCEgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCEgels_bufferSize') - if __cusolverDnCEgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCEgels_bufferSize = dlsym(handle, 'cusolverDnCEgels_bufferSize') - - global __cusolverDnCYgels_bufferSize - __cusolverDnCYgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCYgels_bufferSize') - if __cusolverDnCYgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCYgels_bufferSize = dlsym(handle, 'cusolverDnCYgels_bufferSize') - - global __cusolverDnDDgels_bufferSize - __cusolverDnDDgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDDgels_bufferSize') - if __cusolverDnDDgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDDgels_bufferSize = dlsym(handle, 'cusolverDnDDgels_bufferSize') - - global __cusolverDnDSgels_bufferSize - __cusolverDnDSgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDSgels_bufferSize') - if __cusolverDnDSgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDSgels_bufferSize = dlsym(handle, 'cusolverDnDSgels_bufferSize') - - global __cusolverDnDHgels_bufferSize - __cusolverDnDHgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDHgels_bufferSize') - if __cusolverDnDHgels_bufferSize == 
NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDHgels_bufferSize = dlsym(handle, 'cusolverDnDHgels_bufferSize') - - global __cusolverDnDBgels_bufferSize - __cusolverDnDBgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDBgels_bufferSize') - if __cusolverDnDBgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDBgels_bufferSize = dlsym(handle, 'cusolverDnDBgels_bufferSize') - - global __cusolverDnDXgels_bufferSize - __cusolverDnDXgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDXgels_bufferSize') - if __cusolverDnDXgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDXgels_bufferSize = dlsym(handle, 'cusolverDnDXgels_bufferSize') - - global __cusolverDnSSgels_bufferSize - __cusolverDnSSgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSSgels_bufferSize') - if __cusolverDnSSgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSSgels_bufferSize = dlsym(handle, 'cusolverDnSSgels_bufferSize') - - global __cusolverDnSHgels_bufferSize - __cusolverDnSHgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSHgels_bufferSize') - if __cusolverDnSHgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSHgels_bufferSize = dlsym(handle, 'cusolverDnSHgels_bufferSize') - - global __cusolverDnSBgels_bufferSize - __cusolverDnSBgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSBgels_bufferSize') - if __cusolverDnSBgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSBgels_bufferSize = dlsym(handle, 'cusolverDnSBgels_bufferSize') - - global __cusolverDnSXgels_bufferSize - __cusolverDnSXgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSXgels_bufferSize') - if __cusolverDnSXgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSXgels_bufferSize = dlsym(handle, 'cusolverDnSXgels_bufferSize') - - global __cusolverDnIRSXgesv - __cusolverDnIRSXgesv = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgesv') - if __cusolverDnIRSXgesv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSXgesv = dlsym(handle, 'cusolverDnIRSXgesv') - - global __cusolverDnIRSXgesv_bufferSize - __cusolverDnIRSXgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgesv_bufferSize') - if __cusolverDnIRSXgesv_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSXgesv_bufferSize = dlsym(handle, 'cusolverDnIRSXgesv_bufferSize') - - global __cusolverDnIRSXgels - __cusolverDnIRSXgels = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgels') - if __cusolverDnIRSXgels == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSXgels = dlsym(handle, 'cusolverDnIRSXgels') - - global __cusolverDnIRSXgels_bufferSize - __cusolverDnIRSXgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgels_bufferSize') - if __cusolverDnIRSXgels_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnIRSXgels_bufferSize = dlsym(handle, 'cusolverDnIRSXgels_bufferSize') - - global __cusolverDnSpotrf_bufferSize - __cusolverDnSpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrf_bufferSize') - if __cusolverDnSpotrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSpotrf_bufferSize = dlsym(handle, 'cusolverDnSpotrf_bufferSize') - - global __cusolverDnDpotrf_bufferSize - __cusolverDnDpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrf_bufferSize') - if 
__cusolverDnDpotrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDpotrf_bufferSize = dlsym(handle, 'cusolverDnDpotrf_bufferSize') - - global __cusolverDnCpotrf_bufferSize - __cusolverDnCpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrf_bufferSize') - if __cusolverDnCpotrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCpotrf_bufferSize = dlsym(handle, 'cusolverDnCpotrf_bufferSize') - - global __cusolverDnZpotrf_bufferSize - __cusolverDnZpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrf_bufferSize') - if __cusolverDnZpotrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZpotrf_bufferSize = dlsym(handle, 'cusolverDnZpotrf_bufferSize') - - global __cusolverDnSpotrf - __cusolverDnSpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrf') - if __cusolverDnSpotrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSpotrf = dlsym(handle, 'cusolverDnSpotrf') - - global __cusolverDnDpotrf - __cusolverDnDpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrf') - if __cusolverDnDpotrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDpotrf = dlsym(handle, 'cusolverDnDpotrf') - - global __cusolverDnCpotrf - __cusolverDnCpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrf') - if __cusolverDnCpotrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCpotrf = dlsym(handle, 'cusolverDnCpotrf') - - global __cusolverDnZpotrf - __cusolverDnZpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrf') - if __cusolverDnZpotrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZpotrf = dlsym(handle, 'cusolverDnZpotrf') - - global __cusolverDnSpotrs - __cusolverDnSpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrs') - if __cusolverDnSpotrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSpotrs = dlsym(handle, 'cusolverDnSpotrs') - - global __cusolverDnDpotrs - __cusolverDnDpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrs') - if __cusolverDnDpotrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDpotrs = dlsym(handle, 'cusolverDnDpotrs') - - global __cusolverDnCpotrs - __cusolverDnCpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrs') - if __cusolverDnCpotrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCpotrs = dlsym(handle, 'cusolverDnCpotrs') - - global __cusolverDnZpotrs - __cusolverDnZpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrs') - if __cusolverDnZpotrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZpotrs = dlsym(handle, 'cusolverDnZpotrs') - - global __cusolverDnSpotrfBatched - __cusolverDnSpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrfBatched') - if __cusolverDnSpotrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSpotrfBatched = dlsym(handle, 'cusolverDnSpotrfBatched') - - global __cusolverDnDpotrfBatched - __cusolverDnDpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrfBatched') - if __cusolverDnDpotrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDpotrfBatched = dlsym(handle, 'cusolverDnDpotrfBatched') - - global __cusolverDnCpotrfBatched - __cusolverDnCpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrfBatched') - if __cusolverDnCpotrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCpotrfBatched = dlsym(handle, 'cusolverDnCpotrfBatched') - 
- global __cusolverDnZpotrfBatched - __cusolverDnZpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrfBatched') - if __cusolverDnZpotrfBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZpotrfBatched = dlsym(handle, 'cusolverDnZpotrfBatched') - - global __cusolverDnSpotrsBatched - __cusolverDnSpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrsBatched') - if __cusolverDnSpotrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSpotrsBatched = dlsym(handle, 'cusolverDnSpotrsBatched') - - global __cusolverDnDpotrsBatched - __cusolverDnDpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrsBatched') - if __cusolverDnDpotrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDpotrsBatched = dlsym(handle, 'cusolverDnDpotrsBatched') - - global __cusolverDnCpotrsBatched - __cusolverDnCpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrsBatched') - if __cusolverDnCpotrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCpotrsBatched = dlsym(handle, 'cusolverDnCpotrsBatched') - - global __cusolverDnZpotrsBatched - __cusolverDnZpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrsBatched') - if __cusolverDnZpotrsBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZpotrsBatched = dlsym(handle, 'cusolverDnZpotrsBatched') - - global __cusolverDnSpotri_bufferSize - __cusolverDnSpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSpotri_bufferSize') - if __cusolverDnSpotri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSpotri_bufferSize = dlsym(handle, 'cusolverDnSpotri_bufferSize') - - global __cusolverDnDpotri_bufferSize - __cusolverDnDpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDpotri_bufferSize') - if __cusolverDnDpotri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDpotri_bufferSize = dlsym(handle, 'cusolverDnDpotri_bufferSize') - - global __cusolverDnCpotri_bufferSize - __cusolverDnCpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCpotri_bufferSize') - if __cusolverDnCpotri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCpotri_bufferSize = dlsym(handle, 'cusolverDnCpotri_bufferSize') - - global __cusolverDnZpotri_bufferSize - __cusolverDnZpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZpotri_bufferSize') - if __cusolverDnZpotri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZpotri_bufferSize = dlsym(handle, 'cusolverDnZpotri_bufferSize') - - global __cusolverDnSpotri - __cusolverDnSpotri = dlsym(RTLD_DEFAULT, 'cusolverDnSpotri') - if __cusolverDnSpotri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSpotri = dlsym(handle, 'cusolverDnSpotri') - - global __cusolverDnDpotri - __cusolverDnDpotri = dlsym(RTLD_DEFAULT, 'cusolverDnDpotri') - if __cusolverDnDpotri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDpotri = dlsym(handle, 'cusolverDnDpotri') - - global __cusolverDnCpotri - __cusolverDnCpotri = dlsym(RTLD_DEFAULT, 'cusolverDnCpotri') - if __cusolverDnCpotri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCpotri = dlsym(handle, 'cusolverDnCpotri') - - global __cusolverDnZpotri - __cusolverDnZpotri = dlsym(RTLD_DEFAULT, 'cusolverDnZpotri') - if __cusolverDnZpotri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cusolverDnZpotri = dlsym(handle, 'cusolverDnZpotri') - - global __cusolverDnSlauum_bufferSize - __cusolverDnSlauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSlauum_bufferSize') - if __cusolverDnSlauum_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSlauum_bufferSize = dlsym(handle, 'cusolverDnSlauum_bufferSize') - - global __cusolverDnDlauum_bufferSize - __cusolverDnDlauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDlauum_bufferSize') - if __cusolverDnDlauum_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDlauum_bufferSize = dlsym(handle, 'cusolverDnDlauum_bufferSize') - - global __cusolverDnClauum_bufferSize - __cusolverDnClauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnClauum_bufferSize') - if __cusolverDnClauum_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnClauum_bufferSize = dlsym(handle, 'cusolverDnClauum_bufferSize') - - global __cusolverDnZlauum_bufferSize - __cusolverDnZlauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZlauum_bufferSize') - if __cusolverDnZlauum_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZlauum_bufferSize = dlsym(handle, 'cusolverDnZlauum_bufferSize') - - global __cusolverDnSlauum - __cusolverDnSlauum = dlsym(RTLD_DEFAULT, 'cusolverDnSlauum') - if __cusolverDnSlauum == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSlauum = dlsym(handle, 'cusolverDnSlauum') - - global __cusolverDnDlauum - __cusolverDnDlauum = dlsym(RTLD_DEFAULT, 'cusolverDnDlauum') - if __cusolverDnDlauum == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDlauum = dlsym(handle, 'cusolverDnDlauum') - - global __cusolverDnClauum - __cusolverDnClauum = dlsym(RTLD_DEFAULT, 'cusolverDnClauum') - if __cusolverDnClauum == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnClauum = dlsym(handle, 'cusolverDnClauum') - - global __cusolverDnZlauum - __cusolverDnZlauum = dlsym(RTLD_DEFAULT, 'cusolverDnZlauum') - if __cusolverDnZlauum == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZlauum = dlsym(handle, 'cusolverDnZlauum') - - global __cusolverDnSgetrf_bufferSize - __cusolverDnSgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgetrf_bufferSize') - if __cusolverDnSgetrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgetrf_bufferSize = dlsym(handle, 'cusolverDnSgetrf_bufferSize') - - global __cusolverDnDgetrf_bufferSize - __cusolverDnDgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgetrf_bufferSize') - if __cusolverDnDgetrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgetrf_bufferSize = dlsym(handle, 'cusolverDnDgetrf_bufferSize') - - global __cusolverDnCgetrf_bufferSize - __cusolverDnCgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgetrf_bufferSize') - if __cusolverDnCgetrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgetrf_bufferSize = dlsym(handle, 'cusolverDnCgetrf_bufferSize') - - global __cusolverDnZgetrf_bufferSize - __cusolverDnZgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgetrf_bufferSize') - if __cusolverDnZgetrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgetrf_bufferSize = dlsym(handle, 'cusolverDnZgetrf_bufferSize') - - global __cusolverDnSgetrf - __cusolverDnSgetrf = dlsym(RTLD_DEFAULT, 
'cusolverDnSgetrf') - if __cusolverDnSgetrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgetrf = dlsym(handle, 'cusolverDnSgetrf') - - global __cusolverDnDgetrf - __cusolverDnDgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnDgetrf') - if __cusolverDnDgetrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgetrf = dlsym(handle, 'cusolverDnDgetrf') - - global __cusolverDnCgetrf - __cusolverDnCgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnCgetrf') - if __cusolverDnCgetrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgetrf = dlsym(handle, 'cusolverDnCgetrf') - - global __cusolverDnZgetrf - __cusolverDnZgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnZgetrf') - if __cusolverDnZgetrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgetrf = dlsym(handle, 'cusolverDnZgetrf') - - global __cusolverDnSlaswp - __cusolverDnSlaswp = dlsym(RTLD_DEFAULT, 'cusolverDnSlaswp') - if __cusolverDnSlaswp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSlaswp = dlsym(handle, 'cusolverDnSlaswp') - - global __cusolverDnDlaswp - __cusolverDnDlaswp = dlsym(RTLD_DEFAULT, 'cusolverDnDlaswp') - if __cusolverDnDlaswp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDlaswp = dlsym(handle, 'cusolverDnDlaswp') - - global __cusolverDnClaswp - __cusolverDnClaswp = dlsym(RTLD_DEFAULT, 'cusolverDnClaswp') - if __cusolverDnClaswp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnClaswp = dlsym(handle, 'cusolverDnClaswp') - - global __cusolverDnZlaswp - __cusolverDnZlaswp = dlsym(RTLD_DEFAULT, 'cusolverDnZlaswp') - if __cusolverDnZlaswp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZlaswp = dlsym(handle, 'cusolverDnZlaswp') - - global __cusolverDnSgetrs - __cusolverDnSgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnSgetrs') - if __cusolverDnSgetrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgetrs = dlsym(handle, 'cusolverDnSgetrs') - - global __cusolverDnDgetrs - __cusolverDnDgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnDgetrs') - if __cusolverDnDgetrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgetrs = dlsym(handle, 'cusolverDnDgetrs') - - global __cusolverDnCgetrs - __cusolverDnCgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnCgetrs') - if __cusolverDnCgetrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgetrs = dlsym(handle, 'cusolverDnCgetrs') - - global __cusolverDnZgetrs - __cusolverDnZgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnZgetrs') - if __cusolverDnZgetrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgetrs = dlsym(handle, 'cusolverDnZgetrs') - - global __cusolverDnSgeqrf_bufferSize - __cusolverDnSgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgeqrf_bufferSize') - if __cusolverDnSgeqrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgeqrf_bufferSize = dlsym(handle, 'cusolverDnSgeqrf_bufferSize') - - global __cusolverDnDgeqrf_bufferSize - __cusolverDnDgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgeqrf_bufferSize') - if __cusolverDnDgeqrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgeqrf_bufferSize = dlsym(handle, 'cusolverDnDgeqrf_bufferSize') - - global __cusolverDnCgeqrf_bufferSize - __cusolverDnCgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgeqrf_bufferSize') - if 
__cusolverDnCgeqrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgeqrf_bufferSize = dlsym(handle, 'cusolverDnCgeqrf_bufferSize') - - global __cusolverDnZgeqrf_bufferSize - __cusolverDnZgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgeqrf_bufferSize') - if __cusolverDnZgeqrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgeqrf_bufferSize = dlsym(handle, 'cusolverDnZgeqrf_bufferSize') - - global __cusolverDnSgeqrf - __cusolverDnSgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnSgeqrf') - if __cusolverDnSgeqrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgeqrf = dlsym(handle, 'cusolverDnSgeqrf') - - global __cusolverDnDgeqrf - __cusolverDnDgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnDgeqrf') - if __cusolverDnDgeqrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgeqrf = dlsym(handle, 'cusolverDnDgeqrf') - - global __cusolverDnCgeqrf - __cusolverDnCgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnCgeqrf') - if __cusolverDnCgeqrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgeqrf = dlsym(handle, 'cusolverDnCgeqrf') - - global __cusolverDnZgeqrf - __cusolverDnZgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnZgeqrf') - if __cusolverDnZgeqrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgeqrf = dlsym(handle, 'cusolverDnZgeqrf') - - global __cusolverDnSorgqr_bufferSize - __cusolverDnSorgqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSorgqr_bufferSize') - if __cusolverDnSorgqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSorgqr_bufferSize = dlsym(handle, 'cusolverDnSorgqr_bufferSize') - - global __cusolverDnDorgqr_bufferSize - __cusolverDnDorgqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDorgqr_bufferSize') - if __cusolverDnDorgqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDorgqr_bufferSize = dlsym(handle, 'cusolverDnDorgqr_bufferSize') - - global __cusolverDnCungqr_bufferSize - __cusolverDnCungqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCungqr_bufferSize') - if __cusolverDnCungqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCungqr_bufferSize = dlsym(handle, 'cusolverDnCungqr_bufferSize') - - global __cusolverDnZungqr_bufferSize - __cusolverDnZungqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZungqr_bufferSize') - if __cusolverDnZungqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZungqr_bufferSize = dlsym(handle, 'cusolverDnZungqr_bufferSize') - - global __cusolverDnSorgqr - __cusolverDnSorgqr = dlsym(RTLD_DEFAULT, 'cusolverDnSorgqr') - if __cusolverDnSorgqr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSorgqr = dlsym(handle, 'cusolverDnSorgqr') - - global __cusolverDnDorgqr - __cusolverDnDorgqr = dlsym(RTLD_DEFAULT, 'cusolverDnDorgqr') - if __cusolverDnDorgqr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDorgqr = dlsym(handle, 'cusolverDnDorgqr') - - global __cusolverDnCungqr - __cusolverDnCungqr = dlsym(RTLD_DEFAULT, 'cusolverDnCungqr') - if __cusolverDnCungqr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCungqr = dlsym(handle, 'cusolverDnCungqr') - - global __cusolverDnZungqr - __cusolverDnZungqr = dlsym(RTLD_DEFAULT, 'cusolverDnZungqr') - if __cusolverDnZungqr == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cusolverDnZungqr = dlsym(handle, 'cusolverDnZungqr') - - global __cusolverDnSormqr_bufferSize - __cusolverDnSormqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSormqr_bufferSize') - if __cusolverDnSormqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSormqr_bufferSize = dlsym(handle, 'cusolverDnSormqr_bufferSize') - - global __cusolverDnDormqr_bufferSize - __cusolverDnDormqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDormqr_bufferSize') - if __cusolverDnDormqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDormqr_bufferSize = dlsym(handle, 'cusolverDnDormqr_bufferSize') - - global __cusolverDnCunmqr_bufferSize - __cusolverDnCunmqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCunmqr_bufferSize') - if __cusolverDnCunmqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCunmqr_bufferSize = dlsym(handle, 'cusolverDnCunmqr_bufferSize') - - global __cusolverDnZunmqr_bufferSize - __cusolverDnZunmqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZunmqr_bufferSize') - if __cusolverDnZunmqr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZunmqr_bufferSize = dlsym(handle, 'cusolverDnZunmqr_bufferSize') - - global __cusolverDnSormqr - __cusolverDnSormqr = dlsym(RTLD_DEFAULT, 'cusolverDnSormqr') - if __cusolverDnSormqr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSormqr = dlsym(handle, 'cusolverDnSormqr') - - global __cusolverDnDormqr - __cusolverDnDormqr = dlsym(RTLD_DEFAULT, 'cusolverDnDormqr') - if __cusolverDnDormqr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDormqr = dlsym(handle, 'cusolverDnDormqr') - - global __cusolverDnCunmqr - __cusolverDnCunmqr = dlsym(RTLD_DEFAULT, 'cusolverDnCunmqr') - if __cusolverDnCunmqr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCunmqr = dlsym(handle, 'cusolverDnCunmqr') - - global __cusolverDnZunmqr - __cusolverDnZunmqr = dlsym(RTLD_DEFAULT, 'cusolverDnZunmqr') - if __cusolverDnZunmqr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZunmqr = dlsym(handle, 'cusolverDnZunmqr') - - global __cusolverDnSsytrf_bufferSize - __cusolverDnSsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsytrf_bufferSize') - if __cusolverDnSsytrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsytrf_bufferSize = dlsym(handle, 'cusolverDnSsytrf_bufferSize') - - global __cusolverDnDsytrf_bufferSize - __cusolverDnDsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrf_bufferSize') - if __cusolverDnDsytrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsytrf_bufferSize = dlsym(handle, 'cusolverDnDsytrf_bufferSize') - - global __cusolverDnCsytrf_bufferSize - __cusolverDnCsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCsytrf_bufferSize') - if __cusolverDnCsytrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCsytrf_bufferSize = dlsym(handle, 'cusolverDnCsytrf_bufferSize') - - global __cusolverDnZsytrf_bufferSize - __cusolverDnZsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZsytrf_bufferSize') - if __cusolverDnZsytrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZsytrf_bufferSize = dlsym(handle, 'cusolverDnZsytrf_bufferSize') - - global __cusolverDnSsytrf - __cusolverDnSsytrf = 
dlsym(RTLD_DEFAULT, 'cusolverDnSsytrf') - if __cusolverDnSsytrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsytrf = dlsym(handle, 'cusolverDnSsytrf') - - global __cusolverDnDsytrf - __cusolverDnDsytrf = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrf') - if __cusolverDnDsytrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsytrf = dlsym(handle, 'cusolverDnDsytrf') - - global __cusolverDnCsytrf - __cusolverDnCsytrf = dlsym(RTLD_DEFAULT, 'cusolverDnCsytrf') - if __cusolverDnCsytrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCsytrf = dlsym(handle, 'cusolverDnCsytrf') - - global __cusolverDnZsytrf - __cusolverDnZsytrf = dlsym(RTLD_DEFAULT, 'cusolverDnZsytrf') - if __cusolverDnZsytrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZsytrf = dlsym(handle, 'cusolverDnZsytrf') - - global __cusolverDnSsytri_bufferSize - __cusolverDnSsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsytri_bufferSize') - if __cusolverDnSsytri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsytri_bufferSize = dlsym(handle, 'cusolverDnSsytri_bufferSize') - - global __cusolverDnDsytri_bufferSize - __cusolverDnDsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsytri_bufferSize') - if __cusolverDnDsytri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsytri_bufferSize = dlsym(handle, 'cusolverDnDsytri_bufferSize') - - global __cusolverDnCsytri_bufferSize - __cusolverDnCsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCsytri_bufferSize') - if __cusolverDnCsytri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCsytri_bufferSize = dlsym(handle, 'cusolverDnCsytri_bufferSize') - - global __cusolverDnZsytri_bufferSize - __cusolverDnZsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZsytri_bufferSize') - if __cusolverDnZsytri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZsytri_bufferSize = dlsym(handle, 'cusolverDnZsytri_bufferSize') - - global __cusolverDnSsytri - __cusolverDnSsytri = dlsym(RTLD_DEFAULT, 'cusolverDnSsytri') - if __cusolverDnSsytri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsytri = dlsym(handle, 'cusolverDnSsytri') - - global __cusolverDnDsytri - __cusolverDnDsytri = dlsym(RTLD_DEFAULT, 'cusolverDnDsytri') - if __cusolverDnDsytri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsytri = dlsym(handle, 'cusolverDnDsytri') - - global __cusolverDnCsytri - __cusolverDnCsytri = dlsym(RTLD_DEFAULT, 'cusolverDnCsytri') - if __cusolverDnCsytri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCsytri = dlsym(handle, 'cusolverDnCsytri') - - global __cusolverDnZsytri - __cusolverDnZsytri = dlsym(RTLD_DEFAULT, 'cusolverDnZsytri') - if __cusolverDnZsytri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZsytri = dlsym(handle, 'cusolverDnZsytri') - - global __cusolverDnSgebrd_bufferSize - __cusolverDnSgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgebrd_bufferSize') - if __cusolverDnSgebrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgebrd_bufferSize = dlsym(handle, 'cusolverDnSgebrd_bufferSize') - - global __cusolverDnDgebrd_bufferSize - __cusolverDnDgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgebrd_bufferSize') - if 
__cusolverDnDgebrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgebrd_bufferSize = dlsym(handle, 'cusolverDnDgebrd_bufferSize') - - global __cusolverDnCgebrd_bufferSize - __cusolverDnCgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgebrd_bufferSize') - if __cusolverDnCgebrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgebrd_bufferSize = dlsym(handle, 'cusolverDnCgebrd_bufferSize') - - global __cusolverDnZgebrd_bufferSize - __cusolverDnZgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgebrd_bufferSize') - if __cusolverDnZgebrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgebrd_bufferSize = dlsym(handle, 'cusolverDnZgebrd_bufferSize') - - global __cusolverDnSgebrd - __cusolverDnSgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnSgebrd') - if __cusolverDnSgebrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgebrd = dlsym(handle, 'cusolverDnSgebrd') - - global __cusolverDnDgebrd - __cusolverDnDgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnDgebrd') - if __cusolverDnDgebrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgebrd = dlsym(handle, 'cusolverDnDgebrd') - - global __cusolverDnCgebrd - __cusolverDnCgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnCgebrd') - if __cusolverDnCgebrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgebrd = dlsym(handle, 'cusolverDnCgebrd') - - global __cusolverDnZgebrd - __cusolverDnZgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnZgebrd') - if __cusolverDnZgebrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgebrd = dlsym(handle, 'cusolverDnZgebrd') - - global __cusolverDnSorgbr_bufferSize - __cusolverDnSorgbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSorgbr_bufferSize') - if __cusolverDnSorgbr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSorgbr_bufferSize = dlsym(handle, 'cusolverDnSorgbr_bufferSize') - - global __cusolverDnDorgbr_bufferSize - __cusolverDnDorgbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDorgbr_bufferSize') - if __cusolverDnDorgbr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDorgbr_bufferSize = dlsym(handle, 'cusolverDnDorgbr_bufferSize') - - global __cusolverDnCungbr_bufferSize - __cusolverDnCungbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCungbr_bufferSize') - if __cusolverDnCungbr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCungbr_bufferSize = dlsym(handle, 'cusolverDnCungbr_bufferSize') - - global __cusolverDnZungbr_bufferSize - __cusolverDnZungbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZungbr_bufferSize') - if __cusolverDnZungbr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZungbr_bufferSize = dlsym(handle, 'cusolverDnZungbr_bufferSize') - - global __cusolverDnSorgbr - __cusolverDnSorgbr = dlsym(RTLD_DEFAULT, 'cusolverDnSorgbr') - if __cusolverDnSorgbr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSorgbr = dlsym(handle, 'cusolverDnSorgbr') - - global __cusolverDnDorgbr - __cusolverDnDorgbr = dlsym(RTLD_DEFAULT, 'cusolverDnDorgbr') - if __cusolverDnDorgbr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDorgbr = dlsym(handle, 'cusolverDnDorgbr') - - global __cusolverDnCungbr - __cusolverDnCungbr = dlsym(RTLD_DEFAULT, 'cusolverDnCungbr') - if 
__cusolverDnCungbr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCungbr = dlsym(handle, 'cusolverDnCungbr') - - global __cusolverDnZungbr - __cusolverDnZungbr = dlsym(RTLD_DEFAULT, 'cusolverDnZungbr') - if __cusolverDnZungbr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZungbr = dlsym(handle, 'cusolverDnZungbr') - - global __cusolverDnSsytrd_bufferSize - __cusolverDnSsytrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsytrd_bufferSize') - if __cusolverDnSsytrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsytrd_bufferSize = dlsym(handle, 'cusolverDnSsytrd_bufferSize') - - global __cusolverDnDsytrd_bufferSize - __cusolverDnDsytrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrd_bufferSize') - if __cusolverDnDsytrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsytrd_bufferSize = dlsym(handle, 'cusolverDnDsytrd_bufferSize') - - global __cusolverDnChetrd_bufferSize - __cusolverDnChetrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChetrd_bufferSize') - if __cusolverDnChetrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChetrd_bufferSize = dlsym(handle, 'cusolverDnChetrd_bufferSize') - - global __cusolverDnZhetrd_bufferSize - __cusolverDnZhetrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhetrd_bufferSize') - if __cusolverDnZhetrd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhetrd_bufferSize = dlsym(handle, 'cusolverDnZhetrd_bufferSize') - - global __cusolverDnSsytrd - __cusolverDnSsytrd = dlsym(RTLD_DEFAULT, 'cusolverDnSsytrd') - if __cusolverDnSsytrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsytrd = dlsym(handle, 'cusolverDnSsytrd') - - global __cusolverDnDsytrd - __cusolverDnDsytrd = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrd') - if __cusolverDnDsytrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsytrd = dlsym(handle, 'cusolverDnDsytrd') - - global __cusolverDnChetrd - __cusolverDnChetrd = dlsym(RTLD_DEFAULT, 'cusolverDnChetrd') - if __cusolverDnChetrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChetrd = dlsym(handle, 'cusolverDnChetrd') - - global __cusolverDnZhetrd - __cusolverDnZhetrd = dlsym(RTLD_DEFAULT, 'cusolverDnZhetrd') - if __cusolverDnZhetrd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhetrd = dlsym(handle, 'cusolverDnZhetrd') - - global __cusolverDnSorgtr_bufferSize - __cusolverDnSorgtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSorgtr_bufferSize') - if __cusolverDnSorgtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSorgtr_bufferSize = dlsym(handle, 'cusolverDnSorgtr_bufferSize') - - global __cusolverDnDorgtr_bufferSize - __cusolverDnDorgtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDorgtr_bufferSize') - if __cusolverDnDorgtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDorgtr_bufferSize = dlsym(handle, 'cusolverDnDorgtr_bufferSize') - - global __cusolverDnCungtr_bufferSize - __cusolverDnCungtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCungtr_bufferSize') - if __cusolverDnCungtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCungtr_bufferSize = dlsym(handle, 'cusolverDnCungtr_bufferSize') - - global __cusolverDnZungtr_bufferSize - 
__cusolverDnZungtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZungtr_bufferSize') - if __cusolverDnZungtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZungtr_bufferSize = dlsym(handle, 'cusolverDnZungtr_bufferSize') - - global __cusolverDnSorgtr - __cusolverDnSorgtr = dlsym(RTLD_DEFAULT, 'cusolverDnSorgtr') - if __cusolverDnSorgtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSorgtr = dlsym(handle, 'cusolverDnSorgtr') - - global __cusolverDnDorgtr - __cusolverDnDorgtr = dlsym(RTLD_DEFAULT, 'cusolverDnDorgtr') - if __cusolverDnDorgtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDorgtr = dlsym(handle, 'cusolverDnDorgtr') - - global __cusolverDnCungtr - __cusolverDnCungtr = dlsym(RTLD_DEFAULT, 'cusolverDnCungtr') - if __cusolverDnCungtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCungtr = dlsym(handle, 'cusolverDnCungtr') - - global __cusolverDnZungtr - __cusolverDnZungtr = dlsym(RTLD_DEFAULT, 'cusolverDnZungtr') - if __cusolverDnZungtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZungtr = dlsym(handle, 'cusolverDnZungtr') - - global __cusolverDnSormtr_bufferSize - __cusolverDnSormtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSormtr_bufferSize') - if __cusolverDnSormtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSormtr_bufferSize = dlsym(handle, 'cusolverDnSormtr_bufferSize') - - global __cusolverDnDormtr_bufferSize - __cusolverDnDormtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDormtr_bufferSize') - if __cusolverDnDormtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDormtr_bufferSize = dlsym(handle, 'cusolverDnDormtr_bufferSize') - - global __cusolverDnCunmtr_bufferSize - __cusolverDnCunmtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCunmtr_bufferSize') - if __cusolverDnCunmtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCunmtr_bufferSize = dlsym(handle, 'cusolverDnCunmtr_bufferSize') - - global __cusolverDnZunmtr_bufferSize - __cusolverDnZunmtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZunmtr_bufferSize') - if __cusolverDnZunmtr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZunmtr_bufferSize = dlsym(handle, 'cusolverDnZunmtr_bufferSize') - - global __cusolverDnSormtr - __cusolverDnSormtr = dlsym(RTLD_DEFAULT, 'cusolverDnSormtr') - if __cusolverDnSormtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSormtr = dlsym(handle, 'cusolverDnSormtr') - - global __cusolverDnDormtr - __cusolverDnDormtr = dlsym(RTLD_DEFAULT, 'cusolverDnDormtr') - if __cusolverDnDormtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDormtr = dlsym(handle, 'cusolverDnDormtr') - - global __cusolverDnCunmtr - __cusolverDnCunmtr = dlsym(RTLD_DEFAULT, 'cusolverDnCunmtr') - if __cusolverDnCunmtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCunmtr = dlsym(handle, 'cusolverDnCunmtr') - - global __cusolverDnZunmtr - __cusolverDnZunmtr = dlsym(RTLD_DEFAULT, 'cusolverDnZunmtr') - if __cusolverDnZunmtr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZunmtr = dlsym(handle, 'cusolverDnZunmtr') - - global __cusolverDnSgesvd_bufferSize - __cusolverDnSgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvd_bufferSize') - if 
__cusolverDnSgesvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgesvd_bufferSize = dlsym(handle, 'cusolverDnSgesvd_bufferSize') - - global __cusolverDnDgesvd_bufferSize - __cusolverDnDgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvd_bufferSize') - if __cusolverDnDgesvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvd_bufferSize = dlsym(handle, 'cusolverDnDgesvd_bufferSize') - - global __cusolverDnCgesvd_bufferSize - __cusolverDnCgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvd_bufferSize') - if __cusolverDnCgesvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvd_bufferSize = dlsym(handle, 'cusolverDnCgesvd_bufferSize') - - global __cusolverDnZgesvd_bufferSize - __cusolverDnZgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvd_bufferSize') - if __cusolverDnZgesvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvd_bufferSize = dlsym(handle, 'cusolverDnZgesvd_bufferSize') - - global __cusolverDnSgesvd - __cusolverDnSgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvd') - if __cusolverDnSgesvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgesvd = dlsym(handle, 'cusolverDnSgesvd') - - global __cusolverDnDgesvd - __cusolverDnDgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvd') - if __cusolverDnDgesvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvd = dlsym(handle, 'cusolverDnDgesvd') - - global __cusolverDnCgesvd - __cusolverDnCgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvd') - if __cusolverDnCgesvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvd = dlsym(handle, 'cusolverDnCgesvd') - - global __cusolverDnZgesvd - __cusolverDnZgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvd') - if __cusolverDnZgesvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvd = dlsym(handle, 'cusolverDnZgesvd') - - global __cusolverDnSsyevd_bufferSize - __cusolverDnSsyevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevd_bufferSize') - if __cusolverDnSsyevd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevd_bufferSize = dlsym(handle, 'cusolverDnSsyevd_bufferSize') - - global __cusolverDnDsyevd_bufferSize - __cusolverDnDsyevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevd_bufferSize') - if __cusolverDnDsyevd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevd_bufferSize = dlsym(handle, 'cusolverDnDsyevd_bufferSize') - - global __cusolverDnCheevd_bufferSize - __cusolverDnCheevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevd_bufferSize') - if __cusolverDnCheevd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevd_bufferSize = dlsym(handle, 'cusolverDnCheevd_bufferSize') - - global __cusolverDnZheevd_bufferSize - __cusolverDnZheevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevd_bufferSize') - if __cusolverDnZheevd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevd_bufferSize = dlsym(handle, 'cusolverDnZheevd_bufferSize') - - global __cusolverDnSsyevd - __cusolverDnSsyevd = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevd') - if __cusolverDnSsyevd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevd = dlsym(handle, 'cusolverDnSsyevd') - - global __cusolverDnDsyevd - 
__cusolverDnDsyevd = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevd') - if __cusolverDnDsyevd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevd = dlsym(handle, 'cusolverDnDsyevd') - - global __cusolverDnCheevd - __cusolverDnCheevd = dlsym(RTLD_DEFAULT, 'cusolverDnCheevd') - if __cusolverDnCheevd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevd = dlsym(handle, 'cusolverDnCheevd') - - global __cusolverDnZheevd - __cusolverDnZheevd = dlsym(RTLD_DEFAULT, 'cusolverDnZheevd') - if __cusolverDnZheevd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevd = dlsym(handle, 'cusolverDnZheevd') - - global __cusolverDnSsyevdx_bufferSize - __cusolverDnSsyevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevdx_bufferSize') - if __cusolverDnSsyevdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevdx_bufferSize = dlsym(handle, 'cusolverDnSsyevdx_bufferSize') - - global __cusolverDnDsyevdx_bufferSize - __cusolverDnDsyevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevdx_bufferSize') - if __cusolverDnDsyevdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevdx_bufferSize = dlsym(handle, 'cusolverDnDsyevdx_bufferSize') - - global __cusolverDnCheevdx_bufferSize - __cusolverDnCheevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevdx_bufferSize') - if __cusolverDnCheevdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevdx_bufferSize = dlsym(handle, 'cusolverDnCheevdx_bufferSize') - - global __cusolverDnZheevdx_bufferSize - __cusolverDnZheevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevdx_bufferSize') - if __cusolverDnZheevdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevdx_bufferSize = dlsym(handle, 'cusolverDnZheevdx_bufferSize') - - global __cusolverDnSsyevdx - __cusolverDnSsyevdx = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevdx') - if __cusolverDnSsyevdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevdx = dlsym(handle, 'cusolverDnSsyevdx') - - global __cusolverDnDsyevdx - __cusolverDnDsyevdx = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevdx') - if __cusolverDnDsyevdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevdx = dlsym(handle, 'cusolverDnDsyevdx') - - global __cusolverDnCheevdx - __cusolverDnCheevdx = dlsym(RTLD_DEFAULT, 'cusolverDnCheevdx') - if __cusolverDnCheevdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevdx = dlsym(handle, 'cusolverDnCheevdx') - - global __cusolverDnZheevdx - __cusolverDnZheevdx = dlsym(RTLD_DEFAULT, 'cusolverDnZheevdx') - if __cusolverDnZheevdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevdx = dlsym(handle, 'cusolverDnZheevdx') - - global __cusolverDnSsygvdx_bufferSize - __cusolverDnSsygvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvdx_bufferSize') - if __cusolverDnSsygvdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsygvdx_bufferSize = dlsym(handle, 'cusolverDnSsygvdx_bufferSize') - - global __cusolverDnDsygvdx_bufferSize - __cusolverDnDsygvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvdx_bufferSize') - if __cusolverDnDsygvdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsygvdx_bufferSize = dlsym(handle, 'cusolverDnDsygvdx_bufferSize') - - global 
__cusolverDnChegvdx_bufferSize - __cusolverDnChegvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChegvdx_bufferSize') - if __cusolverDnChegvdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChegvdx_bufferSize = dlsym(handle, 'cusolverDnChegvdx_bufferSize') - - global __cusolverDnZhegvdx_bufferSize - __cusolverDnZhegvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvdx_bufferSize') - if __cusolverDnZhegvdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhegvdx_bufferSize = dlsym(handle, 'cusolverDnZhegvdx_bufferSize') - - global __cusolverDnSsygvdx - __cusolverDnSsygvdx = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvdx') - if __cusolverDnSsygvdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsygvdx = dlsym(handle, 'cusolverDnSsygvdx') - - global __cusolverDnDsygvdx - __cusolverDnDsygvdx = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvdx') - if __cusolverDnDsygvdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsygvdx = dlsym(handle, 'cusolverDnDsygvdx') - - global __cusolverDnChegvdx - __cusolverDnChegvdx = dlsym(RTLD_DEFAULT, 'cusolverDnChegvdx') - if __cusolverDnChegvdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChegvdx = dlsym(handle, 'cusolverDnChegvdx') - - global __cusolverDnZhegvdx - __cusolverDnZhegvdx = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvdx') - if __cusolverDnZhegvdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhegvdx = dlsym(handle, 'cusolverDnZhegvdx') - - global __cusolverDnSsygvd_bufferSize - __cusolverDnSsygvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvd_bufferSize') - if __cusolverDnSsygvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsygvd_bufferSize = dlsym(handle, 'cusolverDnSsygvd_bufferSize') - - global __cusolverDnDsygvd_bufferSize - __cusolverDnDsygvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvd_bufferSize') - if __cusolverDnDsygvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsygvd_bufferSize = dlsym(handle, 'cusolverDnDsygvd_bufferSize') - - global __cusolverDnChegvd_bufferSize - __cusolverDnChegvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChegvd_bufferSize') - if __cusolverDnChegvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChegvd_bufferSize = dlsym(handle, 'cusolverDnChegvd_bufferSize') - - global __cusolverDnZhegvd_bufferSize - __cusolverDnZhegvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvd_bufferSize') - if __cusolverDnZhegvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhegvd_bufferSize = dlsym(handle, 'cusolverDnZhegvd_bufferSize') - - global __cusolverDnSsygvd - __cusolverDnSsygvd = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvd') - if __cusolverDnSsygvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsygvd = dlsym(handle, 'cusolverDnSsygvd') - - global __cusolverDnDsygvd - __cusolverDnDsygvd = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvd') - if __cusolverDnDsygvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsygvd = dlsym(handle, 'cusolverDnDsygvd') - - global __cusolverDnChegvd - __cusolverDnChegvd = dlsym(RTLD_DEFAULT, 'cusolverDnChegvd') - if __cusolverDnChegvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChegvd = dlsym(handle, 'cusolverDnChegvd') - - global 
__cusolverDnZhegvd - __cusolverDnZhegvd = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvd') - if __cusolverDnZhegvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhegvd = dlsym(handle, 'cusolverDnZhegvd') - - global __cusolverDnCreateSyevjInfo - __cusolverDnCreateSyevjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnCreateSyevjInfo') - if __cusolverDnCreateSyevjInfo == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCreateSyevjInfo = dlsym(handle, 'cusolverDnCreateSyevjInfo') - - global __cusolverDnDestroySyevjInfo - __cusolverDnDestroySyevjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnDestroySyevjInfo') - if __cusolverDnDestroySyevjInfo == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDestroySyevjInfo = dlsym(handle, 'cusolverDnDestroySyevjInfo') - - global __cusolverDnXsyevjSetTolerance - __cusolverDnXsyevjSetTolerance = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjSetTolerance') - if __cusolverDnXsyevjSetTolerance == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevjSetTolerance = dlsym(handle, 'cusolverDnXsyevjSetTolerance') - - global __cusolverDnXsyevjSetMaxSweeps - __cusolverDnXsyevjSetMaxSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjSetMaxSweeps') - if __cusolverDnXsyevjSetMaxSweeps == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevjSetMaxSweeps = dlsym(handle, 'cusolverDnXsyevjSetMaxSweeps') - - global __cusolverDnXsyevjSetSortEig - __cusolverDnXsyevjSetSortEig = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjSetSortEig') - if __cusolverDnXsyevjSetSortEig == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevjSetSortEig = dlsym(handle, 'cusolverDnXsyevjSetSortEig') - - global __cusolverDnXsyevjGetResidual - __cusolverDnXsyevjGetResidual = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjGetResidual') - if __cusolverDnXsyevjGetResidual == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevjGetResidual = dlsym(handle, 'cusolverDnXsyevjGetResidual') - - global __cusolverDnXsyevjGetSweeps - __cusolverDnXsyevjGetSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjGetSweeps') - if __cusolverDnXsyevjGetSweeps == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevjGetSweeps = dlsym(handle, 'cusolverDnXsyevjGetSweeps') - - global __cusolverDnSsyevjBatched_bufferSize - __cusolverDnSsyevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevjBatched_bufferSize') - if __cusolverDnSsyevjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevjBatched_bufferSize = dlsym(handle, 'cusolverDnSsyevjBatched_bufferSize') - - global __cusolverDnDsyevjBatched_bufferSize - __cusolverDnDsyevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevjBatched_bufferSize') - if __cusolverDnDsyevjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevjBatched_bufferSize = dlsym(handle, 'cusolverDnDsyevjBatched_bufferSize') - - global __cusolverDnCheevjBatched_bufferSize - __cusolverDnCheevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevjBatched_bufferSize') - if __cusolverDnCheevjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevjBatched_bufferSize = dlsym(handle, 'cusolverDnCheevjBatched_bufferSize') - - global __cusolverDnZheevjBatched_bufferSize - __cusolverDnZheevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevjBatched_bufferSize') - if 
__cusolverDnZheevjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevjBatched_bufferSize = dlsym(handle, 'cusolverDnZheevjBatched_bufferSize') - - global __cusolverDnSsyevjBatched - __cusolverDnSsyevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevjBatched') - if __cusolverDnSsyevjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevjBatched = dlsym(handle, 'cusolverDnSsyevjBatched') - - global __cusolverDnDsyevjBatched - __cusolverDnDsyevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevjBatched') - if __cusolverDnDsyevjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevjBatched = dlsym(handle, 'cusolverDnDsyevjBatched') - - global __cusolverDnCheevjBatched - __cusolverDnCheevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCheevjBatched') - if __cusolverDnCheevjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevjBatched = dlsym(handle, 'cusolverDnCheevjBatched') - - global __cusolverDnZheevjBatched - __cusolverDnZheevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZheevjBatched') - if __cusolverDnZheevjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevjBatched = dlsym(handle, 'cusolverDnZheevjBatched') - - global __cusolverDnSsyevj_bufferSize - __cusolverDnSsyevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevj_bufferSize') - if __cusolverDnSsyevj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevj_bufferSize = dlsym(handle, 'cusolverDnSsyevj_bufferSize') - - global __cusolverDnDsyevj_bufferSize - __cusolverDnDsyevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevj_bufferSize') - if __cusolverDnDsyevj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevj_bufferSize = dlsym(handle, 'cusolverDnDsyevj_bufferSize') - - global __cusolverDnCheevj_bufferSize - __cusolverDnCheevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevj_bufferSize') - if __cusolverDnCheevj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevj_bufferSize = dlsym(handle, 'cusolverDnCheevj_bufferSize') - - global __cusolverDnZheevj_bufferSize - __cusolverDnZheevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevj_bufferSize') - if __cusolverDnZheevj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevj_bufferSize = dlsym(handle, 'cusolverDnZheevj_bufferSize') - - global __cusolverDnSsyevj - __cusolverDnSsyevj = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevj') - if __cusolverDnSsyevj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsyevj = dlsym(handle, 'cusolverDnSsyevj') - - global __cusolverDnDsyevj - __cusolverDnDsyevj = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevj') - if __cusolverDnDsyevj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsyevj = dlsym(handle, 'cusolverDnDsyevj') - - global __cusolverDnCheevj - __cusolverDnCheevj = dlsym(RTLD_DEFAULT, 'cusolverDnCheevj') - if __cusolverDnCheevj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCheevj = dlsym(handle, 'cusolverDnCheevj') - - global __cusolverDnZheevj - __cusolverDnZheevj = dlsym(RTLD_DEFAULT, 'cusolverDnZheevj') - if __cusolverDnZheevj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZheevj = dlsym(handle, 'cusolverDnZheevj') - - global __cusolverDnSsygvj_bufferSize - 
__cusolverDnSsygvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvj_bufferSize') - if __cusolverDnSsygvj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsygvj_bufferSize = dlsym(handle, 'cusolverDnSsygvj_bufferSize') - - global __cusolverDnDsygvj_bufferSize - __cusolverDnDsygvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvj_bufferSize') - if __cusolverDnDsygvj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsygvj_bufferSize = dlsym(handle, 'cusolverDnDsygvj_bufferSize') - - global __cusolverDnChegvj_bufferSize - __cusolverDnChegvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChegvj_bufferSize') - if __cusolverDnChegvj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChegvj_bufferSize = dlsym(handle, 'cusolverDnChegvj_bufferSize') - - global __cusolverDnZhegvj_bufferSize - __cusolverDnZhegvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvj_bufferSize') - if __cusolverDnZhegvj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhegvj_bufferSize = dlsym(handle, 'cusolverDnZhegvj_bufferSize') - - global __cusolverDnSsygvj - __cusolverDnSsygvj = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvj') - if __cusolverDnSsygvj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSsygvj = dlsym(handle, 'cusolverDnSsygvj') - - global __cusolverDnDsygvj - __cusolverDnDsygvj = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvj') - if __cusolverDnDsygvj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDsygvj = dlsym(handle, 'cusolverDnDsygvj') - - global __cusolverDnChegvj - __cusolverDnChegvj = dlsym(RTLD_DEFAULT, 'cusolverDnChegvj') - if __cusolverDnChegvj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnChegvj = dlsym(handle, 'cusolverDnChegvj') - - global __cusolverDnZhegvj - __cusolverDnZhegvj = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvj') - if __cusolverDnZhegvj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZhegvj = dlsym(handle, 'cusolverDnZhegvj') - - global __cusolverDnCreateGesvdjInfo - __cusolverDnCreateGesvdjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnCreateGesvdjInfo') - if __cusolverDnCreateGesvdjInfo == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCreateGesvdjInfo = dlsym(handle, 'cusolverDnCreateGesvdjInfo') - - global __cusolverDnDestroyGesvdjInfo - __cusolverDnDestroyGesvdjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnDestroyGesvdjInfo') - if __cusolverDnDestroyGesvdjInfo == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDestroyGesvdjInfo = dlsym(handle, 'cusolverDnDestroyGesvdjInfo') - - global __cusolverDnXgesvdjSetTolerance - __cusolverDnXgesvdjSetTolerance = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjSetTolerance') - if __cusolverDnXgesvdjSetTolerance == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdjSetTolerance = dlsym(handle, 'cusolverDnXgesvdjSetTolerance') - - global __cusolverDnXgesvdjSetMaxSweeps - __cusolverDnXgesvdjSetMaxSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjSetMaxSweeps') - if __cusolverDnXgesvdjSetMaxSweeps == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdjSetMaxSweeps = dlsym(handle, 'cusolverDnXgesvdjSetMaxSweeps') - - global __cusolverDnXgesvdjSetSortEig - __cusolverDnXgesvdjSetSortEig = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjSetSortEig') - if __cusolverDnXgesvdjSetSortEig == NULL: - 
if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdjSetSortEig = dlsym(handle, 'cusolverDnXgesvdjSetSortEig') - - global __cusolverDnXgesvdjGetResidual - __cusolverDnXgesvdjGetResidual = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjGetResidual') - if __cusolverDnXgesvdjGetResidual == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdjGetResidual = dlsym(handle, 'cusolverDnXgesvdjGetResidual') - - global __cusolverDnXgesvdjGetSweeps - __cusolverDnXgesvdjGetSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjGetSweeps') - if __cusolverDnXgesvdjGetSweeps == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdjGetSweeps = dlsym(handle, 'cusolverDnXgesvdjGetSweeps') - - global __cusolverDnSgesvdjBatched_bufferSize - __cusolverDnSgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdjBatched_bufferSize') - if __cusolverDnSgesvdjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnSgesvdjBatched_bufferSize') - - global __cusolverDnDgesvdjBatched_bufferSize - __cusolverDnDgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdjBatched_bufferSize') - if __cusolverDnDgesvdjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnDgesvdjBatched_bufferSize') - - global __cusolverDnCgesvdjBatched_bufferSize - __cusolverDnCgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdjBatched_bufferSize') - if __cusolverDnCgesvdjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnCgesvdjBatched_bufferSize') - - global __cusolverDnZgesvdjBatched_bufferSize - __cusolverDnZgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdjBatched_bufferSize') - if __cusolverDnZgesvdjBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnZgesvdjBatched_bufferSize') - - global __cusolverDnSgesvdjBatched - __cusolverDnSgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdjBatched') - if __cusolverDnSgesvdjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgesvdjBatched = dlsym(handle, 'cusolverDnSgesvdjBatched') - - global __cusolverDnDgesvdjBatched - __cusolverDnDgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdjBatched') - if __cusolverDnDgesvdjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvdjBatched = dlsym(handle, 'cusolverDnDgesvdjBatched') - - global __cusolverDnCgesvdjBatched - __cusolverDnCgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdjBatched') - if __cusolverDnCgesvdjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvdjBatched = dlsym(handle, 'cusolverDnCgesvdjBatched') - - global __cusolverDnZgesvdjBatched - __cusolverDnZgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdjBatched') - if __cusolverDnZgesvdjBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvdjBatched = dlsym(handle, 'cusolverDnZgesvdjBatched') - - global __cusolverDnSgesvdj_bufferSize - __cusolverDnSgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdj_bufferSize') - if __cusolverDnSgesvdj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cusolverDnSgesvdj_bufferSize = dlsym(handle, 'cusolverDnSgesvdj_bufferSize') - - global __cusolverDnDgesvdj_bufferSize - __cusolverDnDgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdj_bufferSize') - if __cusolverDnDgesvdj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvdj_bufferSize = dlsym(handle, 'cusolverDnDgesvdj_bufferSize') - - global __cusolverDnCgesvdj_bufferSize - __cusolverDnCgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdj_bufferSize') - if __cusolverDnCgesvdj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvdj_bufferSize = dlsym(handle, 'cusolverDnCgesvdj_bufferSize') - - global __cusolverDnZgesvdj_bufferSize - __cusolverDnZgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdj_bufferSize') - if __cusolverDnZgesvdj_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvdj_bufferSize = dlsym(handle, 'cusolverDnZgesvdj_bufferSize') - - global __cusolverDnSgesvdj - __cusolverDnSgesvdj = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdj') - if __cusolverDnSgesvdj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgesvdj = dlsym(handle, 'cusolverDnSgesvdj') - - global __cusolverDnDgesvdj - __cusolverDnDgesvdj = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdj') - if __cusolverDnDgesvdj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvdj = dlsym(handle, 'cusolverDnDgesvdj') - - global __cusolverDnCgesvdj - __cusolverDnCgesvdj = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdj') - if __cusolverDnCgesvdj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvdj = dlsym(handle, 'cusolverDnCgesvdj') - - global __cusolverDnZgesvdj - __cusolverDnZgesvdj = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdj') - if __cusolverDnZgesvdj == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvdj = dlsym(handle, 'cusolverDnZgesvdj') - - global __cusolverDnSgesvdaStridedBatched_bufferSize - __cusolverDnSgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdaStridedBatched_bufferSize') - if __cusolverDnSgesvdaStridedBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgesvdaStridedBatched_bufferSize = dlsym(handle, 'cusolverDnSgesvdaStridedBatched_bufferSize') - - global __cusolverDnDgesvdaStridedBatched_bufferSize - __cusolverDnDgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdaStridedBatched_bufferSize') - if __cusolverDnDgesvdaStridedBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvdaStridedBatched_bufferSize = dlsym(handle, 'cusolverDnDgesvdaStridedBatched_bufferSize') - - global __cusolverDnCgesvdaStridedBatched_bufferSize - __cusolverDnCgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdaStridedBatched_bufferSize') - if __cusolverDnCgesvdaStridedBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvdaStridedBatched_bufferSize = dlsym(handle, 'cusolverDnCgesvdaStridedBatched_bufferSize') - - global __cusolverDnZgesvdaStridedBatched_bufferSize - __cusolverDnZgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdaStridedBatched_bufferSize') - if __cusolverDnZgesvdaStridedBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvdaStridedBatched_bufferSize = dlsym(handle, 
'cusolverDnZgesvdaStridedBatched_bufferSize') - - global __cusolverDnSgesvdaStridedBatched - __cusolverDnSgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdaStridedBatched') - if __cusolverDnSgesvdaStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSgesvdaStridedBatched = dlsym(handle, 'cusolverDnSgesvdaStridedBatched') - - global __cusolverDnDgesvdaStridedBatched - __cusolverDnDgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdaStridedBatched') - if __cusolverDnDgesvdaStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDgesvdaStridedBatched = dlsym(handle, 'cusolverDnDgesvdaStridedBatched') - - global __cusolverDnCgesvdaStridedBatched - __cusolverDnCgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdaStridedBatched') - if __cusolverDnCgesvdaStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCgesvdaStridedBatched = dlsym(handle, 'cusolverDnCgesvdaStridedBatched') - - global __cusolverDnZgesvdaStridedBatched - __cusolverDnZgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdaStridedBatched') - if __cusolverDnZgesvdaStridedBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnZgesvdaStridedBatched = dlsym(handle, 'cusolverDnZgesvdaStridedBatched') - - global __cusolverDnCreateParams - __cusolverDnCreateParams = dlsym(RTLD_DEFAULT, 'cusolverDnCreateParams') - if __cusolverDnCreateParams == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnCreateParams = dlsym(handle, 'cusolverDnCreateParams') - - global __cusolverDnDestroyParams - __cusolverDnDestroyParams = dlsym(RTLD_DEFAULT, 'cusolverDnDestroyParams') - if __cusolverDnDestroyParams == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnDestroyParams = dlsym(handle, 'cusolverDnDestroyParams') - - global __cusolverDnSetAdvOptions - __cusolverDnSetAdvOptions = dlsym(RTLD_DEFAULT, 'cusolverDnSetAdvOptions') - if __cusolverDnSetAdvOptions == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSetAdvOptions = dlsym(handle, 'cusolverDnSetAdvOptions') - - global __cusolverDnXpotrf_bufferSize - __cusolverDnXpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXpotrf_bufferSize') - if __cusolverDnXpotrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXpotrf_bufferSize = dlsym(handle, 'cusolverDnXpotrf_bufferSize') - - global __cusolverDnXpotrf - __cusolverDnXpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnXpotrf') - if __cusolverDnXpotrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXpotrf = dlsym(handle, 'cusolverDnXpotrf') - - global __cusolverDnXpotrs - __cusolverDnXpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnXpotrs') - if __cusolverDnXpotrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXpotrs = dlsym(handle, 'cusolverDnXpotrs') - - global __cusolverDnXgeqrf_bufferSize - __cusolverDnXgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgeqrf_bufferSize') - if __cusolverDnXgeqrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgeqrf_bufferSize = dlsym(handle, 'cusolverDnXgeqrf_bufferSize') - - global __cusolverDnXgeqrf - __cusolverDnXgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnXgeqrf') - if __cusolverDnXgeqrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgeqrf = dlsym(handle, 'cusolverDnXgeqrf') - - global 
__cusolverDnXgetrf_bufferSize - __cusolverDnXgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgetrf_bufferSize') - if __cusolverDnXgetrf_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgetrf_bufferSize = dlsym(handle, 'cusolverDnXgetrf_bufferSize') - - global __cusolverDnXgetrf - __cusolverDnXgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnXgetrf') - if __cusolverDnXgetrf == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgetrf = dlsym(handle, 'cusolverDnXgetrf') - - global __cusolverDnXgetrs - __cusolverDnXgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnXgetrs') - if __cusolverDnXgetrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgetrs = dlsym(handle, 'cusolverDnXgetrs') - - global __cusolverDnXsyevd_bufferSize - __cusolverDnXsyevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevd_bufferSize') - if __cusolverDnXsyevd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevd_bufferSize = dlsym(handle, 'cusolverDnXsyevd_bufferSize') - - global __cusolverDnXsyevd - __cusolverDnXsyevd = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevd') - if __cusolverDnXsyevd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevd = dlsym(handle, 'cusolverDnXsyevd') - - global __cusolverDnXsyevdx_bufferSize - __cusolverDnXsyevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevdx_bufferSize') - if __cusolverDnXsyevdx_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevdx_bufferSize = dlsym(handle, 'cusolverDnXsyevdx_bufferSize') - - global __cusolverDnXsyevdx - __cusolverDnXsyevdx = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevdx') - if __cusolverDnXsyevdx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevdx = dlsym(handle, 'cusolverDnXsyevdx') - - global __cusolverDnXgesvd_bufferSize - __cusolverDnXgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvd_bufferSize') - if __cusolverDnXgesvd_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvd_bufferSize = dlsym(handle, 'cusolverDnXgesvd_bufferSize') - - global __cusolverDnXgesvd - __cusolverDnXgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvd') - if __cusolverDnXgesvd == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvd = dlsym(handle, 'cusolverDnXgesvd') - - global __cusolverDnXgesvdp_bufferSize - __cusolverDnXgesvdp_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdp_bufferSize') - if __cusolverDnXgesvdp_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdp_bufferSize = dlsym(handle, 'cusolverDnXgesvdp_bufferSize') - - global __cusolverDnXgesvdp - __cusolverDnXgesvdp = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdp') - if __cusolverDnXgesvdp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdp = dlsym(handle, 'cusolverDnXgesvdp') - - global __cusolverDnXgesvdr_bufferSize - __cusolverDnXgesvdr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdr_bufferSize') - if __cusolverDnXgesvdr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdr_bufferSize = dlsym(handle, 'cusolverDnXgesvdr_bufferSize') - - global __cusolverDnXgesvdr - __cusolverDnXgesvdr = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdr') - if __cusolverDnXgesvdr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgesvdr = dlsym(handle, 'cusolverDnXgesvdr') - - global 
__cusolverDnXsytrs_bufferSize - __cusolverDnXsytrs_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsytrs_bufferSize') - if __cusolverDnXsytrs_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsytrs_bufferSize = dlsym(handle, 'cusolverDnXsytrs_bufferSize') - - global __cusolverDnXsytrs - __cusolverDnXsytrs = dlsym(RTLD_DEFAULT, 'cusolverDnXsytrs') - if __cusolverDnXsytrs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsytrs = dlsym(handle, 'cusolverDnXsytrs') - - global __cusolverDnXtrtri_bufferSize - __cusolverDnXtrtri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXtrtri_bufferSize') - if __cusolverDnXtrtri_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXtrtri_bufferSize = dlsym(handle, 'cusolverDnXtrtri_bufferSize') - - global __cusolverDnXtrtri - __cusolverDnXtrtri = dlsym(RTLD_DEFAULT, 'cusolverDnXtrtri') - if __cusolverDnXtrtri == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXtrtri = dlsym(handle, 'cusolverDnXtrtri') - - global __cusolverDnLoggerSetCallback - __cusolverDnLoggerSetCallback = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetCallback') - if __cusolverDnLoggerSetCallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnLoggerSetCallback = dlsym(handle, 'cusolverDnLoggerSetCallback') - - global __cusolverDnLoggerSetFile - __cusolverDnLoggerSetFile = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetFile') - if __cusolverDnLoggerSetFile == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnLoggerSetFile = dlsym(handle, 'cusolverDnLoggerSetFile') - - global __cusolverDnLoggerOpenFile - __cusolverDnLoggerOpenFile = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerOpenFile') - if __cusolverDnLoggerOpenFile == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnLoggerOpenFile = dlsym(handle, 'cusolverDnLoggerOpenFile') - - global __cusolverDnLoggerSetLevel - __cusolverDnLoggerSetLevel = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetLevel') - if __cusolverDnLoggerSetLevel == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnLoggerSetLevel = dlsym(handle, 'cusolverDnLoggerSetLevel') - - global __cusolverDnLoggerSetMask - __cusolverDnLoggerSetMask = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetMask') - if __cusolverDnLoggerSetMask == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnLoggerSetMask = dlsym(handle, 'cusolverDnLoggerSetMask') - - global __cusolverDnLoggerForceDisable - __cusolverDnLoggerForceDisable = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerForceDisable') - if __cusolverDnLoggerForceDisable == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnLoggerForceDisable = dlsym(handle, 'cusolverDnLoggerForceDisable') - - global __cusolverDnSetDeterministicMode - __cusolverDnSetDeterministicMode = dlsym(RTLD_DEFAULT, 'cusolverDnSetDeterministicMode') - if __cusolverDnSetDeterministicMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnSetDeterministicMode = dlsym(handle, 'cusolverDnSetDeterministicMode') - - global __cusolverDnGetDeterministicMode - __cusolverDnGetDeterministicMode = dlsym(RTLD_DEFAULT, 'cusolverDnGetDeterministicMode') - if __cusolverDnGetDeterministicMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnGetDeterministicMode = dlsym(handle, 'cusolverDnGetDeterministicMode') - - global __cusolverDnXlarft_bufferSize - 
__cusolverDnXlarft_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXlarft_bufferSize') - if __cusolverDnXlarft_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXlarft_bufferSize = dlsym(handle, 'cusolverDnXlarft_bufferSize') - - global __cusolverDnXlarft - __cusolverDnXlarft = dlsym(RTLD_DEFAULT, 'cusolverDnXlarft') - if __cusolverDnXlarft == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXlarft = dlsym(handle, 'cusolverDnXlarft') - - global __cusolverDnXsyevBatched_bufferSize - __cusolverDnXsyevBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevBatched_bufferSize') - if __cusolverDnXsyevBatched_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevBatched_bufferSize = dlsym(handle, 'cusolverDnXsyevBatched_bufferSize') - - global __cusolverDnXsyevBatched - __cusolverDnXsyevBatched = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevBatched') - if __cusolverDnXsyevBatched == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXsyevBatched = dlsym(handle, 'cusolverDnXsyevBatched') - - global __cusolverDnXgeev_bufferSize - __cusolverDnXgeev_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgeev_bufferSize') - if __cusolverDnXgeev_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgeev_bufferSize = dlsym(handle, 'cusolverDnXgeev_bufferSize') - - global __cusolverDnXgeev - __cusolverDnXgeev = dlsym(RTLD_DEFAULT, 'cusolverDnXgeev') - if __cusolverDnXgeev == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverDnXgeev = dlsym(handle, 'cusolverDnXgeev') - - __py_cusolverDn_init = True - return 0 + with gil, __symbol_lock: + driver_ver = get_cuda_version() + + # Load function + global __cusolverDnCreate + __cusolverDnCreate = dlsym(RTLD_DEFAULT, 'cusolverDnCreate') + if __cusolverDnCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCreate = dlsym(handle, 'cusolverDnCreate') + + global __cusolverDnDestroy + __cusolverDnDestroy = dlsym(RTLD_DEFAULT, 'cusolverDnDestroy') + if __cusolverDnDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDestroy = dlsym(handle, 'cusolverDnDestroy') + + global __cusolverDnSetStream + __cusolverDnSetStream = dlsym(RTLD_DEFAULT, 'cusolverDnSetStream') + if __cusolverDnSetStream == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSetStream = dlsym(handle, 'cusolverDnSetStream') + + global __cusolverDnGetStream + __cusolverDnGetStream = dlsym(RTLD_DEFAULT, 'cusolverDnGetStream') + if __cusolverDnGetStream == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnGetStream = dlsym(handle, 'cusolverDnGetStream') + + global __cusolverDnIRSParamsCreate + __cusolverDnIRSParamsCreate = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsCreate') + if __cusolverDnIRSParamsCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsCreate = dlsym(handle, 'cusolverDnIRSParamsCreate') + + global __cusolverDnIRSParamsDestroy + __cusolverDnIRSParamsDestroy = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsDestroy') + if __cusolverDnIRSParamsDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsDestroy = dlsym(handle, 'cusolverDnIRSParamsDestroy') + + global __cusolverDnIRSParamsSetRefinementSolver + __cusolverDnIRSParamsSetRefinementSolver = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetRefinementSolver') + if 
__cusolverDnIRSParamsSetRefinementSolver == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetRefinementSolver = dlsym(handle, 'cusolverDnIRSParamsSetRefinementSolver') + + global __cusolverDnIRSParamsSetSolverMainPrecision + __cusolverDnIRSParamsSetSolverMainPrecision = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetSolverMainPrecision') + if __cusolverDnIRSParamsSetSolverMainPrecision == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetSolverMainPrecision = dlsym(handle, 'cusolverDnIRSParamsSetSolverMainPrecision') + + global __cusolverDnIRSParamsSetSolverLowestPrecision + __cusolverDnIRSParamsSetSolverLowestPrecision = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetSolverLowestPrecision') + if __cusolverDnIRSParamsSetSolverLowestPrecision == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetSolverLowestPrecision = dlsym(handle, 'cusolverDnIRSParamsSetSolverLowestPrecision') + + global __cusolverDnIRSParamsSetSolverPrecisions + __cusolverDnIRSParamsSetSolverPrecisions = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetSolverPrecisions') + if __cusolverDnIRSParamsSetSolverPrecisions == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetSolverPrecisions = dlsym(handle, 'cusolverDnIRSParamsSetSolverPrecisions') + + global __cusolverDnIRSParamsSetTol + __cusolverDnIRSParamsSetTol = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetTol') + if __cusolverDnIRSParamsSetTol == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetTol = dlsym(handle, 'cusolverDnIRSParamsSetTol') + + global __cusolverDnIRSParamsSetTolInner + __cusolverDnIRSParamsSetTolInner = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetTolInner') + if __cusolverDnIRSParamsSetTolInner == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetTolInner = dlsym(handle, 'cusolverDnIRSParamsSetTolInner') + + global __cusolverDnIRSParamsSetMaxIters + __cusolverDnIRSParamsSetMaxIters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetMaxIters') + if __cusolverDnIRSParamsSetMaxIters == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetMaxIters = dlsym(handle, 'cusolverDnIRSParamsSetMaxIters') + + global __cusolverDnIRSParamsSetMaxItersInner + __cusolverDnIRSParamsSetMaxItersInner = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsSetMaxItersInner') + if __cusolverDnIRSParamsSetMaxItersInner == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsSetMaxItersInner = dlsym(handle, 'cusolverDnIRSParamsSetMaxItersInner') + + global __cusolverDnIRSParamsGetMaxIters + __cusolverDnIRSParamsGetMaxIters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsGetMaxIters') + if __cusolverDnIRSParamsGetMaxIters == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsGetMaxIters = dlsym(handle, 'cusolverDnIRSParamsGetMaxIters') + + global __cusolverDnIRSParamsEnableFallback + __cusolverDnIRSParamsEnableFallback = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsEnableFallback') + if __cusolverDnIRSParamsEnableFallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSParamsEnableFallback = dlsym(handle, 'cusolverDnIRSParamsEnableFallback') + + global __cusolverDnIRSParamsDisableFallback + __cusolverDnIRSParamsDisableFallback = dlsym(RTLD_DEFAULT, 'cusolverDnIRSParamsDisableFallback') + if __cusolverDnIRSParamsDisableFallback == NULL: + if handle == NULL: + 
handle = load_library(driver_ver) + __cusolverDnIRSParamsDisableFallback = dlsym(handle, 'cusolverDnIRSParamsDisableFallback') + + global __cusolverDnIRSInfosDestroy + __cusolverDnIRSInfosDestroy = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosDestroy') + if __cusolverDnIRSInfosDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSInfosDestroy = dlsym(handle, 'cusolverDnIRSInfosDestroy') + + global __cusolverDnIRSInfosCreate + __cusolverDnIRSInfosCreate = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosCreate') + if __cusolverDnIRSInfosCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSInfosCreate = dlsym(handle, 'cusolverDnIRSInfosCreate') + + global __cusolverDnIRSInfosGetNiters + __cusolverDnIRSInfosGetNiters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetNiters') + if __cusolverDnIRSInfosGetNiters == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSInfosGetNiters = dlsym(handle, 'cusolverDnIRSInfosGetNiters') + + global __cusolverDnIRSInfosGetOuterNiters + __cusolverDnIRSInfosGetOuterNiters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetOuterNiters') + if __cusolverDnIRSInfosGetOuterNiters == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSInfosGetOuterNiters = dlsym(handle, 'cusolverDnIRSInfosGetOuterNiters') + + global __cusolverDnIRSInfosRequestResidual + __cusolverDnIRSInfosRequestResidual = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosRequestResidual') + if __cusolverDnIRSInfosRequestResidual == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSInfosRequestResidual = dlsym(handle, 'cusolverDnIRSInfosRequestResidual') + + global __cusolverDnIRSInfosGetResidualHistory + __cusolverDnIRSInfosGetResidualHistory = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetResidualHistory') + if __cusolverDnIRSInfosGetResidualHistory == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSInfosGetResidualHistory = dlsym(handle, 'cusolverDnIRSInfosGetResidualHistory') + + global __cusolverDnIRSInfosGetMaxIters + __cusolverDnIRSInfosGetMaxIters = dlsym(RTLD_DEFAULT, 'cusolverDnIRSInfosGetMaxIters') + if __cusolverDnIRSInfosGetMaxIters == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSInfosGetMaxIters = dlsym(handle, 'cusolverDnIRSInfosGetMaxIters') + + global __cusolverDnZZgesv + __cusolverDnZZgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZZgesv') + if __cusolverDnZZgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZZgesv = dlsym(handle, 'cusolverDnZZgesv') + + global __cusolverDnZCgesv + __cusolverDnZCgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZCgesv') + if __cusolverDnZCgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZCgesv = dlsym(handle, 'cusolverDnZCgesv') + + global __cusolverDnZKgesv + __cusolverDnZKgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZKgesv') + if __cusolverDnZKgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZKgesv = dlsym(handle, 'cusolverDnZKgesv') + + global __cusolverDnZEgesv + __cusolverDnZEgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZEgesv') + if __cusolverDnZEgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZEgesv = dlsym(handle, 'cusolverDnZEgesv') + + global __cusolverDnZYgesv + __cusolverDnZYgesv = dlsym(RTLD_DEFAULT, 'cusolverDnZYgesv') + if __cusolverDnZYgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZYgesv = dlsym(handle, 
'cusolverDnZYgesv') + + global __cusolverDnCCgesv + __cusolverDnCCgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCCgesv') + if __cusolverDnCCgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCCgesv = dlsym(handle, 'cusolverDnCCgesv') + + global __cusolverDnCEgesv + __cusolverDnCEgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCEgesv') + if __cusolverDnCEgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCEgesv = dlsym(handle, 'cusolverDnCEgesv') + + global __cusolverDnCKgesv + __cusolverDnCKgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCKgesv') + if __cusolverDnCKgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCKgesv = dlsym(handle, 'cusolverDnCKgesv') + + global __cusolverDnCYgesv + __cusolverDnCYgesv = dlsym(RTLD_DEFAULT, 'cusolverDnCYgesv') + if __cusolverDnCYgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCYgesv = dlsym(handle, 'cusolverDnCYgesv') + + global __cusolverDnDDgesv + __cusolverDnDDgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDDgesv') + if __cusolverDnDDgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDDgesv = dlsym(handle, 'cusolverDnDDgesv') + + global __cusolverDnDSgesv + __cusolverDnDSgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDSgesv') + if __cusolverDnDSgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDSgesv = dlsym(handle, 'cusolverDnDSgesv') + + global __cusolverDnDHgesv + __cusolverDnDHgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDHgesv') + if __cusolverDnDHgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDHgesv = dlsym(handle, 'cusolverDnDHgesv') + + global __cusolverDnDBgesv + __cusolverDnDBgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDBgesv') + if __cusolverDnDBgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDBgesv = dlsym(handle, 'cusolverDnDBgesv') + + global __cusolverDnDXgesv + __cusolverDnDXgesv = dlsym(RTLD_DEFAULT, 'cusolverDnDXgesv') + if __cusolverDnDXgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDXgesv = dlsym(handle, 'cusolverDnDXgesv') + + global __cusolverDnSSgesv + __cusolverDnSSgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSSgesv') + if __cusolverDnSSgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSSgesv = dlsym(handle, 'cusolverDnSSgesv') + + global __cusolverDnSHgesv + __cusolverDnSHgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSHgesv') + if __cusolverDnSHgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSHgesv = dlsym(handle, 'cusolverDnSHgesv') + + global __cusolverDnSBgesv + __cusolverDnSBgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSBgesv') + if __cusolverDnSBgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSBgesv = dlsym(handle, 'cusolverDnSBgesv') + + global __cusolverDnSXgesv + __cusolverDnSXgesv = dlsym(RTLD_DEFAULT, 'cusolverDnSXgesv') + if __cusolverDnSXgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSXgesv = dlsym(handle, 'cusolverDnSXgesv') + + global __cusolverDnZZgesv_bufferSize + __cusolverDnZZgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZZgesv_bufferSize') + if __cusolverDnZZgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZZgesv_bufferSize = dlsym(handle, 'cusolverDnZZgesv_bufferSize') + + global __cusolverDnZCgesv_bufferSize + __cusolverDnZCgesv_bufferSize = dlsym(RTLD_DEFAULT, 
'cusolverDnZCgesv_bufferSize') + if __cusolverDnZCgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZCgesv_bufferSize = dlsym(handle, 'cusolverDnZCgesv_bufferSize') + + global __cusolverDnZKgesv_bufferSize + __cusolverDnZKgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZKgesv_bufferSize') + if __cusolverDnZKgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZKgesv_bufferSize = dlsym(handle, 'cusolverDnZKgesv_bufferSize') + + global __cusolverDnZEgesv_bufferSize + __cusolverDnZEgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZEgesv_bufferSize') + if __cusolverDnZEgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZEgesv_bufferSize = dlsym(handle, 'cusolverDnZEgesv_bufferSize') + + global __cusolverDnZYgesv_bufferSize + __cusolverDnZYgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZYgesv_bufferSize') + if __cusolverDnZYgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZYgesv_bufferSize = dlsym(handle, 'cusolverDnZYgesv_bufferSize') + + global __cusolverDnCCgesv_bufferSize + __cusolverDnCCgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCCgesv_bufferSize') + if __cusolverDnCCgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCCgesv_bufferSize = dlsym(handle, 'cusolverDnCCgesv_bufferSize') + + global __cusolverDnCKgesv_bufferSize + __cusolverDnCKgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCKgesv_bufferSize') + if __cusolverDnCKgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCKgesv_bufferSize = dlsym(handle, 'cusolverDnCKgesv_bufferSize') + + global __cusolverDnCEgesv_bufferSize + __cusolverDnCEgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCEgesv_bufferSize') + if __cusolverDnCEgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCEgesv_bufferSize = dlsym(handle, 'cusolverDnCEgesv_bufferSize') + + global __cusolverDnCYgesv_bufferSize + __cusolverDnCYgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCYgesv_bufferSize') + if __cusolverDnCYgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCYgesv_bufferSize = dlsym(handle, 'cusolverDnCYgesv_bufferSize') + + global __cusolverDnDDgesv_bufferSize + __cusolverDnDDgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDDgesv_bufferSize') + if __cusolverDnDDgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDDgesv_bufferSize = dlsym(handle, 'cusolverDnDDgesv_bufferSize') + + global __cusolverDnDSgesv_bufferSize + __cusolverDnDSgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDSgesv_bufferSize') + if __cusolverDnDSgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDSgesv_bufferSize = dlsym(handle, 'cusolverDnDSgesv_bufferSize') + + global __cusolverDnDHgesv_bufferSize + __cusolverDnDHgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDHgesv_bufferSize') + if __cusolverDnDHgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDHgesv_bufferSize = dlsym(handle, 'cusolverDnDHgesv_bufferSize') + + global __cusolverDnDBgesv_bufferSize + __cusolverDnDBgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDBgesv_bufferSize') + if __cusolverDnDBgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDBgesv_bufferSize = dlsym(handle, 
'cusolverDnDBgesv_bufferSize') + + global __cusolverDnDXgesv_bufferSize + __cusolverDnDXgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDXgesv_bufferSize') + if __cusolverDnDXgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDXgesv_bufferSize = dlsym(handle, 'cusolverDnDXgesv_bufferSize') + + global __cusolverDnSSgesv_bufferSize + __cusolverDnSSgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSSgesv_bufferSize') + if __cusolverDnSSgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSSgesv_bufferSize = dlsym(handle, 'cusolverDnSSgesv_bufferSize') + + global __cusolverDnSHgesv_bufferSize + __cusolverDnSHgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSHgesv_bufferSize') + if __cusolverDnSHgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSHgesv_bufferSize = dlsym(handle, 'cusolverDnSHgesv_bufferSize') + + global __cusolverDnSBgesv_bufferSize + __cusolverDnSBgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSBgesv_bufferSize') + if __cusolverDnSBgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSBgesv_bufferSize = dlsym(handle, 'cusolverDnSBgesv_bufferSize') + + global __cusolverDnSXgesv_bufferSize + __cusolverDnSXgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSXgesv_bufferSize') + if __cusolverDnSXgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSXgesv_bufferSize = dlsym(handle, 'cusolverDnSXgesv_bufferSize') + + global __cusolverDnZZgels + __cusolverDnZZgels = dlsym(RTLD_DEFAULT, 'cusolverDnZZgels') + if __cusolverDnZZgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZZgels = dlsym(handle, 'cusolverDnZZgels') + + global __cusolverDnZCgels + __cusolverDnZCgels = dlsym(RTLD_DEFAULT, 'cusolverDnZCgels') + if __cusolverDnZCgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZCgels = dlsym(handle, 'cusolverDnZCgels') + + global __cusolverDnZKgels + __cusolverDnZKgels = dlsym(RTLD_DEFAULT, 'cusolverDnZKgels') + if __cusolverDnZKgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZKgels = dlsym(handle, 'cusolverDnZKgels') + + global __cusolverDnZEgels + __cusolverDnZEgels = dlsym(RTLD_DEFAULT, 'cusolverDnZEgels') + if __cusolverDnZEgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZEgels = dlsym(handle, 'cusolverDnZEgels') + + global __cusolverDnZYgels + __cusolverDnZYgels = dlsym(RTLD_DEFAULT, 'cusolverDnZYgels') + if __cusolverDnZYgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZYgels = dlsym(handle, 'cusolverDnZYgels') + + global __cusolverDnCCgels + __cusolverDnCCgels = dlsym(RTLD_DEFAULT, 'cusolverDnCCgels') + if __cusolverDnCCgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCCgels = dlsym(handle, 'cusolverDnCCgels') + + global __cusolverDnCKgels + __cusolverDnCKgels = dlsym(RTLD_DEFAULT, 'cusolverDnCKgels') + if __cusolverDnCKgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCKgels = dlsym(handle, 'cusolverDnCKgels') + + global __cusolverDnCEgels + __cusolverDnCEgels = dlsym(RTLD_DEFAULT, 'cusolverDnCEgels') + if __cusolverDnCEgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCEgels = dlsym(handle, 'cusolverDnCEgels') + + global __cusolverDnCYgels + __cusolverDnCYgels = dlsym(RTLD_DEFAULT, 
'cusolverDnCYgels') + if __cusolverDnCYgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCYgels = dlsym(handle, 'cusolverDnCYgels') + + global __cusolverDnDDgels + __cusolverDnDDgels = dlsym(RTLD_DEFAULT, 'cusolverDnDDgels') + if __cusolverDnDDgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDDgels = dlsym(handle, 'cusolverDnDDgels') + + global __cusolverDnDSgels + __cusolverDnDSgels = dlsym(RTLD_DEFAULT, 'cusolverDnDSgels') + if __cusolverDnDSgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDSgels = dlsym(handle, 'cusolverDnDSgels') + + global __cusolverDnDHgels + __cusolverDnDHgels = dlsym(RTLD_DEFAULT, 'cusolverDnDHgels') + if __cusolverDnDHgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDHgels = dlsym(handle, 'cusolverDnDHgels') + + global __cusolverDnDBgels + __cusolverDnDBgels = dlsym(RTLD_DEFAULT, 'cusolverDnDBgels') + if __cusolverDnDBgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDBgels = dlsym(handle, 'cusolverDnDBgels') + + global __cusolverDnDXgels + __cusolverDnDXgels = dlsym(RTLD_DEFAULT, 'cusolverDnDXgels') + if __cusolverDnDXgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDXgels = dlsym(handle, 'cusolverDnDXgels') + + global __cusolverDnSSgels + __cusolverDnSSgels = dlsym(RTLD_DEFAULT, 'cusolverDnSSgels') + if __cusolverDnSSgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSSgels = dlsym(handle, 'cusolverDnSSgels') + + global __cusolverDnSHgels + __cusolverDnSHgels = dlsym(RTLD_DEFAULT, 'cusolverDnSHgels') + if __cusolverDnSHgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSHgels = dlsym(handle, 'cusolverDnSHgels') + + global __cusolverDnSBgels + __cusolverDnSBgels = dlsym(RTLD_DEFAULT, 'cusolverDnSBgels') + if __cusolverDnSBgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSBgels = dlsym(handle, 'cusolverDnSBgels') + + global __cusolverDnSXgels + __cusolverDnSXgels = dlsym(RTLD_DEFAULT, 'cusolverDnSXgels') + if __cusolverDnSXgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSXgels = dlsym(handle, 'cusolverDnSXgels') + + global __cusolverDnZZgels_bufferSize + __cusolverDnZZgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZZgels_bufferSize') + if __cusolverDnZZgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZZgels_bufferSize = dlsym(handle, 'cusolverDnZZgels_bufferSize') + + global __cusolverDnZCgels_bufferSize + __cusolverDnZCgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZCgels_bufferSize') + if __cusolverDnZCgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZCgels_bufferSize = dlsym(handle, 'cusolverDnZCgels_bufferSize') + + global __cusolverDnZKgels_bufferSize + __cusolverDnZKgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZKgels_bufferSize') + if __cusolverDnZKgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZKgels_bufferSize = dlsym(handle, 'cusolverDnZKgels_bufferSize') + + global __cusolverDnZEgels_bufferSize + __cusolverDnZEgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZEgels_bufferSize') + if __cusolverDnZEgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZEgels_bufferSize = dlsym(handle, 'cusolverDnZEgels_bufferSize') + + 
global __cusolverDnZYgels_bufferSize + __cusolverDnZYgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZYgels_bufferSize') + if __cusolverDnZYgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZYgels_bufferSize = dlsym(handle, 'cusolverDnZYgels_bufferSize') + + global __cusolverDnCCgels_bufferSize + __cusolverDnCCgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCCgels_bufferSize') + if __cusolverDnCCgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCCgels_bufferSize = dlsym(handle, 'cusolverDnCCgels_bufferSize') + + global __cusolverDnCKgels_bufferSize + __cusolverDnCKgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCKgels_bufferSize') + if __cusolverDnCKgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCKgels_bufferSize = dlsym(handle, 'cusolverDnCKgels_bufferSize') + + global __cusolverDnCEgels_bufferSize + __cusolverDnCEgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCEgels_bufferSize') + if __cusolverDnCEgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCEgels_bufferSize = dlsym(handle, 'cusolverDnCEgels_bufferSize') + + global __cusolverDnCYgels_bufferSize + __cusolverDnCYgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCYgels_bufferSize') + if __cusolverDnCYgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCYgels_bufferSize = dlsym(handle, 'cusolverDnCYgels_bufferSize') + + global __cusolverDnDDgels_bufferSize + __cusolverDnDDgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDDgels_bufferSize') + if __cusolverDnDDgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDDgels_bufferSize = dlsym(handle, 'cusolverDnDDgels_bufferSize') + + global __cusolverDnDSgels_bufferSize + __cusolverDnDSgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDSgels_bufferSize') + if __cusolverDnDSgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDSgels_bufferSize = dlsym(handle, 'cusolverDnDSgels_bufferSize') + + global __cusolverDnDHgels_bufferSize + __cusolverDnDHgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDHgels_bufferSize') + if __cusolverDnDHgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDHgels_bufferSize = dlsym(handle, 'cusolverDnDHgels_bufferSize') + + global __cusolverDnDBgels_bufferSize + __cusolverDnDBgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDBgels_bufferSize') + if __cusolverDnDBgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDBgels_bufferSize = dlsym(handle, 'cusolverDnDBgels_bufferSize') + + global __cusolverDnDXgels_bufferSize + __cusolverDnDXgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDXgels_bufferSize') + if __cusolverDnDXgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDXgels_bufferSize = dlsym(handle, 'cusolverDnDXgels_bufferSize') + + global __cusolverDnSSgels_bufferSize + __cusolverDnSSgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSSgels_bufferSize') + if __cusolverDnSSgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSSgels_bufferSize = dlsym(handle, 'cusolverDnSSgels_bufferSize') + + global __cusolverDnSHgels_bufferSize + __cusolverDnSHgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSHgels_bufferSize') + if __cusolverDnSHgels_bufferSize == NULL: + if handle == NULL: + 
handle = load_library(driver_ver) + __cusolverDnSHgels_bufferSize = dlsym(handle, 'cusolverDnSHgels_bufferSize') + + global __cusolverDnSBgels_bufferSize + __cusolverDnSBgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSBgels_bufferSize') + if __cusolverDnSBgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSBgels_bufferSize = dlsym(handle, 'cusolverDnSBgels_bufferSize') + + global __cusolverDnSXgels_bufferSize + __cusolverDnSXgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSXgels_bufferSize') + if __cusolverDnSXgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSXgels_bufferSize = dlsym(handle, 'cusolverDnSXgels_bufferSize') + + global __cusolverDnIRSXgesv + __cusolverDnIRSXgesv = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgesv') + if __cusolverDnIRSXgesv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSXgesv = dlsym(handle, 'cusolverDnIRSXgesv') + + global __cusolverDnIRSXgesv_bufferSize + __cusolverDnIRSXgesv_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgesv_bufferSize') + if __cusolverDnIRSXgesv_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSXgesv_bufferSize = dlsym(handle, 'cusolverDnIRSXgesv_bufferSize') + + global __cusolverDnIRSXgels + __cusolverDnIRSXgels = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgels') + if __cusolverDnIRSXgels == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSXgels = dlsym(handle, 'cusolverDnIRSXgels') + + global __cusolverDnIRSXgels_bufferSize + __cusolverDnIRSXgels_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnIRSXgels_bufferSize') + if __cusolverDnIRSXgels_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnIRSXgels_bufferSize = dlsym(handle, 'cusolverDnIRSXgels_bufferSize') + + global __cusolverDnSpotrf_bufferSize + __cusolverDnSpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrf_bufferSize') + if __cusolverDnSpotrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSpotrf_bufferSize = dlsym(handle, 'cusolverDnSpotrf_bufferSize') + + global __cusolverDnDpotrf_bufferSize + __cusolverDnDpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrf_bufferSize') + if __cusolverDnDpotrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDpotrf_bufferSize = dlsym(handle, 'cusolverDnDpotrf_bufferSize') + + global __cusolverDnCpotrf_bufferSize + __cusolverDnCpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrf_bufferSize') + if __cusolverDnCpotrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCpotrf_bufferSize = dlsym(handle, 'cusolverDnCpotrf_bufferSize') + + global __cusolverDnZpotrf_bufferSize + __cusolverDnZpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrf_bufferSize') + if __cusolverDnZpotrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZpotrf_bufferSize = dlsym(handle, 'cusolverDnZpotrf_bufferSize') + + global __cusolverDnSpotrf + __cusolverDnSpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrf') + if __cusolverDnSpotrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSpotrf = dlsym(handle, 'cusolverDnSpotrf') + + global __cusolverDnDpotrf + __cusolverDnDpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrf') + if __cusolverDnDpotrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDpotrf = dlsym(handle, 
'cusolverDnDpotrf') + + global __cusolverDnCpotrf + __cusolverDnCpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrf') + if __cusolverDnCpotrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCpotrf = dlsym(handle, 'cusolverDnCpotrf') + + global __cusolverDnZpotrf + __cusolverDnZpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrf') + if __cusolverDnZpotrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZpotrf = dlsym(handle, 'cusolverDnZpotrf') + + global __cusolverDnSpotrs + __cusolverDnSpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrs') + if __cusolverDnSpotrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSpotrs = dlsym(handle, 'cusolverDnSpotrs') + + global __cusolverDnDpotrs + __cusolverDnDpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrs') + if __cusolverDnDpotrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDpotrs = dlsym(handle, 'cusolverDnDpotrs') + + global __cusolverDnCpotrs + __cusolverDnCpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrs') + if __cusolverDnCpotrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCpotrs = dlsym(handle, 'cusolverDnCpotrs') + + global __cusolverDnZpotrs + __cusolverDnZpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrs') + if __cusolverDnZpotrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZpotrs = dlsym(handle, 'cusolverDnZpotrs') + + global __cusolverDnSpotrfBatched + __cusolverDnSpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrfBatched') + if __cusolverDnSpotrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSpotrfBatched = dlsym(handle, 'cusolverDnSpotrfBatched') + + global __cusolverDnDpotrfBatched + __cusolverDnDpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrfBatched') + if __cusolverDnDpotrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDpotrfBatched = dlsym(handle, 'cusolverDnDpotrfBatched') + + global __cusolverDnCpotrfBatched + __cusolverDnCpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrfBatched') + if __cusolverDnCpotrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCpotrfBatched = dlsym(handle, 'cusolverDnCpotrfBatched') + + global __cusolverDnZpotrfBatched + __cusolverDnZpotrfBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrfBatched') + if __cusolverDnZpotrfBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZpotrfBatched = dlsym(handle, 'cusolverDnZpotrfBatched') + + global __cusolverDnSpotrsBatched + __cusolverDnSpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSpotrsBatched') + if __cusolverDnSpotrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSpotrsBatched = dlsym(handle, 'cusolverDnSpotrsBatched') + + global __cusolverDnDpotrsBatched + __cusolverDnDpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDpotrsBatched') + if __cusolverDnDpotrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDpotrsBatched = dlsym(handle, 'cusolverDnDpotrsBatched') + + global __cusolverDnCpotrsBatched + __cusolverDnCpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCpotrsBatched') + if __cusolverDnCpotrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCpotrsBatched = dlsym(handle, 'cusolverDnCpotrsBatched') + + global __cusolverDnZpotrsBatched + __cusolverDnZpotrsBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZpotrsBatched') + if 
__cusolverDnZpotrsBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZpotrsBatched = dlsym(handle, 'cusolverDnZpotrsBatched') + + global __cusolverDnSpotri_bufferSize + __cusolverDnSpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSpotri_bufferSize') + if __cusolverDnSpotri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSpotri_bufferSize = dlsym(handle, 'cusolverDnSpotri_bufferSize') + + global __cusolverDnDpotri_bufferSize + __cusolverDnDpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDpotri_bufferSize') + if __cusolverDnDpotri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDpotri_bufferSize = dlsym(handle, 'cusolverDnDpotri_bufferSize') + + global __cusolverDnCpotri_bufferSize + __cusolverDnCpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCpotri_bufferSize') + if __cusolverDnCpotri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCpotri_bufferSize = dlsym(handle, 'cusolverDnCpotri_bufferSize') + + global __cusolverDnZpotri_bufferSize + __cusolverDnZpotri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZpotri_bufferSize') + if __cusolverDnZpotri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZpotri_bufferSize = dlsym(handle, 'cusolverDnZpotri_bufferSize') + + global __cusolverDnSpotri + __cusolverDnSpotri = dlsym(RTLD_DEFAULT, 'cusolverDnSpotri') + if __cusolverDnSpotri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSpotri = dlsym(handle, 'cusolverDnSpotri') + + global __cusolverDnDpotri + __cusolverDnDpotri = dlsym(RTLD_DEFAULT, 'cusolverDnDpotri') + if __cusolverDnDpotri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDpotri = dlsym(handle, 'cusolverDnDpotri') + + global __cusolverDnCpotri + __cusolverDnCpotri = dlsym(RTLD_DEFAULT, 'cusolverDnCpotri') + if __cusolverDnCpotri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCpotri = dlsym(handle, 'cusolverDnCpotri') + + global __cusolverDnZpotri + __cusolverDnZpotri = dlsym(RTLD_DEFAULT, 'cusolverDnZpotri') + if __cusolverDnZpotri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZpotri = dlsym(handle, 'cusolverDnZpotri') + + global __cusolverDnSlauum_bufferSize + __cusolverDnSlauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSlauum_bufferSize') + if __cusolverDnSlauum_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSlauum_bufferSize = dlsym(handle, 'cusolverDnSlauum_bufferSize') + + global __cusolverDnDlauum_bufferSize + __cusolverDnDlauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDlauum_bufferSize') + if __cusolverDnDlauum_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDlauum_bufferSize = dlsym(handle, 'cusolverDnDlauum_bufferSize') + + global __cusolverDnClauum_bufferSize + __cusolverDnClauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnClauum_bufferSize') + if __cusolverDnClauum_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnClauum_bufferSize = dlsym(handle, 'cusolverDnClauum_bufferSize') + + global __cusolverDnZlauum_bufferSize + __cusolverDnZlauum_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZlauum_bufferSize') + if __cusolverDnZlauum_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZlauum_bufferSize = dlsym(handle, 
'cusolverDnZlauum_bufferSize') + + global __cusolverDnSlauum + __cusolverDnSlauum = dlsym(RTLD_DEFAULT, 'cusolverDnSlauum') + if __cusolverDnSlauum == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSlauum = dlsym(handle, 'cusolverDnSlauum') + + global __cusolverDnDlauum + __cusolverDnDlauum = dlsym(RTLD_DEFAULT, 'cusolverDnDlauum') + if __cusolverDnDlauum == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDlauum = dlsym(handle, 'cusolverDnDlauum') + + global __cusolverDnClauum + __cusolverDnClauum = dlsym(RTLD_DEFAULT, 'cusolverDnClauum') + if __cusolverDnClauum == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnClauum = dlsym(handle, 'cusolverDnClauum') + + global __cusolverDnZlauum + __cusolverDnZlauum = dlsym(RTLD_DEFAULT, 'cusolverDnZlauum') + if __cusolverDnZlauum == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZlauum = dlsym(handle, 'cusolverDnZlauum') + + global __cusolverDnSgetrf_bufferSize + __cusolverDnSgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgetrf_bufferSize') + if __cusolverDnSgetrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgetrf_bufferSize = dlsym(handle, 'cusolverDnSgetrf_bufferSize') + + global __cusolverDnDgetrf_bufferSize + __cusolverDnDgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgetrf_bufferSize') + if __cusolverDnDgetrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgetrf_bufferSize = dlsym(handle, 'cusolverDnDgetrf_bufferSize') + + global __cusolverDnCgetrf_bufferSize + __cusolverDnCgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgetrf_bufferSize') + if __cusolverDnCgetrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgetrf_bufferSize = dlsym(handle, 'cusolverDnCgetrf_bufferSize') + + global __cusolverDnZgetrf_bufferSize + __cusolverDnZgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgetrf_bufferSize') + if __cusolverDnZgetrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgetrf_bufferSize = dlsym(handle, 'cusolverDnZgetrf_bufferSize') + + global __cusolverDnSgetrf + __cusolverDnSgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnSgetrf') + if __cusolverDnSgetrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgetrf = dlsym(handle, 'cusolverDnSgetrf') + + global __cusolverDnDgetrf + __cusolverDnDgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnDgetrf') + if __cusolverDnDgetrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgetrf = dlsym(handle, 'cusolverDnDgetrf') + + global __cusolverDnCgetrf + __cusolverDnCgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnCgetrf') + if __cusolverDnCgetrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgetrf = dlsym(handle, 'cusolverDnCgetrf') + + global __cusolverDnZgetrf + __cusolverDnZgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnZgetrf') + if __cusolverDnZgetrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgetrf = dlsym(handle, 'cusolverDnZgetrf') + + global __cusolverDnSlaswp + __cusolverDnSlaswp = dlsym(RTLD_DEFAULT, 'cusolverDnSlaswp') + if __cusolverDnSlaswp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSlaswp = dlsym(handle, 'cusolverDnSlaswp') + + global __cusolverDnDlaswp + __cusolverDnDlaswp = dlsym(RTLD_DEFAULT, 'cusolverDnDlaswp') + if __cusolverDnDlaswp == NULL: + if handle 
== NULL: + handle = load_library(driver_ver) + __cusolverDnDlaswp = dlsym(handle, 'cusolverDnDlaswp') + + global __cusolverDnClaswp + __cusolverDnClaswp = dlsym(RTLD_DEFAULT, 'cusolverDnClaswp') + if __cusolverDnClaswp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnClaswp = dlsym(handle, 'cusolverDnClaswp') + + global __cusolverDnZlaswp + __cusolverDnZlaswp = dlsym(RTLD_DEFAULT, 'cusolverDnZlaswp') + if __cusolverDnZlaswp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZlaswp = dlsym(handle, 'cusolverDnZlaswp') + + global __cusolverDnSgetrs + __cusolverDnSgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnSgetrs') + if __cusolverDnSgetrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgetrs = dlsym(handle, 'cusolverDnSgetrs') + + global __cusolverDnDgetrs + __cusolverDnDgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnDgetrs') + if __cusolverDnDgetrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgetrs = dlsym(handle, 'cusolverDnDgetrs') + + global __cusolverDnCgetrs + __cusolverDnCgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnCgetrs') + if __cusolverDnCgetrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgetrs = dlsym(handle, 'cusolverDnCgetrs') + + global __cusolverDnZgetrs + __cusolverDnZgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnZgetrs') + if __cusolverDnZgetrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgetrs = dlsym(handle, 'cusolverDnZgetrs') + + global __cusolverDnSgeqrf_bufferSize + __cusolverDnSgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgeqrf_bufferSize') + if __cusolverDnSgeqrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgeqrf_bufferSize = dlsym(handle, 'cusolverDnSgeqrf_bufferSize') + + global __cusolverDnDgeqrf_bufferSize + __cusolverDnDgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgeqrf_bufferSize') + if __cusolverDnDgeqrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgeqrf_bufferSize = dlsym(handle, 'cusolverDnDgeqrf_bufferSize') + + global __cusolverDnCgeqrf_bufferSize + __cusolverDnCgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgeqrf_bufferSize') + if __cusolverDnCgeqrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgeqrf_bufferSize = dlsym(handle, 'cusolverDnCgeqrf_bufferSize') + + global __cusolverDnZgeqrf_bufferSize + __cusolverDnZgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgeqrf_bufferSize') + if __cusolverDnZgeqrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgeqrf_bufferSize = dlsym(handle, 'cusolverDnZgeqrf_bufferSize') + + global __cusolverDnSgeqrf + __cusolverDnSgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnSgeqrf') + if __cusolverDnSgeqrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgeqrf = dlsym(handle, 'cusolverDnSgeqrf') + + global __cusolverDnDgeqrf + __cusolverDnDgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnDgeqrf') + if __cusolverDnDgeqrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgeqrf = dlsym(handle, 'cusolverDnDgeqrf') + + global __cusolverDnCgeqrf + __cusolverDnCgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnCgeqrf') + if __cusolverDnCgeqrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgeqrf = dlsym(handle, 'cusolverDnCgeqrf') + + global __cusolverDnZgeqrf + __cusolverDnZgeqrf = 
dlsym(RTLD_DEFAULT, 'cusolverDnZgeqrf') + if __cusolverDnZgeqrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgeqrf = dlsym(handle, 'cusolverDnZgeqrf') + + global __cusolverDnSorgqr_bufferSize + __cusolverDnSorgqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSorgqr_bufferSize') + if __cusolverDnSorgqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSorgqr_bufferSize = dlsym(handle, 'cusolverDnSorgqr_bufferSize') + + global __cusolverDnDorgqr_bufferSize + __cusolverDnDorgqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDorgqr_bufferSize') + if __cusolverDnDorgqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDorgqr_bufferSize = dlsym(handle, 'cusolverDnDorgqr_bufferSize') + + global __cusolverDnCungqr_bufferSize + __cusolverDnCungqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCungqr_bufferSize') + if __cusolverDnCungqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCungqr_bufferSize = dlsym(handle, 'cusolverDnCungqr_bufferSize') + + global __cusolverDnZungqr_bufferSize + __cusolverDnZungqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZungqr_bufferSize') + if __cusolverDnZungqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZungqr_bufferSize = dlsym(handle, 'cusolverDnZungqr_bufferSize') + + global __cusolverDnSorgqr + __cusolverDnSorgqr = dlsym(RTLD_DEFAULT, 'cusolverDnSorgqr') + if __cusolverDnSorgqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSorgqr = dlsym(handle, 'cusolverDnSorgqr') + + global __cusolverDnDorgqr + __cusolverDnDorgqr = dlsym(RTLD_DEFAULT, 'cusolverDnDorgqr') + if __cusolverDnDorgqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDorgqr = dlsym(handle, 'cusolverDnDorgqr') + + global __cusolverDnCungqr + __cusolverDnCungqr = dlsym(RTLD_DEFAULT, 'cusolverDnCungqr') + if __cusolverDnCungqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCungqr = dlsym(handle, 'cusolverDnCungqr') + + global __cusolverDnZungqr + __cusolverDnZungqr = dlsym(RTLD_DEFAULT, 'cusolverDnZungqr') + if __cusolverDnZungqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZungqr = dlsym(handle, 'cusolverDnZungqr') + + global __cusolverDnSormqr_bufferSize + __cusolverDnSormqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSormqr_bufferSize') + if __cusolverDnSormqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSormqr_bufferSize = dlsym(handle, 'cusolverDnSormqr_bufferSize') + + global __cusolverDnDormqr_bufferSize + __cusolverDnDormqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDormqr_bufferSize') + if __cusolverDnDormqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDormqr_bufferSize = dlsym(handle, 'cusolverDnDormqr_bufferSize') + + global __cusolverDnCunmqr_bufferSize + __cusolverDnCunmqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCunmqr_bufferSize') + if __cusolverDnCunmqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCunmqr_bufferSize = dlsym(handle, 'cusolverDnCunmqr_bufferSize') + + global __cusolverDnZunmqr_bufferSize + __cusolverDnZunmqr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZunmqr_bufferSize') + if __cusolverDnZunmqr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZunmqr_bufferSize = 
dlsym(handle, 'cusolverDnZunmqr_bufferSize') + + global __cusolverDnSormqr + __cusolverDnSormqr = dlsym(RTLD_DEFAULT, 'cusolverDnSormqr') + if __cusolverDnSormqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSormqr = dlsym(handle, 'cusolverDnSormqr') + + global __cusolverDnDormqr + __cusolverDnDormqr = dlsym(RTLD_DEFAULT, 'cusolverDnDormqr') + if __cusolverDnDormqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDormqr = dlsym(handle, 'cusolverDnDormqr') + + global __cusolverDnCunmqr + __cusolverDnCunmqr = dlsym(RTLD_DEFAULT, 'cusolverDnCunmqr') + if __cusolverDnCunmqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCunmqr = dlsym(handle, 'cusolverDnCunmqr') + + global __cusolverDnZunmqr + __cusolverDnZunmqr = dlsym(RTLD_DEFAULT, 'cusolverDnZunmqr') + if __cusolverDnZunmqr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZunmqr = dlsym(handle, 'cusolverDnZunmqr') + + global __cusolverDnSsytrf_bufferSize + __cusolverDnSsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsytrf_bufferSize') + if __cusolverDnSsytrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsytrf_bufferSize = dlsym(handle, 'cusolverDnSsytrf_bufferSize') + + global __cusolverDnDsytrf_bufferSize + __cusolverDnDsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrf_bufferSize') + if __cusolverDnDsytrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsytrf_bufferSize = dlsym(handle, 'cusolverDnDsytrf_bufferSize') + + global __cusolverDnCsytrf_bufferSize + __cusolverDnCsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCsytrf_bufferSize') + if __cusolverDnCsytrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCsytrf_bufferSize = dlsym(handle, 'cusolverDnCsytrf_bufferSize') + + global __cusolverDnZsytrf_bufferSize + __cusolverDnZsytrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZsytrf_bufferSize') + if __cusolverDnZsytrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZsytrf_bufferSize = dlsym(handle, 'cusolverDnZsytrf_bufferSize') + + global __cusolverDnSsytrf + __cusolverDnSsytrf = dlsym(RTLD_DEFAULT, 'cusolverDnSsytrf') + if __cusolverDnSsytrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsytrf = dlsym(handle, 'cusolverDnSsytrf') + + global __cusolverDnDsytrf + __cusolverDnDsytrf = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrf') + if __cusolverDnDsytrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsytrf = dlsym(handle, 'cusolverDnDsytrf') + + global __cusolverDnCsytrf + __cusolverDnCsytrf = dlsym(RTLD_DEFAULT, 'cusolverDnCsytrf') + if __cusolverDnCsytrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCsytrf = dlsym(handle, 'cusolverDnCsytrf') + + global __cusolverDnZsytrf + __cusolverDnZsytrf = dlsym(RTLD_DEFAULT, 'cusolverDnZsytrf') + if __cusolverDnZsytrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZsytrf = dlsym(handle, 'cusolverDnZsytrf') + + global __cusolverDnSsytri_bufferSize + __cusolverDnSsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsytri_bufferSize') + if __cusolverDnSsytri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsytri_bufferSize = dlsym(handle, 'cusolverDnSsytri_bufferSize') + + global __cusolverDnDsytri_bufferSize + 
__cusolverDnDsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsytri_bufferSize') + if __cusolverDnDsytri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsytri_bufferSize = dlsym(handle, 'cusolverDnDsytri_bufferSize') + + global __cusolverDnCsytri_bufferSize + __cusolverDnCsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCsytri_bufferSize') + if __cusolverDnCsytri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCsytri_bufferSize = dlsym(handle, 'cusolverDnCsytri_bufferSize') + + global __cusolverDnZsytri_bufferSize + __cusolverDnZsytri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZsytri_bufferSize') + if __cusolverDnZsytri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZsytri_bufferSize = dlsym(handle, 'cusolverDnZsytri_bufferSize') + + global __cusolverDnSsytri + __cusolverDnSsytri = dlsym(RTLD_DEFAULT, 'cusolverDnSsytri') + if __cusolverDnSsytri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsytri = dlsym(handle, 'cusolverDnSsytri') + + global __cusolverDnDsytri + __cusolverDnDsytri = dlsym(RTLD_DEFAULT, 'cusolverDnDsytri') + if __cusolverDnDsytri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsytri = dlsym(handle, 'cusolverDnDsytri') + + global __cusolverDnCsytri + __cusolverDnCsytri = dlsym(RTLD_DEFAULT, 'cusolverDnCsytri') + if __cusolverDnCsytri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCsytri = dlsym(handle, 'cusolverDnCsytri') + + global __cusolverDnZsytri + __cusolverDnZsytri = dlsym(RTLD_DEFAULT, 'cusolverDnZsytri') + if __cusolverDnZsytri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZsytri = dlsym(handle, 'cusolverDnZsytri') + + global __cusolverDnSgebrd_bufferSize + __cusolverDnSgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgebrd_bufferSize') + if __cusolverDnSgebrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgebrd_bufferSize = dlsym(handle, 'cusolverDnSgebrd_bufferSize') + + global __cusolverDnDgebrd_bufferSize + __cusolverDnDgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgebrd_bufferSize') + if __cusolverDnDgebrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgebrd_bufferSize = dlsym(handle, 'cusolverDnDgebrd_bufferSize') + + global __cusolverDnCgebrd_bufferSize + __cusolverDnCgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgebrd_bufferSize') + if __cusolverDnCgebrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgebrd_bufferSize = dlsym(handle, 'cusolverDnCgebrd_bufferSize') + + global __cusolverDnZgebrd_bufferSize + __cusolverDnZgebrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgebrd_bufferSize') + if __cusolverDnZgebrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgebrd_bufferSize = dlsym(handle, 'cusolverDnZgebrd_bufferSize') + + global __cusolverDnSgebrd + __cusolverDnSgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnSgebrd') + if __cusolverDnSgebrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgebrd = dlsym(handle, 'cusolverDnSgebrd') + + global __cusolverDnDgebrd + __cusolverDnDgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnDgebrd') + if __cusolverDnDgebrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgebrd = dlsym(handle, 'cusolverDnDgebrd') + + global 
__cusolverDnCgebrd + __cusolverDnCgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnCgebrd') + if __cusolverDnCgebrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgebrd = dlsym(handle, 'cusolverDnCgebrd') + + global __cusolverDnZgebrd + __cusolverDnZgebrd = dlsym(RTLD_DEFAULT, 'cusolverDnZgebrd') + if __cusolverDnZgebrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgebrd = dlsym(handle, 'cusolverDnZgebrd') + + global __cusolverDnSorgbr_bufferSize + __cusolverDnSorgbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSorgbr_bufferSize') + if __cusolverDnSorgbr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSorgbr_bufferSize = dlsym(handle, 'cusolverDnSorgbr_bufferSize') + + global __cusolverDnDorgbr_bufferSize + __cusolverDnDorgbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDorgbr_bufferSize') + if __cusolverDnDorgbr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDorgbr_bufferSize = dlsym(handle, 'cusolverDnDorgbr_bufferSize') + + global __cusolverDnCungbr_bufferSize + __cusolverDnCungbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCungbr_bufferSize') + if __cusolverDnCungbr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCungbr_bufferSize = dlsym(handle, 'cusolverDnCungbr_bufferSize') + + global __cusolverDnZungbr_bufferSize + __cusolverDnZungbr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZungbr_bufferSize') + if __cusolverDnZungbr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZungbr_bufferSize = dlsym(handle, 'cusolverDnZungbr_bufferSize') + + global __cusolverDnSorgbr + __cusolverDnSorgbr = dlsym(RTLD_DEFAULT, 'cusolverDnSorgbr') + if __cusolverDnSorgbr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSorgbr = dlsym(handle, 'cusolverDnSorgbr') + + global __cusolverDnDorgbr + __cusolverDnDorgbr = dlsym(RTLD_DEFAULT, 'cusolverDnDorgbr') + if __cusolverDnDorgbr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDorgbr = dlsym(handle, 'cusolverDnDorgbr') + + global __cusolverDnCungbr + __cusolverDnCungbr = dlsym(RTLD_DEFAULT, 'cusolverDnCungbr') + if __cusolverDnCungbr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCungbr = dlsym(handle, 'cusolverDnCungbr') + + global __cusolverDnZungbr + __cusolverDnZungbr = dlsym(RTLD_DEFAULT, 'cusolverDnZungbr') + if __cusolverDnZungbr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZungbr = dlsym(handle, 'cusolverDnZungbr') + + global __cusolverDnSsytrd_bufferSize + __cusolverDnSsytrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsytrd_bufferSize') + if __cusolverDnSsytrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsytrd_bufferSize = dlsym(handle, 'cusolverDnSsytrd_bufferSize') + + global __cusolverDnDsytrd_bufferSize + __cusolverDnDsytrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrd_bufferSize') + if __cusolverDnDsytrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsytrd_bufferSize = dlsym(handle, 'cusolverDnDsytrd_bufferSize') + + global __cusolverDnChetrd_bufferSize + __cusolverDnChetrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChetrd_bufferSize') + if __cusolverDnChetrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChetrd_bufferSize = dlsym(handle, 
'cusolverDnChetrd_bufferSize') + + global __cusolverDnZhetrd_bufferSize + __cusolverDnZhetrd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhetrd_bufferSize') + if __cusolverDnZhetrd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhetrd_bufferSize = dlsym(handle, 'cusolverDnZhetrd_bufferSize') + + global __cusolverDnSsytrd + __cusolverDnSsytrd = dlsym(RTLD_DEFAULT, 'cusolverDnSsytrd') + if __cusolverDnSsytrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsytrd = dlsym(handle, 'cusolverDnSsytrd') + + global __cusolverDnDsytrd + __cusolverDnDsytrd = dlsym(RTLD_DEFAULT, 'cusolverDnDsytrd') + if __cusolverDnDsytrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsytrd = dlsym(handle, 'cusolverDnDsytrd') + + global __cusolverDnChetrd + __cusolverDnChetrd = dlsym(RTLD_DEFAULT, 'cusolverDnChetrd') + if __cusolverDnChetrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChetrd = dlsym(handle, 'cusolverDnChetrd') + + global __cusolverDnZhetrd + __cusolverDnZhetrd = dlsym(RTLD_DEFAULT, 'cusolverDnZhetrd') + if __cusolverDnZhetrd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhetrd = dlsym(handle, 'cusolverDnZhetrd') + + global __cusolverDnSorgtr_bufferSize + __cusolverDnSorgtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSorgtr_bufferSize') + if __cusolverDnSorgtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSorgtr_bufferSize = dlsym(handle, 'cusolverDnSorgtr_bufferSize') + + global __cusolverDnDorgtr_bufferSize + __cusolverDnDorgtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDorgtr_bufferSize') + if __cusolverDnDorgtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDorgtr_bufferSize = dlsym(handle, 'cusolverDnDorgtr_bufferSize') + + global __cusolverDnCungtr_bufferSize + __cusolverDnCungtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCungtr_bufferSize') + if __cusolverDnCungtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCungtr_bufferSize = dlsym(handle, 'cusolverDnCungtr_bufferSize') + + global __cusolverDnZungtr_bufferSize + __cusolverDnZungtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZungtr_bufferSize') + if __cusolverDnZungtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZungtr_bufferSize = dlsym(handle, 'cusolverDnZungtr_bufferSize') + + global __cusolverDnSorgtr + __cusolverDnSorgtr = dlsym(RTLD_DEFAULT, 'cusolverDnSorgtr') + if __cusolverDnSorgtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSorgtr = dlsym(handle, 'cusolverDnSorgtr') + + global __cusolverDnDorgtr + __cusolverDnDorgtr = dlsym(RTLD_DEFAULT, 'cusolverDnDorgtr') + if __cusolverDnDorgtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDorgtr = dlsym(handle, 'cusolverDnDorgtr') + + global __cusolverDnCungtr + __cusolverDnCungtr = dlsym(RTLD_DEFAULT, 'cusolverDnCungtr') + if __cusolverDnCungtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCungtr = dlsym(handle, 'cusolverDnCungtr') + + global __cusolverDnZungtr + __cusolverDnZungtr = dlsym(RTLD_DEFAULT, 'cusolverDnZungtr') + if __cusolverDnZungtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZungtr = dlsym(handle, 'cusolverDnZungtr') + + global __cusolverDnSormtr_bufferSize + __cusolverDnSormtr_bufferSize 
= dlsym(RTLD_DEFAULT, 'cusolverDnSormtr_bufferSize') + if __cusolverDnSormtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSormtr_bufferSize = dlsym(handle, 'cusolverDnSormtr_bufferSize') + + global __cusolverDnDormtr_bufferSize + __cusolverDnDormtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDormtr_bufferSize') + if __cusolverDnDormtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDormtr_bufferSize = dlsym(handle, 'cusolverDnDormtr_bufferSize') + + global __cusolverDnCunmtr_bufferSize + __cusolverDnCunmtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCunmtr_bufferSize') + if __cusolverDnCunmtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCunmtr_bufferSize = dlsym(handle, 'cusolverDnCunmtr_bufferSize') + + global __cusolverDnZunmtr_bufferSize + __cusolverDnZunmtr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZunmtr_bufferSize') + if __cusolverDnZunmtr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZunmtr_bufferSize = dlsym(handle, 'cusolverDnZunmtr_bufferSize') + + global __cusolverDnSormtr + __cusolverDnSormtr = dlsym(RTLD_DEFAULT, 'cusolverDnSormtr') + if __cusolverDnSormtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSormtr = dlsym(handle, 'cusolverDnSormtr') + + global __cusolverDnDormtr + __cusolverDnDormtr = dlsym(RTLD_DEFAULT, 'cusolverDnDormtr') + if __cusolverDnDormtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDormtr = dlsym(handle, 'cusolverDnDormtr') + + global __cusolverDnCunmtr + __cusolverDnCunmtr = dlsym(RTLD_DEFAULT, 'cusolverDnCunmtr') + if __cusolverDnCunmtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCunmtr = dlsym(handle, 'cusolverDnCunmtr') + + global __cusolverDnZunmtr + __cusolverDnZunmtr = dlsym(RTLD_DEFAULT, 'cusolverDnZunmtr') + if __cusolverDnZunmtr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZunmtr = dlsym(handle, 'cusolverDnZunmtr') + + global __cusolverDnSgesvd_bufferSize + __cusolverDnSgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvd_bufferSize') + if __cusolverDnSgesvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvd_bufferSize = dlsym(handle, 'cusolverDnSgesvd_bufferSize') + + global __cusolverDnDgesvd_bufferSize + __cusolverDnDgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvd_bufferSize') + if __cusolverDnDgesvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgesvd_bufferSize = dlsym(handle, 'cusolverDnDgesvd_bufferSize') + + global __cusolverDnCgesvd_bufferSize + __cusolverDnCgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvd_bufferSize') + if __cusolverDnCgesvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvd_bufferSize = dlsym(handle, 'cusolverDnCgesvd_bufferSize') + + global __cusolverDnZgesvd_bufferSize + __cusolverDnZgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvd_bufferSize') + if __cusolverDnZgesvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvd_bufferSize = dlsym(handle, 'cusolverDnZgesvd_bufferSize') + + global __cusolverDnSgesvd + __cusolverDnSgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvd') + if __cusolverDnSgesvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvd = dlsym(handle, 
'cusolverDnSgesvd') + + global __cusolverDnDgesvd + __cusolverDnDgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvd') + if __cusolverDnDgesvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgesvd = dlsym(handle, 'cusolverDnDgesvd') + + global __cusolverDnCgesvd + __cusolverDnCgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvd') + if __cusolverDnCgesvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvd = dlsym(handle, 'cusolverDnCgesvd') + + global __cusolverDnZgesvd + __cusolverDnZgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvd') + if __cusolverDnZgesvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvd = dlsym(handle, 'cusolverDnZgesvd') + + global __cusolverDnSsyevd_bufferSize + __cusolverDnSsyevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevd_bufferSize') + if __cusolverDnSsyevd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevd_bufferSize = dlsym(handle, 'cusolverDnSsyevd_bufferSize') + + global __cusolverDnDsyevd_bufferSize + __cusolverDnDsyevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevd_bufferSize') + if __cusolverDnDsyevd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevd_bufferSize = dlsym(handle, 'cusolverDnDsyevd_bufferSize') + + global __cusolverDnCheevd_bufferSize + __cusolverDnCheevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevd_bufferSize') + if __cusolverDnCheevd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevd_bufferSize = dlsym(handle, 'cusolverDnCheevd_bufferSize') + + global __cusolverDnZheevd_bufferSize + __cusolverDnZheevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevd_bufferSize') + if __cusolverDnZheevd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevd_bufferSize = dlsym(handle, 'cusolverDnZheevd_bufferSize') + + global __cusolverDnSsyevd + __cusolverDnSsyevd = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevd') + if __cusolverDnSsyevd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevd = dlsym(handle, 'cusolverDnSsyevd') + + global __cusolverDnDsyevd + __cusolverDnDsyevd = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevd') + if __cusolverDnDsyevd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevd = dlsym(handle, 'cusolverDnDsyevd') + + global __cusolverDnCheevd + __cusolverDnCheevd = dlsym(RTLD_DEFAULT, 'cusolverDnCheevd') + if __cusolverDnCheevd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevd = dlsym(handle, 'cusolverDnCheevd') + + global __cusolverDnZheevd + __cusolverDnZheevd = dlsym(RTLD_DEFAULT, 'cusolverDnZheevd') + if __cusolverDnZheevd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevd = dlsym(handle, 'cusolverDnZheevd') + + global __cusolverDnSsyevdx_bufferSize + __cusolverDnSsyevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevdx_bufferSize') + if __cusolverDnSsyevdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevdx_bufferSize = dlsym(handle, 'cusolverDnSsyevdx_bufferSize') + + global __cusolverDnDsyevdx_bufferSize + __cusolverDnDsyevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevdx_bufferSize') + if __cusolverDnDsyevdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevdx_bufferSize = dlsym(handle, 'cusolverDnDsyevdx_bufferSize') + + 
global __cusolverDnCheevdx_bufferSize + __cusolverDnCheevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevdx_bufferSize') + if __cusolverDnCheevdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevdx_bufferSize = dlsym(handle, 'cusolverDnCheevdx_bufferSize') + + global __cusolverDnZheevdx_bufferSize + __cusolverDnZheevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevdx_bufferSize') + if __cusolverDnZheevdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevdx_bufferSize = dlsym(handle, 'cusolverDnZheevdx_bufferSize') + + global __cusolverDnSsyevdx + __cusolverDnSsyevdx = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevdx') + if __cusolverDnSsyevdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevdx = dlsym(handle, 'cusolverDnSsyevdx') + + global __cusolverDnDsyevdx + __cusolverDnDsyevdx = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevdx') + if __cusolverDnDsyevdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevdx = dlsym(handle, 'cusolverDnDsyevdx') + + global __cusolverDnCheevdx + __cusolverDnCheevdx = dlsym(RTLD_DEFAULT, 'cusolverDnCheevdx') + if __cusolverDnCheevdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevdx = dlsym(handle, 'cusolverDnCheevdx') + + global __cusolverDnZheevdx + __cusolverDnZheevdx = dlsym(RTLD_DEFAULT, 'cusolverDnZheevdx') + if __cusolverDnZheevdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevdx = dlsym(handle, 'cusolverDnZheevdx') + + global __cusolverDnSsygvdx_bufferSize + __cusolverDnSsygvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvdx_bufferSize') + if __cusolverDnSsygvdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsygvdx_bufferSize = dlsym(handle, 'cusolverDnSsygvdx_bufferSize') + + global __cusolverDnDsygvdx_bufferSize + __cusolverDnDsygvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvdx_bufferSize') + if __cusolverDnDsygvdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsygvdx_bufferSize = dlsym(handle, 'cusolverDnDsygvdx_bufferSize') + + global __cusolverDnChegvdx_bufferSize + __cusolverDnChegvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChegvdx_bufferSize') + if __cusolverDnChegvdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChegvdx_bufferSize = dlsym(handle, 'cusolverDnChegvdx_bufferSize') + + global __cusolverDnZhegvdx_bufferSize + __cusolverDnZhegvdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvdx_bufferSize') + if __cusolverDnZhegvdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhegvdx_bufferSize = dlsym(handle, 'cusolverDnZhegvdx_bufferSize') + + global __cusolverDnSsygvdx + __cusolverDnSsygvdx = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvdx') + if __cusolverDnSsygvdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsygvdx = dlsym(handle, 'cusolverDnSsygvdx') + + global __cusolverDnDsygvdx + __cusolverDnDsygvdx = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvdx') + if __cusolverDnDsygvdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsygvdx = dlsym(handle, 'cusolverDnDsygvdx') + + global __cusolverDnChegvdx + __cusolverDnChegvdx = dlsym(RTLD_DEFAULT, 'cusolverDnChegvdx') + if __cusolverDnChegvdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChegvdx = 
dlsym(handle, 'cusolverDnChegvdx') + + global __cusolverDnZhegvdx + __cusolverDnZhegvdx = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvdx') + if __cusolverDnZhegvdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhegvdx = dlsym(handle, 'cusolverDnZhegvdx') + + global __cusolverDnSsygvd_bufferSize + __cusolverDnSsygvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvd_bufferSize') + if __cusolverDnSsygvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsygvd_bufferSize = dlsym(handle, 'cusolverDnSsygvd_bufferSize') + + global __cusolverDnDsygvd_bufferSize + __cusolverDnDsygvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvd_bufferSize') + if __cusolverDnDsygvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsygvd_bufferSize = dlsym(handle, 'cusolverDnDsygvd_bufferSize') + + global __cusolverDnChegvd_bufferSize + __cusolverDnChegvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChegvd_bufferSize') + if __cusolverDnChegvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChegvd_bufferSize = dlsym(handle, 'cusolverDnChegvd_bufferSize') + + global __cusolverDnZhegvd_bufferSize + __cusolverDnZhegvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvd_bufferSize') + if __cusolverDnZhegvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhegvd_bufferSize = dlsym(handle, 'cusolverDnZhegvd_bufferSize') + + global __cusolverDnSsygvd + __cusolverDnSsygvd = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvd') + if __cusolverDnSsygvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsygvd = dlsym(handle, 'cusolverDnSsygvd') + + global __cusolverDnDsygvd + __cusolverDnDsygvd = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvd') + if __cusolverDnDsygvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsygvd = dlsym(handle, 'cusolverDnDsygvd') + + global __cusolverDnChegvd + __cusolverDnChegvd = dlsym(RTLD_DEFAULT, 'cusolverDnChegvd') + if __cusolverDnChegvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChegvd = dlsym(handle, 'cusolverDnChegvd') + + global __cusolverDnZhegvd + __cusolverDnZhegvd = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvd') + if __cusolverDnZhegvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhegvd = dlsym(handle, 'cusolverDnZhegvd') + + global __cusolverDnCreateSyevjInfo + __cusolverDnCreateSyevjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnCreateSyevjInfo') + if __cusolverDnCreateSyevjInfo == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCreateSyevjInfo = dlsym(handle, 'cusolverDnCreateSyevjInfo') + + global __cusolverDnDestroySyevjInfo + __cusolverDnDestroySyevjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnDestroySyevjInfo') + if __cusolverDnDestroySyevjInfo == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDestroySyevjInfo = dlsym(handle, 'cusolverDnDestroySyevjInfo') + + global __cusolverDnXsyevjSetTolerance + __cusolverDnXsyevjSetTolerance = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjSetTolerance') + if __cusolverDnXsyevjSetTolerance == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevjSetTolerance = dlsym(handle, 'cusolverDnXsyevjSetTolerance') + + global __cusolverDnXsyevjSetMaxSweeps + __cusolverDnXsyevjSetMaxSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjSetMaxSweeps') + if __cusolverDnXsyevjSetMaxSweeps == NULL: + if 
handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevjSetMaxSweeps = dlsym(handle, 'cusolverDnXsyevjSetMaxSweeps') + + global __cusolverDnXsyevjSetSortEig + __cusolverDnXsyevjSetSortEig = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjSetSortEig') + if __cusolverDnXsyevjSetSortEig == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevjSetSortEig = dlsym(handle, 'cusolverDnXsyevjSetSortEig') + + global __cusolverDnXsyevjGetResidual + __cusolverDnXsyevjGetResidual = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjGetResidual') + if __cusolverDnXsyevjGetResidual == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevjGetResidual = dlsym(handle, 'cusolverDnXsyevjGetResidual') + + global __cusolverDnXsyevjGetSweeps + __cusolverDnXsyevjGetSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevjGetSweeps') + if __cusolverDnXsyevjGetSweeps == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevjGetSweeps = dlsym(handle, 'cusolverDnXsyevjGetSweeps') + + global __cusolverDnSsyevjBatched_bufferSize + __cusolverDnSsyevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevjBatched_bufferSize') + if __cusolverDnSsyevjBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevjBatched_bufferSize = dlsym(handle, 'cusolverDnSsyevjBatched_bufferSize') + + global __cusolverDnDsyevjBatched_bufferSize + __cusolverDnDsyevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevjBatched_bufferSize') + if __cusolverDnDsyevjBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevjBatched_bufferSize = dlsym(handle, 'cusolverDnDsyevjBatched_bufferSize') + + global __cusolverDnCheevjBatched_bufferSize + __cusolverDnCheevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevjBatched_bufferSize') + if __cusolverDnCheevjBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevjBatched_bufferSize = dlsym(handle, 'cusolverDnCheevjBatched_bufferSize') + + global __cusolverDnZheevjBatched_bufferSize + __cusolverDnZheevjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevjBatched_bufferSize') + if __cusolverDnZheevjBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevjBatched_bufferSize = dlsym(handle, 'cusolverDnZheevjBatched_bufferSize') + + global __cusolverDnSsyevjBatched + __cusolverDnSsyevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevjBatched') + if __cusolverDnSsyevjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevjBatched = dlsym(handle, 'cusolverDnSsyevjBatched') + + global __cusolverDnDsyevjBatched + __cusolverDnDsyevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevjBatched') + if __cusolverDnDsyevjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevjBatched = dlsym(handle, 'cusolverDnDsyevjBatched') + + global __cusolverDnCheevjBatched + __cusolverDnCheevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCheevjBatched') + if __cusolverDnCheevjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevjBatched = dlsym(handle, 'cusolverDnCheevjBatched') + + global __cusolverDnZheevjBatched + __cusolverDnZheevjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZheevjBatched') + if __cusolverDnZheevjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevjBatched = dlsym(handle, 'cusolverDnZheevjBatched') + 
+ global __cusolverDnSsyevj_bufferSize + __cusolverDnSsyevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevj_bufferSize') + if __cusolverDnSsyevj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevj_bufferSize = dlsym(handle, 'cusolverDnSsyevj_bufferSize') + + global __cusolverDnDsyevj_bufferSize + __cusolverDnDsyevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevj_bufferSize') + if __cusolverDnDsyevj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevj_bufferSize = dlsym(handle, 'cusolverDnDsyevj_bufferSize') + + global __cusolverDnCheevj_bufferSize + __cusolverDnCheevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCheevj_bufferSize') + if __cusolverDnCheevj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevj_bufferSize = dlsym(handle, 'cusolverDnCheevj_bufferSize') + + global __cusolverDnZheevj_bufferSize + __cusolverDnZheevj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZheevj_bufferSize') + if __cusolverDnZheevj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevj_bufferSize = dlsym(handle, 'cusolverDnZheevj_bufferSize') + + global __cusolverDnSsyevj + __cusolverDnSsyevj = dlsym(RTLD_DEFAULT, 'cusolverDnSsyevj') + if __cusolverDnSsyevj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsyevj = dlsym(handle, 'cusolverDnSsyevj') + + global __cusolverDnDsyevj + __cusolverDnDsyevj = dlsym(RTLD_DEFAULT, 'cusolverDnDsyevj') + if __cusolverDnDsyevj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsyevj = dlsym(handle, 'cusolverDnDsyevj') + + global __cusolverDnCheevj + __cusolverDnCheevj = dlsym(RTLD_DEFAULT, 'cusolverDnCheevj') + if __cusolverDnCheevj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCheevj = dlsym(handle, 'cusolverDnCheevj') + + global __cusolverDnZheevj + __cusolverDnZheevj = dlsym(RTLD_DEFAULT, 'cusolverDnZheevj') + if __cusolverDnZheevj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZheevj = dlsym(handle, 'cusolverDnZheevj') + + global __cusolverDnSsygvj_bufferSize + __cusolverDnSsygvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvj_bufferSize') + if __cusolverDnSsygvj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSsygvj_bufferSize = dlsym(handle, 'cusolverDnSsygvj_bufferSize') + + global __cusolverDnDsygvj_bufferSize + __cusolverDnDsygvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvj_bufferSize') + if __cusolverDnDsygvj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsygvj_bufferSize = dlsym(handle, 'cusolverDnDsygvj_bufferSize') + + global __cusolverDnChegvj_bufferSize + __cusolverDnChegvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnChegvj_bufferSize') + if __cusolverDnChegvj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChegvj_bufferSize = dlsym(handle, 'cusolverDnChegvj_bufferSize') + + global __cusolverDnZhegvj_bufferSize + __cusolverDnZhegvj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvj_bufferSize') + if __cusolverDnZhegvj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhegvj_bufferSize = dlsym(handle, 'cusolverDnZhegvj_bufferSize') + + global __cusolverDnSsygvj + __cusolverDnSsygvj = dlsym(RTLD_DEFAULT, 'cusolverDnSsygvj') + if __cusolverDnSsygvj == NULL: + if handle == NULL: + 
handle = load_library(driver_ver) + __cusolverDnSsygvj = dlsym(handle, 'cusolverDnSsygvj') + + global __cusolverDnDsygvj + __cusolverDnDsygvj = dlsym(RTLD_DEFAULT, 'cusolverDnDsygvj') + if __cusolverDnDsygvj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDsygvj = dlsym(handle, 'cusolverDnDsygvj') + + global __cusolverDnChegvj + __cusolverDnChegvj = dlsym(RTLD_DEFAULT, 'cusolverDnChegvj') + if __cusolverDnChegvj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnChegvj = dlsym(handle, 'cusolverDnChegvj') + + global __cusolverDnZhegvj + __cusolverDnZhegvj = dlsym(RTLD_DEFAULT, 'cusolverDnZhegvj') + if __cusolverDnZhegvj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZhegvj = dlsym(handle, 'cusolverDnZhegvj') + + global __cusolverDnCreateGesvdjInfo + __cusolverDnCreateGesvdjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnCreateGesvdjInfo') + if __cusolverDnCreateGesvdjInfo == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCreateGesvdjInfo = dlsym(handle, 'cusolverDnCreateGesvdjInfo') + + global __cusolverDnDestroyGesvdjInfo + __cusolverDnDestroyGesvdjInfo = dlsym(RTLD_DEFAULT, 'cusolverDnDestroyGesvdjInfo') + if __cusolverDnDestroyGesvdjInfo == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDestroyGesvdjInfo = dlsym(handle, 'cusolverDnDestroyGesvdjInfo') + + global __cusolverDnXgesvdjSetTolerance + __cusolverDnXgesvdjSetTolerance = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjSetTolerance') + if __cusolverDnXgesvdjSetTolerance == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdjSetTolerance = dlsym(handle, 'cusolverDnXgesvdjSetTolerance') + + global __cusolverDnXgesvdjSetMaxSweeps + __cusolverDnXgesvdjSetMaxSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjSetMaxSweeps') + if __cusolverDnXgesvdjSetMaxSweeps == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdjSetMaxSweeps = dlsym(handle, 'cusolverDnXgesvdjSetMaxSweeps') + + global __cusolverDnXgesvdjSetSortEig + __cusolverDnXgesvdjSetSortEig = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjSetSortEig') + if __cusolverDnXgesvdjSetSortEig == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdjSetSortEig = dlsym(handle, 'cusolverDnXgesvdjSetSortEig') + + global __cusolverDnXgesvdjGetResidual + __cusolverDnXgesvdjGetResidual = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjGetResidual') + if __cusolverDnXgesvdjGetResidual == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdjGetResidual = dlsym(handle, 'cusolverDnXgesvdjGetResidual') + + global __cusolverDnXgesvdjGetSweeps + __cusolverDnXgesvdjGetSweeps = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdjGetSweeps') + if __cusolverDnXgesvdjGetSweeps == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdjGetSweeps = dlsym(handle, 'cusolverDnXgesvdjGetSweeps') + + global __cusolverDnSgesvdjBatched_bufferSize + __cusolverDnSgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdjBatched_bufferSize') + if __cusolverDnSgesvdjBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnSgesvdjBatched_bufferSize') + + global __cusolverDnDgesvdjBatched_bufferSize + __cusolverDnDgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdjBatched_bufferSize') + if __cusolverDnDgesvdjBatched_bufferSize == NULL: + if handle == NULL: + 
handle = load_library(driver_ver) + __cusolverDnDgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnDgesvdjBatched_bufferSize') + + global __cusolverDnCgesvdjBatched_bufferSize + __cusolverDnCgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdjBatched_bufferSize') + if __cusolverDnCgesvdjBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnCgesvdjBatched_bufferSize') + + global __cusolverDnZgesvdjBatched_bufferSize + __cusolverDnZgesvdjBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdjBatched_bufferSize') + if __cusolverDnZgesvdjBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvdjBatched_bufferSize = dlsym(handle, 'cusolverDnZgesvdjBatched_bufferSize') + + global __cusolverDnSgesvdjBatched + __cusolverDnSgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdjBatched') + if __cusolverDnSgesvdjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvdjBatched = dlsym(handle, 'cusolverDnSgesvdjBatched') + + global __cusolverDnDgesvdjBatched + __cusolverDnDgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdjBatched') + if __cusolverDnDgesvdjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgesvdjBatched = dlsym(handle, 'cusolverDnDgesvdjBatched') + + global __cusolverDnCgesvdjBatched + __cusolverDnCgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdjBatched') + if __cusolverDnCgesvdjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvdjBatched = dlsym(handle, 'cusolverDnCgesvdjBatched') + + global __cusolverDnZgesvdjBatched + __cusolverDnZgesvdjBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdjBatched') + if __cusolverDnZgesvdjBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvdjBatched = dlsym(handle, 'cusolverDnZgesvdjBatched') + + global __cusolverDnSgesvdj_bufferSize + __cusolverDnSgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdj_bufferSize') + if __cusolverDnSgesvdj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvdj_bufferSize = dlsym(handle, 'cusolverDnSgesvdj_bufferSize') + + global __cusolverDnDgesvdj_bufferSize + __cusolverDnDgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdj_bufferSize') + if __cusolverDnDgesvdj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgesvdj_bufferSize = dlsym(handle, 'cusolverDnDgesvdj_bufferSize') + + global __cusolverDnCgesvdj_bufferSize + __cusolverDnCgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdj_bufferSize') + if __cusolverDnCgesvdj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvdj_bufferSize = dlsym(handle, 'cusolverDnCgesvdj_bufferSize') + + global __cusolverDnZgesvdj_bufferSize + __cusolverDnZgesvdj_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdj_bufferSize') + if __cusolverDnZgesvdj_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvdj_bufferSize = dlsym(handle, 'cusolverDnZgesvdj_bufferSize') + + global __cusolverDnSgesvdj + __cusolverDnSgesvdj = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdj') + if __cusolverDnSgesvdj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvdj = dlsym(handle, 'cusolverDnSgesvdj') + + global __cusolverDnDgesvdj + __cusolverDnDgesvdj = 
dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdj') + if __cusolverDnDgesvdj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgesvdj = dlsym(handle, 'cusolverDnDgesvdj') + + global __cusolverDnCgesvdj + __cusolverDnCgesvdj = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdj') + if __cusolverDnCgesvdj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvdj = dlsym(handle, 'cusolverDnCgesvdj') + + global __cusolverDnZgesvdj + __cusolverDnZgesvdj = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdj') + if __cusolverDnZgesvdj == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvdj = dlsym(handle, 'cusolverDnZgesvdj') + + global __cusolverDnSgesvdaStridedBatched_bufferSize + __cusolverDnSgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdaStridedBatched_bufferSize') + if __cusolverDnSgesvdaStridedBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvdaStridedBatched_bufferSize = dlsym(handle, 'cusolverDnSgesvdaStridedBatched_bufferSize') + + global __cusolverDnDgesvdaStridedBatched_bufferSize + __cusolverDnDgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdaStridedBatched_bufferSize') + if __cusolverDnDgesvdaStridedBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgesvdaStridedBatched_bufferSize = dlsym(handle, 'cusolverDnDgesvdaStridedBatched_bufferSize') + + global __cusolverDnCgesvdaStridedBatched_bufferSize + __cusolverDnCgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdaStridedBatched_bufferSize') + if __cusolverDnCgesvdaStridedBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvdaStridedBatched_bufferSize = dlsym(handle, 'cusolverDnCgesvdaStridedBatched_bufferSize') + + global __cusolverDnZgesvdaStridedBatched_bufferSize + __cusolverDnZgesvdaStridedBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdaStridedBatched_bufferSize') + if __cusolverDnZgesvdaStridedBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvdaStridedBatched_bufferSize = dlsym(handle, 'cusolverDnZgesvdaStridedBatched_bufferSize') + + global __cusolverDnSgesvdaStridedBatched + __cusolverDnSgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnSgesvdaStridedBatched') + if __cusolverDnSgesvdaStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSgesvdaStridedBatched = dlsym(handle, 'cusolverDnSgesvdaStridedBatched') + + global __cusolverDnDgesvdaStridedBatched + __cusolverDnDgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnDgesvdaStridedBatched') + if __cusolverDnDgesvdaStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDgesvdaStridedBatched = dlsym(handle, 'cusolverDnDgesvdaStridedBatched') + + global __cusolverDnCgesvdaStridedBatched + __cusolverDnCgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnCgesvdaStridedBatched') + if __cusolverDnCgesvdaStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCgesvdaStridedBatched = dlsym(handle, 'cusolverDnCgesvdaStridedBatched') + + global __cusolverDnZgesvdaStridedBatched + __cusolverDnZgesvdaStridedBatched = dlsym(RTLD_DEFAULT, 'cusolverDnZgesvdaStridedBatched') + if __cusolverDnZgesvdaStridedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnZgesvdaStridedBatched = dlsym(handle, 
'cusolverDnZgesvdaStridedBatched') + + global __cusolverDnCreateParams + __cusolverDnCreateParams = dlsym(RTLD_DEFAULT, 'cusolverDnCreateParams') + if __cusolverDnCreateParams == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnCreateParams = dlsym(handle, 'cusolverDnCreateParams') + + global __cusolverDnDestroyParams + __cusolverDnDestroyParams = dlsym(RTLD_DEFAULT, 'cusolverDnDestroyParams') + if __cusolverDnDestroyParams == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnDestroyParams = dlsym(handle, 'cusolverDnDestroyParams') + + global __cusolverDnSetAdvOptions + __cusolverDnSetAdvOptions = dlsym(RTLD_DEFAULT, 'cusolverDnSetAdvOptions') + if __cusolverDnSetAdvOptions == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSetAdvOptions = dlsym(handle, 'cusolverDnSetAdvOptions') + + global __cusolverDnXpotrf_bufferSize + __cusolverDnXpotrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXpotrf_bufferSize') + if __cusolverDnXpotrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXpotrf_bufferSize = dlsym(handle, 'cusolverDnXpotrf_bufferSize') + + global __cusolverDnXpotrf + __cusolverDnXpotrf = dlsym(RTLD_DEFAULT, 'cusolverDnXpotrf') + if __cusolverDnXpotrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXpotrf = dlsym(handle, 'cusolverDnXpotrf') + + global __cusolverDnXpotrs + __cusolverDnXpotrs = dlsym(RTLD_DEFAULT, 'cusolverDnXpotrs') + if __cusolverDnXpotrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXpotrs = dlsym(handle, 'cusolverDnXpotrs') + + global __cusolverDnXgeqrf_bufferSize + __cusolverDnXgeqrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgeqrf_bufferSize') + if __cusolverDnXgeqrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgeqrf_bufferSize = dlsym(handle, 'cusolverDnXgeqrf_bufferSize') + + global __cusolverDnXgeqrf + __cusolverDnXgeqrf = dlsym(RTLD_DEFAULT, 'cusolverDnXgeqrf') + if __cusolverDnXgeqrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgeqrf = dlsym(handle, 'cusolverDnXgeqrf') + + global __cusolverDnXgetrf_bufferSize + __cusolverDnXgetrf_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgetrf_bufferSize') + if __cusolverDnXgetrf_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgetrf_bufferSize = dlsym(handle, 'cusolverDnXgetrf_bufferSize') + + global __cusolverDnXgetrf + __cusolverDnXgetrf = dlsym(RTLD_DEFAULT, 'cusolverDnXgetrf') + if __cusolverDnXgetrf == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgetrf = dlsym(handle, 'cusolverDnXgetrf') + + global __cusolverDnXgetrs + __cusolverDnXgetrs = dlsym(RTLD_DEFAULT, 'cusolverDnXgetrs') + if __cusolverDnXgetrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgetrs = dlsym(handle, 'cusolverDnXgetrs') + + global __cusolverDnXsyevd_bufferSize + __cusolverDnXsyevd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevd_bufferSize') + if __cusolverDnXsyevd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevd_bufferSize = dlsym(handle, 'cusolverDnXsyevd_bufferSize') + + global __cusolverDnXsyevd + __cusolverDnXsyevd = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevd') + if __cusolverDnXsyevd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevd = dlsym(handle, 'cusolverDnXsyevd') + + global 
__cusolverDnXsyevdx_bufferSize + __cusolverDnXsyevdx_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevdx_bufferSize') + if __cusolverDnXsyevdx_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevdx_bufferSize = dlsym(handle, 'cusolverDnXsyevdx_bufferSize') + + global __cusolverDnXsyevdx + __cusolverDnXsyevdx = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevdx') + if __cusolverDnXsyevdx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevdx = dlsym(handle, 'cusolverDnXsyevdx') + + global __cusolverDnXgesvd_bufferSize + __cusolverDnXgesvd_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvd_bufferSize') + if __cusolverDnXgesvd_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvd_bufferSize = dlsym(handle, 'cusolverDnXgesvd_bufferSize') + + global __cusolverDnXgesvd + __cusolverDnXgesvd = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvd') + if __cusolverDnXgesvd == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvd = dlsym(handle, 'cusolverDnXgesvd') + + global __cusolverDnXgesvdp_bufferSize + __cusolverDnXgesvdp_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdp_bufferSize') + if __cusolverDnXgesvdp_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdp_bufferSize = dlsym(handle, 'cusolverDnXgesvdp_bufferSize') + + global __cusolverDnXgesvdp + __cusolverDnXgesvdp = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdp') + if __cusolverDnXgesvdp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdp = dlsym(handle, 'cusolverDnXgesvdp') + + global __cusolverDnXgesvdr_bufferSize + __cusolverDnXgesvdr_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdr_bufferSize') + if __cusolverDnXgesvdr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdr_bufferSize = dlsym(handle, 'cusolverDnXgesvdr_bufferSize') + + global __cusolverDnXgesvdr + __cusolverDnXgesvdr = dlsym(RTLD_DEFAULT, 'cusolverDnXgesvdr') + if __cusolverDnXgesvdr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgesvdr = dlsym(handle, 'cusolverDnXgesvdr') + + global __cusolverDnXsytrs_bufferSize + __cusolverDnXsytrs_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsytrs_bufferSize') + if __cusolverDnXsytrs_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsytrs_bufferSize = dlsym(handle, 'cusolverDnXsytrs_bufferSize') + + global __cusolverDnXsytrs + __cusolverDnXsytrs = dlsym(RTLD_DEFAULT, 'cusolverDnXsytrs') + if __cusolverDnXsytrs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsytrs = dlsym(handle, 'cusolverDnXsytrs') + + global __cusolverDnXtrtri_bufferSize + __cusolverDnXtrtri_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXtrtri_bufferSize') + if __cusolverDnXtrtri_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXtrtri_bufferSize = dlsym(handle, 'cusolverDnXtrtri_bufferSize') + + global __cusolverDnXtrtri + __cusolverDnXtrtri = dlsym(RTLD_DEFAULT, 'cusolverDnXtrtri') + if __cusolverDnXtrtri == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXtrtri = dlsym(handle, 'cusolverDnXtrtri') + + global __cusolverDnLoggerSetCallback + __cusolverDnLoggerSetCallback = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetCallback') + if __cusolverDnLoggerSetCallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__cusolverDnLoggerSetCallback = dlsym(handle, 'cusolverDnLoggerSetCallback') + + global __cusolverDnLoggerSetFile + __cusolverDnLoggerSetFile = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetFile') + if __cusolverDnLoggerSetFile == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnLoggerSetFile = dlsym(handle, 'cusolverDnLoggerSetFile') + + global __cusolverDnLoggerOpenFile + __cusolverDnLoggerOpenFile = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerOpenFile') + if __cusolverDnLoggerOpenFile == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnLoggerOpenFile = dlsym(handle, 'cusolverDnLoggerOpenFile') + + global __cusolverDnLoggerSetLevel + __cusolverDnLoggerSetLevel = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetLevel') + if __cusolverDnLoggerSetLevel == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnLoggerSetLevel = dlsym(handle, 'cusolverDnLoggerSetLevel') + + global __cusolverDnLoggerSetMask + __cusolverDnLoggerSetMask = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerSetMask') + if __cusolverDnLoggerSetMask == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnLoggerSetMask = dlsym(handle, 'cusolverDnLoggerSetMask') + + global __cusolverDnLoggerForceDisable + __cusolverDnLoggerForceDisable = dlsym(RTLD_DEFAULT, 'cusolverDnLoggerForceDisable') + if __cusolverDnLoggerForceDisable == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnLoggerForceDisable = dlsym(handle, 'cusolverDnLoggerForceDisable') + + global __cusolverDnSetDeterministicMode + __cusolverDnSetDeterministicMode = dlsym(RTLD_DEFAULT, 'cusolverDnSetDeterministicMode') + if __cusolverDnSetDeterministicMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnSetDeterministicMode = dlsym(handle, 'cusolverDnSetDeterministicMode') + + global __cusolverDnGetDeterministicMode + __cusolverDnGetDeterministicMode = dlsym(RTLD_DEFAULT, 'cusolverDnGetDeterministicMode') + if __cusolverDnGetDeterministicMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnGetDeterministicMode = dlsym(handle, 'cusolverDnGetDeterministicMode') + + global __cusolverDnXlarft_bufferSize + __cusolverDnXlarft_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXlarft_bufferSize') + if __cusolverDnXlarft_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXlarft_bufferSize = dlsym(handle, 'cusolverDnXlarft_bufferSize') + + global __cusolverDnXlarft + __cusolverDnXlarft = dlsym(RTLD_DEFAULT, 'cusolverDnXlarft') + if __cusolverDnXlarft == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXlarft = dlsym(handle, 'cusolverDnXlarft') + + global __cusolverDnXsyevBatched_bufferSize + __cusolverDnXsyevBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevBatched_bufferSize') + if __cusolverDnXsyevBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevBatched_bufferSize = dlsym(handle, 'cusolverDnXsyevBatched_bufferSize') + + global __cusolverDnXsyevBatched + __cusolverDnXsyevBatched = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevBatched') + if __cusolverDnXsyevBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevBatched = dlsym(handle, 'cusolverDnXsyevBatched') + + global __cusolverDnXgeev_bufferSize + __cusolverDnXgeev_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgeev_bufferSize') + if __cusolverDnXgeev_bufferSize == NULL: + if handle == NULL: + handle = 
load_library(driver_ver)
+        __cusolverDnXgeev_bufferSize = dlsym(handle, 'cusolverDnXgeev_bufferSize')
+
+    global __cusolverDnXgeev
+    __cusolverDnXgeev = dlsym(RTLD_DEFAULT, 'cusolverDnXgeev')
+    if __cusolverDnXgeev == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cusolverDnXgeev = dlsym(handle, 'cusolverDnXgeev')
+
+    __py_cusolverDn_init = True
+    return 0
 
 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/cusolverDn_windows.pyx b/nvmath/bindings/_internal/cusolverDn_windows.pyx
index ef82e19..ad52b0e 100644
--- a/nvmath/bindings/_internal/cusolverDn_windows.pyx
+++ b/nvmath/bindings/_internal/cusolverDn_windows.pyx
@@ -11,20 +11,77 @@
 from .cusparse cimport load_library as load_cusparse
 
 import os
 import site
-
-import win32api
+import threading
 
 from .utils import FunctionNotFoundError, NotSupportedError
 from cuda.pathfinder import load_nvidia_dynamic_lib
+from libc.stddef cimport wchar_t
+from libc.stdint cimport uintptr_t
+from cpython cimport PyUnicode_AsWideCharString, PyMem_Free
+
+from .utils import NotSupportedError
+
+cdef extern from "windows.h" nogil:
+    ctypedef void* HMODULE
+    ctypedef void* HANDLE
+    ctypedef void* FARPROC
+    ctypedef unsigned long DWORD
+    ctypedef const wchar_t *LPCWSTR
+    ctypedef const char *LPCSTR
+
+    cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+    cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+    cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+
+    HMODULE _LoadLibraryExW "LoadLibraryExW"(
+        LPCWSTR lpLibFileName,
+        HANDLE hFile,
+        DWORD dwFlags
+    )
+
+    FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName)
+
+cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags):
+    cdef uintptr_t result
+    cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL)
+    with nogil:
+        result = _LoadLibraryExW(
+            wpath,
+            hFile,
+            dwFlags
+        )
+    PyMem_Free(wpath)
+    return result
+
+cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil:
+    return _GetProcAddress(hModule, lpProcName)
+
+cdef int get_cuda_version():
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32)
+    if handle == 0:
+        raise NotSupportedError('CUDA driver is not found')
+    cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion')
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('something went wrong')
+    err = (cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError('something went wrong')
+
+    return driver_ver
+
+
+
 ###############################################################################
 # Wrapper init
 ###############################################################################
 
-LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+cdef object __symbol_lock = threading.Lock()
 cdef bint __py_cusolverDn_init = False
-cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __cusolverDnCreate = NULL
 cdef void* __cusolverDnDestroy = NULL
@@ -414,2266 +471,1134 @@ cdef int _check_or_init_cusolverDn() except -1 nogil:
     if __py_cusolverDn_init:
         return 0
 
-    cdef int err, driver_ver
-    with gil:
-        # Load driver to check version
-        try:
-            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
-        except Exception as e:
-            raise NotSupportedError(f'CUDA driver is not found ({e})')
-        global __cuDriverGetVersion
-        if __cuDriverGetVersion == NULL:
-            __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion')
-            if __cuDriverGetVersion == NULL:
-                raise
RuntimeError('something went wrong') - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - raise RuntimeError('something went wrong') + with gil, __symbol_lock: + driver_ver = get_cuda_version() # Load library handle = load_library(driver_ver) # Load function global __cusolverDnCreate - try: - __cusolverDnCreate = win32api.GetProcAddress(handle, 'cusolverDnCreate') - except: - pass + __cusolverDnCreate = GetProcAddress(handle, 'cusolverDnCreate') global __cusolverDnDestroy - try: - __cusolverDnDestroy = win32api.GetProcAddress(handle, 'cusolverDnDestroy') - except: - pass + __cusolverDnDestroy = GetProcAddress(handle, 'cusolverDnDestroy') global __cusolverDnSetStream - try: - __cusolverDnSetStream = win32api.GetProcAddress(handle, 'cusolverDnSetStream') - except: - pass + __cusolverDnSetStream = GetProcAddress(handle, 'cusolverDnSetStream') global __cusolverDnGetStream - try: - __cusolverDnGetStream = win32api.GetProcAddress(handle, 'cusolverDnGetStream') - except: - pass + __cusolverDnGetStream = GetProcAddress(handle, 'cusolverDnGetStream') global __cusolverDnIRSParamsCreate - try: - __cusolverDnIRSParamsCreate = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsCreate') - except: - pass + __cusolverDnIRSParamsCreate = GetProcAddress(handle, 'cusolverDnIRSParamsCreate') global __cusolverDnIRSParamsDestroy - try: - __cusolverDnIRSParamsDestroy = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsDestroy') - except: - pass + __cusolverDnIRSParamsDestroy = GetProcAddress(handle, 'cusolverDnIRSParamsDestroy') global __cusolverDnIRSParamsSetRefinementSolver - try: - __cusolverDnIRSParamsSetRefinementSolver = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetRefinementSolver') - except: - pass + __cusolverDnIRSParamsSetRefinementSolver = GetProcAddress(handle, 'cusolverDnIRSParamsSetRefinementSolver') global __cusolverDnIRSParamsSetSolverMainPrecision - try: - __cusolverDnIRSParamsSetSolverMainPrecision = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetSolverMainPrecision') - except: - pass + __cusolverDnIRSParamsSetSolverMainPrecision = GetProcAddress(handle, 'cusolverDnIRSParamsSetSolverMainPrecision') global __cusolverDnIRSParamsSetSolverLowestPrecision - try: - __cusolverDnIRSParamsSetSolverLowestPrecision = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetSolverLowestPrecision') - except: - pass + __cusolverDnIRSParamsSetSolverLowestPrecision = GetProcAddress(handle, 'cusolverDnIRSParamsSetSolverLowestPrecision') global __cusolverDnIRSParamsSetSolverPrecisions - try: - __cusolverDnIRSParamsSetSolverPrecisions = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetSolverPrecisions') - except: - pass + __cusolverDnIRSParamsSetSolverPrecisions = GetProcAddress(handle, 'cusolverDnIRSParamsSetSolverPrecisions') global __cusolverDnIRSParamsSetTol - try: - __cusolverDnIRSParamsSetTol = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetTol') - except: - pass + __cusolverDnIRSParamsSetTol = GetProcAddress(handle, 'cusolverDnIRSParamsSetTol') global __cusolverDnIRSParamsSetTolInner - try: - __cusolverDnIRSParamsSetTolInner = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetTolInner') - except: - pass + __cusolverDnIRSParamsSetTolInner = GetProcAddress(handle, 'cusolverDnIRSParamsSetTolInner') global __cusolverDnIRSParamsSetMaxIters - try: - __cusolverDnIRSParamsSetMaxIters = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetMaxIters') - except: - pass + __cusolverDnIRSParamsSetMaxIters = GetProcAddress(handle, 'cusolverDnIRSParamsSetMaxIters') global 
__cusolverDnIRSParamsSetMaxItersInner - try: - __cusolverDnIRSParamsSetMaxItersInner = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsSetMaxItersInner') - except: - pass + __cusolverDnIRSParamsSetMaxItersInner = GetProcAddress(handle, 'cusolverDnIRSParamsSetMaxItersInner') global __cusolverDnIRSParamsGetMaxIters - try: - __cusolverDnIRSParamsGetMaxIters = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsGetMaxIters') - except: - pass + __cusolverDnIRSParamsGetMaxIters = GetProcAddress(handle, 'cusolverDnIRSParamsGetMaxIters') global __cusolverDnIRSParamsEnableFallback - try: - __cusolverDnIRSParamsEnableFallback = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsEnableFallback') - except: - pass + __cusolverDnIRSParamsEnableFallback = GetProcAddress(handle, 'cusolverDnIRSParamsEnableFallback') global __cusolverDnIRSParamsDisableFallback - try: - __cusolverDnIRSParamsDisableFallback = win32api.GetProcAddress(handle, 'cusolverDnIRSParamsDisableFallback') - except: - pass + __cusolverDnIRSParamsDisableFallback = GetProcAddress(handle, 'cusolverDnIRSParamsDisableFallback') global __cusolverDnIRSInfosDestroy - try: - __cusolverDnIRSInfosDestroy = win32api.GetProcAddress(handle, 'cusolverDnIRSInfosDestroy') - except: - pass + __cusolverDnIRSInfosDestroy = GetProcAddress(handle, 'cusolverDnIRSInfosDestroy') global __cusolverDnIRSInfosCreate - try: - __cusolverDnIRSInfosCreate = win32api.GetProcAddress(handle, 'cusolverDnIRSInfosCreate') - except: - pass + __cusolverDnIRSInfosCreate = GetProcAddress(handle, 'cusolverDnIRSInfosCreate') global __cusolverDnIRSInfosGetNiters - try: - __cusolverDnIRSInfosGetNiters = win32api.GetProcAddress(handle, 'cusolverDnIRSInfosGetNiters') - except: - pass + __cusolverDnIRSInfosGetNiters = GetProcAddress(handle, 'cusolverDnIRSInfosGetNiters') global __cusolverDnIRSInfosGetOuterNiters - try: - __cusolverDnIRSInfosGetOuterNiters = win32api.GetProcAddress(handle, 'cusolverDnIRSInfosGetOuterNiters') - except: - pass + __cusolverDnIRSInfosGetOuterNiters = GetProcAddress(handle, 'cusolverDnIRSInfosGetOuterNiters') global __cusolverDnIRSInfosRequestResidual - try: - __cusolverDnIRSInfosRequestResidual = win32api.GetProcAddress(handle, 'cusolverDnIRSInfosRequestResidual') - except: - pass + __cusolverDnIRSInfosRequestResidual = GetProcAddress(handle, 'cusolverDnIRSInfosRequestResidual') global __cusolverDnIRSInfosGetResidualHistory - try: - __cusolverDnIRSInfosGetResidualHistory = win32api.GetProcAddress(handle, 'cusolverDnIRSInfosGetResidualHistory') - except: - pass + __cusolverDnIRSInfosGetResidualHistory = GetProcAddress(handle, 'cusolverDnIRSInfosGetResidualHistory') global __cusolverDnIRSInfosGetMaxIters - try: - __cusolverDnIRSInfosGetMaxIters = win32api.GetProcAddress(handle, 'cusolverDnIRSInfosGetMaxIters') - except: - pass + __cusolverDnIRSInfosGetMaxIters = GetProcAddress(handle, 'cusolverDnIRSInfosGetMaxIters') global __cusolverDnZZgesv - try: - __cusolverDnZZgesv = win32api.GetProcAddress(handle, 'cusolverDnZZgesv') - except: - pass + __cusolverDnZZgesv = GetProcAddress(handle, 'cusolverDnZZgesv') global __cusolverDnZCgesv - try: - __cusolverDnZCgesv = win32api.GetProcAddress(handle, 'cusolverDnZCgesv') - except: - pass + __cusolverDnZCgesv = GetProcAddress(handle, 'cusolverDnZCgesv') global __cusolverDnZKgesv - try: - __cusolverDnZKgesv = win32api.GetProcAddress(handle, 'cusolverDnZKgesv') - except: - pass + __cusolverDnZKgesv = GetProcAddress(handle, 'cusolverDnZKgesv') global __cusolverDnZEgesv - try: - __cusolverDnZEgesv = 
win32api.GetProcAddress(handle, 'cusolverDnZEgesv') - except: - pass + __cusolverDnZEgesv = GetProcAddress(handle, 'cusolverDnZEgesv') global __cusolverDnZYgesv - try: - __cusolverDnZYgesv = win32api.GetProcAddress(handle, 'cusolverDnZYgesv') - except: - pass + __cusolverDnZYgesv = GetProcAddress(handle, 'cusolverDnZYgesv') global __cusolverDnCCgesv - try: - __cusolverDnCCgesv = win32api.GetProcAddress(handle, 'cusolverDnCCgesv') - except: - pass + __cusolverDnCCgesv = GetProcAddress(handle, 'cusolverDnCCgesv') global __cusolverDnCEgesv - try: - __cusolverDnCEgesv = win32api.GetProcAddress(handle, 'cusolverDnCEgesv') - except: - pass + __cusolverDnCEgesv = GetProcAddress(handle, 'cusolverDnCEgesv') global __cusolverDnCKgesv - try: - __cusolverDnCKgesv = win32api.GetProcAddress(handle, 'cusolverDnCKgesv') - except: - pass + __cusolverDnCKgesv = GetProcAddress(handle, 'cusolverDnCKgesv') global __cusolverDnCYgesv - try: - __cusolverDnCYgesv = win32api.GetProcAddress(handle, 'cusolverDnCYgesv') - except: - pass + __cusolverDnCYgesv = GetProcAddress(handle, 'cusolverDnCYgesv') global __cusolverDnDDgesv - try: - __cusolverDnDDgesv = win32api.GetProcAddress(handle, 'cusolverDnDDgesv') - except: - pass + __cusolverDnDDgesv = GetProcAddress(handle, 'cusolverDnDDgesv') global __cusolverDnDSgesv - try: - __cusolverDnDSgesv = win32api.GetProcAddress(handle, 'cusolverDnDSgesv') - except: - pass + __cusolverDnDSgesv = GetProcAddress(handle, 'cusolverDnDSgesv') global __cusolverDnDHgesv - try: - __cusolverDnDHgesv = win32api.GetProcAddress(handle, 'cusolverDnDHgesv') - except: - pass + __cusolverDnDHgesv = GetProcAddress(handle, 'cusolverDnDHgesv') global __cusolverDnDBgesv - try: - __cusolverDnDBgesv = win32api.GetProcAddress(handle, 'cusolverDnDBgesv') - except: - pass + __cusolverDnDBgesv = GetProcAddress(handle, 'cusolverDnDBgesv') global __cusolverDnDXgesv - try: - __cusolverDnDXgesv = win32api.GetProcAddress(handle, 'cusolverDnDXgesv') - except: - pass + __cusolverDnDXgesv = GetProcAddress(handle, 'cusolverDnDXgesv') global __cusolverDnSSgesv - try: - __cusolverDnSSgesv = win32api.GetProcAddress(handle, 'cusolverDnSSgesv') - except: - pass + __cusolverDnSSgesv = GetProcAddress(handle, 'cusolverDnSSgesv') global __cusolverDnSHgesv - try: - __cusolverDnSHgesv = win32api.GetProcAddress(handle, 'cusolverDnSHgesv') - except: - pass + __cusolverDnSHgesv = GetProcAddress(handle, 'cusolverDnSHgesv') global __cusolverDnSBgesv - try: - __cusolverDnSBgesv = win32api.GetProcAddress(handle, 'cusolverDnSBgesv') - except: - pass + __cusolverDnSBgesv = GetProcAddress(handle, 'cusolverDnSBgesv') global __cusolverDnSXgesv - try: - __cusolverDnSXgesv = win32api.GetProcAddress(handle, 'cusolverDnSXgesv') - except: - pass + __cusolverDnSXgesv = GetProcAddress(handle, 'cusolverDnSXgesv') global __cusolverDnZZgesv_bufferSize - try: - __cusolverDnZZgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZZgesv_bufferSize') - except: - pass + __cusolverDnZZgesv_bufferSize = GetProcAddress(handle, 'cusolverDnZZgesv_bufferSize') global __cusolverDnZCgesv_bufferSize - try: - __cusolverDnZCgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZCgesv_bufferSize') - except: - pass + __cusolverDnZCgesv_bufferSize = GetProcAddress(handle, 'cusolverDnZCgesv_bufferSize') global __cusolverDnZKgesv_bufferSize - try: - __cusolverDnZKgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZKgesv_bufferSize') - except: - pass + __cusolverDnZKgesv_bufferSize = GetProcAddress(handle, 'cusolverDnZKgesv_bufferSize') global 
__cusolverDnZEgesv_bufferSize - try: - __cusolverDnZEgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZEgesv_bufferSize') - except: - pass + __cusolverDnZEgesv_bufferSize = GetProcAddress(handle, 'cusolverDnZEgesv_bufferSize') global __cusolverDnZYgesv_bufferSize - try: - __cusolverDnZYgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZYgesv_bufferSize') - except: - pass + __cusolverDnZYgesv_bufferSize = GetProcAddress(handle, 'cusolverDnZYgesv_bufferSize') global __cusolverDnCCgesv_bufferSize - try: - __cusolverDnCCgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCCgesv_bufferSize') - except: - pass + __cusolverDnCCgesv_bufferSize = GetProcAddress(handle, 'cusolverDnCCgesv_bufferSize') global __cusolverDnCKgesv_bufferSize - try: - __cusolverDnCKgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCKgesv_bufferSize') - except: - pass + __cusolverDnCKgesv_bufferSize = GetProcAddress(handle, 'cusolverDnCKgesv_bufferSize') global __cusolverDnCEgesv_bufferSize - try: - __cusolverDnCEgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCEgesv_bufferSize') - except: - pass + __cusolverDnCEgesv_bufferSize = GetProcAddress(handle, 'cusolverDnCEgesv_bufferSize') global __cusolverDnCYgesv_bufferSize - try: - __cusolverDnCYgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCYgesv_bufferSize') - except: - pass + __cusolverDnCYgesv_bufferSize = GetProcAddress(handle, 'cusolverDnCYgesv_bufferSize') global __cusolverDnDDgesv_bufferSize - try: - __cusolverDnDDgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDDgesv_bufferSize') - except: - pass + __cusolverDnDDgesv_bufferSize = GetProcAddress(handle, 'cusolverDnDDgesv_bufferSize') global __cusolverDnDSgesv_bufferSize - try: - __cusolverDnDSgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDSgesv_bufferSize') - except: - pass + __cusolverDnDSgesv_bufferSize = GetProcAddress(handle, 'cusolverDnDSgesv_bufferSize') global __cusolverDnDHgesv_bufferSize - try: - __cusolverDnDHgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDHgesv_bufferSize') - except: - pass + __cusolverDnDHgesv_bufferSize = GetProcAddress(handle, 'cusolverDnDHgesv_bufferSize') global __cusolverDnDBgesv_bufferSize - try: - __cusolverDnDBgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDBgesv_bufferSize') - except: - pass + __cusolverDnDBgesv_bufferSize = GetProcAddress(handle, 'cusolverDnDBgesv_bufferSize') global __cusolverDnDXgesv_bufferSize - try: - __cusolverDnDXgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDXgesv_bufferSize') - except: - pass + __cusolverDnDXgesv_bufferSize = GetProcAddress(handle, 'cusolverDnDXgesv_bufferSize') global __cusolverDnSSgesv_bufferSize - try: - __cusolverDnSSgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSSgesv_bufferSize') - except: - pass + __cusolverDnSSgesv_bufferSize = GetProcAddress(handle, 'cusolverDnSSgesv_bufferSize') global __cusolverDnSHgesv_bufferSize - try: - __cusolverDnSHgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSHgesv_bufferSize') - except: - pass + __cusolverDnSHgesv_bufferSize = GetProcAddress(handle, 'cusolverDnSHgesv_bufferSize') global __cusolverDnSBgesv_bufferSize - try: - __cusolverDnSBgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSBgesv_bufferSize') - except: - pass + __cusolverDnSBgesv_bufferSize = GetProcAddress(handle, 'cusolverDnSBgesv_bufferSize') global __cusolverDnSXgesv_bufferSize - try: - __cusolverDnSXgesv_bufferSize = win32api.GetProcAddress(handle, 
'cusolverDnSXgesv_bufferSize') - except: - pass + __cusolverDnSXgesv_bufferSize = GetProcAddress(handle, 'cusolverDnSXgesv_bufferSize') global __cusolverDnZZgels - try: - __cusolverDnZZgels = win32api.GetProcAddress(handle, 'cusolverDnZZgels') - except: - pass + __cusolverDnZZgels = GetProcAddress(handle, 'cusolverDnZZgels') global __cusolverDnZCgels - try: - __cusolverDnZCgels = win32api.GetProcAddress(handle, 'cusolverDnZCgels') - except: - pass + __cusolverDnZCgels = GetProcAddress(handle, 'cusolverDnZCgels') global __cusolverDnZKgels - try: - __cusolverDnZKgels = win32api.GetProcAddress(handle, 'cusolverDnZKgels') - except: - pass + __cusolverDnZKgels = GetProcAddress(handle, 'cusolverDnZKgels') global __cusolverDnZEgels - try: - __cusolverDnZEgels = win32api.GetProcAddress(handle, 'cusolverDnZEgels') - except: - pass + __cusolverDnZEgels = GetProcAddress(handle, 'cusolverDnZEgels') global __cusolverDnZYgels - try: - __cusolverDnZYgels = win32api.GetProcAddress(handle, 'cusolverDnZYgels') - except: - pass + __cusolverDnZYgels = GetProcAddress(handle, 'cusolverDnZYgels') global __cusolverDnCCgels - try: - __cusolverDnCCgels = win32api.GetProcAddress(handle, 'cusolverDnCCgels') - except: - pass + __cusolverDnCCgels = GetProcAddress(handle, 'cusolverDnCCgels') global __cusolverDnCKgels - try: - __cusolverDnCKgels = win32api.GetProcAddress(handle, 'cusolverDnCKgels') - except: - pass + __cusolverDnCKgels = GetProcAddress(handle, 'cusolverDnCKgels') global __cusolverDnCEgels - try: - __cusolverDnCEgels = win32api.GetProcAddress(handle, 'cusolverDnCEgels') - except: - pass + __cusolverDnCEgels = GetProcAddress(handle, 'cusolverDnCEgels') global __cusolverDnCYgels - try: - __cusolverDnCYgels = win32api.GetProcAddress(handle, 'cusolverDnCYgels') - except: - pass + __cusolverDnCYgels = GetProcAddress(handle, 'cusolverDnCYgels') global __cusolverDnDDgels - try: - __cusolverDnDDgels = win32api.GetProcAddress(handle, 'cusolverDnDDgels') - except: - pass + __cusolverDnDDgels = GetProcAddress(handle, 'cusolverDnDDgels') global __cusolverDnDSgels - try: - __cusolverDnDSgels = win32api.GetProcAddress(handle, 'cusolverDnDSgels') - except: - pass + __cusolverDnDSgels = GetProcAddress(handle, 'cusolverDnDSgels') global __cusolverDnDHgels - try: - __cusolverDnDHgels = win32api.GetProcAddress(handle, 'cusolverDnDHgels') - except: - pass + __cusolverDnDHgels = GetProcAddress(handle, 'cusolverDnDHgels') global __cusolverDnDBgels - try: - __cusolverDnDBgels = win32api.GetProcAddress(handle, 'cusolverDnDBgels') - except: - pass + __cusolverDnDBgels = GetProcAddress(handle, 'cusolverDnDBgels') global __cusolverDnDXgels - try: - __cusolverDnDXgels = win32api.GetProcAddress(handle, 'cusolverDnDXgels') - except: - pass + __cusolverDnDXgels = GetProcAddress(handle, 'cusolverDnDXgels') global __cusolverDnSSgels - try: - __cusolverDnSSgels = win32api.GetProcAddress(handle, 'cusolverDnSSgels') - except: - pass + __cusolverDnSSgels = GetProcAddress(handle, 'cusolverDnSSgels') global __cusolverDnSHgels - try: - __cusolverDnSHgels = win32api.GetProcAddress(handle, 'cusolverDnSHgels') - except: - pass + __cusolverDnSHgels = GetProcAddress(handle, 'cusolverDnSHgels') global __cusolverDnSBgels - try: - __cusolverDnSBgels = win32api.GetProcAddress(handle, 'cusolverDnSBgels') - except: - pass + __cusolverDnSBgels = GetProcAddress(handle, 'cusolverDnSBgels') global __cusolverDnSXgels - try: - __cusolverDnSXgels = win32api.GetProcAddress(handle, 'cusolverDnSXgels') - except: - pass + __cusolverDnSXgels = GetProcAddress(handle, 
'cusolverDnSXgels') global __cusolverDnZZgels_bufferSize - try: - __cusolverDnZZgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZZgels_bufferSize') - except: - pass + __cusolverDnZZgels_bufferSize = GetProcAddress(handle, 'cusolverDnZZgels_bufferSize') global __cusolverDnZCgels_bufferSize - try: - __cusolverDnZCgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZCgels_bufferSize') - except: - pass + __cusolverDnZCgels_bufferSize = GetProcAddress(handle, 'cusolverDnZCgels_bufferSize') global __cusolverDnZKgels_bufferSize - try: - __cusolverDnZKgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZKgels_bufferSize') - except: - pass + __cusolverDnZKgels_bufferSize = GetProcAddress(handle, 'cusolverDnZKgels_bufferSize') global __cusolverDnZEgels_bufferSize - try: - __cusolverDnZEgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZEgels_bufferSize') - except: - pass + __cusolverDnZEgels_bufferSize = GetProcAddress(handle, 'cusolverDnZEgels_bufferSize') global __cusolverDnZYgels_bufferSize - try: - __cusolverDnZYgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZYgels_bufferSize') - except: - pass + __cusolverDnZYgels_bufferSize = GetProcAddress(handle, 'cusolverDnZYgels_bufferSize') global __cusolverDnCCgels_bufferSize - try: - __cusolverDnCCgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCCgels_bufferSize') - except: - pass + __cusolverDnCCgels_bufferSize = GetProcAddress(handle, 'cusolverDnCCgels_bufferSize') global __cusolverDnCKgels_bufferSize - try: - __cusolverDnCKgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCKgels_bufferSize') - except: - pass + __cusolverDnCKgels_bufferSize = GetProcAddress(handle, 'cusolverDnCKgels_bufferSize') global __cusolverDnCEgels_bufferSize - try: - __cusolverDnCEgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCEgels_bufferSize') - except: - pass + __cusolverDnCEgels_bufferSize = GetProcAddress(handle, 'cusolverDnCEgels_bufferSize') global __cusolverDnCYgels_bufferSize - try: - __cusolverDnCYgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCYgels_bufferSize') - except: - pass + __cusolverDnCYgels_bufferSize = GetProcAddress(handle, 'cusolverDnCYgels_bufferSize') global __cusolverDnDDgels_bufferSize - try: - __cusolverDnDDgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDDgels_bufferSize') - except: - pass + __cusolverDnDDgels_bufferSize = GetProcAddress(handle, 'cusolverDnDDgels_bufferSize') global __cusolverDnDSgels_bufferSize - try: - __cusolverDnDSgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDSgels_bufferSize') - except: - pass + __cusolverDnDSgels_bufferSize = GetProcAddress(handle, 'cusolverDnDSgels_bufferSize') global __cusolverDnDHgels_bufferSize - try: - __cusolverDnDHgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDHgels_bufferSize') - except: - pass + __cusolverDnDHgels_bufferSize = GetProcAddress(handle, 'cusolverDnDHgels_bufferSize') global __cusolverDnDBgels_bufferSize - try: - __cusolverDnDBgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDBgels_bufferSize') - except: - pass + __cusolverDnDBgels_bufferSize = GetProcAddress(handle, 'cusolverDnDBgels_bufferSize') global __cusolverDnDXgels_bufferSize - try: - __cusolverDnDXgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDXgels_bufferSize') - except: - pass + __cusolverDnDXgels_bufferSize = GetProcAddress(handle, 'cusolverDnDXgels_bufferSize') global __cusolverDnSSgels_bufferSize - try: - __cusolverDnSSgels_bufferSize = 
win32api.GetProcAddress(handle, 'cusolverDnSSgels_bufferSize') - except: - pass + __cusolverDnSSgels_bufferSize = GetProcAddress(handle, 'cusolverDnSSgels_bufferSize') global __cusolverDnSHgels_bufferSize - try: - __cusolverDnSHgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSHgels_bufferSize') - except: - pass + __cusolverDnSHgels_bufferSize = GetProcAddress(handle, 'cusolverDnSHgels_bufferSize') global __cusolverDnSBgels_bufferSize - try: - __cusolverDnSBgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSBgels_bufferSize') - except: - pass + __cusolverDnSBgels_bufferSize = GetProcAddress(handle, 'cusolverDnSBgels_bufferSize') global __cusolverDnSXgels_bufferSize - try: - __cusolverDnSXgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSXgels_bufferSize') - except: - pass + __cusolverDnSXgels_bufferSize = GetProcAddress(handle, 'cusolverDnSXgels_bufferSize') global __cusolverDnIRSXgesv - try: - __cusolverDnIRSXgesv = win32api.GetProcAddress(handle, 'cusolverDnIRSXgesv') - except: - pass + __cusolverDnIRSXgesv = GetProcAddress(handle, 'cusolverDnIRSXgesv') global __cusolverDnIRSXgesv_bufferSize - try: - __cusolverDnIRSXgesv_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnIRSXgesv_bufferSize') - except: - pass + __cusolverDnIRSXgesv_bufferSize = GetProcAddress(handle, 'cusolverDnIRSXgesv_bufferSize') global __cusolverDnIRSXgels - try: - __cusolverDnIRSXgels = win32api.GetProcAddress(handle, 'cusolverDnIRSXgels') - except: - pass + __cusolverDnIRSXgels = GetProcAddress(handle, 'cusolverDnIRSXgels') global __cusolverDnIRSXgels_bufferSize - try: - __cusolverDnIRSXgels_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnIRSXgels_bufferSize') - except: - pass + __cusolverDnIRSXgels_bufferSize = GetProcAddress(handle, 'cusolverDnIRSXgels_bufferSize') global __cusolverDnSpotrf_bufferSize - try: - __cusolverDnSpotrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSpotrf_bufferSize') - except: - pass + __cusolverDnSpotrf_bufferSize = GetProcAddress(handle, 'cusolverDnSpotrf_bufferSize') global __cusolverDnDpotrf_bufferSize - try: - __cusolverDnDpotrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDpotrf_bufferSize') - except: - pass + __cusolverDnDpotrf_bufferSize = GetProcAddress(handle, 'cusolverDnDpotrf_bufferSize') global __cusolverDnCpotrf_bufferSize - try: - __cusolverDnCpotrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCpotrf_bufferSize') - except: - pass + __cusolverDnCpotrf_bufferSize = GetProcAddress(handle, 'cusolverDnCpotrf_bufferSize') global __cusolverDnZpotrf_bufferSize - try: - __cusolverDnZpotrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZpotrf_bufferSize') - except: - pass + __cusolverDnZpotrf_bufferSize = GetProcAddress(handle, 'cusolverDnZpotrf_bufferSize') global __cusolverDnSpotrf - try: - __cusolverDnSpotrf = win32api.GetProcAddress(handle, 'cusolverDnSpotrf') - except: - pass + __cusolverDnSpotrf = GetProcAddress(handle, 'cusolverDnSpotrf') global __cusolverDnDpotrf - try: - __cusolverDnDpotrf = win32api.GetProcAddress(handle, 'cusolverDnDpotrf') - except: - pass + __cusolverDnDpotrf = GetProcAddress(handle, 'cusolverDnDpotrf') global __cusolverDnCpotrf - try: - __cusolverDnCpotrf = win32api.GetProcAddress(handle, 'cusolverDnCpotrf') - except: - pass + __cusolverDnCpotrf = GetProcAddress(handle, 'cusolverDnCpotrf') global __cusolverDnZpotrf - try: - __cusolverDnZpotrf = win32api.GetProcAddress(handle, 'cusolverDnZpotrf') - except: - pass + __cusolverDnZpotrf = GetProcAddress(handle, 
'cusolverDnZpotrf') global __cusolverDnSpotrs - try: - __cusolverDnSpotrs = win32api.GetProcAddress(handle, 'cusolverDnSpotrs') - except: - pass + __cusolverDnSpotrs = GetProcAddress(handle, 'cusolverDnSpotrs') global __cusolverDnDpotrs - try: - __cusolverDnDpotrs = win32api.GetProcAddress(handle, 'cusolverDnDpotrs') - except: - pass + __cusolverDnDpotrs = GetProcAddress(handle, 'cusolverDnDpotrs') global __cusolverDnCpotrs - try: - __cusolverDnCpotrs = win32api.GetProcAddress(handle, 'cusolverDnCpotrs') - except: - pass + __cusolverDnCpotrs = GetProcAddress(handle, 'cusolverDnCpotrs') global __cusolverDnZpotrs - try: - __cusolverDnZpotrs = win32api.GetProcAddress(handle, 'cusolverDnZpotrs') - except: - pass + __cusolverDnZpotrs = GetProcAddress(handle, 'cusolverDnZpotrs') global __cusolverDnSpotrfBatched - try: - __cusolverDnSpotrfBatched = win32api.GetProcAddress(handle, 'cusolverDnSpotrfBatched') - except: - pass + __cusolverDnSpotrfBatched = GetProcAddress(handle, 'cusolverDnSpotrfBatched') global __cusolverDnDpotrfBatched - try: - __cusolverDnDpotrfBatched = win32api.GetProcAddress(handle, 'cusolverDnDpotrfBatched') - except: - pass + __cusolverDnDpotrfBatched = GetProcAddress(handle, 'cusolverDnDpotrfBatched') global __cusolverDnCpotrfBatched - try: - __cusolverDnCpotrfBatched = win32api.GetProcAddress(handle, 'cusolverDnCpotrfBatched') - except: - pass + __cusolverDnCpotrfBatched = GetProcAddress(handle, 'cusolverDnCpotrfBatched') global __cusolverDnZpotrfBatched - try: - __cusolverDnZpotrfBatched = win32api.GetProcAddress(handle, 'cusolverDnZpotrfBatched') - except: - pass + __cusolverDnZpotrfBatched = GetProcAddress(handle, 'cusolverDnZpotrfBatched') global __cusolverDnSpotrsBatched - try: - __cusolverDnSpotrsBatched = win32api.GetProcAddress(handle, 'cusolverDnSpotrsBatched') - except: - pass + __cusolverDnSpotrsBatched = GetProcAddress(handle, 'cusolverDnSpotrsBatched') global __cusolverDnDpotrsBatched - try: - __cusolverDnDpotrsBatched = win32api.GetProcAddress(handle, 'cusolverDnDpotrsBatched') - except: - pass + __cusolverDnDpotrsBatched = GetProcAddress(handle, 'cusolverDnDpotrsBatched') global __cusolverDnCpotrsBatched - try: - __cusolverDnCpotrsBatched = win32api.GetProcAddress(handle, 'cusolverDnCpotrsBatched') - except: - pass + __cusolverDnCpotrsBatched = GetProcAddress(handle, 'cusolverDnCpotrsBatched') global __cusolverDnZpotrsBatched - try: - __cusolverDnZpotrsBatched = win32api.GetProcAddress(handle, 'cusolverDnZpotrsBatched') - except: - pass + __cusolverDnZpotrsBatched = GetProcAddress(handle, 'cusolverDnZpotrsBatched') global __cusolverDnSpotri_bufferSize - try: - __cusolverDnSpotri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSpotri_bufferSize') - except: - pass + __cusolverDnSpotri_bufferSize = GetProcAddress(handle, 'cusolverDnSpotri_bufferSize') global __cusolverDnDpotri_bufferSize - try: - __cusolverDnDpotri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDpotri_bufferSize') - except: - pass + __cusolverDnDpotri_bufferSize = GetProcAddress(handle, 'cusolverDnDpotri_bufferSize') global __cusolverDnCpotri_bufferSize - try: - __cusolverDnCpotri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCpotri_bufferSize') - except: - pass + __cusolverDnCpotri_bufferSize = GetProcAddress(handle, 'cusolverDnCpotri_bufferSize') global __cusolverDnZpotri_bufferSize - try: - __cusolverDnZpotri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZpotri_bufferSize') - except: - pass + __cusolverDnZpotri_bufferSize = GetProcAddress(handle, 
'cusolverDnZpotri_bufferSize') global __cusolverDnSpotri - try: - __cusolverDnSpotri = win32api.GetProcAddress(handle, 'cusolverDnSpotri') - except: - pass + __cusolverDnSpotri = GetProcAddress(handle, 'cusolverDnSpotri') global __cusolverDnDpotri - try: - __cusolverDnDpotri = win32api.GetProcAddress(handle, 'cusolverDnDpotri') - except: - pass + __cusolverDnDpotri = GetProcAddress(handle, 'cusolverDnDpotri') global __cusolverDnCpotri - try: - __cusolverDnCpotri = win32api.GetProcAddress(handle, 'cusolverDnCpotri') - except: - pass + __cusolverDnCpotri = GetProcAddress(handle, 'cusolverDnCpotri') global __cusolverDnZpotri - try: - __cusolverDnZpotri = win32api.GetProcAddress(handle, 'cusolverDnZpotri') - except: - pass + __cusolverDnZpotri = GetProcAddress(handle, 'cusolverDnZpotri') global __cusolverDnSlauum_bufferSize - try: - __cusolverDnSlauum_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSlauum_bufferSize') - except: - pass + __cusolverDnSlauum_bufferSize = GetProcAddress(handle, 'cusolverDnSlauum_bufferSize') global __cusolverDnDlauum_bufferSize - try: - __cusolverDnDlauum_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDlauum_bufferSize') - except: - pass + __cusolverDnDlauum_bufferSize = GetProcAddress(handle, 'cusolverDnDlauum_bufferSize') global __cusolverDnClauum_bufferSize - try: - __cusolverDnClauum_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnClauum_bufferSize') - except: - pass + __cusolverDnClauum_bufferSize = GetProcAddress(handle, 'cusolverDnClauum_bufferSize') global __cusolverDnZlauum_bufferSize - try: - __cusolverDnZlauum_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZlauum_bufferSize') - except: - pass + __cusolverDnZlauum_bufferSize = GetProcAddress(handle, 'cusolverDnZlauum_bufferSize') global __cusolverDnSlauum - try: - __cusolverDnSlauum = win32api.GetProcAddress(handle, 'cusolverDnSlauum') - except: - pass + __cusolverDnSlauum = GetProcAddress(handle, 'cusolverDnSlauum') global __cusolverDnDlauum - try: - __cusolverDnDlauum = win32api.GetProcAddress(handle, 'cusolverDnDlauum') - except: - pass + __cusolverDnDlauum = GetProcAddress(handle, 'cusolverDnDlauum') global __cusolverDnClauum - try: - __cusolverDnClauum = win32api.GetProcAddress(handle, 'cusolverDnClauum') - except: - pass + __cusolverDnClauum = GetProcAddress(handle, 'cusolverDnClauum') global __cusolverDnZlauum - try: - __cusolverDnZlauum = win32api.GetProcAddress(handle, 'cusolverDnZlauum') - except: - pass + __cusolverDnZlauum = GetProcAddress(handle, 'cusolverDnZlauum') global __cusolverDnSgetrf_bufferSize - try: - __cusolverDnSgetrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSgetrf_bufferSize') - except: - pass + __cusolverDnSgetrf_bufferSize = GetProcAddress(handle, 'cusolverDnSgetrf_bufferSize') global __cusolverDnDgetrf_bufferSize - try: - __cusolverDnDgetrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDgetrf_bufferSize') - except: - pass + __cusolverDnDgetrf_bufferSize = GetProcAddress(handle, 'cusolverDnDgetrf_bufferSize') global __cusolverDnCgetrf_bufferSize - try: - __cusolverDnCgetrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCgetrf_bufferSize') - except: - pass + __cusolverDnCgetrf_bufferSize = GetProcAddress(handle, 'cusolverDnCgetrf_bufferSize') global __cusolverDnZgetrf_bufferSize - try: - __cusolverDnZgetrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZgetrf_bufferSize') - except: - pass + __cusolverDnZgetrf_bufferSize = GetProcAddress(handle, 'cusolverDnZgetrf_bufferSize') global __cusolverDnSgetrf - 
try: - __cusolverDnSgetrf = win32api.GetProcAddress(handle, 'cusolverDnSgetrf') - except: - pass + __cusolverDnSgetrf = GetProcAddress(handle, 'cusolverDnSgetrf') global __cusolverDnDgetrf - try: - __cusolverDnDgetrf = win32api.GetProcAddress(handle, 'cusolverDnDgetrf') - except: - pass + __cusolverDnDgetrf = GetProcAddress(handle, 'cusolverDnDgetrf') global __cusolverDnCgetrf - try: - __cusolverDnCgetrf = win32api.GetProcAddress(handle, 'cusolverDnCgetrf') - except: - pass + __cusolverDnCgetrf = GetProcAddress(handle, 'cusolverDnCgetrf') global __cusolverDnZgetrf - try: - __cusolverDnZgetrf = win32api.GetProcAddress(handle, 'cusolverDnZgetrf') - except: - pass + __cusolverDnZgetrf = GetProcAddress(handle, 'cusolverDnZgetrf') global __cusolverDnSlaswp - try: - __cusolverDnSlaswp = win32api.GetProcAddress(handle, 'cusolverDnSlaswp') - except: - pass + __cusolverDnSlaswp = GetProcAddress(handle, 'cusolverDnSlaswp') global __cusolverDnDlaswp - try: - __cusolverDnDlaswp = win32api.GetProcAddress(handle, 'cusolverDnDlaswp') - except: - pass + __cusolverDnDlaswp = GetProcAddress(handle, 'cusolverDnDlaswp') global __cusolverDnClaswp - try: - __cusolverDnClaswp = win32api.GetProcAddress(handle, 'cusolverDnClaswp') - except: - pass + __cusolverDnClaswp = GetProcAddress(handle, 'cusolverDnClaswp') global __cusolverDnZlaswp - try: - __cusolverDnZlaswp = win32api.GetProcAddress(handle, 'cusolverDnZlaswp') - except: - pass + __cusolverDnZlaswp = GetProcAddress(handle, 'cusolverDnZlaswp') global __cusolverDnSgetrs - try: - __cusolverDnSgetrs = win32api.GetProcAddress(handle, 'cusolverDnSgetrs') - except: - pass + __cusolverDnSgetrs = GetProcAddress(handle, 'cusolverDnSgetrs') global __cusolverDnDgetrs - try: - __cusolverDnDgetrs = win32api.GetProcAddress(handle, 'cusolverDnDgetrs') - except: - pass + __cusolverDnDgetrs = GetProcAddress(handle, 'cusolverDnDgetrs') global __cusolverDnCgetrs - try: - __cusolverDnCgetrs = win32api.GetProcAddress(handle, 'cusolverDnCgetrs') - except: - pass + __cusolverDnCgetrs = GetProcAddress(handle, 'cusolverDnCgetrs') global __cusolverDnZgetrs - try: - __cusolverDnZgetrs = win32api.GetProcAddress(handle, 'cusolverDnZgetrs') - except: - pass + __cusolverDnZgetrs = GetProcAddress(handle, 'cusolverDnZgetrs') global __cusolverDnSgeqrf_bufferSize - try: - __cusolverDnSgeqrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSgeqrf_bufferSize') - except: - pass + __cusolverDnSgeqrf_bufferSize = GetProcAddress(handle, 'cusolverDnSgeqrf_bufferSize') global __cusolverDnDgeqrf_bufferSize - try: - __cusolverDnDgeqrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDgeqrf_bufferSize') - except: - pass + __cusolverDnDgeqrf_bufferSize = GetProcAddress(handle, 'cusolverDnDgeqrf_bufferSize') global __cusolverDnCgeqrf_bufferSize - try: - __cusolverDnCgeqrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCgeqrf_bufferSize') - except: - pass + __cusolverDnCgeqrf_bufferSize = GetProcAddress(handle, 'cusolverDnCgeqrf_bufferSize') global __cusolverDnZgeqrf_bufferSize - try: - __cusolverDnZgeqrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZgeqrf_bufferSize') - except: - pass + __cusolverDnZgeqrf_bufferSize = GetProcAddress(handle, 'cusolverDnZgeqrf_bufferSize') global __cusolverDnSgeqrf - try: - __cusolverDnSgeqrf = win32api.GetProcAddress(handle, 'cusolverDnSgeqrf') - except: - pass + __cusolverDnSgeqrf = GetProcAddress(handle, 'cusolverDnSgeqrf') global __cusolverDnDgeqrf - try: - __cusolverDnDgeqrf = win32api.GetProcAddress(handle, 'cusolverDnDgeqrf') - 
except: - pass + __cusolverDnDgeqrf = GetProcAddress(handle, 'cusolverDnDgeqrf') global __cusolverDnCgeqrf - try: - __cusolverDnCgeqrf = win32api.GetProcAddress(handle, 'cusolverDnCgeqrf') - except: - pass + __cusolverDnCgeqrf = GetProcAddress(handle, 'cusolverDnCgeqrf') global __cusolverDnZgeqrf - try: - __cusolverDnZgeqrf = win32api.GetProcAddress(handle, 'cusolverDnZgeqrf') - except: - pass + __cusolverDnZgeqrf = GetProcAddress(handle, 'cusolverDnZgeqrf') global __cusolverDnSorgqr_bufferSize - try: - __cusolverDnSorgqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSorgqr_bufferSize') - except: - pass + __cusolverDnSorgqr_bufferSize = GetProcAddress(handle, 'cusolverDnSorgqr_bufferSize') global __cusolverDnDorgqr_bufferSize - try: - __cusolverDnDorgqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDorgqr_bufferSize') - except: - pass + __cusolverDnDorgqr_bufferSize = GetProcAddress(handle, 'cusolverDnDorgqr_bufferSize') global __cusolverDnCungqr_bufferSize - try: - __cusolverDnCungqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCungqr_bufferSize') - except: - pass + __cusolverDnCungqr_bufferSize = GetProcAddress(handle, 'cusolverDnCungqr_bufferSize') global __cusolverDnZungqr_bufferSize - try: - __cusolverDnZungqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZungqr_bufferSize') - except: - pass + __cusolverDnZungqr_bufferSize = GetProcAddress(handle, 'cusolverDnZungqr_bufferSize') global __cusolverDnSorgqr - try: - __cusolverDnSorgqr = win32api.GetProcAddress(handle, 'cusolverDnSorgqr') - except: - pass + __cusolverDnSorgqr = GetProcAddress(handle, 'cusolverDnSorgqr') global __cusolverDnDorgqr - try: - __cusolverDnDorgqr = win32api.GetProcAddress(handle, 'cusolverDnDorgqr') - except: - pass + __cusolverDnDorgqr = GetProcAddress(handle, 'cusolverDnDorgqr') global __cusolverDnCungqr - try: - __cusolverDnCungqr = win32api.GetProcAddress(handle, 'cusolverDnCungqr') - except: - pass + __cusolverDnCungqr = GetProcAddress(handle, 'cusolverDnCungqr') global __cusolverDnZungqr - try: - __cusolverDnZungqr = win32api.GetProcAddress(handle, 'cusolverDnZungqr') - except: - pass + __cusolverDnZungqr = GetProcAddress(handle, 'cusolverDnZungqr') global __cusolverDnSormqr_bufferSize - try: - __cusolverDnSormqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSormqr_bufferSize') - except: - pass + __cusolverDnSormqr_bufferSize = GetProcAddress(handle, 'cusolverDnSormqr_bufferSize') global __cusolverDnDormqr_bufferSize - try: - __cusolverDnDormqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDormqr_bufferSize') - except: - pass + __cusolverDnDormqr_bufferSize = GetProcAddress(handle, 'cusolverDnDormqr_bufferSize') global __cusolverDnCunmqr_bufferSize - try: - __cusolverDnCunmqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCunmqr_bufferSize') - except: - pass + __cusolverDnCunmqr_bufferSize = GetProcAddress(handle, 'cusolverDnCunmqr_bufferSize') global __cusolverDnZunmqr_bufferSize - try: - __cusolverDnZunmqr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZunmqr_bufferSize') - except: - pass + __cusolverDnZunmqr_bufferSize = GetProcAddress(handle, 'cusolverDnZunmqr_bufferSize') global __cusolverDnSormqr - try: - __cusolverDnSormqr = win32api.GetProcAddress(handle, 'cusolverDnSormqr') - except: - pass + __cusolverDnSormqr = GetProcAddress(handle, 'cusolverDnSormqr') global __cusolverDnDormqr - try: - __cusolverDnDormqr = win32api.GetProcAddress(handle, 'cusolverDnDormqr') - except: - pass + __cusolverDnDormqr = 
GetProcAddress(handle, 'cusolverDnDormqr') global __cusolverDnCunmqr - try: - __cusolverDnCunmqr = win32api.GetProcAddress(handle, 'cusolverDnCunmqr') - except: - pass + __cusolverDnCunmqr = GetProcAddress(handle, 'cusolverDnCunmqr') global __cusolverDnZunmqr - try: - __cusolverDnZunmqr = win32api.GetProcAddress(handle, 'cusolverDnZunmqr') - except: - pass + __cusolverDnZunmqr = GetProcAddress(handle, 'cusolverDnZunmqr') global __cusolverDnSsytrf_bufferSize - try: - __cusolverDnSsytrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsytrf_bufferSize') - except: - pass + __cusolverDnSsytrf_bufferSize = GetProcAddress(handle, 'cusolverDnSsytrf_bufferSize') global __cusolverDnDsytrf_bufferSize - try: - __cusolverDnDsytrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsytrf_bufferSize') - except: - pass + __cusolverDnDsytrf_bufferSize = GetProcAddress(handle, 'cusolverDnDsytrf_bufferSize') global __cusolverDnCsytrf_bufferSize - try: - __cusolverDnCsytrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCsytrf_bufferSize') - except: - pass + __cusolverDnCsytrf_bufferSize = GetProcAddress(handle, 'cusolverDnCsytrf_bufferSize') global __cusolverDnZsytrf_bufferSize - try: - __cusolverDnZsytrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZsytrf_bufferSize') - except: - pass + __cusolverDnZsytrf_bufferSize = GetProcAddress(handle, 'cusolverDnZsytrf_bufferSize') global __cusolverDnSsytrf - try: - __cusolverDnSsytrf = win32api.GetProcAddress(handle, 'cusolverDnSsytrf') - except: - pass + __cusolverDnSsytrf = GetProcAddress(handle, 'cusolverDnSsytrf') global __cusolverDnDsytrf - try: - __cusolverDnDsytrf = win32api.GetProcAddress(handle, 'cusolverDnDsytrf') - except: - pass + __cusolverDnDsytrf = GetProcAddress(handle, 'cusolverDnDsytrf') global __cusolverDnCsytrf - try: - __cusolverDnCsytrf = win32api.GetProcAddress(handle, 'cusolverDnCsytrf') - except: - pass + __cusolverDnCsytrf = GetProcAddress(handle, 'cusolverDnCsytrf') global __cusolverDnZsytrf - try: - __cusolverDnZsytrf = win32api.GetProcAddress(handle, 'cusolverDnZsytrf') - except: - pass + __cusolverDnZsytrf = GetProcAddress(handle, 'cusolverDnZsytrf') global __cusolverDnSsytri_bufferSize - try: - __cusolverDnSsytri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsytri_bufferSize') - except: - pass + __cusolverDnSsytri_bufferSize = GetProcAddress(handle, 'cusolverDnSsytri_bufferSize') global __cusolverDnDsytri_bufferSize - try: - __cusolverDnDsytri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsytri_bufferSize') - except: - pass + __cusolverDnDsytri_bufferSize = GetProcAddress(handle, 'cusolverDnDsytri_bufferSize') global __cusolverDnCsytri_bufferSize - try: - __cusolverDnCsytri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCsytri_bufferSize') - except: - pass + __cusolverDnCsytri_bufferSize = GetProcAddress(handle, 'cusolverDnCsytri_bufferSize') global __cusolverDnZsytri_bufferSize - try: - __cusolverDnZsytri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZsytri_bufferSize') - except: - pass + __cusolverDnZsytri_bufferSize = GetProcAddress(handle, 'cusolverDnZsytri_bufferSize') global __cusolverDnSsytri - try: - __cusolverDnSsytri = win32api.GetProcAddress(handle, 'cusolverDnSsytri') - except: - pass + __cusolverDnSsytri = GetProcAddress(handle, 'cusolverDnSsytri') global __cusolverDnDsytri - try: - __cusolverDnDsytri = win32api.GetProcAddress(handle, 'cusolverDnDsytri') - except: - pass + __cusolverDnDsytri = GetProcAddress(handle, 'cusolverDnDsytri') global 
__cusolverDnCsytri - try: - __cusolverDnCsytri = win32api.GetProcAddress(handle, 'cusolverDnCsytri') - except: - pass + __cusolverDnCsytri = GetProcAddress(handle, 'cusolverDnCsytri') global __cusolverDnZsytri - try: - __cusolverDnZsytri = win32api.GetProcAddress(handle, 'cusolverDnZsytri') - except: - pass + __cusolverDnZsytri = GetProcAddress(handle, 'cusolverDnZsytri') global __cusolverDnSgebrd_bufferSize - try: - __cusolverDnSgebrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSgebrd_bufferSize') - except: - pass + __cusolverDnSgebrd_bufferSize = GetProcAddress(handle, 'cusolverDnSgebrd_bufferSize') global __cusolverDnDgebrd_bufferSize - try: - __cusolverDnDgebrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDgebrd_bufferSize') - except: - pass + __cusolverDnDgebrd_bufferSize = GetProcAddress(handle, 'cusolverDnDgebrd_bufferSize') global __cusolverDnCgebrd_bufferSize - try: - __cusolverDnCgebrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCgebrd_bufferSize') - except: - pass + __cusolverDnCgebrd_bufferSize = GetProcAddress(handle, 'cusolverDnCgebrd_bufferSize') global __cusolverDnZgebrd_bufferSize - try: - __cusolverDnZgebrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZgebrd_bufferSize') - except: - pass + __cusolverDnZgebrd_bufferSize = GetProcAddress(handle, 'cusolverDnZgebrd_bufferSize') global __cusolverDnSgebrd - try: - __cusolverDnSgebrd = win32api.GetProcAddress(handle, 'cusolverDnSgebrd') - except: - pass + __cusolverDnSgebrd = GetProcAddress(handle, 'cusolverDnSgebrd') global __cusolverDnDgebrd - try: - __cusolverDnDgebrd = win32api.GetProcAddress(handle, 'cusolverDnDgebrd') - except: - pass + __cusolverDnDgebrd = GetProcAddress(handle, 'cusolverDnDgebrd') global __cusolverDnCgebrd - try: - __cusolverDnCgebrd = win32api.GetProcAddress(handle, 'cusolverDnCgebrd') - except: - pass + __cusolverDnCgebrd = GetProcAddress(handle, 'cusolverDnCgebrd') global __cusolverDnZgebrd - try: - __cusolverDnZgebrd = win32api.GetProcAddress(handle, 'cusolverDnZgebrd') - except: - pass + __cusolverDnZgebrd = GetProcAddress(handle, 'cusolverDnZgebrd') global __cusolverDnSorgbr_bufferSize - try: - __cusolverDnSorgbr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSorgbr_bufferSize') - except: - pass + __cusolverDnSorgbr_bufferSize = GetProcAddress(handle, 'cusolverDnSorgbr_bufferSize') global __cusolverDnDorgbr_bufferSize - try: - __cusolverDnDorgbr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDorgbr_bufferSize') - except: - pass + __cusolverDnDorgbr_bufferSize = GetProcAddress(handle, 'cusolverDnDorgbr_bufferSize') global __cusolverDnCungbr_bufferSize - try: - __cusolverDnCungbr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCungbr_bufferSize') - except: - pass + __cusolverDnCungbr_bufferSize = GetProcAddress(handle, 'cusolverDnCungbr_bufferSize') global __cusolverDnZungbr_bufferSize - try: - __cusolverDnZungbr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZungbr_bufferSize') - except: - pass + __cusolverDnZungbr_bufferSize = GetProcAddress(handle, 'cusolverDnZungbr_bufferSize') global __cusolverDnSorgbr - try: - __cusolverDnSorgbr = win32api.GetProcAddress(handle, 'cusolverDnSorgbr') - except: - pass + __cusolverDnSorgbr = GetProcAddress(handle, 'cusolverDnSorgbr') global __cusolverDnDorgbr - try: - __cusolverDnDorgbr = win32api.GetProcAddress(handle, 'cusolverDnDorgbr') - except: - pass + __cusolverDnDorgbr = GetProcAddress(handle, 'cusolverDnDorgbr') global __cusolverDnCungbr - try: - __cusolverDnCungbr = 
win32api.GetProcAddress(handle, 'cusolverDnCungbr') - except: - pass + __cusolverDnCungbr = GetProcAddress(handle, 'cusolverDnCungbr') global __cusolverDnZungbr - try: - __cusolverDnZungbr = win32api.GetProcAddress(handle, 'cusolverDnZungbr') - except: - pass + __cusolverDnZungbr = GetProcAddress(handle, 'cusolverDnZungbr') global __cusolverDnSsytrd_bufferSize - try: - __cusolverDnSsytrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsytrd_bufferSize') - except: - pass + __cusolverDnSsytrd_bufferSize = GetProcAddress(handle, 'cusolverDnSsytrd_bufferSize') global __cusolverDnDsytrd_bufferSize - try: - __cusolverDnDsytrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsytrd_bufferSize') - except: - pass + __cusolverDnDsytrd_bufferSize = GetProcAddress(handle, 'cusolverDnDsytrd_bufferSize') global __cusolverDnChetrd_bufferSize - try: - __cusolverDnChetrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnChetrd_bufferSize') - except: - pass + __cusolverDnChetrd_bufferSize = GetProcAddress(handle, 'cusolverDnChetrd_bufferSize') global __cusolverDnZhetrd_bufferSize - try: - __cusolverDnZhetrd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZhetrd_bufferSize') - except: - pass + __cusolverDnZhetrd_bufferSize = GetProcAddress(handle, 'cusolverDnZhetrd_bufferSize') global __cusolverDnSsytrd - try: - __cusolverDnSsytrd = win32api.GetProcAddress(handle, 'cusolverDnSsytrd') - except: - pass + __cusolverDnSsytrd = GetProcAddress(handle, 'cusolverDnSsytrd') global __cusolverDnDsytrd - try: - __cusolverDnDsytrd = win32api.GetProcAddress(handle, 'cusolverDnDsytrd') - except: - pass + __cusolverDnDsytrd = GetProcAddress(handle, 'cusolverDnDsytrd') global __cusolverDnChetrd - try: - __cusolverDnChetrd = win32api.GetProcAddress(handle, 'cusolverDnChetrd') - except: - pass + __cusolverDnChetrd = GetProcAddress(handle, 'cusolverDnChetrd') global __cusolverDnZhetrd - try: - __cusolverDnZhetrd = win32api.GetProcAddress(handle, 'cusolverDnZhetrd') - except: - pass + __cusolverDnZhetrd = GetProcAddress(handle, 'cusolverDnZhetrd') global __cusolverDnSorgtr_bufferSize - try: - __cusolverDnSorgtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSorgtr_bufferSize') - except: - pass + __cusolverDnSorgtr_bufferSize = GetProcAddress(handle, 'cusolverDnSorgtr_bufferSize') global __cusolverDnDorgtr_bufferSize - try: - __cusolverDnDorgtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDorgtr_bufferSize') - except: - pass + __cusolverDnDorgtr_bufferSize = GetProcAddress(handle, 'cusolverDnDorgtr_bufferSize') global __cusolverDnCungtr_bufferSize - try: - __cusolverDnCungtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCungtr_bufferSize') - except: - pass + __cusolverDnCungtr_bufferSize = GetProcAddress(handle, 'cusolverDnCungtr_bufferSize') global __cusolverDnZungtr_bufferSize - try: - __cusolverDnZungtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZungtr_bufferSize') - except: - pass + __cusolverDnZungtr_bufferSize = GetProcAddress(handle, 'cusolverDnZungtr_bufferSize') global __cusolverDnSorgtr - try: - __cusolverDnSorgtr = win32api.GetProcAddress(handle, 'cusolverDnSorgtr') - except: - pass + __cusolverDnSorgtr = GetProcAddress(handle, 'cusolverDnSorgtr') global __cusolverDnDorgtr - try: - __cusolverDnDorgtr = win32api.GetProcAddress(handle, 'cusolverDnDorgtr') - except: - pass + __cusolverDnDorgtr = GetProcAddress(handle, 'cusolverDnDorgtr') global __cusolverDnCungtr - try: - __cusolverDnCungtr = win32api.GetProcAddress(handle, 'cusolverDnCungtr') - 
except: - pass + __cusolverDnCungtr = GetProcAddress(handle, 'cusolverDnCungtr') global __cusolverDnZungtr - try: - __cusolverDnZungtr = win32api.GetProcAddress(handle, 'cusolverDnZungtr') - except: - pass + __cusolverDnZungtr = GetProcAddress(handle, 'cusolverDnZungtr') global __cusolverDnSormtr_bufferSize - try: - __cusolverDnSormtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSormtr_bufferSize') - except: - pass + __cusolverDnSormtr_bufferSize = GetProcAddress(handle, 'cusolverDnSormtr_bufferSize') global __cusolverDnDormtr_bufferSize - try: - __cusolverDnDormtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDormtr_bufferSize') - except: - pass + __cusolverDnDormtr_bufferSize = GetProcAddress(handle, 'cusolverDnDormtr_bufferSize') global __cusolverDnCunmtr_bufferSize - try: - __cusolverDnCunmtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCunmtr_bufferSize') - except: - pass + __cusolverDnCunmtr_bufferSize = GetProcAddress(handle, 'cusolverDnCunmtr_bufferSize') global __cusolverDnZunmtr_bufferSize - try: - __cusolverDnZunmtr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZunmtr_bufferSize') - except: - pass + __cusolverDnZunmtr_bufferSize = GetProcAddress(handle, 'cusolverDnZunmtr_bufferSize') global __cusolverDnSormtr - try: - __cusolverDnSormtr = win32api.GetProcAddress(handle, 'cusolverDnSormtr') - except: - pass + __cusolverDnSormtr = GetProcAddress(handle, 'cusolverDnSormtr') global __cusolverDnDormtr - try: - __cusolverDnDormtr = win32api.GetProcAddress(handle, 'cusolverDnDormtr') - except: - pass + __cusolverDnDormtr = GetProcAddress(handle, 'cusolverDnDormtr') global __cusolverDnCunmtr - try: - __cusolverDnCunmtr = win32api.GetProcAddress(handle, 'cusolverDnCunmtr') - except: - pass + __cusolverDnCunmtr = GetProcAddress(handle, 'cusolverDnCunmtr') global __cusolverDnZunmtr - try: - __cusolverDnZunmtr = win32api.GetProcAddress(handle, 'cusolverDnZunmtr') - except: - pass + __cusolverDnZunmtr = GetProcAddress(handle, 'cusolverDnZunmtr') global __cusolverDnSgesvd_bufferSize - try: - __cusolverDnSgesvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSgesvd_bufferSize') - except: - pass + __cusolverDnSgesvd_bufferSize = GetProcAddress(handle, 'cusolverDnSgesvd_bufferSize') global __cusolverDnDgesvd_bufferSize - try: - __cusolverDnDgesvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDgesvd_bufferSize') - except: - pass + __cusolverDnDgesvd_bufferSize = GetProcAddress(handle, 'cusolverDnDgesvd_bufferSize') global __cusolverDnCgesvd_bufferSize - try: - __cusolverDnCgesvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCgesvd_bufferSize') - except: - pass + __cusolverDnCgesvd_bufferSize = GetProcAddress(handle, 'cusolverDnCgesvd_bufferSize') global __cusolverDnZgesvd_bufferSize - try: - __cusolverDnZgesvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZgesvd_bufferSize') - except: - pass + __cusolverDnZgesvd_bufferSize = GetProcAddress(handle, 'cusolverDnZgesvd_bufferSize') global __cusolverDnSgesvd - try: - __cusolverDnSgesvd = win32api.GetProcAddress(handle, 'cusolverDnSgesvd') - except: - pass + __cusolverDnSgesvd = GetProcAddress(handle, 'cusolverDnSgesvd') global __cusolverDnDgesvd - try: - __cusolverDnDgesvd = win32api.GetProcAddress(handle, 'cusolverDnDgesvd') - except: - pass + __cusolverDnDgesvd = GetProcAddress(handle, 'cusolverDnDgesvd') global __cusolverDnCgesvd - try: - __cusolverDnCgesvd = win32api.GetProcAddress(handle, 'cusolverDnCgesvd') - except: - pass + __cusolverDnCgesvd = 
GetProcAddress(handle, 'cusolverDnCgesvd') global __cusolverDnZgesvd - try: - __cusolverDnZgesvd = win32api.GetProcAddress(handle, 'cusolverDnZgesvd') - except: - pass + __cusolverDnZgesvd = GetProcAddress(handle, 'cusolverDnZgesvd') global __cusolverDnSsyevd_bufferSize - try: - __cusolverDnSsyevd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsyevd_bufferSize') - except: - pass + __cusolverDnSsyevd_bufferSize = GetProcAddress(handle, 'cusolverDnSsyevd_bufferSize') global __cusolverDnDsyevd_bufferSize - try: - __cusolverDnDsyevd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsyevd_bufferSize') - except: - pass + __cusolverDnDsyevd_bufferSize = GetProcAddress(handle, 'cusolverDnDsyevd_bufferSize') global __cusolverDnCheevd_bufferSize - try: - __cusolverDnCheevd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCheevd_bufferSize') - except: - pass + __cusolverDnCheevd_bufferSize = GetProcAddress(handle, 'cusolverDnCheevd_bufferSize') global __cusolverDnZheevd_bufferSize - try: - __cusolverDnZheevd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZheevd_bufferSize') - except: - pass + __cusolverDnZheevd_bufferSize = GetProcAddress(handle, 'cusolverDnZheevd_bufferSize') global __cusolverDnSsyevd - try: - __cusolverDnSsyevd = win32api.GetProcAddress(handle, 'cusolverDnSsyevd') - except: - pass + __cusolverDnSsyevd = GetProcAddress(handle, 'cusolverDnSsyevd') global __cusolverDnDsyevd - try: - __cusolverDnDsyevd = win32api.GetProcAddress(handle, 'cusolverDnDsyevd') - except: - pass + __cusolverDnDsyevd = GetProcAddress(handle, 'cusolverDnDsyevd') global __cusolverDnCheevd - try: - __cusolverDnCheevd = win32api.GetProcAddress(handle, 'cusolverDnCheevd') - except: - pass + __cusolverDnCheevd = GetProcAddress(handle, 'cusolverDnCheevd') global __cusolverDnZheevd - try: - __cusolverDnZheevd = win32api.GetProcAddress(handle, 'cusolverDnZheevd') - except: - pass + __cusolverDnZheevd = GetProcAddress(handle, 'cusolverDnZheevd') global __cusolverDnSsyevdx_bufferSize - try: - __cusolverDnSsyevdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsyevdx_bufferSize') - except: - pass + __cusolverDnSsyevdx_bufferSize = GetProcAddress(handle, 'cusolverDnSsyevdx_bufferSize') global __cusolverDnDsyevdx_bufferSize - try: - __cusolverDnDsyevdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsyevdx_bufferSize') - except: - pass + __cusolverDnDsyevdx_bufferSize = GetProcAddress(handle, 'cusolverDnDsyevdx_bufferSize') global __cusolverDnCheevdx_bufferSize - try: - __cusolverDnCheevdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCheevdx_bufferSize') - except: - pass + __cusolverDnCheevdx_bufferSize = GetProcAddress(handle, 'cusolverDnCheevdx_bufferSize') global __cusolverDnZheevdx_bufferSize - try: - __cusolverDnZheevdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZheevdx_bufferSize') - except: - pass + __cusolverDnZheevdx_bufferSize = GetProcAddress(handle, 'cusolverDnZheevdx_bufferSize') global __cusolverDnSsyevdx - try: - __cusolverDnSsyevdx = win32api.GetProcAddress(handle, 'cusolverDnSsyevdx') - except: - pass + __cusolverDnSsyevdx = GetProcAddress(handle, 'cusolverDnSsyevdx') global __cusolverDnDsyevdx - try: - __cusolverDnDsyevdx = win32api.GetProcAddress(handle, 'cusolverDnDsyevdx') - except: - pass + __cusolverDnDsyevdx = GetProcAddress(handle, 'cusolverDnDsyevdx') global __cusolverDnCheevdx - try: - __cusolverDnCheevdx = win32api.GetProcAddress(handle, 'cusolverDnCheevdx') - except: - pass + __cusolverDnCheevdx = GetProcAddress(handle, 
'cusolverDnCheevdx') global __cusolverDnZheevdx - try: - __cusolverDnZheevdx = win32api.GetProcAddress(handle, 'cusolverDnZheevdx') - except: - pass + __cusolverDnZheevdx = GetProcAddress(handle, 'cusolverDnZheevdx') global __cusolverDnSsygvdx_bufferSize - try: - __cusolverDnSsygvdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsygvdx_bufferSize') - except: - pass + __cusolverDnSsygvdx_bufferSize = GetProcAddress(handle, 'cusolverDnSsygvdx_bufferSize') global __cusolverDnDsygvdx_bufferSize - try: - __cusolverDnDsygvdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsygvdx_bufferSize') - except: - pass + __cusolverDnDsygvdx_bufferSize = GetProcAddress(handle, 'cusolverDnDsygvdx_bufferSize') global __cusolverDnChegvdx_bufferSize - try: - __cusolverDnChegvdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnChegvdx_bufferSize') - except: - pass + __cusolverDnChegvdx_bufferSize = GetProcAddress(handle, 'cusolverDnChegvdx_bufferSize') global __cusolverDnZhegvdx_bufferSize - try: - __cusolverDnZhegvdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZhegvdx_bufferSize') - except: - pass + __cusolverDnZhegvdx_bufferSize = GetProcAddress(handle, 'cusolverDnZhegvdx_bufferSize') global __cusolverDnSsygvdx - try: - __cusolverDnSsygvdx = win32api.GetProcAddress(handle, 'cusolverDnSsygvdx') - except: - pass + __cusolverDnSsygvdx = GetProcAddress(handle, 'cusolverDnSsygvdx') global __cusolverDnDsygvdx - try: - __cusolverDnDsygvdx = win32api.GetProcAddress(handle, 'cusolverDnDsygvdx') - except: - pass + __cusolverDnDsygvdx = GetProcAddress(handle, 'cusolverDnDsygvdx') global __cusolverDnChegvdx - try: - __cusolverDnChegvdx = win32api.GetProcAddress(handle, 'cusolverDnChegvdx') - except: - pass + __cusolverDnChegvdx = GetProcAddress(handle, 'cusolverDnChegvdx') global __cusolverDnZhegvdx - try: - __cusolverDnZhegvdx = win32api.GetProcAddress(handle, 'cusolverDnZhegvdx') - except: - pass + __cusolverDnZhegvdx = GetProcAddress(handle, 'cusolverDnZhegvdx') global __cusolverDnSsygvd_bufferSize - try: - __cusolverDnSsygvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsygvd_bufferSize') - except: - pass + __cusolverDnSsygvd_bufferSize = GetProcAddress(handle, 'cusolverDnSsygvd_bufferSize') global __cusolverDnDsygvd_bufferSize - try: - __cusolverDnDsygvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsygvd_bufferSize') - except: - pass + __cusolverDnDsygvd_bufferSize = GetProcAddress(handle, 'cusolverDnDsygvd_bufferSize') global __cusolverDnChegvd_bufferSize - try: - __cusolverDnChegvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnChegvd_bufferSize') - except: - pass + __cusolverDnChegvd_bufferSize = GetProcAddress(handle, 'cusolverDnChegvd_bufferSize') global __cusolverDnZhegvd_bufferSize - try: - __cusolverDnZhegvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZhegvd_bufferSize') - except: - pass + __cusolverDnZhegvd_bufferSize = GetProcAddress(handle, 'cusolverDnZhegvd_bufferSize') global __cusolverDnSsygvd - try: - __cusolverDnSsygvd = win32api.GetProcAddress(handle, 'cusolverDnSsygvd') - except: - pass + __cusolverDnSsygvd = GetProcAddress(handle, 'cusolverDnSsygvd') global __cusolverDnDsygvd - try: - __cusolverDnDsygvd = win32api.GetProcAddress(handle, 'cusolverDnDsygvd') - except: - pass + __cusolverDnDsygvd = GetProcAddress(handle, 'cusolverDnDsygvd') global __cusolverDnChegvd - try: - __cusolverDnChegvd = win32api.GetProcAddress(handle, 'cusolverDnChegvd') - except: - pass + __cusolverDnChegvd = GetProcAddress(handle, 
'cusolverDnChegvd') global __cusolverDnZhegvd - try: - __cusolverDnZhegvd = win32api.GetProcAddress(handle, 'cusolverDnZhegvd') - except: - pass + __cusolverDnZhegvd = GetProcAddress(handle, 'cusolverDnZhegvd') global __cusolverDnCreateSyevjInfo - try: - __cusolverDnCreateSyevjInfo = win32api.GetProcAddress(handle, 'cusolverDnCreateSyevjInfo') - except: - pass + __cusolverDnCreateSyevjInfo = GetProcAddress(handle, 'cusolverDnCreateSyevjInfo') global __cusolverDnDestroySyevjInfo - try: - __cusolverDnDestroySyevjInfo = win32api.GetProcAddress(handle, 'cusolverDnDestroySyevjInfo') - except: - pass + __cusolverDnDestroySyevjInfo = GetProcAddress(handle, 'cusolverDnDestroySyevjInfo') global __cusolverDnXsyevjSetTolerance - try: - __cusolverDnXsyevjSetTolerance = win32api.GetProcAddress(handle, 'cusolverDnXsyevjSetTolerance') - except: - pass + __cusolverDnXsyevjSetTolerance = GetProcAddress(handle, 'cusolverDnXsyevjSetTolerance') global __cusolverDnXsyevjSetMaxSweeps - try: - __cusolverDnXsyevjSetMaxSweeps = win32api.GetProcAddress(handle, 'cusolverDnXsyevjSetMaxSweeps') - except: - pass + __cusolverDnXsyevjSetMaxSweeps = GetProcAddress(handle, 'cusolverDnXsyevjSetMaxSweeps') global __cusolverDnXsyevjSetSortEig - try: - __cusolverDnXsyevjSetSortEig = win32api.GetProcAddress(handle, 'cusolverDnXsyevjSetSortEig') - except: - pass + __cusolverDnXsyevjSetSortEig = GetProcAddress(handle, 'cusolverDnXsyevjSetSortEig') global __cusolverDnXsyevjGetResidual - try: - __cusolverDnXsyevjGetResidual = win32api.GetProcAddress(handle, 'cusolverDnXsyevjGetResidual') - except: - pass + __cusolverDnXsyevjGetResidual = GetProcAddress(handle, 'cusolverDnXsyevjGetResidual') global __cusolverDnXsyevjGetSweeps - try: - __cusolverDnXsyevjGetSweeps = win32api.GetProcAddress(handle, 'cusolverDnXsyevjGetSweeps') - except: - pass + __cusolverDnXsyevjGetSweeps = GetProcAddress(handle, 'cusolverDnXsyevjGetSweeps') global __cusolverDnSsyevjBatched_bufferSize - try: - __cusolverDnSsyevjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsyevjBatched_bufferSize') - except: - pass + __cusolverDnSsyevjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnSsyevjBatched_bufferSize') global __cusolverDnDsyevjBatched_bufferSize - try: - __cusolverDnDsyevjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsyevjBatched_bufferSize') - except: - pass + __cusolverDnDsyevjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnDsyevjBatched_bufferSize') global __cusolverDnCheevjBatched_bufferSize - try: - __cusolverDnCheevjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCheevjBatched_bufferSize') - except: - pass + __cusolverDnCheevjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnCheevjBatched_bufferSize') global __cusolverDnZheevjBatched_bufferSize - try: - __cusolverDnZheevjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZheevjBatched_bufferSize') - except: - pass + __cusolverDnZheevjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnZheevjBatched_bufferSize') global __cusolverDnSsyevjBatched - try: - __cusolverDnSsyevjBatched = win32api.GetProcAddress(handle, 'cusolverDnSsyevjBatched') - except: - pass + __cusolverDnSsyevjBatched = GetProcAddress(handle, 'cusolverDnSsyevjBatched') global __cusolverDnDsyevjBatched - try: - __cusolverDnDsyevjBatched = win32api.GetProcAddress(handle, 'cusolverDnDsyevjBatched') - except: - pass + __cusolverDnDsyevjBatched = GetProcAddress(handle, 'cusolverDnDsyevjBatched') global __cusolverDnCheevjBatched - try: - 
__cusolverDnCheevjBatched = win32api.GetProcAddress(handle, 'cusolverDnCheevjBatched') - except: - pass + __cusolverDnCheevjBatched = GetProcAddress(handle, 'cusolverDnCheevjBatched') global __cusolverDnZheevjBatched - try: - __cusolverDnZheevjBatched = win32api.GetProcAddress(handle, 'cusolverDnZheevjBatched') - except: - pass + __cusolverDnZheevjBatched = GetProcAddress(handle, 'cusolverDnZheevjBatched') global __cusolverDnSsyevj_bufferSize - try: - __cusolverDnSsyevj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsyevj_bufferSize') - except: - pass + __cusolverDnSsyevj_bufferSize = GetProcAddress(handle, 'cusolverDnSsyevj_bufferSize') global __cusolverDnDsyevj_bufferSize - try: - __cusolverDnDsyevj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsyevj_bufferSize') - except: - pass + __cusolverDnDsyevj_bufferSize = GetProcAddress(handle, 'cusolverDnDsyevj_bufferSize') global __cusolverDnCheevj_bufferSize - try: - __cusolverDnCheevj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCheevj_bufferSize') - except: - pass + __cusolverDnCheevj_bufferSize = GetProcAddress(handle, 'cusolverDnCheevj_bufferSize') global __cusolverDnZheevj_bufferSize - try: - __cusolverDnZheevj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZheevj_bufferSize') - except: - pass + __cusolverDnZheevj_bufferSize = GetProcAddress(handle, 'cusolverDnZheevj_bufferSize') global __cusolverDnSsyevj - try: - __cusolverDnSsyevj = win32api.GetProcAddress(handle, 'cusolverDnSsyevj') - except: - pass + __cusolverDnSsyevj = GetProcAddress(handle, 'cusolverDnSsyevj') global __cusolverDnDsyevj - try: - __cusolverDnDsyevj = win32api.GetProcAddress(handle, 'cusolverDnDsyevj') - except: - pass + __cusolverDnDsyevj = GetProcAddress(handle, 'cusolverDnDsyevj') global __cusolverDnCheevj - try: - __cusolverDnCheevj = win32api.GetProcAddress(handle, 'cusolverDnCheevj') - except: - pass + __cusolverDnCheevj = GetProcAddress(handle, 'cusolverDnCheevj') global __cusolverDnZheevj - try: - __cusolverDnZheevj = win32api.GetProcAddress(handle, 'cusolverDnZheevj') - except: - pass + __cusolverDnZheevj = GetProcAddress(handle, 'cusolverDnZheevj') global __cusolverDnSsygvj_bufferSize - try: - __cusolverDnSsygvj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSsygvj_bufferSize') - except: - pass + __cusolverDnSsygvj_bufferSize = GetProcAddress(handle, 'cusolverDnSsygvj_bufferSize') global __cusolverDnDsygvj_bufferSize - try: - __cusolverDnDsygvj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDsygvj_bufferSize') - except: - pass + __cusolverDnDsygvj_bufferSize = GetProcAddress(handle, 'cusolverDnDsygvj_bufferSize') global __cusolverDnChegvj_bufferSize - try: - __cusolverDnChegvj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnChegvj_bufferSize') - except: - pass + __cusolverDnChegvj_bufferSize = GetProcAddress(handle, 'cusolverDnChegvj_bufferSize') global __cusolverDnZhegvj_bufferSize - try: - __cusolverDnZhegvj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZhegvj_bufferSize') - except: - pass + __cusolverDnZhegvj_bufferSize = GetProcAddress(handle, 'cusolverDnZhegvj_bufferSize') global __cusolverDnSsygvj - try: - __cusolverDnSsygvj = win32api.GetProcAddress(handle, 'cusolverDnSsygvj') - except: - pass + __cusolverDnSsygvj = GetProcAddress(handle, 'cusolverDnSsygvj') global __cusolverDnDsygvj - try: - __cusolverDnDsygvj = win32api.GetProcAddress(handle, 'cusolverDnDsygvj') - except: - pass + __cusolverDnDsygvj = GetProcAddress(handle, 'cusolverDnDsygvj') global __cusolverDnChegvj - 
try: - __cusolverDnChegvj = win32api.GetProcAddress(handle, 'cusolverDnChegvj') - except: - pass + __cusolverDnChegvj = GetProcAddress(handle, 'cusolverDnChegvj') global __cusolverDnZhegvj - try: - __cusolverDnZhegvj = win32api.GetProcAddress(handle, 'cusolverDnZhegvj') - except: - pass + __cusolverDnZhegvj = GetProcAddress(handle, 'cusolverDnZhegvj') global __cusolverDnCreateGesvdjInfo - try: - __cusolverDnCreateGesvdjInfo = win32api.GetProcAddress(handle, 'cusolverDnCreateGesvdjInfo') - except: - pass + __cusolverDnCreateGesvdjInfo = GetProcAddress(handle, 'cusolverDnCreateGesvdjInfo') global __cusolverDnDestroyGesvdjInfo - try: - __cusolverDnDestroyGesvdjInfo = win32api.GetProcAddress(handle, 'cusolverDnDestroyGesvdjInfo') - except: - pass + __cusolverDnDestroyGesvdjInfo = GetProcAddress(handle, 'cusolverDnDestroyGesvdjInfo') global __cusolverDnXgesvdjSetTolerance - try: - __cusolverDnXgesvdjSetTolerance = win32api.GetProcAddress(handle, 'cusolverDnXgesvdjSetTolerance') - except: - pass + __cusolverDnXgesvdjSetTolerance = GetProcAddress(handle, 'cusolverDnXgesvdjSetTolerance') global __cusolverDnXgesvdjSetMaxSweeps - try: - __cusolverDnXgesvdjSetMaxSweeps = win32api.GetProcAddress(handle, 'cusolverDnXgesvdjSetMaxSweeps') - except: - pass + __cusolverDnXgesvdjSetMaxSweeps = GetProcAddress(handle, 'cusolverDnXgesvdjSetMaxSweeps') global __cusolverDnXgesvdjSetSortEig - try: - __cusolverDnXgesvdjSetSortEig = win32api.GetProcAddress(handle, 'cusolverDnXgesvdjSetSortEig') - except: - pass + __cusolverDnXgesvdjSetSortEig = GetProcAddress(handle, 'cusolverDnXgesvdjSetSortEig') global __cusolverDnXgesvdjGetResidual - try: - __cusolverDnXgesvdjGetResidual = win32api.GetProcAddress(handle, 'cusolverDnXgesvdjGetResidual') - except: - pass + __cusolverDnXgesvdjGetResidual = GetProcAddress(handle, 'cusolverDnXgesvdjGetResidual') global __cusolverDnXgesvdjGetSweeps - try: - __cusolverDnXgesvdjGetSweeps = win32api.GetProcAddress(handle, 'cusolverDnXgesvdjGetSweeps') - except: - pass + __cusolverDnXgesvdjGetSweeps = GetProcAddress(handle, 'cusolverDnXgesvdjGetSweeps') global __cusolverDnSgesvdjBatched_bufferSize - try: - __cusolverDnSgesvdjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSgesvdjBatched_bufferSize') - except: - pass + __cusolverDnSgesvdjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnSgesvdjBatched_bufferSize') global __cusolverDnDgesvdjBatched_bufferSize - try: - __cusolverDnDgesvdjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDgesvdjBatched_bufferSize') - except: - pass + __cusolverDnDgesvdjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnDgesvdjBatched_bufferSize') global __cusolverDnCgesvdjBatched_bufferSize - try: - __cusolverDnCgesvdjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCgesvdjBatched_bufferSize') - except: - pass + __cusolverDnCgesvdjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnCgesvdjBatched_bufferSize') global __cusolverDnZgesvdjBatched_bufferSize - try: - __cusolverDnZgesvdjBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZgesvdjBatched_bufferSize') - except: - pass + __cusolverDnZgesvdjBatched_bufferSize = GetProcAddress(handle, 'cusolverDnZgesvdjBatched_bufferSize') global __cusolverDnSgesvdjBatched - try: - __cusolverDnSgesvdjBatched = win32api.GetProcAddress(handle, 'cusolverDnSgesvdjBatched') - except: - pass + __cusolverDnSgesvdjBatched = GetProcAddress(handle, 'cusolverDnSgesvdjBatched') global __cusolverDnDgesvdjBatched - try: - __cusolverDnDgesvdjBatched = 
win32api.GetProcAddress(handle, 'cusolverDnDgesvdjBatched') - except: - pass + __cusolverDnDgesvdjBatched = GetProcAddress(handle, 'cusolverDnDgesvdjBatched') global __cusolverDnCgesvdjBatched - try: - __cusolverDnCgesvdjBatched = win32api.GetProcAddress(handle, 'cusolverDnCgesvdjBatched') - except: - pass + __cusolverDnCgesvdjBatched = GetProcAddress(handle, 'cusolverDnCgesvdjBatched') global __cusolverDnZgesvdjBatched - try: - __cusolverDnZgesvdjBatched = win32api.GetProcAddress(handle, 'cusolverDnZgesvdjBatched') - except: - pass + __cusolverDnZgesvdjBatched = GetProcAddress(handle, 'cusolverDnZgesvdjBatched') global __cusolverDnSgesvdj_bufferSize - try: - __cusolverDnSgesvdj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSgesvdj_bufferSize') - except: - pass + __cusolverDnSgesvdj_bufferSize = GetProcAddress(handle, 'cusolverDnSgesvdj_bufferSize') global __cusolverDnDgesvdj_bufferSize - try: - __cusolverDnDgesvdj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDgesvdj_bufferSize') - except: - pass + __cusolverDnDgesvdj_bufferSize = GetProcAddress(handle, 'cusolverDnDgesvdj_bufferSize') global __cusolverDnCgesvdj_bufferSize - try: - __cusolverDnCgesvdj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCgesvdj_bufferSize') - except: - pass + __cusolverDnCgesvdj_bufferSize = GetProcAddress(handle, 'cusolverDnCgesvdj_bufferSize') global __cusolverDnZgesvdj_bufferSize - try: - __cusolverDnZgesvdj_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnZgesvdj_bufferSize') - except: - pass + __cusolverDnZgesvdj_bufferSize = GetProcAddress(handle, 'cusolverDnZgesvdj_bufferSize') global __cusolverDnSgesvdj - try: - __cusolverDnSgesvdj = win32api.GetProcAddress(handle, 'cusolverDnSgesvdj') - except: - pass + __cusolverDnSgesvdj = GetProcAddress(handle, 'cusolverDnSgesvdj') global __cusolverDnDgesvdj - try: - __cusolverDnDgesvdj = win32api.GetProcAddress(handle, 'cusolverDnDgesvdj') - except: - pass + __cusolverDnDgesvdj = GetProcAddress(handle, 'cusolverDnDgesvdj') global __cusolverDnCgesvdj - try: - __cusolverDnCgesvdj = win32api.GetProcAddress(handle, 'cusolverDnCgesvdj') - except: - pass + __cusolverDnCgesvdj = GetProcAddress(handle, 'cusolverDnCgesvdj') global __cusolverDnZgesvdj - try: - __cusolverDnZgesvdj = win32api.GetProcAddress(handle, 'cusolverDnZgesvdj') - except: - pass + __cusolverDnZgesvdj = GetProcAddress(handle, 'cusolverDnZgesvdj') global __cusolverDnSgesvdaStridedBatched_bufferSize - try: - __cusolverDnSgesvdaStridedBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnSgesvdaStridedBatched_bufferSize') - except: - pass + __cusolverDnSgesvdaStridedBatched_bufferSize = GetProcAddress(handle, 'cusolverDnSgesvdaStridedBatched_bufferSize') global __cusolverDnDgesvdaStridedBatched_bufferSize - try: - __cusolverDnDgesvdaStridedBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnDgesvdaStridedBatched_bufferSize') - except: - pass + __cusolverDnDgesvdaStridedBatched_bufferSize = GetProcAddress(handle, 'cusolverDnDgesvdaStridedBatched_bufferSize') global __cusolverDnCgesvdaStridedBatched_bufferSize - try: - __cusolverDnCgesvdaStridedBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnCgesvdaStridedBatched_bufferSize') - except: - pass + __cusolverDnCgesvdaStridedBatched_bufferSize = GetProcAddress(handle, 'cusolverDnCgesvdaStridedBatched_bufferSize') global __cusolverDnZgesvdaStridedBatched_bufferSize - try: - __cusolverDnZgesvdaStridedBatched_bufferSize = win32api.GetProcAddress(handle, 
'cusolverDnZgesvdaStridedBatched_bufferSize') - except: - pass + __cusolverDnZgesvdaStridedBatched_bufferSize = GetProcAddress(handle, 'cusolverDnZgesvdaStridedBatched_bufferSize') global __cusolverDnSgesvdaStridedBatched - try: - __cusolverDnSgesvdaStridedBatched = win32api.GetProcAddress(handle, 'cusolverDnSgesvdaStridedBatched') - except: - pass + __cusolverDnSgesvdaStridedBatched = GetProcAddress(handle, 'cusolverDnSgesvdaStridedBatched') global __cusolverDnDgesvdaStridedBatched - try: - __cusolverDnDgesvdaStridedBatched = win32api.GetProcAddress(handle, 'cusolverDnDgesvdaStridedBatched') - except: - pass + __cusolverDnDgesvdaStridedBatched = GetProcAddress(handle, 'cusolverDnDgesvdaStridedBatched') global __cusolverDnCgesvdaStridedBatched - try: - __cusolverDnCgesvdaStridedBatched = win32api.GetProcAddress(handle, 'cusolverDnCgesvdaStridedBatched') - except: - pass + __cusolverDnCgesvdaStridedBatched = GetProcAddress(handle, 'cusolverDnCgesvdaStridedBatched') global __cusolverDnZgesvdaStridedBatched - try: - __cusolverDnZgesvdaStridedBatched = win32api.GetProcAddress(handle, 'cusolverDnZgesvdaStridedBatched') - except: - pass + __cusolverDnZgesvdaStridedBatched = GetProcAddress(handle, 'cusolverDnZgesvdaStridedBatched') global __cusolverDnCreateParams - try: - __cusolverDnCreateParams = win32api.GetProcAddress(handle, 'cusolverDnCreateParams') - except: - pass + __cusolverDnCreateParams = GetProcAddress(handle, 'cusolverDnCreateParams') global __cusolverDnDestroyParams - try: - __cusolverDnDestroyParams = win32api.GetProcAddress(handle, 'cusolverDnDestroyParams') - except: - pass + __cusolverDnDestroyParams = GetProcAddress(handle, 'cusolverDnDestroyParams') global __cusolverDnSetAdvOptions - try: - __cusolverDnSetAdvOptions = win32api.GetProcAddress(handle, 'cusolverDnSetAdvOptions') - except: - pass + __cusolverDnSetAdvOptions = GetProcAddress(handle, 'cusolverDnSetAdvOptions') global __cusolverDnXpotrf_bufferSize - try: - __cusolverDnXpotrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXpotrf_bufferSize') - except: - pass + __cusolverDnXpotrf_bufferSize = GetProcAddress(handle, 'cusolverDnXpotrf_bufferSize') global __cusolverDnXpotrf - try: - __cusolverDnXpotrf = win32api.GetProcAddress(handle, 'cusolverDnXpotrf') - except: - pass + __cusolverDnXpotrf = GetProcAddress(handle, 'cusolverDnXpotrf') global __cusolverDnXpotrs - try: - __cusolverDnXpotrs = win32api.GetProcAddress(handle, 'cusolverDnXpotrs') - except: - pass + __cusolverDnXpotrs = GetProcAddress(handle, 'cusolverDnXpotrs') global __cusolverDnXgeqrf_bufferSize - try: - __cusolverDnXgeqrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXgeqrf_bufferSize') - except: - pass + __cusolverDnXgeqrf_bufferSize = GetProcAddress(handle, 'cusolverDnXgeqrf_bufferSize') global __cusolverDnXgeqrf - try: - __cusolverDnXgeqrf = win32api.GetProcAddress(handle, 'cusolverDnXgeqrf') - except: - pass + __cusolverDnXgeqrf = GetProcAddress(handle, 'cusolverDnXgeqrf') global __cusolverDnXgetrf_bufferSize - try: - __cusolverDnXgetrf_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXgetrf_bufferSize') - except: - pass + __cusolverDnXgetrf_bufferSize = GetProcAddress(handle, 'cusolverDnXgetrf_bufferSize') global __cusolverDnXgetrf - try: - __cusolverDnXgetrf = win32api.GetProcAddress(handle, 'cusolverDnXgetrf') - except: - pass + __cusolverDnXgetrf = GetProcAddress(handle, 'cusolverDnXgetrf') global __cusolverDnXgetrs - try: - __cusolverDnXgetrs = win32api.GetProcAddress(handle, 'cusolverDnXgetrs') - except: - pass + 
__cusolverDnXgetrs = GetProcAddress(handle, 'cusolverDnXgetrs') global __cusolverDnXsyevd_bufferSize - try: - __cusolverDnXsyevd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXsyevd_bufferSize') - except: - pass + __cusolverDnXsyevd_bufferSize = GetProcAddress(handle, 'cusolverDnXsyevd_bufferSize') global __cusolverDnXsyevd - try: - __cusolverDnXsyevd = win32api.GetProcAddress(handle, 'cusolverDnXsyevd') - except: - pass + __cusolverDnXsyevd = GetProcAddress(handle, 'cusolverDnXsyevd') global __cusolverDnXsyevdx_bufferSize - try: - __cusolverDnXsyevdx_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXsyevdx_bufferSize') - except: - pass + __cusolverDnXsyevdx_bufferSize = GetProcAddress(handle, 'cusolverDnXsyevdx_bufferSize') global __cusolverDnXsyevdx - try: - __cusolverDnXsyevdx = win32api.GetProcAddress(handle, 'cusolverDnXsyevdx') - except: - pass + __cusolverDnXsyevdx = GetProcAddress(handle, 'cusolverDnXsyevdx') global __cusolverDnXgesvd_bufferSize - try: - __cusolverDnXgesvd_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXgesvd_bufferSize') - except: - pass + __cusolverDnXgesvd_bufferSize = GetProcAddress(handle, 'cusolverDnXgesvd_bufferSize') global __cusolverDnXgesvd - try: - __cusolverDnXgesvd = win32api.GetProcAddress(handle, 'cusolverDnXgesvd') - except: - pass + __cusolverDnXgesvd = GetProcAddress(handle, 'cusolverDnXgesvd') global __cusolverDnXgesvdp_bufferSize - try: - __cusolverDnXgesvdp_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXgesvdp_bufferSize') - except: - pass + __cusolverDnXgesvdp_bufferSize = GetProcAddress(handle, 'cusolverDnXgesvdp_bufferSize') global __cusolverDnXgesvdp - try: - __cusolverDnXgesvdp = win32api.GetProcAddress(handle, 'cusolverDnXgesvdp') - except: - pass + __cusolverDnXgesvdp = GetProcAddress(handle, 'cusolverDnXgesvdp') global __cusolverDnXgesvdr_bufferSize - try: - __cusolverDnXgesvdr_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXgesvdr_bufferSize') - except: - pass + __cusolverDnXgesvdr_bufferSize = GetProcAddress(handle, 'cusolverDnXgesvdr_bufferSize') global __cusolverDnXgesvdr - try: - __cusolverDnXgesvdr = win32api.GetProcAddress(handle, 'cusolverDnXgesvdr') - except: - pass + __cusolverDnXgesvdr = GetProcAddress(handle, 'cusolverDnXgesvdr') global __cusolverDnXsytrs_bufferSize - try: - __cusolverDnXsytrs_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXsytrs_bufferSize') - except: - pass + __cusolverDnXsytrs_bufferSize = GetProcAddress(handle, 'cusolverDnXsytrs_bufferSize') global __cusolverDnXsytrs - try: - __cusolverDnXsytrs = win32api.GetProcAddress(handle, 'cusolverDnXsytrs') - except: - pass + __cusolverDnXsytrs = GetProcAddress(handle, 'cusolverDnXsytrs') global __cusolverDnXtrtri_bufferSize - try: - __cusolverDnXtrtri_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXtrtri_bufferSize') - except: - pass + __cusolverDnXtrtri_bufferSize = GetProcAddress(handle, 'cusolverDnXtrtri_bufferSize') global __cusolverDnXtrtri - try: - __cusolverDnXtrtri = win32api.GetProcAddress(handle, 'cusolverDnXtrtri') - except: - pass + __cusolverDnXtrtri = GetProcAddress(handle, 'cusolverDnXtrtri') global __cusolverDnLoggerSetCallback - try: - __cusolverDnLoggerSetCallback = win32api.GetProcAddress(handle, 'cusolverDnLoggerSetCallback') - except: - pass + __cusolverDnLoggerSetCallback = GetProcAddress(handle, 'cusolverDnLoggerSetCallback') global __cusolverDnLoggerSetFile - try: - __cusolverDnLoggerSetFile = win32api.GetProcAddress(handle, 'cusolverDnLoggerSetFile') - except: - pass + 
__cusolverDnLoggerSetFile = GetProcAddress(handle, 'cusolverDnLoggerSetFile')
 
         global __cusolverDnLoggerOpenFile
-        try:
-            __cusolverDnLoggerOpenFile = win32api.GetProcAddress(handle, 'cusolverDnLoggerOpenFile')
-        except:
-            pass
+        __cusolverDnLoggerOpenFile = GetProcAddress(handle, 'cusolverDnLoggerOpenFile')
 
         global __cusolverDnLoggerSetLevel
-        try:
-            __cusolverDnLoggerSetLevel = win32api.GetProcAddress(handle, 'cusolverDnLoggerSetLevel')
-        except:
-            pass
+        __cusolverDnLoggerSetLevel = GetProcAddress(handle, 'cusolverDnLoggerSetLevel')
 
         global __cusolverDnLoggerSetMask
-        try:
-            __cusolverDnLoggerSetMask = win32api.GetProcAddress(handle, 'cusolverDnLoggerSetMask')
-        except:
-            pass
+        __cusolverDnLoggerSetMask = GetProcAddress(handle, 'cusolverDnLoggerSetMask')
 
         global __cusolverDnLoggerForceDisable
-        try:
-            __cusolverDnLoggerForceDisable = win32api.GetProcAddress(handle, 'cusolverDnLoggerForceDisable')
-        except:
-            pass
+        __cusolverDnLoggerForceDisable = GetProcAddress(handle, 'cusolverDnLoggerForceDisable')
 
         global __cusolverDnSetDeterministicMode
-        try:
-            __cusolverDnSetDeterministicMode = win32api.GetProcAddress(handle, 'cusolverDnSetDeterministicMode')
-        except:
-            pass
+        __cusolverDnSetDeterministicMode = GetProcAddress(handle, 'cusolverDnSetDeterministicMode')
 
         global __cusolverDnGetDeterministicMode
-        try:
-            __cusolverDnGetDeterministicMode = win32api.GetProcAddress(handle, 'cusolverDnGetDeterministicMode')
-        except:
-            pass
+        __cusolverDnGetDeterministicMode = GetProcAddress(handle, 'cusolverDnGetDeterministicMode')
 
         global __cusolverDnXlarft_bufferSize
-        try:
-            __cusolverDnXlarft_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXlarft_bufferSize')
-        except:
-            pass
+        __cusolverDnXlarft_bufferSize = GetProcAddress(handle, 'cusolverDnXlarft_bufferSize')
 
         global __cusolverDnXlarft
-        try:
-            __cusolverDnXlarft = win32api.GetProcAddress(handle, 'cusolverDnXlarft')
-        except:
-            pass
+        __cusolverDnXlarft = GetProcAddress(handle, 'cusolverDnXlarft')
 
         global __cusolverDnXsyevBatched_bufferSize
-        try:
-            __cusolverDnXsyevBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXsyevBatched_bufferSize')
-        except:
-            pass
+        __cusolverDnXsyevBatched_bufferSize = GetProcAddress(handle, 'cusolverDnXsyevBatched_bufferSize')
 
         global __cusolverDnXsyevBatched
-        try:
-            __cusolverDnXsyevBatched = win32api.GetProcAddress(handle, 'cusolverDnXsyevBatched')
-        except:
-            pass
+        __cusolverDnXsyevBatched = GetProcAddress(handle, 'cusolverDnXsyevBatched')
 
         global __cusolverDnXgeev_bufferSize
-        try:
-            __cusolverDnXgeev_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXgeev_bufferSize')
-        except:
-            pass
+        __cusolverDnXgeev_bufferSize = GetProcAddress(handle, 'cusolverDnXgeev_bufferSize')
 
         global __cusolverDnXgeev
-        try:
-            __cusolverDnXgeev = win32api.GetProcAddress(handle, 'cusolverDnXgeev')
-        except:
-            pass
+        __cusolverDnXgeev = GetProcAddress(handle, 'cusolverDnXgeev')
 
-    __py_cusolverDn_init = True
-    return 0
+        __py_cusolverDn_init = True
+        return 0
 
 
 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/cusolver_linux.pyx b/nvmath/bindings/_internal/cusolver_linux.pyx
index a8ef1c0..982c5bc 100644
--- a/nvmath/bindings/_internal/cusolver_linux.pyx
+++ b/nvmath/bindings/_internal/cusolver_linux.pyx
@@ -6,10 +6,13 @@
 from libc.stdint cimport intptr_t, uintptr_t
 
+import threading
+
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+
 ###############################################################################
 # Extern
 ###############################################################################
@@ -28,13 +31,31 @@ cdef extern from "<dlfcn.h>" nogil:
     const void* RTLD_DEFAULT 'RTLD_DEFAULT'
 
 
+cdef int get_cuda_version():
+    cdef void* handle = NULL
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        err_msg = dlerror()
+        raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
 ###############################################################################
 # Wrapper init
 ###############################################################################
 
+cdef object __symbol_lock = threading.Lock()
 cdef bint __py_cusolver_init = False
-cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __cusolverGetProperty = NULL
 cdef void* __cusolverGetVersion = NULL
@@ -49,44 +70,28 @@ cdef int _check_or_init_cusolver() except -1 nogil:
     if __py_cusolver_init:
         return 0
 
-    # Load driver to check version
     cdef void* handle = NULL
-    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
-    if handle == NULL:
-        with gil:
-            err_msg = dlerror()
-            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
-    global __cuDriverGetVersion
-    if __cuDriverGetVersion == NULL:
-        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
-    if __cuDriverGetVersion == NULL:
-        with gil:
-            raise RuntimeError('something went wrong')
-    cdef int err, driver_ver
-    err = (<int (*)(int*) noexcept nogil>__cuDriverGetVersion)(&driver_ver)
-    if err != 0:
-        with gil:
-            raise RuntimeError('something went wrong')
-    #dlclose(handle)
-    handle = NULL
-
-    # Load function
-    global __cusolverGetProperty
-    __cusolverGetProperty = dlsym(RTLD_DEFAULT, 'cusolverGetProperty')
-    if __cusolverGetProperty == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __cusolverGetProperty = dlsym(handle, 'cusolverGetProperty')
-
-    global __cusolverGetVersion
-    __cusolverGetVersion = dlsym(RTLD_DEFAULT, 'cusolverGetVersion')
-    if __cusolverGetVersion == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __cusolverGetVersion = dlsym(handle, 'cusolverGetVersion')
-    __py_cusolver_init = True
-    return 0
+    with gil, __symbol_lock:
+        driver_ver = get_cuda_version()
+
+        # Load function
+        global __cusolverGetProperty
+        __cusolverGetProperty = dlsym(RTLD_DEFAULT, 'cusolverGetProperty')
+        if __cusolverGetProperty == NULL:
+            if handle == NULL:
+                handle = load_library(driver_ver)
+            __cusolverGetProperty = dlsym(handle, 'cusolverGetProperty')
+
+        global __cusolverGetVersion
+        __cusolverGetVersion = dlsym(RTLD_DEFAULT, 'cusolverGetVersion')
+        if __cusolverGetVersion == NULL:
+            if handle == NULL:
+                handle = load_library(driver_ver)
+            __cusolverGetVersion = dlsym(handle, 'cusolverGetVersion')
+
+        __py_cusolver_init = True
+        return 0
 
 
 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/cusolver_windows.pyx b/nvmath/bindings/_internal/cusolver_windows.pyx
index d050c24..93984f8 100644
--- a/nvmath/bindings/_internal/cusolver_windows.pyx
+++ b/nvmath/bindings/_internal/cusolver_windows.pyx
@@ -11,20 +11,77 @@ from .cusparse cimport load_library as load_cusparse
 
 import os
 import site
-
-import win32api
+import threading
 
 from .utils import FunctionNotFoundError, NotSupportedError
 from cuda.pathfinder import load_nvidia_dynamic_lib
+from libc.stddef cimport wchar_t
+from libc.stdint cimport uintptr_t
+from cpython cimport PyUnicode_AsWideCharString, PyMem_Free
+
+from .utils import NotSupportedError
+
+cdef extern from "windows.h" nogil:
+    ctypedef void* HMODULE
+    ctypedef void* HANDLE
+    ctypedef void* FARPROC
+    ctypedef unsigned long DWORD
+    ctypedef const wchar_t *LPCWSTR
+    ctypedef const char *LPCSTR
+
+    cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+    cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+    cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+
+    HMODULE _LoadLibraryExW "LoadLibraryExW"(
+        LPCWSTR lpLibFileName,
+        HANDLE hFile,
+        DWORD dwFlags
+    )
+
+    FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName)
+
+cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags):
+    cdef uintptr_t result
+    cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL)
+    with nogil:
+        result = <uintptr_t>_LoadLibraryExW(
+            wpath,
+            hFile,
+            dwFlags
+        )
+    PyMem_Free(wpath)
+    return result
+
+cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil:
+    return _GetProcAddress(<HMODULE>hModule, lpProcName)
+
+cdef int get_cuda_version():
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32)
+    if handle == 0:
+        raise NotSupportedError('CUDA driver is not found')
+    cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion')
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in nvcuda.dll')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
+
 
 ###############################################################################
 # Wrapper init
 ###############################################################################
 
-LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+cdef object __symbol_lock = threading.Lock()
 cdef bint __py_cusolver_init = False
-cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __cusolverGetProperty = NULL
 cdef void* __cusolverGetVersion = NULL
@@ -43,40 +100,21 @@ cdef int _check_or_init_cusolver() except -1 nogil:
     if __py_cusolver_init:
         return 0
 
-    cdef int err, driver_ver
-    with gil:
-        # Load driver to check version
-        try:
-            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
-        except Exception as e:
-            raise NotSupportedError(f'CUDA driver is not found ({e})')
-        global __cuDriverGetVersion
-        if __cuDriverGetVersion == NULL:
-            __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion')
-        if __cuDriverGetVersion == NULL:
-            raise RuntimeError('something went wrong')
-        err = (<int (*)(int*) noexcept nogil>__cuDriverGetVersion)(&driver_ver)
-        if err != 0:
-            raise RuntimeError('something went wrong')
+    with gil, __symbol_lock:
+        driver_ver = get_cuda_version()
 
         # Load library
         handle = load_library(driver_ver)
 
         # Load function
         global __cusolverGetProperty
-        try:
-            __cusolverGetProperty = win32api.GetProcAddress(handle, 'cusolverGetProperty')
-        except:
-            pass
+        __cusolverGetProperty = GetProcAddress(handle, 'cusolverGetProperty')
 
         global __cusolverGetVersion
-        try:
-            __cusolverGetVersion = win32api.GetProcAddress(handle, 'cusolverGetVersion')
-        except:
-            pass
+        __cusolverGetVersion = GetProcAddress(handle, 'cusolverGetVersion')
 
-    __py_cusolver_init = True
-    return 0
+        __py_cusolver_init = True
+        return 0
 
 
 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/cusparse_linux.pyx b/nvmath/bindings/_internal/cusparse_linux.pyx
index da90151..7efa5d1 100644
--- a/nvmath/bindings/_internal/cusparse_linux.pyx
+++ b/nvmath/bindings/_internal/cusparse_linux.pyx
@@ -6,10 +6,13 @@
 from libc.stdint cimport intptr_t, uintptr_t
 
+import threading
+
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+
 ###############################################################################
 # Extern
 ###############################################################################
@@ -28,13 +31,31 @@ cdef extern from "<dlfcn.h>" nogil:
     const void* RTLD_DEFAULT 'RTLD_DEFAULT'
 
 
+cdef int get_cuda_version():
+    cdef void* handle = NULL
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        err_msg = dlerror()
+        raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
 ###############################################################################
 # Wrapper init
 ###############################################################################
 
+cdef object __symbol_lock = threading.Lock()
 cdef bint __py_cusparse_init = False
-cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __cusparseCreate = NULL
 cdef void* __cusparseDestroy = NULL
@@ -304,1822 +325,1806 @@ cdef int _check_or_init_cusparse() except -1 nogil:
     if __py_cusparse_init:
         return 0
 
-    # Load driver to check version
     cdef void* handle = NULL
-    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
-    if handle == NULL:
-        with gil:
-            err_msg = dlerror()
-            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
-    global __cuDriverGetVersion
-    if __cuDriverGetVersion == NULL:
-        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
-    if __cuDriverGetVersion == NULL:
-        with gil:
-            raise RuntimeError('something went wrong')
-    cdef int err, driver_ver
-    err = (<int (*)(int*) noexcept nogil>__cuDriverGetVersion)(&driver_ver)
-    if err != 0:
-        with gil:
-            raise RuntimeError('something went wrong')
-    #dlclose(handle)
-    handle = NULL
-
-    # Load function
-    global __cusparseCreate
-    __cusparseCreate = dlsym(RTLD_DEFAULT, 'cusparseCreate')
-    if __cusparseCreate == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __cusparseCreate = dlsym(handle, 'cusparseCreate')
-
-    global __cusparseDestroy
-    __cusparseDestroy = dlsym(RTLD_DEFAULT, 'cusparseDestroy')
-    if __cusparseDestroy == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __cusparseDestroy = dlsym(handle, 'cusparseDestroy')
-
-    global __cusparseGetVersion
-    __cusparseGetVersion = dlsym(RTLD_DEFAULT, 'cusparseGetVersion')
-    if __cusparseGetVersion == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __cusparseGetVersion = dlsym(handle, 'cusparseGetVersion')
-
-    global __cusparseGetProperty
-    __cusparseGetProperty = dlsym(RTLD_DEFAULT, 'cusparseGetProperty')
-    if __cusparseGetProperty == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __cusparseGetProperty = dlsym(handle, 'cusparseGetProperty')
-
-    global __cusparseGetErrorName
-    __cusparseGetErrorName = dlsym(RTLD_DEFAULT, 'cusparseGetErrorName')
-    if __cusparseGetErrorName == NULL:
- if handle == NULL: - handle = load_library(driver_ver) - __cusparseGetErrorName = dlsym(handle, 'cusparseGetErrorName') - - global __cusparseGetErrorString - __cusparseGetErrorString = dlsym(RTLD_DEFAULT, 'cusparseGetErrorString') - if __cusparseGetErrorString == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseGetErrorString = dlsym(handle, 'cusparseGetErrorString') - - global __cusparseSetStream - __cusparseSetStream = dlsym(RTLD_DEFAULT, 'cusparseSetStream') - if __cusparseSetStream == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSetStream = dlsym(handle, 'cusparseSetStream') - - global __cusparseGetStream - __cusparseGetStream = dlsym(RTLD_DEFAULT, 'cusparseGetStream') - if __cusparseGetStream == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseGetStream = dlsym(handle, 'cusparseGetStream') - - global __cusparseGetPointerMode - __cusparseGetPointerMode = dlsym(RTLD_DEFAULT, 'cusparseGetPointerMode') - if __cusparseGetPointerMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseGetPointerMode = dlsym(handle, 'cusparseGetPointerMode') - - global __cusparseSetPointerMode - __cusparseSetPointerMode = dlsym(RTLD_DEFAULT, 'cusparseSetPointerMode') - if __cusparseSetPointerMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSetPointerMode = dlsym(handle, 'cusparseSetPointerMode') - - global __cusparseCreateMatDescr - __cusparseCreateMatDescr = dlsym(RTLD_DEFAULT, 'cusparseCreateMatDescr') - if __cusparseCreateMatDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateMatDescr = dlsym(handle, 'cusparseCreateMatDescr') - - global __cusparseDestroyMatDescr - __cusparseDestroyMatDescr = dlsym(RTLD_DEFAULT, 'cusparseDestroyMatDescr') - if __cusparseDestroyMatDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDestroyMatDescr = dlsym(handle, 'cusparseDestroyMatDescr') - - global __cusparseSetMatType - __cusparseSetMatType = dlsym(RTLD_DEFAULT, 'cusparseSetMatType') - if __cusparseSetMatType == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSetMatType = dlsym(handle, 'cusparseSetMatType') - - global __cusparseGetMatType - __cusparseGetMatType = dlsym(RTLD_DEFAULT, 'cusparseGetMatType') - if __cusparseGetMatType == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseGetMatType = dlsym(handle, 'cusparseGetMatType') - - global __cusparseSetMatFillMode - __cusparseSetMatFillMode = dlsym(RTLD_DEFAULT, 'cusparseSetMatFillMode') - if __cusparseSetMatFillMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSetMatFillMode = dlsym(handle, 'cusparseSetMatFillMode') - - global __cusparseGetMatFillMode - __cusparseGetMatFillMode = dlsym(RTLD_DEFAULT, 'cusparseGetMatFillMode') - if __cusparseGetMatFillMode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseGetMatFillMode = dlsym(handle, 'cusparseGetMatFillMode') - - global __cusparseSetMatDiagType - __cusparseSetMatDiagType = dlsym(RTLD_DEFAULT, 'cusparseSetMatDiagType') - if __cusparseSetMatDiagType == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSetMatDiagType = dlsym(handle, 'cusparseSetMatDiagType') - - global __cusparseGetMatDiagType - __cusparseGetMatDiagType = dlsym(RTLD_DEFAULT, 'cusparseGetMatDiagType') - if __cusparseGetMatDiagType == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cusparseGetMatDiagType = dlsym(handle, 'cusparseGetMatDiagType') - - global __cusparseSetMatIndexBase - __cusparseSetMatIndexBase = dlsym(RTLD_DEFAULT, 'cusparseSetMatIndexBase') - if __cusparseSetMatIndexBase == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSetMatIndexBase = dlsym(handle, 'cusparseSetMatIndexBase') - - global __cusparseGetMatIndexBase - __cusparseGetMatIndexBase = dlsym(RTLD_DEFAULT, 'cusparseGetMatIndexBase') - if __cusparseGetMatIndexBase == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseGetMatIndexBase = dlsym(handle, 'cusparseGetMatIndexBase') - - global __cusparseSgemvi - __cusparseSgemvi = dlsym(RTLD_DEFAULT, 'cusparseSgemvi') - if __cusparseSgemvi == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgemvi = dlsym(handle, 'cusparseSgemvi') - - global __cusparseSgemvi_bufferSize - __cusparseSgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSgemvi_bufferSize') - if __cusparseSgemvi_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgemvi_bufferSize = dlsym(handle, 'cusparseSgemvi_bufferSize') - - global __cusparseDgemvi - __cusparseDgemvi = dlsym(RTLD_DEFAULT, 'cusparseDgemvi') - if __cusparseDgemvi == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgemvi = dlsym(handle, 'cusparseDgemvi') - - global __cusparseDgemvi_bufferSize - __cusparseDgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDgemvi_bufferSize') - if __cusparseDgemvi_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgemvi_bufferSize = dlsym(handle, 'cusparseDgemvi_bufferSize') - - global __cusparseCgemvi - __cusparseCgemvi = dlsym(RTLD_DEFAULT, 'cusparseCgemvi') - if __cusparseCgemvi == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgemvi = dlsym(handle, 'cusparseCgemvi') - - global __cusparseCgemvi_bufferSize - __cusparseCgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCgemvi_bufferSize') - if __cusparseCgemvi_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgemvi_bufferSize = dlsym(handle, 'cusparseCgemvi_bufferSize') - - global __cusparseZgemvi - __cusparseZgemvi = dlsym(RTLD_DEFAULT, 'cusparseZgemvi') - if __cusparseZgemvi == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgemvi = dlsym(handle, 'cusparseZgemvi') - - global __cusparseZgemvi_bufferSize - __cusparseZgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZgemvi_bufferSize') - if __cusparseZgemvi_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgemvi_bufferSize = dlsym(handle, 'cusparseZgemvi_bufferSize') - - global __cusparseSbsrmv - __cusparseSbsrmv = dlsym(RTLD_DEFAULT, 'cusparseSbsrmv') - if __cusparseSbsrmv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSbsrmv = dlsym(handle, 'cusparseSbsrmv') - - global __cusparseDbsrmv - __cusparseDbsrmv = dlsym(RTLD_DEFAULT, 'cusparseDbsrmv') - if __cusparseDbsrmv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDbsrmv = dlsym(handle, 'cusparseDbsrmv') - - global __cusparseCbsrmv - __cusparseCbsrmv = dlsym(RTLD_DEFAULT, 'cusparseCbsrmv') - if __cusparseCbsrmv == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCbsrmv = dlsym(handle, 'cusparseCbsrmv') - - global __cusparseZbsrmv - __cusparseZbsrmv = dlsym(RTLD_DEFAULT, 'cusparseZbsrmv') - if __cusparseZbsrmv == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cusparseZbsrmv = dlsym(handle, 'cusparseZbsrmv') - - global __cusparseSbsrmm - __cusparseSbsrmm = dlsym(RTLD_DEFAULT, 'cusparseSbsrmm') - if __cusparseSbsrmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSbsrmm = dlsym(handle, 'cusparseSbsrmm') - - global __cusparseDbsrmm - __cusparseDbsrmm = dlsym(RTLD_DEFAULT, 'cusparseDbsrmm') - if __cusparseDbsrmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDbsrmm = dlsym(handle, 'cusparseDbsrmm') - - global __cusparseCbsrmm - __cusparseCbsrmm = dlsym(RTLD_DEFAULT, 'cusparseCbsrmm') - if __cusparseCbsrmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCbsrmm = dlsym(handle, 'cusparseCbsrmm') - - global __cusparseZbsrmm - __cusparseZbsrmm = dlsym(RTLD_DEFAULT, 'cusparseZbsrmm') - if __cusparseZbsrmm == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZbsrmm = dlsym(handle, 'cusparseZbsrmm') - - global __cusparseSgtsv2_bufferSizeExt - __cusparseSgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2_bufferSizeExt') - if __cusparseSgtsv2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsv2_bufferSizeExt = dlsym(handle, 'cusparseSgtsv2_bufferSizeExt') - global __cusparseDgtsv2_bufferSizeExt - __cusparseDgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2_bufferSizeExt') - if __cusparseDgtsv2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsv2_bufferSizeExt = dlsym(handle, 'cusparseDgtsv2_bufferSizeExt') - - global __cusparseCgtsv2_bufferSizeExt - __cusparseCgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2_bufferSizeExt') - if __cusparseCgtsv2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsv2_bufferSizeExt = dlsym(handle, 'cusparseCgtsv2_bufferSizeExt') - - global __cusparseZgtsv2_bufferSizeExt - __cusparseZgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2_bufferSizeExt') - if __cusparseZgtsv2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsv2_bufferSizeExt = dlsym(handle, 'cusparseZgtsv2_bufferSizeExt') - - global __cusparseSgtsv2 - __cusparseSgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2') - if __cusparseSgtsv2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsv2 = dlsym(handle, 'cusparseSgtsv2') - - global __cusparseDgtsv2 - __cusparseDgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2') - if __cusparseDgtsv2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsv2 = dlsym(handle, 'cusparseDgtsv2') - - global __cusparseCgtsv2 - __cusparseCgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2') - if __cusparseCgtsv2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsv2 = dlsym(handle, 'cusparseCgtsv2') - - global __cusparseZgtsv2 - __cusparseZgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2') - if __cusparseZgtsv2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsv2 = dlsym(handle, 'cusparseZgtsv2') - - global __cusparseSgtsv2_nopivot_bufferSizeExt - __cusparseSgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2_nopivot_bufferSizeExt') - if __cusparseSgtsv2_nopivot_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseSgtsv2_nopivot_bufferSizeExt') - - global 
__cusparseDgtsv2_nopivot_bufferSizeExt - __cusparseDgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2_nopivot_bufferSizeExt') - if __cusparseDgtsv2_nopivot_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseDgtsv2_nopivot_bufferSizeExt') - - global __cusparseCgtsv2_nopivot_bufferSizeExt - __cusparseCgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2_nopivot_bufferSizeExt') - if __cusparseCgtsv2_nopivot_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseCgtsv2_nopivot_bufferSizeExt') - - global __cusparseZgtsv2_nopivot_bufferSizeExt - __cusparseZgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2_nopivot_bufferSizeExt') - if __cusparseZgtsv2_nopivot_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseZgtsv2_nopivot_bufferSizeExt') - - global __cusparseSgtsv2_nopivot - __cusparseSgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2_nopivot') - if __cusparseSgtsv2_nopivot == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsv2_nopivot = dlsym(handle, 'cusparseSgtsv2_nopivot') - - global __cusparseDgtsv2_nopivot - __cusparseDgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2_nopivot') - if __cusparseDgtsv2_nopivot == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsv2_nopivot = dlsym(handle, 'cusparseDgtsv2_nopivot') - - global __cusparseCgtsv2_nopivot - __cusparseCgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2_nopivot') - if __cusparseCgtsv2_nopivot == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsv2_nopivot = dlsym(handle, 'cusparseCgtsv2_nopivot') - - global __cusparseZgtsv2_nopivot - __cusparseZgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2_nopivot') - if __cusparseZgtsv2_nopivot == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsv2_nopivot = dlsym(handle, 'cusparseZgtsv2_nopivot') - - global __cusparseSgtsv2StridedBatch_bufferSizeExt - __cusparseSgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2StridedBatch_bufferSizeExt') - if __cusparseSgtsv2StridedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseSgtsv2StridedBatch_bufferSizeExt') - - global __cusparseDgtsv2StridedBatch_bufferSizeExt - __cusparseDgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2StridedBatch_bufferSizeExt') - if __cusparseDgtsv2StridedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseDgtsv2StridedBatch_bufferSizeExt') - - global __cusparseCgtsv2StridedBatch_bufferSizeExt - __cusparseCgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2StridedBatch_bufferSizeExt') - if __cusparseCgtsv2StridedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseCgtsv2StridedBatch_bufferSizeExt') - - global __cusparseZgtsv2StridedBatch_bufferSizeExt - __cusparseZgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2StridedBatch_bufferSizeExt') - if __cusparseZgtsv2StridedBatch_bufferSizeExt == NULL: - if 
handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseZgtsv2StridedBatch_bufferSizeExt') - - global __cusparseSgtsv2StridedBatch - __cusparseSgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2StridedBatch') - if __cusparseSgtsv2StridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsv2StridedBatch = dlsym(handle, 'cusparseSgtsv2StridedBatch') - - global __cusparseDgtsv2StridedBatch - __cusparseDgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2StridedBatch') - if __cusparseDgtsv2StridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsv2StridedBatch = dlsym(handle, 'cusparseDgtsv2StridedBatch') - - global __cusparseCgtsv2StridedBatch - __cusparseCgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2StridedBatch') - if __cusparseCgtsv2StridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsv2StridedBatch = dlsym(handle, 'cusparseCgtsv2StridedBatch') - - global __cusparseZgtsv2StridedBatch - __cusparseZgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2StridedBatch') - if __cusparseZgtsv2StridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsv2StridedBatch = dlsym(handle, 'cusparseZgtsv2StridedBatch') - - global __cusparseSgtsvInterleavedBatch_bufferSizeExt - __cusparseSgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgtsvInterleavedBatch_bufferSizeExt') - if __cusparseSgtsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseSgtsvInterleavedBatch_bufferSizeExt') - - global __cusparseDgtsvInterleavedBatch_bufferSizeExt - __cusparseDgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsvInterleavedBatch_bufferSizeExt') - if __cusparseDgtsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseDgtsvInterleavedBatch_bufferSizeExt') - - global __cusparseCgtsvInterleavedBatch_bufferSizeExt - __cusparseCgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsvInterleavedBatch_bufferSizeExt') - if __cusparseCgtsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseCgtsvInterleavedBatch_bufferSizeExt') - - global __cusparseZgtsvInterleavedBatch_bufferSizeExt - __cusparseZgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsvInterleavedBatch_bufferSizeExt') - if __cusparseZgtsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseZgtsvInterleavedBatch_bufferSizeExt') - - global __cusparseSgtsvInterleavedBatch - __cusparseSgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseSgtsvInterleavedBatch') - if __cusparseSgtsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgtsvInterleavedBatch = dlsym(handle, 'cusparseSgtsvInterleavedBatch') - - global __cusparseDgtsvInterleavedBatch - __cusparseDgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseDgtsvInterleavedBatch') - if __cusparseDgtsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgtsvInterleavedBatch = 
dlsym(handle, 'cusparseDgtsvInterleavedBatch') - - global __cusparseCgtsvInterleavedBatch - __cusparseCgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseCgtsvInterleavedBatch') - if __cusparseCgtsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgtsvInterleavedBatch = dlsym(handle, 'cusparseCgtsvInterleavedBatch') - - global __cusparseZgtsvInterleavedBatch - __cusparseZgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseZgtsvInterleavedBatch') - if __cusparseZgtsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgtsvInterleavedBatch = dlsym(handle, 'cusparseZgtsvInterleavedBatch') - - global __cusparseSgpsvInterleavedBatch_bufferSizeExt - __cusparseSgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgpsvInterleavedBatch_bufferSizeExt') - if __cusparseSgpsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseSgpsvInterleavedBatch_bufferSizeExt') - - global __cusparseDgpsvInterleavedBatch_bufferSizeExt - __cusparseDgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgpsvInterleavedBatch_bufferSizeExt') - if __cusparseDgpsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseDgpsvInterleavedBatch_bufferSizeExt') - - global __cusparseCgpsvInterleavedBatch_bufferSizeExt - __cusparseCgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgpsvInterleavedBatch_bufferSizeExt') - if __cusparseCgpsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseCgpsvInterleavedBatch_bufferSizeExt') - - global __cusparseZgpsvInterleavedBatch_bufferSizeExt - __cusparseZgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgpsvInterleavedBatch_bufferSizeExt') - if __cusparseZgpsvInterleavedBatch_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseZgpsvInterleavedBatch_bufferSizeExt') - - global __cusparseSgpsvInterleavedBatch - __cusparseSgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseSgpsvInterleavedBatch') - if __cusparseSgpsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgpsvInterleavedBatch = dlsym(handle, 'cusparseSgpsvInterleavedBatch') - - global __cusparseDgpsvInterleavedBatch - __cusparseDgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseDgpsvInterleavedBatch') - if __cusparseDgpsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgpsvInterleavedBatch = dlsym(handle, 'cusparseDgpsvInterleavedBatch') - - global __cusparseCgpsvInterleavedBatch - __cusparseCgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseCgpsvInterleavedBatch') - if __cusparseCgpsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgpsvInterleavedBatch = dlsym(handle, 'cusparseCgpsvInterleavedBatch') - - global __cusparseZgpsvInterleavedBatch - __cusparseZgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseZgpsvInterleavedBatch') - if __cusparseZgpsvInterleavedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgpsvInterleavedBatch = dlsym(handle, 
'cusparseZgpsvInterleavedBatch') - - global __cusparseScsrgeam2_bufferSizeExt - __cusparseScsrgeam2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseScsrgeam2_bufferSizeExt') - if __cusparseScsrgeam2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseScsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseScsrgeam2_bufferSizeExt') - - global __cusparseDcsrgeam2_bufferSizeExt - __cusparseDcsrgeam2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDcsrgeam2_bufferSizeExt') - if __cusparseDcsrgeam2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDcsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseDcsrgeam2_bufferSizeExt') - - global __cusparseCcsrgeam2_bufferSizeExt - __cusparseCcsrgeam2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCcsrgeam2_bufferSizeExt') - if __cusparseCcsrgeam2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCcsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseCcsrgeam2_bufferSizeExt') - - global __cusparseZcsrgeam2_bufferSizeExt - __cusparseZcsrgeam2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZcsrgeam2_bufferSizeExt') - if __cusparseZcsrgeam2_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZcsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseZcsrgeam2_bufferSizeExt') - - global __cusparseXcsrgeam2Nnz - __cusparseXcsrgeam2Nnz = dlsym(RTLD_DEFAULT, 'cusparseXcsrgeam2Nnz') - if __cusparseXcsrgeam2Nnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcsrgeam2Nnz = dlsym(handle, 'cusparseXcsrgeam2Nnz') - - global __cusparseScsrgeam2 - __cusparseScsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseScsrgeam2') - if __cusparseScsrgeam2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseScsrgeam2 = dlsym(handle, 'cusparseScsrgeam2') - - global __cusparseDcsrgeam2 - __cusparseDcsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseDcsrgeam2') - if __cusparseDcsrgeam2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDcsrgeam2 = dlsym(handle, 'cusparseDcsrgeam2') - - global __cusparseCcsrgeam2 - __cusparseCcsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseCcsrgeam2') - if __cusparseCcsrgeam2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCcsrgeam2 = dlsym(handle, 'cusparseCcsrgeam2') - - global __cusparseZcsrgeam2 - __cusparseZcsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseZcsrgeam2') - if __cusparseZcsrgeam2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZcsrgeam2 = dlsym(handle, 'cusparseZcsrgeam2') - - global __cusparseSnnz - __cusparseSnnz = dlsym(RTLD_DEFAULT, 'cusparseSnnz') - if __cusparseSnnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSnnz = dlsym(handle, 'cusparseSnnz') - - global __cusparseDnnz - __cusparseDnnz = dlsym(RTLD_DEFAULT, 'cusparseDnnz') - if __cusparseDnnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnnz = dlsym(handle, 'cusparseDnnz') - - global __cusparseCnnz - __cusparseCnnz = dlsym(RTLD_DEFAULT, 'cusparseCnnz') - if __cusparseCnnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCnnz = dlsym(handle, 'cusparseCnnz') - - global __cusparseZnnz - __cusparseZnnz = dlsym(RTLD_DEFAULT, 'cusparseZnnz') - if __cusparseZnnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZnnz = dlsym(handle, 'cusparseZnnz') - - global __cusparseXcoo2csr - __cusparseXcoo2csr = dlsym(RTLD_DEFAULT, 
'cusparseXcoo2csr') - if __cusparseXcoo2csr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcoo2csr = dlsym(handle, 'cusparseXcoo2csr') - - global __cusparseXcsr2coo - __cusparseXcsr2coo = dlsym(RTLD_DEFAULT, 'cusparseXcsr2coo') - if __cusparseXcsr2coo == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcsr2coo = dlsym(handle, 'cusparseXcsr2coo') - - global __cusparseSbsr2csr - __cusparseSbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseSbsr2csr') - if __cusparseSbsr2csr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSbsr2csr = dlsym(handle, 'cusparseSbsr2csr') - - global __cusparseDbsr2csr - __cusparseDbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseDbsr2csr') - if __cusparseDbsr2csr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDbsr2csr = dlsym(handle, 'cusparseDbsr2csr') - - global __cusparseCbsr2csr - __cusparseCbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseCbsr2csr') - if __cusparseCbsr2csr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCbsr2csr = dlsym(handle, 'cusparseCbsr2csr') - - global __cusparseZbsr2csr - __cusparseZbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseZbsr2csr') - if __cusparseZbsr2csr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZbsr2csr = dlsym(handle, 'cusparseZbsr2csr') - - global __cusparseSgebsr2gebsc_bufferSize - __cusparseSgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsc_bufferSize') - if __cusparseSgebsr2gebsc_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseSgebsr2gebsc_bufferSize') - - global __cusparseDgebsr2gebsc_bufferSize - __cusparseDgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsc_bufferSize') - if __cusparseDgebsr2gebsc_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseDgebsr2gebsc_bufferSize') - - global __cusparseCgebsr2gebsc_bufferSize - __cusparseCgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsc_bufferSize') - if __cusparseCgebsr2gebsc_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseCgebsr2gebsc_bufferSize') - - global __cusparseZgebsr2gebsc_bufferSize - __cusparseZgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsc_bufferSize') - if __cusparseZgebsr2gebsc_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseZgebsr2gebsc_bufferSize') - - global __cusparseSgebsr2gebsc_bufferSizeExt - __cusparseSgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsc_bufferSizeExt') - if __cusparseSgebsr2gebsc_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseSgebsr2gebsc_bufferSizeExt') - - global __cusparseDgebsr2gebsc_bufferSizeExt - __cusparseDgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsc_bufferSizeExt') - if __cusparseDgebsr2gebsc_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseDgebsr2gebsc_bufferSizeExt') - - global __cusparseCgebsr2gebsc_bufferSizeExt - __cusparseCgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsc_bufferSizeExt') - if 
__cusparseCgebsr2gebsc_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseCgebsr2gebsc_bufferSizeExt') - - global __cusparseZgebsr2gebsc_bufferSizeExt - __cusparseZgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsc_bufferSizeExt') - if __cusparseZgebsr2gebsc_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseZgebsr2gebsc_bufferSizeExt') - - global __cusparseSgebsr2gebsc - __cusparseSgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsc') - if __cusparseSgebsr2gebsc == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgebsr2gebsc = dlsym(handle, 'cusparseSgebsr2gebsc') - - global __cusparseDgebsr2gebsc - __cusparseDgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsc') - if __cusparseDgebsr2gebsc == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgebsr2gebsc = dlsym(handle, 'cusparseDgebsr2gebsc') - - global __cusparseCgebsr2gebsc - __cusparseCgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsc') - if __cusparseCgebsr2gebsc == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgebsr2gebsc = dlsym(handle, 'cusparseCgebsr2gebsc') - - global __cusparseZgebsr2gebsc - __cusparseZgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsc') - if __cusparseZgebsr2gebsc == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgebsr2gebsc = dlsym(handle, 'cusparseZgebsr2gebsc') - - global __cusparseScsr2gebsr_bufferSize - __cusparseScsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseScsr2gebsr_bufferSize') - if __cusparseScsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseScsr2gebsr_bufferSize = dlsym(handle, 'cusparseScsr2gebsr_bufferSize') - - global __cusparseDcsr2gebsr_bufferSize - __cusparseDcsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDcsr2gebsr_bufferSize') - if __cusparseDcsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDcsr2gebsr_bufferSize = dlsym(handle, 'cusparseDcsr2gebsr_bufferSize') - - global __cusparseCcsr2gebsr_bufferSize - __cusparseCcsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCcsr2gebsr_bufferSize') - if __cusparseCcsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCcsr2gebsr_bufferSize = dlsym(handle, 'cusparseCcsr2gebsr_bufferSize') - - global __cusparseZcsr2gebsr_bufferSize - __cusparseZcsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZcsr2gebsr_bufferSize') - if __cusparseZcsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZcsr2gebsr_bufferSize = dlsym(handle, 'cusparseZcsr2gebsr_bufferSize') - - global __cusparseScsr2gebsr_bufferSizeExt - __cusparseScsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseScsr2gebsr_bufferSizeExt') - if __cusparseScsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseScsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseScsr2gebsr_bufferSizeExt') - - global __cusparseDcsr2gebsr_bufferSizeExt - __cusparseDcsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDcsr2gebsr_bufferSizeExt') - if __cusparseDcsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDcsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseDcsr2gebsr_bufferSizeExt') - - global 
__cusparseCcsr2gebsr_bufferSizeExt - __cusparseCcsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCcsr2gebsr_bufferSizeExt') - if __cusparseCcsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCcsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseCcsr2gebsr_bufferSizeExt') - - global __cusparseZcsr2gebsr_bufferSizeExt - __cusparseZcsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZcsr2gebsr_bufferSizeExt') - if __cusparseZcsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZcsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseZcsr2gebsr_bufferSizeExt') - - global __cusparseXcsr2gebsrNnz - __cusparseXcsr2gebsrNnz = dlsym(RTLD_DEFAULT, 'cusparseXcsr2gebsrNnz') - if __cusparseXcsr2gebsrNnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcsr2gebsrNnz = dlsym(handle, 'cusparseXcsr2gebsrNnz') - - global __cusparseScsr2gebsr - __cusparseScsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseScsr2gebsr') - if __cusparseScsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseScsr2gebsr = dlsym(handle, 'cusparseScsr2gebsr') - - global __cusparseDcsr2gebsr - __cusparseDcsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseDcsr2gebsr') - if __cusparseDcsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDcsr2gebsr = dlsym(handle, 'cusparseDcsr2gebsr') - - global __cusparseCcsr2gebsr - __cusparseCcsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseCcsr2gebsr') - if __cusparseCcsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCcsr2gebsr = dlsym(handle, 'cusparseCcsr2gebsr') - - global __cusparseZcsr2gebsr - __cusparseZcsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseZcsr2gebsr') - if __cusparseZcsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZcsr2gebsr = dlsym(handle, 'cusparseZcsr2gebsr') - - global __cusparseSgebsr2gebsr_bufferSize - __cusparseSgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsr_bufferSize') - if __cusparseSgebsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseSgebsr2gebsr_bufferSize') - - global __cusparseDgebsr2gebsr_bufferSize - __cusparseDgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsr_bufferSize') - if __cusparseDgebsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseDgebsr2gebsr_bufferSize') - - global __cusparseCgebsr2gebsr_bufferSize - __cusparseCgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsr_bufferSize') - if __cusparseCgebsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseCgebsr2gebsr_bufferSize') - - global __cusparseZgebsr2gebsr_bufferSize - __cusparseZgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsr_bufferSize') - if __cusparseZgebsr2gebsr_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseZgebsr2gebsr_bufferSize') - - global __cusparseSgebsr2gebsr_bufferSizeExt - __cusparseSgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsr_bufferSizeExt') - if __cusparseSgebsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgebsr2gebsr_bufferSizeExt = 
dlsym(handle, 'cusparseSgebsr2gebsr_bufferSizeExt') - - global __cusparseDgebsr2gebsr_bufferSizeExt - __cusparseDgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsr_bufferSizeExt') - if __cusparseDgebsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgebsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseDgebsr2gebsr_bufferSizeExt') - - global __cusparseCgebsr2gebsr_bufferSizeExt - __cusparseCgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsr_bufferSizeExt') - if __cusparseCgebsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgebsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseCgebsr2gebsr_bufferSizeExt') - - global __cusparseZgebsr2gebsr_bufferSizeExt - __cusparseZgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsr_bufferSizeExt') - if __cusparseZgebsr2gebsr_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgebsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseZgebsr2gebsr_bufferSizeExt') - - global __cusparseXgebsr2gebsrNnz - __cusparseXgebsr2gebsrNnz = dlsym(RTLD_DEFAULT, 'cusparseXgebsr2gebsrNnz') - if __cusparseXgebsr2gebsrNnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXgebsr2gebsrNnz = dlsym(handle, 'cusparseXgebsr2gebsrNnz') - - global __cusparseSgebsr2gebsr - __cusparseSgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsr') - if __cusparseSgebsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSgebsr2gebsr = dlsym(handle, 'cusparseSgebsr2gebsr') - - global __cusparseDgebsr2gebsr - __cusparseDgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsr') - if __cusparseDgebsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDgebsr2gebsr = dlsym(handle, 'cusparseDgebsr2gebsr') - - global __cusparseCgebsr2gebsr - __cusparseCgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsr') - if __cusparseCgebsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCgebsr2gebsr = dlsym(handle, 'cusparseCgebsr2gebsr') - - global __cusparseZgebsr2gebsr - __cusparseZgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsr') - if __cusparseZgebsr2gebsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseZgebsr2gebsr = dlsym(handle, 'cusparseZgebsr2gebsr') - - global __cusparseXcoosort_bufferSizeExt - __cusparseXcoosort_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseXcoosort_bufferSizeExt') - if __cusparseXcoosort_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcoosort_bufferSizeExt = dlsym(handle, 'cusparseXcoosort_bufferSizeExt') - - global __cusparseXcoosortByRow - __cusparseXcoosortByRow = dlsym(RTLD_DEFAULT, 'cusparseXcoosortByRow') - if __cusparseXcoosortByRow == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcoosortByRow = dlsym(handle, 'cusparseXcoosortByRow') - - global __cusparseXcoosortByColumn - __cusparseXcoosortByColumn = dlsym(RTLD_DEFAULT, 'cusparseXcoosortByColumn') - if __cusparseXcoosortByColumn == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcoosortByColumn = dlsym(handle, 'cusparseXcoosortByColumn') - - global __cusparseXcsrsort_bufferSizeExt - __cusparseXcsrsort_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseXcsrsort_bufferSizeExt') - if __cusparseXcsrsort_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) 
- __cusparseXcsrsort_bufferSizeExt = dlsym(handle, 'cusparseXcsrsort_bufferSizeExt') - - global __cusparseXcsrsort - __cusparseXcsrsort = dlsym(RTLD_DEFAULT, 'cusparseXcsrsort') - if __cusparseXcsrsort == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcsrsort = dlsym(handle, 'cusparseXcsrsort') - - global __cusparseXcscsort_bufferSizeExt - __cusparseXcscsort_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseXcscsort_bufferSizeExt') - if __cusparseXcscsort_bufferSizeExt == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcscsort_bufferSizeExt = dlsym(handle, 'cusparseXcscsort_bufferSizeExt') - - global __cusparseXcscsort - __cusparseXcscsort = dlsym(RTLD_DEFAULT, 'cusparseXcscsort') - if __cusparseXcscsort == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseXcscsort = dlsym(handle, 'cusparseXcscsort') - - global __cusparseCsr2cscEx2 - __cusparseCsr2cscEx2 = dlsym(RTLD_DEFAULT, 'cusparseCsr2cscEx2') - if __cusparseCsr2cscEx2 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCsr2cscEx2 = dlsym(handle, 'cusparseCsr2cscEx2') - - global __cusparseCsr2cscEx2_bufferSize - __cusparseCsr2cscEx2_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCsr2cscEx2_bufferSize') - if __cusparseCsr2cscEx2_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCsr2cscEx2_bufferSize = dlsym(handle, 'cusparseCsr2cscEx2_bufferSize') - - global __cusparseCreateSpVec - __cusparseCreateSpVec = dlsym(RTLD_DEFAULT, 'cusparseCreateSpVec') - if __cusparseCreateSpVec == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateSpVec = dlsym(handle, 'cusparseCreateSpVec') - - global __cusparseDestroySpVec - __cusparseDestroySpVec = dlsym(RTLD_DEFAULT, 'cusparseDestroySpVec') - if __cusparseDestroySpVec == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDestroySpVec = dlsym(handle, 'cusparseDestroySpVec') - - global __cusparseSpVecGet - __cusparseSpVecGet = dlsym(RTLD_DEFAULT, 'cusparseSpVecGet') - if __cusparseSpVecGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpVecGet = dlsym(handle, 'cusparseSpVecGet') - - global __cusparseSpVecGetIndexBase - __cusparseSpVecGetIndexBase = dlsym(RTLD_DEFAULT, 'cusparseSpVecGetIndexBase') - if __cusparseSpVecGetIndexBase == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpVecGetIndexBase = dlsym(handle, 'cusparseSpVecGetIndexBase') - - global __cusparseSpVecGetValues - __cusparseSpVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseSpVecGetValues') - if __cusparseSpVecGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpVecGetValues = dlsym(handle, 'cusparseSpVecGetValues') - - global __cusparseSpVecSetValues - __cusparseSpVecSetValues = dlsym(RTLD_DEFAULT, 'cusparseSpVecSetValues') - if __cusparseSpVecSetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpVecSetValues = dlsym(handle, 'cusparseSpVecSetValues') - - global __cusparseCreateDnVec - __cusparseCreateDnVec = dlsym(RTLD_DEFAULT, 'cusparseCreateDnVec') - if __cusparseCreateDnVec == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateDnVec = dlsym(handle, 'cusparseCreateDnVec') - - global __cusparseDestroyDnVec - __cusparseDestroyDnVec = dlsym(RTLD_DEFAULT, 'cusparseDestroyDnVec') - if __cusparseDestroyDnVec == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDestroyDnVec = 
dlsym(handle, 'cusparseDestroyDnVec') - - global __cusparseDnVecGet - __cusparseDnVecGet = dlsym(RTLD_DEFAULT, 'cusparseDnVecGet') - if __cusparseDnVecGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnVecGet = dlsym(handle, 'cusparseDnVecGet') - - global __cusparseDnVecGetValues - __cusparseDnVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseDnVecGetValues') - if __cusparseDnVecGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnVecGetValues = dlsym(handle, 'cusparseDnVecGetValues') - - global __cusparseDnVecSetValues - __cusparseDnVecSetValues = dlsym(RTLD_DEFAULT, 'cusparseDnVecSetValues') - if __cusparseDnVecSetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnVecSetValues = dlsym(handle, 'cusparseDnVecSetValues') - - global __cusparseDestroySpMat - __cusparseDestroySpMat = dlsym(RTLD_DEFAULT, 'cusparseDestroySpMat') - if __cusparseDestroySpMat == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDestroySpMat = dlsym(handle, 'cusparseDestroySpMat') - - global __cusparseSpMatGetFormat - __cusparseSpMatGetFormat = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetFormat') - if __cusparseSpMatGetFormat == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatGetFormat = dlsym(handle, 'cusparseSpMatGetFormat') - - global __cusparseSpMatGetIndexBase - __cusparseSpMatGetIndexBase = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetIndexBase') - if __cusparseSpMatGetIndexBase == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatGetIndexBase = dlsym(handle, 'cusparseSpMatGetIndexBase') - - global __cusparseSpMatGetValues - __cusparseSpMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetValues') - if __cusparseSpMatGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatGetValues = dlsym(handle, 'cusparseSpMatGetValues') - - global __cusparseSpMatSetValues - __cusparseSpMatSetValues = dlsym(RTLD_DEFAULT, 'cusparseSpMatSetValues') - if __cusparseSpMatSetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatSetValues = dlsym(handle, 'cusparseSpMatSetValues') - - global __cusparseSpMatGetSize - __cusparseSpMatGetSize = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetSize') - if __cusparseSpMatGetSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatGetSize = dlsym(handle, 'cusparseSpMatGetSize') - - global __cusparseSpMatGetStridedBatch - __cusparseSpMatGetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetStridedBatch') - if __cusparseSpMatGetStridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatGetStridedBatch = dlsym(handle, 'cusparseSpMatGetStridedBatch') - - global __cusparseCooSetStridedBatch - __cusparseCooSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseCooSetStridedBatch') - if __cusparseCooSetStridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCooSetStridedBatch = dlsym(handle, 'cusparseCooSetStridedBatch') - - global __cusparseCsrSetStridedBatch - __cusparseCsrSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseCsrSetStridedBatch') - if __cusparseCsrSetStridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCsrSetStridedBatch = dlsym(handle, 'cusparseCsrSetStridedBatch') - - global __cusparseCreateCsr - __cusparseCreateCsr = dlsym(RTLD_DEFAULT, 'cusparseCreateCsr') - if __cusparseCreateCsr == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cusparseCreateCsr = dlsym(handle, 'cusparseCreateCsr') - - global __cusparseCsrGet - __cusparseCsrGet = dlsym(RTLD_DEFAULT, 'cusparseCsrGet') - if __cusparseCsrGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCsrGet = dlsym(handle, 'cusparseCsrGet') - - global __cusparseCsrSetPointers - __cusparseCsrSetPointers = dlsym(RTLD_DEFAULT, 'cusparseCsrSetPointers') - if __cusparseCsrSetPointers == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCsrSetPointers = dlsym(handle, 'cusparseCsrSetPointers') - - global __cusparseCreateCoo - __cusparseCreateCoo = dlsym(RTLD_DEFAULT, 'cusparseCreateCoo') - if __cusparseCreateCoo == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateCoo = dlsym(handle, 'cusparseCreateCoo') - - global __cusparseCooGet - __cusparseCooGet = dlsym(RTLD_DEFAULT, 'cusparseCooGet') - if __cusparseCooGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCooGet = dlsym(handle, 'cusparseCooGet') - - global __cusparseCreateDnMat - __cusparseCreateDnMat = dlsym(RTLD_DEFAULT, 'cusparseCreateDnMat') - if __cusparseCreateDnMat == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateDnMat = dlsym(handle, 'cusparseCreateDnMat') - - global __cusparseDestroyDnMat - __cusparseDestroyDnMat = dlsym(RTLD_DEFAULT, 'cusparseDestroyDnMat') - if __cusparseDestroyDnMat == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDestroyDnMat = dlsym(handle, 'cusparseDestroyDnMat') - - global __cusparseDnMatGet - __cusparseDnMatGet = dlsym(RTLD_DEFAULT, 'cusparseDnMatGet') - if __cusparseDnMatGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnMatGet = dlsym(handle, 'cusparseDnMatGet') - - global __cusparseDnMatGetValues - __cusparseDnMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseDnMatGetValues') - if __cusparseDnMatGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnMatGetValues = dlsym(handle, 'cusparseDnMatGetValues') - - global __cusparseDnMatSetValues - __cusparseDnMatSetValues = dlsym(RTLD_DEFAULT, 'cusparseDnMatSetValues') - if __cusparseDnMatSetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnMatSetValues = dlsym(handle, 'cusparseDnMatSetValues') - - global __cusparseDnMatSetStridedBatch - __cusparseDnMatSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseDnMatSetStridedBatch') - if __cusparseDnMatSetStridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnMatSetStridedBatch = dlsym(handle, 'cusparseDnMatSetStridedBatch') - - global __cusparseDnMatGetStridedBatch - __cusparseDnMatGetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseDnMatGetStridedBatch') - if __cusparseDnMatGetStridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDnMatGetStridedBatch = dlsym(handle, 'cusparseDnMatGetStridedBatch') - - global __cusparseAxpby - __cusparseAxpby = dlsym(RTLD_DEFAULT, 'cusparseAxpby') - if __cusparseAxpby == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseAxpby = dlsym(handle, 'cusparseAxpby') - - global __cusparseGather - __cusparseGather = dlsym(RTLD_DEFAULT, 'cusparseGather') - if __cusparseGather == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseGather = dlsym(handle, 'cusparseGather') - - global __cusparseScatter - __cusparseScatter = dlsym(RTLD_DEFAULT, 'cusparseScatter') - if 
__cusparseScatter == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseScatter = dlsym(handle, 'cusparseScatter') - - global __cusparseSpVV_bufferSize - __cusparseSpVV_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpVV_bufferSize') - if __cusparseSpVV_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpVV_bufferSize = dlsym(handle, 'cusparseSpVV_bufferSize') - - global __cusparseSpVV - __cusparseSpVV = dlsym(RTLD_DEFAULT, 'cusparseSpVV') - if __cusparseSpVV == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpVV = dlsym(handle, 'cusparseSpVV') - - global __cusparseSpMV - __cusparseSpMV = dlsym(RTLD_DEFAULT, 'cusparseSpMV') - if __cusparseSpMV == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMV = dlsym(handle, 'cusparseSpMV') - - global __cusparseSpMV_bufferSize - __cusparseSpMV_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpMV_bufferSize') - if __cusparseSpMV_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMV_bufferSize = dlsym(handle, 'cusparseSpMV_bufferSize') - - global __cusparseSpMM - __cusparseSpMM = dlsym(RTLD_DEFAULT, 'cusparseSpMM') - if __cusparseSpMM == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMM = dlsym(handle, 'cusparseSpMM') - - global __cusparseSpMM_bufferSize - __cusparseSpMM_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpMM_bufferSize') - if __cusparseSpMM_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMM_bufferSize = dlsym(handle, 'cusparseSpMM_bufferSize') - - global __cusparseSpGEMM_createDescr - __cusparseSpGEMM_createDescr = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_createDescr') - if __cusparseSpGEMM_createDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMM_createDescr = dlsym(handle, 'cusparseSpGEMM_createDescr') - - global __cusparseSpGEMM_destroyDescr - __cusparseSpGEMM_destroyDescr = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_destroyDescr') - if __cusparseSpGEMM_destroyDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMM_destroyDescr = dlsym(handle, 'cusparseSpGEMM_destroyDescr') - - global __cusparseSpGEMM_workEstimation - __cusparseSpGEMM_workEstimation = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_workEstimation') - if __cusparseSpGEMM_workEstimation == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMM_workEstimation = dlsym(handle, 'cusparseSpGEMM_workEstimation') - - global __cusparseSpGEMM_compute - __cusparseSpGEMM_compute = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_compute') - if __cusparseSpGEMM_compute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMM_compute = dlsym(handle, 'cusparseSpGEMM_compute') - - global __cusparseSpGEMM_copy - __cusparseSpGEMM_copy = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_copy') - if __cusparseSpGEMM_copy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMM_copy = dlsym(handle, 'cusparseSpGEMM_copy') - - global __cusparseCreateCsc - __cusparseCreateCsc = dlsym(RTLD_DEFAULT, 'cusparseCreateCsc') - if __cusparseCreateCsc == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateCsc = dlsym(handle, 'cusparseCreateCsc') - - global __cusparseCscSetPointers - __cusparseCscSetPointers = dlsym(RTLD_DEFAULT, 'cusparseCscSetPointers') - if __cusparseCscSetPointers == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cusparseCscSetPointers = dlsym(handle, 'cusparseCscSetPointers') - - global __cusparseCooSetPointers - __cusparseCooSetPointers = dlsym(RTLD_DEFAULT, 'cusparseCooSetPointers') - if __cusparseCooSetPointers == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCooSetPointers = dlsym(handle, 'cusparseCooSetPointers') - - global __cusparseSparseToDense_bufferSize - __cusparseSparseToDense_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSparseToDense_bufferSize') - if __cusparseSparseToDense_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSparseToDense_bufferSize = dlsym(handle, 'cusparseSparseToDense_bufferSize') - - global __cusparseSparseToDense - __cusparseSparseToDense = dlsym(RTLD_DEFAULT, 'cusparseSparseToDense') - if __cusparseSparseToDense == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSparseToDense = dlsym(handle, 'cusparseSparseToDense') - - global __cusparseDenseToSparse_bufferSize - __cusparseDenseToSparse_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDenseToSparse_bufferSize') - if __cusparseDenseToSparse_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDenseToSparse_bufferSize = dlsym(handle, 'cusparseDenseToSparse_bufferSize') - - global __cusparseDenseToSparse_analysis - __cusparseDenseToSparse_analysis = dlsym(RTLD_DEFAULT, 'cusparseDenseToSparse_analysis') - if __cusparseDenseToSparse_analysis == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDenseToSparse_analysis = dlsym(handle, 'cusparseDenseToSparse_analysis') - - global __cusparseDenseToSparse_convert - __cusparseDenseToSparse_convert = dlsym(RTLD_DEFAULT, 'cusparseDenseToSparse_convert') - if __cusparseDenseToSparse_convert == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseDenseToSparse_convert = dlsym(handle, 'cusparseDenseToSparse_convert') - - global __cusparseCreateBlockedEll - __cusparseCreateBlockedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateBlockedEll') - if __cusparseCreateBlockedEll == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateBlockedEll = dlsym(handle, 'cusparseCreateBlockedEll') - - global __cusparseBlockedEllGet - __cusparseBlockedEllGet = dlsym(RTLD_DEFAULT, 'cusparseBlockedEllGet') - if __cusparseBlockedEllGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseBlockedEllGet = dlsym(handle, 'cusparseBlockedEllGet') - - global __cusparseSpMM_preprocess - __cusparseSpMM_preprocess = dlsym(RTLD_DEFAULT, 'cusparseSpMM_preprocess') - if __cusparseSpMM_preprocess == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMM_preprocess = dlsym(handle, 'cusparseSpMM_preprocess') - - global __cusparseSDDMM_bufferSize - __cusparseSDDMM_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSDDMM_bufferSize') - if __cusparseSDDMM_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSDDMM_bufferSize = dlsym(handle, 'cusparseSDDMM_bufferSize') - - global __cusparseSDDMM_preprocess - __cusparseSDDMM_preprocess = dlsym(RTLD_DEFAULT, 'cusparseSDDMM_preprocess') - if __cusparseSDDMM_preprocess == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSDDMM_preprocess = dlsym(handle, 'cusparseSDDMM_preprocess') - - global __cusparseSDDMM - __cusparseSDDMM = dlsym(RTLD_DEFAULT, 'cusparseSDDMM') - if __cusparseSDDMM == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSDDMM = dlsym(handle, 
'cusparseSDDMM') - - global __cusparseSpMatGetAttribute - __cusparseSpMatGetAttribute = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetAttribute') - if __cusparseSpMatGetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatGetAttribute = dlsym(handle, 'cusparseSpMatGetAttribute') - - global __cusparseSpMatSetAttribute - __cusparseSpMatSetAttribute = dlsym(RTLD_DEFAULT, 'cusparseSpMatSetAttribute') - if __cusparseSpMatSetAttribute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMatSetAttribute = dlsym(handle, 'cusparseSpMatSetAttribute') - - global __cusparseSpSV_createDescr - __cusparseSpSV_createDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSV_createDescr') - if __cusparseSpSV_createDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSV_createDescr = dlsym(handle, 'cusparseSpSV_createDescr') - - global __cusparseSpSV_destroyDescr - __cusparseSpSV_destroyDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSV_destroyDescr') - if __cusparseSpSV_destroyDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSV_destroyDescr = dlsym(handle, 'cusparseSpSV_destroyDescr') - - global __cusparseSpSV_bufferSize - __cusparseSpSV_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpSV_bufferSize') - if __cusparseSpSV_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSV_bufferSize = dlsym(handle, 'cusparseSpSV_bufferSize') - - global __cusparseSpSV_analysis - __cusparseSpSV_analysis = dlsym(RTLD_DEFAULT, 'cusparseSpSV_analysis') - if __cusparseSpSV_analysis == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSV_analysis = dlsym(handle, 'cusparseSpSV_analysis') - - global __cusparseSpSV_solve - __cusparseSpSV_solve = dlsym(RTLD_DEFAULT, 'cusparseSpSV_solve') - if __cusparseSpSV_solve == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSV_solve = dlsym(handle, 'cusparseSpSV_solve') - - global __cusparseSpSM_createDescr - __cusparseSpSM_createDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSM_createDescr') - if __cusparseSpSM_createDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSM_createDescr = dlsym(handle, 'cusparseSpSM_createDescr') - - global __cusparseSpSM_destroyDescr - __cusparseSpSM_destroyDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSM_destroyDescr') - if __cusparseSpSM_destroyDescr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSM_destroyDescr = dlsym(handle, 'cusparseSpSM_destroyDescr') - - global __cusparseSpSM_bufferSize - __cusparseSpSM_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpSM_bufferSize') - if __cusparseSpSM_bufferSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSM_bufferSize = dlsym(handle, 'cusparseSpSM_bufferSize') - - global __cusparseSpSM_analysis - __cusparseSpSM_analysis = dlsym(RTLD_DEFAULT, 'cusparseSpSM_analysis') - if __cusparseSpSM_analysis == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSM_analysis = dlsym(handle, 'cusparseSpSM_analysis') - - global __cusparseSpSM_solve - __cusparseSpSM_solve = dlsym(RTLD_DEFAULT, 'cusparseSpSM_solve') - if __cusparseSpSM_solve == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSM_solve = dlsym(handle, 'cusparseSpSM_solve') - - global __cusparseSpGEMMreuse_workEstimation - __cusparseSpGEMMreuse_workEstimation = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_workEstimation') - if 
__cusparseSpGEMMreuse_workEstimation == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMMreuse_workEstimation = dlsym(handle, 'cusparseSpGEMMreuse_workEstimation') - - global __cusparseSpGEMMreuse_nnz - __cusparseSpGEMMreuse_nnz = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_nnz') - if __cusparseSpGEMMreuse_nnz == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMMreuse_nnz = dlsym(handle, 'cusparseSpGEMMreuse_nnz') - - global __cusparseSpGEMMreuse_copy - __cusparseSpGEMMreuse_copy = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_copy') - if __cusparseSpGEMMreuse_copy == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMMreuse_copy = dlsym(handle, 'cusparseSpGEMMreuse_copy') - - global __cusparseSpGEMMreuse_compute - __cusparseSpGEMMreuse_compute = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_compute') - if __cusparseSpGEMMreuse_compute == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMMreuse_compute = dlsym(handle, 'cusparseSpGEMMreuse_compute') - - global __cusparseLoggerSetCallback - __cusparseLoggerSetCallback = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetCallback') - if __cusparseLoggerSetCallback == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseLoggerSetCallback = dlsym(handle, 'cusparseLoggerSetCallback') - - global __cusparseLoggerSetFile - __cusparseLoggerSetFile = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetFile') - if __cusparseLoggerSetFile == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseLoggerSetFile = dlsym(handle, 'cusparseLoggerSetFile') - - global __cusparseLoggerOpenFile - __cusparseLoggerOpenFile = dlsym(RTLD_DEFAULT, 'cusparseLoggerOpenFile') - if __cusparseLoggerOpenFile == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseLoggerOpenFile = dlsym(handle, 'cusparseLoggerOpenFile') - - global __cusparseLoggerSetLevel - __cusparseLoggerSetLevel = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetLevel') - if __cusparseLoggerSetLevel == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseLoggerSetLevel = dlsym(handle, 'cusparseLoggerSetLevel') - - global __cusparseLoggerSetMask - __cusparseLoggerSetMask = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetMask') - if __cusparseLoggerSetMask == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseLoggerSetMask = dlsym(handle, 'cusparseLoggerSetMask') - - global __cusparseLoggerForceDisable - __cusparseLoggerForceDisable = dlsym(RTLD_DEFAULT, 'cusparseLoggerForceDisable') - if __cusparseLoggerForceDisable == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseLoggerForceDisable = dlsym(handle, 'cusparseLoggerForceDisable') - - global __cusparseSpMMOp_createPlan - __cusparseSpMMOp_createPlan = dlsym(RTLD_DEFAULT, 'cusparseSpMMOp_createPlan') - if __cusparseSpMMOp_createPlan == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMMOp_createPlan = dlsym(handle, 'cusparseSpMMOp_createPlan') - - global __cusparseSpMMOp - __cusparseSpMMOp = dlsym(RTLD_DEFAULT, 'cusparseSpMMOp') - if __cusparseSpMMOp == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMMOp = dlsym(handle, 'cusparseSpMMOp') - - global __cusparseSpMMOp_destroyPlan - __cusparseSpMMOp_destroyPlan = dlsym(RTLD_DEFAULT, 'cusparseSpMMOp_destroyPlan') - if __cusparseSpMMOp_destroyPlan == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMMOp_destroyPlan = dlsym(handle, 
'cusparseSpMMOp_destroyPlan') - - global __cusparseCscGet - __cusparseCscGet = dlsym(RTLD_DEFAULT, 'cusparseCscGet') - if __cusparseCscGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCscGet = dlsym(handle, 'cusparseCscGet') - - global __cusparseCreateConstSpVec - __cusparseCreateConstSpVec = dlsym(RTLD_DEFAULT, 'cusparseCreateConstSpVec') - if __cusparseCreateConstSpVec == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstSpVec = dlsym(handle, 'cusparseCreateConstSpVec') - - global __cusparseConstSpVecGet - __cusparseConstSpVecGet = dlsym(RTLD_DEFAULT, 'cusparseConstSpVecGet') - if __cusparseConstSpVecGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstSpVecGet = dlsym(handle, 'cusparseConstSpVecGet') - - global __cusparseConstSpVecGetValues - __cusparseConstSpVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstSpVecGetValues') - if __cusparseConstSpVecGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstSpVecGetValues = dlsym(handle, 'cusparseConstSpVecGetValues') - - global __cusparseCreateConstDnVec - __cusparseCreateConstDnVec = dlsym(RTLD_DEFAULT, 'cusparseCreateConstDnVec') - if __cusparseCreateConstDnVec == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstDnVec = dlsym(handle, 'cusparseCreateConstDnVec') - - global __cusparseConstDnVecGet - __cusparseConstDnVecGet = dlsym(RTLD_DEFAULT, 'cusparseConstDnVecGet') - if __cusparseConstDnVecGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstDnVecGet = dlsym(handle, 'cusparseConstDnVecGet') - - global __cusparseConstDnVecGetValues - __cusparseConstDnVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstDnVecGetValues') - if __cusparseConstDnVecGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstDnVecGetValues = dlsym(handle, 'cusparseConstDnVecGetValues') - - global __cusparseConstSpMatGetValues - __cusparseConstSpMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstSpMatGetValues') - if __cusparseConstSpMatGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstSpMatGetValues = dlsym(handle, 'cusparseConstSpMatGetValues') - - global __cusparseCreateConstCsr - __cusparseCreateConstCsr = dlsym(RTLD_DEFAULT, 'cusparseCreateConstCsr') - if __cusparseCreateConstCsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstCsr = dlsym(handle, 'cusparseCreateConstCsr') - - global __cusparseCreateConstCsc - __cusparseCreateConstCsc = dlsym(RTLD_DEFAULT, 'cusparseCreateConstCsc') - if __cusparseCreateConstCsc == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstCsc = dlsym(handle, 'cusparseCreateConstCsc') - - global __cusparseConstCsrGet - __cusparseConstCsrGet = dlsym(RTLD_DEFAULT, 'cusparseConstCsrGet') - if __cusparseConstCsrGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstCsrGet = dlsym(handle, 'cusparseConstCsrGet') - - global __cusparseConstCscGet - __cusparseConstCscGet = dlsym(RTLD_DEFAULT, 'cusparseConstCscGet') - if __cusparseConstCscGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstCscGet = dlsym(handle, 'cusparseConstCscGet') - - global __cusparseCreateConstCoo - __cusparseCreateConstCoo = dlsym(RTLD_DEFAULT, 'cusparseCreateConstCoo') - if __cusparseCreateConstCoo == NULL: - if handle == NULL: - handle = 
load_library(driver_ver) - __cusparseCreateConstCoo = dlsym(handle, 'cusparseCreateConstCoo') - - global __cusparseConstCooGet - __cusparseConstCooGet = dlsym(RTLD_DEFAULT, 'cusparseConstCooGet') - if __cusparseConstCooGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstCooGet = dlsym(handle, 'cusparseConstCooGet') - - global __cusparseCreateConstBlockedEll - __cusparseCreateConstBlockedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateConstBlockedEll') - if __cusparseCreateConstBlockedEll == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstBlockedEll = dlsym(handle, 'cusparseCreateConstBlockedEll') - - global __cusparseConstBlockedEllGet - __cusparseConstBlockedEllGet = dlsym(RTLD_DEFAULT, 'cusparseConstBlockedEllGet') - if __cusparseConstBlockedEllGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstBlockedEllGet = dlsym(handle, 'cusparseConstBlockedEllGet') - - global __cusparseCreateConstDnMat - __cusparseCreateConstDnMat = dlsym(RTLD_DEFAULT, 'cusparseCreateConstDnMat') - if __cusparseCreateConstDnMat == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstDnMat = dlsym(handle, 'cusparseCreateConstDnMat') - - global __cusparseConstDnMatGet - __cusparseConstDnMatGet = dlsym(RTLD_DEFAULT, 'cusparseConstDnMatGet') - if __cusparseConstDnMatGet == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstDnMatGet = dlsym(handle, 'cusparseConstDnMatGet') - - global __cusparseConstDnMatGetValues - __cusparseConstDnMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstDnMatGetValues') - if __cusparseConstDnMatGetValues == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseConstDnMatGetValues = dlsym(handle, 'cusparseConstDnMatGetValues') - - global __cusparseSpGEMM_getNumProducts - __cusparseSpGEMM_getNumProducts = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_getNumProducts') - if __cusparseSpGEMM_getNumProducts == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMM_getNumProducts = dlsym(handle, 'cusparseSpGEMM_getNumProducts') - - global __cusparseSpGEMM_estimateMemory - __cusparseSpGEMM_estimateMemory = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_estimateMemory') - if __cusparseSpGEMM_estimateMemory == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpGEMM_estimateMemory = dlsym(handle, 'cusparseSpGEMM_estimateMemory') - - global __cusparseBsrSetStridedBatch - __cusparseBsrSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseBsrSetStridedBatch') - if __cusparseBsrSetStridedBatch == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseBsrSetStridedBatch = dlsym(handle, 'cusparseBsrSetStridedBatch') - - global __cusparseCreateBsr - __cusparseCreateBsr = dlsym(RTLD_DEFAULT, 'cusparseCreateBsr') - if __cusparseCreateBsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateBsr = dlsym(handle, 'cusparseCreateBsr') - - global __cusparseCreateConstBsr - __cusparseCreateConstBsr = dlsym(RTLD_DEFAULT, 'cusparseCreateConstBsr') - if __cusparseCreateConstBsr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstBsr = dlsym(handle, 'cusparseCreateConstBsr') - - global __cusparseCreateSlicedEll - __cusparseCreateSlicedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateSlicedEll') - if __cusparseCreateSlicedEll == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateSlicedEll = dlsym(handle, 
'cusparseCreateSlicedEll') - - global __cusparseCreateConstSlicedEll - __cusparseCreateConstSlicedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateConstSlicedEll') - if __cusparseCreateConstSlicedEll == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseCreateConstSlicedEll = dlsym(handle, 'cusparseCreateConstSlicedEll') - - global __cusparseSpSV_updateMatrix - __cusparseSpSV_updateMatrix = dlsym(RTLD_DEFAULT, 'cusparseSpSV_updateMatrix') - if __cusparseSpSV_updateMatrix == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSV_updateMatrix = dlsym(handle, 'cusparseSpSV_updateMatrix') - - global __cusparseSpMV_preprocess - __cusparseSpMV_preprocess = dlsym(RTLD_DEFAULT, 'cusparseSpMV_preprocess') - if __cusparseSpMV_preprocess == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpMV_preprocess = dlsym(handle, 'cusparseSpMV_preprocess') - - global __cusparseSpSM_updateMatrix - __cusparseSpSM_updateMatrix = dlsym(RTLD_DEFAULT, 'cusparseSpSM_updateMatrix') - if __cusparseSpSM_updateMatrix == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusparseSpSM_updateMatrix = dlsym(handle, 'cusparseSpSM_updateMatrix') - - __py_cusparse_init = True - return 0 + with gil, __symbol_lock: + driver_ver = get_cuda_version() + + # Load function + global __cusparseCreate + __cusparseCreate = dlsym(RTLD_DEFAULT, 'cusparseCreate') + if __cusparseCreate == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreate = dlsym(handle, 'cusparseCreate') + + global __cusparseDestroy + __cusparseDestroy = dlsym(RTLD_DEFAULT, 'cusparseDestroy') + if __cusparseDestroy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDestroy = dlsym(handle, 'cusparseDestroy') + + global __cusparseGetVersion + __cusparseGetVersion = dlsym(RTLD_DEFAULT, 'cusparseGetVersion') + if __cusparseGetVersion == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetVersion = dlsym(handle, 'cusparseGetVersion') + + global __cusparseGetProperty + __cusparseGetProperty = dlsym(RTLD_DEFAULT, 'cusparseGetProperty') + if __cusparseGetProperty == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetProperty = dlsym(handle, 'cusparseGetProperty') + + global __cusparseGetErrorName + __cusparseGetErrorName = dlsym(RTLD_DEFAULT, 'cusparseGetErrorName') + if __cusparseGetErrorName == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetErrorName = dlsym(handle, 'cusparseGetErrorName') + + global __cusparseGetErrorString + __cusparseGetErrorString = dlsym(RTLD_DEFAULT, 'cusparseGetErrorString') + if __cusparseGetErrorString == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetErrorString = dlsym(handle, 'cusparseGetErrorString') + + global __cusparseSetStream + __cusparseSetStream = dlsym(RTLD_DEFAULT, 'cusparseSetStream') + if __cusparseSetStream == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSetStream = dlsym(handle, 'cusparseSetStream') + + global __cusparseGetStream + __cusparseGetStream = dlsym(RTLD_DEFAULT, 'cusparseGetStream') + if __cusparseGetStream == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetStream = dlsym(handle, 'cusparseGetStream') + + global __cusparseGetPointerMode + __cusparseGetPointerMode = dlsym(RTLD_DEFAULT, 'cusparseGetPointerMode') + if __cusparseGetPointerMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__cusparseGetPointerMode = dlsym(handle, 'cusparseGetPointerMode') + + global __cusparseSetPointerMode + __cusparseSetPointerMode = dlsym(RTLD_DEFAULT, 'cusparseSetPointerMode') + if __cusparseSetPointerMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSetPointerMode = dlsym(handle, 'cusparseSetPointerMode') + + global __cusparseCreateMatDescr + __cusparseCreateMatDescr = dlsym(RTLD_DEFAULT, 'cusparseCreateMatDescr') + if __cusparseCreateMatDescr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateMatDescr = dlsym(handle, 'cusparseCreateMatDescr') + + global __cusparseDestroyMatDescr + __cusparseDestroyMatDescr = dlsym(RTLD_DEFAULT, 'cusparseDestroyMatDescr') + if __cusparseDestroyMatDescr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDestroyMatDescr = dlsym(handle, 'cusparseDestroyMatDescr') + + global __cusparseSetMatType + __cusparseSetMatType = dlsym(RTLD_DEFAULT, 'cusparseSetMatType') + if __cusparseSetMatType == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSetMatType = dlsym(handle, 'cusparseSetMatType') + + global __cusparseGetMatType + __cusparseGetMatType = dlsym(RTLD_DEFAULT, 'cusparseGetMatType') + if __cusparseGetMatType == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetMatType = dlsym(handle, 'cusparseGetMatType') + + global __cusparseSetMatFillMode + __cusparseSetMatFillMode = dlsym(RTLD_DEFAULT, 'cusparseSetMatFillMode') + if __cusparseSetMatFillMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSetMatFillMode = dlsym(handle, 'cusparseSetMatFillMode') + + global __cusparseGetMatFillMode + __cusparseGetMatFillMode = dlsym(RTLD_DEFAULT, 'cusparseGetMatFillMode') + if __cusparseGetMatFillMode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetMatFillMode = dlsym(handle, 'cusparseGetMatFillMode') + + global __cusparseSetMatDiagType + __cusparseSetMatDiagType = dlsym(RTLD_DEFAULT, 'cusparseSetMatDiagType') + if __cusparseSetMatDiagType == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSetMatDiagType = dlsym(handle, 'cusparseSetMatDiagType') + + global __cusparseGetMatDiagType + __cusparseGetMatDiagType = dlsym(RTLD_DEFAULT, 'cusparseGetMatDiagType') + if __cusparseGetMatDiagType == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetMatDiagType = dlsym(handle, 'cusparseGetMatDiagType') + + global __cusparseSetMatIndexBase + __cusparseSetMatIndexBase = dlsym(RTLD_DEFAULT, 'cusparseSetMatIndexBase') + if __cusparseSetMatIndexBase == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSetMatIndexBase = dlsym(handle, 'cusparseSetMatIndexBase') + + global __cusparseGetMatIndexBase + __cusparseGetMatIndexBase = dlsym(RTLD_DEFAULT, 'cusparseGetMatIndexBase') + if __cusparseGetMatIndexBase == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGetMatIndexBase = dlsym(handle, 'cusparseGetMatIndexBase') + + global __cusparseSgemvi + __cusparseSgemvi = dlsym(RTLD_DEFAULT, 'cusparseSgemvi') + if __cusparseSgemvi == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgemvi = dlsym(handle, 'cusparseSgemvi') + + global __cusparseSgemvi_bufferSize + __cusparseSgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSgemvi_bufferSize') + if __cusparseSgemvi_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgemvi_bufferSize = 
dlsym(handle, 'cusparseSgemvi_bufferSize') + + global __cusparseDgemvi + __cusparseDgemvi = dlsym(RTLD_DEFAULT, 'cusparseDgemvi') + if __cusparseDgemvi == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgemvi = dlsym(handle, 'cusparseDgemvi') + + global __cusparseDgemvi_bufferSize + __cusparseDgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDgemvi_bufferSize') + if __cusparseDgemvi_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgemvi_bufferSize = dlsym(handle, 'cusparseDgemvi_bufferSize') + + global __cusparseCgemvi + __cusparseCgemvi = dlsym(RTLD_DEFAULT, 'cusparseCgemvi') + if __cusparseCgemvi == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgemvi = dlsym(handle, 'cusparseCgemvi') + + global __cusparseCgemvi_bufferSize + __cusparseCgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCgemvi_bufferSize') + if __cusparseCgemvi_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgemvi_bufferSize = dlsym(handle, 'cusparseCgemvi_bufferSize') + + global __cusparseZgemvi + __cusparseZgemvi = dlsym(RTLD_DEFAULT, 'cusparseZgemvi') + if __cusparseZgemvi == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgemvi = dlsym(handle, 'cusparseZgemvi') + + global __cusparseZgemvi_bufferSize + __cusparseZgemvi_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZgemvi_bufferSize') + if __cusparseZgemvi_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgemvi_bufferSize = dlsym(handle, 'cusparseZgemvi_bufferSize') + + global __cusparseSbsrmv + __cusparseSbsrmv = dlsym(RTLD_DEFAULT, 'cusparseSbsrmv') + if __cusparseSbsrmv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSbsrmv = dlsym(handle, 'cusparseSbsrmv') + + global __cusparseDbsrmv + __cusparseDbsrmv = dlsym(RTLD_DEFAULT, 'cusparseDbsrmv') + if __cusparseDbsrmv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDbsrmv = dlsym(handle, 'cusparseDbsrmv') + + global __cusparseCbsrmv + __cusparseCbsrmv = dlsym(RTLD_DEFAULT, 'cusparseCbsrmv') + if __cusparseCbsrmv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCbsrmv = dlsym(handle, 'cusparseCbsrmv') + + global __cusparseZbsrmv + __cusparseZbsrmv = dlsym(RTLD_DEFAULT, 'cusparseZbsrmv') + if __cusparseZbsrmv == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZbsrmv = dlsym(handle, 'cusparseZbsrmv') + + global __cusparseSbsrmm + __cusparseSbsrmm = dlsym(RTLD_DEFAULT, 'cusparseSbsrmm') + if __cusparseSbsrmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSbsrmm = dlsym(handle, 'cusparseSbsrmm') + + global __cusparseDbsrmm + __cusparseDbsrmm = dlsym(RTLD_DEFAULT, 'cusparseDbsrmm') + if __cusparseDbsrmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDbsrmm = dlsym(handle, 'cusparseDbsrmm') + + global __cusparseCbsrmm + __cusparseCbsrmm = dlsym(RTLD_DEFAULT, 'cusparseCbsrmm') + if __cusparseCbsrmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCbsrmm = dlsym(handle, 'cusparseCbsrmm') + + global __cusparseZbsrmm + __cusparseZbsrmm = dlsym(RTLD_DEFAULT, 'cusparseZbsrmm') + if __cusparseZbsrmm == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZbsrmm = dlsym(handle, 'cusparseZbsrmm') + + global __cusparseSgtsv2_bufferSizeExt + __cusparseSgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 
'cusparseSgtsv2_bufferSizeExt') + if __cusparseSgtsv2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsv2_bufferSizeExt = dlsym(handle, 'cusparseSgtsv2_bufferSizeExt') + + global __cusparseDgtsv2_bufferSizeExt + __cusparseDgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2_bufferSizeExt') + if __cusparseDgtsv2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsv2_bufferSizeExt = dlsym(handle, 'cusparseDgtsv2_bufferSizeExt') + + global __cusparseCgtsv2_bufferSizeExt + __cusparseCgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2_bufferSizeExt') + if __cusparseCgtsv2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsv2_bufferSizeExt = dlsym(handle, 'cusparseCgtsv2_bufferSizeExt') + + global __cusparseZgtsv2_bufferSizeExt + __cusparseZgtsv2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2_bufferSizeExt') + if __cusparseZgtsv2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsv2_bufferSizeExt = dlsym(handle, 'cusparseZgtsv2_bufferSizeExt') + + global __cusparseSgtsv2 + __cusparseSgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2') + if __cusparseSgtsv2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsv2 = dlsym(handle, 'cusparseSgtsv2') + + global __cusparseDgtsv2 + __cusparseDgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2') + if __cusparseDgtsv2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsv2 = dlsym(handle, 'cusparseDgtsv2') + + global __cusparseCgtsv2 + __cusparseCgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2') + if __cusparseCgtsv2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsv2 = dlsym(handle, 'cusparseCgtsv2') + + global __cusparseZgtsv2 + __cusparseZgtsv2 = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2') + if __cusparseZgtsv2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsv2 = dlsym(handle, 'cusparseZgtsv2') + + global __cusparseSgtsv2_nopivot_bufferSizeExt + __cusparseSgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2_nopivot_bufferSizeExt') + if __cusparseSgtsv2_nopivot_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseSgtsv2_nopivot_bufferSizeExt') + + global __cusparseDgtsv2_nopivot_bufferSizeExt + __cusparseDgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2_nopivot_bufferSizeExt') + if __cusparseDgtsv2_nopivot_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseDgtsv2_nopivot_bufferSizeExt') + + global __cusparseCgtsv2_nopivot_bufferSizeExt + __cusparseCgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2_nopivot_bufferSizeExt') + if __cusparseCgtsv2_nopivot_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseCgtsv2_nopivot_bufferSizeExt') + + global __cusparseZgtsv2_nopivot_bufferSizeExt + __cusparseZgtsv2_nopivot_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2_nopivot_bufferSizeExt') + if __cusparseZgtsv2_nopivot_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsv2_nopivot_bufferSizeExt = dlsym(handle, 'cusparseZgtsv2_nopivot_bufferSizeExt') + + global __cusparseSgtsv2_nopivot + 
__cusparseSgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2_nopivot') + if __cusparseSgtsv2_nopivot == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsv2_nopivot = dlsym(handle, 'cusparseSgtsv2_nopivot') + + global __cusparseDgtsv2_nopivot + __cusparseDgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2_nopivot') + if __cusparseDgtsv2_nopivot == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsv2_nopivot = dlsym(handle, 'cusparseDgtsv2_nopivot') + + global __cusparseCgtsv2_nopivot + __cusparseCgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2_nopivot') + if __cusparseCgtsv2_nopivot == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsv2_nopivot = dlsym(handle, 'cusparseCgtsv2_nopivot') + + global __cusparseZgtsv2_nopivot + __cusparseZgtsv2_nopivot = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2_nopivot') + if __cusparseZgtsv2_nopivot == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsv2_nopivot = dlsym(handle, 'cusparseZgtsv2_nopivot') + + global __cusparseSgtsv2StridedBatch_bufferSizeExt + __cusparseSgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2StridedBatch_bufferSizeExt') + if __cusparseSgtsv2StridedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseSgtsv2StridedBatch_bufferSizeExt') + + global __cusparseDgtsv2StridedBatch_bufferSizeExt + __cusparseDgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2StridedBatch_bufferSizeExt') + if __cusparseDgtsv2StridedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseDgtsv2StridedBatch_bufferSizeExt') + + global __cusparseCgtsv2StridedBatch_bufferSizeExt + __cusparseCgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2StridedBatch_bufferSizeExt') + if __cusparseCgtsv2StridedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseCgtsv2StridedBatch_bufferSizeExt') + + global __cusparseZgtsv2StridedBatch_bufferSizeExt + __cusparseZgtsv2StridedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2StridedBatch_bufferSizeExt') + if __cusparseZgtsv2StridedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsv2StridedBatch_bufferSizeExt = dlsym(handle, 'cusparseZgtsv2StridedBatch_bufferSizeExt') + + global __cusparseSgtsv2StridedBatch + __cusparseSgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseSgtsv2StridedBatch') + if __cusparseSgtsv2StridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsv2StridedBatch = dlsym(handle, 'cusparseSgtsv2StridedBatch') + + global __cusparseDgtsv2StridedBatch + __cusparseDgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseDgtsv2StridedBatch') + if __cusparseDgtsv2StridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsv2StridedBatch = dlsym(handle, 'cusparseDgtsv2StridedBatch') + + global __cusparseCgtsv2StridedBatch + __cusparseCgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseCgtsv2StridedBatch') + if __cusparseCgtsv2StridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsv2StridedBatch = dlsym(handle, 'cusparseCgtsv2StridedBatch') + + global __cusparseZgtsv2StridedBatch + 
__cusparseZgtsv2StridedBatch = dlsym(RTLD_DEFAULT, 'cusparseZgtsv2StridedBatch') + if __cusparseZgtsv2StridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsv2StridedBatch = dlsym(handle, 'cusparseZgtsv2StridedBatch') + + global __cusparseSgtsvInterleavedBatch_bufferSizeExt + __cusparseSgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgtsvInterleavedBatch_bufferSizeExt') + if __cusparseSgtsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseSgtsvInterleavedBatch_bufferSizeExt') + + global __cusparseDgtsvInterleavedBatch_bufferSizeExt + __cusparseDgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgtsvInterleavedBatch_bufferSizeExt') + if __cusparseDgtsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseDgtsvInterleavedBatch_bufferSizeExt') + + global __cusparseCgtsvInterleavedBatch_bufferSizeExt + __cusparseCgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgtsvInterleavedBatch_bufferSizeExt') + if __cusparseCgtsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseCgtsvInterleavedBatch_bufferSizeExt') + + global __cusparseZgtsvInterleavedBatch_bufferSizeExt + __cusparseZgtsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgtsvInterleavedBatch_bufferSizeExt') + if __cusparseZgtsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseZgtsvInterleavedBatch_bufferSizeExt') + + global __cusparseSgtsvInterleavedBatch + __cusparseSgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseSgtsvInterleavedBatch') + if __cusparseSgtsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgtsvInterleavedBatch = dlsym(handle, 'cusparseSgtsvInterleavedBatch') + + global __cusparseDgtsvInterleavedBatch + __cusparseDgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseDgtsvInterleavedBatch') + if __cusparseDgtsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgtsvInterleavedBatch = dlsym(handle, 'cusparseDgtsvInterleavedBatch') + + global __cusparseCgtsvInterleavedBatch + __cusparseCgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseCgtsvInterleavedBatch') + if __cusparseCgtsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgtsvInterleavedBatch = dlsym(handle, 'cusparseCgtsvInterleavedBatch') + + global __cusparseZgtsvInterleavedBatch + __cusparseZgtsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseZgtsvInterleavedBatch') + if __cusparseZgtsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgtsvInterleavedBatch = dlsym(handle, 'cusparseZgtsvInterleavedBatch') + + global __cusparseSgpsvInterleavedBatch_bufferSizeExt + __cusparseSgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgpsvInterleavedBatch_bufferSizeExt') + if __cusparseSgpsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseSgpsvInterleavedBatch_bufferSizeExt') + + global 
__cusparseDgpsvInterleavedBatch_bufferSizeExt + __cusparseDgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgpsvInterleavedBatch_bufferSizeExt') + if __cusparseDgpsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseDgpsvInterleavedBatch_bufferSizeExt') + + global __cusparseCgpsvInterleavedBatch_bufferSizeExt + __cusparseCgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgpsvInterleavedBatch_bufferSizeExt') + if __cusparseCgpsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseCgpsvInterleavedBatch_bufferSizeExt') + + global __cusparseZgpsvInterleavedBatch_bufferSizeExt + __cusparseZgpsvInterleavedBatch_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgpsvInterleavedBatch_bufferSizeExt') + if __cusparseZgpsvInterleavedBatch_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgpsvInterleavedBatch_bufferSizeExt = dlsym(handle, 'cusparseZgpsvInterleavedBatch_bufferSizeExt') + + global __cusparseSgpsvInterleavedBatch + __cusparseSgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseSgpsvInterleavedBatch') + if __cusparseSgpsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgpsvInterleavedBatch = dlsym(handle, 'cusparseSgpsvInterleavedBatch') + + global __cusparseDgpsvInterleavedBatch + __cusparseDgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseDgpsvInterleavedBatch') + if __cusparseDgpsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgpsvInterleavedBatch = dlsym(handle, 'cusparseDgpsvInterleavedBatch') + + global __cusparseCgpsvInterleavedBatch + __cusparseCgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseCgpsvInterleavedBatch') + if __cusparseCgpsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgpsvInterleavedBatch = dlsym(handle, 'cusparseCgpsvInterleavedBatch') + + global __cusparseZgpsvInterleavedBatch + __cusparseZgpsvInterleavedBatch = dlsym(RTLD_DEFAULT, 'cusparseZgpsvInterleavedBatch') + if __cusparseZgpsvInterleavedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgpsvInterleavedBatch = dlsym(handle, 'cusparseZgpsvInterleavedBatch') + + global __cusparseScsrgeam2_bufferSizeExt + __cusparseScsrgeam2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseScsrgeam2_bufferSizeExt') + if __cusparseScsrgeam2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseScsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseScsrgeam2_bufferSizeExt') + + global __cusparseDcsrgeam2_bufferSizeExt + __cusparseDcsrgeam2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDcsrgeam2_bufferSizeExt') + if __cusparseDcsrgeam2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDcsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseDcsrgeam2_bufferSizeExt') + + global __cusparseCcsrgeam2_bufferSizeExt + __cusparseCcsrgeam2_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCcsrgeam2_bufferSizeExt') + if __cusparseCcsrgeam2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCcsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseCcsrgeam2_bufferSizeExt') + + global __cusparseZcsrgeam2_bufferSizeExt + __cusparseZcsrgeam2_bufferSizeExt = 
dlsym(RTLD_DEFAULT, 'cusparseZcsrgeam2_bufferSizeExt') + if __cusparseZcsrgeam2_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZcsrgeam2_bufferSizeExt = dlsym(handle, 'cusparseZcsrgeam2_bufferSizeExt') + + global __cusparseXcsrgeam2Nnz + __cusparseXcsrgeam2Nnz = dlsym(RTLD_DEFAULT, 'cusparseXcsrgeam2Nnz') + if __cusparseXcsrgeam2Nnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcsrgeam2Nnz = dlsym(handle, 'cusparseXcsrgeam2Nnz') + + global __cusparseScsrgeam2 + __cusparseScsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseScsrgeam2') + if __cusparseScsrgeam2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseScsrgeam2 = dlsym(handle, 'cusparseScsrgeam2') + + global __cusparseDcsrgeam2 + __cusparseDcsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseDcsrgeam2') + if __cusparseDcsrgeam2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDcsrgeam2 = dlsym(handle, 'cusparseDcsrgeam2') + + global __cusparseCcsrgeam2 + __cusparseCcsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseCcsrgeam2') + if __cusparseCcsrgeam2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCcsrgeam2 = dlsym(handle, 'cusparseCcsrgeam2') + + global __cusparseZcsrgeam2 + __cusparseZcsrgeam2 = dlsym(RTLD_DEFAULT, 'cusparseZcsrgeam2') + if __cusparseZcsrgeam2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZcsrgeam2 = dlsym(handle, 'cusparseZcsrgeam2') + + global __cusparseSnnz + __cusparseSnnz = dlsym(RTLD_DEFAULT, 'cusparseSnnz') + if __cusparseSnnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSnnz = dlsym(handle, 'cusparseSnnz') + + global __cusparseDnnz + __cusparseDnnz = dlsym(RTLD_DEFAULT, 'cusparseDnnz') + if __cusparseDnnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnnz = dlsym(handle, 'cusparseDnnz') + + global __cusparseCnnz + __cusparseCnnz = dlsym(RTLD_DEFAULT, 'cusparseCnnz') + if __cusparseCnnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCnnz = dlsym(handle, 'cusparseCnnz') + + global __cusparseZnnz + __cusparseZnnz = dlsym(RTLD_DEFAULT, 'cusparseZnnz') + if __cusparseZnnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZnnz = dlsym(handle, 'cusparseZnnz') + + global __cusparseXcoo2csr + __cusparseXcoo2csr = dlsym(RTLD_DEFAULT, 'cusparseXcoo2csr') + if __cusparseXcoo2csr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcoo2csr = dlsym(handle, 'cusparseXcoo2csr') + + global __cusparseXcsr2coo + __cusparseXcsr2coo = dlsym(RTLD_DEFAULT, 'cusparseXcsr2coo') + if __cusparseXcsr2coo == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcsr2coo = dlsym(handle, 'cusparseXcsr2coo') + + global __cusparseSbsr2csr + __cusparseSbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseSbsr2csr') + if __cusparseSbsr2csr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSbsr2csr = dlsym(handle, 'cusparseSbsr2csr') + + global __cusparseDbsr2csr + __cusparseDbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseDbsr2csr') + if __cusparseDbsr2csr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDbsr2csr = dlsym(handle, 'cusparseDbsr2csr') + + global __cusparseCbsr2csr + __cusparseCbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseCbsr2csr') + if __cusparseCbsr2csr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCbsr2csr = dlsym(handle, 
'cusparseCbsr2csr') + + global __cusparseZbsr2csr + __cusparseZbsr2csr = dlsym(RTLD_DEFAULT, 'cusparseZbsr2csr') + if __cusparseZbsr2csr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZbsr2csr = dlsym(handle, 'cusparseZbsr2csr') + + global __cusparseSgebsr2gebsc_bufferSize + __cusparseSgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsc_bufferSize') + if __cusparseSgebsr2gebsc_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseSgebsr2gebsc_bufferSize') + + global __cusparseDgebsr2gebsc_bufferSize + __cusparseDgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsc_bufferSize') + if __cusparseDgebsr2gebsc_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseDgebsr2gebsc_bufferSize') + + global __cusparseCgebsr2gebsc_bufferSize + __cusparseCgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsc_bufferSize') + if __cusparseCgebsr2gebsc_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseCgebsr2gebsc_bufferSize') + + global __cusparseZgebsr2gebsc_bufferSize + __cusparseZgebsr2gebsc_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsc_bufferSize') + if __cusparseZgebsr2gebsc_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgebsr2gebsc_bufferSize = dlsym(handle, 'cusparseZgebsr2gebsc_bufferSize') + + global __cusparseSgebsr2gebsc_bufferSizeExt + __cusparseSgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsc_bufferSizeExt') + if __cusparseSgebsr2gebsc_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseSgebsr2gebsc_bufferSizeExt') + + global __cusparseDgebsr2gebsc_bufferSizeExt + __cusparseDgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsc_bufferSizeExt') + if __cusparseDgebsr2gebsc_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseDgebsr2gebsc_bufferSizeExt') + + global __cusparseCgebsr2gebsc_bufferSizeExt + __cusparseCgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsc_bufferSizeExt') + if __cusparseCgebsr2gebsc_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseCgebsr2gebsc_bufferSizeExt') + + global __cusparseZgebsr2gebsc_bufferSizeExt + __cusparseZgebsr2gebsc_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsc_bufferSizeExt') + if __cusparseZgebsr2gebsc_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgebsr2gebsc_bufferSizeExt = dlsym(handle, 'cusparseZgebsr2gebsc_bufferSizeExt') + + global __cusparseSgebsr2gebsc + __cusparseSgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsc') + if __cusparseSgebsr2gebsc == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgebsr2gebsc = dlsym(handle, 'cusparseSgebsr2gebsc') + + global __cusparseDgebsr2gebsc + __cusparseDgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsc') + if __cusparseDgebsr2gebsc == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgebsr2gebsc = dlsym(handle, 'cusparseDgebsr2gebsc') + + global __cusparseCgebsr2gebsc + 
__cusparseCgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsc') + if __cusparseCgebsr2gebsc == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgebsr2gebsc = dlsym(handle, 'cusparseCgebsr2gebsc') + + global __cusparseZgebsr2gebsc + __cusparseZgebsr2gebsc = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsc') + if __cusparseZgebsr2gebsc == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgebsr2gebsc = dlsym(handle, 'cusparseZgebsr2gebsc') + + global __cusparseScsr2gebsr_bufferSize + __cusparseScsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseScsr2gebsr_bufferSize') + if __cusparseScsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseScsr2gebsr_bufferSize = dlsym(handle, 'cusparseScsr2gebsr_bufferSize') + + global __cusparseDcsr2gebsr_bufferSize + __cusparseDcsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDcsr2gebsr_bufferSize') + if __cusparseDcsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDcsr2gebsr_bufferSize = dlsym(handle, 'cusparseDcsr2gebsr_bufferSize') + + global __cusparseCcsr2gebsr_bufferSize + __cusparseCcsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCcsr2gebsr_bufferSize') + if __cusparseCcsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCcsr2gebsr_bufferSize = dlsym(handle, 'cusparseCcsr2gebsr_bufferSize') + + global __cusparseZcsr2gebsr_bufferSize + __cusparseZcsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZcsr2gebsr_bufferSize') + if __cusparseZcsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZcsr2gebsr_bufferSize = dlsym(handle, 'cusparseZcsr2gebsr_bufferSize') + + global __cusparseScsr2gebsr_bufferSizeExt + __cusparseScsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseScsr2gebsr_bufferSizeExt') + if __cusparseScsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseScsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseScsr2gebsr_bufferSizeExt') + + global __cusparseDcsr2gebsr_bufferSizeExt + __cusparseDcsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDcsr2gebsr_bufferSizeExt') + if __cusparseDcsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDcsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseDcsr2gebsr_bufferSizeExt') + + global __cusparseCcsr2gebsr_bufferSizeExt + __cusparseCcsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCcsr2gebsr_bufferSizeExt') + if __cusparseCcsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCcsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseCcsr2gebsr_bufferSizeExt') + + global __cusparseZcsr2gebsr_bufferSizeExt + __cusparseZcsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZcsr2gebsr_bufferSizeExt') + if __cusparseZcsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZcsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseZcsr2gebsr_bufferSizeExt') + + global __cusparseXcsr2gebsrNnz + __cusparseXcsr2gebsrNnz = dlsym(RTLD_DEFAULT, 'cusparseXcsr2gebsrNnz') + if __cusparseXcsr2gebsrNnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcsr2gebsrNnz = dlsym(handle, 'cusparseXcsr2gebsrNnz') + + global __cusparseScsr2gebsr + __cusparseScsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseScsr2gebsr') + if __cusparseScsr2gebsr == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cusparseScsr2gebsr = dlsym(handle, 'cusparseScsr2gebsr') + + global __cusparseDcsr2gebsr + __cusparseDcsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseDcsr2gebsr') + if __cusparseDcsr2gebsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDcsr2gebsr = dlsym(handle, 'cusparseDcsr2gebsr') + + global __cusparseCcsr2gebsr + __cusparseCcsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseCcsr2gebsr') + if __cusparseCcsr2gebsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCcsr2gebsr = dlsym(handle, 'cusparseCcsr2gebsr') + + global __cusparseZcsr2gebsr + __cusparseZcsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseZcsr2gebsr') + if __cusparseZcsr2gebsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZcsr2gebsr = dlsym(handle, 'cusparseZcsr2gebsr') + + global __cusparseSgebsr2gebsr_bufferSize + __cusparseSgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsr_bufferSize') + if __cusparseSgebsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseSgebsr2gebsr_bufferSize') + + global __cusparseDgebsr2gebsr_bufferSize + __cusparseDgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsr_bufferSize') + if __cusparseDgebsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseDgebsr2gebsr_bufferSize') + + global __cusparseCgebsr2gebsr_bufferSize + __cusparseCgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsr_bufferSize') + if __cusparseCgebsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseCgebsr2gebsr_bufferSize') + + global __cusparseZgebsr2gebsr_bufferSize + __cusparseZgebsr2gebsr_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsr_bufferSize') + if __cusparseZgebsr2gebsr_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgebsr2gebsr_bufferSize = dlsym(handle, 'cusparseZgebsr2gebsr_bufferSize') + + global __cusparseSgebsr2gebsr_bufferSizeExt + __cusparseSgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsr_bufferSizeExt') + if __cusparseSgebsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgebsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseSgebsr2gebsr_bufferSizeExt') + + global __cusparseDgebsr2gebsr_bufferSizeExt + __cusparseDgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsr_bufferSizeExt') + if __cusparseDgebsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgebsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseDgebsr2gebsr_bufferSizeExt') + + global __cusparseCgebsr2gebsr_bufferSizeExt + __cusparseCgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsr_bufferSizeExt') + if __cusparseCgebsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgebsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseCgebsr2gebsr_bufferSizeExt') + + global __cusparseZgebsr2gebsr_bufferSizeExt + __cusparseZgebsr2gebsr_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsr_bufferSizeExt') + if __cusparseZgebsr2gebsr_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgebsr2gebsr_bufferSizeExt = dlsym(handle, 'cusparseZgebsr2gebsr_bufferSizeExt') 
+ + global __cusparseXgebsr2gebsrNnz + __cusparseXgebsr2gebsrNnz = dlsym(RTLD_DEFAULT, 'cusparseXgebsr2gebsrNnz') + if __cusparseXgebsr2gebsrNnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXgebsr2gebsrNnz = dlsym(handle, 'cusparseXgebsr2gebsrNnz') + + global __cusparseSgebsr2gebsr + __cusparseSgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseSgebsr2gebsr') + if __cusparseSgebsr2gebsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSgebsr2gebsr = dlsym(handle, 'cusparseSgebsr2gebsr') + + global __cusparseDgebsr2gebsr + __cusparseDgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseDgebsr2gebsr') + if __cusparseDgebsr2gebsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDgebsr2gebsr = dlsym(handle, 'cusparseDgebsr2gebsr') + + global __cusparseCgebsr2gebsr + __cusparseCgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseCgebsr2gebsr') + if __cusparseCgebsr2gebsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCgebsr2gebsr = dlsym(handle, 'cusparseCgebsr2gebsr') + + global __cusparseZgebsr2gebsr + __cusparseZgebsr2gebsr = dlsym(RTLD_DEFAULT, 'cusparseZgebsr2gebsr') + if __cusparseZgebsr2gebsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseZgebsr2gebsr = dlsym(handle, 'cusparseZgebsr2gebsr') + + global __cusparseXcoosort_bufferSizeExt + __cusparseXcoosort_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseXcoosort_bufferSizeExt') + if __cusparseXcoosort_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcoosort_bufferSizeExt = dlsym(handle, 'cusparseXcoosort_bufferSizeExt') + + global __cusparseXcoosortByRow + __cusparseXcoosortByRow = dlsym(RTLD_DEFAULT, 'cusparseXcoosortByRow') + if __cusparseXcoosortByRow == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcoosortByRow = dlsym(handle, 'cusparseXcoosortByRow') + + global __cusparseXcoosortByColumn + __cusparseXcoosortByColumn = dlsym(RTLD_DEFAULT, 'cusparseXcoosortByColumn') + if __cusparseXcoosortByColumn == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcoosortByColumn = dlsym(handle, 'cusparseXcoosortByColumn') + + global __cusparseXcsrsort_bufferSizeExt + __cusparseXcsrsort_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseXcsrsort_bufferSizeExt') + if __cusparseXcsrsort_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcsrsort_bufferSizeExt = dlsym(handle, 'cusparseXcsrsort_bufferSizeExt') + + global __cusparseXcsrsort + __cusparseXcsrsort = dlsym(RTLD_DEFAULT, 'cusparseXcsrsort') + if __cusparseXcsrsort == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcsrsort = dlsym(handle, 'cusparseXcsrsort') + + global __cusparseXcscsort_bufferSizeExt + __cusparseXcscsort_bufferSizeExt = dlsym(RTLD_DEFAULT, 'cusparseXcscsort_bufferSizeExt') + if __cusparseXcscsort_bufferSizeExt == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcscsort_bufferSizeExt = dlsym(handle, 'cusparseXcscsort_bufferSizeExt') + + global __cusparseXcscsort + __cusparseXcscsort = dlsym(RTLD_DEFAULT, 'cusparseXcscsort') + if __cusparseXcscsort == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseXcscsort = dlsym(handle, 'cusparseXcscsort') + + global __cusparseCsr2cscEx2 + __cusparseCsr2cscEx2 = dlsym(RTLD_DEFAULT, 'cusparseCsr2cscEx2') + if __cusparseCsr2cscEx2 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__cusparseCsr2cscEx2 = dlsym(handle, 'cusparseCsr2cscEx2') + + global __cusparseCsr2cscEx2_bufferSize + __cusparseCsr2cscEx2_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseCsr2cscEx2_bufferSize') + if __cusparseCsr2cscEx2_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCsr2cscEx2_bufferSize = dlsym(handle, 'cusparseCsr2cscEx2_bufferSize') + + global __cusparseCreateSpVec + __cusparseCreateSpVec = dlsym(RTLD_DEFAULT, 'cusparseCreateSpVec') + if __cusparseCreateSpVec == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateSpVec = dlsym(handle, 'cusparseCreateSpVec') + + global __cusparseDestroySpVec + __cusparseDestroySpVec = dlsym(RTLD_DEFAULT, 'cusparseDestroySpVec') + if __cusparseDestroySpVec == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDestroySpVec = dlsym(handle, 'cusparseDestroySpVec') + + global __cusparseSpVecGet + __cusparseSpVecGet = dlsym(RTLD_DEFAULT, 'cusparseSpVecGet') + if __cusparseSpVecGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpVecGet = dlsym(handle, 'cusparseSpVecGet') + + global __cusparseSpVecGetIndexBase + __cusparseSpVecGetIndexBase = dlsym(RTLD_DEFAULT, 'cusparseSpVecGetIndexBase') + if __cusparseSpVecGetIndexBase == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpVecGetIndexBase = dlsym(handle, 'cusparseSpVecGetIndexBase') + + global __cusparseSpVecGetValues + __cusparseSpVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseSpVecGetValues') + if __cusparseSpVecGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpVecGetValues = dlsym(handle, 'cusparseSpVecGetValues') + + global __cusparseSpVecSetValues + __cusparseSpVecSetValues = dlsym(RTLD_DEFAULT, 'cusparseSpVecSetValues') + if __cusparseSpVecSetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpVecSetValues = dlsym(handle, 'cusparseSpVecSetValues') + + global __cusparseCreateDnVec + __cusparseCreateDnVec = dlsym(RTLD_DEFAULT, 'cusparseCreateDnVec') + if __cusparseCreateDnVec == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateDnVec = dlsym(handle, 'cusparseCreateDnVec') + + global __cusparseDestroyDnVec + __cusparseDestroyDnVec = dlsym(RTLD_DEFAULT, 'cusparseDestroyDnVec') + if __cusparseDestroyDnVec == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDestroyDnVec = dlsym(handle, 'cusparseDestroyDnVec') + + global __cusparseDnVecGet + __cusparseDnVecGet = dlsym(RTLD_DEFAULT, 'cusparseDnVecGet') + if __cusparseDnVecGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnVecGet = dlsym(handle, 'cusparseDnVecGet') + + global __cusparseDnVecGetValues + __cusparseDnVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseDnVecGetValues') + if __cusparseDnVecGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnVecGetValues = dlsym(handle, 'cusparseDnVecGetValues') + + global __cusparseDnVecSetValues + __cusparseDnVecSetValues = dlsym(RTLD_DEFAULT, 'cusparseDnVecSetValues') + if __cusparseDnVecSetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnVecSetValues = dlsym(handle, 'cusparseDnVecSetValues') + + global __cusparseDestroySpMat + __cusparseDestroySpMat = dlsym(RTLD_DEFAULT, 'cusparseDestroySpMat') + if __cusparseDestroySpMat == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDestroySpMat = dlsym(handle, 
'cusparseDestroySpMat') + + global __cusparseSpMatGetFormat + __cusparseSpMatGetFormat = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetFormat') + if __cusparseSpMatGetFormat == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatGetFormat = dlsym(handle, 'cusparseSpMatGetFormat') + + global __cusparseSpMatGetIndexBase + __cusparseSpMatGetIndexBase = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetIndexBase') + if __cusparseSpMatGetIndexBase == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatGetIndexBase = dlsym(handle, 'cusparseSpMatGetIndexBase') + + global __cusparseSpMatGetValues + __cusparseSpMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetValues') + if __cusparseSpMatGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatGetValues = dlsym(handle, 'cusparseSpMatGetValues') + + global __cusparseSpMatSetValues + __cusparseSpMatSetValues = dlsym(RTLD_DEFAULT, 'cusparseSpMatSetValues') + if __cusparseSpMatSetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatSetValues = dlsym(handle, 'cusparseSpMatSetValues') + + global __cusparseSpMatGetSize + __cusparseSpMatGetSize = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetSize') + if __cusparseSpMatGetSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatGetSize = dlsym(handle, 'cusparseSpMatGetSize') + + global __cusparseSpMatGetStridedBatch + __cusparseSpMatGetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetStridedBatch') + if __cusparseSpMatGetStridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatGetStridedBatch = dlsym(handle, 'cusparseSpMatGetStridedBatch') + + global __cusparseCooSetStridedBatch + __cusparseCooSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseCooSetStridedBatch') + if __cusparseCooSetStridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCooSetStridedBatch = dlsym(handle, 'cusparseCooSetStridedBatch') + + global __cusparseCsrSetStridedBatch + __cusparseCsrSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseCsrSetStridedBatch') + if __cusparseCsrSetStridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCsrSetStridedBatch = dlsym(handle, 'cusparseCsrSetStridedBatch') + + global __cusparseCreateCsr + __cusparseCreateCsr = dlsym(RTLD_DEFAULT, 'cusparseCreateCsr') + if __cusparseCreateCsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateCsr = dlsym(handle, 'cusparseCreateCsr') + + global __cusparseCsrGet + __cusparseCsrGet = dlsym(RTLD_DEFAULT, 'cusparseCsrGet') + if __cusparseCsrGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCsrGet = dlsym(handle, 'cusparseCsrGet') + + global __cusparseCsrSetPointers + __cusparseCsrSetPointers = dlsym(RTLD_DEFAULT, 'cusparseCsrSetPointers') + if __cusparseCsrSetPointers == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCsrSetPointers = dlsym(handle, 'cusparseCsrSetPointers') + + global __cusparseCreateCoo + __cusparseCreateCoo = dlsym(RTLD_DEFAULT, 'cusparseCreateCoo') + if __cusparseCreateCoo == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateCoo = dlsym(handle, 'cusparseCreateCoo') + + global __cusparseCooGet + __cusparseCooGet = dlsym(RTLD_DEFAULT, 'cusparseCooGet') + if __cusparseCooGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCooGet = dlsym(handle, 'cusparseCooGet') + + 
global __cusparseCreateDnMat + __cusparseCreateDnMat = dlsym(RTLD_DEFAULT, 'cusparseCreateDnMat') + if __cusparseCreateDnMat == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateDnMat = dlsym(handle, 'cusparseCreateDnMat') + + global __cusparseDestroyDnMat + __cusparseDestroyDnMat = dlsym(RTLD_DEFAULT, 'cusparseDestroyDnMat') + if __cusparseDestroyDnMat == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDestroyDnMat = dlsym(handle, 'cusparseDestroyDnMat') + + global __cusparseDnMatGet + __cusparseDnMatGet = dlsym(RTLD_DEFAULT, 'cusparseDnMatGet') + if __cusparseDnMatGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnMatGet = dlsym(handle, 'cusparseDnMatGet') + + global __cusparseDnMatGetValues + __cusparseDnMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseDnMatGetValues') + if __cusparseDnMatGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnMatGetValues = dlsym(handle, 'cusparseDnMatGetValues') + + global __cusparseDnMatSetValues + __cusparseDnMatSetValues = dlsym(RTLD_DEFAULT, 'cusparseDnMatSetValues') + if __cusparseDnMatSetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnMatSetValues = dlsym(handle, 'cusparseDnMatSetValues') + + global __cusparseDnMatSetStridedBatch + __cusparseDnMatSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseDnMatSetStridedBatch') + if __cusparseDnMatSetStridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnMatSetStridedBatch = dlsym(handle, 'cusparseDnMatSetStridedBatch') + + global __cusparseDnMatGetStridedBatch + __cusparseDnMatGetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseDnMatGetStridedBatch') + if __cusparseDnMatGetStridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDnMatGetStridedBatch = dlsym(handle, 'cusparseDnMatGetStridedBatch') + + global __cusparseAxpby + __cusparseAxpby = dlsym(RTLD_DEFAULT, 'cusparseAxpby') + if __cusparseAxpby == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseAxpby = dlsym(handle, 'cusparseAxpby') + + global __cusparseGather + __cusparseGather = dlsym(RTLD_DEFAULT, 'cusparseGather') + if __cusparseGather == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseGather = dlsym(handle, 'cusparseGather') + + global __cusparseScatter + __cusparseScatter = dlsym(RTLD_DEFAULT, 'cusparseScatter') + if __cusparseScatter == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseScatter = dlsym(handle, 'cusparseScatter') + + global __cusparseSpVV_bufferSize + __cusparseSpVV_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpVV_bufferSize') + if __cusparseSpVV_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpVV_bufferSize = dlsym(handle, 'cusparseSpVV_bufferSize') + + global __cusparseSpVV + __cusparseSpVV = dlsym(RTLD_DEFAULT, 'cusparseSpVV') + if __cusparseSpVV == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpVV = dlsym(handle, 'cusparseSpVV') + + global __cusparseSpMV + __cusparseSpMV = dlsym(RTLD_DEFAULT, 'cusparseSpMV') + if __cusparseSpMV == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMV = dlsym(handle, 'cusparseSpMV') + + global __cusparseSpMV_bufferSize + __cusparseSpMV_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpMV_bufferSize') + if __cusparseSpMV_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + 
__cusparseSpMV_bufferSize = dlsym(handle, 'cusparseSpMV_bufferSize') + + global __cusparseSpMM + __cusparseSpMM = dlsym(RTLD_DEFAULT, 'cusparseSpMM') + if __cusparseSpMM == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMM = dlsym(handle, 'cusparseSpMM') + + global __cusparseSpMM_bufferSize + __cusparseSpMM_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpMM_bufferSize') + if __cusparseSpMM_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMM_bufferSize = dlsym(handle, 'cusparseSpMM_bufferSize') + + global __cusparseSpGEMM_createDescr + __cusparseSpGEMM_createDescr = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_createDescr') + if __cusparseSpGEMM_createDescr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMM_createDescr = dlsym(handle, 'cusparseSpGEMM_createDescr') + + global __cusparseSpGEMM_destroyDescr + __cusparseSpGEMM_destroyDescr = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_destroyDescr') + if __cusparseSpGEMM_destroyDescr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMM_destroyDescr = dlsym(handle, 'cusparseSpGEMM_destroyDescr') + + global __cusparseSpGEMM_workEstimation + __cusparseSpGEMM_workEstimation = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_workEstimation') + if __cusparseSpGEMM_workEstimation == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMM_workEstimation = dlsym(handle, 'cusparseSpGEMM_workEstimation') + + global __cusparseSpGEMM_compute + __cusparseSpGEMM_compute = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_compute') + if __cusparseSpGEMM_compute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMM_compute = dlsym(handle, 'cusparseSpGEMM_compute') + + global __cusparseSpGEMM_copy + __cusparseSpGEMM_copy = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_copy') + if __cusparseSpGEMM_copy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMM_copy = dlsym(handle, 'cusparseSpGEMM_copy') + + global __cusparseCreateCsc + __cusparseCreateCsc = dlsym(RTLD_DEFAULT, 'cusparseCreateCsc') + if __cusparseCreateCsc == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateCsc = dlsym(handle, 'cusparseCreateCsc') + + global __cusparseCscSetPointers + __cusparseCscSetPointers = dlsym(RTLD_DEFAULT, 'cusparseCscSetPointers') + if __cusparseCscSetPointers == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCscSetPointers = dlsym(handle, 'cusparseCscSetPointers') + + global __cusparseCooSetPointers + __cusparseCooSetPointers = dlsym(RTLD_DEFAULT, 'cusparseCooSetPointers') + if __cusparseCooSetPointers == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCooSetPointers = dlsym(handle, 'cusparseCooSetPointers') + + global __cusparseSparseToDense_bufferSize + __cusparseSparseToDense_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSparseToDense_bufferSize') + if __cusparseSparseToDense_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSparseToDense_bufferSize = dlsym(handle, 'cusparseSparseToDense_bufferSize') + + global __cusparseSparseToDense + __cusparseSparseToDense = dlsym(RTLD_DEFAULT, 'cusparseSparseToDense') + if __cusparseSparseToDense == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSparseToDense = dlsym(handle, 'cusparseSparseToDense') + + global __cusparseDenseToSparse_bufferSize + __cusparseDenseToSparse_bufferSize = dlsym(RTLD_DEFAULT, 
'cusparseDenseToSparse_bufferSize') + if __cusparseDenseToSparse_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDenseToSparse_bufferSize = dlsym(handle, 'cusparseDenseToSparse_bufferSize') + + global __cusparseDenseToSparse_analysis + __cusparseDenseToSparse_analysis = dlsym(RTLD_DEFAULT, 'cusparseDenseToSparse_analysis') + if __cusparseDenseToSparse_analysis == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDenseToSparse_analysis = dlsym(handle, 'cusparseDenseToSparse_analysis') + + global __cusparseDenseToSparse_convert + __cusparseDenseToSparse_convert = dlsym(RTLD_DEFAULT, 'cusparseDenseToSparse_convert') + if __cusparseDenseToSparse_convert == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseDenseToSparse_convert = dlsym(handle, 'cusparseDenseToSparse_convert') + + global __cusparseCreateBlockedEll + __cusparseCreateBlockedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateBlockedEll') + if __cusparseCreateBlockedEll == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateBlockedEll = dlsym(handle, 'cusparseCreateBlockedEll') + + global __cusparseBlockedEllGet + __cusparseBlockedEllGet = dlsym(RTLD_DEFAULT, 'cusparseBlockedEllGet') + if __cusparseBlockedEllGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseBlockedEllGet = dlsym(handle, 'cusparseBlockedEllGet') + + global __cusparseSpMM_preprocess + __cusparseSpMM_preprocess = dlsym(RTLD_DEFAULT, 'cusparseSpMM_preprocess') + if __cusparseSpMM_preprocess == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMM_preprocess = dlsym(handle, 'cusparseSpMM_preprocess') + + global __cusparseSDDMM_bufferSize + __cusparseSDDMM_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSDDMM_bufferSize') + if __cusparseSDDMM_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSDDMM_bufferSize = dlsym(handle, 'cusparseSDDMM_bufferSize') + + global __cusparseSDDMM_preprocess + __cusparseSDDMM_preprocess = dlsym(RTLD_DEFAULT, 'cusparseSDDMM_preprocess') + if __cusparseSDDMM_preprocess == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSDDMM_preprocess = dlsym(handle, 'cusparseSDDMM_preprocess') + + global __cusparseSDDMM + __cusparseSDDMM = dlsym(RTLD_DEFAULT, 'cusparseSDDMM') + if __cusparseSDDMM == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSDDMM = dlsym(handle, 'cusparseSDDMM') + + global __cusparseSpMatGetAttribute + __cusparseSpMatGetAttribute = dlsym(RTLD_DEFAULT, 'cusparseSpMatGetAttribute') + if __cusparseSpMatGetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatGetAttribute = dlsym(handle, 'cusparseSpMatGetAttribute') + + global __cusparseSpMatSetAttribute + __cusparseSpMatSetAttribute = dlsym(RTLD_DEFAULT, 'cusparseSpMatSetAttribute') + if __cusparseSpMatSetAttribute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMatSetAttribute = dlsym(handle, 'cusparseSpMatSetAttribute') + + global __cusparseSpSV_createDescr + __cusparseSpSV_createDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSV_createDescr') + if __cusparseSpSV_createDescr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSV_createDescr = dlsym(handle, 'cusparseSpSV_createDescr') + + global __cusparseSpSV_destroyDescr + __cusparseSpSV_destroyDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSV_destroyDescr') + if __cusparseSpSV_destroyDescr == NULL: + if 
handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSV_destroyDescr = dlsym(handle, 'cusparseSpSV_destroyDescr') + + global __cusparseSpSV_bufferSize + __cusparseSpSV_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpSV_bufferSize') + if __cusparseSpSV_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSV_bufferSize = dlsym(handle, 'cusparseSpSV_bufferSize') + + global __cusparseSpSV_analysis + __cusparseSpSV_analysis = dlsym(RTLD_DEFAULT, 'cusparseSpSV_analysis') + if __cusparseSpSV_analysis == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSV_analysis = dlsym(handle, 'cusparseSpSV_analysis') + + global __cusparseSpSV_solve + __cusparseSpSV_solve = dlsym(RTLD_DEFAULT, 'cusparseSpSV_solve') + if __cusparseSpSV_solve == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSV_solve = dlsym(handle, 'cusparseSpSV_solve') + + global __cusparseSpSM_createDescr + __cusparseSpSM_createDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSM_createDescr') + if __cusparseSpSM_createDescr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSM_createDescr = dlsym(handle, 'cusparseSpSM_createDescr') + + global __cusparseSpSM_destroyDescr + __cusparseSpSM_destroyDescr = dlsym(RTLD_DEFAULT, 'cusparseSpSM_destroyDescr') + if __cusparseSpSM_destroyDescr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSM_destroyDescr = dlsym(handle, 'cusparseSpSM_destroyDescr') + + global __cusparseSpSM_bufferSize + __cusparseSpSM_bufferSize = dlsym(RTLD_DEFAULT, 'cusparseSpSM_bufferSize') + if __cusparseSpSM_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSM_bufferSize = dlsym(handle, 'cusparseSpSM_bufferSize') + + global __cusparseSpSM_analysis + __cusparseSpSM_analysis = dlsym(RTLD_DEFAULT, 'cusparseSpSM_analysis') + if __cusparseSpSM_analysis == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSM_analysis = dlsym(handle, 'cusparseSpSM_analysis') + + global __cusparseSpSM_solve + __cusparseSpSM_solve = dlsym(RTLD_DEFAULT, 'cusparseSpSM_solve') + if __cusparseSpSM_solve == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSM_solve = dlsym(handle, 'cusparseSpSM_solve') + + global __cusparseSpGEMMreuse_workEstimation + __cusparseSpGEMMreuse_workEstimation = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_workEstimation') + if __cusparseSpGEMMreuse_workEstimation == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMMreuse_workEstimation = dlsym(handle, 'cusparseSpGEMMreuse_workEstimation') + + global __cusparseSpGEMMreuse_nnz + __cusparseSpGEMMreuse_nnz = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_nnz') + if __cusparseSpGEMMreuse_nnz == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMMreuse_nnz = dlsym(handle, 'cusparseSpGEMMreuse_nnz') + + global __cusparseSpGEMMreuse_copy + __cusparseSpGEMMreuse_copy = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_copy') + if __cusparseSpGEMMreuse_copy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMMreuse_copy = dlsym(handle, 'cusparseSpGEMMreuse_copy') + + global __cusparseSpGEMMreuse_compute + __cusparseSpGEMMreuse_compute = dlsym(RTLD_DEFAULT, 'cusparseSpGEMMreuse_compute') + if __cusparseSpGEMMreuse_compute == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMMreuse_compute = dlsym(handle, 'cusparseSpGEMMreuse_compute') + + global 
__cusparseLoggerSetCallback + __cusparseLoggerSetCallback = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetCallback') + if __cusparseLoggerSetCallback == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseLoggerSetCallback = dlsym(handle, 'cusparseLoggerSetCallback') + + global __cusparseLoggerSetFile + __cusparseLoggerSetFile = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetFile') + if __cusparseLoggerSetFile == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseLoggerSetFile = dlsym(handle, 'cusparseLoggerSetFile') + + global __cusparseLoggerOpenFile + __cusparseLoggerOpenFile = dlsym(RTLD_DEFAULT, 'cusparseLoggerOpenFile') + if __cusparseLoggerOpenFile == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseLoggerOpenFile = dlsym(handle, 'cusparseLoggerOpenFile') + + global __cusparseLoggerSetLevel + __cusparseLoggerSetLevel = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetLevel') + if __cusparseLoggerSetLevel == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseLoggerSetLevel = dlsym(handle, 'cusparseLoggerSetLevel') + + global __cusparseLoggerSetMask + __cusparseLoggerSetMask = dlsym(RTLD_DEFAULT, 'cusparseLoggerSetMask') + if __cusparseLoggerSetMask == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseLoggerSetMask = dlsym(handle, 'cusparseLoggerSetMask') + + global __cusparseLoggerForceDisable + __cusparseLoggerForceDisable = dlsym(RTLD_DEFAULT, 'cusparseLoggerForceDisable') + if __cusparseLoggerForceDisable == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseLoggerForceDisable = dlsym(handle, 'cusparseLoggerForceDisable') + + global __cusparseSpMMOp_createPlan + __cusparseSpMMOp_createPlan = dlsym(RTLD_DEFAULT, 'cusparseSpMMOp_createPlan') + if __cusparseSpMMOp_createPlan == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMMOp_createPlan = dlsym(handle, 'cusparseSpMMOp_createPlan') + + global __cusparseSpMMOp + __cusparseSpMMOp = dlsym(RTLD_DEFAULT, 'cusparseSpMMOp') + if __cusparseSpMMOp == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMMOp = dlsym(handle, 'cusparseSpMMOp') + + global __cusparseSpMMOp_destroyPlan + __cusparseSpMMOp_destroyPlan = dlsym(RTLD_DEFAULT, 'cusparseSpMMOp_destroyPlan') + if __cusparseSpMMOp_destroyPlan == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMMOp_destroyPlan = dlsym(handle, 'cusparseSpMMOp_destroyPlan') + + global __cusparseCscGet + __cusparseCscGet = dlsym(RTLD_DEFAULT, 'cusparseCscGet') + if __cusparseCscGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCscGet = dlsym(handle, 'cusparseCscGet') + + global __cusparseCreateConstSpVec + __cusparseCreateConstSpVec = dlsym(RTLD_DEFAULT, 'cusparseCreateConstSpVec') + if __cusparseCreateConstSpVec == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstSpVec = dlsym(handle, 'cusparseCreateConstSpVec') + + global __cusparseConstSpVecGet + __cusparseConstSpVecGet = dlsym(RTLD_DEFAULT, 'cusparseConstSpVecGet') + if __cusparseConstSpVecGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstSpVecGet = dlsym(handle, 'cusparseConstSpVecGet') + + global __cusparseConstSpVecGetValues + __cusparseConstSpVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstSpVecGetValues') + if __cusparseConstSpVecGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstSpVecGetValues = 
dlsym(handle, 'cusparseConstSpVecGetValues') + + global __cusparseCreateConstDnVec + __cusparseCreateConstDnVec = dlsym(RTLD_DEFAULT, 'cusparseCreateConstDnVec') + if __cusparseCreateConstDnVec == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstDnVec = dlsym(handle, 'cusparseCreateConstDnVec') + + global __cusparseConstDnVecGet + __cusparseConstDnVecGet = dlsym(RTLD_DEFAULT, 'cusparseConstDnVecGet') + if __cusparseConstDnVecGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstDnVecGet = dlsym(handle, 'cusparseConstDnVecGet') + + global __cusparseConstDnVecGetValues + __cusparseConstDnVecGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstDnVecGetValues') + if __cusparseConstDnVecGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstDnVecGetValues = dlsym(handle, 'cusparseConstDnVecGetValues') + + global __cusparseConstSpMatGetValues + __cusparseConstSpMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstSpMatGetValues') + if __cusparseConstSpMatGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstSpMatGetValues = dlsym(handle, 'cusparseConstSpMatGetValues') + + global __cusparseCreateConstCsr + __cusparseCreateConstCsr = dlsym(RTLD_DEFAULT, 'cusparseCreateConstCsr') + if __cusparseCreateConstCsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstCsr = dlsym(handle, 'cusparseCreateConstCsr') + + global __cusparseCreateConstCsc + __cusparseCreateConstCsc = dlsym(RTLD_DEFAULT, 'cusparseCreateConstCsc') + if __cusparseCreateConstCsc == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstCsc = dlsym(handle, 'cusparseCreateConstCsc') + + global __cusparseConstCsrGet + __cusparseConstCsrGet = dlsym(RTLD_DEFAULT, 'cusparseConstCsrGet') + if __cusparseConstCsrGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstCsrGet = dlsym(handle, 'cusparseConstCsrGet') + + global __cusparseConstCscGet + __cusparseConstCscGet = dlsym(RTLD_DEFAULT, 'cusparseConstCscGet') + if __cusparseConstCscGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstCscGet = dlsym(handle, 'cusparseConstCscGet') + + global __cusparseCreateConstCoo + __cusparseCreateConstCoo = dlsym(RTLD_DEFAULT, 'cusparseCreateConstCoo') + if __cusparseCreateConstCoo == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstCoo = dlsym(handle, 'cusparseCreateConstCoo') + + global __cusparseConstCooGet + __cusparseConstCooGet = dlsym(RTLD_DEFAULT, 'cusparseConstCooGet') + if __cusparseConstCooGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstCooGet = dlsym(handle, 'cusparseConstCooGet') + + global __cusparseCreateConstBlockedEll + __cusparseCreateConstBlockedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateConstBlockedEll') + if __cusparseCreateConstBlockedEll == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstBlockedEll = dlsym(handle, 'cusparseCreateConstBlockedEll') + + global __cusparseConstBlockedEllGet + __cusparseConstBlockedEllGet = dlsym(RTLD_DEFAULT, 'cusparseConstBlockedEllGet') + if __cusparseConstBlockedEllGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstBlockedEllGet = dlsym(handle, 'cusparseConstBlockedEllGet') + + global __cusparseCreateConstDnMat + __cusparseCreateConstDnMat = dlsym(RTLD_DEFAULT, 'cusparseCreateConstDnMat') + 
if __cusparseCreateConstDnMat == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstDnMat = dlsym(handle, 'cusparseCreateConstDnMat') + + global __cusparseConstDnMatGet + __cusparseConstDnMatGet = dlsym(RTLD_DEFAULT, 'cusparseConstDnMatGet') + if __cusparseConstDnMatGet == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstDnMatGet = dlsym(handle, 'cusparseConstDnMatGet') + + global __cusparseConstDnMatGetValues + __cusparseConstDnMatGetValues = dlsym(RTLD_DEFAULT, 'cusparseConstDnMatGetValues') + if __cusparseConstDnMatGetValues == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseConstDnMatGetValues = dlsym(handle, 'cusparseConstDnMatGetValues') + + global __cusparseSpGEMM_getNumProducts + __cusparseSpGEMM_getNumProducts = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_getNumProducts') + if __cusparseSpGEMM_getNumProducts == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMM_getNumProducts = dlsym(handle, 'cusparseSpGEMM_getNumProducts') + + global __cusparseSpGEMM_estimateMemory + __cusparseSpGEMM_estimateMemory = dlsym(RTLD_DEFAULT, 'cusparseSpGEMM_estimateMemory') + if __cusparseSpGEMM_estimateMemory == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpGEMM_estimateMemory = dlsym(handle, 'cusparseSpGEMM_estimateMemory') + + global __cusparseBsrSetStridedBatch + __cusparseBsrSetStridedBatch = dlsym(RTLD_DEFAULT, 'cusparseBsrSetStridedBatch') + if __cusparseBsrSetStridedBatch == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseBsrSetStridedBatch = dlsym(handle, 'cusparseBsrSetStridedBatch') + + global __cusparseCreateBsr + __cusparseCreateBsr = dlsym(RTLD_DEFAULT, 'cusparseCreateBsr') + if __cusparseCreateBsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateBsr = dlsym(handle, 'cusparseCreateBsr') + + global __cusparseCreateConstBsr + __cusparseCreateConstBsr = dlsym(RTLD_DEFAULT, 'cusparseCreateConstBsr') + if __cusparseCreateConstBsr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstBsr = dlsym(handle, 'cusparseCreateConstBsr') + + global __cusparseCreateSlicedEll + __cusparseCreateSlicedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateSlicedEll') + if __cusparseCreateSlicedEll == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateSlicedEll = dlsym(handle, 'cusparseCreateSlicedEll') + + global __cusparseCreateConstSlicedEll + __cusparseCreateConstSlicedEll = dlsym(RTLD_DEFAULT, 'cusparseCreateConstSlicedEll') + if __cusparseCreateConstSlicedEll == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseCreateConstSlicedEll = dlsym(handle, 'cusparseCreateConstSlicedEll') + + global __cusparseSpSV_updateMatrix + __cusparseSpSV_updateMatrix = dlsym(RTLD_DEFAULT, 'cusparseSpSV_updateMatrix') + if __cusparseSpSV_updateMatrix == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSV_updateMatrix = dlsym(handle, 'cusparseSpSV_updateMatrix') + + global __cusparseSpMV_preprocess + __cusparseSpMV_preprocess = dlsym(RTLD_DEFAULT, 'cusparseSpMV_preprocess') + if __cusparseSpMV_preprocess == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpMV_preprocess = dlsym(handle, 'cusparseSpMV_preprocess') + + global __cusparseSpSM_updateMatrix + __cusparseSpSM_updateMatrix = dlsym(RTLD_DEFAULT, 'cusparseSpSM_updateMatrix') + if __cusparseSpSM_updateMatrix == NULL: + if handle == 
NULL:
+            handle = load_library(driver_ver)
+        __cusparseSpSM_updateMatrix = dlsym(handle, 'cusparseSpSM_updateMatrix')
+
+    __py_cusparse_init = True
+    return 0
 
 
 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/cusparse_windows.pyx b/nvmath/bindings/_internal/cusparse_windows.pyx
index 2524b6a..ce45356 100644
--- a/nvmath/bindings/_internal/cusparse_windows.pyx
+++ b/nvmath/bindings/_internal/cusparse_windows.pyx
@@ -8,20 +8,76 @@
 from libc.stdint cimport intptr_t, uintptr_t
 
 import os
 import site
-
-import win32api
+import threading
 
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+from libc.stddef cimport wchar_t
+from libc.stdint cimport uintptr_t
+from cpython cimport PyUnicode_AsWideCharString, PyMem_Free
+
+from .utils import NotSupportedError
+
+cdef extern from "windows.h" nogil:
+    ctypedef void* HMODULE
+    ctypedef void* HANDLE
+    ctypedef void* FARPROC
+    ctypedef unsigned long DWORD
+    ctypedef const wchar_t *LPCWSTR
+    ctypedef const char *LPCSTR
+
+    cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+    cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+    cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+
+    HMODULE _LoadLibraryExW "LoadLibraryExW"(
+        LPCWSTR lpLibFileName,
+        HANDLE hFile,
+        DWORD dwFlags
+    )
+
+    FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName)
+
+cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags):
+    cdef uintptr_t result
+    cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL)
+    with nogil:
+        result = _LoadLibraryExW(
+            wpath,
+            hFile,
+            dwFlags
+        )
+    PyMem_Free(wpath)
+    return result
+
+cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil:
+    return _GetProcAddress(hModule, lpProcName)
+
+cdef int get_cuda_version():
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32)
+    if handle == 0:
+        raise NotSupportedError('CUDA driver is not found')
+    cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion')
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('something went wrong')
+    err = (cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError('something went wrong')
+
+    return driver_ver
+
+
 ###############################################################################
 # Wrapper init
 ###############################################################################
 
-LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+cdef object __symbol_lock = threading.Lock()
 
 cdef bint __py_cusparse_init = False
-cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __cusparseCreate = NULL
 cdef void* __cusparseDestroy = NULL
@@ -295,1564 +351,783 @@ cdef int _check_or_init_cusparse() except -1 nogil:
     if __py_cusparse_init:
         return 0
 
-    cdef int err, driver_ver
-    with gil:
-        # Load driver to check version
-        try:
-            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
-        except Exception as e:
-            raise NotSupportedError(f'CUDA driver is not found ({e})')
-        global __cuDriverGetVersion
-        if __cuDriverGetVersion == NULL:
-            __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion')
-            if __cuDriverGetVersion == NULL:
-                raise RuntimeError('something went wrong')
-        err = (__cuDriverGetVersion)(&driver_ver)
-        if err != 0:
-            raise RuntimeError('something went wrong')
+    with gil, __symbol_lock:
+        driver_ver = get_cuda_version()
 
         # Load library
         handle = load_library(driver_ver)
 
         # Load
function global __cusparseCreate - try: - __cusparseCreate = win32api.GetProcAddress(handle, 'cusparseCreate') - except: - pass + __cusparseCreate = GetProcAddress(handle, 'cusparseCreate') global __cusparseDestroy - try: - __cusparseDestroy = win32api.GetProcAddress(handle, 'cusparseDestroy') - except: - pass + __cusparseDestroy = GetProcAddress(handle, 'cusparseDestroy') global __cusparseGetVersion - try: - __cusparseGetVersion = win32api.GetProcAddress(handle, 'cusparseGetVersion') - except: - pass + __cusparseGetVersion = GetProcAddress(handle, 'cusparseGetVersion') global __cusparseGetProperty - try: - __cusparseGetProperty = win32api.GetProcAddress(handle, 'cusparseGetProperty') - except: - pass + __cusparseGetProperty = GetProcAddress(handle, 'cusparseGetProperty') global __cusparseGetErrorName - try: - __cusparseGetErrorName = win32api.GetProcAddress(handle, 'cusparseGetErrorName') - except: - pass + __cusparseGetErrorName = GetProcAddress(handle, 'cusparseGetErrorName') global __cusparseGetErrorString - try: - __cusparseGetErrorString = win32api.GetProcAddress(handle, 'cusparseGetErrorString') - except: - pass + __cusparseGetErrorString = GetProcAddress(handle, 'cusparseGetErrorString') global __cusparseSetStream - try: - __cusparseSetStream = win32api.GetProcAddress(handle, 'cusparseSetStream') - except: - pass + __cusparseSetStream = GetProcAddress(handle, 'cusparseSetStream') global __cusparseGetStream - try: - __cusparseGetStream = win32api.GetProcAddress(handle, 'cusparseGetStream') - except: - pass + __cusparseGetStream = GetProcAddress(handle, 'cusparseGetStream') global __cusparseGetPointerMode - try: - __cusparseGetPointerMode = win32api.GetProcAddress(handle, 'cusparseGetPointerMode') - except: - pass + __cusparseGetPointerMode = GetProcAddress(handle, 'cusparseGetPointerMode') global __cusparseSetPointerMode - try: - __cusparseSetPointerMode = win32api.GetProcAddress(handle, 'cusparseSetPointerMode') - except: - pass + __cusparseSetPointerMode = GetProcAddress(handle, 'cusparseSetPointerMode') global __cusparseCreateMatDescr - try: - __cusparseCreateMatDescr = win32api.GetProcAddress(handle, 'cusparseCreateMatDescr') - except: - pass + __cusparseCreateMatDescr = GetProcAddress(handle, 'cusparseCreateMatDescr') global __cusparseDestroyMatDescr - try: - __cusparseDestroyMatDescr = win32api.GetProcAddress(handle, 'cusparseDestroyMatDescr') - except: - pass + __cusparseDestroyMatDescr = GetProcAddress(handle, 'cusparseDestroyMatDescr') global __cusparseSetMatType - try: - __cusparseSetMatType = win32api.GetProcAddress(handle, 'cusparseSetMatType') - except: - pass + __cusparseSetMatType = GetProcAddress(handle, 'cusparseSetMatType') global __cusparseGetMatType - try: - __cusparseGetMatType = win32api.GetProcAddress(handle, 'cusparseGetMatType') - except: - pass + __cusparseGetMatType = GetProcAddress(handle, 'cusparseGetMatType') global __cusparseSetMatFillMode - try: - __cusparseSetMatFillMode = win32api.GetProcAddress(handle, 'cusparseSetMatFillMode') - except: - pass + __cusparseSetMatFillMode = GetProcAddress(handle, 'cusparseSetMatFillMode') global __cusparseGetMatFillMode - try: - __cusparseGetMatFillMode = win32api.GetProcAddress(handle, 'cusparseGetMatFillMode') - except: - pass + __cusparseGetMatFillMode = GetProcAddress(handle, 'cusparseGetMatFillMode') global __cusparseSetMatDiagType - try: - __cusparseSetMatDiagType = win32api.GetProcAddress(handle, 'cusparseSetMatDiagType') - except: - pass + __cusparseSetMatDiagType = GetProcAddress(handle, 
'cusparseSetMatDiagType') global __cusparseGetMatDiagType - try: - __cusparseGetMatDiagType = win32api.GetProcAddress(handle, 'cusparseGetMatDiagType') - except: - pass + __cusparseGetMatDiagType = GetProcAddress(handle, 'cusparseGetMatDiagType') global __cusparseSetMatIndexBase - try: - __cusparseSetMatIndexBase = win32api.GetProcAddress(handle, 'cusparseSetMatIndexBase') - except: - pass + __cusparseSetMatIndexBase = GetProcAddress(handle, 'cusparseSetMatIndexBase') global __cusparseGetMatIndexBase - try: - __cusparseGetMatIndexBase = win32api.GetProcAddress(handle, 'cusparseGetMatIndexBase') - except: - pass + __cusparseGetMatIndexBase = GetProcAddress(handle, 'cusparseGetMatIndexBase') global __cusparseSgemvi - try: - __cusparseSgemvi = win32api.GetProcAddress(handle, 'cusparseSgemvi') - except: - pass + __cusparseSgemvi = GetProcAddress(handle, 'cusparseSgemvi') global __cusparseSgemvi_bufferSize - try: - __cusparseSgemvi_bufferSize = win32api.GetProcAddress(handle, 'cusparseSgemvi_bufferSize') - except: - pass + __cusparseSgemvi_bufferSize = GetProcAddress(handle, 'cusparseSgemvi_bufferSize') global __cusparseDgemvi - try: - __cusparseDgemvi = win32api.GetProcAddress(handle, 'cusparseDgemvi') - except: - pass + __cusparseDgemvi = GetProcAddress(handle, 'cusparseDgemvi') global __cusparseDgemvi_bufferSize - try: - __cusparseDgemvi_bufferSize = win32api.GetProcAddress(handle, 'cusparseDgemvi_bufferSize') - except: - pass + __cusparseDgemvi_bufferSize = GetProcAddress(handle, 'cusparseDgemvi_bufferSize') global __cusparseCgemvi - try: - __cusparseCgemvi = win32api.GetProcAddress(handle, 'cusparseCgemvi') - except: - pass + __cusparseCgemvi = GetProcAddress(handle, 'cusparseCgemvi') global __cusparseCgemvi_bufferSize - try: - __cusparseCgemvi_bufferSize = win32api.GetProcAddress(handle, 'cusparseCgemvi_bufferSize') - except: - pass + __cusparseCgemvi_bufferSize = GetProcAddress(handle, 'cusparseCgemvi_bufferSize') global __cusparseZgemvi - try: - __cusparseZgemvi = win32api.GetProcAddress(handle, 'cusparseZgemvi') - except: - pass + __cusparseZgemvi = GetProcAddress(handle, 'cusparseZgemvi') global __cusparseZgemvi_bufferSize - try: - __cusparseZgemvi_bufferSize = win32api.GetProcAddress(handle, 'cusparseZgemvi_bufferSize') - except: - pass + __cusparseZgemvi_bufferSize = GetProcAddress(handle, 'cusparseZgemvi_bufferSize') global __cusparseSbsrmv - try: - __cusparseSbsrmv = win32api.GetProcAddress(handle, 'cusparseSbsrmv') - except: - pass + __cusparseSbsrmv = GetProcAddress(handle, 'cusparseSbsrmv') global __cusparseDbsrmv - try: - __cusparseDbsrmv = win32api.GetProcAddress(handle, 'cusparseDbsrmv') - except: - pass + __cusparseDbsrmv = GetProcAddress(handle, 'cusparseDbsrmv') global __cusparseCbsrmv - try: - __cusparseCbsrmv = win32api.GetProcAddress(handle, 'cusparseCbsrmv') - except: - pass + __cusparseCbsrmv = GetProcAddress(handle, 'cusparseCbsrmv') global __cusparseZbsrmv - try: - __cusparseZbsrmv = win32api.GetProcAddress(handle, 'cusparseZbsrmv') - except: - pass + __cusparseZbsrmv = GetProcAddress(handle, 'cusparseZbsrmv') global __cusparseSbsrmm - try: - __cusparseSbsrmm = win32api.GetProcAddress(handle, 'cusparseSbsrmm') - except: - pass + __cusparseSbsrmm = GetProcAddress(handle, 'cusparseSbsrmm') global __cusparseDbsrmm - try: - __cusparseDbsrmm = win32api.GetProcAddress(handle, 'cusparseDbsrmm') - except: - pass + __cusparseDbsrmm = GetProcAddress(handle, 'cusparseDbsrmm') global __cusparseCbsrmm - try: - __cusparseCbsrmm = win32api.GetProcAddress(handle, 'cusparseCbsrmm') 
- except: - pass + __cusparseCbsrmm = GetProcAddress(handle, 'cusparseCbsrmm') global __cusparseZbsrmm - try: - __cusparseZbsrmm = win32api.GetProcAddress(handle, 'cusparseZbsrmm') - except: - pass + __cusparseZbsrmm = GetProcAddress(handle, 'cusparseZbsrmm') global __cusparseSgtsv2_bufferSizeExt - try: - __cusparseSgtsv2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseSgtsv2_bufferSizeExt') - except: - pass + __cusparseSgtsv2_bufferSizeExt = GetProcAddress(handle, 'cusparseSgtsv2_bufferSizeExt') global __cusparseDgtsv2_bufferSizeExt - try: - __cusparseDgtsv2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDgtsv2_bufferSizeExt') - except: - pass + __cusparseDgtsv2_bufferSizeExt = GetProcAddress(handle, 'cusparseDgtsv2_bufferSizeExt') global __cusparseCgtsv2_bufferSizeExt - try: - __cusparseCgtsv2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCgtsv2_bufferSizeExt') - except: - pass + __cusparseCgtsv2_bufferSizeExt = GetProcAddress(handle, 'cusparseCgtsv2_bufferSizeExt') global __cusparseZgtsv2_bufferSizeExt - try: - __cusparseZgtsv2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZgtsv2_bufferSizeExt') - except: - pass + __cusparseZgtsv2_bufferSizeExt = GetProcAddress(handle, 'cusparseZgtsv2_bufferSizeExt') global __cusparseSgtsv2 - try: - __cusparseSgtsv2 = win32api.GetProcAddress(handle, 'cusparseSgtsv2') - except: - pass + __cusparseSgtsv2 = GetProcAddress(handle, 'cusparseSgtsv2') global __cusparseDgtsv2 - try: - __cusparseDgtsv2 = win32api.GetProcAddress(handle, 'cusparseDgtsv2') - except: - pass + __cusparseDgtsv2 = GetProcAddress(handle, 'cusparseDgtsv2') global __cusparseCgtsv2 - try: - __cusparseCgtsv2 = win32api.GetProcAddress(handle, 'cusparseCgtsv2') - except: - pass + __cusparseCgtsv2 = GetProcAddress(handle, 'cusparseCgtsv2') global __cusparseZgtsv2 - try: - __cusparseZgtsv2 = win32api.GetProcAddress(handle, 'cusparseZgtsv2') - except: - pass + __cusparseZgtsv2 = GetProcAddress(handle, 'cusparseZgtsv2') global __cusparseSgtsv2_nopivot_bufferSizeExt - try: - __cusparseSgtsv2_nopivot_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseSgtsv2_nopivot_bufferSizeExt') - except: - pass + __cusparseSgtsv2_nopivot_bufferSizeExt = GetProcAddress(handle, 'cusparseSgtsv2_nopivot_bufferSizeExt') global __cusparseDgtsv2_nopivot_bufferSizeExt - try: - __cusparseDgtsv2_nopivot_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDgtsv2_nopivot_bufferSizeExt') - except: - pass + __cusparseDgtsv2_nopivot_bufferSizeExt = GetProcAddress(handle, 'cusparseDgtsv2_nopivot_bufferSizeExt') global __cusparseCgtsv2_nopivot_bufferSizeExt - try: - __cusparseCgtsv2_nopivot_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCgtsv2_nopivot_bufferSizeExt') - except: - pass + __cusparseCgtsv2_nopivot_bufferSizeExt = GetProcAddress(handle, 'cusparseCgtsv2_nopivot_bufferSizeExt') global __cusparseZgtsv2_nopivot_bufferSizeExt - try: - __cusparseZgtsv2_nopivot_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZgtsv2_nopivot_bufferSizeExt') - except: - pass + __cusparseZgtsv2_nopivot_bufferSizeExt = GetProcAddress(handle, 'cusparseZgtsv2_nopivot_bufferSizeExt') global __cusparseSgtsv2_nopivot - try: - __cusparseSgtsv2_nopivot = win32api.GetProcAddress(handle, 'cusparseSgtsv2_nopivot') - except: - pass + __cusparseSgtsv2_nopivot = GetProcAddress(handle, 'cusparseSgtsv2_nopivot') global __cusparseDgtsv2_nopivot - try: - __cusparseDgtsv2_nopivot = win32api.GetProcAddress(handle, 'cusparseDgtsv2_nopivot') - except: - pass + __cusparseDgtsv2_nopivot = 
GetProcAddress(handle, 'cusparseDgtsv2_nopivot') global __cusparseCgtsv2_nopivot - try: - __cusparseCgtsv2_nopivot = win32api.GetProcAddress(handle, 'cusparseCgtsv2_nopivot') - except: - pass + __cusparseCgtsv2_nopivot = GetProcAddress(handle, 'cusparseCgtsv2_nopivot') global __cusparseZgtsv2_nopivot - try: - __cusparseZgtsv2_nopivot = win32api.GetProcAddress(handle, 'cusparseZgtsv2_nopivot') - except: - pass + __cusparseZgtsv2_nopivot = GetProcAddress(handle, 'cusparseZgtsv2_nopivot') global __cusparseSgtsv2StridedBatch_bufferSizeExt - try: - __cusparseSgtsv2StridedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseSgtsv2StridedBatch_bufferSizeExt') - except: - pass + __cusparseSgtsv2StridedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseSgtsv2StridedBatch_bufferSizeExt') global __cusparseDgtsv2StridedBatch_bufferSizeExt - try: - __cusparseDgtsv2StridedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDgtsv2StridedBatch_bufferSizeExt') - except: - pass + __cusparseDgtsv2StridedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseDgtsv2StridedBatch_bufferSizeExt') global __cusparseCgtsv2StridedBatch_bufferSizeExt - try: - __cusparseCgtsv2StridedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCgtsv2StridedBatch_bufferSizeExt') - except: - pass + __cusparseCgtsv2StridedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseCgtsv2StridedBatch_bufferSizeExt') global __cusparseZgtsv2StridedBatch_bufferSizeExt - try: - __cusparseZgtsv2StridedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZgtsv2StridedBatch_bufferSizeExt') - except: - pass + __cusparseZgtsv2StridedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseZgtsv2StridedBatch_bufferSizeExt') global __cusparseSgtsv2StridedBatch - try: - __cusparseSgtsv2StridedBatch = win32api.GetProcAddress(handle, 'cusparseSgtsv2StridedBatch') - except: - pass + __cusparseSgtsv2StridedBatch = GetProcAddress(handle, 'cusparseSgtsv2StridedBatch') global __cusparseDgtsv2StridedBatch - try: - __cusparseDgtsv2StridedBatch = win32api.GetProcAddress(handle, 'cusparseDgtsv2StridedBatch') - except: - pass + __cusparseDgtsv2StridedBatch = GetProcAddress(handle, 'cusparseDgtsv2StridedBatch') global __cusparseCgtsv2StridedBatch - try: - __cusparseCgtsv2StridedBatch = win32api.GetProcAddress(handle, 'cusparseCgtsv2StridedBatch') - except: - pass + __cusparseCgtsv2StridedBatch = GetProcAddress(handle, 'cusparseCgtsv2StridedBatch') global __cusparseZgtsv2StridedBatch - try: - __cusparseZgtsv2StridedBatch = win32api.GetProcAddress(handle, 'cusparseZgtsv2StridedBatch') - except: - pass + __cusparseZgtsv2StridedBatch = GetProcAddress(handle, 'cusparseZgtsv2StridedBatch') global __cusparseSgtsvInterleavedBatch_bufferSizeExt - try: - __cusparseSgtsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseSgtsvInterleavedBatch_bufferSizeExt') - except: - pass + __cusparseSgtsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseSgtsvInterleavedBatch_bufferSizeExt') global __cusparseDgtsvInterleavedBatch_bufferSizeExt - try: - __cusparseDgtsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDgtsvInterleavedBatch_bufferSizeExt') - except: - pass + __cusparseDgtsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseDgtsvInterleavedBatch_bufferSizeExt') global __cusparseCgtsvInterleavedBatch_bufferSizeExt - try: - __cusparseCgtsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCgtsvInterleavedBatch_bufferSizeExt') - except: - 
pass + __cusparseCgtsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseCgtsvInterleavedBatch_bufferSizeExt') global __cusparseZgtsvInterleavedBatch_bufferSizeExt - try: - __cusparseZgtsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZgtsvInterleavedBatch_bufferSizeExt') - except: - pass + __cusparseZgtsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseZgtsvInterleavedBatch_bufferSizeExt') global __cusparseSgtsvInterleavedBatch - try: - __cusparseSgtsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseSgtsvInterleavedBatch') - except: - pass + __cusparseSgtsvInterleavedBatch = GetProcAddress(handle, 'cusparseSgtsvInterleavedBatch') global __cusparseDgtsvInterleavedBatch - try: - __cusparseDgtsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseDgtsvInterleavedBatch') - except: - pass + __cusparseDgtsvInterleavedBatch = GetProcAddress(handle, 'cusparseDgtsvInterleavedBatch') global __cusparseCgtsvInterleavedBatch - try: - __cusparseCgtsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseCgtsvInterleavedBatch') - except: - pass + __cusparseCgtsvInterleavedBatch = GetProcAddress(handle, 'cusparseCgtsvInterleavedBatch') global __cusparseZgtsvInterleavedBatch - try: - __cusparseZgtsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseZgtsvInterleavedBatch') - except: - pass + __cusparseZgtsvInterleavedBatch = GetProcAddress(handle, 'cusparseZgtsvInterleavedBatch') global __cusparseSgpsvInterleavedBatch_bufferSizeExt - try: - __cusparseSgpsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseSgpsvInterleavedBatch_bufferSizeExt') - except: - pass + __cusparseSgpsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseSgpsvInterleavedBatch_bufferSizeExt') global __cusparseDgpsvInterleavedBatch_bufferSizeExt - try: - __cusparseDgpsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDgpsvInterleavedBatch_bufferSizeExt') - except: - pass + __cusparseDgpsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseDgpsvInterleavedBatch_bufferSizeExt') global __cusparseCgpsvInterleavedBatch_bufferSizeExt - try: - __cusparseCgpsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCgpsvInterleavedBatch_bufferSizeExt') - except: - pass + __cusparseCgpsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseCgpsvInterleavedBatch_bufferSizeExt') global __cusparseZgpsvInterleavedBatch_bufferSizeExt - try: - __cusparseZgpsvInterleavedBatch_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZgpsvInterleavedBatch_bufferSizeExt') - except: - pass + __cusparseZgpsvInterleavedBatch_bufferSizeExt = GetProcAddress(handle, 'cusparseZgpsvInterleavedBatch_bufferSizeExt') global __cusparseSgpsvInterleavedBatch - try: - __cusparseSgpsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseSgpsvInterleavedBatch') - except: - pass + __cusparseSgpsvInterleavedBatch = GetProcAddress(handle, 'cusparseSgpsvInterleavedBatch') global __cusparseDgpsvInterleavedBatch - try: - __cusparseDgpsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseDgpsvInterleavedBatch') - except: - pass + __cusparseDgpsvInterleavedBatch = GetProcAddress(handle, 'cusparseDgpsvInterleavedBatch') global __cusparseCgpsvInterleavedBatch - try: - __cusparseCgpsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseCgpsvInterleavedBatch') - except: - pass + __cusparseCgpsvInterleavedBatch = GetProcAddress(handle, 'cusparseCgpsvInterleavedBatch') global 
__cusparseZgpsvInterleavedBatch - try: - __cusparseZgpsvInterleavedBatch = win32api.GetProcAddress(handle, 'cusparseZgpsvInterleavedBatch') - except: - pass + __cusparseZgpsvInterleavedBatch = GetProcAddress(handle, 'cusparseZgpsvInterleavedBatch') global __cusparseScsrgeam2_bufferSizeExt - try: - __cusparseScsrgeam2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseScsrgeam2_bufferSizeExt') - except: - pass + __cusparseScsrgeam2_bufferSizeExt = GetProcAddress(handle, 'cusparseScsrgeam2_bufferSizeExt') global __cusparseDcsrgeam2_bufferSizeExt - try: - __cusparseDcsrgeam2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDcsrgeam2_bufferSizeExt') - except: - pass + __cusparseDcsrgeam2_bufferSizeExt = GetProcAddress(handle, 'cusparseDcsrgeam2_bufferSizeExt') global __cusparseCcsrgeam2_bufferSizeExt - try: - __cusparseCcsrgeam2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCcsrgeam2_bufferSizeExt') - except: - pass + __cusparseCcsrgeam2_bufferSizeExt = GetProcAddress(handle, 'cusparseCcsrgeam2_bufferSizeExt') global __cusparseZcsrgeam2_bufferSizeExt - try: - __cusparseZcsrgeam2_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZcsrgeam2_bufferSizeExt') - except: - pass + __cusparseZcsrgeam2_bufferSizeExt = GetProcAddress(handle, 'cusparseZcsrgeam2_bufferSizeExt') global __cusparseXcsrgeam2Nnz - try: - __cusparseXcsrgeam2Nnz = win32api.GetProcAddress(handle, 'cusparseXcsrgeam2Nnz') - except: - pass + __cusparseXcsrgeam2Nnz = GetProcAddress(handle, 'cusparseXcsrgeam2Nnz') global __cusparseScsrgeam2 - try: - __cusparseScsrgeam2 = win32api.GetProcAddress(handle, 'cusparseScsrgeam2') - except: - pass + __cusparseScsrgeam2 = GetProcAddress(handle, 'cusparseScsrgeam2') global __cusparseDcsrgeam2 - try: - __cusparseDcsrgeam2 = win32api.GetProcAddress(handle, 'cusparseDcsrgeam2') - except: - pass + __cusparseDcsrgeam2 = GetProcAddress(handle, 'cusparseDcsrgeam2') global __cusparseCcsrgeam2 - try: - __cusparseCcsrgeam2 = win32api.GetProcAddress(handle, 'cusparseCcsrgeam2') - except: - pass + __cusparseCcsrgeam2 = GetProcAddress(handle, 'cusparseCcsrgeam2') global __cusparseZcsrgeam2 - try: - __cusparseZcsrgeam2 = win32api.GetProcAddress(handle, 'cusparseZcsrgeam2') - except: - pass + __cusparseZcsrgeam2 = GetProcAddress(handle, 'cusparseZcsrgeam2') global __cusparseSnnz - try: - __cusparseSnnz = win32api.GetProcAddress(handle, 'cusparseSnnz') - except: - pass + __cusparseSnnz = GetProcAddress(handle, 'cusparseSnnz') global __cusparseDnnz - try: - __cusparseDnnz = win32api.GetProcAddress(handle, 'cusparseDnnz') - except: - pass + __cusparseDnnz = GetProcAddress(handle, 'cusparseDnnz') global __cusparseCnnz - try: - __cusparseCnnz = win32api.GetProcAddress(handle, 'cusparseCnnz') - except: - pass + __cusparseCnnz = GetProcAddress(handle, 'cusparseCnnz') global __cusparseZnnz - try: - __cusparseZnnz = win32api.GetProcAddress(handle, 'cusparseZnnz') - except: - pass + __cusparseZnnz = GetProcAddress(handle, 'cusparseZnnz') global __cusparseXcoo2csr - try: - __cusparseXcoo2csr = win32api.GetProcAddress(handle, 'cusparseXcoo2csr') - except: - pass + __cusparseXcoo2csr = GetProcAddress(handle, 'cusparseXcoo2csr') global __cusparseXcsr2coo - try: - __cusparseXcsr2coo = win32api.GetProcAddress(handle, 'cusparseXcsr2coo') - except: - pass + __cusparseXcsr2coo = GetProcAddress(handle, 'cusparseXcsr2coo') global __cusparseSbsr2csr - try: - __cusparseSbsr2csr = win32api.GetProcAddress(handle, 'cusparseSbsr2csr') - except: - pass + __cusparseSbsr2csr = GetProcAddress(handle, 
'cusparseSbsr2csr') global __cusparseDbsr2csr - try: - __cusparseDbsr2csr = win32api.GetProcAddress(handle, 'cusparseDbsr2csr') - except: - pass + __cusparseDbsr2csr = GetProcAddress(handle, 'cusparseDbsr2csr') global __cusparseCbsr2csr - try: - __cusparseCbsr2csr = win32api.GetProcAddress(handle, 'cusparseCbsr2csr') - except: - pass + __cusparseCbsr2csr = GetProcAddress(handle, 'cusparseCbsr2csr') global __cusparseZbsr2csr - try: - __cusparseZbsr2csr = win32api.GetProcAddress(handle, 'cusparseZbsr2csr') - except: - pass + __cusparseZbsr2csr = GetProcAddress(handle, 'cusparseZbsr2csr') global __cusparseSgebsr2gebsc_bufferSize - try: - __cusparseSgebsr2gebsc_bufferSize = win32api.GetProcAddress(handle, 'cusparseSgebsr2gebsc_bufferSize') - except: - pass + __cusparseSgebsr2gebsc_bufferSize = GetProcAddress(handle, 'cusparseSgebsr2gebsc_bufferSize') global __cusparseDgebsr2gebsc_bufferSize - try: - __cusparseDgebsr2gebsc_bufferSize = win32api.GetProcAddress(handle, 'cusparseDgebsr2gebsc_bufferSize') - except: - pass + __cusparseDgebsr2gebsc_bufferSize = GetProcAddress(handle, 'cusparseDgebsr2gebsc_bufferSize') global __cusparseCgebsr2gebsc_bufferSize - try: - __cusparseCgebsr2gebsc_bufferSize = win32api.GetProcAddress(handle, 'cusparseCgebsr2gebsc_bufferSize') - except: - pass + __cusparseCgebsr2gebsc_bufferSize = GetProcAddress(handle, 'cusparseCgebsr2gebsc_bufferSize') global __cusparseZgebsr2gebsc_bufferSize - try: - __cusparseZgebsr2gebsc_bufferSize = win32api.GetProcAddress(handle, 'cusparseZgebsr2gebsc_bufferSize') - except: - pass + __cusparseZgebsr2gebsc_bufferSize = GetProcAddress(handle, 'cusparseZgebsr2gebsc_bufferSize') global __cusparseSgebsr2gebsc_bufferSizeExt - try: - __cusparseSgebsr2gebsc_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseSgebsr2gebsc_bufferSizeExt') - except: - pass + __cusparseSgebsr2gebsc_bufferSizeExt = GetProcAddress(handle, 'cusparseSgebsr2gebsc_bufferSizeExt') global __cusparseDgebsr2gebsc_bufferSizeExt - try: - __cusparseDgebsr2gebsc_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDgebsr2gebsc_bufferSizeExt') - except: - pass + __cusparseDgebsr2gebsc_bufferSizeExt = GetProcAddress(handle, 'cusparseDgebsr2gebsc_bufferSizeExt') global __cusparseCgebsr2gebsc_bufferSizeExt - try: - __cusparseCgebsr2gebsc_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCgebsr2gebsc_bufferSizeExt') - except: - pass + __cusparseCgebsr2gebsc_bufferSizeExt = GetProcAddress(handle, 'cusparseCgebsr2gebsc_bufferSizeExt') global __cusparseZgebsr2gebsc_bufferSizeExt - try: - __cusparseZgebsr2gebsc_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZgebsr2gebsc_bufferSizeExt') - except: - pass + __cusparseZgebsr2gebsc_bufferSizeExt = GetProcAddress(handle, 'cusparseZgebsr2gebsc_bufferSizeExt') global __cusparseSgebsr2gebsc - try: - __cusparseSgebsr2gebsc = win32api.GetProcAddress(handle, 'cusparseSgebsr2gebsc') - except: - pass + __cusparseSgebsr2gebsc = GetProcAddress(handle, 'cusparseSgebsr2gebsc') global __cusparseDgebsr2gebsc - try: - __cusparseDgebsr2gebsc = win32api.GetProcAddress(handle, 'cusparseDgebsr2gebsc') - except: - pass + __cusparseDgebsr2gebsc = GetProcAddress(handle, 'cusparseDgebsr2gebsc') global __cusparseCgebsr2gebsc - try: - __cusparseCgebsr2gebsc = win32api.GetProcAddress(handle, 'cusparseCgebsr2gebsc') - except: - pass + __cusparseCgebsr2gebsc = GetProcAddress(handle, 'cusparseCgebsr2gebsc') global __cusparseZgebsr2gebsc - try: - __cusparseZgebsr2gebsc = win32api.GetProcAddress(handle, 'cusparseZgebsr2gebsc') - except: - pass + 
__cusparseZgebsr2gebsc = GetProcAddress(handle, 'cusparseZgebsr2gebsc') global __cusparseScsr2gebsr_bufferSize - try: - __cusparseScsr2gebsr_bufferSize = win32api.GetProcAddress(handle, 'cusparseScsr2gebsr_bufferSize') - except: - pass + __cusparseScsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseScsr2gebsr_bufferSize') global __cusparseDcsr2gebsr_bufferSize - try: - __cusparseDcsr2gebsr_bufferSize = win32api.GetProcAddress(handle, 'cusparseDcsr2gebsr_bufferSize') - except: - pass + __cusparseDcsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseDcsr2gebsr_bufferSize') global __cusparseCcsr2gebsr_bufferSize - try: - __cusparseCcsr2gebsr_bufferSize = win32api.GetProcAddress(handle, 'cusparseCcsr2gebsr_bufferSize') - except: - pass + __cusparseCcsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseCcsr2gebsr_bufferSize') global __cusparseZcsr2gebsr_bufferSize - try: - __cusparseZcsr2gebsr_bufferSize = win32api.GetProcAddress(handle, 'cusparseZcsr2gebsr_bufferSize') - except: - pass + __cusparseZcsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseZcsr2gebsr_bufferSize') global __cusparseScsr2gebsr_bufferSizeExt - try: - __cusparseScsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseScsr2gebsr_bufferSizeExt') - except: - pass + __cusparseScsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseScsr2gebsr_bufferSizeExt') global __cusparseDcsr2gebsr_bufferSizeExt - try: - __cusparseDcsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDcsr2gebsr_bufferSizeExt') - except: - pass + __cusparseDcsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseDcsr2gebsr_bufferSizeExt') global __cusparseCcsr2gebsr_bufferSizeExt - try: - __cusparseCcsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCcsr2gebsr_bufferSizeExt') - except: - pass + __cusparseCcsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseCcsr2gebsr_bufferSizeExt') global __cusparseZcsr2gebsr_bufferSizeExt - try: - __cusparseZcsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZcsr2gebsr_bufferSizeExt') - except: - pass + __cusparseZcsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseZcsr2gebsr_bufferSizeExt') global __cusparseXcsr2gebsrNnz - try: - __cusparseXcsr2gebsrNnz = win32api.GetProcAddress(handle, 'cusparseXcsr2gebsrNnz') - except: - pass + __cusparseXcsr2gebsrNnz = GetProcAddress(handle, 'cusparseXcsr2gebsrNnz') global __cusparseScsr2gebsr - try: - __cusparseScsr2gebsr = win32api.GetProcAddress(handle, 'cusparseScsr2gebsr') - except: - pass + __cusparseScsr2gebsr = GetProcAddress(handle, 'cusparseScsr2gebsr') global __cusparseDcsr2gebsr - try: - __cusparseDcsr2gebsr = win32api.GetProcAddress(handle, 'cusparseDcsr2gebsr') - except: - pass + __cusparseDcsr2gebsr = GetProcAddress(handle, 'cusparseDcsr2gebsr') global __cusparseCcsr2gebsr - try: - __cusparseCcsr2gebsr = win32api.GetProcAddress(handle, 'cusparseCcsr2gebsr') - except: - pass + __cusparseCcsr2gebsr = GetProcAddress(handle, 'cusparseCcsr2gebsr') global __cusparseZcsr2gebsr - try: - __cusparseZcsr2gebsr = win32api.GetProcAddress(handle, 'cusparseZcsr2gebsr') - except: - pass + __cusparseZcsr2gebsr = GetProcAddress(handle, 'cusparseZcsr2gebsr') global __cusparseSgebsr2gebsr_bufferSize - try: - __cusparseSgebsr2gebsr_bufferSize = win32api.GetProcAddress(handle, 'cusparseSgebsr2gebsr_bufferSize') - except: - pass + __cusparseSgebsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseSgebsr2gebsr_bufferSize') global __cusparseDgebsr2gebsr_bufferSize - try: - __cusparseDgebsr2gebsr_bufferSize = 
win32api.GetProcAddress(handle, 'cusparseDgebsr2gebsr_bufferSize') - except: - pass + __cusparseDgebsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseDgebsr2gebsr_bufferSize') global __cusparseCgebsr2gebsr_bufferSize - try: - __cusparseCgebsr2gebsr_bufferSize = win32api.GetProcAddress(handle, 'cusparseCgebsr2gebsr_bufferSize') - except: - pass + __cusparseCgebsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseCgebsr2gebsr_bufferSize') global __cusparseZgebsr2gebsr_bufferSize - try: - __cusparseZgebsr2gebsr_bufferSize = win32api.GetProcAddress(handle, 'cusparseZgebsr2gebsr_bufferSize') - except: - pass + __cusparseZgebsr2gebsr_bufferSize = GetProcAddress(handle, 'cusparseZgebsr2gebsr_bufferSize') global __cusparseSgebsr2gebsr_bufferSizeExt - try: - __cusparseSgebsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseSgebsr2gebsr_bufferSizeExt') - except: - pass + __cusparseSgebsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseSgebsr2gebsr_bufferSizeExt') global __cusparseDgebsr2gebsr_bufferSizeExt - try: - __cusparseDgebsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseDgebsr2gebsr_bufferSizeExt') - except: - pass + __cusparseDgebsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseDgebsr2gebsr_bufferSizeExt') global __cusparseCgebsr2gebsr_bufferSizeExt - try: - __cusparseCgebsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseCgebsr2gebsr_bufferSizeExt') - except: - pass + __cusparseCgebsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseCgebsr2gebsr_bufferSizeExt') global __cusparseZgebsr2gebsr_bufferSizeExt - try: - __cusparseZgebsr2gebsr_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseZgebsr2gebsr_bufferSizeExt') - except: - pass + __cusparseZgebsr2gebsr_bufferSizeExt = GetProcAddress(handle, 'cusparseZgebsr2gebsr_bufferSizeExt') global __cusparseXgebsr2gebsrNnz - try: - __cusparseXgebsr2gebsrNnz = win32api.GetProcAddress(handle, 'cusparseXgebsr2gebsrNnz') - except: - pass + __cusparseXgebsr2gebsrNnz = GetProcAddress(handle, 'cusparseXgebsr2gebsrNnz') global __cusparseSgebsr2gebsr - try: - __cusparseSgebsr2gebsr = win32api.GetProcAddress(handle, 'cusparseSgebsr2gebsr') - except: - pass + __cusparseSgebsr2gebsr = GetProcAddress(handle, 'cusparseSgebsr2gebsr') global __cusparseDgebsr2gebsr - try: - __cusparseDgebsr2gebsr = win32api.GetProcAddress(handle, 'cusparseDgebsr2gebsr') - except: - pass + __cusparseDgebsr2gebsr = GetProcAddress(handle, 'cusparseDgebsr2gebsr') global __cusparseCgebsr2gebsr - try: - __cusparseCgebsr2gebsr = win32api.GetProcAddress(handle, 'cusparseCgebsr2gebsr') - except: - pass + __cusparseCgebsr2gebsr = GetProcAddress(handle, 'cusparseCgebsr2gebsr') global __cusparseZgebsr2gebsr - try: - __cusparseZgebsr2gebsr = win32api.GetProcAddress(handle, 'cusparseZgebsr2gebsr') - except: - pass + __cusparseZgebsr2gebsr = GetProcAddress(handle, 'cusparseZgebsr2gebsr') global __cusparseXcoosort_bufferSizeExt - try: - __cusparseXcoosort_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseXcoosort_bufferSizeExt') - except: - pass + __cusparseXcoosort_bufferSizeExt = GetProcAddress(handle, 'cusparseXcoosort_bufferSizeExt') global __cusparseXcoosortByRow - try: - __cusparseXcoosortByRow = win32api.GetProcAddress(handle, 'cusparseXcoosortByRow') - except: - pass + __cusparseXcoosortByRow = GetProcAddress(handle, 'cusparseXcoosortByRow') global __cusparseXcoosortByColumn - try: - __cusparseXcoosortByColumn = win32api.GetProcAddress(handle, 'cusparseXcoosortByColumn') - except: - pass + 
__cusparseXcoosortByColumn = GetProcAddress(handle, 'cusparseXcoosortByColumn') global __cusparseXcsrsort_bufferSizeExt - try: - __cusparseXcsrsort_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseXcsrsort_bufferSizeExt') - except: - pass + __cusparseXcsrsort_bufferSizeExt = GetProcAddress(handle, 'cusparseXcsrsort_bufferSizeExt') global __cusparseXcsrsort - try: - __cusparseXcsrsort = win32api.GetProcAddress(handle, 'cusparseXcsrsort') - except: - pass + __cusparseXcsrsort = GetProcAddress(handle, 'cusparseXcsrsort') global __cusparseXcscsort_bufferSizeExt - try: - __cusparseXcscsort_bufferSizeExt = win32api.GetProcAddress(handle, 'cusparseXcscsort_bufferSizeExt') - except: - pass + __cusparseXcscsort_bufferSizeExt = GetProcAddress(handle, 'cusparseXcscsort_bufferSizeExt') global __cusparseXcscsort - try: - __cusparseXcscsort = win32api.GetProcAddress(handle, 'cusparseXcscsort') - except: - pass + __cusparseXcscsort = GetProcAddress(handle, 'cusparseXcscsort') global __cusparseCsr2cscEx2 - try: - __cusparseCsr2cscEx2 = win32api.GetProcAddress(handle, 'cusparseCsr2cscEx2') - except: - pass + __cusparseCsr2cscEx2 = GetProcAddress(handle, 'cusparseCsr2cscEx2') global __cusparseCsr2cscEx2_bufferSize - try: - __cusparseCsr2cscEx2_bufferSize = win32api.GetProcAddress(handle, 'cusparseCsr2cscEx2_bufferSize') - except: - pass + __cusparseCsr2cscEx2_bufferSize = GetProcAddress(handle, 'cusparseCsr2cscEx2_bufferSize') global __cusparseCreateSpVec - try: - __cusparseCreateSpVec = win32api.GetProcAddress(handle, 'cusparseCreateSpVec') - except: - pass + __cusparseCreateSpVec = GetProcAddress(handle, 'cusparseCreateSpVec') global __cusparseDestroySpVec - try: - __cusparseDestroySpVec = win32api.GetProcAddress(handle, 'cusparseDestroySpVec') - except: - pass + __cusparseDestroySpVec = GetProcAddress(handle, 'cusparseDestroySpVec') global __cusparseSpVecGet - try: - __cusparseSpVecGet = win32api.GetProcAddress(handle, 'cusparseSpVecGet') - except: - pass + __cusparseSpVecGet = GetProcAddress(handle, 'cusparseSpVecGet') global __cusparseSpVecGetIndexBase - try: - __cusparseSpVecGetIndexBase = win32api.GetProcAddress(handle, 'cusparseSpVecGetIndexBase') - except: - pass + __cusparseSpVecGetIndexBase = GetProcAddress(handle, 'cusparseSpVecGetIndexBase') global __cusparseSpVecGetValues - try: - __cusparseSpVecGetValues = win32api.GetProcAddress(handle, 'cusparseSpVecGetValues') - except: - pass + __cusparseSpVecGetValues = GetProcAddress(handle, 'cusparseSpVecGetValues') global __cusparseSpVecSetValues - try: - __cusparseSpVecSetValues = win32api.GetProcAddress(handle, 'cusparseSpVecSetValues') - except: - pass + __cusparseSpVecSetValues = GetProcAddress(handle, 'cusparseSpVecSetValues') global __cusparseCreateDnVec - try: - __cusparseCreateDnVec = win32api.GetProcAddress(handle, 'cusparseCreateDnVec') - except: - pass + __cusparseCreateDnVec = GetProcAddress(handle, 'cusparseCreateDnVec') global __cusparseDestroyDnVec - try: - __cusparseDestroyDnVec = win32api.GetProcAddress(handle, 'cusparseDestroyDnVec') - except: - pass + __cusparseDestroyDnVec = GetProcAddress(handle, 'cusparseDestroyDnVec') global __cusparseDnVecGet - try: - __cusparseDnVecGet = win32api.GetProcAddress(handle, 'cusparseDnVecGet') - except: - pass + __cusparseDnVecGet = GetProcAddress(handle, 'cusparseDnVecGet') global __cusparseDnVecGetValues - try: - __cusparseDnVecGetValues = win32api.GetProcAddress(handle, 'cusparseDnVecGetValues') - except: - pass + __cusparseDnVecGetValues = GetProcAddress(handle, 
'cusparseDnVecGetValues') global __cusparseDnVecSetValues - try: - __cusparseDnVecSetValues = win32api.GetProcAddress(handle, 'cusparseDnVecSetValues') - except: - pass + __cusparseDnVecSetValues = GetProcAddress(handle, 'cusparseDnVecSetValues') global __cusparseDestroySpMat - try: - __cusparseDestroySpMat = win32api.GetProcAddress(handle, 'cusparseDestroySpMat') - except: - pass + __cusparseDestroySpMat = GetProcAddress(handle, 'cusparseDestroySpMat') global __cusparseSpMatGetFormat - try: - __cusparseSpMatGetFormat = win32api.GetProcAddress(handle, 'cusparseSpMatGetFormat') - except: - pass + __cusparseSpMatGetFormat = GetProcAddress(handle, 'cusparseSpMatGetFormat') global __cusparseSpMatGetIndexBase - try: - __cusparseSpMatGetIndexBase = win32api.GetProcAddress(handle, 'cusparseSpMatGetIndexBase') - except: - pass + __cusparseSpMatGetIndexBase = GetProcAddress(handle, 'cusparseSpMatGetIndexBase') global __cusparseSpMatGetValues - try: - __cusparseSpMatGetValues = win32api.GetProcAddress(handle, 'cusparseSpMatGetValues') - except: - pass + __cusparseSpMatGetValues = GetProcAddress(handle, 'cusparseSpMatGetValues') global __cusparseSpMatSetValues - try: - __cusparseSpMatSetValues = win32api.GetProcAddress(handle, 'cusparseSpMatSetValues') - except: - pass + __cusparseSpMatSetValues = GetProcAddress(handle, 'cusparseSpMatSetValues') global __cusparseSpMatGetSize - try: - __cusparseSpMatGetSize = win32api.GetProcAddress(handle, 'cusparseSpMatGetSize') - except: - pass + __cusparseSpMatGetSize = GetProcAddress(handle, 'cusparseSpMatGetSize') global __cusparseSpMatGetStridedBatch - try: - __cusparseSpMatGetStridedBatch = win32api.GetProcAddress(handle, 'cusparseSpMatGetStridedBatch') - except: - pass + __cusparseSpMatGetStridedBatch = GetProcAddress(handle, 'cusparseSpMatGetStridedBatch') global __cusparseCooSetStridedBatch - try: - __cusparseCooSetStridedBatch = win32api.GetProcAddress(handle, 'cusparseCooSetStridedBatch') - except: - pass + __cusparseCooSetStridedBatch = GetProcAddress(handle, 'cusparseCooSetStridedBatch') global __cusparseCsrSetStridedBatch - try: - __cusparseCsrSetStridedBatch = win32api.GetProcAddress(handle, 'cusparseCsrSetStridedBatch') - except: - pass + __cusparseCsrSetStridedBatch = GetProcAddress(handle, 'cusparseCsrSetStridedBatch') global __cusparseCreateCsr - try: - __cusparseCreateCsr = win32api.GetProcAddress(handle, 'cusparseCreateCsr') - except: - pass + __cusparseCreateCsr = GetProcAddress(handle, 'cusparseCreateCsr') global __cusparseCsrGet - try: - __cusparseCsrGet = win32api.GetProcAddress(handle, 'cusparseCsrGet') - except: - pass + __cusparseCsrGet = GetProcAddress(handle, 'cusparseCsrGet') global __cusparseCsrSetPointers - try: - __cusparseCsrSetPointers = win32api.GetProcAddress(handle, 'cusparseCsrSetPointers') - except: - pass + __cusparseCsrSetPointers = GetProcAddress(handle, 'cusparseCsrSetPointers') global __cusparseCreateCoo - try: - __cusparseCreateCoo = win32api.GetProcAddress(handle, 'cusparseCreateCoo') - except: - pass + __cusparseCreateCoo = GetProcAddress(handle, 'cusparseCreateCoo') global __cusparseCooGet - try: - __cusparseCooGet = win32api.GetProcAddress(handle, 'cusparseCooGet') - except: - pass + __cusparseCooGet = GetProcAddress(handle, 'cusparseCooGet') global __cusparseCreateDnMat - try: - __cusparseCreateDnMat = win32api.GetProcAddress(handle, 'cusparseCreateDnMat') - except: - pass + __cusparseCreateDnMat = GetProcAddress(handle, 'cusparseCreateDnMat') global __cusparseDestroyDnMat - try: - __cusparseDestroyDnMat = 
win32api.GetProcAddress(handle, 'cusparseDestroyDnMat') - except: - pass + __cusparseDestroyDnMat = GetProcAddress(handle, 'cusparseDestroyDnMat') global __cusparseDnMatGet - try: - __cusparseDnMatGet = win32api.GetProcAddress(handle, 'cusparseDnMatGet') - except: - pass + __cusparseDnMatGet = GetProcAddress(handle, 'cusparseDnMatGet') global __cusparseDnMatGetValues - try: - __cusparseDnMatGetValues = win32api.GetProcAddress(handle, 'cusparseDnMatGetValues') - except: - pass + __cusparseDnMatGetValues = GetProcAddress(handle, 'cusparseDnMatGetValues') global __cusparseDnMatSetValues - try: - __cusparseDnMatSetValues = win32api.GetProcAddress(handle, 'cusparseDnMatSetValues') - except: - pass + __cusparseDnMatSetValues = GetProcAddress(handle, 'cusparseDnMatSetValues') global __cusparseDnMatSetStridedBatch - try: - __cusparseDnMatSetStridedBatch = win32api.GetProcAddress(handle, 'cusparseDnMatSetStridedBatch') - except: - pass + __cusparseDnMatSetStridedBatch = GetProcAddress(handle, 'cusparseDnMatSetStridedBatch') global __cusparseDnMatGetStridedBatch - try: - __cusparseDnMatGetStridedBatch = win32api.GetProcAddress(handle, 'cusparseDnMatGetStridedBatch') - except: - pass + __cusparseDnMatGetStridedBatch = GetProcAddress(handle, 'cusparseDnMatGetStridedBatch') global __cusparseAxpby - try: - __cusparseAxpby = win32api.GetProcAddress(handle, 'cusparseAxpby') - except: - pass + __cusparseAxpby = GetProcAddress(handle, 'cusparseAxpby') global __cusparseGather - try: - __cusparseGather = win32api.GetProcAddress(handle, 'cusparseGather') - except: - pass + __cusparseGather = GetProcAddress(handle, 'cusparseGather') global __cusparseScatter - try: - __cusparseScatter = win32api.GetProcAddress(handle, 'cusparseScatter') - except: - pass + __cusparseScatter = GetProcAddress(handle, 'cusparseScatter') global __cusparseSpVV_bufferSize - try: - __cusparseSpVV_bufferSize = win32api.GetProcAddress(handle, 'cusparseSpVV_bufferSize') - except: - pass + __cusparseSpVV_bufferSize = GetProcAddress(handle, 'cusparseSpVV_bufferSize') global __cusparseSpVV - try: - __cusparseSpVV = win32api.GetProcAddress(handle, 'cusparseSpVV') - except: - pass + __cusparseSpVV = GetProcAddress(handle, 'cusparseSpVV') global __cusparseSpMV - try: - __cusparseSpMV = win32api.GetProcAddress(handle, 'cusparseSpMV') - except: - pass + __cusparseSpMV = GetProcAddress(handle, 'cusparseSpMV') global __cusparseSpMV_bufferSize - try: - __cusparseSpMV_bufferSize = win32api.GetProcAddress(handle, 'cusparseSpMV_bufferSize') - except: - pass + __cusparseSpMV_bufferSize = GetProcAddress(handle, 'cusparseSpMV_bufferSize') global __cusparseSpMM - try: - __cusparseSpMM = win32api.GetProcAddress(handle, 'cusparseSpMM') - except: - pass + __cusparseSpMM = GetProcAddress(handle, 'cusparseSpMM') global __cusparseSpMM_bufferSize - try: - __cusparseSpMM_bufferSize = win32api.GetProcAddress(handle, 'cusparseSpMM_bufferSize') - except: - pass + __cusparseSpMM_bufferSize = GetProcAddress(handle, 'cusparseSpMM_bufferSize') global __cusparseSpGEMM_createDescr - try: - __cusparseSpGEMM_createDescr = win32api.GetProcAddress(handle, 'cusparseSpGEMM_createDescr') - except: - pass + __cusparseSpGEMM_createDescr = GetProcAddress(handle, 'cusparseSpGEMM_createDescr') global __cusparseSpGEMM_destroyDescr - try: - __cusparseSpGEMM_destroyDescr = win32api.GetProcAddress(handle, 'cusparseSpGEMM_destroyDescr') - except: - pass + __cusparseSpGEMM_destroyDescr = GetProcAddress(handle, 'cusparseSpGEMM_destroyDescr') global __cusparseSpGEMM_workEstimation - try: - 
__cusparseSpGEMM_workEstimation = win32api.GetProcAddress(handle, 'cusparseSpGEMM_workEstimation') - except: - pass + __cusparseSpGEMM_workEstimation = GetProcAddress(handle, 'cusparseSpGEMM_workEstimation') global __cusparseSpGEMM_compute - try: - __cusparseSpGEMM_compute = win32api.GetProcAddress(handle, 'cusparseSpGEMM_compute') - except: - pass + __cusparseSpGEMM_compute = GetProcAddress(handle, 'cusparseSpGEMM_compute') global __cusparseSpGEMM_copy - try: - __cusparseSpGEMM_copy = win32api.GetProcAddress(handle, 'cusparseSpGEMM_copy') - except: - pass + __cusparseSpGEMM_copy = GetProcAddress(handle, 'cusparseSpGEMM_copy') global __cusparseCreateCsc - try: - __cusparseCreateCsc = win32api.GetProcAddress(handle, 'cusparseCreateCsc') - except: - pass + __cusparseCreateCsc = GetProcAddress(handle, 'cusparseCreateCsc') global __cusparseCscSetPointers - try: - __cusparseCscSetPointers = win32api.GetProcAddress(handle, 'cusparseCscSetPointers') - except: - pass + __cusparseCscSetPointers = GetProcAddress(handle, 'cusparseCscSetPointers') global __cusparseCooSetPointers - try: - __cusparseCooSetPointers = win32api.GetProcAddress(handle, 'cusparseCooSetPointers') - except: - pass + __cusparseCooSetPointers = GetProcAddress(handle, 'cusparseCooSetPointers') global __cusparseSparseToDense_bufferSize - try: - __cusparseSparseToDense_bufferSize = win32api.GetProcAddress(handle, 'cusparseSparseToDense_bufferSize') - except: - pass + __cusparseSparseToDense_bufferSize = GetProcAddress(handle, 'cusparseSparseToDense_bufferSize') global __cusparseSparseToDense - try: - __cusparseSparseToDense = win32api.GetProcAddress(handle, 'cusparseSparseToDense') - except: - pass + __cusparseSparseToDense = GetProcAddress(handle, 'cusparseSparseToDense') global __cusparseDenseToSparse_bufferSize - try: - __cusparseDenseToSparse_bufferSize = win32api.GetProcAddress(handle, 'cusparseDenseToSparse_bufferSize') - except: - pass + __cusparseDenseToSparse_bufferSize = GetProcAddress(handle, 'cusparseDenseToSparse_bufferSize') global __cusparseDenseToSparse_analysis - try: - __cusparseDenseToSparse_analysis = win32api.GetProcAddress(handle, 'cusparseDenseToSparse_analysis') - except: - pass + __cusparseDenseToSparse_analysis = GetProcAddress(handle, 'cusparseDenseToSparse_analysis') global __cusparseDenseToSparse_convert - try: - __cusparseDenseToSparse_convert = win32api.GetProcAddress(handle, 'cusparseDenseToSparse_convert') - except: - pass + __cusparseDenseToSparse_convert = GetProcAddress(handle, 'cusparseDenseToSparse_convert') global __cusparseCreateBlockedEll - try: - __cusparseCreateBlockedEll = win32api.GetProcAddress(handle, 'cusparseCreateBlockedEll') - except: - pass + __cusparseCreateBlockedEll = GetProcAddress(handle, 'cusparseCreateBlockedEll') global __cusparseBlockedEllGet - try: - __cusparseBlockedEllGet = win32api.GetProcAddress(handle, 'cusparseBlockedEllGet') - except: - pass + __cusparseBlockedEllGet = GetProcAddress(handle, 'cusparseBlockedEllGet') global __cusparseSpMM_preprocess - try: - __cusparseSpMM_preprocess = win32api.GetProcAddress(handle, 'cusparseSpMM_preprocess') - except: - pass + __cusparseSpMM_preprocess = GetProcAddress(handle, 'cusparseSpMM_preprocess') global __cusparseSDDMM_bufferSize - try: - __cusparseSDDMM_bufferSize = win32api.GetProcAddress(handle, 'cusparseSDDMM_bufferSize') - except: - pass + __cusparseSDDMM_bufferSize = GetProcAddress(handle, 'cusparseSDDMM_bufferSize') global __cusparseSDDMM_preprocess - try: - __cusparseSDDMM_preprocess = 
win32api.GetProcAddress(handle, 'cusparseSDDMM_preprocess') - except: - pass + __cusparseSDDMM_preprocess = GetProcAddress(handle, 'cusparseSDDMM_preprocess') global __cusparseSDDMM - try: - __cusparseSDDMM = win32api.GetProcAddress(handle, 'cusparseSDDMM') - except: - pass + __cusparseSDDMM = GetProcAddress(handle, 'cusparseSDDMM') global __cusparseSpMatGetAttribute - try: - __cusparseSpMatGetAttribute = win32api.GetProcAddress(handle, 'cusparseSpMatGetAttribute') - except: - pass + __cusparseSpMatGetAttribute = GetProcAddress(handle, 'cusparseSpMatGetAttribute') global __cusparseSpMatSetAttribute - try: - __cusparseSpMatSetAttribute = win32api.GetProcAddress(handle, 'cusparseSpMatSetAttribute') - except: - pass + __cusparseSpMatSetAttribute = GetProcAddress(handle, 'cusparseSpMatSetAttribute') global __cusparseSpSV_createDescr - try: - __cusparseSpSV_createDescr = win32api.GetProcAddress(handle, 'cusparseSpSV_createDescr') - except: - pass + __cusparseSpSV_createDescr = GetProcAddress(handle, 'cusparseSpSV_createDescr') global __cusparseSpSV_destroyDescr - try: - __cusparseSpSV_destroyDescr = win32api.GetProcAddress(handle, 'cusparseSpSV_destroyDescr') - except: - pass + __cusparseSpSV_destroyDescr = GetProcAddress(handle, 'cusparseSpSV_destroyDescr') global __cusparseSpSV_bufferSize - try: - __cusparseSpSV_bufferSize = win32api.GetProcAddress(handle, 'cusparseSpSV_bufferSize') - except: - pass + __cusparseSpSV_bufferSize = GetProcAddress(handle, 'cusparseSpSV_bufferSize') global __cusparseSpSV_analysis - try: - __cusparseSpSV_analysis = win32api.GetProcAddress(handle, 'cusparseSpSV_analysis') - except: - pass + __cusparseSpSV_analysis = GetProcAddress(handle, 'cusparseSpSV_analysis') global __cusparseSpSV_solve - try: - __cusparseSpSV_solve = win32api.GetProcAddress(handle, 'cusparseSpSV_solve') - except: - pass + __cusparseSpSV_solve = GetProcAddress(handle, 'cusparseSpSV_solve') global __cusparseSpSM_createDescr - try: - __cusparseSpSM_createDescr = win32api.GetProcAddress(handle, 'cusparseSpSM_createDescr') - except: - pass + __cusparseSpSM_createDescr = GetProcAddress(handle, 'cusparseSpSM_createDescr') global __cusparseSpSM_destroyDescr - try: - __cusparseSpSM_destroyDescr = win32api.GetProcAddress(handle, 'cusparseSpSM_destroyDescr') - except: - pass + __cusparseSpSM_destroyDescr = GetProcAddress(handle, 'cusparseSpSM_destroyDescr') global __cusparseSpSM_bufferSize - try: - __cusparseSpSM_bufferSize = win32api.GetProcAddress(handle, 'cusparseSpSM_bufferSize') - except: - pass + __cusparseSpSM_bufferSize = GetProcAddress(handle, 'cusparseSpSM_bufferSize') global __cusparseSpSM_analysis - try: - __cusparseSpSM_analysis = win32api.GetProcAddress(handle, 'cusparseSpSM_analysis') - except: - pass + __cusparseSpSM_analysis = GetProcAddress(handle, 'cusparseSpSM_analysis') global __cusparseSpSM_solve - try: - __cusparseSpSM_solve = win32api.GetProcAddress(handle, 'cusparseSpSM_solve') - except: - pass + __cusparseSpSM_solve = GetProcAddress(handle, 'cusparseSpSM_solve') global __cusparseSpGEMMreuse_workEstimation - try: - __cusparseSpGEMMreuse_workEstimation = win32api.GetProcAddress(handle, 'cusparseSpGEMMreuse_workEstimation') - except: - pass + __cusparseSpGEMMreuse_workEstimation = GetProcAddress(handle, 'cusparseSpGEMMreuse_workEstimation') global __cusparseSpGEMMreuse_nnz - try: - __cusparseSpGEMMreuse_nnz = win32api.GetProcAddress(handle, 'cusparseSpGEMMreuse_nnz') - except: - pass + __cusparseSpGEMMreuse_nnz = GetProcAddress(handle, 'cusparseSpGEMMreuse_nnz') global 
__cusparseSpGEMMreuse_copy - try: - __cusparseSpGEMMreuse_copy = win32api.GetProcAddress(handle, 'cusparseSpGEMMreuse_copy') - except: - pass + __cusparseSpGEMMreuse_copy = GetProcAddress(handle, 'cusparseSpGEMMreuse_copy') global __cusparseSpGEMMreuse_compute - try: - __cusparseSpGEMMreuse_compute = win32api.GetProcAddress(handle, 'cusparseSpGEMMreuse_compute') - except: - pass + __cusparseSpGEMMreuse_compute = GetProcAddress(handle, 'cusparseSpGEMMreuse_compute') global __cusparseLoggerSetCallback - try: - __cusparseLoggerSetCallback = win32api.GetProcAddress(handle, 'cusparseLoggerSetCallback') - except: - pass + __cusparseLoggerSetCallback = GetProcAddress(handle, 'cusparseLoggerSetCallback') global __cusparseLoggerSetFile - try: - __cusparseLoggerSetFile = win32api.GetProcAddress(handle, 'cusparseLoggerSetFile') - except: - pass + __cusparseLoggerSetFile = GetProcAddress(handle, 'cusparseLoggerSetFile') global __cusparseLoggerOpenFile - try: - __cusparseLoggerOpenFile = win32api.GetProcAddress(handle, 'cusparseLoggerOpenFile') - except: - pass + __cusparseLoggerOpenFile = GetProcAddress(handle, 'cusparseLoggerOpenFile') global __cusparseLoggerSetLevel - try: - __cusparseLoggerSetLevel = win32api.GetProcAddress(handle, 'cusparseLoggerSetLevel') - except: - pass + __cusparseLoggerSetLevel = GetProcAddress(handle, 'cusparseLoggerSetLevel') global __cusparseLoggerSetMask - try: - __cusparseLoggerSetMask = win32api.GetProcAddress(handle, 'cusparseLoggerSetMask') - except: - pass + __cusparseLoggerSetMask = GetProcAddress(handle, 'cusparseLoggerSetMask') global __cusparseLoggerForceDisable - try: - __cusparseLoggerForceDisable = win32api.GetProcAddress(handle, 'cusparseLoggerForceDisable') - except: - pass + __cusparseLoggerForceDisable = GetProcAddress(handle, 'cusparseLoggerForceDisable') global __cusparseSpMMOp_createPlan - try: - __cusparseSpMMOp_createPlan = win32api.GetProcAddress(handle, 'cusparseSpMMOp_createPlan') - except: - pass + __cusparseSpMMOp_createPlan = GetProcAddress(handle, 'cusparseSpMMOp_createPlan') global __cusparseSpMMOp - try: - __cusparseSpMMOp = win32api.GetProcAddress(handle, 'cusparseSpMMOp') - except: - pass + __cusparseSpMMOp = GetProcAddress(handle, 'cusparseSpMMOp') global __cusparseSpMMOp_destroyPlan - try: - __cusparseSpMMOp_destroyPlan = win32api.GetProcAddress(handle, 'cusparseSpMMOp_destroyPlan') - except: - pass + __cusparseSpMMOp_destroyPlan = GetProcAddress(handle, 'cusparseSpMMOp_destroyPlan') global __cusparseCscGet - try: - __cusparseCscGet = win32api.GetProcAddress(handle, 'cusparseCscGet') - except: - pass + __cusparseCscGet = GetProcAddress(handle, 'cusparseCscGet') global __cusparseCreateConstSpVec - try: - __cusparseCreateConstSpVec = win32api.GetProcAddress(handle, 'cusparseCreateConstSpVec') - except: - pass + __cusparseCreateConstSpVec = GetProcAddress(handle, 'cusparseCreateConstSpVec') global __cusparseConstSpVecGet - try: - __cusparseConstSpVecGet = win32api.GetProcAddress(handle, 'cusparseConstSpVecGet') - except: - pass + __cusparseConstSpVecGet = GetProcAddress(handle, 'cusparseConstSpVecGet') global __cusparseConstSpVecGetValues - try: - __cusparseConstSpVecGetValues = win32api.GetProcAddress(handle, 'cusparseConstSpVecGetValues') - except: - pass + __cusparseConstSpVecGetValues = GetProcAddress(handle, 'cusparseConstSpVecGetValues') global __cusparseCreateConstDnVec - try: - __cusparseCreateConstDnVec = win32api.GetProcAddress(handle, 'cusparseCreateConstDnVec') - except: - pass + __cusparseCreateConstDnVec = 
GetProcAddress(handle, 'cusparseCreateConstDnVec') global __cusparseConstDnVecGet - try: - __cusparseConstDnVecGet = win32api.GetProcAddress(handle, 'cusparseConstDnVecGet') - except: - pass + __cusparseConstDnVecGet = GetProcAddress(handle, 'cusparseConstDnVecGet') global __cusparseConstDnVecGetValues - try: - __cusparseConstDnVecGetValues = win32api.GetProcAddress(handle, 'cusparseConstDnVecGetValues') - except: - pass + __cusparseConstDnVecGetValues = GetProcAddress(handle, 'cusparseConstDnVecGetValues') global __cusparseConstSpMatGetValues - try: - __cusparseConstSpMatGetValues = win32api.GetProcAddress(handle, 'cusparseConstSpMatGetValues') - except: - pass + __cusparseConstSpMatGetValues = GetProcAddress(handle, 'cusparseConstSpMatGetValues') global __cusparseCreateConstCsr - try: - __cusparseCreateConstCsr = win32api.GetProcAddress(handle, 'cusparseCreateConstCsr') - except: - pass + __cusparseCreateConstCsr = GetProcAddress(handle, 'cusparseCreateConstCsr') global __cusparseCreateConstCsc - try: - __cusparseCreateConstCsc = win32api.GetProcAddress(handle, 'cusparseCreateConstCsc') - except: - pass + __cusparseCreateConstCsc = GetProcAddress(handle, 'cusparseCreateConstCsc') global __cusparseConstCsrGet - try: - __cusparseConstCsrGet = win32api.GetProcAddress(handle, 'cusparseConstCsrGet') - except: - pass + __cusparseConstCsrGet = GetProcAddress(handle, 'cusparseConstCsrGet') global __cusparseConstCscGet - try: - __cusparseConstCscGet = win32api.GetProcAddress(handle, 'cusparseConstCscGet') - except: - pass + __cusparseConstCscGet = GetProcAddress(handle, 'cusparseConstCscGet') global __cusparseCreateConstCoo - try: - __cusparseCreateConstCoo = win32api.GetProcAddress(handle, 'cusparseCreateConstCoo') - except: - pass + __cusparseCreateConstCoo = GetProcAddress(handle, 'cusparseCreateConstCoo') global __cusparseConstCooGet - try: - __cusparseConstCooGet = win32api.GetProcAddress(handle, 'cusparseConstCooGet') - except: - pass + __cusparseConstCooGet = GetProcAddress(handle, 'cusparseConstCooGet') global __cusparseCreateConstBlockedEll - try: - __cusparseCreateConstBlockedEll = win32api.GetProcAddress(handle, 'cusparseCreateConstBlockedEll') - except: - pass + __cusparseCreateConstBlockedEll = GetProcAddress(handle, 'cusparseCreateConstBlockedEll') global __cusparseConstBlockedEllGet - try: - __cusparseConstBlockedEllGet = win32api.GetProcAddress(handle, 'cusparseConstBlockedEllGet') - except: - pass + __cusparseConstBlockedEllGet = GetProcAddress(handle, 'cusparseConstBlockedEllGet') global __cusparseCreateConstDnMat - try: - __cusparseCreateConstDnMat = win32api.GetProcAddress(handle, 'cusparseCreateConstDnMat') - except: - pass + __cusparseCreateConstDnMat = GetProcAddress(handle, 'cusparseCreateConstDnMat') global __cusparseConstDnMatGet - try: - __cusparseConstDnMatGet = win32api.GetProcAddress(handle, 'cusparseConstDnMatGet') - except: - pass + __cusparseConstDnMatGet = GetProcAddress(handle, 'cusparseConstDnMatGet') global __cusparseConstDnMatGetValues - try: - __cusparseConstDnMatGetValues = win32api.GetProcAddress(handle, 'cusparseConstDnMatGetValues') - except: - pass + __cusparseConstDnMatGetValues = GetProcAddress(handle, 'cusparseConstDnMatGetValues') global __cusparseSpGEMM_getNumProducts - try: - __cusparseSpGEMM_getNumProducts = win32api.GetProcAddress(handle, 'cusparseSpGEMM_getNumProducts') - except: - pass + __cusparseSpGEMM_getNumProducts = GetProcAddress(handle, 'cusparseSpGEMM_getNumProducts') global __cusparseSpGEMM_estimateMemory - try: - 
__cusparseSpGEMM_estimateMemory = win32api.GetProcAddress(handle, 'cusparseSpGEMM_estimateMemory')
-        except:
-            pass
+        __cusparseSpGEMM_estimateMemory = GetProcAddress(handle, 'cusparseSpGEMM_estimateMemory')
         global __cusparseBsrSetStridedBatch
-        try:
-            __cusparseBsrSetStridedBatch = win32api.GetProcAddress(handle, 'cusparseBsrSetStridedBatch')
-        except:
-            pass
+        __cusparseBsrSetStridedBatch = GetProcAddress(handle, 'cusparseBsrSetStridedBatch')
         global __cusparseCreateBsr
-        try:
-            __cusparseCreateBsr = win32api.GetProcAddress(handle, 'cusparseCreateBsr')
-        except:
-            pass
+        __cusparseCreateBsr = GetProcAddress(handle, 'cusparseCreateBsr')
         global __cusparseCreateConstBsr
-        try:
-            __cusparseCreateConstBsr = win32api.GetProcAddress(handle, 'cusparseCreateConstBsr')
-        except:
-            pass
+        __cusparseCreateConstBsr = GetProcAddress(handle, 'cusparseCreateConstBsr')
         global __cusparseCreateSlicedEll
-        try:
-            __cusparseCreateSlicedEll = win32api.GetProcAddress(handle, 'cusparseCreateSlicedEll')
-        except:
-            pass
+        __cusparseCreateSlicedEll = GetProcAddress(handle, 'cusparseCreateSlicedEll')
         global __cusparseCreateConstSlicedEll
-        try:
-            __cusparseCreateConstSlicedEll = win32api.GetProcAddress(handle, 'cusparseCreateConstSlicedEll')
-        except:
-            pass
+        __cusparseCreateConstSlicedEll = GetProcAddress(handle, 'cusparseCreateConstSlicedEll')
         global __cusparseSpSV_updateMatrix
-        try:
-            __cusparseSpSV_updateMatrix = win32api.GetProcAddress(handle, 'cusparseSpSV_updateMatrix')
-        except:
-            pass
+        __cusparseSpSV_updateMatrix = GetProcAddress(handle, 'cusparseSpSV_updateMatrix')
         global __cusparseSpMV_preprocess
-        try:
-            __cusparseSpMV_preprocess = win32api.GetProcAddress(handle, 'cusparseSpMV_preprocess')
-        except:
-            pass
+        __cusparseSpMV_preprocess = GetProcAddress(handle, 'cusparseSpMV_preprocess')
         global __cusparseSpSM_updateMatrix
-        try:
-            __cusparseSpSM_updateMatrix = win32api.GetProcAddress(handle, 'cusparseSpSM_updateMatrix')
-        except:
-            pass
+        __cusparseSpSM_updateMatrix = GetProcAddress(handle, 'cusparseSpSM_updateMatrix')
-    __py_cusparse_init = True
-    return 0
+        __py_cusparse_init = True
+        return 0

 cdef dict func_ptrs = None
diff --git a/nvmath/bindings/_internal/cutensor.pxd b/nvmath/bindings/_internal/cutensor.pxd
new file mode 100644
index 0000000..7386658
--- /dev/null
+++ b/nvmath/bindings/_internal/cutensor.pxd
@@ -0,0 +1,57 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated with version 2.3.1. Do not modify it directly.
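
The cusparse Windows loader hunks above drop the per-symbol `try: ... except: pass` around `win32api.GetProcAddress` and call a `GetProcAddress` helper directly, so a symbol that fails to resolve is no longer silently swallowed at load time; an unresolved pointer is instead detected later, when the corresponding wrapper is first used. The sketch below illustrates that resolution pattern with plain `ctypes` on Windows; the helper name, DLL name, and error handling are assumptions for illustration, not nvmath's actual loader. The new `cutensor.pxd` that follows declares the Cython wrapper signatures that the platform-specific modules implement.

```python
# Minimal ctypes sketch (Windows-only, illustrative) of per-symbol resolution
# without an exception-swallowing try/except around each lookup.
import ctypes

_kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
_kernel32.LoadLibraryW.restype = ctypes.c_void_p
_kernel32.LoadLibraryW.argtypes = [ctypes.c_wchar_p]
_kernel32.GetProcAddress.restype = ctypes.c_void_p
_kernel32.GetProcAddress.argtypes = [ctypes.c_void_p, ctypes.c_char_p]

def get_proc_address(handle, name):
    """Return the address of `name` in the loaded DLL, or 0 if it is absent."""
    return _kernel32.GetProcAddress(handle, name.encode("ascii")) or 0

# Hypothetical DLL name; the real loader locates the library elsewhere.
handle = _kernel32.LoadLibraryW("cusparse64_12.dll")
if handle is None:
    raise OSError("could not load the cuSPARSE DLL")

# Each pointer is resolved exactly once; a 0 value is surfaced later, when the
# corresponding wrapper is called, instead of being hidden at load time.
__cusparseSpMV = get_proc_address(handle, "cusparseSpMV")
__cusparseSpMM = get_proc_address(handle, "cusparseSpMM")
```
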
+ +from ..cycutensor cimport * + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cutensorStatus_t _cutensorCreate(cutensorHandle_t* handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorDestroy(cutensorHandle_t handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorHandleResizePlanCache(cutensorHandle_t handle, const uint32_t numEntries) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorHandleWritePlanCacheToFile(const cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorHandleReadPlanCacheFromFile(cutensorHandle_t handle, const char filename[], uint32_t* numCachelinesRead) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorWriteKernelCacheToFile(const cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorReadKernelCacheFromFile(cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateTensorDescriptor(const cutensorHandle_t handle, cutensorTensorDescriptor_t* desc, const uint32_t numModes, const int64_t extent[], const int64_t stride[], cudaDataType_t dataType, uint32_t alignmentRequirement) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorDestroyTensorDescriptor(cutensorTensorDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateElementwiseTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAB, cutensorOperator_t opABC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorElementwiseTrinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* beta, const void* B, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateElementwiseBinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorElementwiseBinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreatePermutation(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t 
descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorPermute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, void* B, const cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorDestroyOperationDescriptor(cutensorOperationDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorOperationDescriptorSetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorOperationDescriptorGetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreatePlanPreference(const cutensorHandle_t handle, cutensorPlanPreference_t* pref, cutensorAlgo_t algo, cutensorJitMode_t jitMode) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorDestroyPlanPreference(cutensorPlanPreference_t pref) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorPlanPreferenceSetAttribute(const cutensorHandle_t handle, cutensorPlanPreference_t pref, cutensorPlanPreferenceAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorPlanGetAttribute(const cutensorHandle_t handle, const cutensorPlan_t plan, cutensorPlanAttribute_t attr, void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorEstimateWorkspaceSize(const cutensorHandle_t handle, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t planPref, const cutensorWorksizePreference_t workspacePref, uint64_t* workspaceSizeEstimate) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreatePlan(const cutensorHandle_t handle, cutensorPlan_t* plan, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t pref, uint64_t workspaceSizeLimit) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorDestroyPlan(cutensorPlan_t plan) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateReduction(const cutensorHandle_t 
handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opReduce, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorReduce(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateContractionTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opD, const cutensorTensorDescriptor_t descE, const int32_t modeE[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorContractTrinary(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* C, const void* beta, const void* D, void* E, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateBlockSparseTensorDescriptor(cutensorHandle_t handle, cutensorBlockSparseTensorDescriptor_t* desc, const uint32_t numModes, const uint64_t numNonZeroBlocks, const uint32_t numSectionsPerMode[], const int64_t extent[], const int32_t nonZeroCoordinates[], const int64_t stride[], cudaDataType_t dataType) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorDestroyBlockSparseTensorDescriptor(cutensorBlockSparseTensorDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorCreateBlockSparseContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorBlockSparseTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorBlockSparseTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorBlockSparseTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorBlockSparseTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorBlockSparseContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* const A[], const void* const B[], const void* beta, const void* const C[], void* const D[], void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef const char* _cutensorGetErrorString(const cutensorStatus_t error) except?NULL nogil +cdef size_t _cutensorGetVersion() except?0 nogil +cdef size_t _cutensorGetCudartVersion() except?0 nogil +cdef cutensorStatus_t _cutensorLoggerSetCallback(cutensorLoggerCallback_t callback) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t 
_cutensorLoggerSetFile(FILE* file) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorLoggerOpenFile(const char* logFile) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorLoggerSetLevel(int32_t level) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorLoggerSetMask(int32_t mask) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t _cutensorLoggerForceDisable() except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/_internal/cutensor_linux.pyx b/nvmath/bindings/_internal/cutensor_linux.pyx new file mode 100644 index 0000000..fe18844 --- /dev/null +++ b/nvmath/bindings/_internal/cutensor_linux.pyx @@ -0,0 +1,1029 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 2.3.1. Do not modify it directly. + +from libc.stdint cimport intptr_t, uintptr_t + +import threading + +from .utils import FunctionNotFoundError, NotSupportedError + +from cuda.pathfinder import load_nvidia_dynamic_lib + + +############################################################################### +# Extern +############################################################################### + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "" nogil: + void* dlopen(const char*, int) + char* dlerror() + void* dlsym(void*, const char*) + int dlclose(void*) + + enum: + RTLD_LAZY + RTLD_NOW + RTLD_GLOBAL + RTLD_LOCAL + + const void* RTLD_DEFAULT 'RTLD_DEFAULT' + +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + + + +############################################################################### +# Wrapper init +############################################################################### + +cdef object __symbol_lock = threading.Lock() +cdef bint __py_cutensor_init = False + +cdef void* __cutensorCreate = NULL +cdef void* __cutensorDestroy = NULL +cdef void* __cutensorHandleResizePlanCache = NULL +cdef void* __cutensorHandleWritePlanCacheToFile = NULL +cdef void* __cutensorHandleReadPlanCacheFromFile = NULL +cdef void* __cutensorWriteKernelCacheToFile = NULL +cdef void* __cutensorReadKernelCacheFromFile = NULL +cdef void* __cutensorCreateTensorDescriptor = NULL +cdef void* __cutensorDestroyTensorDescriptor = NULL +cdef void* __cutensorCreateElementwiseTrinary = NULL +cdef void* __cutensorElementwiseTrinaryExecute = NULL +cdef void* __cutensorCreateElementwiseBinary = NULL +cdef void* __cutensorElementwiseBinaryExecute = NULL +cdef void* __cutensorCreatePermutation = NULL +cdef void* __cutensorPermute = NULL +cdef void* __cutensorCreateContraction = NULL +cdef void* __cutensorDestroyOperationDescriptor = NULL +cdef void* __cutensorOperationDescriptorSetAttribute = NULL +cdef void* __cutensorOperationDescriptorGetAttribute = NULL +cdef void* 
__cutensorCreatePlanPreference = NULL +cdef void* __cutensorDestroyPlanPreference = NULL +cdef void* __cutensorPlanPreferenceSetAttribute = NULL +cdef void* __cutensorPlanGetAttribute = NULL +cdef void* __cutensorEstimateWorkspaceSize = NULL +cdef void* __cutensorCreatePlan = NULL +cdef void* __cutensorDestroyPlan = NULL +cdef void* __cutensorContract = NULL +cdef void* __cutensorCreateReduction = NULL +cdef void* __cutensorReduce = NULL +cdef void* __cutensorCreateContractionTrinary = NULL +cdef void* __cutensorContractTrinary = NULL +cdef void* __cutensorCreateBlockSparseTensorDescriptor = NULL +cdef void* __cutensorDestroyBlockSparseTensorDescriptor = NULL +cdef void* __cutensorCreateBlockSparseContraction = NULL +cdef void* __cutensorBlockSparseContract = NULL +cdef void* __cutensorGetErrorString = NULL +cdef void* __cutensorGetVersion = NULL +cdef void* __cutensorGetCudartVersion = NULL +cdef void* __cutensorLoggerSetCallback = NULL +cdef void* __cutensorLoggerSetFile = NULL +cdef void* __cutensorLoggerOpenFile = NULL +cdef void* __cutensorLoggerSetLevel = NULL +cdef void* __cutensorLoggerSetMask = NULL +cdef void* __cutensorLoggerForceDisable = NULL + + +cdef void* load_library() except* with gil: + cdef uintptr_t handle = load_nvidia_dynamic_lib("cutensor")._handle_uint + return handle + + +cdef int _check_or_init_cutensor() except -1 nogil: + global __py_cutensor_init + if __py_cutensor_init: + return 0 + + cdef void* handle = NULL + + with gil, __symbol_lock: + # Load function + global __cutensorCreate + __cutensorCreate = dlsym(RTLD_DEFAULT, 'cutensorCreate') + if __cutensorCreate == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreate = dlsym(handle, 'cutensorCreate') + + global __cutensorDestroy + __cutensorDestroy = dlsym(RTLD_DEFAULT, 'cutensorDestroy') + if __cutensorDestroy == NULL: + if handle == NULL: + handle = load_library() + __cutensorDestroy = dlsym(handle, 'cutensorDestroy') + + global __cutensorHandleResizePlanCache + __cutensorHandleResizePlanCache = dlsym(RTLD_DEFAULT, 'cutensorHandleResizePlanCache') + if __cutensorHandleResizePlanCache == NULL: + if handle == NULL: + handle = load_library() + __cutensorHandleResizePlanCache = dlsym(handle, 'cutensorHandleResizePlanCache') + + global __cutensorHandleWritePlanCacheToFile + __cutensorHandleWritePlanCacheToFile = dlsym(RTLD_DEFAULT, 'cutensorHandleWritePlanCacheToFile') + if __cutensorHandleWritePlanCacheToFile == NULL: + if handle == NULL: + handle = load_library() + __cutensorHandleWritePlanCacheToFile = dlsym(handle, 'cutensorHandleWritePlanCacheToFile') + + global __cutensorHandleReadPlanCacheFromFile + __cutensorHandleReadPlanCacheFromFile = dlsym(RTLD_DEFAULT, 'cutensorHandleReadPlanCacheFromFile') + if __cutensorHandleReadPlanCacheFromFile == NULL: + if handle == NULL: + handle = load_library() + __cutensorHandleReadPlanCacheFromFile = dlsym(handle, 'cutensorHandleReadPlanCacheFromFile') + + global __cutensorWriteKernelCacheToFile + __cutensorWriteKernelCacheToFile = dlsym(RTLD_DEFAULT, 'cutensorWriteKernelCacheToFile') + if __cutensorWriteKernelCacheToFile == NULL: + if handle == NULL: + handle = load_library() + __cutensorWriteKernelCacheToFile = dlsym(handle, 'cutensorWriteKernelCacheToFile') + + global __cutensorReadKernelCacheFromFile + __cutensorReadKernelCacheFromFile = dlsym(RTLD_DEFAULT, 'cutensorReadKernelCacheFromFile') + if __cutensorReadKernelCacheFromFile == NULL: + if handle == NULL: + handle = load_library() + __cutensorReadKernelCacheFromFile = dlsym(handle, 
'cutensorReadKernelCacheFromFile') + + global __cutensorCreateTensorDescriptor + __cutensorCreateTensorDescriptor = dlsym(RTLD_DEFAULT, 'cutensorCreateTensorDescriptor') + if __cutensorCreateTensorDescriptor == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateTensorDescriptor = dlsym(handle, 'cutensorCreateTensorDescriptor') + + global __cutensorDestroyTensorDescriptor + __cutensorDestroyTensorDescriptor = dlsym(RTLD_DEFAULT, 'cutensorDestroyTensorDescriptor') + if __cutensorDestroyTensorDescriptor == NULL: + if handle == NULL: + handle = load_library() + __cutensorDestroyTensorDescriptor = dlsym(handle, 'cutensorDestroyTensorDescriptor') + + global __cutensorCreateElementwiseTrinary + __cutensorCreateElementwiseTrinary = dlsym(RTLD_DEFAULT, 'cutensorCreateElementwiseTrinary') + if __cutensorCreateElementwiseTrinary == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateElementwiseTrinary = dlsym(handle, 'cutensorCreateElementwiseTrinary') + + global __cutensorElementwiseTrinaryExecute + __cutensorElementwiseTrinaryExecute = dlsym(RTLD_DEFAULT, 'cutensorElementwiseTrinaryExecute') + if __cutensorElementwiseTrinaryExecute == NULL: + if handle == NULL: + handle = load_library() + __cutensorElementwiseTrinaryExecute = dlsym(handle, 'cutensorElementwiseTrinaryExecute') + + global __cutensorCreateElementwiseBinary + __cutensorCreateElementwiseBinary = dlsym(RTLD_DEFAULT, 'cutensorCreateElementwiseBinary') + if __cutensorCreateElementwiseBinary == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateElementwiseBinary = dlsym(handle, 'cutensorCreateElementwiseBinary') + + global __cutensorElementwiseBinaryExecute + __cutensorElementwiseBinaryExecute = dlsym(RTLD_DEFAULT, 'cutensorElementwiseBinaryExecute') + if __cutensorElementwiseBinaryExecute == NULL: + if handle == NULL: + handle = load_library() + __cutensorElementwiseBinaryExecute = dlsym(handle, 'cutensorElementwiseBinaryExecute') + + global __cutensorCreatePermutation + __cutensorCreatePermutation = dlsym(RTLD_DEFAULT, 'cutensorCreatePermutation') + if __cutensorCreatePermutation == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreatePermutation = dlsym(handle, 'cutensorCreatePermutation') + + global __cutensorPermute + __cutensorPermute = dlsym(RTLD_DEFAULT, 'cutensorPermute') + if __cutensorPermute == NULL: + if handle == NULL: + handle = load_library() + __cutensorPermute = dlsym(handle, 'cutensorPermute') + + global __cutensorCreateContraction + __cutensorCreateContraction = dlsym(RTLD_DEFAULT, 'cutensorCreateContraction') + if __cutensorCreateContraction == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateContraction = dlsym(handle, 'cutensorCreateContraction') + + global __cutensorDestroyOperationDescriptor + __cutensorDestroyOperationDescriptor = dlsym(RTLD_DEFAULT, 'cutensorDestroyOperationDescriptor') + if __cutensorDestroyOperationDescriptor == NULL: + if handle == NULL: + handle = load_library() + __cutensorDestroyOperationDescriptor = dlsym(handle, 'cutensorDestroyOperationDescriptor') + + global __cutensorOperationDescriptorSetAttribute + __cutensorOperationDescriptorSetAttribute = dlsym(RTLD_DEFAULT, 'cutensorOperationDescriptorSetAttribute') + if __cutensorOperationDescriptorSetAttribute == NULL: + if handle == NULL: + handle = load_library() + __cutensorOperationDescriptorSetAttribute = dlsym(handle, 'cutensorOperationDescriptorSetAttribute') + + global __cutensorOperationDescriptorGetAttribute + 
__cutensorOperationDescriptorGetAttribute = dlsym(RTLD_DEFAULT, 'cutensorOperationDescriptorGetAttribute') + if __cutensorOperationDescriptorGetAttribute == NULL: + if handle == NULL: + handle = load_library() + __cutensorOperationDescriptorGetAttribute = dlsym(handle, 'cutensorOperationDescriptorGetAttribute') + + global __cutensorCreatePlanPreference + __cutensorCreatePlanPreference = dlsym(RTLD_DEFAULT, 'cutensorCreatePlanPreference') + if __cutensorCreatePlanPreference == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreatePlanPreference = dlsym(handle, 'cutensorCreatePlanPreference') + + global __cutensorDestroyPlanPreference + __cutensorDestroyPlanPreference = dlsym(RTLD_DEFAULT, 'cutensorDestroyPlanPreference') + if __cutensorDestroyPlanPreference == NULL: + if handle == NULL: + handle = load_library() + __cutensorDestroyPlanPreference = dlsym(handle, 'cutensorDestroyPlanPreference') + + global __cutensorPlanPreferenceSetAttribute + __cutensorPlanPreferenceSetAttribute = dlsym(RTLD_DEFAULT, 'cutensorPlanPreferenceSetAttribute') + if __cutensorPlanPreferenceSetAttribute == NULL: + if handle == NULL: + handle = load_library() + __cutensorPlanPreferenceSetAttribute = dlsym(handle, 'cutensorPlanPreferenceSetAttribute') + + global __cutensorPlanGetAttribute + __cutensorPlanGetAttribute = dlsym(RTLD_DEFAULT, 'cutensorPlanGetAttribute') + if __cutensorPlanGetAttribute == NULL: + if handle == NULL: + handle = load_library() + __cutensorPlanGetAttribute = dlsym(handle, 'cutensorPlanGetAttribute') + + global __cutensorEstimateWorkspaceSize + __cutensorEstimateWorkspaceSize = dlsym(RTLD_DEFAULT, 'cutensorEstimateWorkspaceSize') + if __cutensorEstimateWorkspaceSize == NULL: + if handle == NULL: + handle = load_library() + __cutensorEstimateWorkspaceSize = dlsym(handle, 'cutensorEstimateWorkspaceSize') + + global __cutensorCreatePlan + __cutensorCreatePlan = dlsym(RTLD_DEFAULT, 'cutensorCreatePlan') + if __cutensorCreatePlan == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreatePlan = dlsym(handle, 'cutensorCreatePlan') + + global __cutensorDestroyPlan + __cutensorDestroyPlan = dlsym(RTLD_DEFAULT, 'cutensorDestroyPlan') + if __cutensorDestroyPlan == NULL: + if handle == NULL: + handle = load_library() + __cutensorDestroyPlan = dlsym(handle, 'cutensorDestroyPlan') + + global __cutensorContract + __cutensorContract = dlsym(RTLD_DEFAULT, 'cutensorContract') + if __cutensorContract == NULL: + if handle == NULL: + handle = load_library() + __cutensorContract = dlsym(handle, 'cutensorContract') + + global __cutensorCreateReduction + __cutensorCreateReduction = dlsym(RTLD_DEFAULT, 'cutensorCreateReduction') + if __cutensorCreateReduction == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateReduction = dlsym(handle, 'cutensorCreateReduction') + + global __cutensorReduce + __cutensorReduce = dlsym(RTLD_DEFAULT, 'cutensorReduce') + if __cutensorReduce == NULL: + if handle == NULL: + handle = load_library() + __cutensorReduce = dlsym(handle, 'cutensorReduce') + + global __cutensorCreateContractionTrinary + __cutensorCreateContractionTrinary = dlsym(RTLD_DEFAULT, 'cutensorCreateContractionTrinary') + if __cutensorCreateContractionTrinary == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateContractionTrinary = dlsym(handle, 'cutensorCreateContractionTrinary') + + global __cutensorContractTrinary + __cutensorContractTrinary = dlsym(RTLD_DEFAULT, 'cutensorContractTrinary') + if __cutensorContractTrinary == NULL: + if handle == 
NULL: + handle = load_library() + __cutensorContractTrinary = dlsym(handle, 'cutensorContractTrinary') + + global __cutensorCreateBlockSparseTensorDescriptor + __cutensorCreateBlockSparseTensorDescriptor = dlsym(RTLD_DEFAULT, 'cutensorCreateBlockSparseTensorDescriptor') + if __cutensorCreateBlockSparseTensorDescriptor == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateBlockSparseTensorDescriptor = dlsym(handle, 'cutensorCreateBlockSparseTensorDescriptor') + + global __cutensorDestroyBlockSparseTensorDescriptor + __cutensorDestroyBlockSparseTensorDescriptor = dlsym(RTLD_DEFAULT, 'cutensorDestroyBlockSparseTensorDescriptor') + if __cutensorDestroyBlockSparseTensorDescriptor == NULL: + if handle == NULL: + handle = load_library() + __cutensorDestroyBlockSparseTensorDescriptor = dlsym(handle, 'cutensorDestroyBlockSparseTensorDescriptor') + + global __cutensorCreateBlockSparseContraction + __cutensorCreateBlockSparseContraction = dlsym(RTLD_DEFAULT, 'cutensorCreateBlockSparseContraction') + if __cutensorCreateBlockSparseContraction == NULL: + if handle == NULL: + handle = load_library() + __cutensorCreateBlockSparseContraction = dlsym(handle, 'cutensorCreateBlockSparseContraction') + + global __cutensorBlockSparseContract + __cutensorBlockSparseContract = dlsym(RTLD_DEFAULT, 'cutensorBlockSparseContract') + if __cutensorBlockSparseContract == NULL: + if handle == NULL: + handle = load_library() + __cutensorBlockSparseContract = dlsym(handle, 'cutensorBlockSparseContract') + + global __cutensorGetErrorString + __cutensorGetErrorString = dlsym(RTLD_DEFAULT, 'cutensorGetErrorString') + if __cutensorGetErrorString == NULL: + if handle == NULL: + handle = load_library() + __cutensorGetErrorString = dlsym(handle, 'cutensorGetErrorString') + + global __cutensorGetVersion + __cutensorGetVersion = dlsym(RTLD_DEFAULT, 'cutensorGetVersion') + if __cutensorGetVersion == NULL: + if handle == NULL: + handle = load_library() + __cutensorGetVersion = dlsym(handle, 'cutensorGetVersion') + + global __cutensorGetCudartVersion + __cutensorGetCudartVersion = dlsym(RTLD_DEFAULT, 'cutensorGetCudartVersion') + if __cutensorGetCudartVersion == NULL: + if handle == NULL: + handle = load_library() + __cutensorGetCudartVersion = dlsym(handle, 'cutensorGetCudartVersion') + + global __cutensorLoggerSetCallback + __cutensorLoggerSetCallback = dlsym(RTLD_DEFAULT, 'cutensorLoggerSetCallback') + if __cutensorLoggerSetCallback == NULL: + if handle == NULL: + handle = load_library() + __cutensorLoggerSetCallback = dlsym(handle, 'cutensorLoggerSetCallback') + + global __cutensorLoggerSetFile + __cutensorLoggerSetFile = dlsym(RTLD_DEFAULT, 'cutensorLoggerSetFile') + if __cutensorLoggerSetFile == NULL: + if handle == NULL: + handle = load_library() + __cutensorLoggerSetFile = dlsym(handle, 'cutensorLoggerSetFile') + + global __cutensorLoggerOpenFile + __cutensorLoggerOpenFile = dlsym(RTLD_DEFAULT, 'cutensorLoggerOpenFile') + if __cutensorLoggerOpenFile == NULL: + if handle == NULL: + handle = load_library() + __cutensorLoggerOpenFile = dlsym(handle, 'cutensorLoggerOpenFile') + + global __cutensorLoggerSetLevel + __cutensorLoggerSetLevel = dlsym(RTLD_DEFAULT, 'cutensorLoggerSetLevel') + if __cutensorLoggerSetLevel == NULL: + if handle == NULL: + handle = load_library() + __cutensorLoggerSetLevel = dlsym(handle, 'cutensorLoggerSetLevel') + + global __cutensorLoggerSetMask + __cutensorLoggerSetMask = dlsym(RTLD_DEFAULT, 'cutensorLoggerSetMask') + if __cutensorLoggerSetMask == NULL: + if handle == NULL: + handle = 
load_library() + __cutensorLoggerSetMask = dlsym(handle, 'cutensorLoggerSetMask') + + global __cutensorLoggerForceDisable + __cutensorLoggerForceDisable = dlsym(RTLD_DEFAULT, 'cutensorLoggerForceDisable') + if __cutensorLoggerForceDisable == NULL: + if handle == NULL: + handle = load_library() + __cutensorLoggerForceDisable = dlsym(handle, 'cutensorLoggerForceDisable') + __py_cutensor_init = True + return 0 + + +cdef dict func_ptrs = None + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_cutensor() + cdef dict data = {} + + global __cutensorCreate + data["__cutensorCreate"] = __cutensorCreate + + global __cutensorDestroy + data["__cutensorDestroy"] = __cutensorDestroy + + global __cutensorHandleResizePlanCache + data["__cutensorHandleResizePlanCache"] = __cutensorHandleResizePlanCache + + global __cutensorHandleWritePlanCacheToFile + data["__cutensorHandleWritePlanCacheToFile"] = __cutensorHandleWritePlanCacheToFile + + global __cutensorHandleReadPlanCacheFromFile + data["__cutensorHandleReadPlanCacheFromFile"] = __cutensorHandleReadPlanCacheFromFile + + global __cutensorWriteKernelCacheToFile + data["__cutensorWriteKernelCacheToFile"] = __cutensorWriteKernelCacheToFile + + global __cutensorReadKernelCacheFromFile + data["__cutensorReadKernelCacheFromFile"] = __cutensorReadKernelCacheFromFile + + global __cutensorCreateTensorDescriptor + data["__cutensorCreateTensorDescriptor"] = __cutensorCreateTensorDescriptor + + global __cutensorDestroyTensorDescriptor + data["__cutensorDestroyTensorDescriptor"] = __cutensorDestroyTensorDescriptor + + global __cutensorCreateElementwiseTrinary + data["__cutensorCreateElementwiseTrinary"] = __cutensorCreateElementwiseTrinary + + global __cutensorElementwiseTrinaryExecute + data["__cutensorElementwiseTrinaryExecute"] = __cutensorElementwiseTrinaryExecute + + global __cutensorCreateElementwiseBinary + data["__cutensorCreateElementwiseBinary"] = __cutensorCreateElementwiseBinary + + global __cutensorElementwiseBinaryExecute + data["__cutensorElementwiseBinaryExecute"] = __cutensorElementwiseBinaryExecute + + global __cutensorCreatePermutation + data["__cutensorCreatePermutation"] = __cutensorCreatePermutation + + global __cutensorPermute + data["__cutensorPermute"] = __cutensorPermute + + global __cutensorCreateContraction + data["__cutensorCreateContraction"] = __cutensorCreateContraction + + global __cutensorDestroyOperationDescriptor + data["__cutensorDestroyOperationDescriptor"] = __cutensorDestroyOperationDescriptor + + global __cutensorOperationDescriptorSetAttribute + data["__cutensorOperationDescriptorSetAttribute"] = __cutensorOperationDescriptorSetAttribute + + global __cutensorOperationDescriptorGetAttribute + data["__cutensorOperationDescriptorGetAttribute"] = __cutensorOperationDescriptorGetAttribute + + global __cutensorCreatePlanPreference + data["__cutensorCreatePlanPreference"] = __cutensorCreatePlanPreference + + global __cutensorDestroyPlanPreference + data["__cutensorDestroyPlanPreference"] = __cutensorDestroyPlanPreference + + global __cutensorPlanPreferenceSetAttribute + data["__cutensorPlanPreferenceSetAttribute"] = __cutensorPlanPreferenceSetAttribute + + global __cutensorPlanGetAttribute + data["__cutensorPlanGetAttribute"] = __cutensorPlanGetAttribute + + global __cutensorEstimateWorkspaceSize + data["__cutensorEstimateWorkspaceSize"] = __cutensorEstimateWorkspaceSize + + global __cutensorCreatePlan + data["__cutensorCreatePlan"] = __cutensorCreatePlan 
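
In the new `cutensor_linux.pyx` above, `_check_or_init_cutensor` resolves every cuTENSOR entry point at most once, under a lock: it first asks the already-loaded process image via `dlsym(RTLD_DEFAULT, ...)`, and only if that fails loads the library through `cuda.pathfinder.load_nvidia_dynamic_lib("cutensor")` and retries against that handle. `_inspect_function_pointers` then snapshots the resolved addresses into a dict, and each thin `nogil` wrapper further below raises `FunctionNotFoundError` when its pointer is still NULL at call time. A rough Python analogue of the two-step lookup is sketched below; the helper name and soname are assumptions for illustration only.

```python
# Rough Python analogue (Linux) of the two-step symbol lookup: try the
# process image first, then fall back to loading the library on demand.
import ctypes

_process = ctypes.CDLL(None)   # roughly dlsym(RTLD_DEFAULT, ...)
_library = None                # loaded lazily, at most once

def find_symbol(name, soname="libcutensor.so.2"):
    """Return the address of `name`, or 0 if it cannot be resolved."""
    global _library
    try:
        return ctypes.cast(getattr(_process, name), ctypes.c_void_p).value
    except AttributeError:
        pass                   # not in the process image yet
    if _library is None:
        _library = ctypes.CDLL(soname, mode=ctypes.RTLD_GLOBAL)  # assumed soname
    try:
        return ctypes.cast(getattr(_library, name), ctypes.c_void_p).value
    except AttributeError:
        return 0               # caller can then report the missing function

addr = find_symbol("cutensorGetVersion")
```

The generated Cython code keeps the resolved addresses as raw `dlsym` pointers rather than `ctypes` objects so that the wrapper functions can invoke them without holding the GIL.
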
+ + global __cutensorDestroyPlan + data["__cutensorDestroyPlan"] = __cutensorDestroyPlan + + global __cutensorContract + data["__cutensorContract"] = __cutensorContract + + global __cutensorCreateReduction + data["__cutensorCreateReduction"] = __cutensorCreateReduction + + global __cutensorReduce + data["__cutensorReduce"] = __cutensorReduce + + global __cutensorCreateContractionTrinary + data["__cutensorCreateContractionTrinary"] = __cutensorCreateContractionTrinary + + global __cutensorContractTrinary + data["__cutensorContractTrinary"] = __cutensorContractTrinary + + global __cutensorCreateBlockSparseTensorDescriptor + data["__cutensorCreateBlockSparseTensorDescriptor"] = __cutensorCreateBlockSparseTensorDescriptor + + global __cutensorDestroyBlockSparseTensorDescriptor + data["__cutensorDestroyBlockSparseTensorDescriptor"] = __cutensorDestroyBlockSparseTensorDescriptor + + global __cutensorCreateBlockSparseContraction + data["__cutensorCreateBlockSparseContraction"] = __cutensorCreateBlockSparseContraction + + global __cutensorBlockSparseContract + data["__cutensorBlockSparseContract"] = __cutensorBlockSparseContract + + global __cutensorGetErrorString + data["__cutensorGetErrorString"] = __cutensorGetErrorString + + global __cutensorGetVersion + data["__cutensorGetVersion"] = __cutensorGetVersion + + global __cutensorGetCudartVersion + data["__cutensorGetCudartVersion"] = __cutensorGetCudartVersion + + global __cutensorLoggerSetCallback + data["__cutensorLoggerSetCallback"] = __cutensorLoggerSetCallback + + global __cutensorLoggerSetFile + data["__cutensorLoggerSetFile"] = __cutensorLoggerSetFile + + global __cutensorLoggerOpenFile + data["__cutensorLoggerOpenFile"] = __cutensorLoggerOpenFile + + global __cutensorLoggerSetLevel + data["__cutensorLoggerSetLevel"] = __cutensorLoggerSetLevel + + global __cutensorLoggerSetMask + data["__cutensorLoggerSetMask"] = __cutensorLoggerSetMask + + global __cutensorLoggerForceDisable + data["__cutensorLoggerForceDisable"] = __cutensorLoggerForceDisable + + func_ptrs = data + return data + + +cpdef _inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cutensorStatus_t _cutensorCreate(cutensorHandle_t* handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreate + _check_or_init_cutensor() + if __cutensorCreate == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreate is not found") + return (__cutensorCreate)( + handle) + + +cdef cutensorStatus_t _cutensorDestroy(cutensorHandle_t handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorDestroy + _check_or_init_cutensor() + if __cutensorDestroy == NULL: + with gil: + raise FunctionNotFoundError("function cutensorDestroy is not found") + return (__cutensorDestroy)( + handle) + + +cdef cutensorStatus_t _cutensorHandleResizePlanCache(cutensorHandle_t handle, const uint32_t numEntries) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorHandleResizePlanCache + _check_or_init_cutensor() + if __cutensorHandleResizePlanCache == NULL: + with gil: + raise FunctionNotFoundError("function cutensorHandleResizePlanCache is not found") + return (__cutensorHandleResizePlanCache)( + handle, numEntries) + + +cdef cutensorStatus_t 
_cutensorHandleWritePlanCacheToFile(const cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorHandleWritePlanCacheToFile + _check_or_init_cutensor() + if __cutensorHandleWritePlanCacheToFile == NULL: + with gil: + raise FunctionNotFoundError("function cutensorHandleWritePlanCacheToFile is not found") + return (__cutensorHandleWritePlanCacheToFile)( + handle, filename) + + +cdef cutensorStatus_t _cutensorHandleReadPlanCacheFromFile(cutensorHandle_t handle, const char filename[], uint32_t* numCachelinesRead) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorHandleReadPlanCacheFromFile + _check_or_init_cutensor() + if __cutensorHandleReadPlanCacheFromFile == NULL: + with gil: + raise FunctionNotFoundError("function cutensorHandleReadPlanCacheFromFile is not found") + return (__cutensorHandleReadPlanCacheFromFile)( + handle, filename, numCachelinesRead) + + +cdef cutensorStatus_t _cutensorWriteKernelCacheToFile(const cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorWriteKernelCacheToFile + _check_or_init_cutensor() + if __cutensorWriteKernelCacheToFile == NULL: + with gil: + raise FunctionNotFoundError("function cutensorWriteKernelCacheToFile is not found") + return (__cutensorWriteKernelCacheToFile)( + handle, filename) + + +cdef cutensorStatus_t _cutensorReadKernelCacheFromFile(cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorReadKernelCacheFromFile + _check_or_init_cutensor() + if __cutensorReadKernelCacheFromFile == NULL: + with gil: + raise FunctionNotFoundError("function cutensorReadKernelCacheFromFile is not found") + return (__cutensorReadKernelCacheFromFile)( + handle, filename) + + +cdef cutensorStatus_t _cutensorCreateTensorDescriptor(const cutensorHandle_t handle, cutensorTensorDescriptor_t* desc, const uint32_t numModes, const int64_t extent[], const int64_t stride[], cudaDataType_t dataType, uint32_t alignmentRequirement) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreateTensorDescriptor + _check_or_init_cutensor() + if __cutensorCreateTensorDescriptor == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateTensorDescriptor is not found") + return (__cutensorCreateTensorDescriptor)( + handle, desc, numModes, extent, stride, dataType, alignmentRequirement) + + +cdef cutensorStatus_t _cutensorDestroyTensorDescriptor(cutensorTensorDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorDestroyTensorDescriptor + _check_or_init_cutensor() + if __cutensorDestroyTensorDescriptor == NULL: + with gil: + raise FunctionNotFoundError("function cutensorDestroyTensorDescriptor is not found") + return (__cutensorDestroyTensorDescriptor)( + desc) + + +cdef cutensorStatus_t _cutensorCreateElementwiseTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAB, cutensorOperator_t opABC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global 
__cutensorCreateElementwiseTrinary + _check_or_init_cutensor() + if __cutensorCreateElementwiseTrinary == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateElementwiseTrinary is not found") + return (__cutensorCreateElementwiseTrinary)( + handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, opAB, opABC, descCompute) + + +cdef cutensorStatus_t _cutensorElementwiseTrinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* beta, const void* B, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorElementwiseTrinaryExecute + _check_or_init_cutensor() + if __cutensorElementwiseTrinaryExecute == NULL: + with gil: + raise FunctionNotFoundError("function cutensorElementwiseTrinaryExecute is not found") + return (__cutensorElementwiseTrinaryExecute)( + handle, plan, alpha, A, beta, B, gamma, C, D, stream) + + +cdef cutensorStatus_t _cutensorCreateElementwiseBinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreateElementwiseBinary + _check_or_init_cutensor() + if __cutensorCreateElementwiseBinary == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateElementwiseBinary is not found") + return (__cutensorCreateElementwiseBinary)( + handle, desc, descA, modeA, opA, descC, modeC, opC, descD, modeD, opAC, descCompute) + + +cdef cutensorStatus_t _cutensorElementwiseBinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorElementwiseBinaryExecute + _check_or_init_cutensor() + if __cutensorElementwiseBinaryExecute == NULL: + with gil: + raise FunctionNotFoundError("function cutensorElementwiseBinaryExecute is not found") + return (__cutensorElementwiseBinaryExecute)( + handle, plan, alpha, A, gamma, C, D, stream) + + +cdef cutensorStatus_t _cutensorCreatePermutation(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreatePermutation + _check_or_init_cutensor() + if __cutensorCreatePermutation == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreatePermutation is not found") + return (__cutensorCreatePermutation)( + handle, desc, descA, modeA, opA, descB, modeB, descCompute) + + +cdef cutensorStatus_t _cutensorPermute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, void* B, const cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorPermute + _check_or_init_cutensor() + if __cutensorPermute == NULL: + with gil: + raise FunctionNotFoundError("function cutensorPermute is not found") + return (__cutensorPermute)( + handle, plan, 
alpha, A, B, stream) + + +cdef cutensorStatus_t _cutensorCreateContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreateContraction + _check_or_init_cutensor() + if __cutensorCreateContraction == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateContraction is not found") + return (__cutensorCreateContraction)( + handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, descCompute) + + +cdef cutensorStatus_t _cutensorDestroyOperationDescriptor(cutensorOperationDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorDestroyOperationDescriptor + _check_or_init_cutensor() + if __cutensorDestroyOperationDescriptor == NULL: + with gil: + raise FunctionNotFoundError("function cutensorDestroyOperationDescriptor is not found") + return (__cutensorDestroyOperationDescriptor)( + desc) + + +cdef cutensorStatus_t _cutensorOperationDescriptorSetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorOperationDescriptorSetAttribute + _check_or_init_cutensor() + if __cutensorOperationDescriptorSetAttribute == NULL: + with gil: + raise FunctionNotFoundError("function cutensorOperationDescriptorSetAttribute is not found") + return (__cutensorOperationDescriptorSetAttribute)( + handle, desc, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t _cutensorOperationDescriptorGetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorOperationDescriptorGetAttribute + _check_or_init_cutensor() + if __cutensorOperationDescriptorGetAttribute == NULL: + with gil: + raise FunctionNotFoundError("function cutensorOperationDescriptorGetAttribute is not found") + return (__cutensorOperationDescriptorGetAttribute)( + handle, desc, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t _cutensorCreatePlanPreference(const cutensorHandle_t handle, cutensorPlanPreference_t* pref, cutensorAlgo_t algo, cutensorJitMode_t jitMode) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreatePlanPreference + _check_or_init_cutensor() + if __cutensorCreatePlanPreference == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreatePlanPreference is not found") + return (__cutensorCreatePlanPreference)( + handle, pref, algo, jitMode) + + +cdef cutensorStatus_t _cutensorDestroyPlanPreference(cutensorPlanPreference_t pref) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorDestroyPlanPreference + _check_or_init_cutensor() + if __cutensorDestroyPlanPreference == NULL: + with gil: + raise FunctionNotFoundError("function cutensorDestroyPlanPreference is not found") + return (__cutensorDestroyPlanPreference)( + pref) + + +cdef cutensorStatus_t _cutensorPlanPreferenceSetAttribute(const 
cutensorHandle_t handle, cutensorPlanPreference_t pref, cutensorPlanPreferenceAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorPlanPreferenceSetAttribute + _check_or_init_cutensor() + if __cutensorPlanPreferenceSetAttribute == NULL: + with gil: + raise FunctionNotFoundError("function cutensorPlanPreferenceSetAttribute is not found") + return (__cutensorPlanPreferenceSetAttribute)( + handle, pref, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t _cutensorPlanGetAttribute(const cutensorHandle_t handle, const cutensorPlan_t plan, cutensorPlanAttribute_t attr, void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorPlanGetAttribute + _check_or_init_cutensor() + if __cutensorPlanGetAttribute == NULL: + with gil: + raise FunctionNotFoundError("function cutensorPlanGetAttribute is not found") + return (__cutensorPlanGetAttribute)( + handle, plan, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t _cutensorEstimateWorkspaceSize(const cutensorHandle_t handle, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t planPref, const cutensorWorksizePreference_t workspacePref, uint64_t* workspaceSizeEstimate) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorEstimateWorkspaceSize + _check_or_init_cutensor() + if __cutensorEstimateWorkspaceSize == NULL: + with gil: + raise FunctionNotFoundError("function cutensorEstimateWorkspaceSize is not found") + return (__cutensorEstimateWorkspaceSize)( + handle, desc, planPref, workspacePref, workspaceSizeEstimate) + + +cdef cutensorStatus_t _cutensorCreatePlan(const cutensorHandle_t handle, cutensorPlan_t* plan, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t pref, uint64_t workspaceSizeLimit) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreatePlan + _check_or_init_cutensor() + if __cutensorCreatePlan == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreatePlan is not found") + return (__cutensorCreatePlan)( + handle, plan, desc, pref, workspaceSizeLimit) + + +cdef cutensorStatus_t _cutensorDestroyPlan(cutensorPlan_t plan) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorDestroyPlan + _check_or_init_cutensor() + if __cutensorDestroyPlan == NULL: + with gil: + raise FunctionNotFoundError("function cutensorDestroyPlan is not found") + return (__cutensorDestroyPlan)( + plan) + + +cdef cutensorStatus_t _cutensorContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorContract + _check_or_init_cutensor() + if __cutensorContract == NULL: + with gil: + raise FunctionNotFoundError("function cutensorContract is not found") + return (__cutensorContract)( + handle, plan, alpha, A, B, beta, C, D, workspace, workspaceSize, stream) + + +cdef cutensorStatus_t _cutensorCreateReduction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opReduce, const cutensorComputeDescriptor_t descCompute) 
except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreateReduction + _check_or_init_cutensor() + if __cutensorCreateReduction == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateReduction is not found") + return (__cutensorCreateReduction)( + handle, desc, descA, modeA, opA, descC, modeC, opC, descD, modeD, opReduce, descCompute) + + +cdef cutensorStatus_t _cutensorReduce(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorReduce + _check_or_init_cutensor() + if __cutensorReduce == NULL: + with gil: + raise FunctionNotFoundError("function cutensorReduce is not found") + return (__cutensorReduce)( + handle, plan, alpha, A, beta, C, D, workspace, workspaceSize, stream) + + +cdef cutensorStatus_t _cutensorCreateContractionTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opD, const cutensorTensorDescriptor_t descE, const int32_t modeE[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreateContractionTrinary + _check_or_init_cutensor() + if __cutensorCreateContractionTrinary == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateContractionTrinary is not found") + return (__cutensorCreateContractionTrinary)( + handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, opD, descE, modeE, descCompute) + + +cdef cutensorStatus_t _cutensorContractTrinary(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* C, const void* beta, const void* D, void* E, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorContractTrinary + _check_or_init_cutensor() + if __cutensorContractTrinary == NULL: + with gil: + raise FunctionNotFoundError("function cutensorContractTrinary is not found") + return (__cutensorContractTrinary)( + handle, plan, alpha, A, B, C, beta, D, E, workspace, workspaceSize, stream) + + +cdef cutensorStatus_t _cutensorCreateBlockSparseTensorDescriptor(cutensorHandle_t handle, cutensorBlockSparseTensorDescriptor_t* desc, const uint32_t numModes, const uint64_t numNonZeroBlocks, const uint32_t numSectionsPerMode[], const int64_t extent[], const int32_t nonZeroCoordinates[], const int64_t stride[], cudaDataType_t dataType) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreateBlockSparseTensorDescriptor + _check_or_init_cutensor() + if __cutensorCreateBlockSparseTensorDescriptor == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateBlockSparseTensorDescriptor is not found") + return (__cutensorCreateBlockSparseTensorDescriptor)( + handle, desc, numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, stride, dataType) + + +cdef cutensorStatus_t _cutensorDestroyBlockSparseTensorDescriptor(cutensorBlockSparseTensorDescriptor_t desc) 
except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorDestroyBlockSparseTensorDescriptor + _check_or_init_cutensor() + if __cutensorDestroyBlockSparseTensorDescriptor == NULL: + with gil: + raise FunctionNotFoundError("function cutensorDestroyBlockSparseTensorDescriptor is not found") + return (__cutensorDestroyBlockSparseTensorDescriptor)( + desc) + + +cdef cutensorStatus_t _cutensorCreateBlockSparseContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorBlockSparseTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorBlockSparseTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorBlockSparseTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorBlockSparseTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorCreateBlockSparseContraction + _check_or_init_cutensor() + if __cutensorCreateBlockSparseContraction == NULL: + with gil: + raise FunctionNotFoundError("function cutensorCreateBlockSparseContraction is not found") + return (__cutensorCreateBlockSparseContraction)( + handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, descCompute) + + +cdef cutensorStatus_t _cutensorBlockSparseContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* const A[], const void* const B[], const void* beta, const void* const C[], void* const D[], void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorBlockSparseContract + _check_or_init_cutensor() + if __cutensorBlockSparseContract == NULL: + with gil: + raise FunctionNotFoundError("function cutensorBlockSparseContract is not found") + return (__cutensorBlockSparseContract)( + handle, plan, alpha, A, B, beta, C, D, workspace, workspaceSize, stream) + + +cdef const char* _cutensorGetErrorString(const cutensorStatus_t error) except?NULL nogil: + global __cutensorGetErrorString + _check_or_init_cutensor() + if __cutensorGetErrorString == NULL: + with gil: + raise FunctionNotFoundError("function cutensorGetErrorString is not found") + return (__cutensorGetErrorString)( + error) + + +cdef size_t _cutensorGetVersion() except?0 nogil: + global __cutensorGetVersion + _check_or_init_cutensor() + if __cutensorGetVersion == NULL: + with gil: + raise FunctionNotFoundError("function cutensorGetVersion is not found") + return (__cutensorGetVersion)( + ) + + +cdef size_t _cutensorGetCudartVersion() except?0 nogil: + global __cutensorGetCudartVersion + _check_or_init_cutensor() + if __cutensorGetCudartVersion == NULL: + with gil: + raise FunctionNotFoundError("function cutensorGetCudartVersion is not found") + return (__cutensorGetCudartVersion)( + ) + + +cdef cutensorStatus_t _cutensorLoggerSetCallback(cutensorLoggerCallback_t callback) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorLoggerSetCallback + _check_or_init_cutensor() + if __cutensorLoggerSetCallback == NULL: + with gil: + raise FunctionNotFoundError("function cutensorLoggerSetCallback is not found") + return (__cutensorLoggerSetCallback)( + callback) + + +cdef cutensorStatus_t _cutensorLoggerSetFile(FILE* file) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cutensorLoggerSetFile + _check_or_init_cutensor() + if __cutensorLoggerSetFile == NULL: + 
        with gil:
+            raise FunctionNotFoundError("function cutensorLoggerSetFile is not found")
+    return (__cutensorLoggerSetFile)(
+        file)
+
+
+cdef cutensorStatus_t _cutensorLoggerOpenFile(const char* logFile) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil:
+    global __cutensorLoggerOpenFile
+    _check_or_init_cutensor()
+    if __cutensorLoggerOpenFile == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cutensorLoggerOpenFile is not found")
+    return (__cutensorLoggerOpenFile)(
+        logFile)
+
+
+cdef cutensorStatus_t _cutensorLoggerSetLevel(int32_t level) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil:
+    global __cutensorLoggerSetLevel
+    _check_or_init_cutensor()
+    if __cutensorLoggerSetLevel == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cutensorLoggerSetLevel is not found")
+    return (__cutensorLoggerSetLevel)(
+        level)
+
+
+cdef cutensorStatus_t _cutensorLoggerSetMask(int32_t mask) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil:
+    global __cutensorLoggerSetMask
+    _check_or_init_cutensor()
+    if __cutensorLoggerSetMask == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cutensorLoggerSetMask is not found")
+    return (__cutensorLoggerSetMask)(
+        mask)
+
+
+cdef cutensorStatus_t _cutensorLoggerForceDisable() except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil:
+    global __cutensorLoggerForceDisable
+    _check_or_init_cutensor()
+    if __cutensorLoggerForceDisable == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cutensorLoggerForceDisable is not found")
+    return (__cutensorLoggerForceDisable)(
+        )
diff --git a/nvmath/bindings/_internal/mathdx.pxd b/nvmath/bindings/_internal/mathdx.pxd
index 595fc66..a9c6ada 100644
--- a/nvmath/bindings/_internal/mathdx.pxd
+++ b/nvmath/bindings/_internal/mathdx.pxd
@@ -1,4 +1,4 @@
-# This code was automatically generated with version 0.2.3. Do not modify it directly.
+# This code was automatically generated across versions from 0.2.3 to 0.3.0. Do not modify it directly.
from ..cymathdx cimport * @@ -27,11 +27,11 @@ cdef commondxStatusType _cublasdxSetOperatorInt64(cublasdxDescriptor handle, cub cdef commondxStatusType _cublasdxSetOperatorInt64s(cublasdxDescriptor handle, cublasdxOperatorType op, size_t count, const long long int* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxBindTensor(cublasdxDescriptor handle, cublasdxTensorType tensor_type, cublasdxTensor* tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxSetTensorOptionInt64(cublasdxTensor tensor, cublasdxTensorOption option, long long int value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil -cdef commondxStatusType _cublasdxFinalizeTensors(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType _cublasdxFinalizeTensorsNew(size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxGetTensorTraitInt64(cublasdxTensor tensor, cublasdxTensorTrait trait, long long int* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxGetTensorTraitStrSize(cublasdxTensor tensor, cublasdxTensorTrait trait, size_t* size) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxGetTensorTraitStr(cublasdxTensor tensor, cublasdxTensorTrait trait, size_t size, char* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil -cdef commondxStatusType _cublasdxBindDeviceFunction(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType _cublasdxCreateDeviceFunctionOld(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxFinalizeDeviceFunctions(commondxCode code, size_t count, const cublasdxDeviceFunction* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxGetDeviceFunctionTraitStrSize(cublasdxDeviceFunction device_function, cublasdxDeviceFunctionTrait trait, size_t* size) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType _cublasdxGetDeviceFunctionTraitStr(cublasdxDeviceFunction device_function, cublasdxDeviceFunctionTrait trait, size_t size, char* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil @@ -77,3 +77,10 @@ cdef commondxStatusType _cusolverdxFinalizeCode(commondxCode code, cusolverdxDes cdef commondxStatusType _cusolverdxDestroyDescriptor(cusolverdxDescriptor handle) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef const char* _cusolverdxOperatorTypeToStr(cusolverdxOperatorType op) except?NULL nogil cdef const char* _cusolverdxTraitTypeToStr(cusolverdxTraitType trait) except?NULL nogil +cdef commondxStatusType _cublasdxCreateTensorNew(cublasdxDescriptor handle, cublasdxTensorType tensor_type, cublasdxTensor* tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType _cublasdxMakeTensorLike(cublasdxTensor input, commondxValueType value_type, cublasdxTensor* output) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType 
_cublasdxDestroyTensorNew(cublasdxTensor tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType _cublasdxCreateDeviceFunctionNew(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType _cublasdxDestroyDeviceFunctionNew(cublasdxDeviceFunction device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil + +cdef commondxStatusType _cublasdxFinalizeTensors203(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/_internal/mathdx_linux.pyx b/nvmath/bindings/_internal/mathdx_linux.pyx index 4663cf2..0a739a5 100644 --- a/nvmath/bindings/_internal/mathdx_linux.pyx +++ b/nvmath/bindings/_internal/mathdx_linux.pyx @@ -1,7 +1,9 @@ -# This code was automatically generated with version 0.2.3. Do not modify it directly. +# This code was automatically generated across versions from 0.2.3 to 0.3.0. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib @@ -11,6 +13,8 @@ from cuda.pathfinder import load_nvidia_dynamic_lib # Extern ############################################################################### +# You must 'from .utils import NotSupportedError' before using this template + cdef extern from "" nogil: void* dlopen(const char*, int) char* dlerror() @@ -25,13 +29,32 @@ cdef extern from "" nogil: const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_mathdx_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __commondxCreateCode = NULL cdef void* __commondxSetCodeOptionInt64 = NULL @@ -103,6 +126,11 @@ cdef void* __cusolverdxFinalizeCode = NULL cdef void* __cusolverdxDestroyDescriptor = NULL cdef void* __cusolverdxOperatorTypeToStr = NULL cdef void* __cusolverdxTraitTypeToStr = NULL +cdef void* __cublasdxCreateTensor = NULL +cdef void* __cublasdxMakeTensorLike = NULL +cdef void* __cublasdxDestroyTensor = NULL +cdef void* __cublasdxCreateDeviceFunction = NULL +cdef void* __cublasdxDestroyDeviceFunction = NULL cdef void* load_library(const int driver_ver) except* with gil: @@ -116,520 +144,539 @@ cdef int _check_or_init_mathdx() except -1 nogil: if __py_mathdx_init: return 0 - # Load driver to check version cdef void* handle = NULL - handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) - if handle == NULL: - with gil: - err_msg = dlerror() - raise NotSupportedError(f'CUDA driver is 
not found ({err_msg.decode()})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") - if __cuDriverGetVersion == NULL: - with gil: - raise RuntimeError('something went wrong') - cdef int err, driver_ver - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - with gil: - raise RuntimeError('something went wrong') - #dlclose(handle) - handle = NULL - - # Load function - global __commondxCreateCode - __commondxCreateCode = dlsym(RTLD_DEFAULT, 'commondxCreateCode') - if __commondxCreateCode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxCreateCode = dlsym(handle, 'commondxCreateCode') - - global __commondxSetCodeOptionInt64 - __commondxSetCodeOptionInt64 = dlsym(RTLD_DEFAULT, 'commondxSetCodeOptionInt64') - if __commondxSetCodeOptionInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxSetCodeOptionInt64 = dlsym(handle, 'commondxSetCodeOptionInt64') - - global __commondxSetCodeOptionStr - __commondxSetCodeOptionStr = dlsym(RTLD_DEFAULT, 'commondxSetCodeOptionStr') - if __commondxSetCodeOptionStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxSetCodeOptionStr = dlsym(handle, 'commondxSetCodeOptionStr') - - global __commondxGetCodeOptionInt64 - __commondxGetCodeOptionInt64 = dlsym(RTLD_DEFAULT, 'commondxGetCodeOptionInt64') - if __commondxGetCodeOptionInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxGetCodeOptionInt64 = dlsym(handle, 'commondxGetCodeOptionInt64') - - global __commondxGetCodeOptionsInt64s - __commondxGetCodeOptionsInt64s = dlsym(RTLD_DEFAULT, 'commondxGetCodeOptionsInt64s') - if __commondxGetCodeOptionsInt64s == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxGetCodeOptionsInt64s = dlsym(handle, 'commondxGetCodeOptionsInt64s') - - global __commondxGetCodeLTOIRSize - __commondxGetCodeLTOIRSize = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIRSize') - if __commondxGetCodeLTOIRSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxGetCodeLTOIRSize = dlsym(handle, 'commondxGetCodeLTOIRSize') - - global __commondxGetCodeLTOIR - __commondxGetCodeLTOIR = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIR') - if __commondxGetCodeLTOIR == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxGetCodeLTOIR = dlsym(handle, 'commondxGetCodeLTOIR') - - global __commondxGetCodeNumLTOIRs - __commondxGetCodeNumLTOIRs = dlsym(RTLD_DEFAULT, 'commondxGetCodeNumLTOIRs') - if __commondxGetCodeNumLTOIRs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxGetCodeNumLTOIRs = dlsym(handle, 'commondxGetCodeNumLTOIRs') - - global __commondxGetCodeLTOIRSizes - __commondxGetCodeLTOIRSizes = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIRSizes') - if __commondxGetCodeLTOIRSizes == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxGetCodeLTOIRSizes = dlsym(handle, 'commondxGetCodeLTOIRSizes') - - global __commondxGetCodeLTOIRs - __commondxGetCodeLTOIRs = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIRs') - if __commondxGetCodeLTOIRs == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxGetCodeLTOIRs = dlsym(handle, 'commondxGetCodeLTOIRs') - - global __commondxDestroyCode - __commondxDestroyCode = dlsym(RTLD_DEFAULT, 'commondxDestroyCode') - if __commondxDestroyCode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxDestroyCode = dlsym(handle, 
'commondxDestroyCode') - - global __commondxStatusToStr - __commondxStatusToStr = dlsym(RTLD_DEFAULT, 'commondxStatusToStr') - if __commondxStatusToStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __commondxStatusToStr = dlsym(handle, 'commondxStatusToStr') - - global __mathdxGetVersion - __mathdxGetVersion = dlsym(RTLD_DEFAULT, 'mathdxGetVersion') - if __mathdxGetVersion == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __mathdxGetVersion = dlsym(handle, 'mathdxGetVersion') - - global __mathdxGetVersionEx - __mathdxGetVersionEx = dlsym(RTLD_DEFAULT, 'mathdxGetVersionEx') - if __mathdxGetVersionEx == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __mathdxGetVersionEx = dlsym(handle, 'mathdxGetVersionEx') - - global __cublasdxCreateDescriptor - __cublasdxCreateDescriptor = dlsym(RTLD_DEFAULT, 'cublasdxCreateDescriptor') - if __cublasdxCreateDescriptor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxCreateDescriptor = dlsym(handle, 'cublasdxCreateDescriptor') - - global __cublasdxSetOptionStr - __cublasdxSetOptionStr = dlsym(RTLD_DEFAULT, 'cublasdxSetOptionStr') - if __cublasdxSetOptionStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxSetOptionStr = dlsym(handle, 'cublasdxSetOptionStr') - - global __cublasdxSetOperatorInt64 - __cublasdxSetOperatorInt64 = dlsym(RTLD_DEFAULT, 'cublasdxSetOperatorInt64') - if __cublasdxSetOperatorInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxSetOperatorInt64 = dlsym(handle, 'cublasdxSetOperatorInt64') - - global __cublasdxSetOperatorInt64s - __cublasdxSetOperatorInt64s = dlsym(RTLD_DEFAULT, 'cublasdxSetOperatorInt64s') - if __cublasdxSetOperatorInt64s == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxSetOperatorInt64s = dlsym(handle, 'cublasdxSetOperatorInt64s') - - global __cublasdxBindTensor - __cublasdxBindTensor = dlsym(RTLD_DEFAULT, 'cublasdxBindTensor') - if __cublasdxBindTensor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxBindTensor = dlsym(handle, 'cublasdxBindTensor') - - global __cublasdxSetTensorOptionInt64 - __cublasdxSetTensorOptionInt64 = dlsym(RTLD_DEFAULT, 'cublasdxSetTensorOptionInt64') - if __cublasdxSetTensorOptionInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxSetTensorOptionInt64 = dlsym(handle, 'cublasdxSetTensorOptionInt64') - - global __cublasdxFinalizeTensors - __cublasdxFinalizeTensors = dlsym(RTLD_DEFAULT, 'cublasdxFinalizeTensors') - if __cublasdxFinalizeTensors == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxFinalizeTensors = dlsym(handle, 'cublasdxFinalizeTensors') - - global __cublasdxGetTensorTraitInt64 - __cublasdxGetTensorTraitInt64 = dlsym(RTLD_DEFAULT, 'cublasdxGetTensorTraitInt64') - if __cublasdxGetTensorTraitInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetTensorTraitInt64 = dlsym(handle, 'cublasdxGetTensorTraitInt64') - - global __cublasdxGetTensorTraitStrSize - __cublasdxGetTensorTraitStrSize = dlsym(RTLD_DEFAULT, 'cublasdxGetTensorTraitStrSize') - if __cublasdxGetTensorTraitStrSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetTensorTraitStrSize = dlsym(handle, 'cublasdxGetTensorTraitStrSize') - - global __cublasdxGetTensorTraitStr - __cublasdxGetTensorTraitStr = dlsym(RTLD_DEFAULT, 'cublasdxGetTensorTraitStr') - if __cublasdxGetTensorTraitStr == NULL: - if handle == 
NULL: - handle = load_library(driver_ver) - __cublasdxGetTensorTraitStr = dlsym(handle, 'cublasdxGetTensorTraitStr') - - global __cublasdxBindDeviceFunction - __cublasdxBindDeviceFunction = dlsym(RTLD_DEFAULT, 'cublasdxBindDeviceFunction') - if __cublasdxBindDeviceFunction == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxBindDeviceFunction = dlsym(handle, 'cublasdxBindDeviceFunction') - - global __cublasdxFinalizeDeviceFunctions - __cublasdxFinalizeDeviceFunctions = dlsym(RTLD_DEFAULT, 'cublasdxFinalizeDeviceFunctions') - if __cublasdxFinalizeDeviceFunctions == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxFinalizeDeviceFunctions = dlsym(handle, 'cublasdxFinalizeDeviceFunctions') - - global __cublasdxGetDeviceFunctionTraitStrSize - __cublasdxGetDeviceFunctionTraitStrSize = dlsym(RTLD_DEFAULT, 'cublasdxGetDeviceFunctionTraitStrSize') - if __cublasdxGetDeviceFunctionTraitStrSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetDeviceFunctionTraitStrSize = dlsym(handle, 'cublasdxGetDeviceFunctionTraitStrSize') - - global __cublasdxGetDeviceFunctionTraitStr - __cublasdxGetDeviceFunctionTraitStr = dlsym(RTLD_DEFAULT, 'cublasdxGetDeviceFunctionTraitStr') - if __cublasdxGetDeviceFunctionTraitStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetDeviceFunctionTraitStr = dlsym(handle, 'cublasdxGetDeviceFunctionTraitStr') - - global __cublasdxGetLTOIRSize - __cublasdxGetLTOIRSize = dlsym(RTLD_DEFAULT, 'cublasdxGetLTOIRSize') - if __cublasdxGetLTOIRSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetLTOIRSize = dlsym(handle, 'cublasdxGetLTOIRSize') - - global __cublasdxGetLTOIR - __cublasdxGetLTOIR = dlsym(RTLD_DEFAULT, 'cublasdxGetLTOIR') - if __cublasdxGetLTOIR == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetLTOIR = dlsym(handle, 'cublasdxGetLTOIR') - - global __cublasdxGetTraitStrSize - __cublasdxGetTraitStrSize = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitStrSize') - if __cublasdxGetTraitStrSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetTraitStrSize = dlsym(handle, 'cublasdxGetTraitStrSize') - - global __cublasdxGetTraitStr - __cublasdxGetTraitStr = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitStr') - if __cublasdxGetTraitStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetTraitStr = dlsym(handle, 'cublasdxGetTraitStr') - - global __cublasdxGetTraitInt64 - __cublasdxGetTraitInt64 = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitInt64') - if __cublasdxGetTraitInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetTraitInt64 = dlsym(handle, 'cublasdxGetTraitInt64') - - global __cublasdxGetTraitInt64s - __cublasdxGetTraitInt64s = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitInt64s') - if __cublasdxGetTraitInt64s == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxGetTraitInt64s = dlsym(handle, 'cublasdxGetTraitInt64s') - - global __cublasdxOperatorTypeToStr - __cublasdxOperatorTypeToStr = dlsym(RTLD_DEFAULT, 'cublasdxOperatorTypeToStr') - if __cublasdxOperatorTypeToStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxOperatorTypeToStr = dlsym(handle, 'cublasdxOperatorTypeToStr') - - global __cublasdxTraitTypeToStr - __cublasdxTraitTypeToStr = dlsym(RTLD_DEFAULT, 'cublasdxTraitTypeToStr') - if __cublasdxTraitTypeToStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - 
__cublasdxTraitTypeToStr = dlsym(handle, 'cublasdxTraitTypeToStr') - - global __cublasdxFinalizeCode - __cublasdxFinalizeCode = dlsym(RTLD_DEFAULT, 'cublasdxFinalizeCode') - if __cublasdxFinalizeCode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxFinalizeCode = dlsym(handle, 'cublasdxFinalizeCode') - - global __cublasdxDestroyDescriptor - __cublasdxDestroyDescriptor = dlsym(RTLD_DEFAULT, 'cublasdxDestroyDescriptor') - if __cublasdxDestroyDescriptor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cublasdxDestroyDescriptor = dlsym(handle, 'cublasdxDestroyDescriptor') - - global __cufftdxCreateDescriptor - __cufftdxCreateDescriptor = dlsym(RTLD_DEFAULT, 'cufftdxCreateDescriptor') - if __cufftdxCreateDescriptor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxCreateDescriptor = dlsym(handle, 'cufftdxCreateDescriptor') - - global __cufftdxSetOptionStr - __cufftdxSetOptionStr = dlsym(RTLD_DEFAULT, 'cufftdxSetOptionStr') - if __cufftdxSetOptionStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxSetOptionStr = dlsym(handle, 'cufftdxSetOptionStr') - - global __cufftdxGetKnobInt64Size - __cufftdxGetKnobInt64Size = dlsym(RTLD_DEFAULT, 'cufftdxGetKnobInt64Size') - if __cufftdxGetKnobInt64Size == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetKnobInt64Size = dlsym(handle, 'cufftdxGetKnobInt64Size') - - global __cufftdxGetKnobInt64s - __cufftdxGetKnobInt64s = dlsym(RTLD_DEFAULT, 'cufftdxGetKnobInt64s') - if __cufftdxGetKnobInt64s == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetKnobInt64s = dlsym(handle, 'cufftdxGetKnobInt64s') - - global __cufftdxSetOperatorInt64 - __cufftdxSetOperatorInt64 = dlsym(RTLD_DEFAULT, 'cufftdxSetOperatorInt64') - if __cufftdxSetOperatorInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxSetOperatorInt64 = dlsym(handle, 'cufftdxSetOperatorInt64') - - global __cufftdxSetOperatorInt64s - __cufftdxSetOperatorInt64s = dlsym(RTLD_DEFAULT, 'cufftdxSetOperatorInt64s') - if __cufftdxSetOperatorInt64s == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxSetOperatorInt64s = dlsym(handle, 'cufftdxSetOperatorInt64s') - global __cufftdxGetLTOIRSize - __cufftdxGetLTOIRSize = dlsym(RTLD_DEFAULT, 'cufftdxGetLTOIRSize') - if __cufftdxGetLTOIRSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetLTOIRSize = dlsym(handle, 'cufftdxGetLTOIRSize') - - global __cufftdxGetLTOIR - __cufftdxGetLTOIR = dlsym(RTLD_DEFAULT, 'cufftdxGetLTOIR') - if __cufftdxGetLTOIR == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetLTOIR = dlsym(handle, 'cufftdxGetLTOIR') - - global __cufftdxGetTraitStrSize - __cufftdxGetTraitStrSize = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitStrSize') - if __cufftdxGetTraitStrSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetTraitStrSize = dlsym(handle, 'cufftdxGetTraitStrSize') - - global __cufftdxGetTraitStr - __cufftdxGetTraitStr = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitStr') - if __cufftdxGetTraitStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetTraitStr = dlsym(handle, 'cufftdxGetTraitStr') - - global __cufftdxGetTraitInt64 - __cufftdxGetTraitInt64 = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitInt64') - if __cufftdxGetTraitInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetTraitInt64 = dlsym(handle, 
'cufftdxGetTraitInt64') - - global __cufftdxGetTraitInt64s - __cufftdxGetTraitInt64s = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitInt64s') - if __cufftdxGetTraitInt64s == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetTraitInt64s = dlsym(handle, 'cufftdxGetTraitInt64s') - - global __cufftdxGetTraitCommondxDataType - __cufftdxGetTraitCommondxDataType = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitCommondxDataType') - if __cufftdxGetTraitCommondxDataType == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxGetTraitCommondxDataType = dlsym(handle, 'cufftdxGetTraitCommondxDataType') - - global __cufftdxFinalizeCode - __cufftdxFinalizeCode = dlsym(RTLD_DEFAULT, 'cufftdxFinalizeCode') - if __cufftdxFinalizeCode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxFinalizeCode = dlsym(handle, 'cufftdxFinalizeCode') - - global __cufftdxDestroyDescriptor - __cufftdxDestroyDescriptor = dlsym(RTLD_DEFAULT, 'cufftdxDestroyDescriptor') - if __cufftdxDestroyDescriptor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxDestroyDescriptor = dlsym(handle, 'cufftdxDestroyDescriptor') - - global __cufftdxOperatorTypeToStr - __cufftdxOperatorTypeToStr = dlsym(RTLD_DEFAULT, 'cufftdxOperatorTypeToStr') - if __cufftdxOperatorTypeToStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxOperatorTypeToStr = dlsym(handle, 'cufftdxOperatorTypeToStr') - - global __cufftdxTraitTypeToStr - __cufftdxTraitTypeToStr = dlsym(RTLD_DEFAULT, 'cufftdxTraitTypeToStr') - if __cufftdxTraitTypeToStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cufftdxTraitTypeToStr = dlsym(handle, 'cufftdxTraitTypeToStr') - - global __cusolverdxCreateDescriptor - __cusolverdxCreateDescriptor = dlsym(RTLD_DEFAULT, 'cusolverdxCreateDescriptor') - if __cusolverdxCreateDescriptor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxCreateDescriptor = dlsym(handle, 'cusolverdxCreateDescriptor') - - global __cusolverdxSetOptionStr - __cusolverdxSetOptionStr = dlsym(RTLD_DEFAULT, 'cusolverdxSetOptionStr') - if __cusolverdxSetOptionStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxSetOptionStr = dlsym(handle, 'cusolverdxSetOptionStr') - - global __cusolverdxSetOperatorInt64 - __cusolverdxSetOperatorInt64 = dlsym(RTLD_DEFAULT, 'cusolverdxSetOperatorInt64') - if __cusolverdxSetOperatorInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxSetOperatorInt64 = dlsym(handle, 'cusolverdxSetOperatorInt64') - - global __cusolverdxSetOperatorInt64s - __cusolverdxSetOperatorInt64s = dlsym(RTLD_DEFAULT, 'cusolverdxSetOperatorInt64s') - if __cusolverdxSetOperatorInt64s == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxSetOperatorInt64s = dlsym(handle, 'cusolverdxSetOperatorInt64s') - - global __cusolverdxGetLTOIRSize - __cusolverdxGetLTOIRSize = dlsym(RTLD_DEFAULT, 'cusolverdxGetLTOIRSize') - if __cusolverdxGetLTOIRSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxGetLTOIRSize = dlsym(handle, 'cusolverdxGetLTOIRSize') - - global __cusolverdxGetLTOIR - __cusolverdxGetLTOIR = dlsym(RTLD_DEFAULT, 'cusolverdxGetLTOIR') - if __cusolverdxGetLTOIR == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxGetLTOIR = dlsym(handle, 'cusolverdxGetLTOIR') - - global __cusolverdxGetUniversalFATBINSize - __cusolverdxGetUniversalFATBINSize = dlsym(RTLD_DEFAULT, 
'cusolverdxGetUniversalFATBINSize') - if __cusolverdxGetUniversalFATBINSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxGetUniversalFATBINSize = dlsym(handle, 'cusolverdxGetUniversalFATBINSize') - - global __cusolverdxGetUniversalFATBIN - __cusolverdxGetUniversalFATBIN = dlsym(RTLD_DEFAULT, 'cusolverdxGetUniversalFATBIN') - if __cusolverdxGetUniversalFATBIN == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxGetUniversalFATBIN = dlsym(handle, 'cusolverdxGetUniversalFATBIN') - - global __cusolverdxGetTraitStrSize - __cusolverdxGetTraitStrSize = dlsym(RTLD_DEFAULT, 'cusolverdxGetTraitStrSize') - if __cusolverdxGetTraitStrSize == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxGetTraitStrSize = dlsym(handle, 'cusolverdxGetTraitStrSize') - - global __cusolverdxGetTraitStr - __cusolverdxGetTraitStr = dlsym(RTLD_DEFAULT, 'cusolverdxGetTraitStr') - if __cusolverdxGetTraitStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxGetTraitStr = dlsym(handle, 'cusolverdxGetTraitStr') - - global __cusolverdxGetTraitInt64 - __cusolverdxGetTraitInt64 = dlsym(RTLD_DEFAULT, 'cusolverdxGetTraitInt64') - if __cusolverdxGetTraitInt64 == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxGetTraitInt64 = dlsym(handle, 'cusolverdxGetTraitInt64') - - global __cusolverdxFinalizeCode - __cusolverdxFinalizeCode = dlsym(RTLD_DEFAULT, 'cusolverdxFinalizeCode') - if __cusolverdxFinalizeCode == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxFinalizeCode = dlsym(handle, 'cusolverdxFinalizeCode') - - global __cusolverdxDestroyDescriptor - __cusolverdxDestroyDescriptor = dlsym(RTLD_DEFAULT, 'cusolverdxDestroyDescriptor') - if __cusolverdxDestroyDescriptor == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxDestroyDescriptor = dlsym(handle, 'cusolverdxDestroyDescriptor') - - global __cusolverdxOperatorTypeToStr - __cusolverdxOperatorTypeToStr = dlsym(RTLD_DEFAULT, 'cusolverdxOperatorTypeToStr') - if __cusolverdxOperatorTypeToStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxOperatorTypeToStr = dlsym(handle, 'cusolverdxOperatorTypeToStr') - - global __cusolverdxTraitTypeToStr - __cusolverdxTraitTypeToStr = dlsym(RTLD_DEFAULT, 'cusolverdxTraitTypeToStr') - if __cusolverdxTraitTypeToStr == NULL: - if handle == NULL: - handle = load_library(driver_ver) - __cusolverdxTraitTypeToStr = dlsym(handle, 'cusolverdxTraitTypeToStr') - - __py_mathdx_init = True - return 0 + with gil, __symbol_lock: + driver_ver = get_cuda_version() + + # Load function + global __commondxCreateCode + __commondxCreateCode = dlsym(RTLD_DEFAULT, 'commondxCreateCode') + if __commondxCreateCode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxCreateCode = dlsym(handle, 'commondxCreateCode') + + global __commondxSetCodeOptionInt64 + __commondxSetCodeOptionInt64 = dlsym(RTLD_DEFAULT, 'commondxSetCodeOptionInt64') + if __commondxSetCodeOptionInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxSetCodeOptionInt64 = dlsym(handle, 'commondxSetCodeOptionInt64') + + global __commondxSetCodeOptionStr + __commondxSetCodeOptionStr = dlsym(RTLD_DEFAULT, 'commondxSetCodeOptionStr') + if __commondxSetCodeOptionStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxSetCodeOptionStr = dlsym(handle, 'commondxSetCodeOptionStr') + + global 
__commondxGetCodeOptionInt64 + __commondxGetCodeOptionInt64 = dlsym(RTLD_DEFAULT, 'commondxGetCodeOptionInt64') + if __commondxGetCodeOptionInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxGetCodeOptionInt64 = dlsym(handle, 'commondxGetCodeOptionInt64') + + global __commondxGetCodeOptionsInt64s + __commondxGetCodeOptionsInt64s = dlsym(RTLD_DEFAULT, 'commondxGetCodeOptionsInt64s') + if __commondxGetCodeOptionsInt64s == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxGetCodeOptionsInt64s = dlsym(handle, 'commondxGetCodeOptionsInt64s') + + global __commondxGetCodeLTOIRSize + __commondxGetCodeLTOIRSize = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIRSize') + if __commondxGetCodeLTOIRSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxGetCodeLTOIRSize = dlsym(handle, 'commondxGetCodeLTOIRSize') + + global __commondxGetCodeLTOIR + __commondxGetCodeLTOIR = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIR') + if __commondxGetCodeLTOIR == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxGetCodeLTOIR = dlsym(handle, 'commondxGetCodeLTOIR') + + global __commondxGetCodeNumLTOIRs + __commondxGetCodeNumLTOIRs = dlsym(RTLD_DEFAULT, 'commondxGetCodeNumLTOIRs') + if __commondxGetCodeNumLTOIRs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxGetCodeNumLTOIRs = dlsym(handle, 'commondxGetCodeNumLTOIRs') + + global __commondxGetCodeLTOIRSizes + __commondxGetCodeLTOIRSizes = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIRSizes') + if __commondxGetCodeLTOIRSizes == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxGetCodeLTOIRSizes = dlsym(handle, 'commondxGetCodeLTOIRSizes') + + global __commondxGetCodeLTOIRs + __commondxGetCodeLTOIRs = dlsym(RTLD_DEFAULT, 'commondxGetCodeLTOIRs') + if __commondxGetCodeLTOIRs == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxGetCodeLTOIRs = dlsym(handle, 'commondxGetCodeLTOIRs') + + global __commondxDestroyCode + __commondxDestroyCode = dlsym(RTLD_DEFAULT, 'commondxDestroyCode') + if __commondxDestroyCode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxDestroyCode = dlsym(handle, 'commondxDestroyCode') + + global __commondxStatusToStr + __commondxStatusToStr = dlsym(RTLD_DEFAULT, 'commondxStatusToStr') + if __commondxStatusToStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __commondxStatusToStr = dlsym(handle, 'commondxStatusToStr') + + global __mathdxGetVersion + __mathdxGetVersion = dlsym(RTLD_DEFAULT, 'mathdxGetVersion') + if __mathdxGetVersion == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __mathdxGetVersion = dlsym(handle, 'mathdxGetVersion') + + global __mathdxGetVersionEx + __mathdxGetVersionEx = dlsym(RTLD_DEFAULT, 'mathdxGetVersionEx') + if __mathdxGetVersionEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __mathdxGetVersionEx = dlsym(handle, 'mathdxGetVersionEx') + + global __cublasdxCreateDescriptor + __cublasdxCreateDescriptor = dlsym(RTLD_DEFAULT, 'cublasdxCreateDescriptor') + if __cublasdxCreateDescriptor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxCreateDescriptor = dlsym(handle, 'cublasdxCreateDescriptor') + + global __cublasdxSetOptionStr + __cublasdxSetOptionStr = dlsym(RTLD_DEFAULT, 'cublasdxSetOptionStr') + if __cublasdxSetOptionStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxSetOptionStr = dlsym(handle, 
'cublasdxSetOptionStr') + + global __cublasdxSetOperatorInt64 + __cublasdxSetOperatorInt64 = dlsym(RTLD_DEFAULT, 'cublasdxSetOperatorInt64') + if __cublasdxSetOperatorInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxSetOperatorInt64 = dlsym(handle, 'cublasdxSetOperatorInt64') + + global __cublasdxSetOperatorInt64s + __cublasdxSetOperatorInt64s = dlsym(RTLD_DEFAULT, 'cublasdxSetOperatorInt64s') + if __cublasdxSetOperatorInt64s == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxSetOperatorInt64s = dlsym(handle, 'cublasdxSetOperatorInt64s') + + global __cublasdxBindTensor + __cublasdxBindTensor = dlsym(RTLD_DEFAULT, 'cublasdxBindTensor') + if __cublasdxBindTensor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxBindTensor = dlsym(handle, 'cublasdxBindTensor') + + global __cublasdxSetTensorOptionInt64 + __cublasdxSetTensorOptionInt64 = dlsym(RTLD_DEFAULT, 'cublasdxSetTensorOptionInt64') + if __cublasdxSetTensorOptionInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxSetTensorOptionInt64 = dlsym(handle, 'cublasdxSetTensorOptionInt64') + + global __cublasdxFinalizeTensors + __cublasdxFinalizeTensors = dlsym(RTLD_DEFAULT, 'cublasdxFinalizeTensors') + if __cublasdxFinalizeTensors == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxFinalizeTensors = dlsym(handle, 'cublasdxFinalizeTensors') + + global __cublasdxGetTensorTraitInt64 + __cublasdxGetTensorTraitInt64 = dlsym(RTLD_DEFAULT, 'cublasdxGetTensorTraitInt64') + if __cublasdxGetTensorTraitInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetTensorTraitInt64 = dlsym(handle, 'cublasdxGetTensorTraitInt64') + + global __cublasdxGetTensorTraitStrSize + __cublasdxGetTensorTraitStrSize = dlsym(RTLD_DEFAULT, 'cublasdxGetTensorTraitStrSize') + if __cublasdxGetTensorTraitStrSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetTensorTraitStrSize = dlsym(handle, 'cublasdxGetTensorTraitStrSize') + + global __cublasdxGetTensorTraitStr + __cublasdxGetTensorTraitStr = dlsym(RTLD_DEFAULT, 'cublasdxGetTensorTraitStr') + if __cublasdxGetTensorTraitStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetTensorTraitStr = dlsym(handle, 'cublasdxGetTensorTraitStr') + + global __cublasdxBindDeviceFunction + __cublasdxBindDeviceFunction = dlsym(RTLD_DEFAULT, 'cublasdxBindDeviceFunction') + if __cublasdxBindDeviceFunction == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxBindDeviceFunction = dlsym(handle, 'cublasdxBindDeviceFunction') + + global __cublasdxFinalizeDeviceFunctions + __cublasdxFinalizeDeviceFunctions = dlsym(RTLD_DEFAULT, 'cublasdxFinalizeDeviceFunctions') + if __cublasdxFinalizeDeviceFunctions == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxFinalizeDeviceFunctions = dlsym(handle, 'cublasdxFinalizeDeviceFunctions') + + global __cublasdxGetDeviceFunctionTraitStrSize + __cublasdxGetDeviceFunctionTraitStrSize = dlsym(RTLD_DEFAULT, 'cublasdxGetDeviceFunctionTraitStrSize') + if __cublasdxGetDeviceFunctionTraitStrSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetDeviceFunctionTraitStrSize = dlsym(handle, 'cublasdxGetDeviceFunctionTraitStrSize') + + global __cublasdxGetDeviceFunctionTraitStr + __cublasdxGetDeviceFunctionTraitStr = dlsym(RTLD_DEFAULT, 'cublasdxGetDeviceFunctionTraitStr') + if __cublasdxGetDeviceFunctionTraitStr == 
NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetDeviceFunctionTraitStr = dlsym(handle, 'cublasdxGetDeviceFunctionTraitStr') + + global __cublasdxGetLTOIRSize + __cublasdxGetLTOIRSize = dlsym(RTLD_DEFAULT, 'cublasdxGetLTOIRSize') + if __cublasdxGetLTOIRSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetLTOIRSize = dlsym(handle, 'cublasdxGetLTOIRSize') + + global __cublasdxGetLTOIR + __cublasdxGetLTOIR = dlsym(RTLD_DEFAULT, 'cublasdxGetLTOIR') + if __cublasdxGetLTOIR == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetLTOIR = dlsym(handle, 'cublasdxGetLTOIR') + + global __cublasdxGetTraitStrSize + __cublasdxGetTraitStrSize = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitStrSize') + if __cublasdxGetTraitStrSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetTraitStrSize = dlsym(handle, 'cublasdxGetTraitStrSize') + + global __cublasdxGetTraitStr + __cublasdxGetTraitStr = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitStr') + if __cublasdxGetTraitStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetTraitStr = dlsym(handle, 'cublasdxGetTraitStr') + + global __cublasdxGetTraitInt64 + __cublasdxGetTraitInt64 = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitInt64') + if __cublasdxGetTraitInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetTraitInt64 = dlsym(handle, 'cublasdxGetTraitInt64') + + global __cublasdxGetTraitInt64s + __cublasdxGetTraitInt64s = dlsym(RTLD_DEFAULT, 'cublasdxGetTraitInt64s') + if __cublasdxGetTraitInt64s == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxGetTraitInt64s = dlsym(handle, 'cublasdxGetTraitInt64s') + + global __cublasdxOperatorTypeToStr + __cublasdxOperatorTypeToStr = dlsym(RTLD_DEFAULT, 'cublasdxOperatorTypeToStr') + if __cublasdxOperatorTypeToStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxOperatorTypeToStr = dlsym(handle, 'cublasdxOperatorTypeToStr') + + global __cublasdxTraitTypeToStr + __cublasdxTraitTypeToStr = dlsym(RTLD_DEFAULT, 'cublasdxTraitTypeToStr') + if __cublasdxTraitTypeToStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxTraitTypeToStr = dlsym(handle, 'cublasdxTraitTypeToStr') + + global __cublasdxFinalizeCode + __cublasdxFinalizeCode = dlsym(RTLD_DEFAULT, 'cublasdxFinalizeCode') + if __cublasdxFinalizeCode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxFinalizeCode = dlsym(handle, 'cublasdxFinalizeCode') + + global __cublasdxDestroyDescriptor + __cublasdxDestroyDescriptor = dlsym(RTLD_DEFAULT, 'cublasdxDestroyDescriptor') + if __cublasdxDestroyDescriptor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxDestroyDescriptor = dlsym(handle, 'cublasdxDestroyDescriptor') + + global __cufftdxCreateDescriptor + __cufftdxCreateDescriptor = dlsym(RTLD_DEFAULT, 'cufftdxCreateDescriptor') + if __cufftdxCreateDescriptor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxCreateDescriptor = dlsym(handle, 'cufftdxCreateDescriptor') + + global __cufftdxSetOptionStr + __cufftdxSetOptionStr = dlsym(RTLD_DEFAULT, 'cufftdxSetOptionStr') + if __cufftdxSetOptionStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxSetOptionStr = dlsym(handle, 'cufftdxSetOptionStr') + + global __cufftdxGetKnobInt64Size + __cufftdxGetKnobInt64Size = dlsym(RTLD_DEFAULT, 'cufftdxGetKnobInt64Size') + if 
__cufftdxGetKnobInt64Size == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetKnobInt64Size = dlsym(handle, 'cufftdxGetKnobInt64Size') + + global __cufftdxGetKnobInt64s + __cufftdxGetKnobInt64s = dlsym(RTLD_DEFAULT, 'cufftdxGetKnobInt64s') + if __cufftdxGetKnobInt64s == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetKnobInt64s = dlsym(handle, 'cufftdxGetKnobInt64s') + + global __cufftdxSetOperatorInt64 + __cufftdxSetOperatorInt64 = dlsym(RTLD_DEFAULT, 'cufftdxSetOperatorInt64') + if __cufftdxSetOperatorInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxSetOperatorInt64 = dlsym(handle, 'cufftdxSetOperatorInt64') + + global __cufftdxSetOperatorInt64s + __cufftdxSetOperatorInt64s = dlsym(RTLD_DEFAULT, 'cufftdxSetOperatorInt64s') + if __cufftdxSetOperatorInt64s == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxSetOperatorInt64s = dlsym(handle, 'cufftdxSetOperatorInt64s') + + global __cufftdxGetLTOIRSize + __cufftdxGetLTOIRSize = dlsym(RTLD_DEFAULT, 'cufftdxGetLTOIRSize') + if __cufftdxGetLTOIRSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetLTOIRSize = dlsym(handle, 'cufftdxGetLTOIRSize') + + global __cufftdxGetLTOIR + __cufftdxGetLTOIR = dlsym(RTLD_DEFAULT, 'cufftdxGetLTOIR') + if __cufftdxGetLTOIR == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetLTOIR = dlsym(handle, 'cufftdxGetLTOIR') + + global __cufftdxGetTraitStrSize + __cufftdxGetTraitStrSize = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitStrSize') + if __cufftdxGetTraitStrSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetTraitStrSize = dlsym(handle, 'cufftdxGetTraitStrSize') + + global __cufftdxGetTraitStr + __cufftdxGetTraitStr = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitStr') + if __cufftdxGetTraitStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetTraitStr = dlsym(handle, 'cufftdxGetTraitStr') + + global __cufftdxGetTraitInt64 + __cufftdxGetTraitInt64 = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitInt64') + if __cufftdxGetTraitInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetTraitInt64 = dlsym(handle, 'cufftdxGetTraitInt64') + + global __cufftdxGetTraitInt64s + __cufftdxGetTraitInt64s = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitInt64s') + if __cufftdxGetTraitInt64s == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetTraitInt64s = dlsym(handle, 'cufftdxGetTraitInt64s') + + global __cufftdxGetTraitCommondxDataType + __cufftdxGetTraitCommondxDataType = dlsym(RTLD_DEFAULT, 'cufftdxGetTraitCommondxDataType') + if __cufftdxGetTraitCommondxDataType == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxGetTraitCommondxDataType = dlsym(handle, 'cufftdxGetTraitCommondxDataType') + + global __cufftdxFinalizeCode + __cufftdxFinalizeCode = dlsym(RTLD_DEFAULT, 'cufftdxFinalizeCode') + if __cufftdxFinalizeCode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxFinalizeCode = dlsym(handle, 'cufftdxFinalizeCode') + + global __cufftdxDestroyDescriptor + __cufftdxDestroyDescriptor = dlsym(RTLD_DEFAULT, 'cufftdxDestroyDescriptor') + if __cufftdxDestroyDescriptor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxDestroyDescriptor = dlsym(handle, 'cufftdxDestroyDescriptor') + + global __cufftdxOperatorTypeToStr + __cufftdxOperatorTypeToStr = dlsym(RTLD_DEFAULT, 'cufftdxOperatorTypeToStr') + if 
__cufftdxOperatorTypeToStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxOperatorTypeToStr = dlsym(handle, 'cufftdxOperatorTypeToStr') + + global __cufftdxTraitTypeToStr + __cufftdxTraitTypeToStr = dlsym(RTLD_DEFAULT, 'cufftdxTraitTypeToStr') + if __cufftdxTraitTypeToStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftdxTraitTypeToStr = dlsym(handle, 'cufftdxTraitTypeToStr') + + global __cusolverdxCreateDescriptor + __cusolverdxCreateDescriptor = dlsym(RTLD_DEFAULT, 'cusolverdxCreateDescriptor') + if __cusolverdxCreateDescriptor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxCreateDescriptor = dlsym(handle, 'cusolverdxCreateDescriptor') + + global __cusolverdxSetOptionStr + __cusolverdxSetOptionStr = dlsym(RTLD_DEFAULT, 'cusolverdxSetOptionStr') + if __cusolverdxSetOptionStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxSetOptionStr = dlsym(handle, 'cusolverdxSetOptionStr') + + global __cusolverdxSetOperatorInt64 + __cusolverdxSetOperatorInt64 = dlsym(RTLD_DEFAULT, 'cusolverdxSetOperatorInt64') + if __cusolverdxSetOperatorInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxSetOperatorInt64 = dlsym(handle, 'cusolverdxSetOperatorInt64') + + global __cusolverdxSetOperatorInt64s + __cusolverdxSetOperatorInt64s = dlsym(RTLD_DEFAULT, 'cusolverdxSetOperatorInt64s') + if __cusolverdxSetOperatorInt64s == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxSetOperatorInt64s = dlsym(handle, 'cusolverdxSetOperatorInt64s') + + global __cusolverdxGetLTOIRSize + __cusolverdxGetLTOIRSize = dlsym(RTLD_DEFAULT, 'cusolverdxGetLTOIRSize') + if __cusolverdxGetLTOIRSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxGetLTOIRSize = dlsym(handle, 'cusolverdxGetLTOIRSize') + + global __cusolverdxGetLTOIR + __cusolverdxGetLTOIR = dlsym(RTLD_DEFAULT, 'cusolverdxGetLTOIR') + if __cusolverdxGetLTOIR == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxGetLTOIR = dlsym(handle, 'cusolverdxGetLTOIR') + + global __cusolverdxGetUniversalFATBINSize + __cusolverdxGetUniversalFATBINSize = dlsym(RTLD_DEFAULT, 'cusolverdxGetUniversalFATBINSize') + if __cusolverdxGetUniversalFATBINSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxGetUniversalFATBINSize = dlsym(handle, 'cusolverdxGetUniversalFATBINSize') + + global __cusolverdxGetUniversalFATBIN + __cusolverdxGetUniversalFATBIN = dlsym(RTLD_DEFAULT, 'cusolverdxGetUniversalFATBIN') + if __cusolverdxGetUniversalFATBIN == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxGetUniversalFATBIN = dlsym(handle, 'cusolverdxGetUniversalFATBIN') + + global __cusolverdxGetTraitStrSize + __cusolverdxGetTraitStrSize = dlsym(RTLD_DEFAULT, 'cusolverdxGetTraitStrSize') + if __cusolverdxGetTraitStrSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxGetTraitStrSize = dlsym(handle, 'cusolverdxGetTraitStrSize') + + global __cusolverdxGetTraitStr + __cusolverdxGetTraitStr = dlsym(RTLD_DEFAULT, 'cusolverdxGetTraitStr') + if __cusolverdxGetTraitStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxGetTraitStr = dlsym(handle, 'cusolverdxGetTraitStr') + + global __cusolverdxGetTraitInt64 + __cusolverdxGetTraitInt64 = dlsym(RTLD_DEFAULT, 'cusolverdxGetTraitInt64') + if __cusolverdxGetTraitInt64 == NULL: + if handle == NULL: + handle = 
load_library(driver_ver) + __cusolverdxGetTraitInt64 = dlsym(handle, 'cusolverdxGetTraitInt64') + + global __cusolverdxFinalizeCode + __cusolverdxFinalizeCode = dlsym(RTLD_DEFAULT, 'cusolverdxFinalizeCode') + if __cusolverdxFinalizeCode == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxFinalizeCode = dlsym(handle, 'cusolverdxFinalizeCode') + + global __cusolverdxDestroyDescriptor + __cusolverdxDestroyDescriptor = dlsym(RTLD_DEFAULT, 'cusolverdxDestroyDescriptor') + if __cusolverdxDestroyDescriptor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxDestroyDescriptor = dlsym(handle, 'cusolverdxDestroyDescriptor') + + global __cusolverdxOperatorTypeToStr + __cusolverdxOperatorTypeToStr = dlsym(RTLD_DEFAULT, 'cusolverdxOperatorTypeToStr') + if __cusolverdxOperatorTypeToStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxOperatorTypeToStr = dlsym(handle, 'cusolverdxOperatorTypeToStr') + + global __cusolverdxTraitTypeToStr + __cusolverdxTraitTypeToStr = dlsym(RTLD_DEFAULT, 'cusolverdxTraitTypeToStr') + if __cusolverdxTraitTypeToStr == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverdxTraitTypeToStr = dlsym(handle, 'cusolverdxTraitTypeToStr') + + global __cublasdxCreateTensor + __cublasdxCreateTensor = dlsym(RTLD_DEFAULT, 'cublasdxCreateTensor') + if __cublasdxCreateTensor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxCreateTensor = dlsym(handle, 'cublasdxCreateTensor') + + global __cublasdxMakeTensorLike + __cublasdxMakeTensorLike = dlsym(RTLD_DEFAULT, 'cublasdxMakeTensorLike') + if __cublasdxMakeTensorLike == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxMakeTensorLike = dlsym(handle, 'cublasdxMakeTensorLike') + + global __cublasdxDestroyTensor + __cublasdxDestroyTensor = dlsym(RTLD_DEFAULT, 'cublasdxDestroyTensor') + if __cublasdxDestroyTensor == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxDestroyTensor = dlsym(handle, 'cublasdxDestroyTensor') + + global __cublasdxCreateDeviceFunction + __cublasdxCreateDeviceFunction = dlsym(RTLD_DEFAULT, 'cublasdxCreateDeviceFunction') + if __cublasdxCreateDeviceFunction == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxCreateDeviceFunction = dlsym(handle, 'cublasdxCreateDeviceFunction') + + global __cublasdxDestroyDeviceFunction + __cublasdxDestroyDeviceFunction = dlsym(RTLD_DEFAULT, 'cublasdxDestroyDeviceFunction') + if __cublasdxDestroyDeviceFunction == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasdxDestroyDeviceFunction = dlsym(handle, 'cublasdxDestroyDeviceFunction') + + __py_mathdx_init = True + return 0 cdef dict func_ptrs = None @@ -853,6 +900,21 @@ cpdef dict _inspect_function_pointers(): global __cusolverdxTraitTypeToStr data["__cusolverdxTraitTypeToStr"] = __cusolverdxTraitTypeToStr + global __cublasdxCreateTensor + data["__cublasdxCreateTensor"] = __cublasdxCreateTensor + + global __cublasdxMakeTensorLike + data["__cublasdxMakeTensorLike"] = __cublasdxMakeTensorLike + + global __cublasdxDestroyTensor + data["__cublasdxDestroyTensor"] = __cublasdxDestroyTensor + + global __cublasdxCreateDeviceFunction + data["__cublasdxCreateDeviceFunction"] = __cublasdxCreateDeviceFunction + + global __cublasdxDestroyDeviceFunction + data["__cublasdxDestroyDeviceFunction"] = __cublasdxDestroyDeviceFunction + func_ptrs = data return data @@ -1068,14 +1130,14 @@ cdef commondxStatusType 
_cublasdxSetTensorOptionInt64(cublasdxTensor tensor, cub tensor, option, value) -cdef commondxStatusType _cublasdxFinalizeTensors(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: +cdef commondxStatusType _cublasdxFinalizeTensorsNew(size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: global __cublasdxFinalizeTensors _check_or_init_mathdx() if __cublasdxFinalizeTensors == NULL: with gil: raise FunctionNotFoundError("function cublasdxFinalizeTensors is not found") - return (__cublasdxFinalizeTensors)( - handle, count, array) + return (__cublasdxFinalizeTensors)( + count, array) cdef commondxStatusType _cublasdxGetTensorTraitInt64(cublasdxTensor tensor, cublasdxTensorTrait trait, long long int* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: @@ -1108,7 +1170,7 @@ cdef commondxStatusType _cublasdxGetTensorTraitStr(cublasdxTensor tensor, cublas tensor, trait, size, value) -cdef commondxStatusType _cublasdxBindDeviceFunction(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: +cdef commondxStatusType _cublasdxCreateDeviceFunctionOld(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: global __cublasdxBindDeviceFunction _check_or_init_mathdx() if __cublasdxBindDeviceFunction == NULL: @@ -1566,3 +1628,62 @@ cdef const char* _cusolverdxTraitTypeToStr(cusolverdxTraitType trait) except?NUL raise FunctionNotFoundError("function cusolverdxTraitTypeToStr is not found") return (__cusolverdxTraitTypeToStr)( trait) + + +cdef commondxStatusType _cublasdxCreateTensorNew(cublasdxDescriptor handle, cublasdxTensorType tensor_type, cublasdxTensor* tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxCreateTensor + _check_or_init_mathdx() + if __cublasdxCreateTensor == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxCreateTensor is not found") + return (__cublasdxCreateTensor)( + handle, tensor_type, tensor) + + +cdef commondxStatusType _cublasdxMakeTensorLike(cublasdxTensor input, commondxValueType value_type, cublasdxTensor* output) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxMakeTensorLike + _check_or_init_mathdx() + if __cublasdxMakeTensorLike == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxMakeTensorLike is not found") + return (__cublasdxMakeTensorLike)( + input, value_type, output) + + +cdef commondxStatusType _cublasdxDestroyTensorNew(cublasdxTensor tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxDestroyTensor + _check_or_init_mathdx() + if __cublasdxDestroyTensor == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxDestroyTensor is not found") + return (__cublasdxDestroyTensor)( + tensor) + + +cdef commondxStatusType _cublasdxCreateDeviceFunctionNew(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxCreateDeviceFunction + _check_or_init_mathdx() + if __cublasdxCreateDeviceFunction == NULL: + with gil: + raise 
FunctionNotFoundError("function cublasdxCreateDeviceFunction is not found") + return (__cublasdxCreateDeviceFunction)( + handle, device_function_type, count, array, device_function) + + +cdef commondxStatusType _cublasdxDestroyDeviceFunctionNew(cublasdxDeviceFunction device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxDestroyDeviceFunction + _check_or_init_mathdx() + if __cublasdxDestroyDeviceFunction == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxDestroyDeviceFunction is not found") + return (__cublasdxDestroyDeviceFunction)( + device_function) + +cdef commondxStatusType _cublasdxFinalizeTensors203(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxFinalizeTensors + _check_or_init_mathdx() + if __cublasdxFinalizeTensors == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxFinalizeTensors is not found") + return (__cublasdxFinalizeTensors)( + handle, count, array) diff --git a/nvmath/bindings/_internal/mathdx_windows.pyx b/nvmath/bindings/_internal/mathdx_windows.pyx index e2d0359..01e3d7b 100644 --- a/nvmath/bindings/_internal/mathdx_windows.pyx +++ b/nvmath/bindings/_internal/mathdx_windows.pyx @@ -1,23 +1,80 @@ -# This code was automatically generated with version 0.2.3. Do not modify it directly. +# This code was automatically generated across versions from 0.2.3 to 0.3.0. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t import os import site - -import win32api +import threading from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib +from libc.stddef cimport wchar_t +from libc.stdint cimport uintptr_t +from cpython cimport PyUnicode_AsWideCharString, PyMem_Free + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "windows.h" nogil: + ctypedef void* HMODULE + ctypedef void* HANDLE + ctypedef void* FARPROC + ctypedef unsigned long DWORD + ctypedef const wchar_t *LPCWSTR + ctypedef const char *LPCSTR + + cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 + cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 + + HMODULE _LoadLibraryExW "LoadLibraryExW"( + LPCWSTR lpLibFileName, + HANDLE hFile, + DWORD dwFlags + ) + + FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName) + +cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags): + cdef uintptr_t result + cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL) + with nogil: + result = _LoadLibraryExW( + wpath, + hFile, + dwFlags + ) + PyMem_Free(wpath) + return result + +cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil: + return _GetProcAddress(hModule, lpProcName) + +cdef int get_cuda_version(): + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32) + if handle == 0: + raise NotSupportedError('CUDA driver is not found') + cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion') + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in nvcuda.dll') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + + + 
############################################################################### # Wrapper init ############################################################################### -LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 +cdef object __symbol_lock = threading.Lock() cdef bint __py_mathdx_init = False -cdef void* __cuDriverGetVersion = NULL cdef void* __commondxCreateCode = NULL cdef void* __commondxSetCodeOptionInt64 = NULL @@ -89,6 +146,11 @@ cdef void* __cusolverdxFinalizeCode = NULL cdef void* __cusolverdxDestroyDescriptor = NULL cdef void* __cusolverdxOperatorTypeToStr = NULL cdef void* __cusolverdxTraitTypeToStr = NULL +cdef void* __cublasdxCreateTensor = NULL +cdef void* __cublasdxMakeTensorLike = NULL +cdef void* __cublasdxDestroyTensor = NULL +cdef void* __cublasdxCreateDeviceFunction = NULL +cdef void* __cublasdxDestroyDeviceFunction = NULL cdef inline list get_site_packages(): @@ -105,448 +167,240 @@ cdef int _check_or_init_mathdx() except -1 nogil: if __py_mathdx_init: return 0 - cdef int err, driver_ver - with gil: - # Load driver to check version - try: - handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32) - except Exception as e: - raise NotSupportedError(f'CUDA driver is not found ({e})') - global __cuDriverGetVersion - if __cuDriverGetVersion == NULL: - __cuDriverGetVersion = win32api.GetProcAddress(handle, 'cuDriverGetVersion') - if __cuDriverGetVersion == NULL: - raise RuntimeError('something went wrong') - err = (__cuDriverGetVersion)(&driver_ver) - if err != 0: - raise RuntimeError('something went wrong') + with gil, __symbol_lock: + driver_ver = get_cuda_version() # Load library handle = load_library(driver_ver) # Load function global __commondxCreateCode - try: - __commondxCreateCode = win32api.GetProcAddress(handle, 'commondxCreateCode') - except: - pass + __commondxCreateCode = GetProcAddress(handle, 'commondxCreateCode') global __commondxSetCodeOptionInt64 - try: - __commondxSetCodeOptionInt64 = win32api.GetProcAddress(handle, 'commondxSetCodeOptionInt64') - except: - pass + __commondxSetCodeOptionInt64 = GetProcAddress(handle, 'commondxSetCodeOptionInt64') global __commondxSetCodeOptionStr - try: - __commondxSetCodeOptionStr = win32api.GetProcAddress(handle, 'commondxSetCodeOptionStr') - except: - pass + __commondxSetCodeOptionStr = GetProcAddress(handle, 'commondxSetCodeOptionStr') global __commondxGetCodeOptionInt64 - try: - __commondxGetCodeOptionInt64 = win32api.GetProcAddress(handle, 'commondxGetCodeOptionInt64') - except: - pass + __commondxGetCodeOptionInt64 = GetProcAddress(handle, 'commondxGetCodeOptionInt64') global __commondxGetCodeOptionsInt64s - try: - __commondxGetCodeOptionsInt64s = win32api.GetProcAddress(handle, 'commondxGetCodeOptionsInt64s') - except: - pass + __commondxGetCodeOptionsInt64s = GetProcAddress(handle, 'commondxGetCodeOptionsInt64s') global __commondxGetCodeLTOIRSize - try: - __commondxGetCodeLTOIRSize = win32api.GetProcAddress(handle, 'commondxGetCodeLTOIRSize') - except: - pass + __commondxGetCodeLTOIRSize = GetProcAddress(handle, 'commondxGetCodeLTOIRSize') global __commondxGetCodeLTOIR - try: - __commondxGetCodeLTOIR = win32api.GetProcAddress(handle, 'commondxGetCodeLTOIR') - except: - pass + __commondxGetCodeLTOIR = GetProcAddress(handle, 'commondxGetCodeLTOIR') global __commondxGetCodeNumLTOIRs - try: - __commondxGetCodeNumLTOIRs = win32api.GetProcAddress(handle, 'commondxGetCodeNumLTOIRs') - except: - pass + __commondxGetCodeNumLTOIRs = GetProcAddress(handle, 'commondxGetCodeNumLTOIRs') global 
__commondxGetCodeLTOIRSizes - try: - __commondxGetCodeLTOIRSizes = win32api.GetProcAddress(handle, 'commondxGetCodeLTOIRSizes') - except: - pass + __commondxGetCodeLTOIRSizes = GetProcAddress(handle, 'commondxGetCodeLTOIRSizes') global __commondxGetCodeLTOIRs - try: - __commondxGetCodeLTOIRs = win32api.GetProcAddress(handle, 'commondxGetCodeLTOIRs') - except: - pass + __commondxGetCodeLTOIRs = GetProcAddress(handle, 'commondxGetCodeLTOIRs') global __commondxDestroyCode - try: - __commondxDestroyCode = win32api.GetProcAddress(handle, 'commondxDestroyCode') - except: - pass + __commondxDestroyCode = GetProcAddress(handle, 'commondxDestroyCode') global __commondxStatusToStr - try: - __commondxStatusToStr = win32api.GetProcAddress(handle, 'commondxStatusToStr') - except: - pass + __commondxStatusToStr = GetProcAddress(handle, 'commondxStatusToStr') global __mathdxGetVersion - try: - __mathdxGetVersion = win32api.GetProcAddress(handle, 'mathdxGetVersion') - except: - pass + __mathdxGetVersion = GetProcAddress(handle, 'mathdxGetVersion') global __mathdxGetVersionEx - try: - __mathdxGetVersionEx = win32api.GetProcAddress(handle, 'mathdxGetVersionEx') - except: - pass + __mathdxGetVersionEx = GetProcAddress(handle, 'mathdxGetVersionEx') global __cublasdxCreateDescriptor - try: - __cublasdxCreateDescriptor = win32api.GetProcAddress(handle, 'cublasdxCreateDescriptor') - except: - pass + __cublasdxCreateDescriptor = GetProcAddress(handle, 'cublasdxCreateDescriptor') global __cublasdxSetOptionStr - try: - __cublasdxSetOptionStr = win32api.GetProcAddress(handle, 'cublasdxSetOptionStr') - except: - pass + __cublasdxSetOptionStr = GetProcAddress(handle, 'cublasdxSetOptionStr') global __cublasdxSetOperatorInt64 - try: - __cublasdxSetOperatorInt64 = win32api.GetProcAddress(handle, 'cublasdxSetOperatorInt64') - except: - pass + __cublasdxSetOperatorInt64 = GetProcAddress(handle, 'cublasdxSetOperatorInt64') global __cublasdxSetOperatorInt64s - try: - __cublasdxSetOperatorInt64s = win32api.GetProcAddress(handle, 'cublasdxSetOperatorInt64s') - except: - pass + __cublasdxSetOperatorInt64s = GetProcAddress(handle, 'cublasdxSetOperatorInt64s') global __cublasdxBindTensor - try: - __cublasdxBindTensor = win32api.GetProcAddress(handle, 'cublasdxBindTensor') - except: - pass + __cublasdxBindTensor = GetProcAddress(handle, 'cublasdxBindTensor') global __cublasdxSetTensorOptionInt64 - try: - __cublasdxSetTensorOptionInt64 = win32api.GetProcAddress(handle, 'cublasdxSetTensorOptionInt64') - except: - pass + __cublasdxSetTensorOptionInt64 = GetProcAddress(handle, 'cublasdxSetTensorOptionInt64') global __cublasdxFinalizeTensors - try: - __cublasdxFinalizeTensors = win32api.GetProcAddress(handle, 'cublasdxFinalizeTensors') - except: - pass + __cublasdxFinalizeTensors = GetProcAddress(handle, 'cublasdxFinalizeTensors') global __cublasdxGetTensorTraitInt64 - try: - __cublasdxGetTensorTraitInt64 = win32api.GetProcAddress(handle, 'cublasdxGetTensorTraitInt64') - except: - pass + __cublasdxGetTensorTraitInt64 = GetProcAddress(handle, 'cublasdxGetTensorTraitInt64') global __cublasdxGetTensorTraitStrSize - try: - __cublasdxGetTensorTraitStrSize = win32api.GetProcAddress(handle, 'cublasdxGetTensorTraitStrSize') - except: - pass + __cublasdxGetTensorTraitStrSize = GetProcAddress(handle, 'cublasdxGetTensorTraitStrSize') global __cublasdxGetTensorTraitStr - try: - __cublasdxGetTensorTraitStr = win32api.GetProcAddress(handle, 'cublasdxGetTensorTraitStr') - except: - pass + __cublasdxGetTensorTraitStr = GetProcAddress(handle, 
'cublasdxGetTensorTraitStr') global __cublasdxBindDeviceFunction - try: - __cublasdxBindDeviceFunction = win32api.GetProcAddress(handle, 'cublasdxBindDeviceFunction') - except: - pass + __cublasdxBindDeviceFunction = GetProcAddress(handle, 'cublasdxBindDeviceFunction') global __cublasdxFinalizeDeviceFunctions - try: - __cublasdxFinalizeDeviceFunctions = win32api.GetProcAddress(handle, 'cublasdxFinalizeDeviceFunctions') - except: - pass + __cublasdxFinalizeDeviceFunctions = GetProcAddress(handle, 'cublasdxFinalizeDeviceFunctions') global __cublasdxGetDeviceFunctionTraitStrSize - try: - __cublasdxGetDeviceFunctionTraitStrSize = win32api.GetProcAddress(handle, 'cublasdxGetDeviceFunctionTraitStrSize') - except: - pass + __cublasdxGetDeviceFunctionTraitStrSize = GetProcAddress(handle, 'cublasdxGetDeviceFunctionTraitStrSize') global __cublasdxGetDeviceFunctionTraitStr - try: - __cublasdxGetDeviceFunctionTraitStr = win32api.GetProcAddress(handle, 'cublasdxGetDeviceFunctionTraitStr') - except: - pass + __cublasdxGetDeviceFunctionTraitStr = GetProcAddress(handle, 'cublasdxGetDeviceFunctionTraitStr') global __cublasdxGetLTOIRSize - try: - __cublasdxGetLTOIRSize = win32api.GetProcAddress(handle, 'cublasdxGetLTOIRSize') - except: - pass + __cublasdxGetLTOIRSize = GetProcAddress(handle, 'cublasdxGetLTOIRSize') global __cublasdxGetLTOIR - try: - __cublasdxGetLTOIR = win32api.GetProcAddress(handle, 'cublasdxGetLTOIR') - except: - pass + __cublasdxGetLTOIR = GetProcAddress(handle, 'cublasdxGetLTOIR') global __cublasdxGetTraitStrSize - try: - __cublasdxGetTraitStrSize = win32api.GetProcAddress(handle, 'cublasdxGetTraitStrSize') - except: - pass + __cublasdxGetTraitStrSize = GetProcAddress(handle, 'cublasdxGetTraitStrSize') global __cublasdxGetTraitStr - try: - __cublasdxGetTraitStr = win32api.GetProcAddress(handle, 'cublasdxGetTraitStr') - except: - pass + __cublasdxGetTraitStr = GetProcAddress(handle, 'cublasdxGetTraitStr') global __cublasdxGetTraitInt64 - try: - __cublasdxGetTraitInt64 = win32api.GetProcAddress(handle, 'cublasdxGetTraitInt64') - except: - pass + __cublasdxGetTraitInt64 = GetProcAddress(handle, 'cublasdxGetTraitInt64') global __cublasdxGetTraitInt64s - try: - __cublasdxGetTraitInt64s = win32api.GetProcAddress(handle, 'cublasdxGetTraitInt64s') - except: - pass + __cublasdxGetTraitInt64s = GetProcAddress(handle, 'cublasdxGetTraitInt64s') global __cublasdxOperatorTypeToStr - try: - __cublasdxOperatorTypeToStr = win32api.GetProcAddress(handle, 'cublasdxOperatorTypeToStr') - except: - pass + __cublasdxOperatorTypeToStr = GetProcAddress(handle, 'cublasdxOperatorTypeToStr') global __cublasdxTraitTypeToStr - try: - __cublasdxTraitTypeToStr = win32api.GetProcAddress(handle, 'cublasdxTraitTypeToStr') - except: - pass + __cublasdxTraitTypeToStr = GetProcAddress(handle, 'cublasdxTraitTypeToStr') global __cublasdxFinalizeCode - try: - __cublasdxFinalizeCode = win32api.GetProcAddress(handle, 'cublasdxFinalizeCode') - except: - pass + __cublasdxFinalizeCode = GetProcAddress(handle, 'cublasdxFinalizeCode') global __cublasdxDestroyDescriptor - try: - __cublasdxDestroyDescriptor = win32api.GetProcAddress(handle, 'cublasdxDestroyDescriptor') - except: - pass + __cublasdxDestroyDescriptor = GetProcAddress(handle, 'cublasdxDestroyDescriptor') global __cufftdxCreateDescriptor - try: - __cufftdxCreateDescriptor = win32api.GetProcAddress(handle, 'cufftdxCreateDescriptor') - except: - pass + __cufftdxCreateDescriptor = GetProcAddress(handle, 'cufftdxCreateDescriptor') global __cufftdxSetOptionStr - try: - 
__cufftdxSetOptionStr = win32api.GetProcAddress(handle, 'cufftdxSetOptionStr') - except: - pass + __cufftdxSetOptionStr = GetProcAddress(handle, 'cufftdxSetOptionStr') global __cufftdxGetKnobInt64Size - try: - __cufftdxGetKnobInt64Size = win32api.GetProcAddress(handle, 'cufftdxGetKnobInt64Size') - except: - pass + __cufftdxGetKnobInt64Size = GetProcAddress(handle, 'cufftdxGetKnobInt64Size') global __cufftdxGetKnobInt64s - try: - __cufftdxGetKnobInt64s = win32api.GetProcAddress(handle, 'cufftdxGetKnobInt64s') - except: - pass + __cufftdxGetKnobInt64s = GetProcAddress(handle, 'cufftdxGetKnobInt64s') global __cufftdxSetOperatorInt64 - try: - __cufftdxSetOperatorInt64 = win32api.GetProcAddress(handle, 'cufftdxSetOperatorInt64') - except: - pass + __cufftdxSetOperatorInt64 = GetProcAddress(handle, 'cufftdxSetOperatorInt64') global __cufftdxSetOperatorInt64s - try: - __cufftdxSetOperatorInt64s = win32api.GetProcAddress(handle, 'cufftdxSetOperatorInt64s') - except: - pass + __cufftdxSetOperatorInt64s = GetProcAddress(handle, 'cufftdxSetOperatorInt64s') global __cufftdxGetLTOIRSize - try: - __cufftdxGetLTOIRSize = win32api.GetProcAddress(handle, 'cufftdxGetLTOIRSize') - except: - pass + __cufftdxGetLTOIRSize = GetProcAddress(handle, 'cufftdxGetLTOIRSize') global __cufftdxGetLTOIR - try: - __cufftdxGetLTOIR = win32api.GetProcAddress(handle, 'cufftdxGetLTOIR') - except: - pass + __cufftdxGetLTOIR = GetProcAddress(handle, 'cufftdxGetLTOIR') global __cufftdxGetTraitStrSize - try: - __cufftdxGetTraitStrSize = win32api.GetProcAddress(handle, 'cufftdxGetTraitStrSize') - except: - pass + __cufftdxGetTraitStrSize = GetProcAddress(handle, 'cufftdxGetTraitStrSize') global __cufftdxGetTraitStr - try: - __cufftdxGetTraitStr = win32api.GetProcAddress(handle, 'cufftdxGetTraitStr') - except: - pass + __cufftdxGetTraitStr = GetProcAddress(handle, 'cufftdxGetTraitStr') global __cufftdxGetTraitInt64 - try: - __cufftdxGetTraitInt64 = win32api.GetProcAddress(handle, 'cufftdxGetTraitInt64') - except: - pass + __cufftdxGetTraitInt64 = GetProcAddress(handle, 'cufftdxGetTraitInt64') global __cufftdxGetTraitInt64s - try: - __cufftdxGetTraitInt64s = win32api.GetProcAddress(handle, 'cufftdxGetTraitInt64s') - except: - pass + __cufftdxGetTraitInt64s = GetProcAddress(handle, 'cufftdxGetTraitInt64s') global __cufftdxGetTraitCommondxDataType - try: - __cufftdxGetTraitCommondxDataType = win32api.GetProcAddress(handle, 'cufftdxGetTraitCommondxDataType') - except: - pass + __cufftdxGetTraitCommondxDataType = GetProcAddress(handle, 'cufftdxGetTraitCommondxDataType') global __cufftdxFinalizeCode - try: - __cufftdxFinalizeCode = win32api.GetProcAddress(handle, 'cufftdxFinalizeCode') - except: - pass + __cufftdxFinalizeCode = GetProcAddress(handle, 'cufftdxFinalizeCode') global __cufftdxDestroyDescriptor - try: - __cufftdxDestroyDescriptor = win32api.GetProcAddress(handle, 'cufftdxDestroyDescriptor') - except: - pass + __cufftdxDestroyDescriptor = GetProcAddress(handle, 'cufftdxDestroyDescriptor') global __cufftdxOperatorTypeToStr - try: - __cufftdxOperatorTypeToStr = win32api.GetProcAddress(handle, 'cufftdxOperatorTypeToStr') - except: - pass + __cufftdxOperatorTypeToStr = GetProcAddress(handle, 'cufftdxOperatorTypeToStr') global __cufftdxTraitTypeToStr - try: - __cufftdxTraitTypeToStr = win32api.GetProcAddress(handle, 'cufftdxTraitTypeToStr') - except: - pass + __cufftdxTraitTypeToStr = GetProcAddress(handle, 'cufftdxTraitTypeToStr') global __cusolverdxCreateDescriptor - try: - __cusolverdxCreateDescriptor = 
win32api.GetProcAddress(handle, 'cusolverdxCreateDescriptor') - except: - pass + __cusolverdxCreateDescriptor = GetProcAddress(handle, 'cusolverdxCreateDescriptor') global __cusolverdxSetOptionStr - try: - __cusolverdxSetOptionStr = win32api.GetProcAddress(handle, 'cusolverdxSetOptionStr') - except: - pass + __cusolverdxSetOptionStr = GetProcAddress(handle, 'cusolverdxSetOptionStr') global __cusolverdxSetOperatorInt64 - try: - __cusolverdxSetOperatorInt64 = win32api.GetProcAddress(handle, 'cusolverdxSetOperatorInt64') - except: - pass + __cusolverdxSetOperatorInt64 = GetProcAddress(handle, 'cusolverdxSetOperatorInt64') global __cusolverdxSetOperatorInt64s - try: - __cusolverdxSetOperatorInt64s = win32api.GetProcAddress(handle, 'cusolverdxSetOperatorInt64s') - except: - pass + __cusolverdxSetOperatorInt64s = GetProcAddress(handle, 'cusolverdxSetOperatorInt64s') global __cusolverdxGetLTOIRSize - try: - __cusolverdxGetLTOIRSize = win32api.GetProcAddress(handle, 'cusolverdxGetLTOIRSize') - except: - pass + __cusolverdxGetLTOIRSize = GetProcAddress(handle, 'cusolverdxGetLTOIRSize') global __cusolverdxGetLTOIR - try: - __cusolverdxGetLTOIR = win32api.GetProcAddress(handle, 'cusolverdxGetLTOIR') - except: - pass + __cusolverdxGetLTOIR = GetProcAddress(handle, 'cusolverdxGetLTOIR') global __cusolverdxGetUniversalFATBINSize - try: - __cusolverdxGetUniversalFATBINSize = win32api.GetProcAddress(handle, 'cusolverdxGetUniversalFATBINSize') - except: - pass + __cusolverdxGetUniversalFATBINSize = GetProcAddress(handle, 'cusolverdxGetUniversalFATBINSize') global __cusolverdxGetUniversalFATBIN - try: - __cusolverdxGetUniversalFATBIN = win32api.GetProcAddress(handle, 'cusolverdxGetUniversalFATBIN') - except: - pass + __cusolverdxGetUniversalFATBIN = GetProcAddress(handle, 'cusolverdxGetUniversalFATBIN') global __cusolverdxGetTraitStrSize - try: - __cusolverdxGetTraitStrSize = win32api.GetProcAddress(handle, 'cusolverdxGetTraitStrSize') - except: - pass + __cusolverdxGetTraitStrSize = GetProcAddress(handle, 'cusolverdxGetTraitStrSize') global __cusolverdxGetTraitStr - try: - __cusolverdxGetTraitStr = win32api.GetProcAddress(handle, 'cusolverdxGetTraitStr') - except: - pass + __cusolverdxGetTraitStr = GetProcAddress(handle, 'cusolverdxGetTraitStr') global __cusolverdxGetTraitInt64 - try: - __cusolverdxGetTraitInt64 = win32api.GetProcAddress(handle, 'cusolverdxGetTraitInt64') - except: - pass + __cusolverdxGetTraitInt64 = GetProcAddress(handle, 'cusolverdxGetTraitInt64') global __cusolverdxFinalizeCode - try: - __cusolverdxFinalizeCode = win32api.GetProcAddress(handle, 'cusolverdxFinalizeCode') - except: - pass + __cusolverdxFinalizeCode = GetProcAddress(handle, 'cusolverdxFinalizeCode') global __cusolverdxDestroyDescriptor - try: - __cusolverdxDestroyDescriptor = win32api.GetProcAddress(handle, 'cusolverdxDestroyDescriptor') - except: - pass + __cusolverdxDestroyDescriptor = GetProcAddress(handle, 'cusolverdxDestroyDescriptor') global __cusolverdxOperatorTypeToStr - try: - __cusolverdxOperatorTypeToStr = win32api.GetProcAddress(handle, 'cusolverdxOperatorTypeToStr') - except: - pass + __cusolverdxOperatorTypeToStr = GetProcAddress(handle, 'cusolverdxOperatorTypeToStr') global __cusolverdxTraitTypeToStr - try: - __cusolverdxTraitTypeToStr = win32api.GetProcAddress(handle, 'cusolverdxTraitTypeToStr') - except: - pass + __cusolverdxTraitTypeToStr = GetProcAddress(handle, 'cusolverdxTraitTypeToStr') + + global __cublasdxCreateTensor + __cublasdxCreateTensor = GetProcAddress(handle, 'cublasdxCreateTensor') + + 
global __cublasdxMakeTensorLike + __cublasdxMakeTensorLike = GetProcAddress(handle, 'cublasdxMakeTensorLike') - __py_mathdx_init = True - return 0 + global __cublasdxDestroyTensor + __cublasdxDestroyTensor = GetProcAddress(handle, 'cublasdxDestroyTensor') + + global __cublasdxCreateDeviceFunction + __cublasdxCreateDeviceFunction = GetProcAddress(handle, 'cublasdxCreateDeviceFunction') + + global __cublasdxDestroyDeviceFunction + __cublasdxDestroyDeviceFunction = GetProcAddress(handle, 'cublasdxDestroyDeviceFunction') + + __py_mathdx_init = True + return 0 cdef dict func_ptrs = None @@ -770,6 +624,21 @@ cpdef dict _inspect_function_pointers(): global __cusolverdxTraitTypeToStr data["__cusolverdxTraitTypeToStr"] = __cusolverdxTraitTypeToStr + global __cublasdxCreateTensor + data["__cublasdxCreateTensor"] = __cublasdxCreateTensor + + global __cublasdxMakeTensorLike + data["__cublasdxMakeTensorLike"] = __cublasdxMakeTensorLike + + global __cublasdxDestroyTensor + data["__cublasdxDestroyTensor"] = __cublasdxDestroyTensor + + global __cublasdxCreateDeviceFunction + data["__cublasdxCreateDeviceFunction"] = __cublasdxCreateDeviceFunction + + global __cublasdxDestroyDeviceFunction + data["__cublasdxDestroyDeviceFunction"] = __cublasdxDestroyDeviceFunction + func_ptrs = data return data @@ -985,14 +854,14 @@ cdef commondxStatusType _cublasdxSetTensorOptionInt64(cublasdxTensor tensor, cub tensor, option, value) -cdef commondxStatusType _cublasdxFinalizeTensors(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: +cdef commondxStatusType _cublasdxFinalizeTensorsNew(size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: global __cublasdxFinalizeTensors _check_or_init_mathdx() if __cublasdxFinalizeTensors == NULL: with gil: raise FunctionNotFoundError("function cublasdxFinalizeTensors is not found") - return (__cublasdxFinalizeTensors)( - handle, count, array) + return (__cublasdxFinalizeTensors)( + count, array) cdef commondxStatusType _cublasdxGetTensorTraitInt64(cublasdxTensor tensor, cublasdxTensorTrait trait, long long int* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: @@ -1025,7 +894,7 @@ cdef commondxStatusType _cublasdxGetTensorTraitStr(cublasdxTensor tensor, cublas tensor, trait, size, value) -cdef commondxStatusType _cublasdxBindDeviceFunction(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: +cdef commondxStatusType _cublasdxCreateDeviceFunctionOld(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: global __cublasdxBindDeviceFunction _check_or_init_mathdx() if __cublasdxBindDeviceFunction == NULL: @@ -1483,3 +1352,62 @@ cdef const char* _cusolverdxTraitTypeToStr(cusolverdxTraitType trait) except?NUL raise FunctionNotFoundError("function cusolverdxTraitTypeToStr is not found") return (__cusolverdxTraitTypeToStr)( trait) + + +cdef commondxStatusType _cublasdxCreateTensorNew(cublasdxDescriptor handle, cublasdxTensorType tensor_type, cublasdxTensor* tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxCreateTensor + _check_or_init_mathdx() + if __cublasdxCreateTensor == NULL: + with gil: + raise 
FunctionNotFoundError("function cublasdxCreateTensor is not found") + return (__cublasdxCreateTensor)( + handle, tensor_type, tensor) + + +cdef commondxStatusType _cublasdxMakeTensorLike(cublasdxTensor input, commondxValueType value_type, cublasdxTensor* output) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxMakeTensorLike + _check_or_init_mathdx() + if __cublasdxMakeTensorLike == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxMakeTensorLike is not found") + return (__cublasdxMakeTensorLike)( + input, value_type, output) + + +cdef commondxStatusType _cublasdxDestroyTensorNew(cublasdxTensor tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxDestroyTensor + _check_or_init_mathdx() + if __cublasdxDestroyTensor == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxDestroyTensor is not found") + return (__cublasdxDestroyTensor)( + tensor) + + +cdef commondxStatusType _cublasdxCreateDeviceFunctionNew(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxCreateDeviceFunction + _check_or_init_mathdx() + if __cublasdxCreateDeviceFunction == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxCreateDeviceFunction is not found") + return (__cublasdxCreateDeviceFunction)( + handle, device_function_type, count, array, device_function) + + +cdef commondxStatusType _cublasdxDestroyDeviceFunctionNew(cublasdxDeviceFunction device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxDestroyDeviceFunction + _check_or_init_mathdx() + if __cublasdxDestroyDeviceFunction == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxDestroyDeviceFunction is not found") + return (__cublasdxDestroyDeviceFunction)( + device_function) + +cdef commondxStatusType _cublasdxFinalizeTensors203(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + global __cublasdxFinalizeTensors + _check_or_init_mathdx() + if __cublasdxFinalizeTensors == NULL: + with gil: + raise FunctionNotFoundError("function cublasdxFinalizeTensors is not found") + return (__cublasdxFinalizeTensors)( + handle, count, array) diff --git a/nvmath/bindings/_internal/nccl.pxd b/nvmath/bindings/_internal/nccl.pxd new file mode 100644 index 0000000..dbed6f2 --- /dev/null +++ b/nvmath/bindings/_internal/nccl.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 2.11.4 to 2.28.3. Do not modify it directly. 
+
+from ..cynccl cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef ncclResult_t _ncclGetVersion(int* version) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef ncclResult_t _ncclGetUniqueId(ncclUniqueId* uniqueId) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef ncclResult_t _ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef ncclResult_t _ncclCommDestroy(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef ncclResult_t _ncclCommAbort(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef const char* _ncclGetErrorString(ncclResult_t result) except?NULL nogil
+cdef ncclResult_t _ncclCommCount(const ncclComm_t comm, int* count) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef ncclResult_t _ncclCommCuDevice(const ncclComm_t comm, int* device) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef ncclResult_t _ncclCommUserRank(const ncclComm_t comm, int* rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
+cdef const char* _ncclGetLastError(ncclComm_t comm) except?NULL nogil
+cdef ncclResult_t _ncclCommFinalize(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil
diff --git a/nvmath/bindings/_internal/nccl_linux.pyx b/nvmath/bindings/_internal/nccl_linux.pyx
new file mode 100644
index 0000000..da7070b
--- /dev/null
+++ b/nvmath/bindings/_internal/nccl_linux.pyx
@@ -0,0 +1,333 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 2.11.4 to 2.28.3. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uintptr_t
+
+import threading
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+from cuda.pathfinder import load_nvidia_dynamic_lib
+
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from "<dlfcn.h>" nogil:
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)
+
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+cdef int get_cuda_version():
+    cdef void* handle = NULL
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        err_msg = dlerror()
+        raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef object __symbol_lock = threading.Lock()
+cdef bint __py_nccl_init = False
+
+cdef void* __ncclGetVersion = NULL
+cdef void* __ncclGetUniqueId = NULL
+cdef void* __ncclCommInitRank = NULL
+cdef void* __ncclCommDestroy = NULL
+cdef void* __ncclCommAbort = NULL
+cdef void* __ncclGetErrorString = NULL
+cdef void* __ncclCommCount = NULL
+cdef void* __ncclCommCuDevice = NULL
+cdef void* __ncclCommUserRank = NULL
+cdef void* __ncclGetLastError = NULL
+cdef void* __ncclCommFinalize = NULL
+
+
+cdef void* load_library() except* with gil:
+    cdef uintptr_t handle = load_nvidia_dynamic_lib("nccl")._handle_uint
+    return <void*>handle
+
+
+cdef int _check_or_init_nccl() except -1 nogil:
+    global __py_nccl_init
+    if __py_nccl_init:
+        return 0
+
+    cdef void* handle = NULL
+
+    with gil, __symbol_lock:
+        # Load function
+        global __ncclGetVersion
+        __ncclGetVersion = dlsym(RTLD_DEFAULT, 'ncclGetVersion')
+        if __ncclGetVersion == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __ncclGetVersion = dlsym(handle, 'ncclGetVersion')
+
+        global __ncclGetUniqueId
+        __ncclGetUniqueId = dlsym(RTLD_DEFAULT, 'ncclGetUniqueId')
+        if __ncclGetUniqueId == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __ncclGetUniqueId = dlsym(handle, 'ncclGetUniqueId')
+
+        global __ncclCommInitRank
+        __ncclCommInitRank = dlsym(RTLD_DEFAULT, 'ncclCommInitRank')
+        if __ncclCommInitRank == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __ncclCommInitRank = dlsym(handle, 'ncclCommInitRank')
+
+        global __ncclCommDestroy
+        __ncclCommDestroy = dlsym(RTLD_DEFAULT, 'ncclCommDestroy')
+        if __ncclCommDestroy == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __ncclCommDestroy = dlsym(handle, 'ncclCommDestroy')
+
+        global __ncclCommAbort
+        __ncclCommAbort = dlsym(RTLD_DEFAULT, 'ncclCommAbort')
+        if __ncclCommAbort == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __ncclCommAbort = dlsym(handle, 'ncclCommAbort')
+
+        global __ncclGetErrorString
+        __ncclGetErrorString = dlsym(RTLD_DEFAULT, 'ncclGetErrorString')
+        if __ncclGetErrorString == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __ncclGetErrorString = dlsym(handle, 
'ncclGetErrorString') + + global __ncclCommCount + __ncclCommCount = dlsym(RTLD_DEFAULT, 'ncclCommCount') + if __ncclCommCount == NULL: + if handle == NULL: + handle = load_library() + __ncclCommCount = dlsym(handle, 'ncclCommCount') + + global __ncclCommCuDevice + __ncclCommCuDevice = dlsym(RTLD_DEFAULT, 'ncclCommCuDevice') + if __ncclCommCuDevice == NULL: + if handle == NULL: + handle = load_library() + __ncclCommCuDevice = dlsym(handle, 'ncclCommCuDevice') + + global __ncclCommUserRank + __ncclCommUserRank = dlsym(RTLD_DEFAULT, 'ncclCommUserRank') + if __ncclCommUserRank == NULL: + if handle == NULL: + handle = load_library() + __ncclCommUserRank = dlsym(handle, 'ncclCommUserRank') + + global __ncclGetLastError + __ncclGetLastError = dlsym(RTLD_DEFAULT, 'ncclGetLastError') + if __ncclGetLastError == NULL: + if handle == NULL: + handle = load_library() + __ncclGetLastError = dlsym(handle, 'ncclGetLastError') + + global __ncclCommFinalize + __ncclCommFinalize = dlsym(RTLD_DEFAULT, 'ncclCommFinalize') + if __ncclCommFinalize == NULL: + if handle == NULL: + handle = load_library() + __ncclCommFinalize = dlsym(handle, 'ncclCommFinalize') + __py_nccl_init = True + return 0 + + +cdef dict func_ptrs = None + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_nccl() + cdef dict data = {} + + global __ncclGetVersion + data["__ncclGetVersion"] = __ncclGetVersion + + global __ncclGetUniqueId + data["__ncclGetUniqueId"] = __ncclGetUniqueId + + global __ncclCommInitRank + data["__ncclCommInitRank"] = __ncclCommInitRank + + global __ncclCommDestroy + data["__ncclCommDestroy"] = __ncclCommDestroy + + global __ncclCommAbort + data["__ncclCommAbort"] = __ncclCommAbort + + global __ncclGetErrorString + data["__ncclGetErrorString"] = __ncclGetErrorString + + global __ncclCommCount + data["__ncclCommCount"] = __ncclCommCount + + global __ncclCommCuDevice + data["__ncclCommCuDevice"] = __ncclCommCuDevice + + global __ncclCommUserRank + data["__ncclCommUserRank"] = __ncclCommUserRank + + global __ncclGetLastError + data["__ncclGetLastError"] = __ncclGetLastError + + global __ncclCommFinalize + data["__ncclCommFinalize"] = __ncclCommFinalize + + func_ptrs = data + return data + + +cpdef _inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef ncclResult_t _ncclGetVersion(int* version) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclGetVersion + _check_or_init_nccl() + if __ncclGetVersion == NULL: + with gil: + raise FunctionNotFoundError("function ncclGetVersion is not found") + return (__ncclGetVersion)( + version) + + +cdef ncclResult_t _ncclGetUniqueId(ncclUniqueId* uniqueId) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclGetUniqueId + _check_or_init_nccl() + if __ncclGetUniqueId == NULL: + with gil: + raise FunctionNotFoundError("function ncclGetUniqueId is not found") + return (__ncclGetUniqueId)( + uniqueId) + + +cdef ncclResult_t _ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclCommInitRank + _check_or_init_nccl() + if __ncclCommInitRank == NULL: + with gil: + raise FunctionNotFoundError("function ncclCommInitRank 
is not found") + return (__ncclCommInitRank)( + comm, nranks, commId, rank) + + +cdef ncclResult_t _ncclCommDestroy(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclCommDestroy + _check_or_init_nccl() + if __ncclCommDestroy == NULL: + with gil: + raise FunctionNotFoundError("function ncclCommDestroy is not found") + return (__ncclCommDestroy)( + comm) + + +cdef ncclResult_t _ncclCommAbort(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclCommAbort + _check_or_init_nccl() + if __ncclCommAbort == NULL: + with gil: + raise FunctionNotFoundError("function ncclCommAbort is not found") + return (__ncclCommAbort)( + comm) + + +cdef const char* _ncclGetErrorString(ncclResult_t result) except?NULL nogil: + global __ncclGetErrorString + _check_or_init_nccl() + if __ncclGetErrorString == NULL: + with gil: + raise FunctionNotFoundError("function ncclGetErrorString is not found") + return (__ncclGetErrorString)( + result) + + +cdef ncclResult_t _ncclCommCount(const ncclComm_t comm, int* count) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclCommCount + _check_or_init_nccl() + if __ncclCommCount == NULL: + with gil: + raise FunctionNotFoundError("function ncclCommCount is not found") + return (__ncclCommCount)( + comm, count) + + +cdef ncclResult_t _ncclCommCuDevice(const ncclComm_t comm, int* device) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclCommCuDevice + _check_or_init_nccl() + if __ncclCommCuDevice == NULL: + with gil: + raise FunctionNotFoundError("function ncclCommCuDevice is not found") + return (__ncclCommCuDevice)( + comm, device) + + +cdef ncclResult_t _ncclCommUserRank(const ncclComm_t comm, int* rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclCommUserRank + _check_or_init_nccl() + if __ncclCommUserRank == NULL: + with gil: + raise FunctionNotFoundError("function ncclCommUserRank is not found") + return (__ncclCommUserRank)( + comm, rank) + + +cdef const char* _ncclGetLastError(ncclComm_t comm) except?NULL nogil: + global __ncclGetLastError + _check_or_init_nccl() + if __ncclGetLastError == NULL: + with gil: + raise FunctionNotFoundError("function ncclGetLastError is not found") + return (__ncclGetLastError)( + comm) + + +cdef ncclResult_t _ncclCommFinalize(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + global __ncclCommFinalize + _check_or_init_nccl() + if __ncclCommFinalize == NULL: + with gil: + raise FunctionNotFoundError("function ncclCommFinalize is not found") + return (__ncclCommFinalize)( + comm) diff --git a/nvmath/bindings/_internal/nvshmem_linux.pyx b/nvmath/bindings/_internal/nvshmem_linux.pyx index 1c9cd89..b7dab1d 100644 --- a/nvmath/bindings/_internal/nvshmem_linux.pyx +++ b/nvmath/bindings/_internal/nvshmem_linux.pyx @@ -7,10 +7,13 @@ cimport cython from libc.stdint cimport intptr_t, uintptr_t +import threading + from .utils import FunctionNotFoundError, NotSupportedError from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern ############################################################################### @@ -29,11 +32,30 @@ cdef extern from "" nogil: const void* RTLD_DEFAULT 'RTLD_DEFAULT' +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise 
NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + ############################################################################### # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_nvshmem_init = False cdef void* __nvshmemx_init_status = NULL @@ -64,122 +86,123 @@ cdef int _check_or_init_nvshmem() except -1 nogil: if __py_nvshmem_init: return 0 - # Load function cdef void* handle = NULL - global __nvshmemx_init_status - __nvshmemx_init_status = dlsym(RTLD_DEFAULT, 'nvshmemx_init_status') - if __nvshmemx_init_status == NULL: - if handle == NULL: - handle = load_library() - __nvshmemx_init_status = dlsym(handle, 'nvshmemx_init_status') - - global __nvshmem_my_pe - __nvshmem_my_pe = dlsym(RTLD_DEFAULT, 'nvshmem_my_pe') - if __nvshmem_my_pe == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_my_pe = dlsym(handle, 'nvshmem_my_pe') - - global __nvshmem_n_pes - __nvshmem_n_pes = dlsym(RTLD_DEFAULT, 'nvshmem_n_pes') - if __nvshmem_n_pes == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_n_pes = dlsym(handle, 'nvshmem_n_pes') - - global __nvshmem_malloc - __nvshmem_malloc = dlsym(RTLD_DEFAULT, 'nvshmem_malloc') - if __nvshmem_malloc == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_malloc = dlsym(handle, 'nvshmem_malloc') - - global __nvshmem_calloc - __nvshmem_calloc = dlsym(RTLD_DEFAULT, 'nvshmem_calloc') - if __nvshmem_calloc == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_calloc = dlsym(handle, 'nvshmem_calloc') - - global __nvshmem_align - __nvshmem_align = dlsym(RTLD_DEFAULT, 'nvshmem_align') - if __nvshmem_align == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_align = dlsym(handle, 'nvshmem_align') - - global __nvshmem_free - __nvshmem_free = dlsym(RTLD_DEFAULT, 'nvshmem_free') - if __nvshmem_free == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_free = dlsym(handle, 'nvshmem_free') - - global __nvshmem_ptr - __nvshmem_ptr = dlsym(RTLD_DEFAULT, 'nvshmem_ptr') - if __nvshmem_ptr == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_ptr = dlsym(handle, 'nvshmem_ptr') - - global __nvshmem_int_p - __nvshmem_int_p = dlsym(RTLD_DEFAULT, 'nvshmem_int_p') - if __nvshmem_int_p == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_int_p = dlsym(handle, 'nvshmem_int_p') - - global __nvshmem_team_my_pe - __nvshmem_team_my_pe = dlsym(RTLD_DEFAULT, 'nvshmem_team_my_pe') - if __nvshmem_team_my_pe == NULL: - if handle == NULL: - handle = load_library() - __nvshmem_team_my_pe = dlsym(handle, 'nvshmem_team_my_pe') - - global __nvshmemx_barrier_all_on_stream - __nvshmemx_barrier_all_on_stream = dlsym(RTLD_DEFAULT, 'nvshmemx_barrier_all_on_stream') - if __nvshmemx_barrier_all_on_stream == NULL: - if handle == NULL: - handle = load_library() - __nvshmemx_barrier_all_on_stream = dlsym(handle, 'nvshmemx_barrier_all_on_stream') - - global __nvshmemx_sync_all_on_stream - __nvshmemx_sync_all_on_stream = dlsym(RTLD_DEFAULT, 'nvshmemx_sync_all_on_stream') - if __nvshmemx_sync_all_on_stream == NULL: - if handle == NULL: - handle = load_library() - 
__nvshmemx_sync_all_on_stream = dlsym(handle, 'nvshmemx_sync_all_on_stream') - global __nvshmemx_hostlib_init_attr - __nvshmemx_hostlib_init_attr = dlsym(RTLD_DEFAULT, 'nvshmemx_hostlib_init_attr') - if __nvshmemx_hostlib_init_attr == NULL: - if handle == NULL: - handle = load_library() - __nvshmemx_hostlib_init_attr = dlsym(handle, 'nvshmemx_hostlib_init_attr') - - global __nvshmemx_hostlib_finalize - __nvshmemx_hostlib_finalize = dlsym(RTLD_DEFAULT, 'nvshmemx_hostlib_finalize') - if __nvshmemx_hostlib_finalize == NULL: - if handle == NULL: - handle = load_library() - __nvshmemx_hostlib_finalize = dlsym(handle, 'nvshmemx_hostlib_finalize') - - global __nvshmemx_set_attr_uniqueid_args - __nvshmemx_set_attr_uniqueid_args = dlsym(RTLD_DEFAULT, 'nvshmemx_set_attr_uniqueid_args') - if __nvshmemx_set_attr_uniqueid_args == NULL: - if handle == NULL: - handle = load_library() - __nvshmemx_set_attr_uniqueid_args = dlsym(handle, 'nvshmemx_set_attr_uniqueid_args') - - global __nvshmemx_get_uniqueid - __nvshmemx_get_uniqueid = dlsym(RTLD_DEFAULT, 'nvshmemx_get_uniqueid') - if __nvshmemx_get_uniqueid == NULL: - if handle == NULL: - handle = load_library() - __nvshmemx_get_uniqueid = dlsym(handle, 'nvshmemx_get_uniqueid') - - __py_nvshmem_init = True - return 0 + with gil, __symbol_lock: + # Load function + global __nvshmemx_init_status + __nvshmemx_init_status = dlsym(RTLD_DEFAULT, 'nvshmemx_init_status') + if __nvshmemx_init_status == NULL: + if handle == NULL: + handle = load_library() + __nvshmemx_init_status = dlsym(handle, 'nvshmemx_init_status') + + global __nvshmem_my_pe + __nvshmem_my_pe = dlsym(RTLD_DEFAULT, 'nvshmem_my_pe') + if __nvshmem_my_pe == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_my_pe = dlsym(handle, 'nvshmem_my_pe') + + global __nvshmem_n_pes + __nvshmem_n_pes = dlsym(RTLD_DEFAULT, 'nvshmem_n_pes') + if __nvshmem_n_pes == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_n_pes = dlsym(handle, 'nvshmem_n_pes') + + global __nvshmem_malloc + __nvshmem_malloc = dlsym(RTLD_DEFAULT, 'nvshmem_malloc') + if __nvshmem_malloc == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_malloc = dlsym(handle, 'nvshmem_malloc') + + global __nvshmem_calloc + __nvshmem_calloc = dlsym(RTLD_DEFAULT, 'nvshmem_calloc') + if __nvshmem_calloc == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_calloc = dlsym(handle, 'nvshmem_calloc') + + global __nvshmem_align + __nvshmem_align = dlsym(RTLD_DEFAULT, 'nvshmem_align') + if __nvshmem_align == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_align = dlsym(handle, 'nvshmem_align') + + global __nvshmem_free + __nvshmem_free = dlsym(RTLD_DEFAULT, 'nvshmem_free') + if __nvshmem_free == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_free = dlsym(handle, 'nvshmem_free') + + global __nvshmem_ptr + __nvshmem_ptr = dlsym(RTLD_DEFAULT, 'nvshmem_ptr') + if __nvshmem_ptr == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_ptr = dlsym(handle, 'nvshmem_ptr') + + global __nvshmem_int_p + __nvshmem_int_p = dlsym(RTLD_DEFAULT, 'nvshmem_int_p') + if __nvshmem_int_p == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_int_p = dlsym(handle, 'nvshmem_int_p') + + global __nvshmem_team_my_pe + __nvshmem_team_my_pe = dlsym(RTLD_DEFAULT, 'nvshmem_team_my_pe') + if __nvshmem_team_my_pe == NULL: + if handle == NULL: + handle = load_library() + __nvshmem_team_my_pe = dlsym(handle, 'nvshmem_team_my_pe') + + global __nvshmemx_barrier_all_on_stream + 
__nvshmemx_barrier_all_on_stream = dlsym(RTLD_DEFAULT, 'nvshmemx_barrier_all_on_stream') + if __nvshmemx_barrier_all_on_stream == NULL: + if handle == NULL: + handle = load_library() + __nvshmemx_barrier_all_on_stream = dlsym(handle, 'nvshmemx_barrier_all_on_stream') + + global __nvshmemx_sync_all_on_stream + __nvshmemx_sync_all_on_stream = dlsym(RTLD_DEFAULT, 'nvshmemx_sync_all_on_stream') + if __nvshmemx_sync_all_on_stream == NULL: + if handle == NULL: + handle = load_library() + __nvshmemx_sync_all_on_stream = dlsym(handle, 'nvshmemx_sync_all_on_stream') + + global __nvshmemx_hostlib_init_attr + __nvshmemx_hostlib_init_attr = dlsym(RTLD_DEFAULT, 'nvshmemx_hostlib_init_attr') + if __nvshmemx_hostlib_init_attr == NULL: + if handle == NULL: + handle = load_library() + __nvshmemx_hostlib_init_attr = dlsym(handle, 'nvshmemx_hostlib_init_attr') + + global __nvshmemx_hostlib_finalize + __nvshmemx_hostlib_finalize = dlsym(RTLD_DEFAULT, 'nvshmemx_hostlib_finalize') + if __nvshmemx_hostlib_finalize == NULL: + if handle == NULL: + handle = load_library() + __nvshmemx_hostlib_finalize = dlsym(handle, 'nvshmemx_hostlib_finalize') + + global __nvshmemx_set_attr_uniqueid_args + __nvshmemx_set_attr_uniqueid_args = dlsym(RTLD_DEFAULT, 'nvshmemx_set_attr_uniqueid_args') + if __nvshmemx_set_attr_uniqueid_args == NULL: + if handle == NULL: + handle = load_library() + __nvshmemx_set_attr_uniqueid_args = dlsym(handle, 'nvshmemx_set_attr_uniqueid_args') + + global __nvshmemx_get_uniqueid + __nvshmemx_get_uniqueid = dlsym(RTLD_DEFAULT, 'nvshmemx_get_uniqueid') + if __nvshmemx_get_uniqueid == NULL: + if handle == NULL: + handle = load_library() + __nvshmemx_get_uniqueid = dlsym(handle, 'nvshmemx_get_uniqueid') + __py_nvshmem_init = True + return 0 cdef dict func_ptrs = None diff --git a/nvmath/bindings/_internal/utils.pxd b/nvmath/bindings/_internal/utils.pxd index 91c4a3b..99fce2a 100644 --- a/nvmath/bindings/_internal/utils.pxd +++ b/nvmath/bindings/_internal/utils.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport int32_t, int64_t, intptr_t +from libc.stdint cimport int32_t, int64_t, intptr_t, uint32_t from libcpp.vector cimport vector from libcpp cimport bool as cppbool from libcpp cimport nullptr_t, nullptr @@ -151,6 +151,7 @@ ctypedef fused ResT: int int32_t int64_t + uint32_t float double size_t diff --git a/nvmath/bindings/cublas.pyi b/nvmath/bindings/cublas.pyi index 7cdfb6c..7f04e18 100644 --- a/nvmath/bindings/cublas.pyi +++ b/nvmath/bindings/cublas.pyi @@ -2,517 +2,531 @@ # # SPDX-License-Identifier: Apache-2.0 -import _cython_3_1_3 +import _cython_3_1_4 import enum from typing import Any, Callable, ClassVar __pyx_capi__: dict __test__: dict -asum_ex: _cython_3_1_3.cython_function_or_method -asum_ex_64: _cython_3_1_3.cython_function_or_method -axpy_ex: _cython_3_1_3.cython_function_or_method -axpy_ex_64: _cython_3_1_3.cython_function_or_method -caxpy: _cython_3_1_3.cython_function_or_method -caxpy_64: _cython_3_1_3.cython_function_or_method -ccopy: _cython_3_1_3.cython_function_or_method -ccopy_64: _cython_3_1_3.cython_function_or_method -cdgmm: _cython_3_1_3.cython_function_or_method -cdgmm_64: _cython_3_1_3.cython_function_or_method -cdotc: _cython_3_1_3.cython_function_or_method -cdotc_64: _cython_3_1_3.cython_function_or_method -cdotu: _cython_3_1_3.cython_function_or_method -cdotu_64: _cython_3_1_3.cython_function_or_method -cgbmv: _cython_3_1_3.cython_function_or_method -cgbmv_64: _cython_3_1_3.cython_function_or_method -cgeam: 
_cython_3_1_3.cython_function_or_method -cgeam_64: _cython_3_1_3.cython_function_or_method -cgels_batched: _cython_3_1_3.cython_function_or_method -cgemm: _cython_3_1_3.cython_function_or_method -cgemm3m: _cython_3_1_3.cython_function_or_method -cgemm3m_64: _cython_3_1_3.cython_function_or_method -cgemm3m_batched: _cython_3_1_3.cython_function_or_method -cgemm3m_batched_64: _cython_3_1_3.cython_function_or_method -cgemm3m_ex: _cython_3_1_3.cython_function_or_method -cgemm3m_ex_64: _cython_3_1_3.cython_function_or_method -cgemm3m_strided_batched: _cython_3_1_3.cython_function_or_method -cgemm3m_strided_batched_64: _cython_3_1_3.cython_function_or_method -cgemm_64: _cython_3_1_3.cython_function_or_method -cgemm_batched: _cython_3_1_3.cython_function_or_method -cgemm_batched_64: _cython_3_1_3.cython_function_or_method -cgemm_ex: _cython_3_1_3.cython_function_or_method -cgemm_ex_64: _cython_3_1_3.cython_function_or_method -cgemm_strided_batched: _cython_3_1_3.cython_function_or_method -cgemm_strided_batched_64: _cython_3_1_3.cython_function_or_method -cgemv: _cython_3_1_3.cython_function_or_method -cgemv_64: _cython_3_1_3.cython_function_or_method -cgemv_batched: _cython_3_1_3.cython_function_or_method -cgemv_batched_64: _cython_3_1_3.cython_function_or_method -cgemv_strided_batched: _cython_3_1_3.cython_function_or_method -cgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method -cgeqrf_batched: _cython_3_1_3.cython_function_or_method -cgerc: _cython_3_1_3.cython_function_or_method -cgerc_64: _cython_3_1_3.cython_function_or_method -cgeru: _cython_3_1_3.cython_function_or_method -cgeru_64: _cython_3_1_3.cython_function_or_method -cgetrf_batched: _cython_3_1_3.cython_function_or_method -cgetri_batched: _cython_3_1_3.cython_function_or_method -cgetrs_batched: _cython_3_1_3.cython_function_or_method -chbmv: _cython_3_1_3.cython_function_or_method -chbmv_64: _cython_3_1_3.cython_function_or_method -check_status: _cython_3_1_3.cython_function_or_method -chemm: _cython_3_1_3.cython_function_or_method -chemm_64: _cython_3_1_3.cython_function_or_method -chemv: _cython_3_1_3.cython_function_or_method -chemv_64: _cython_3_1_3.cython_function_or_method -cher: _cython_3_1_3.cython_function_or_method -cher2: _cython_3_1_3.cython_function_or_method -cher2_64: _cython_3_1_3.cython_function_or_method -cher2k: _cython_3_1_3.cython_function_or_method -cher2k_64: _cython_3_1_3.cython_function_or_method -cher_64: _cython_3_1_3.cython_function_or_method -cherk: _cython_3_1_3.cython_function_or_method -cherk3m_ex: _cython_3_1_3.cython_function_or_method -cherk3m_ex_64: _cython_3_1_3.cython_function_or_method -cherk_64: _cython_3_1_3.cython_function_or_method -cherk_ex: _cython_3_1_3.cython_function_or_method -cherk_ex_64: _cython_3_1_3.cython_function_or_method -cherkx: _cython_3_1_3.cython_function_or_method -cherkx_64: _cython_3_1_3.cython_function_or_method -chpmv: _cython_3_1_3.cython_function_or_method -chpmv_64: _cython_3_1_3.cython_function_or_method -chpr: _cython_3_1_3.cython_function_or_method -chpr2: _cython_3_1_3.cython_function_or_method -chpr2_64: _cython_3_1_3.cython_function_or_method -chpr_64: _cython_3_1_3.cython_function_or_method -cmatinv_batched: _cython_3_1_3.cython_function_or_method -copy_ex: _cython_3_1_3.cython_function_or_method -copy_ex_64: _cython_3_1_3.cython_function_or_method -create: _cython_3_1_3.cython_function_or_method -crot: _cython_3_1_3.cython_function_or_method -crot_64: _cython_3_1_3.cython_function_or_method -crotg: _cython_3_1_3.cython_function_or_method -cscal: 
_cython_3_1_3.cython_function_or_method -cscal_64: _cython_3_1_3.cython_function_or_method -csrot: _cython_3_1_3.cython_function_or_method -csrot_64: _cython_3_1_3.cython_function_or_method -csscal: _cython_3_1_3.cython_function_or_method -csscal_64: _cython_3_1_3.cython_function_or_method -cswap: _cython_3_1_3.cython_function_or_method -cswap_64: _cython_3_1_3.cython_function_or_method -csymm: _cython_3_1_3.cython_function_or_method -csymm_64: _cython_3_1_3.cython_function_or_method -csymv: _cython_3_1_3.cython_function_or_method -csymv_64: _cython_3_1_3.cython_function_or_method -csyr: _cython_3_1_3.cython_function_or_method -csyr2: _cython_3_1_3.cython_function_or_method -csyr2_64: _cython_3_1_3.cython_function_or_method -csyr2k: _cython_3_1_3.cython_function_or_method -csyr2k_64: _cython_3_1_3.cython_function_or_method -csyr_64: _cython_3_1_3.cython_function_or_method -csyrk: _cython_3_1_3.cython_function_or_method -csyrk3m_ex: _cython_3_1_3.cython_function_or_method -csyrk3m_ex_64: _cython_3_1_3.cython_function_or_method -csyrk_64: _cython_3_1_3.cython_function_or_method -csyrk_ex: _cython_3_1_3.cython_function_or_method -csyrk_ex_64: _cython_3_1_3.cython_function_or_method -csyrkx: _cython_3_1_3.cython_function_or_method -csyrkx_64: _cython_3_1_3.cython_function_or_method -ctbmv: _cython_3_1_3.cython_function_or_method -ctbmv_64: _cython_3_1_3.cython_function_or_method -ctbsv: _cython_3_1_3.cython_function_or_method -ctbsv_64: _cython_3_1_3.cython_function_or_method -ctpmv: _cython_3_1_3.cython_function_or_method -ctpmv_64: _cython_3_1_3.cython_function_or_method -ctpsv: _cython_3_1_3.cython_function_or_method -ctpsv_64: _cython_3_1_3.cython_function_or_method -ctpttr: _cython_3_1_3.cython_function_or_method -ctrmm: _cython_3_1_3.cython_function_or_method -ctrmm_64: _cython_3_1_3.cython_function_or_method -ctrmv: _cython_3_1_3.cython_function_or_method -ctrmv_64: _cython_3_1_3.cython_function_or_method -ctrsm: _cython_3_1_3.cython_function_or_method -ctrsm_64: _cython_3_1_3.cython_function_or_method -ctrsm_batched: _cython_3_1_3.cython_function_or_method -ctrsm_batched_64: _cython_3_1_3.cython_function_or_method -ctrsv: _cython_3_1_3.cython_function_or_method -ctrsv_64: _cython_3_1_3.cython_function_or_method -ctrttp: _cython_3_1_3.cython_function_or_method -dasum: _cython_3_1_3.cython_function_or_method -dasum_64: _cython_3_1_3.cython_function_or_method -daxpy: _cython_3_1_3.cython_function_or_method -daxpy_64: _cython_3_1_3.cython_function_or_method -dcopy: _cython_3_1_3.cython_function_or_method -dcopy_64: _cython_3_1_3.cython_function_or_method -ddgmm: _cython_3_1_3.cython_function_or_method -ddgmm_64: _cython_3_1_3.cython_function_or_method -ddot: _cython_3_1_3.cython_function_or_method -ddot_64: _cython_3_1_3.cython_function_or_method -destroy: _cython_3_1_3.cython_function_or_method -dgbmv: _cython_3_1_3.cython_function_or_method -dgbmv_64: _cython_3_1_3.cython_function_or_method -dgeam: _cython_3_1_3.cython_function_or_method -dgeam_64: _cython_3_1_3.cython_function_or_method -dgels_batched: _cython_3_1_3.cython_function_or_method -dgemm: _cython_3_1_3.cython_function_or_method -dgemm_64: _cython_3_1_3.cython_function_or_method -dgemm_batched: _cython_3_1_3.cython_function_or_method -dgemm_batched_64: _cython_3_1_3.cython_function_or_method -dgemm_grouped_batched: _cython_3_1_3.cython_function_or_method -dgemm_grouped_batched_64: _cython_3_1_3.cython_function_or_method -dgemm_strided_batched: _cython_3_1_3.cython_function_or_method -dgemm_strided_batched_64: 
_cython_3_1_3.cython_function_or_method -dgemv: _cython_3_1_3.cython_function_or_method -dgemv_64: _cython_3_1_3.cython_function_or_method -dgemv_batched: _cython_3_1_3.cython_function_or_method -dgemv_batched_64: _cython_3_1_3.cython_function_or_method -dgemv_strided_batched: _cython_3_1_3.cython_function_or_method -dgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method -dgeqrf_batched: _cython_3_1_3.cython_function_or_method -dger: _cython_3_1_3.cython_function_or_method -dger_64: _cython_3_1_3.cython_function_or_method -dgetrf_batched: _cython_3_1_3.cython_function_or_method -dgetri_batched: _cython_3_1_3.cython_function_or_method -dgetrs_batched: _cython_3_1_3.cython_function_or_method -dmatinv_batched: _cython_3_1_3.cython_function_or_method -dnrm2: _cython_3_1_3.cython_function_or_method -dnrm2_64: _cython_3_1_3.cython_function_or_method -dot_ex: _cython_3_1_3.cython_function_or_method -dot_ex_64: _cython_3_1_3.cython_function_or_method -dotc_ex: _cython_3_1_3.cython_function_or_method -dotc_ex_64: _cython_3_1_3.cython_function_or_method -drot: _cython_3_1_3.cython_function_or_method -drot_64: _cython_3_1_3.cython_function_or_method -drotg: _cython_3_1_3.cython_function_or_method -drotm: _cython_3_1_3.cython_function_or_method -drotm_64: _cython_3_1_3.cython_function_or_method -drotmg: _cython_3_1_3.cython_function_or_method -dsbmv: _cython_3_1_3.cython_function_or_method -dsbmv_64: _cython_3_1_3.cython_function_or_method -dscal: _cython_3_1_3.cython_function_or_method -dscal_64: _cython_3_1_3.cython_function_or_method -dspmv: _cython_3_1_3.cython_function_or_method -dspmv_64: _cython_3_1_3.cython_function_or_method -dspr: _cython_3_1_3.cython_function_or_method -dspr2: _cython_3_1_3.cython_function_or_method -dspr2_64: _cython_3_1_3.cython_function_or_method -dspr_64: _cython_3_1_3.cython_function_or_method -dswap: _cython_3_1_3.cython_function_or_method -dswap_64: _cython_3_1_3.cython_function_or_method -dsymm: _cython_3_1_3.cython_function_or_method -dsymm_64: _cython_3_1_3.cython_function_or_method -dsymv: _cython_3_1_3.cython_function_or_method -dsymv_64: _cython_3_1_3.cython_function_or_method -dsyr: _cython_3_1_3.cython_function_or_method -dsyr2: _cython_3_1_3.cython_function_or_method -dsyr2_64: _cython_3_1_3.cython_function_or_method -dsyr2k: _cython_3_1_3.cython_function_or_method -dsyr2k_64: _cython_3_1_3.cython_function_or_method -dsyr_64: _cython_3_1_3.cython_function_or_method -dsyrk: _cython_3_1_3.cython_function_or_method -dsyrk_64: _cython_3_1_3.cython_function_or_method -dsyrkx: _cython_3_1_3.cython_function_or_method -dsyrkx_64: _cython_3_1_3.cython_function_or_method -dtbmv: _cython_3_1_3.cython_function_or_method -dtbmv_64: _cython_3_1_3.cython_function_or_method -dtbsv: _cython_3_1_3.cython_function_or_method -dtbsv_64: _cython_3_1_3.cython_function_or_method -dtpmv: _cython_3_1_3.cython_function_or_method -dtpmv_64: _cython_3_1_3.cython_function_or_method -dtpsv: _cython_3_1_3.cython_function_or_method -dtpsv_64: _cython_3_1_3.cython_function_or_method -dtpttr: _cython_3_1_3.cython_function_or_method -dtrmm: _cython_3_1_3.cython_function_or_method -dtrmm_64: _cython_3_1_3.cython_function_or_method -dtrmv: _cython_3_1_3.cython_function_or_method -dtrmv_64: _cython_3_1_3.cython_function_or_method -dtrsm: _cython_3_1_3.cython_function_or_method -dtrsm_64: _cython_3_1_3.cython_function_or_method -dtrsm_batched: _cython_3_1_3.cython_function_or_method -dtrsm_batched_64: _cython_3_1_3.cython_function_or_method -dtrsv: 
_cython_3_1_3.cython_function_or_method -dtrsv_64: _cython_3_1_3.cython_function_or_method -dtrttp: _cython_3_1_3.cython_function_or_method -dzasum: _cython_3_1_3.cython_function_or_method -dzasum_64: _cython_3_1_3.cython_function_or_method -dznrm2: _cython_3_1_3.cython_function_or_method -dznrm2_64: _cython_3_1_3.cython_function_or_method -gemm_batched_ex: _cython_3_1_3.cython_function_or_method -gemm_batched_ex_64: _cython_3_1_3.cython_function_or_method -gemm_ex: _cython_3_1_3.cython_function_or_method -gemm_ex_64: _cython_3_1_3.cython_function_or_method -gemm_grouped_batched_ex: _cython_3_1_3.cython_function_or_method -gemm_grouped_batched_ex_64: _cython_3_1_3.cython_function_or_method -gemm_strided_batched_ex: _cython_3_1_3.cython_function_or_method -gemm_strided_batched_ex_64: _cython_3_1_3.cython_function_or_method -get_atomics_mode: _cython_3_1_3.cython_function_or_method -get_cudart_version: _cython_3_1_3.cython_function_or_method -get_emulation_strategy: _cython_3_1_3.cython_function_or_method -get_math_mode: _cython_3_1_3.cython_function_or_method -get_matrix: _cython_3_1_3.cython_function_or_method -get_matrix_64: _cython_3_1_3.cython_function_or_method -get_matrix_async: _cython_3_1_3.cython_function_or_method -get_matrix_async_64: _cython_3_1_3.cython_function_or_method -get_pointer_mode: _cython_3_1_3.cython_function_or_method -get_property: _cython_3_1_3.cython_function_or_method -get_sm_count_target: _cython_3_1_3.cython_function_or_method -get_status_name: _cython_3_1_3.cython_function_or_method -get_status_string: _cython_3_1_3.cython_function_or_method -get_stream: _cython_3_1_3.cython_function_or_method -get_vector: _cython_3_1_3.cython_function_or_method -get_vector_64: _cython_3_1_3.cython_function_or_method -get_vector_async: _cython_3_1_3.cython_function_or_method -get_vector_async_64: _cython_3_1_3.cython_function_or_method -get_version: _cython_3_1_3.cython_function_or_method -iamax_ex: _cython_3_1_3.cython_function_or_method -iamax_ex_64: _cython_3_1_3.cython_function_or_method -iamin_ex: _cython_3_1_3.cython_function_or_method -iamin_ex_64: _cython_3_1_3.cython_function_or_method -icamax: _cython_3_1_3.cython_function_or_method -icamax_64: _cython_3_1_3.cython_function_or_method -icamin: _cython_3_1_3.cython_function_or_method -icamin_64: _cython_3_1_3.cython_function_or_method -idamax: _cython_3_1_3.cython_function_or_method -idamax_64: _cython_3_1_3.cython_function_or_method -idamin: _cython_3_1_3.cython_function_or_method -idamin_64: _cython_3_1_3.cython_function_or_method -isamax: _cython_3_1_3.cython_function_or_method -isamax_64: _cython_3_1_3.cython_function_or_method -isamin: _cython_3_1_3.cython_function_or_method -isamin_64: _cython_3_1_3.cython_function_or_method -izamax: _cython_3_1_3.cython_function_or_method -izamax_64: _cython_3_1_3.cython_function_or_method -izamin: _cython_3_1_3.cython_function_or_method -izamin_64: _cython_3_1_3.cython_function_or_method -logger_configure: _cython_3_1_3.cython_function_or_method -nrm2_ex: _cython_3_1_3.cython_function_or_method -nrm2ex_64: _cython_3_1_3.cython_function_or_method -rot_ex: _cython_3_1_3.cython_function_or_method -rot_ex_64: _cython_3_1_3.cython_function_or_method -rotg_ex: _cython_3_1_3.cython_function_or_method -rotm_ex: _cython_3_1_3.cython_function_or_method -rotm_ex_64: _cython_3_1_3.cython_function_or_method -rotmg_ex: _cython_3_1_3.cython_function_or_method -sasum: _cython_3_1_3.cython_function_or_method -sasum_64: _cython_3_1_3.cython_function_or_method -saxpy: 
_cython_3_1_3.cython_function_or_method -saxpy_64: _cython_3_1_3.cython_function_or_method -scal_ex: _cython_3_1_3.cython_function_or_method -scal_ex_64: _cython_3_1_3.cython_function_or_method -scasum: _cython_3_1_3.cython_function_or_method -scasum_64: _cython_3_1_3.cython_function_or_method -scnrm2: _cython_3_1_3.cython_function_or_method -scnrm2_64: _cython_3_1_3.cython_function_or_method -scopy: _cython_3_1_3.cython_function_or_method -scopy_64: _cython_3_1_3.cython_function_or_method -sdgmm: _cython_3_1_3.cython_function_or_method -sdgmm_64: _cython_3_1_3.cython_function_or_method -sdot: _cython_3_1_3.cython_function_or_method -sdot_64: _cython_3_1_3.cython_function_or_method -set_atomics_mode: _cython_3_1_3.cython_function_or_method -set_emulation_strategy: _cython_3_1_3.cython_function_or_method -set_math_mode: _cython_3_1_3.cython_function_or_method -set_matrix: _cython_3_1_3.cython_function_or_method -set_matrix_64: _cython_3_1_3.cython_function_or_method -set_matrix_async: _cython_3_1_3.cython_function_or_method -set_matrix_async_64: _cython_3_1_3.cython_function_or_method -set_pointer_mode: _cython_3_1_3.cython_function_or_method -set_sm_count_target: _cython_3_1_3.cython_function_or_method -set_stream: _cython_3_1_3.cython_function_or_method -set_vector: _cython_3_1_3.cython_function_or_method -set_vector_64: _cython_3_1_3.cython_function_or_method -set_vector_async: _cython_3_1_3.cython_function_or_method -set_vector_async_64: _cython_3_1_3.cython_function_or_method -set_workspace: _cython_3_1_3.cython_function_or_method -sgbmv: _cython_3_1_3.cython_function_or_method -sgbmv_64: _cython_3_1_3.cython_function_or_method -sgeam: _cython_3_1_3.cython_function_or_method -sgeam_64: _cython_3_1_3.cython_function_or_method -sgels_batched: _cython_3_1_3.cython_function_or_method -sgemm: _cython_3_1_3.cython_function_or_method -sgemm_64: _cython_3_1_3.cython_function_or_method -sgemm_batched: _cython_3_1_3.cython_function_or_method -sgemm_batched_64: _cython_3_1_3.cython_function_or_method -sgemm_ex: _cython_3_1_3.cython_function_or_method -sgemm_ex_64: _cython_3_1_3.cython_function_or_method -sgemm_grouped_batched: _cython_3_1_3.cython_function_or_method -sgemm_grouped_batched_64: _cython_3_1_3.cython_function_or_method -sgemm_strided_batched: _cython_3_1_3.cython_function_or_method -sgemm_strided_batched_64: _cython_3_1_3.cython_function_or_method -sgemv: _cython_3_1_3.cython_function_or_method -sgemv_64: _cython_3_1_3.cython_function_or_method -sgemv_batched: _cython_3_1_3.cython_function_or_method -sgemv_batched_64: _cython_3_1_3.cython_function_or_method -sgemv_strided_batched: _cython_3_1_3.cython_function_or_method -sgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method -sgeqrf_batched: _cython_3_1_3.cython_function_or_method -sger: _cython_3_1_3.cython_function_or_method -sger_64: _cython_3_1_3.cython_function_or_method -sgetrf_batched: _cython_3_1_3.cython_function_or_method -sgetri_batched: _cython_3_1_3.cython_function_or_method -sgetrs_batched: _cython_3_1_3.cython_function_or_method -smatinv_batched: _cython_3_1_3.cython_function_or_method -snrm2: _cython_3_1_3.cython_function_or_method -snrm2_64: _cython_3_1_3.cython_function_or_method -srot: _cython_3_1_3.cython_function_or_method -srot_64: _cython_3_1_3.cython_function_or_method -srotg: _cython_3_1_3.cython_function_or_method -srotm: _cython_3_1_3.cython_function_or_method -srotm_64: _cython_3_1_3.cython_function_or_method -srotmg: _cython_3_1_3.cython_function_or_method -ssbmv: 
_cython_3_1_3.cython_function_or_method -ssbmv_64: _cython_3_1_3.cython_function_or_method -sscal: _cython_3_1_3.cython_function_or_method -sscal_64: _cython_3_1_3.cython_function_or_method -sspmv: _cython_3_1_3.cython_function_or_method -sspmv_64: _cython_3_1_3.cython_function_or_method -sspr: _cython_3_1_3.cython_function_or_method -sspr2: _cython_3_1_3.cython_function_or_method -sspr2_64: _cython_3_1_3.cython_function_or_method -sspr_64: _cython_3_1_3.cython_function_or_method -sswap: _cython_3_1_3.cython_function_or_method -sswap_64: _cython_3_1_3.cython_function_or_method -ssymm: _cython_3_1_3.cython_function_or_method -ssymm_64: _cython_3_1_3.cython_function_or_method -ssymv: _cython_3_1_3.cython_function_or_method -ssymv_64: _cython_3_1_3.cython_function_or_method -ssyr: _cython_3_1_3.cython_function_or_method -ssyr2: _cython_3_1_3.cython_function_or_method -ssyr2_64: _cython_3_1_3.cython_function_or_method -ssyr2k: _cython_3_1_3.cython_function_or_method -ssyr2k_64: _cython_3_1_3.cython_function_or_method -ssyr_64: _cython_3_1_3.cython_function_or_method -ssyrk: _cython_3_1_3.cython_function_or_method -ssyrk_64: _cython_3_1_3.cython_function_or_method -ssyrkx: _cython_3_1_3.cython_function_or_method -ssyrkx_64: _cython_3_1_3.cython_function_or_method -stbmv: _cython_3_1_3.cython_function_or_method -stbmv_64: _cython_3_1_3.cython_function_or_method -stbsv: _cython_3_1_3.cython_function_or_method -stbsv_64: _cython_3_1_3.cython_function_or_method -stpmv: _cython_3_1_3.cython_function_or_method -stpmv_64: _cython_3_1_3.cython_function_or_method -stpsv: _cython_3_1_3.cython_function_or_method -stpsv_64: _cython_3_1_3.cython_function_or_method -stpttr: _cython_3_1_3.cython_function_or_method -strmm: _cython_3_1_3.cython_function_or_method -strmm_64: _cython_3_1_3.cython_function_or_method -strmv: _cython_3_1_3.cython_function_or_method -strmv_64: _cython_3_1_3.cython_function_or_method -strsm: _cython_3_1_3.cython_function_or_method -strsm_64: _cython_3_1_3.cython_function_or_method -strsm_batched: _cython_3_1_3.cython_function_or_method -strsm_batched_64: _cython_3_1_3.cython_function_or_method -strsv: _cython_3_1_3.cython_function_or_method -strsv_64: _cython_3_1_3.cython_function_or_method -strttp: _cython_3_1_3.cython_function_or_method -swap_ex: _cython_3_1_3.cython_function_or_method -swap_ex_64: _cython_3_1_3.cython_function_or_method -uint8gemm_bias: _cython_3_1_3.cython_function_or_method -zaxpy: _cython_3_1_3.cython_function_or_method -zaxpy_64: _cython_3_1_3.cython_function_or_method -zcopy: _cython_3_1_3.cython_function_or_method -zcopy_64: _cython_3_1_3.cython_function_or_method -zdgmm: _cython_3_1_3.cython_function_or_method -zdgmm_64: _cython_3_1_3.cython_function_or_method -zdotc: _cython_3_1_3.cython_function_or_method -zdotc_64: _cython_3_1_3.cython_function_or_method -zdotu: _cython_3_1_3.cython_function_or_method -zdotu_64: _cython_3_1_3.cython_function_or_method -zdrot: _cython_3_1_3.cython_function_or_method -zdrot_64: _cython_3_1_3.cython_function_or_method -zdscal: _cython_3_1_3.cython_function_or_method -zdscal_64: _cython_3_1_3.cython_function_or_method -zgbmv: _cython_3_1_3.cython_function_or_method -zgbmv_64: _cython_3_1_3.cython_function_or_method -zgeam: _cython_3_1_3.cython_function_or_method -zgeam_64: _cython_3_1_3.cython_function_or_method -zgels_batched: _cython_3_1_3.cython_function_or_method -zgemm: _cython_3_1_3.cython_function_or_method -zgemm3m: _cython_3_1_3.cython_function_or_method -zgemm3m_64: _cython_3_1_3.cython_function_or_method -zgemm_64: 
_cython_3_1_3.cython_function_or_method -zgemm_batched: _cython_3_1_3.cython_function_or_method -zgemm_batched_64: _cython_3_1_3.cython_function_or_method -zgemm_strided_batched: _cython_3_1_3.cython_function_or_method -zgemm_strided_batched_64: _cython_3_1_3.cython_function_or_method -zgemv: _cython_3_1_3.cython_function_or_method -zgemv_64: _cython_3_1_3.cython_function_or_method -zgemv_batched: _cython_3_1_3.cython_function_or_method -zgemv_batched_64: _cython_3_1_3.cython_function_or_method -zgemv_strided_batched: _cython_3_1_3.cython_function_or_method -zgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method -zgeqrf_batched: _cython_3_1_3.cython_function_or_method -zgerc: _cython_3_1_3.cython_function_or_method -zgerc_64: _cython_3_1_3.cython_function_or_method -zgeru: _cython_3_1_3.cython_function_or_method -zgeru_64: _cython_3_1_3.cython_function_or_method -zgetrf_batched: _cython_3_1_3.cython_function_or_method -zgetri_batched: _cython_3_1_3.cython_function_or_method -zgetrs_batched: _cython_3_1_3.cython_function_or_method -zhbmv: _cython_3_1_3.cython_function_or_method -zhbmv_64: _cython_3_1_3.cython_function_or_method -zhemm: _cython_3_1_3.cython_function_or_method -zhemm_64: _cython_3_1_3.cython_function_or_method -zhemv: _cython_3_1_3.cython_function_or_method -zhemv_64: _cython_3_1_3.cython_function_or_method -zher: _cython_3_1_3.cython_function_or_method -zher2: _cython_3_1_3.cython_function_or_method -zher2_64: _cython_3_1_3.cython_function_or_method -zher2k: _cython_3_1_3.cython_function_or_method -zher2k_64: _cython_3_1_3.cython_function_or_method -zher_64: _cython_3_1_3.cython_function_or_method -zherk: _cython_3_1_3.cython_function_or_method -zherk_64: _cython_3_1_3.cython_function_or_method -zherkx: _cython_3_1_3.cython_function_or_method -zherkx_64: _cython_3_1_3.cython_function_or_method -zhpmv: _cython_3_1_3.cython_function_or_method -zhpmv_64: _cython_3_1_3.cython_function_or_method -zhpr: _cython_3_1_3.cython_function_or_method -zhpr2: _cython_3_1_3.cython_function_or_method -zhpr2_64: _cython_3_1_3.cython_function_or_method -zhpr_64: _cython_3_1_3.cython_function_or_method -zmatinv_batched: _cython_3_1_3.cython_function_or_method -zrot: _cython_3_1_3.cython_function_or_method -zrot_64: _cython_3_1_3.cython_function_or_method -zrotg: _cython_3_1_3.cython_function_or_method -zscal: _cython_3_1_3.cython_function_or_method -zscal_64: _cython_3_1_3.cython_function_or_method -zswap: _cython_3_1_3.cython_function_or_method -zswap_64: _cython_3_1_3.cython_function_or_method -zsymm: _cython_3_1_3.cython_function_or_method -zsymm_64: _cython_3_1_3.cython_function_or_method -zsymv: _cython_3_1_3.cython_function_or_method -zsymv_64: _cython_3_1_3.cython_function_or_method -zsyr: _cython_3_1_3.cython_function_or_method -zsyr2: _cython_3_1_3.cython_function_or_method -zsyr2_64: _cython_3_1_3.cython_function_or_method -zsyr2k: _cython_3_1_3.cython_function_or_method -zsyr2k_64: _cython_3_1_3.cython_function_or_method -zsyr_64: _cython_3_1_3.cython_function_or_method -zsyrk: _cython_3_1_3.cython_function_or_method -zsyrk_64: _cython_3_1_3.cython_function_or_method -zsyrkx: _cython_3_1_3.cython_function_or_method -zsyrkx_64: _cython_3_1_3.cython_function_or_method -ztbmv: _cython_3_1_3.cython_function_or_method -ztbmv_64: _cython_3_1_3.cython_function_or_method -ztbsv: _cython_3_1_3.cython_function_or_method -ztbsv_64: _cython_3_1_3.cython_function_or_method -ztpmv: _cython_3_1_3.cython_function_or_method -ztpmv_64: _cython_3_1_3.cython_function_or_method -ztpsv: 
_cython_3_1_3.cython_function_or_method -ztpsv_64: _cython_3_1_3.cython_function_or_method -ztpttr: _cython_3_1_3.cython_function_or_method -ztrmm: _cython_3_1_3.cython_function_or_method -ztrmm_64: _cython_3_1_3.cython_function_or_method -ztrmv: _cython_3_1_3.cython_function_or_method -ztrmv_64: _cython_3_1_3.cython_function_or_method -ztrsm: _cython_3_1_3.cython_function_or_method -ztrsm_64: _cython_3_1_3.cython_function_or_method -ztrsm_batched: _cython_3_1_3.cython_function_or_method -ztrsm_batched_64: _cython_3_1_3.cython_function_or_method -ztrsv: _cython_3_1_3.cython_function_or_method -ztrsv_64: _cython_3_1_3.cython_function_or_method -ztrttp: _cython_3_1_3.cython_function_or_method +asum_ex: _cython_3_1_4.cython_function_or_method +asum_ex_64: _cython_3_1_4.cython_function_or_method +axpy_ex: _cython_3_1_4.cython_function_or_method +axpy_ex_64: _cython_3_1_4.cython_function_or_method +caxpy: _cython_3_1_4.cython_function_or_method +caxpy_64: _cython_3_1_4.cython_function_or_method +ccopy: _cython_3_1_4.cython_function_or_method +ccopy_64: _cython_3_1_4.cython_function_or_method +cdgmm: _cython_3_1_4.cython_function_or_method +cdgmm_64: _cython_3_1_4.cython_function_or_method +cdgmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +cdotc: _cython_3_1_4.cython_function_or_method +cdotc_64: _cython_3_1_4.cython_function_or_method +cdotu: _cython_3_1_4.cython_function_or_method +cdotu_64: _cython_3_1_4.cython_function_or_method +cgbmv: _cython_3_1_4.cython_function_or_method +cgbmv_64: _cython_3_1_4.cython_function_or_method +cgeam: _cython_3_1_4.cython_function_or_method +cgeam_64: _cython_3_1_4.cython_function_or_method +cgels_batched: _cython_3_1_4.cython_function_or_method +cgemm: _cython_3_1_4.cython_function_or_method +cgemm3m: _cython_3_1_4.cython_function_or_method +cgemm3m_64: _cython_3_1_4.cython_function_or_method +cgemm3m_batched: _cython_3_1_4.cython_function_or_method +cgemm3m_batched_64: _cython_3_1_4.cython_function_or_method +cgemm3m_ex: _cython_3_1_4.cython_function_or_method +cgemm3m_ex_64: _cython_3_1_4.cython_function_or_method +cgemm3m_strided_batched: _cython_3_1_4.cython_function_or_method +cgemm3m_strided_batched_64: _cython_3_1_4.cython_function_or_method +cgemm_64: _cython_3_1_4.cython_function_or_method +cgemm_batched: _cython_3_1_4.cython_function_or_method +cgemm_batched_64: _cython_3_1_4.cython_function_or_method +cgemm_ex: _cython_3_1_4.cython_function_or_method +cgemm_ex_64: _cython_3_1_4.cython_function_or_method +cgemm_strided_batched: _cython_3_1_4.cython_function_or_method +cgemm_strided_batched_64: _cython_3_1_4.cython_function_or_method +cgemv: _cython_3_1_4.cython_function_or_method +cgemv_64: _cython_3_1_4.cython_function_or_method +cgemv_batched: _cython_3_1_4.cython_function_or_method +cgemv_batched_64: _cython_3_1_4.cython_function_or_method +cgemv_strided_batched: _cython_3_1_4.cython_function_or_method +cgemv_strided_batched_64: _cython_3_1_4.cython_function_or_method +cgeqrf_batched: _cython_3_1_4.cython_function_or_method +cgerc: _cython_3_1_4.cython_function_or_method +cgerc_64: _cython_3_1_4.cython_function_or_method +cgeru: _cython_3_1_4.cython_function_or_method +cgeru_64: _cython_3_1_4.cython_function_or_method +cgetrf_batched: _cython_3_1_4.cython_function_or_method +cgetri_batched: _cython_3_1_4.cython_function_or_method +cgetrs_batched: _cython_3_1_4.cython_function_or_method +chbmv: _cython_3_1_4.cython_function_or_method +chbmv_64: _cython_3_1_4.cython_function_or_method +check_status: 
_cython_3_1_4.cython_function_or_method +chemm: _cython_3_1_4.cython_function_or_method +chemm_64: _cython_3_1_4.cython_function_or_method +chemm_strided_batched_64: _cython_3_1_4.cython_function_or_method +chemv: _cython_3_1_4.cython_function_or_method +chemv_64: _cython_3_1_4.cython_function_or_method +cher: _cython_3_1_4.cython_function_or_method +cher2: _cython_3_1_4.cython_function_or_method +cher2_64: _cython_3_1_4.cython_function_or_method +cher2k: _cython_3_1_4.cython_function_or_method +cher2k_64: _cython_3_1_4.cython_function_or_method +cher_64: _cython_3_1_4.cython_function_or_method +cherk: _cython_3_1_4.cython_function_or_method +cherk3m_ex: _cython_3_1_4.cython_function_or_method +cherk3m_ex_64: _cython_3_1_4.cython_function_or_method +cherk_64: _cython_3_1_4.cython_function_or_method +cherk_ex: _cython_3_1_4.cython_function_or_method +cherk_ex_64: _cython_3_1_4.cython_function_or_method +cherkx: _cython_3_1_4.cython_function_or_method +cherkx_64: _cython_3_1_4.cython_function_or_method +chpmv: _cython_3_1_4.cython_function_or_method +chpmv_64: _cython_3_1_4.cython_function_or_method +chpr: _cython_3_1_4.cython_function_or_method +chpr2: _cython_3_1_4.cython_function_or_method +chpr2_64: _cython_3_1_4.cython_function_or_method +chpr_64: _cython_3_1_4.cython_function_or_method +cmatinv_batched: _cython_3_1_4.cython_function_or_method +copy_ex: _cython_3_1_4.cython_function_or_method +copy_ex_64: _cython_3_1_4.cython_function_or_method +create: _cython_3_1_4.cython_function_or_method +crot: _cython_3_1_4.cython_function_or_method +crot_64: _cython_3_1_4.cython_function_or_method +crotg: _cython_3_1_4.cython_function_or_method +cscal: _cython_3_1_4.cython_function_or_method +cscal_64: _cython_3_1_4.cython_function_or_method +csrot: _cython_3_1_4.cython_function_or_method +csrot_64: _cython_3_1_4.cython_function_or_method +csscal: _cython_3_1_4.cython_function_or_method +csscal_64: _cython_3_1_4.cython_function_or_method +cswap: _cython_3_1_4.cython_function_or_method +cswap_64: _cython_3_1_4.cython_function_or_method +csymm: _cython_3_1_4.cython_function_or_method +csymm_64: _cython_3_1_4.cython_function_or_method +csymm_strided_batched_64: _cython_3_1_4.cython_function_or_method +csymv: _cython_3_1_4.cython_function_or_method +csymv_64: _cython_3_1_4.cython_function_or_method +csyr: _cython_3_1_4.cython_function_or_method +csyr2: _cython_3_1_4.cython_function_or_method +csyr2_64: _cython_3_1_4.cython_function_or_method +csyr2k: _cython_3_1_4.cython_function_or_method +csyr2k_64: _cython_3_1_4.cython_function_or_method +csyr_64: _cython_3_1_4.cython_function_or_method +csyrk: _cython_3_1_4.cython_function_or_method +csyrk3m_ex: _cython_3_1_4.cython_function_or_method +csyrk3m_ex_64: _cython_3_1_4.cython_function_or_method +csyrk_64: _cython_3_1_4.cython_function_or_method +csyrk_ex: _cython_3_1_4.cython_function_or_method +csyrk_ex_64: _cython_3_1_4.cython_function_or_method +csyrkx: _cython_3_1_4.cython_function_or_method +csyrkx_64: _cython_3_1_4.cython_function_or_method +ctbmv: _cython_3_1_4.cython_function_or_method +ctbmv_64: _cython_3_1_4.cython_function_or_method +ctbsv: _cython_3_1_4.cython_function_or_method +ctbsv_64: _cython_3_1_4.cython_function_or_method +ctpmv: _cython_3_1_4.cython_function_or_method +ctpmv_64: _cython_3_1_4.cython_function_or_method +ctpsv: _cython_3_1_4.cython_function_or_method +ctpsv_64: _cython_3_1_4.cython_function_or_method +ctpttr: _cython_3_1_4.cython_function_or_method +ctrmm: _cython_3_1_4.cython_function_or_method +ctrmm_64: 
_cython_3_1_4.cython_function_or_method +ctrmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +ctrmv: _cython_3_1_4.cython_function_or_method +ctrmv_64: _cython_3_1_4.cython_function_or_method +ctrsm: _cython_3_1_4.cython_function_or_method +ctrsm_64: _cython_3_1_4.cython_function_or_method +ctrsm_batched: _cython_3_1_4.cython_function_or_method +ctrsm_batched_64: _cython_3_1_4.cython_function_or_method +ctrsv: _cython_3_1_4.cython_function_or_method +ctrsv_64: _cython_3_1_4.cython_function_or_method +ctrttp: _cython_3_1_4.cython_function_or_method +dasum: _cython_3_1_4.cython_function_or_method +dasum_64: _cython_3_1_4.cython_function_or_method +daxpy: _cython_3_1_4.cython_function_or_method +daxpy_64: _cython_3_1_4.cython_function_or_method +dcopy: _cython_3_1_4.cython_function_or_method +dcopy_64: _cython_3_1_4.cython_function_or_method +ddgmm: _cython_3_1_4.cython_function_or_method +ddgmm_64: _cython_3_1_4.cython_function_or_method +ddgmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +ddot: _cython_3_1_4.cython_function_or_method +ddot_64: _cython_3_1_4.cython_function_or_method +destroy: _cython_3_1_4.cython_function_or_method +dgbmv: _cython_3_1_4.cython_function_or_method +dgbmv_64: _cython_3_1_4.cython_function_or_method +dgeam: _cython_3_1_4.cython_function_or_method +dgeam_64: _cython_3_1_4.cython_function_or_method +dgels_batched: _cython_3_1_4.cython_function_or_method +dgemm: _cython_3_1_4.cython_function_or_method +dgemm_64: _cython_3_1_4.cython_function_or_method +dgemm_batched: _cython_3_1_4.cython_function_or_method +dgemm_batched_64: _cython_3_1_4.cython_function_or_method +dgemm_grouped_batched: _cython_3_1_4.cython_function_or_method +dgemm_grouped_batched_64: _cython_3_1_4.cython_function_or_method +dgemm_strided_batched: _cython_3_1_4.cython_function_or_method +dgemm_strided_batched_64: _cython_3_1_4.cython_function_or_method +dgemv: _cython_3_1_4.cython_function_or_method +dgemv_64: _cython_3_1_4.cython_function_or_method +dgemv_batched: _cython_3_1_4.cython_function_or_method +dgemv_batched_64: _cython_3_1_4.cython_function_or_method +dgemv_strided_batched: _cython_3_1_4.cython_function_or_method +dgemv_strided_batched_64: _cython_3_1_4.cython_function_or_method +dgeqrf_batched: _cython_3_1_4.cython_function_or_method +dger: _cython_3_1_4.cython_function_or_method +dger_64: _cython_3_1_4.cython_function_or_method +dgetrf_batched: _cython_3_1_4.cython_function_or_method +dgetri_batched: _cython_3_1_4.cython_function_or_method +dgetrs_batched: _cython_3_1_4.cython_function_or_method +dmatinv_batched: _cython_3_1_4.cython_function_or_method +dnrm2: _cython_3_1_4.cython_function_or_method +dnrm2_64: _cython_3_1_4.cython_function_or_method +dot_ex: _cython_3_1_4.cython_function_or_method +dot_ex_64: _cython_3_1_4.cython_function_or_method +dotc_ex: _cython_3_1_4.cython_function_or_method +dotc_ex_64: _cython_3_1_4.cython_function_or_method +drot: _cython_3_1_4.cython_function_or_method +drot_64: _cython_3_1_4.cython_function_or_method +drotg: _cython_3_1_4.cython_function_or_method +drotm: _cython_3_1_4.cython_function_or_method +drotm_64: _cython_3_1_4.cython_function_or_method +drotmg: _cython_3_1_4.cython_function_or_method +dsbmv: _cython_3_1_4.cython_function_or_method +dsbmv_64: _cython_3_1_4.cython_function_or_method +dscal: _cython_3_1_4.cython_function_or_method +dscal_64: _cython_3_1_4.cython_function_or_method +dspmv: _cython_3_1_4.cython_function_or_method +dspmv_64: _cython_3_1_4.cython_function_or_method +dspr: 
_cython_3_1_4.cython_function_or_method +dspr2: _cython_3_1_4.cython_function_or_method +dspr2_64: _cython_3_1_4.cython_function_or_method +dspr_64: _cython_3_1_4.cython_function_or_method +dswap: _cython_3_1_4.cython_function_or_method +dswap_64: _cython_3_1_4.cython_function_or_method +dsymm: _cython_3_1_4.cython_function_or_method +dsymm_64: _cython_3_1_4.cython_function_or_method +dsymm_strided_batched_64: _cython_3_1_4.cython_function_or_method +dsymv: _cython_3_1_4.cython_function_or_method +dsymv_64: _cython_3_1_4.cython_function_or_method +dsyr: _cython_3_1_4.cython_function_or_method +dsyr2: _cython_3_1_4.cython_function_or_method +dsyr2_64: _cython_3_1_4.cython_function_or_method +dsyr2k: _cython_3_1_4.cython_function_or_method +dsyr2k_64: _cython_3_1_4.cython_function_or_method +dsyr_64: _cython_3_1_4.cython_function_or_method +dsyrk: _cython_3_1_4.cython_function_or_method +dsyrk_64: _cython_3_1_4.cython_function_or_method +dsyrkx: _cython_3_1_4.cython_function_or_method +dsyrkx_64: _cython_3_1_4.cython_function_or_method +dtbmv: _cython_3_1_4.cython_function_or_method +dtbmv_64: _cython_3_1_4.cython_function_or_method +dtbsv: _cython_3_1_4.cython_function_or_method +dtbsv_64: _cython_3_1_4.cython_function_or_method +dtpmv: _cython_3_1_4.cython_function_or_method +dtpmv_64: _cython_3_1_4.cython_function_or_method +dtpsv: _cython_3_1_4.cython_function_or_method +dtpsv_64: _cython_3_1_4.cython_function_or_method +dtpttr: _cython_3_1_4.cython_function_or_method +dtrmm: _cython_3_1_4.cython_function_or_method +dtrmm_64: _cython_3_1_4.cython_function_or_method +dtrmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +dtrmv: _cython_3_1_4.cython_function_or_method +dtrmv_64: _cython_3_1_4.cython_function_or_method +dtrsm: _cython_3_1_4.cython_function_or_method +dtrsm_64: _cython_3_1_4.cython_function_or_method +dtrsm_batched: _cython_3_1_4.cython_function_or_method +dtrsm_batched_64: _cython_3_1_4.cython_function_or_method +dtrsv: _cython_3_1_4.cython_function_or_method +dtrsv_64: _cython_3_1_4.cython_function_or_method +dtrttp: _cython_3_1_4.cython_function_or_method +dzasum: _cython_3_1_4.cython_function_or_method +dzasum_64: _cython_3_1_4.cython_function_or_method +dznrm2: _cython_3_1_4.cython_function_or_method +dznrm2_64: _cython_3_1_4.cython_function_or_method +gemm_batched_ex: _cython_3_1_4.cython_function_or_method +gemm_batched_ex_64: _cython_3_1_4.cython_function_or_method +gemm_ex: _cython_3_1_4.cython_function_or_method +gemm_ex_64: _cython_3_1_4.cython_function_or_method +gemm_grouped_batched_ex: _cython_3_1_4.cython_function_or_method +gemm_grouped_batched_ex_64: _cython_3_1_4.cython_function_or_method +gemm_strided_batched_ex: _cython_3_1_4.cython_function_or_method +gemm_strided_batched_ex_64: _cython_3_1_4.cython_function_or_method +get_atomics_mode: _cython_3_1_4.cython_function_or_method +get_cudart_version: _cython_3_1_4.cython_function_or_method +get_emulation_strategy: _cython_3_1_4.cython_function_or_method +get_math_mode: _cython_3_1_4.cython_function_or_method +get_matrix: _cython_3_1_4.cython_function_or_method +get_matrix_64: _cython_3_1_4.cython_function_or_method +get_matrix_async: _cython_3_1_4.cython_function_or_method +get_matrix_async_64: _cython_3_1_4.cython_function_or_method +get_pointer_mode: _cython_3_1_4.cython_function_or_method +get_property: _cython_3_1_4.cython_function_or_method +get_sm_count_target: _cython_3_1_4.cython_function_or_method +get_status_name: _cython_3_1_4.cython_function_or_method +get_status_string: 
_cython_3_1_4.cython_function_or_method +get_stream: _cython_3_1_4.cython_function_or_method +get_vector: _cython_3_1_4.cython_function_or_method +get_vector_64: _cython_3_1_4.cython_function_or_method +get_vector_async: _cython_3_1_4.cython_function_or_method +get_vector_async_64: _cython_3_1_4.cython_function_or_method +get_version: _cython_3_1_4.cython_function_or_method +iamax_ex: _cython_3_1_4.cython_function_or_method +iamax_ex_64: _cython_3_1_4.cython_function_or_method +iamin_ex: _cython_3_1_4.cython_function_or_method +iamin_ex_64: _cython_3_1_4.cython_function_or_method +icamax: _cython_3_1_4.cython_function_or_method +icamax_64: _cython_3_1_4.cython_function_or_method +icamin: _cython_3_1_4.cython_function_or_method +icamin_64: _cython_3_1_4.cython_function_or_method +idamax: _cython_3_1_4.cython_function_or_method +idamax_64: _cython_3_1_4.cython_function_or_method +idamin: _cython_3_1_4.cython_function_or_method +idamin_64: _cython_3_1_4.cython_function_or_method +isamax: _cython_3_1_4.cython_function_or_method +isamax_64: _cython_3_1_4.cython_function_or_method +isamin: _cython_3_1_4.cython_function_or_method +isamin_64: _cython_3_1_4.cython_function_or_method +izamax: _cython_3_1_4.cython_function_or_method +izamax_64: _cython_3_1_4.cython_function_or_method +izamin: _cython_3_1_4.cython_function_or_method +izamin_64: _cython_3_1_4.cython_function_or_method +logger_configure: _cython_3_1_4.cython_function_or_method +nrm2_ex: _cython_3_1_4.cython_function_or_method +nrm2ex_64: _cython_3_1_4.cython_function_or_method +rot_ex: _cython_3_1_4.cython_function_or_method +rot_ex_64: _cython_3_1_4.cython_function_or_method +rotg_ex: _cython_3_1_4.cython_function_or_method +rotm_ex: _cython_3_1_4.cython_function_or_method +rotm_ex_64: _cython_3_1_4.cython_function_or_method +rotmg_ex: _cython_3_1_4.cython_function_or_method +sasum: _cython_3_1_4.cython_function_or_method +sasum_64: _cython_3_1_4.cython_function_or_method +saxpy: _cython_3_1_4.cython_function_or_method +saxpy_64: _cython_3_1_4.cython_function_or_method +scal_ex: _cython_3_1_4.cython_function_or_method +scal_ex_64: _cython_3_1_4.cython_function_or_method +scasum: _cython_3_1_4.cython_function_or_method +scasum_64: _cython_3_1_4.cython_function_or_method +scnrm2: _cython_3_1_4.cython_function_or_method +scnrm2_64: _cython_3_1_4.cython_function_or_method +scopy: _cython_3_1_4.cython_function_or_method +scopy_64: _cython_3_1_4.cython_function_or_method +sdgmm: _cython_3_1_4.cython_function_or_method +sdgmm_64: _cython_3_1_4.cython_function_or_method +sdgmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +sdot: _cython_3_1_4.cython_function_or_method +sdot_64: _cython_3_1_4.cython_function_or_method +set_atomics_mode: _cython_3_1_4.cython_function_or_method +set_emulation_strategy: _cython_3_1_4.cython_function_or_method +set_math_mode: _cython_3_1_4.cython_function_or_method +set_matrix: _cython_3_1_4.cython_function_or_method +set_matrix_64: _cython_3_1_4.cython_function_or_method +set_matrix_async: _cython_3_1_4.cython_function_or_method +set_matrix_async_64: _cython_3_1_4.cython_function_or_method +set_pointer_mode: _cython_3_1_4.cython_function_or_method +set_sm_count_target: _cython_3_1_4.cython_function_or_method +set_stream: _cython_3_1_4.cython_function_or_method +set_vector: _cython_3_1_4.cython_function_or_method +set_vector_64: _cython_3_1_4.cython_function_or_method +set_vector_async: _cython_3_1_4.cython_function_or_method +set_vector_async_64: _cython_3_1_4.cython_function_or_method 
+set_workspace: _cython_3_1_4.cython_function_or_method +sgbmv: _cython_3_1_4.cython_function_or_method +sgbmv_64: _cython_3_1_4.cython_function_or_method +sgeam: _cython_3_1_4.cython_function_or_method +sgeam_64: _cython_3_1_4.cython_function_or_method +sgels_batched: _cython_3_1_4.cython_function_or_method +sgemm: _cython_3_1_4.cython_function_or_method +sgemm_64: _cython_3_1_4.cython_function_or_method +sgemm_batched: _cython_3_1_4.cython_function_or_method +sgemm_batched_64: _cython_3_1_4.cython_function_or_method +sgemm_ex: _cython_3_1_4.cython_function_or_method +sgemm_ex_64: _cython_3_1_4.cython_function_or_method +sgemm_grouped_batched: _cython_3_1_4.cython_function_or_method +sgemm_grouped_batched_64: _cython_3_1_4.cython_function_or_method +sgemm_strided_batched: _cython_3_1_4.cython_function_or_method +sgemm_strided_batched_64: _cython_3_1_4.cython_function_or_method +sgemv: _cython_3_1_4.cython_function_or_method +sgemv_64: _cython_3_1_4.cython_function_or_method +sgemv_batched: _cython_3_1_4.cython_function_or_method +sgemv_batched_64: _cython_3_1_4.cython_function_or_method +sgemv_strided_batched: _cython_3_1_4.cython_function_or_method +sgemv_strided_batched_64: _cython_3_1_4.cython_function_or_method +sgeqrf_batched: _cython_3_1_4.cython_function_or_method +sger: _cython_3_1_4.cython_function_or_method +sger_64: _cython_3_1_4.cython_function_or_method +sgetrf_batched: _cython_3_1_4.cython_function_or_method +sgetri_batched: _cython_3_1_4.cython_function_or_method +sgetrs_batched: _cython_3_1_4.cython_function_or_method +smatinv_batched: _cython_3_1_4.cython_function_or_method +snrm2: _cython_3_1_4.cython_function_or_method +snrm2_64: _cython_3_1_4.cython_function_or_method +srot: _cython_3_1_4.cython_function_or_method +srot_64: _cython_3_1_4.cython_function_or_method +srotg: _cython_3_1_4.cython_function_or_method +srotm: _cython_3_1_4.cython_function_or_method +srotm_64: _cython_3_1_4.cython_function_or_method +srotmg: _cython_3_1_4.cython_function_or_method +ssbmv: _cython_3_1_4.cython_function_or_method +ssbmv_64: _cython_3_1_4.cython_function_or_method +sscal: _cython_3_1_4.cython_function_or_method +sscal_64: _cython_3_1_4.cython_function_or_method +sspmv: _cython_3_1_4.cython_function_or_method +sspmv_64: _cython_3_1_4.cython_function_or_method +sspr: _cython_3_1_4.cython_function_or_method +sspr2: _cython_3_1_4.cython_function_or_method +sspr2_64: _cython_3_1_4.cython_function_or_method +sspr_64: _cython_3_1_4.cython_function_or_method +sswap: _cython_3_1_4.cython_function_or_method +sswap_64: _cython_3_1_4.cython_function_or_method +ssymm: _cython_3_1_4.cython_function_or_method +ssymm_64: _cython_3_1_4.cython_function_or_method +ssymm_strided_batched_64: _cython_3_1_4.cython_function_or_method +ssymv: _cython_3_1_4.cython_function_or_method +ssymv_64: _cython_3_1_4.cython_function_or_method +ssyr: _cython_3_1_4.cython_function_or_method +ssyr2: _cython_3_1_4.cython_function_or_method +ssyr2_64: _cython_3_1_4.cython_function_or_method +ssyr2k: _cython_3_1_4.cython_function_or_method +ssyr2k_64: _cython_3_1_4.cython_function_or_method +ssyr_64: _cython_3_1_4.cython_function_or_method +ssyrk: _cython_3_1_4.cython_function_or_method +ssyrk_64: _cython_3_1_4.cython_function_or_method +ssyrkx: _cython_3_1_4.cython_function_or_method +ssyrkx_64: _cython_3_1_4.cython_function_or_method +stbmv: _cython_3_1_4.cython_function_or_method +stbmv_64: _cython_3_1_4.cython_function_or_method +stbsv: _cython_3_1_4.cython_function_or_method +stbsv_64: 
_cython_3_1_4.cython_function_or_method +stpmv: _cython_3_1_4.cython_function_or_method +stpmv_64: _cython_3_1_4.cython_function_or_method +stpsv: _cython_3_1_4.cython_function_or_method +stpsv_64: _cython_3_1_4.cython_function_or_method +stpttr: _cython_3_1_4.cython_function_or_method +strmm: _cython_3_1_4.cython_function_or_method +strmm_64: _cython_3_1_4.cython_function_or_method +strmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +strmv: _cython_3_1_4.cython_function_or_method +strmv_64: _cython_3_1_4.cython_function_or_method +strsm: _cython_3_1_4.cython_function_or_method +strsm_64: _cython_3_1_4.cython_function_or_method +strsm_batched: _cython_3_1_4.cython_function_or_method +strsm_batched_64: _cython_3_1_4.cython_function_or_method +strsv: _cython_3_1_4.cython_function_or_method +strsv_64: _cython_3_1_4.cython_function_or_method +strttp: _cython_3_1_4.cython_function_or_method +swap_ex: _cython_3_1_4.cython_function_or_method +swap_ex_64: _cython_3_1_4.cython_function_or_method +uint8gemm_bias: _cython_3_1_4.cython_function_or_method +zaxpy: _cython_3_1_4.cython_function_or_method +zaxpy_64: _cython_3_1_4.cython_function_or_method +zcopy: _cython_3_1_4.cython_function_or_method +zcopy_64: _cython_3_1_4.cython_function_or_method +zdgmm: _cython_3_1_4.cython_function_or_method +zdgmm_64: _cython_3_1_4.cython_function_or_method +zdgmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +zdotc: _cython_3_1_4.cython_function_or_method +zdotc_64: _cython_3_1_4.cython_function_or_method +zdotu: _cython_3_1_4.cython_function_or_method +zdotu_64: _cython_3_1_4.cython_function_or_method +zdrot: _cython_3_1_4.cython_function_or_method +zdrot_64: _cython_3_1_4.cython_function_or_method +zdscal: _cython_3_1_4.cython_function_or_method +zdscal_64: _cython_3_1_4.cython_function_or_method +zgbmv: _cython_3_1_4.cython_function_or_method +zgbmv_64: _cython_3_1_4.cython_function_or_method +zgeam: _cython_3_1_4.cython_function_or_method +zgeam_64: _cython_3_1_4.cython_function_or_method +zgels_batched: _cython_3_1_4.cython_function_or_method +zgemm: _cython_3_1_4.cython_function_or_method +zgemm3m: _cython_3_1_4.cython_function_or_method +zgemm3m_64: _cython_3_1_4.cython_function_or_method +zgemm_64: _cython_3_1_4.cython_function_or_method +zgemm_batched: _cython_3_1_4.cython_function_or_method +zgemm_batched_64: _cython_3_1_4.cython_function_or_method +zgemm_strided_batched: _cython_3_1_4.cython_function_or_method +zgemm_strided_batched_64: _cython_3_1_4.cython_function_or_method +zgemv: _cython_3_1_4.cython_function_or_method +zgemv_64: _cython_3_1_4.cython_function_or_method +zgemv_batched: _cython_3_1_4.cython_function_or_method +zgemv_batched_64: _cython_3_1_4.cython_function_or_method +zgemv_strided_batched: _cython_3_1_4.cython_function_or_method +zgemv_strided_batched_64: _cython_3_1_4.cython_function_or_method +zgeqrf_batched: _cython_3_1_4.cython_function_or_method +zgerc: _cython_3_1_4.cython_function_or_method +zgerc_64: _cython_3_1_4.cython_function_or_method +zgeru: _cython_3_1_4.cython_function_or_method +zgeru_64: _cython_3_1_4.cython_function_or_method +zgetrf_batched: _cython_3_1_4.cython_function_or_method +zgetri_batched: _cython_3_1_4.cython_function_or_method +zgetrs_batched: _cython_3_1_4.cython_function_or_method +zhbmv: _cython_3_1_4.cython_function_or_method +zhbmv_64: _cython_3_1_4.cython_function_or_method +zhemm: _cython_3_1_4.cython_function_or_method +zhemm_64: _cython_3_1_4.cython_function_or_method +zhemm_strided_batched_64: 
_cython_3_1_4.cython_function_or_method +zhemv: _cython_3_1_4.cython_function_or_method +zhemv_64: _cython_3_1_4.cython_function_or_method +zher: _cython_3_1_4.cython_function_or_method +zher2: _cython_3_1_4.cython_function_or_method +zher2_64: _cython_3_1_4.cython_function_or_method +zher2k: _cython_3_1_4.cython_function_or_method +zher2k_64: _cython_3_1_4.cython_function_or_method +zher_64: _cython_3_1_4.cython_function_or_method +zherk: _cython_3_1_4.cython_function_or_method +zherk_64: _cython_3_1_4.cython_function_or_method +zherkx: _cython_3_1_4.cython_function_or_method +zherkx_64: _cython_3_1_4.cython_function_or_method +zhpmv: _cython_3_1_4.cython_function_or_method +zhpmv_64: _cython_3_1_4.cython_function_or_method +zhpr: _cython_3_1_4.cython_function_or_method +zhpr2: _cython_3_1_4.cython_function_or_method +zhpr2_64: _cython_3_1_4.cython_function_or_method +zhpr_64: _cython_3_1_4.cython_function_or_method +zmatinv_batched: _cython_3_1_4.cython_function_or_method +zrot: _cython_3_1_4.cython_function_or_method +zrot_64: _cython_3_1_4.cython_function_or_method +zrotg: _cython_3_1_4.cython_function_or_method +zscal: _cython_3_1_4.cython_function_or_method +zscal_64: _cython_3_1_4.cython_function_or_method +zswap: _cython_3_1_4.cython_function_or_method +zswap_64: _cython_3_1_4.cython_function_or_method +zsymm: _cython_3_1_4.cython_function_or_method +zsymm_64: _cython_3_1_4.cython_function_or_method +zsymm_strided_batched_64: _cython_3_1_4.cython_function_or_method +zsymv: _cython_3_1_4.cython_function_or_method +zsymv_64: _cython_3_1_4.cython_function_or_method +zsyr: _cython_3_1_4.cython_function_or_method +zsyr2: _cython_3_1_4.cython_function_or_method +zsyr2_64: _cython_3_1_4.cython_function_or_method +zsyr2k: _cython_3_1_4.cython_function_or_method +zsyr2k_64: _cython_3_1_4.cython_function_or_method +zsyr_64: _cython_3_1_4.cython_function_or_method +zsyrk: _cython_3_1_4.cython_function_or_method +zsyrk_64: _cython_3_1_4.cython_function_or_method +zsyrkx: _cython_3_1_4.cython_function_or_method +zsyrkx_64: _cython_3_1_4.cython_function_or_method +ztbmv: _cython_3_1_4.cython_function_or_method +ztbmv_64: _cython_3_1_4.cython_function_or_method +ztbsv: _cython_3_1_4.cython_function_or_method +ztbsv_64: _cython_3_1_4.cython_function_or_method +ztpmv: _cython_3_1_4.cython_function_or_method +ztpmv_64: _cython_3_1_4.cython_function_or_method +ztpsv: _cython_3_1_4.cython_function_or_method +ztpsv_64: _cython_3_1_4.cython_function_or_method +ztpttr: _cython_3_1_4.cython_function_or_method +ztrmm: _cython_3_1_4.cython_function_or_method +ztrmm_64: _cython_3_1_4.cython_function_or_method +ztrmm_strided_batched_64: _cython_3_1_4.cython_function_or_method +ztrmv: _cython_3_1_4.cython_function_or_method +ztrmv_64: _cython_3_1_4.cython_function_or_method +ztrsm: _cython_3_1_4.cython_function_or_method +ztrsm_64: _cython_3_1_4.cython_function_or_method +ztrsm_batched: _cython_3_1_4.cython_function_or_method +ztrsm_batched_64: _cython_3_1_4.cython_function_or_method +ztrsv: _cython_3_1_4.cython_function_or_method +ztrsv_64: _cython_3_1_4.cython_function_or_method +ztrttp: _cython_3_1_4.cython_function_or_method class AtomicsMode(enum.IntEnum): """See `cublasAtomicsMode_t`.""" diff --git a/nvmath/bindings/cublas.pyx b/nvmath/bindings/cublas.pyx index 85b7604..094e4ff 100644 --- a/nvmath/bindings/cublas.pyx +++ b/nvmath/bindings/cublas.pyx @@ -171,6 +171,362 @@ cpdef inline check_status(int status): raise cuBLASError(status) 
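Editor's note (not part of the patch): the `*_strided_batched_64` adapters added below emulate strided-batched variants of `hemm`/`symm`/`trmm`/`dgmm` by advancing each operand's base address by its stride and issuing one ordinary cuBLAS call per batch entry; since the stride is added directly to the raw `intptr_t` address, it is interpreted in bytes. The NumPy sketch that follows illustrates the same per-batch decomposition on host data — the helper name, the 3-D array shapes, and the use of `np.matmul` as the stand-in per-batch operation are illustrative assumptions, not the nvmath API.

```python
import numpy as np

# Illustrative sketch only: a strided-batched operation is just the plain
# per-matrix operation applied slice by slice, with a fixed stride between
# consecutive problems. NumPy's matmul stands in for the per-batch cuBLAS call.
def strided_batched(op, a, b, c, alpha, beta, batch_count):
    for i in range(batch_count):  # one ordinary call per batch entry
        c[i] = alpha * op(a[i], b[i]) + beta * c[i]
    return c

rng = np.random.default_rng(0)
batch, n = 4, 8
a = rng.standard_normal((batch, n, n))
b = rng.standard_normal((batch, n, n))
c = np.zeros((batch, n, n))
strided_batched(np.matmul, a, b, c, alpha=1.0, beta=0.0, batch_count=batch)
```

Because each batch entry becomes a separate library call on the handle's current stream, these adapters trade per-call launch overhead for coverage of routines that these bindings otherwise expose only in non-batched form.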
+############################################################################### +# Convenience wrappers/adapters +############################################################################### + + +cpdef void zhemm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t beta, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + zhemm(handle, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + +cpdef void chemm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t beta, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + chemm(handle, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void ssymm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t beta, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + ssymm(handle, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void dsymm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t beta, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + dsymm(handle, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void csymm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t beta, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + csymm(handle, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void zsymm_strided_batched_64( + intptr_t 
handle, + int side, + int uplo, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t beta, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + zsymm(handle, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void strmm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int trans_a, + int diag, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + strmm(handle, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb, c_batch, ldc) + + +cpdef void dtrmm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int trans_a, + int diag, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + dtrmm(handle, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb, c_batch, ldc) + +cpdef void ctrmm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int trans_a, + int diag, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + ctrmm(handle, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb, c_batch, ldc) + +cpdef void ztrmm_strided_batched_64( + intptr_t handle, + int side, + int uplo, + int trans_a, + int diag, + int64_t m, + int64_t n, + intptr_t alpha, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t b, int64_t ldb, int64_t stride_b, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + ztrmm(handle, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb, c_batch, ldc) + + +cpdef void sdgmm_strided_batched_64( + intptr_t handle, + int mode, + int64_t m, + int64_t n, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t x, int64_t incx, int64_t stride_x, + intptr_t c, int64_t ldc, int64_t 
stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t x_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + x_batch = (x + batch_idx * stride_x) + c_batch = (c + batch_idx * stride_c) + + sdgmm(handle, mode, m, n, a_batch, lda, x_batch, incx, c_batch, ldc) + + +cpdef void ddgmm_strided_batched_64( + intptr_t handle, + int mode, + int64_t m, + int64_t n, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t x, int64_t incx, int64_t stride_x, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t x_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + x_batch = (x + batch_idx * stride_x) + c_batch = (c + batch_idx * stride_c) + + ddgmm(handle, mode, m, n, a_batch, lda, x_batch, incx, c_batch, ldc) + +cpdef void cdgmm_strided_batched_64( + intptr_t handle, + int mode, + int64_t m, + int64_t n, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t x, int64_t incx, int64_t stride_x, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t x_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + x_batch = (x + batch_idx * stride_x) + c_batch = (c + batch_idx * stride_c) + + cdgmm(handle, mode, m, n, a_batch, lda, x_batch, incx, c_batch, ldc) + +cpdef void zdgmm_strided_batched_64( + intptr_t handle, + int mode, + int64_t m, + int64_t n, + intptr_t a, int64_t lda, int64_t stride_a, + intptr_t x, int64_t incx, int64_t stride_x, + intptr_t c, int64_t ldc, int64_t stride_c, + int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t x_batch + cdef intptr_t c_batch + cdef int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + x_batch = (x + batch_idx * stride_x) + c_batch = (c + batch_idx * stride_c) + + zdgmm(handle, mode, m, n, a_batch, lda, x_batch, incx, c_batch, ldc) + + ############################################################################### # Wrapper functions ############################################################################### diff --git a/nvmath/bindings/cublasMp.pxd b/nvmath/bindings/cublasMp.pxd new file mode 100644 index 0000000..ed317e4 --- /dev/null +++ b/nvmath/bindings/cublasMp.pxd @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 0.5.0 to 0.6.0. Do not modify it directly. 
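For orientation, here is a minimal sketch of driving one of the strided-batched adapters above from Python. It assumes the wrappers are exposed through `nvmath.bindings.cublas` with the usual generated `create`/`destroy` helpers, and it treats the `stride_*` arguments as byte offsets between consecutive matrices, since the loops above advance the raw `intptr_t` addresses directly; both points are assumptions rather than something the patch states.

```python
import cupy as cp
import numpy as np

from nvmath.bindings import cublas  # assumed import path

n, batch = 128, 4
# Column-major n x n matrices stacked along the last axis, contiguous per batch entry.
a = cp.asfortranarray(cp.random.rand(n, n, batch))
b = cp.asfortranarray(cp.random.rand(n, n, batch))
c = cp.zeros((n, n, batch), order="F")

alpha = np.array(1.0, dtype=np.float64)  # host scalars (default pointer mode)
beta = np.array(0.0, dtype=np.float64)
stride = n * n * a.itemsize  # assumed: byte offset between consecutive matrices

handle = cublas.create()
try:
    cublas.dsymm_strided_batched_64(
        handle,
        0, 0,                       # CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER
        n, n,
        alpha.ctypes.data,
        a.data.ptr, n, stride,
        b.data.ptr, n, stride,
        beta.ctypes.data,
        c.data.ptr, n, stride,
        batch,
    )
finally:
    cublas.destroy(handle)
```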
+ +from libc.stdint cimport intptr_t + +from .cycublasMp cimport * + + +############################################################################### +# Types +############################################################################### + +ctypedef ncclComm_t ncclComm +ctypedef cublasMpHandle_t Handle +ctypedef cublasMpGrid_t Grid +ctypedef cublasMpMatrixDescriptor_t MatrixDescriptor +ctypedef cublasMpMatmulDescriptor_t MatmulDescriptor +ctypedef cublasMpLoggerCallback_t LoggerCallback + +ctypedef cudaStream_t Stream +ctypedef cudaDataType DataType +ctypedef libraryPropertyType_t LibraryPropertyType + + +############################################################################### +# Enum +############################################################################### + +ctypedef cublasOperation_t _Operation +ctypedef cublasComputeType_t _ComputeType +ctypedef cublasMpStatus_t _Status +ctypedef cublasMpGridLayout_t _GridLayout +ctypedef cublasMpMatmulDescriptorAttribute_t _MatmulDescriptorAttribute +ctypedef cublasMpMatmulAlgoType_t _MatmulAlgoType +ctypedef cublasMpMatmulEpilogue_t _MatmulEpilogue +ctypedef cublasMpMatmulMatrixScale_t _MatmulMatrixScale + + +############################################################################### +# Functions +############################################################################### + +cpdef intptr_t create(intptr_t stream) except? 0 +cpdef destroy(intptr_t handle) +cpdef stream_set(intptr_t handle, intptr_t stream) +cpdef int get_version() except? 0 +cpdef intptr_t grid_create(int64_t nprow, int64_t npcol, int layout, intptr_t comm) except? 0 +cpdef grid_destroy(intptr_t grid) +cpdef intptr_t matrix_descriptor_create(int64_t m, int64_t n, int64_t mb, int64_t nb, int64_t rsrc, int64_t csrc, int64_t lld, int type, intptr_t grid) except? 0 +cpdef matrix_descriptor_destroy(intptr_t desc) +cpdef intptr_t matmul_descriptor_create(int compute_type) except? 0 +cpdef matmul_descriptor_destroy(intptr_t matmul_desc) +cpdef get_matmul_descriptor_attribute_dtype(int attr) +cpdef matmul_descriptor_attribute_set(intptr_t matmul_desc, int attr, intptr_t buf, size_t size_in_bytes) +cpdef matmul_descriptor_attribute_get(intptr_t matmul_desc, int attr, intptr_t buf, size_t size_in_bytes, intptr_t size_written) +cpdef tuple matmul_buffer_size(intptr_t handle, intptr_t matmul_desc, int64_t m, int64_t n, int64_t k, intptr_t alpha, intptr_t a, int64_t ia, int64_t ja, intptr_t desc_a, intptr_t b, int64_t ib, int64_t jb, intptr_t desc_b, intptr_t beta, intptr_t c, int64_t ic, int64_t jc, intptr_t desc_c, intptr_t d, int64_t id, int64_t jd, intptr_t desc_d) +cpdef matmul(intptr_t handle, intptr_t matmul_desc, int64_t m, int64_t n, int64_t k, intptr_t alpha, intptr_t a, int64_t ia, int64_t ja, intptr_t desc_a, intptr_t b, int64_t ib, int64_t jb, intptr_t desc_b, intptr_t beta, intptr_t c, int64_t ic, int64_t jc, intptr_t desc_c, intptr_t d, int64_t id, int64_t jd, intptr_t desc_d, intptr_t d_work, size_t workspace_size_in_bytes_on_device, intptr_t h_work, size_t workspace_size_in_bytes_on_host) +cpdef int64_t numroc(int64_t n, int64_t nb, uint32_t iproc, uint32_t isrcproc, uint32_t nprocs) except? -1 diff --git a/nvmath/bindings/cublasMp.pyx b/nvmath/bindings/cublasMp.pyx new file mode 100644 index 0000000..009dd71 --- /dev/null +++ b/nvmath/bindings/cublasMp.pyx @@ -0,0 +1,310 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
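The declarations above follow a ScaLAPACK-style flow: build a process grid, describe each block-cyclically distributed matrix, query workspace sizes, then call `matmul`. The `numroc` helper reports how many rows or columns of a distributed matrix a given process owns, which is what typically sizes the local allocation and the `lld` passed to `matrix_descriptor_create`. A small sketch of that sizing step, assuming the module is importable as `nvmath.bindings.cublasMp`:

```python
from nvmath.bindings import cublasMp  # assumed import path

# A 1000 x 1000 global matrix, block-cyclically distributed over a
# 2 x 2 process grid with 128 x 128 blocks (ScaLAPACK conventions).
m = n = 1000
mb = nb = 128
nprow = npcol = 2

for prow in range(nprow):
    for pcol in range(npcol):
        local_rows = cublasMp.numroc(m, mb, prow, 0, nprow)
        local_cols = cublasMp.numroc(n, nb, pcol, 0, npcol)
        print(f"grid position ({prow}, {pcol}) owns a {local_rows} x {local_cols} local tile")
```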
+# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 0.5.0 to 0.6.0. Do not modify it directly. + +cimport cython # NOQA +from libc.stdint cimport int64_t +from libcpp.vector cimport vector + +from enum import IntEnum as _IntEnum + +import numpy as _numpy + +############################################################################### +# Enum +############################################################################### + +class Operation(_IntEnum): + """See `cublasOperation_t`.""" + N = CUBLAS_OP_N + T = CUBLAS_OP_T + C = CUBLAS_OP_C + HERMITAN = CUBLAS_OP_HERMITAN + CONJG = CUBLAS_OP_CONJG + +class ComputeType(_IntEnum): + """See `cublasComputeType_t`.""" + COMPUTE_16F = CUBLAS_COMPUTE_16F + COMPUTE_16F_PEDANTIC = CUBLAS_COMPUTE_16F_PEDANTIC + COMPUTE_32F = CUBLAS_COMPUTE_32F + COMPUTE_32F_PEDANTIC = CUBLAS_COMPUTE_32F_PEDANTIC + COMPUTE_32F_FAST_16F = CUBLAS_COMPUTE_32F_FAST_16F + COMPUTE_32F_FAST_16BF = CUBLAS_COMPUTE_32F_FAST_16BF + COMPUTE_32F_FAST_TF32 = CUBLAS_COMPUTE_32F_FAST_TF32 + COMPUTE_64F = CUBLAS_COMPUTE_64F + COMPUTE_64F_PEDANTIC = CUBLAS_COMPUTE_64F_PEDANTIC + COMPUTE_32I = CUBLAS_COMPUTE_32I + COMPUTE_32I_PEDANTIC = CUBLAS_COMPUTE_32I_PEDANTIC + +class Status(_IntEnum): + """See `cublasMpStatus_t`.""" + SUCCESS = CUBLASMP_STATUS_SUCCESS + NOT_INITIALIZED = CUBLASMP_STATUS_NOT_INITIALIZED + ALLOCATION_FAILED = CUBLASMP_STATUS_ALLOCATION_FAILED + INVALID_VALUE = CUBLASMP_STATUS_INVALID_VALUE + ARCHITECTURE_MISMATCH = CUBLASMP_STATUS_ARCHITECTURE_MISMATCH + EXECUTION_FAILED = CUBLASMP_STATUS_EXECUTION_FAILED + INTERNAL_ERROR = CUBLASMP_STATUS_INTERNAL_ERROR + NOT_SUPPORTED = CUBLASMP_STATUS_NOT_SUPPORTED + +class GridLayout(_IntEnum): + """See `cublasMpGridLayout_t`.""" + COL_MAJOR = CUBLASMP_GRID_LAYOUT_COL_MAJOR + ROW_MAJOR = CUBLASMP_GRID_LAYOUT_ROW_MAJOR + +class MatmulDescriptorAttribute(_IntEnum): + """See `cublasMpMatmulDescriptorAttribute_t`.""" + TRANSA = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSA + TRANSB = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSB + COMPUTE_TYPE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMPUTE_TYPE + ALGO_TYPE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_ALGO_TYPE + COMMUNICATION_SM_COUNT = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMMUNICATION_SM_COUNT + EPILOGUE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE + BIAS_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_POINTER + BIAS_BATCH_STRIDE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_BATCH_STRIDE + BIAS_DATA_TYPE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_DATA_TYPE + EPILOGUE_AUX_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_POINTER + EPILOGUE_AUX_LD = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_LD + EPILOGUE_AUX_BATCH_STRIDE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_BATCH_STRIDE + EPILOGUE_AUX_DATA_TYPE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_DATA_TYPE + EPILOGUE_AUX_SCALE_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_POINTER + EPILOGUE_AUX_AMAX_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_AMAX_POINTER + EPILOGUE_AUX_SCALE_MODE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_MODE + A_SCALE_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_POINTER + A_SCALE_MODE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_MODE + B_SCALE_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_POINTER + B_SCALE_MODE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_MODE + C_SCALE_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_POINTER + C_SCALE_MODE = 
CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_MODE + D_SCALE_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_POINTER + D_SCALE_MODE = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_MODE + AMAX_D_POINTER = CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_AMAX_D_POINTER + +class MatmulAlgoType(_IntEnum): + """See `cublasMpMatmulAlgoType_t`.""" + DEFAULT = CUBLASMP_MATMUL_ALGO_TYPE_DEFAULT + SPLIT_P2P = CUBLASMP_MATMUL_ALGO_TYPE_SPLIT_P2P + SPLIT_MULTICAST = CUBLASMP_MATMUL_ALGO_TYPE_SPLIT_MULTICAST + ATOMIC_P2P = CUBLASMP_MATMUL_ALGO_TYPE_ATOMIC_P2P + ATOMIC_MULTICAST = CUBLASMP_MATMUL_ALGO_TYPE_ATOMIC_MULTICAST + +class MatmulEpilogue(_IntEnum): + """See `cublasMpMatmulEpilogue_t`.""" + DEFAULT = CUBLASMP_MATMUL_EPILOGUE_DEFAULT + ALLREDUCE = CUBLASMP_MATMUL_EPILOGUE_ALLREDUCE + RELU = CUBLASMP_MATMUL_EPILOGUE_RELU + RELU_AUX = CUBLASMP_MATMUL_EPILOGUE_RELU_AUX + BIAS = CUBLASMP_MATMUL_EPILOGUE_BIAS + RELU_BIAS = CUBLASMP_MATMUL_EPILOGUE_RELU_BIAS + RELU_AUX_BIAS = CUBLASMP_MATMUL_EPILOGUE_RELU_AUX_BIAS + DRELU = CUBLASMP_MATMUL_EPILOGUE_DRELU + DRELU_BGRAD = CUBLASMP_MATMUL_EPILOGUE_DRELU_BGRAD + GELU = CUBLASMP_MATMUL_EPILOGUE_GELU + GELU_AUX = CUBLASMP_MATMUL_EPILOGUE_GELU_AUX + GELU_BIAS = CUBLASMP_MATMUL_EPILOGUE_GELU_BIAS + GELU_AUX_BIAS = CUBLASMP_MATMUL_EPILOGUE_GELU_AUX_BIAS + DGELU = CUBLASMP_MATMUL_EPILOGUE_DGELU + DGELU_BGRAD = CUBLASMP_MATMUL_EPILOGUE_DGELU_BGRAD + BGRADA = CUBLASMP_MATMUL_EPILOGUE_BGRADA + BGRADB = CUBLASMP_MATMUL_EPILOGUE_BGRADB + +class MatmulMatrixScale(_IntEnum): + """See `cublasMpMatmulMatrixScale_t`.""" + SCALAR_FP32 = CUBLASMP_MATMUL_MATRIX_SCALE_SCALAR_FP32 + VEC16_UE4M3 = CUBLASMP_MATMUL_MATRIX_SCALE_VEC16_UE4M3 + VEC32_UE8M0 = CUBLASMP_MATMUL_MATRIX_SCALE_VEC32_UE8M0 + OUTER_VEC_FP32 = CUBLASMP_MATMUL_MATRIX_SCALE_OUTER_VEC_FP32 + VEC128_FP32 = CUBLASMP_MATMUL_MATRIX_SCALE_VEC128_FP32 + BLK128x128_FP32 = CUBLASMP_MATMUL_MATRIX_SCALE_BLK128x128_FP32 + + +############################################################################### +# Error handling +############################################################################### + +class cuBLASMpError(Exception): + + def __init__(self, status): + self.status = status + s = Status(status) + cdef str err = f"{s.name} ({s.value})" + err = f"{err}. You can set CUBLASMP_LOG_LEVEL=5 and CUBLASLT_LOG_LEVEL=5 environment variables to enable logging to learn more." + super(cuBLASMpError, self).__init__(err) + + def __reduce__(self): + return (type(self), (self.status,)) + + +@cython.profile(False) +cpdef inline check_status(int status): + if status != 0: + raise cuBLASMpError(status) + + +############################################################################### +# Wrapper functions +############################################################################### + +cpdef intptr_t create(intptr_t stream) except? 0: + """See `cublasMpCreate`.""" + cdef Handle handle + with nogil: + __status__ = cublasMpCreate(&handle, stream) + check_status(__status__) + return handle + + +cpdef destroy(intptr_t handle): + """See `cublasMpDestroy`.""" + with nogil: + __status__ = cublasMpDestroy(handle) + check_status(__status__) + + +cpdef stream_set(intptr_t handle, intptr_t stream): + """See `cublasMpStreamSet`.""" + with nogil: + __status__ = cublasMpStreamSet(handle, stream) + check_status(__status__) + + +cpdef int get_version() except? 
0: + """See `cublasMpGetVersion`.""" + cdef int version + with nogil: + __status__ = cublasMpGetVersion(&version) + check_status(__status__) + return version + + +cpdef intptr_t grid_create(int64_t nprow, int64_t npcol, int layout, intptr_t comm) except? 0: + """See `cublasMpGridCreate`.""" + cdef Grid grid + with nogil: + __status__ = cublasMpGridCreate(nprow, npcol, <_GridLayout>layout, comm, &grid) + check_status(__status__) + return grid + + +cpdef grid_destroy(intptr_t grid): + """See `cublasMpGridDestroy`.""" + with nogil: + __status__ = cublasMpGridDestroy(grid) + check_status(__status__) + + +cpdef intptr_t matrix_descriptor_create(int64_t m, int64_t n, int64_t mb, int64_t nb, int64_t rsrc, int64_t csrc, int64_t lld, int type, intptr_t grid) except? 0: + """See `cublasMpMatrixDescriptorCreate`.""" + cdef MatrixDescriptor desc + with nogil: + __status__ = cublasMpMatrixDescriptorCreate(m, n, mb, nb, rsrc, csrc, lld, type, grid, &desc) + check_status(__status__) + return desc + + +cpdef matrix_descriptor_destroy(intptr_t desc): + """See `cublasMpMatrixDescriptorDestroy`.""" + with nogil: + __status__ = cublasMpMatrixDescriptorDestroy(desc) + check_status(__status__) + + +cpdef intptr_t matmul_descriptor_create(int compute_type) except? 0: + """See `cublasMpMatmulDescriptorCreate`.""" + cdef MatmulDescriptor matmul_desc + with nogil: + __status__ = cublasMpMatmulDescriptorCreate(&matmul_desc, <_ComputeType>compute_type) + check_status(__status__) + return matmul_desc + + +cpdef matmul_descriptor_destroy(intptr_t matmul_desc): + """See `cublasMpMatmulDescriptorDestroy`.""" + with nogil: + __status__ = cublasMpMatmulDescriptorDestroy(matmul_desc) + check_status(__status__) + + +######################### Python specific utility ######################### + +cdef dict matmul_descriptor_attribute_sizes = { + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSA: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSB: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMPUTE_TYPE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_ALGO_TYPE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMMUNICATION_SM_COUNT: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_BATCH_STRIDE: _numpy.int64, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_DATA_TYPE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_LD: _numpy.int64, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_BATCH_STRIDE: _numpy.int64, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_DATA_TYPE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_AMAX_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_MODE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_MODE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_MODE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_MODE: _numpy.int32, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_POINTER: _numpy.intp, + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_MODE: _numpy.int32, + 
CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_AMAX_D_POINTER: _numpy.intp, +} + +cpdef get_matmul_descriptor_attribute_dtype(int attr): + """Get the Python data type of the corresponding MatmulDescriptorAttribute attribute. + + Args: + attr (MatmulDescriptorAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`matmul_descriptor_attribute_get`, :func:`matmul_descriptor_attribute_set`. + """ + return matmul_descriptor_attribute_sizes[attr] + +########################################################################### + + +cpdef matmul_descriptor_attribute_set(intptr_t matmul_desc, int attr, intptr_t buf, size_t size_in_bytes): + """See `cublasMpMatmulDescriptorAttributeSet`.""" + with nogil: + __status__ = cublasMpMatmulDescriptorAttributeSet(matmul_desc, <_MatmulDescriptorAttribute>attr, buf, size_in_bytes) + check_status(__status__) + + +cpdef matmul_descriptor_attribute_get(intptr_t matmul_desc, int attr, intptr_t buf, size_t size_in_bytes, intptr_t size_written): + """See `cublasMpMatmulDescriptorAttributeGet`.""" + with nogil: + __status__ = cublasMpMatmulDescriptorAttributeGet(matmul_desc, <_MatmulDescriptorAttribute>attr, buf, size_in_bytes, size_written) + check_status(__status__) + + +cpdef tuple matmul_buffer_size(intptr_t handle, intptr_t matmul_desc, int64_t m, int64_t n, int64_t k, intptr_t alpha, intptr_t a, int64_t ia, int64_t ja, intptr_t desc_a, intptr_t b, int64_t ib, int64_t jb, intptr_t desc_b, intptr_t beta, intptr_t c, int64_t ic, int64_t jc, intptr_t desc_c, intptr_t d, int64_t id, int64_t jd, intptr_t desc_d): + """See `cublasMpMatmul_bufferSize`.""" + cdef size_t workspace_size_in_bytes_on_device + cdef size_t workspace_size_in_bytes_on_host + with nogil: + __status__ = cublasMpMatmul_bufferSize(handle, matmul_desc, m, n, k, alpha, a, ia, ja, desc_a, b, ib, jb, desc_b, beta, c, ic, jc, desc_c, d, id, jd, desc_d, &workspace_size_in_bytes_on_device, &workspace_size_in_bytes_on_host) + check_status(__status__) + return (workspace_size_in_bytes_on_device, workspace_size_in_bytes_on_host) + + +cpdef matmul(intptr_t handle, intptr_t matmul_desc, int64_t m, int64_t n, int64_t k, intptr_t alpha, intptr_t a, int64_t ia, int64_t ja, intptr_t desc_a, intptr_t b, int64_t ib, int64_t jb, intptr_t desc_b, intptr_t beta, intptr_t c, int64_t ic, int64_t jc, intptr_t desc_c, intptr_t d, int64_t id, int64_t jd, intptr_t desc_d, intptr_t d_work, size_t workspace_size_in_bytes_on_device, intptr_t h_work, size_t workspace_size_in_bytes_on_host): + """See `cublasMpMatmul`.""" + with nogil: + __status__ = cublasMpMatmul(handle, matmul_desc, m, n, k, alpha, a, ia, ja, desc_a, b, ib, jb, desc_b, beta, c, ic, jc, desc_c, d, id, jd, desc_d, d_work, workspace_size_in_bytes_on_device, h_work, workspace_size_in_bytes_on_host) + check_status(__status__) + + +cpdef int64_t numroc(int64_t n, int64_t nb, uint32_t iproc, uint32_t isrcproc, uint32_t nprocs) except? -1: + """See `cublasMpNumroc`.""" + return cublasMpNumroc(n, nb, iproc, isrcproc, nprocs) diff --git a/nvmath/bindings/cudss.pxd b/nvmath/bindings/cudss.pxd index 5b379b7..7d7e494 100644 --- a/nvmath/bindings/cudss.pxd +++ b/nvmath/bindings/cudss.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.5.0. Do not modify it directly. +# This code was automatically generated with version 0.7.0. Do not modify it directly. 
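Since `get_matmul_descriptor_attribute_dtype` has no C counterpart, a quick illustration of how it pairs with `matmul_descriptor_attribute_set`: stage the attribute value in a NumPy scalar of the reported dtype and pass its address and size. The import path is again assumed to be `nvmath.bindings.cublasMp`.

```python
import numpy as np

from nvmath.bindings import cublasMp  # assumed import path

desc = cublasMp.matmul_descriptor_create(cublasMp.ComputeType.COMPUTE_32F)

attr = cublasMp.MatmulDescriptorAttribute.TRANSA
dtype = cublasMp.get_matmul_descriptor_attribute_dtype(attr)  # numpy.int32 for TRANSA

# Host buffer of the right width; the binding receives its address and size in bytes.
value = np.asarray(cublasMp.Operation.T, dtype=dtype)
cublasMp.matmul_descriptor_attribute_set(desc, attr, value.ctypes.data, value.nbytes)

cublasMp.matmul_descriptor_destroy(desc)
```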
from libc.stdint cimport intptr_t @@ -63,6 +63,7 @@ cpdef config_destroy(intptr_t solver_config) cpdef intptr_t data_create(intptr_t handle) except? 0 cpdef data_destroy(intptr_t handle, intptr_t solver_data) cpdef intptr_t create() except? 0 +cpdef intptr_t create_mg(int device_count, device_indices) except? 0 cpdef destroy(intptr_t handle) cpdef int get_property(int property_type) except? -1 cpdef intptr_t matrix_create_dn(int64_t nrows, int64_t ncols, int64_t ld, intptr_t values, int value_type, int layout) except? 0 @@ -70,8 +71,8 @@ cpdef intptr_t matrix_create_csr(int64_t nrows, int64_t ncols, int64_t nnz, intp cpdef intptr_t matrix_create_batch_dn(int64_t batch_count, intptr_t nrows, intptr_t ncols, intptr_t ld, intptr_t values, int index_type, int value_type, int layout) except? 0 cpdef intptr_t matrix_create_batch_csr(int64_t batch_count, intptr_t nrows, intptr_t ncols, intptr_t nnz, intptr_t row_start, intptr_t row_end, intptr_t col_indices, intptr_t values, int index_type, int value_type, int mtype, int mview, int index_base) except? 0 cpdef matrix_destroy(intptr_t matrix) -cpdef tuple matrix_get_dn(intptr_t matrix) -cpdef tuple matrix_get_csr(intptr_t matrix) +cpdef matrix_get_dn(intptr_t matrix, intptr_t nrows, intptr_t ncols, intptr_t ld, intptr_t values, intptr_t type, intptr_t layout) +cpdef matrix_get_csr(intptr_t matrix, intptr_t nrows, intptr_t ncols, intptr_t nnz, intptr_t row_start, intptr_t row_end, intptr_t col_indices, intptr_t values, intptr_t index_type, intptr_t value_type, intptr_t mtype, intptr_t mview, intptr_t index_base) cpdef matrix_set_values(intptr_t matrix, intptr_t values) cpdef matrix_set_csr_pointers(intptr_t matrix, intptr_t row_offsets, intptr_t row_end, intptr_t col_indices, intptr_t values) cpdef matrix_get_batch_dn(intptr_t matrix, intptr_t batch_count, intptr_t nrows, intptr_t ncols, intptr_t ld, intptr_t values, intptr_t index_type, intptr_t value_type, intptr_t layout) @@ -79,5 +80,7 @@ cpdef matrix_get_batch_csr(intptr_t matrix, intptr_t batch_count, intptr_t nrows cpdef matrix_set_batch_values(intptr_t matrix, intptr_t values) cpdef matrix_set_batch_csr_pointers(intptr_t matrix, intptr_t row_offsets, intptr_t row_end, intptr_t col_indices, intptr_t values) cpdef int matrix_get_format(intptr_t matrix) except? 
-1 +cpdef matrix_set_distribution_row1d(intptr_t matrix, int64_t first_row, int64_t last_row) +cpdef matrix_get_distribution_row1d(intptr_t matrix, intptr_t first_row, intptr_t last_row) cpdef get_device_mem_handler(intptr_t handle, intptr_t handler) cpdef set_device_mem_handler(intptr_t handle, intptr_t handler) diff --git a/nvmath/bindings/cudss.pyi b/nvmath/bindings/cudss.pyi index f9bbeb4..14a04a0 100644 --- a/nvmath/bindings/cudss.pyi +++ b/nvmath/bindings/cudss.pyi @@ -2,64 +2,71 @@ # # SPDX-License-Identifier: Apache-2.0 -import _cython_3_0_12 +import _cython_3_1_4 import enum from typing import Any, Callable, ClassVar __pyx_capi__: dict __test__: dict -check_status: _cython_3_0_12.cython_function_or_method -config_create: _cython_3_0_12.cython_function_or_method -config_destroy: _cython_3_0_12.cython_function_or_method -config_get: _cython_3_0_12.cython_function_or_method -config_set: _cython_3_0_12.cython_function_or_method -create: _cython_3_0_12.cython_function_or_method -data_create: _cython_3_0_12.cython_function_or_method -data_destroy: _cython_3_0_12.cython_function_or_method -data_get: _cython_3_0_12.cython_function_or_method -data_set: _cython_3_0_12.cython_function_or_method -destroy: _cython_3_0_12.cython_function_or_method -execute: _cython_3_0_12.cython_function_or_method -get_config_param_dtype: _cython_3_0_12.cython_function_or_method -get_data_param_dtype: _cython_3_0_12.cython_function_or_method -get_device_mem_handler: _cython_3_0_12.cython_function_or_method -get_property: _cython_3_0_12.cython_function_or_method -matrix_create_batch_csr: _cython_3_0_12.cython_function_or_method -matrix_create_batch_dn: _cython_3_0_12.cython_function_or_method -matrix_create_csr: _cython_3_0_12.cython_function_or_method -matrix_create_dn: _cython_3_0_12.cython_function_or_method -matrix_destroy: _cython_3_0_12.cython_function_or_method -matrix_get_batch_csr: _cython_3_0_12.cython_function_or_method -matrix_get_batch_dn: _cython_3_0_12.cython_function_or_method -matrix_get_csr: _cython_3_0_12.cython_function_or_method -matrix_get_dn: _cython_3_0_12.cython_function_or_method -matrix_get_format: _cython_3_0_12.cython_function_or_method -matrix_set_batch_csr_pointers: _cython_3_0_12.cython_function_or_method -matrix_set_batch_values: _cython_3_0_12.cython_function_or_method -matrix_set_csr_pointers: _cython_3_0_12.cython_function_or_method -matrix_set_values: _cython_3_0_12.cython_function_or_method -set_comm_layer: _cython_3_0_12.cython_function_or_method -set_device_mem_handler: _cython_3_0_12.cython_function_or_method -set_stream: _cython_3_0_12.cython_function_or_method -set_threading_layer: _cython_3_0_12.cython_function_or_method +check_status: _cython_3_1_4.cython_function_or_method +config_create: _cython_3_1_4.cython_function_or_method +config_destroy: _cython_3_1_4.cython_function_or_method +config_get: _cython_3_1_4.cython_function_or_method +config_set: _cython_3_1_4.cython_function_or_method +create: _cython_3_1_4.cython_function_or_method +create_mg: _cython_3_1_4.cython_function_or_method +data_create: _cython_3_1_4.cython_function_or_method +data_destroy: _cython_3_1_4.cython_function_or_method +data_get: _cython_3_1_4.cython_function_or_method +data_set: _cython_3_1_4.cython_function_or_method +destroy: _cython_3_1_4.cython_function_or_method +execute: _cython_3_1_4.cython_function_or_method +get_config_param_dtype: _cython_3_1_4.cython_function_or_method +get_data_param_dtype: _cython_3_1_4.cython_function_or_method +get_device_mem_handler: 
_cython_3_1_4.cython_function_or_method +get_property: _cython_3_1_4.cython_function_or_method +matrix_create_batch_csr: _cython_3_1_4.cython_function_or_method +matrix_create_batch_dn: _cython_3_1_4.cython_function_or_method +matrix_create_csr: _cython_3_1_4.cython_function_or_method +matrix_create_dn: _cython_3_1_4.cython_function_or_method +matrix_destroy: _cython_3_1_4.cython_function_or_method +matrix_get_batch_csr: _cython_3_1_4.cython_function_or_method +matrix_get_batch_dn: _cython_3_1_4.cython_function_or_method +matrix_get_csr: _cython_3_1_4.cython_function_or_method +matrix_get_distribution_row1d: _cython_3_1_4.cython_function_or_method +matrix_get_dn: _cython_3_1_4.cython_function_or_method +matrix_get_format: _cython_3_1_4.cython_function_or_method +matrix_set_batch_csr_pointers: _cython_3_1_4.cython_function_or_method +matrix_set_batch_values: _cython_3_1_4.cython_function_or_method +matrix_set_csr_pointers: _cython_3_1_4.cython_function_or_method +matrix_set_distribution_row1d: _cython_3_1_4.cython_function_or_method +matrix_set_values: _cython_3_1_4.cython_function_or_method +set_comm_layer: _cython_3_1_4.cython_function_or_method +set_device_mem_handler: _cython_3_1_4.cython_function_or_method +set_stream: _cython_3_1_4.cython_function_or_method +set_threading_layer: _cython_3_1_4.cython_function_or_method class AlgType(enum.IntEnum): + """See `cudssAlgType_t`.""" __new__: ClassVar[Callable] = ... ALG_1: ClassVar[AlgType] = ... ALG_2: ClassVar[AlgType] = ... ALG_3: ClassVar[AlgType] = ... + ALG_4: ClassVar[AlgType] = ... + ALG_5: ClassVar[AlgType] = ... ALG_DEFAULT: ClassVar[AlgType] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class ConfigParam(enum.IntEnum): + """See `cudssConfigParam_t`.""" __new__: ClassVar[Callable] = ... + DETERMINISTIC_MODE: ClassVar[ConfigParam] = ... + DEVICE_COUNT: ClassVar[ConfigParam] = ... + DEVICE_INDICES: ClassVar[ConfigParam] = ... FACTORIZATION_ALG: ClassVar[ConfigParam] = ... HOST_NTHREADS: ClassVar[ConfigParam] = ... HYBRID_DEVICE_MEMORY_LIMIT: ClassVar[ConfigParam] = ... @@ -67,50 +74,62 @@ class ConfigParam(enum.IntEnum): HYBRID_MODE: ClassVar[ConfigParam] = ... IR_N_STEPS: ClassVar[ConfigParam] = ... IR_TOL: ClassVar[ConfigParam] = ... - MATCHING_TYPE: ClassVar[ConfigParam] = ... + MATCHING_ALG: ClassVar[ConfigParam] = ... MAX_LU_NNZ: ClassVar[ConfigParam] = ... + ND_NLEVELS: ClassVar[ConfigParam] = ... PIVOT_EPSILON: ClassVar[ConfigParam] = ... PIVOT_EPSILON_ALG: ClassVar[ConfigParam] = ... PIVOT_THRESHOLD: ClassVar[ConfigParam] = ... PIVOT_TYPE: ClassVar[ConfigParam] = ... REORDERING_ALG: ClassVar[ConfigParam] = ... + SCHUR_MODE: ClassVar[ConfigParam] = ... SOLVE_ALG: ClassVar[ConfigParam] = ... SOLVE_MODE: ClassVar[ConfigParam] = ... + UBATCH_INDEX: ClassVar[ConfigParam] = ... + UBATCH_SIZE: ClassVar[ConfigParam] = ... USE_CUDA_REGISTER_MEMORY: ClassVar[ConfigParam] = ... + USE_MATCHING: ClassVar[ConfigParam] = ... + USE_SUPERPANELS: ClassVar[ConfigParam] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... 
- def __format__(self, *args, **kwargs) -> str: ... class DataParam(enum.IntEnum): + """See `cudssDataParam_t`.""" __new__: ClassVar[Callable] = ... COMM: ClassVar[DataParam] = ... DIAG: ClassVar[DataParam] = ... + ELIMINATION_TREE: ClassVar[DataParam] = ... HYBRID_DEVICE_MEMORY_MIN: ClassVar[DataParam] = ... INERTIA: ClassVar[DataParam] = ... INFO: ClassVar[DataParam] = ... LU_NNZ: ClassVar[DataParam] = ... MEMORY_ESTIMATES: ClassVar[DataParam] = ... NPIVOTS: ClassVar[DataParam] = ... + NSUPERPANELS: ClassVar[DataParam] = ... PERM_COL: ClassVar[DataParam] = ... + PERM_MATCHING: ClassVar[DataParam] = ... PERM_REORDER_COL: ClassVar[DataParam] = ... PERM_REORDER_ROW: ClassVar[DataParam] = ... PERM_ROW: ClassVar[DataParam] = ... + SCALE_COL: ClassVar[DataParam] = ... + SCALE_ROW: ClassVar[DataParam] = ... + SCHUR_MATRIX: ClassVar[DataParam] = ... + SCHUR_SHAPE: ClassVar[DataParam] = ... + USER_ELIMINATION_TREE: ClassVar[DataParam] = ... + USER_HOST_INTERRUPT: ClassVar[DataParam] = ... USER_PERM: ClassVar[DataParam] = ... + USER_SCHUR_INDICES: ClassVar[DataParam] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class IndexBase(enum.IntEnum): + """See `cudssIndexBase_t`.""" __new__: ClassVar[Callable] = ... ONE: ClassVar[IndexBase] = ... ZERO: ClassVar[IndexBase] = ... @@ -118,12 +137,10 @@ class IndexBase(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class Layout(enum.IntEnum): + """See `cudssLayout_t`.""" __new__: ClassVar[Callable] = ... COL_MAJOR: ClassVar[Layout] = ... ROW_MAJOR: ClassVar[Layout] = ... @@ -131,26 +148,23 @@ class Layout(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class MatrixFormat(enum.IntEnum): + """See `cudssMatrixFormat_t`.""" __new__: ClassVar[Callable] = ... BATCH: ClassVar[MatrixFormat] = ... CSR: ClassVar[MatrixFormat] = ... DENSE: ClassVar[MatrixFormat] = ... + DISTRIBUTED: ClassVar[MatrixFormat] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class MatrixType(enum.IntEnum): + """See `cudssMatrixType_t`.""" __new__: ClassVar[Callable] = ... GENERAL: ClassVar[MatrixType] = ... HERMITIAN: ClassVar[MatrixType] = ... @@ -161,12 +175,10 @@ class MatrixType(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... 
class MatrixViewType(enum.IntEnum): + """See `cudssMatrixViewType_t`.""" __new__: ClassVar[Callable] = ... FULL: ClassVar[MatrixViewType] = ... LOWER: ClassVar[MatrixViewType] = ... @@ -175,12 +187,10 @@ class MatrixViewType(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class OpType(enum.IntEnum): + """See `cudssOpType_t`.""" __new__: ClassVar[Callable] = ... MAX: ClassVar[OpType] = ... MIN: ClassVar[OpType] = ... @@ -189,30 +199,31 @@ class OpType(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class Phase(enum.IntEnum): + """See `cudssPhase_t`.""" __new__: ClassVar[Callable] = ... ANALYSIS: ClassVar[Phase] = ... FACTORIZATION: ClassVar[Phase] = ... REFACTORIZATION: ClassVar[Phase] = ... + REORDERING: ClassVar[Phase] = ... SOLVE: ClassVar[Phase] = ... SOLVE_BWD: ClassVar[Phase] = ... + SOLVE_BWD_PERM: ClassVar[Phase] = ... SOLVE_DIAG: ClassVar[Phase] = ... SOLVE_FWD: ClassVar[Phase] = ... + SOLVE_FWD_PERM: ClassVar[Phase] = ... + SOLVE_REFINEMENT: ClassVar[Phase] = ... + SYMBOLIC_FACTORIZATION: ClassVar[Phase] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class PivotType(enum.IntEnum): + """See `cudssPivotType_t`.""" __new__: ClassVar[Callable] = ... PIVOT_COL: ClassVar[PivotType] = ... PIVOT_NONE: ClassVar[PivotType] = ... @@ -221,12 +232,10 @@ class PivotType(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class Status(enum.IntEnum): + """See `cudssStatus_t`.""" __new__: ClassVar[Callable] = ... ALLOC_FAILED: ClassVar[Status] = ... EXECUTION_FAILED: ClassVar[Status] = ... @@ -239,11 +248,11 @@ class Status(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class cuDSSError(Exception): - def __init__(self, status) -> Any: ... - def __reduce__(self) -> Any: ... + """cuDSSError(status)""" + def __init__(self, status) -> Any: + """Initialize self. See help(type(self)) for accurate signature.""" + def __reduce__(self) -> Any: + """cuDSSError.__reduce__(self)""" diff --git a/nvmath/bindings/cudss.pyx b/nvmath/bindings/cudss.pyx index 42b714f..fecb1aa 100644 --- a/nvmath/bindings/cudss.pyx +++ b/nvmath/bindings/cudss.pyx @@ -2,10 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.5.0. Do not modify it directly. 
+# This code was automatically generated with version 0.7.0. Do not modify it directly. cimport cython # NOQA +from libcpp.vector cimport vector + +from ._internal.utils cimport (get_resource_ptr, get_resource_ptrs, nullable_unique_ptr, + get_buffer_pointer,) + from enum import IntEnum as _IntEnum import numpy as _numpy @@ -33,7 +38,8 @@ class ConfigParam(_IntEnum): REORDERING_ALG = CUDSS_CONFIG_REORDERING_ALG FACTORIZATION_ALG = CUDSS_CONFIG_FACTORIZATION_ALG SOLVE_ALG = CUDSS_CONFIG_SOLVE_ALG - MATCHING_TYPE = CUDSS_CONFIG_MATCHING_TYPE + USE_MATCHING = CUDSS_CONFIG_USE_MATCHING + MATCHING_ALG = CUDSS_CONFIG_MATCHING_ALG SOLVE_MODE = CUDSS_CONFIG_SOLVE_MODE IR_N_STEPS = CUDSS_CONFIG_IR_N_STEPS IR_TOL = CUDSS_CONFIG_IR_TOL @@ -47,6 +53,14 @@ class ConfigParam(_IntEnum): HOST_NTHREADS = CUDSS_CONFIG_HOST_NTHREADS HYBRID_EXECUTE_MODE = CUDSS_CONFIG_HYBRID_EXECUTE_MODE PIVOT_EPSILON_ALG = CUDSS_CONFIG_PIVOT_EPSILON_ALG + ND_NLEVELS = CUDSS_CONFIG_ND_NLEVELS + UBATCH_SIZE = CUDSS_CONFIG_UBATCH_SIZE + UBATCH_INDEX = CUDSS_CONFIG_UBATCH_INDEX + USE_SUPERPANELS = CUDSS_CONFIG_USE_SUPERPANELS + DEVICE_COUNT = CUDSS_CONFIG_DEVICE_COUNT + DEVICE_INDICES = CUDSS_CONFIG_DEVICE_INDICES + SCHUR_MODE = CUDSS_CONFIG_SCHUR_MODE + DETERMINISTIC_MODE = CUDSS_CONFIG_DETERMINISTIC_MODE class DataParam(_IntEnum): """See `cudssDataParam_t`.""" @@ -63,16 +77,31 @@ class DataParam(_IntEnum): HYBRID_DEVICE_MEMORY_MIN = CUDSS_DATA_HYBRID_DEVICE_MEMORY_MIN COMM = CUDSS_DATA_COMM MEMORY_ESTIMATES = CUDSS_DATA_MEMORY_ESTIMATES + PERM_MATCHING = CUDSS_DATA_PERM_MATCHING + SCALE_ROW = CUDSS_DATA_SCALE_ROW + SCALE_COL = CUDSS_DATA_SCALE_COL + NSUPERPANELS = CUDSS_DATA_NSUPERPANELS + USER_SCHUR_INDICES = CUDSS_DATA_USER_SCHUR_INDICES + SCHUR_SHAPE = CUDSS_DATA_SCHUR_SHAPE + SCHUR_MATRIX = CUDSS_DATA_SCHUR_MATRIX + USER_ELIMINATION_TREE = CUDSS_DATA_USER_ELIMINATION_TREE + ELIMINATION_TREE = CUDSS_DATA_ELIMINATION_TREE + USER_HOST_INTERRUPT = CUDSS_DATA_USER_HOST_INTERRUPT class Phase(_IntEnum): """See `cudssPhase_t`.""" + REORDERING = CUDSS_PHASE_REORDERING + SYMBOLIC_FACTORIZATION = CUDSS_PHASE_SYMBOLIC_FACTORIZATION ANALYSIS = CUDSS_PHASE_ANALYSIS FACTORIZATION = CUDSS_PHASE_FACTORIZATION REFACTORIZATION = CUDSS_PHASE_REFACTORIZATION - SOLVE = CUDSS_PHASE_SOLVE + SOLVE_FWD_PERM = CUDSS_PHASE_SOLVE_FWD_PERM SOLVE_FWD = CUDSS_PHASE_SOLVE_FWD SOLVE_DIAG = CUDSS_PHASE_SOLVE_DIAG SOLVE_BWD = CUDSS_PHASE_SOLVE_BWD + SOLVE_BWD_PERM = CUDSS_PHASE_SOLVE_BWD_PERM + SOLVE_REFINEMENT = CUDSS_PHASE_SOLVE_REFINEMENT + SOLVE = CUDSS_PHASE_SOLVE class Status(_IntEnum): """See `cudssStatus_t`.""" @@ -114,6 +143,8 @@ class AlgType(_IntEnum): ALG_1 = CUDSS_ALG_1 ALG_2 = CUDSS_ALG_2 ALG_3 = CUDSS_ALG_3 + ALG_4 = CUDSS_ALG_4 + ALG_5 = CUDSS_ALG_5 class PivotType(_IntEnum): """See `cudssPivotType_t`.""" @@ -126,6 +157,7 @@ class MatrixFormat(_IntEnum): DENSE = CUDSS_MFORMAT_DENSE CSR = CUDSS_MFORMAT_CSR BATCH = CUDSS_MFORMAT_BATCH + DISTRIBUTED = CUDSS_MFORMAT_DISTRIBUTED ############################################################################### # Error handling @@ -159,7 +191,8 @@ cdef dict config_param_sizes = { CUDSS_CONFIG_REORDERING_ALG: _numpy.int32, CUDSS_CONFIG_FACTORIZATION_ALG: _numpy.int32, CUDSS_CONFIG_SOLVE_ALG: _numpy.int32, - CUDSS_CONFIG_MATCHING_TYPE: _numpy.int32, + CUDSS_CONFIG_USE_MATCHING: _numpy.int32, + CUDSS_CONFIG_MATCHING_ALG: _numpy.int32, CUDSS_CONFIG_SOLVE_MODE: _numpy.int32, CUDSS_CONFIG_IR_N_STEPS: _numpy.int32, CUDSS_CONFIG_IR_TOL: _numpy.float64, @@ -173,6 +206,14 @@ cdef dict config_param_sizes = { 
CUDSS_CONFIG_HOST_NTHREADS: _numpy.int32, CUDSS_CONFIG_HYBRID_EXECUTE_MODE: _numpy.int32, CUDSS_CONFIG_PIVOT_EPSILON_ALG: _numpy.int32, + CUDSS_CONFIG_ND_NLEVELS: _numpy.int32, + CUDSS_CONFIG_UBATCH_SIZE: _numpy.int32, + CUDSS_CONFIG_UBATCH_INDEX: _numpy.int32, + CUDSS_CONFIG_USE_SUPERPANELS: _numpy.int32, + CUDSS_CONFIG_DEVICE_COUNT: _numpy.int32, + CUDSS_CONFIG_DEVICE_INDICES: _numpy.intp, + CUDSS_CONFIG_SCHUR_MODE: _numpy.int32, + CUDSS_CONFIG_DETERMINISTIC_MODE: _numpy.int32, } cpdef get_config_param_dtype(int attr): @@ -195,15 +236,15 @@ cpdef get_config_param_dtype(int attr): cpdef config_set(intptr_t config, int param, intptr_t value, size_t size_in_bytes): """See `cudssConfigSet`.""" with nogil: - status = cudssConfigSet(config, <_ConfigParam>param, value, size_in_bytes) - check_status(status) + __status__ = cudssConfigSet(config, <_ConfigParam>param, value, size_in_bytes) + check_status(__status__) cpdef config_get(intptr_t config, int param, intptr_t value, size_t size_in_bytes, intptr_t size_written): """See `cudssConfigGet`.""" with nogil: - status = cudssConfigGet(config, <_ConfigParam>param, value, size_in_bytes, size_written) - check_status(status) + __status__ = cudssConfigGet(config, <_ConfigParam>param, value, size_in_bytes, size_written) + check_status(__status__) ######################### Python specific utility ######################### @@ -211,16 +252,12 @@ cpdef config_get(intptr_t config, int param, intptr_t value, size_t size_in_byte cdef dict data_param_sizes = { CUDSS_DATA_INFO: _numpy.int32, CUDSS_DATA_LU_NNZ: _numpy.int64, - CUDSS_DATA_NPIVOTS: _numpy.int32, - CUDSS_DATA_INERTIA: _numpy.int32, - CUDSS_DATA_PERM_REORDER_ROW: _numpy.int32, - CUDSS_DATA_PERM_REORDER_COL: _numpy.int32, - CUDSS_DATA_PERM_ROW: _numpy.int32, - CUDSS_DATA_PERM_COL: _numpy.int32, - CUDSS_DATA_USER_PERM: _numpy.int32, CUDSS_DATA_HYBRID_DEVICE_MEMORY_MIN: _numpy.int64, CUDSS_DATA_COMM: _numpy.intp, CUDSS_DATA_MEMORY_ESTIMATES: _numpy.int64, + CUDSS_DATA_SCHUR_SHAPE: _numpy.intp, + CUDSS_DATA_SCHUR_MATRIX: _numpy.intp, + CUDSS_DATA_USER_HOST_INTERRUPT: _numpy.int32, } cpdef get_data_param_dtype(int attr): @@ -243,36 +280,36 @@ cpdef get_data_param_dtype(int attr): cpdef data_set(intptr_t handle, intptr_t data, int param, intptr_t value, size_t size_in_bytes): """See `cudssDataSet`.""" with nogil: - status = cudssDataSet(handle, data, <_DataParam>param, value, size_in_bytes) - check_status(status) + __status__ = cudssDataSet(handle, data, <_DataParam>param, value, size_in_bytes) + check_status(__status__) cpdef data_get(intptr_t handle, intptr_t data, int param, intptr_t value, size_t size_in_bytes, intptr_t size_written): """See `cudssDataGet`.""" with nogil: - status = cudssDataGet(handle, data, <_DataParam>param, value, size_in_bytes, size_written) - check_status(status) + __status__ = cudssDataGet(handle, data, <_DataParam>param, value, size_in_bytes, size_written) + check_status(__status__) cpdef execute(intptr_t handle, int phase, intptr_t solver_config, intptr_t solver_data, intptr_t input_matrix, intptr_t solution, intptr_t rhs): """See `cudssExecute`.""" with nogil: - status = cudssExecute(handle, <_Phase>phase, solver_config, solver_data, input_matrix, solution, rhs) - check_status(status) + __status__ = cudssExecute(handle, phase, solver_config, solver_data, input_matrix, solution, rhs) + check_status(__status__) cpdef set_stream(intptr_t handle, intptr_t stream): """See `cudssSetStream`.""" with nogil: - status = cudssSetStream(handle, stream) - check_status(status) + __status__ 
= cudssSetStream(handle, stream) + check_status(__status__) cpdef set_comm_layer(intptr_t handle, intptr_t comm_lib_file_name): """See `cudssSetCommLayer`.""" with nogil: - status = cudssSetCommLayer(handle, comm_lib_file_name) - check_status(status) + __status__ = cudssSetCommLayer(handle, comm_lib_file_name) + check_status(__status__) cpdef set_threading_layer(intptr_t handle, thr_lib_file_name): @@ -282,64 +319,75 @@ cpdef set_threading_layer(intptr_t handle, thr_lib_file_name): cdef bytes _temp_thr_lib_file_name_ = (thr_lib_file_name).encode() cdef char* _thr_lib_file_name_ = _temp_thr_lib_file_name_ with nogil: - status = cudssSetThreadingLayer(handle, _thr_lib_file_name_) - check_status(status) + __status__ = cudssSetThreadingLayer(handle, _thr_lib_file_name_) + check_status(__status__) cpdef intptr_t config_create() except? 0: """See `cudssConfigCreate`.""" cdef Config solver_config with nogil: - status = cudssConfigCreate(&solver_config) - check_status(status) + __status__ = cudssConfigCreate(&solver_config) + check_status(__status__) return solver_config cpdef config_destroy(intptr_t solver_config): """See `cudssConfigDestroy`.""" with nogil: - status = cudssConfigDestroy(solver_config) - check_status(status) + __status__ = cudssConfigDestroy(solver_config) + check_status(__status__) cpdef intptr_t data_create(intptr_t handle) except? 0: """See `cudssDataCreate`.""" cdef Data solver_data with nogil: - status = cudssDataCreate(handle, &solver_data) - check_status(status) + __status__ = cudssDataCreate(handle, &solver_data) + check_status(__status__) return solver_data cpdef data_destroy(intptr_t handle, intptr_t solver_data): """See `cudssDataDestroy`.""" with nogil: - status = cudssDataDestroy(handle, solver_data) - check_status(status) + __status__ = cudssDataDestroy(handle, solver_data) + check_status(__status__) cpdef intptr_t create() except? 0: """See `cudssCreate`.""" cdef Handle handle with nogil: - status = cudssCreate(&handle) - check_status(status) + __status__ = cudssCreate(&handle) + check_status(__status__) return handle +cpdef intptr_t create_mg(int device_count, device_indices) except? 0: + """See `cudssCreateMg`.""" + cdef nullable_unique_ptr[ vector[int] ] _device_indices_ + get_resource_ptr[int](_device_indices_, device_indices, NULL) + cdef Handle handle_pt + with nogil: + __status__ = cudssCreateMg(&handle_pt, device_count, (_device_indices_.data())) + check_status(__status__) + return handle_pt + + cpdef destroy(intptr_t handle): """See `cudssDestroy`.""" with nogil: - status = cudssDestroy(handle) - check_status(status) + __status__ = cudssDestroy(handle) + check_status(__status__) cpdef int get_property(int property_type) except? 
-1: """See `cudssGetProperty`.""" cdef int value with nogil: - status = cudssGetProperty(property_type, &value) - check_status(status) + __status__ = cudssGetProperty(property_type, &value) + check_status(__status__) return value @@ -347,8 +395,8 @@ cpdef intptr_t matrix_create_dn(int64_t nrows, int64_t ncols, int64_t ld, intptr """See `cudssMatrixCreateDn`.""" cdef Matrix matrix with nogil: - status = cudssMatrixCreateDn(&matrix, nrows, ncols, ld, values, value_type, <_Layout>layout) - check_status(status) + __status__ = cudssMatrixCreateDn(&matrix, nrows, ncols, ld, values, value_type, <_Layout>layout) + check_status(__status__) return matrix @@ -356,8 +404,8 @@ cpdef intptr_t matrix_create_csr(int64_t nrows, int64_t ncols, int64_t nnz, intp """See `cudssMatrixCreateCsr`.""" cdef Matrix matrix with nogil: - status = cudssMatrixCreateCsr(&matrix, nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, <_MatrixType>mtype, <_MatrixViewType>mview, <_IndexBase>index_base) - check_status(status) + __status__ = cudssMatrixCreateCsr(&matrix, nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, <_MatrixType>mtype, <_MatrixViewType>mview, <_IndexBase>index_base) + check_status(__status__) return matrix @@ -365,8 +413,8 @@ cpdef intptr_t matrix_create_batch_dn(int64_t batch_count, intptr_t nrows, intpt """See `cudssMatrixCreateBatchDn`.""" cdef Matrix matrix with nogil: - status = cudssMatrixCreateBatchDn(&matrix, batch_count, nrows, ncols, ld, values, index_type, value_type, <_Layout>layout) - check_status(status) + __status__ = cudssMatrixCreateBatchDn(&matrix, batch_count, nrows, ncols, ld, values, index_type, value_type, <_Layout>layout) + check_status(__status__) return matrix @@ -374,112 +422,106 @@ cpdef intptr_t matrix_create_batch_csr(int64_t batch_count, intptr_t nrows, intp """See `cudssMatrixCreateBatchCsr`.""" cdef Matrix matrix with nogil: - status = cudssMatrixCreateBatchCsr(&matrix, batch_count, nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, <_MatrixType>mtype, <_MatrixViewType>mview, <_IndexBase>index_base) - check_status(status) + __status__ = cudssMatrixCreateBatchCsr(&matrix, batch_count, nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, <_MatrixType>mtype, <_MatrixViewType>mview, <_IndexBase>index_base) + check_status(__status__) return matrix cpdef matrix_destroy(intptr_t matrix): """See `cudssMatrixDestroy`.""" with nogil: - status = cudssMatrixDestroy(matrix) - check_status(status) + __status__ = cudssMatrixDestroy(matrix) + check_status(__status__) -cpdef tuple matrix_get_dn(intptr_t matrix): +cpdef matrix_get_dn(intptr_t matrix, intptr_t nrows, intptr_t ncols, intptr_t ld, intptr_t values, intptr_t type, intptr_t layout): """See `cudssMatrixGetDn`.""" - cdef int64_t nrows - cdef int64_t ncols - cdef int64_t ld - cdef void* values - cdef DataType type - cdef _Layout layout with nogil: - status = cudssMatrixGetDn(matrix, &nrows, &ncols, &ld, &values, &type, &layout) - check_status(status) - return (nrows, ncols, ld, values, type, layout) + __status__ = cudssMatrixGetDn(matrix, nrows, ncols, ld, values, type, <_Layout*>layout) + check_status(__status__) -cpdef tuple matrix_get_csr(intptr_t matrix): +cpdef matrix_get_csr(intptr_t matrix, intptr_t nrows, intptr_t ncols, intptr_t nnz, intptr_t row_start, intptr_t row_end, intptr_t col_indices, intptr_t values, intptr_t index_type, intptr_t value_type, intptr_t mtype, intptr_t mview, intptr_t 
index_base): """See `cudssMatrixGetCsr`.""" - cdef int64_t nrows - cdef int64_t ncols - cdef int64_t nnz - cdef void* row_start - cdef void* row_end - cdef void* col_indices - cdef void* values - cdef DataType index_type - cdef DataType value_type - cdef _MatrixType mtype - cdef _MatrixViewType mview - cdef _IndexBase index_base - with nogil: - status = cudssMatrixGetCsr(matrix, &nrows, &ncols, &nnz, &row_start, &row_end, &col_indices, &values, &index_type, &value_type, &mtype, &mview, &index_base) - check_status(status) - return (nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, mtype, mview, index_base) + with nogil: + __status__ = cudssMatrixGetCsr(matrix, nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, <_MatrixType*>mtype, <_MatrixViewType*>mview, <_IndexBase*>index_base) + check_status(__status__) cpdef matrix_set_values(intptr_t matrix, intptr_t values): """See `cudssMatrixSetValues`.""" with nogil: - status = cudssMatrixSetValues(matrix, values) - check_status(status) + __status__ = cudssMatrixSetValues(matrix, values) + check_status(__status__) cpdef matrix_set_csr_pointers(intptr_t matrix, intptr_t row_offsets, intptr_t row_end, intptr_t col_indices, intptr_t values): """See `cudssMatrixSetCsrPointers`.""" with nogil: - status = cudssMatrixSetCsrPointers(matrix, row_offsets, row_end, col_indices, values) - check_status(status) + __status__ = cudssMatrixSetCsrPointers(matrix, row_offsets, row_end, col_indices, values) + check_status(__status__) cpdef matrix_get_batch_dn(intptr_t matrix, intptr_t batch_count, intptr_t nrows, intptr_t ncols, intptr_t ld, intptr_t values, intptr_t index_type, intptr_t value_type, intptr_t layout): """See `cudssMatrixGetBatchDn`.""" with nogil: - status = cudssMatrixGetBatchDn(matrix, batch_count, nrows, ncols, ld, values, index_type, value_type, <_Layout*>layout) - check_status(status) + __status__ = cudssMatrixGetBatchDn(matrix, batch_count, nrows, ncols, ld, values, index_type, value_type, <_Layout*>layout) + check_status(__status__) cpdef matrix_get_batch_csr(intptr_t matrix, intptr_t batch_count, intptr_t nrows, intptr_t ncols, intptr_t nnz, intptr_t row_start, intptr_t row_end, intptr_t col_indices, intptr_t values, intptr_t index_type, intptr_t value_type, intptr_t mtype, intptr_t mview, intptr_t index_base): """See `cudssMatrixGetBatchCsr`.""" with nogil: - status = cudssMatrixGetBatchCsr(matrix, batch_count, nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, <_MatrixType*>mtype, <_MatrixViewType*>mview, <_IndexBase*>index_base) - check_status(status) + __status__ = cudssMatrixGetBatchCsr(matrix, batch_count, nrows, ncols, nnz, row_start, row_end, col_indices, values, index_type, value_type, <_MatrixType*>mtype, <_MatrixViewType*>mview, <_IndexBase*>index_base) + check_status(__status__) cpdef matrix_set_batch_values(intptr_t matrix, intptr_t values): """See `cudssMatrixSetBatchValues`.""" with nogil: - status = cudssMatrixSetBatchValues(matrix, values) - check_status(status) + __status__ = cudssMatrixSetBatchValues(matrix, values) + check_status(__status__) cpdef matrix_set_batch_csr_pointers(intptr_t matrix, intptr_t row_offsets, intptr_t row_end, intptr_t col_indices, intptr_t values): """See `cudssMatrixSetBatchCsrPointers`.""" with nogil: - status = cudssMatrixSetBatchCsrPointers(matrix, row_offsets, row_end, col_indices, values) - check_status(status) + __status__ = cudssMatrixSetBatchCsrPointers(matrix, row_offsets, row_end, 
col_indices, values) + check_status(__status__) cpdef int matrix_get_format(intptr_t matrix) except? -1: """See `cudssMatrixGetFormat`.""" cdef int format with nogil: - status = cudssMatrixGetFormat(matrix, &format) - check_status(status) + __status__ = cudssMatrixGetFormat(matrix, &format) + check_status(__status__) return format +cpdef matrix_set_distribution_row1d(intptr_t matrix, int64_t first_row, int64_t last_row): + """See `cudssMatrixSetDistributionRow1d`.""" + with nogil: + __status__ = cudssMatrixSetDistributionRow1d(matrix, first_row, last_row) + check_status(__status__) + + +cpdef matrix_get_distribution_row1d(intptr_t matrix, intptr_t first_row, intptr_t last_row): + """See `cudssMatrixGetDistributionRow1d`.""" + with nogil: + __status__ = cudssMatrixGetDistributionRow1d(matrix, first_row, last_row) + check_status(__status__) + + cpdef get_device_mem_handler(intptr_t handle, intptr_t handler): """See `cudssGetDeviceMemHandler`.""" with nogil: - status = cudssGetDeviceMemHandler(handle, handler) - check_status(status) + __status__ = cudssGetDeviceMemHandler(handle, handler) + check_status(__status__) cpdef set_device_mem_handler(intptr_t handle, intptr_t handler): """See `cudssSetDeviceMemHandler`.""" with nogil: - status = cudssSetDeviceMemHandler(handle, handler) - check_status(status) + __status__ = cudssSetDeviceMemHandler(handle, handler) + check_status(__status__) diff --git a/nvmath/bindings/cufft.pxd b/nvmath/bindings/cufft.pxd index 2f0c004..71b5735 100644 --- a/nvmath/bindings/cufft.pxd +++ b/nvmath/bindings/cufft.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.1. Do not modify it directly. from libc.stdint cimport intptr_t @@ -107,8 +107,12 @@ cpdef size_t xt_get_size_many(int plan, int rank, n, inembed, long long int istr cpdef xt_exec(int plan, intptr_t input, intptr_t output, int direction) cpdef xt_exec_descriptor(int plan, intptr_t input, intptr_t output, int direction) cpdef xt_set_work_area_policy(int plan, int policy, intptr_t work_size) -cpdef xt_set_jit_callback(int plan, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info) +cpdef _xt_set_jit_callback(int plan, intptr_t lto_callback_symbol_name, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info) cpdef xt_set_subformat_default(int plan, int subformat_forward, int subformat_inverse) cpdef set_plan_property_int64(int plan, int property, long long int input_value_int) cpdef long long int get_plan_property_int64(int plan, int property) except? 
-1 cpdef reset_plan_property(int plan, int property) +cpdef _xt_set_jit_callback_12_7(int plan, intptr_t lto_callback_symbol_name, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info) + +# wrapper calling the correct function based on the CTK version +cpdef xt_set_jit_callback(int plan, intptr_t lto_callback_symbol_name, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info) diff --git a/nvmath/bindings/cufft.pyi b/nvmath/bindings/cufft.pyi index c0985cc..07c318a 100644 --- a/nvmath/bindings/cufft.pyi +++ b/nvmath/bindings/cufft.pyi @@ -2,83 +2,82 @@ # # SPDX-License-Identifier: Apache-2.0 -import _cython_3_1_2 +import _cython_3_1_4 import enum from typing import Any, Callable, ClassVar __pyx_capi__: dict __test__: dict -check_status: _cython_3_1_2.cython_function_or_method -create: _cython_3_1_2.cython_function_or_method -destroy: _cython_3_1_2.cython_function_or_method -estimate1d: _cython_3_1_2.cython_function_or_method -estimate2d: _cython_3_1_2.cython_function_or_method -estimate3d: _cython_3_1_2.cython_function_or_method -estimate_many: _cython_3_1_2.cython_function_or_method -exec_c2c: _cython_3_1_2.cython_function_or_method -exec_c2r: _cython_3_1_2.cython_function_or_method -exec_d2z: _cython_3_1_2.cython_function_or_method -exec_r2c: _cython_3_1_2.cython_function_or_method -exec_z2d: _cython_3_1_2.cython_function_or_method -exec_z2z: _cython_3_1_2.cython_function_or_method -get_plan_property_int64: _cython_3_1_2.cython_function_or_method -get_property: _cython_3_1_2.cython_function_or_method -get_size: _cython_3_1_2.cython_function_or_method -get_size1d: _cython_3_1_2.cython_function_or_method -get_size2d: _cython_3_1_2.cython_function_or_method -get_size3d: _cython_3_1_2.cython_function_or_method -get_size_many: _cython_3_1_2.cython_function_or_method -get_size_many64: _cython_3_1_2.cython_function_or_method -get_version: _cython_3_1_2.cython_function_or_method -make_plan1d: _cython_3_1_2.cython_function_or_method -make_plan2d: _cython_3_1_2.cython_function_or_method -make_plan3d: _cython_3_1_2.cython_function_or_method -make_plan_many: _cython_3_1_2.cython_function_or_method -make_plan_many64: _cython_3_1_2.cython_function_or_method -plan1d: _cython_3_1_2.cython_function_or_method -plan2d: _cython_3_1_2.cython_function_or_method -plan3d: _cython_3_1_2.cython_function_or_method -plan_many: _cython_3_1_2.cython_function_or_method -reset_plan_property: _cython_3_1_2.cython_function_or_method -set_auto_allocation: _cython_3_1_2.cython_function_or_method -set_plan_property_int64: _cython_3_1_2.cython_function_or_method -set_stream: _cython_3_1_2.cython_function_or_method -set_work_area: _cython_3_1_2.cython_function_or_method -xt_clear_callback: _cython_3_1_2.cython_function_or_method -xt_exec: _cython_3_1_2.cython_function_or_method -xt_exec_descriptor: _cython_3_1_2.cython_function_or_method -xt_exec_descriptor_c2c: _cython_3_1_2.cython_function_or_method -xt_exec_descriptor_c2r: _cython_3_1_2.cython_function_or_method -xt_exec_descriptor_d2z: _cython_3_1_2.cython_function_or_method -xt_exec_descriptor_r2c: _cython_3_1_2.cython_function_or_method -xt_exec_descriptor_z2d: _cython_3_1_2.cython_function_or_method -xt_exec_descriptor_z2z: _cython_3_1_2.cython_function_or_method -xt_free: _cython_3_1_2.cython_function_or_method -xt_get_size_many: _cython_3_1_2.cython_function_or_method -xt_make_plan_many: _cython_3_1_2.cython_function_or_method -xt_malloc: _cython_3_1_2.cython_function_or_method -xt_memcpy: 
_cython_3_1_2.cython_function_or_method -xt_query_plan: _cython_3_1_2.cython_function_or_method -xt_set_callback_shared_size: _cython_3_1_2.cython_function_or_method -xt_set_gpus: _cython_3_1_2.cython_function_or_method -xt_set_jit_callback: _cython_3_1_2.cython_function_or_method -xt_set_subformat_default: _cython_3_1_2.cython_function_or_method -xt_set_work_area: _cython_3_1_2.cython_function_or_method -xt_set_work_area_policy: _cython_3_1_2.cython_function_or_method +check_status: _cython_3_1_4.cython_function_or_method +create: _cython_3_1_4.cython_function_or_method +destroy: _cython_3_1_4.cython_function_or_method +estimate1d: _cython_3_1_4.cython_function_or_method +estimate2d: _cython_3_1_4.cython_function_or_method +estimate3d: _cython_3_1_4.cython_function_or_method +estimate_many: _cython_3_1_4.cython_function_or_method +exec_c2c: _cython_3_1_4.cython_function_or_method +exec_c2r: _cython_3_1_4.cython_function_or_method +exec_d2z: _cython_3_1_4.cython_function_or_method +exec_r2c: _cython_3_1_4.cython_function_or_method +exec_z2d: _cython_3_1_4.cython_function_or_method +exec_z2z: _cython_3_1_4.cython_function_or_method +get_plan_property_int64: _cython_3_1_4.cython_function_or_method +get_property: _cython_3_1_4.cython_function_or_method +get_size: _cython_3_1_4.cython_function_or_method +get_size1d: _cython_3_1_4.cython_function_or_method +get_size2d: _cython_3_1_4.cython_function_or_method +get_size3d: _cython_3_1_4.cython_function_or_method +get_size_many: _cython_3_1_4.cython_function_or_method +get_size_many64: _cython_3_1_4.cython_function_or_method +get_version: _cython_3_1_4.cython_function_or_method +make_plan1d: _cython_3_1_4.cython_function_or_method +make_plan2d: _cython_3_1_4.cython_function_or_method +make_plan3d: _cython_3_1_4.cython_function_or_method +make_plan_many: _cython_3_1_4.cython_function_or_method +make_plan_many64: _cython_3_1_4.cython_function_or_method +plan1d: _cython_3_1_4.cython_function_or_method +plan2d: _cython_3_1_4.cython_function_or_method +plan3d: _cython_3_1_4.cython_function_or_method +plan_many: _cython_3_1_4.cython_function_or_method +reset_plan_property: _cython_3_1_4.cython_function_or_method +set_auto_allocation: _cython_3_1_4.cython_function_or_method +set_plan_property_int64: _cython_3_1_4.cython_function_or_method +set_stream: _cython_3_1_4.cython_function_or_method +set_work_area: _cython_3_1_4.cython_function_or_method +xt_clear_callback: _cython_3_1_4.cython_function_or_method +xt_exec: _cython_3_1_4.cython_function_or_method +xt_exec_descriptor: _cython_3_1_4.cython_function_or_method +xt_exec_descriptor_c2c: _cython_3_1_4.cython_function_or_method +xt_exec_descriptor_c2r: _cython_3_1_4.cython_function_or_method +xt_exec_descriptor_d2z: _cython_3_1_4.cython_function_or_method +xt_exec_descriptor_r2c: _cython_3_1_4.cython_function_or_method +xt_exec_descriptor_z2d: _cython_3_1_4.cython_function_or_method +xt_exec_descriptor_z2z: _cython_3_1_4.cython_function_or_method +xt_free: _cython_3_1_4.cython_function_or_method +xt_get_size_many: _cython_3_1_4.cython_function_or_method +xt_make_plan_many: _cython_3_1_4.cython_function_or_method +xt_malloc: _cython_3_1_4.cython_function_or_method +xt_memcpy: _cython_3_1_4.cython_function_or_method +xt_query_plan: _cython_3_1_4.cython_function_or_method +xt_set_callback_shared_size: _cython_3_1_4.cython_function_or_method +xt_set_gpus: _cython_3_1_4.cython_function_or_method +xt_set_jit_callback: _cython_3_1_4.cython_function_or_method +xt_set_subformat_default: 
_cython_3_1_4.cython_function_or_method +xt_set_work_area: _cython_3_1_4.cython_function_or_method +xt_set_work_area_policy: _cython_3_1_4.cython_function_or_method class Compatibility(enum.IntEnum): + """See `cufftCompatibility`.""" __new__: ClassVar[Callable] = ... FFTW_PADDING: ClassVar[Compatibility] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class LibFormat(enum.IntEnum): + """See `libFormat_t`.""" __new__: ClassVar[Callable] = ... CUFFT: ClassVar[LibFormat] = ... UNDEFINED: ClassVar[LibFormat] = ... @@ -86,12 +85,10 @@ class LibFormat(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class Property(enum.IntEnum): + """See `cufftProperty`.""" __new__: ClassVar[Callable] = ... MAX_NUM_HOST_THREADS: ClassVar[Property] = ... PATIENT_JIT: ClassVar[Property] = ... @@ -99,12 +96,10 @@ class Property(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class Result(enum.IntEnum): + """See `cufftResult`.""" __new__: ClassVar[Callable] = ... ALLOC_FAILED: ClassVar[Result] = ... EXEC_FAILED: ClassVar[Result] = ... @@ -116,9 +111,13 @@ class Result(enum.IntEnum): INVALID_TYPE: ClassVar[Result] = ... INVALID_VALUE: ClassVar[Result] = ... LICENSE_ERROR: ClassVar[Result] = ... + MISSING_DEPENDENCY: ClassVar[Result] = ... NOT_IMPLEMENTED: ClassVar[Result] = ... NOT_SUPPORTED: ClassVar[Result] = ... NO_WORKSPACE: ClassVar[Result] = ... + NVJITLINK_FAILURE: ClassVar[Result] = ... + NVRTC_FAILURE: ClassVar[Result] = ... + NVSHMEM_FAILURE: ClassVar[Result] = ... PARSE_ERROR: ClassVar[Result] = ... SETUP_FAILED: ClassVar[Result] = ... SUCCESS: ClassVar[Result] = ... @@ -127,12 +126,10 @@ class Result(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class Type(enum.IntEnum): + """See `cufftType`.""" __new__: ClassVar[Callable] = ... C2C: ClassVar[Type] = ... C2R: ClassVar[Type] = ... @@ -144,12 +141,10 @@ class Type(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class XtCallbackType(enum.IntEnum): + """See `cufftXtCallbackType`.""" __new__: ClassVar[Callable] = ... LD_COMPLEX: ClassVar[XtCallbackType] = ... LD_COMPLEX_DOUBLE: ClassVar[XtCallbackType] = ... @@ -164,12 +159,10 @@ class XtCallbackType(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... 
- _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class XtCopyType(enum.IntEnum): + """See `cufftXtCopyType`.""" __new__: ClassVar[Callable] = ... DEVICE_TO_DEVICE: ClassVar[XtCopyType] = ... DEVICE_TO_HOST: ClassVar[XtCopyType] = ... @@ -179,12 +172,10 @@ class XtCopyType(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class XtQueryType(enum.IntEnum): + """See `cufftXtQueryType`.""" __new__: ClassVar[Callable] = ... QUERY_1D_FACTORS: ClassVar[XtQueryType] = ... QUERY_UNDEFINED: ClassVar[XtQueryType] = ... @@ -192,12 +183,10 @@ class XtQueryType(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class XtSubFormat(enum.IntEnum): + """See `cufftXtSubFormat`.""" __new__: ClassVar[Callable] = ... FORMAT_1D_INPUT_SHUFFLED: ClassVar[XtSubFormat] = ... FORMAT_DISTRIBUTED_INPUT: ClassVar[XtSubFormat] = ... @@ -211,12 +200,10 @@ class XtSubFormat(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class XtWorkAreaPolicy(enum.IntEnum): + """See `cufftXtWorkAreaPolicy`.""" __new__: ClassVar[Callable] = ... MINIMAL: ClassVar[XtWorkAreaPolicy] = ... PERFORMANCE: ClassVar[XtWorkAreaPolicy] = ... @@ -225,11 +212,10 @@ class XtWorkAreaPolicy(enum.IntEnum): _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - def __format__(self, *args, **kwargs) -> str: ... class cuFFTError(Exception): - def __init__(self, status) -> Any: ... - def __reduce__(self) -> Any: ... + def __init__(self, status) -> Any: + """cuFFTError.__init__(self, status)""" + def __reduce__(self) -> Any: + """cuFFTError.__reduce__(self)""" diff --git a/nvmath/bindings/cufft.pyx b/nvmath/bindings/cufft.pyx index 16d9f38..dadd9da 100644 --- a/nvmath/bindings/cufft.pyx +++ b/nvmath/bindings/cufft.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.1. Do not modify it directly. 
 cimport cython # NOQA
 from libc.stdint cimport int64_t
@@ -35,13 +35,17 @@ class Result(_IntEnum):
     SETUP_FAILED = CUFFT_SETUP_FAILED
     INVALID_SIZE = CUFFT_INVALID_SIZE
     UNALIGNED_DATA = CUFFT_UNALIGNED_DATA
-    INCOMPLETE_PARAMETER_LIST = CUFFT_INCOMPLETE_PARAMETER_LIST
     INVALID_DEVICE = CUFFT_INVALID_DEVICE
-    PARSE_ERROR = CUFFT_PARSE_ERROR
     NO_WORKSPACE = CUFFT_NO_WORKSPACE
     NOT_IMPLEMENTED = CUFFT_NOT_IMPLEMENTED
-    LICENSE_ERROR = CUFFT_LICENSE_ERROR
     NOT_SUPPORTED = CUFFT_NOT_SUPPORTED
+    MISSING_DEPENDENCY = CUFFT_MISSING_DEPENDENCY
+    NVRTC_FAILURE = CUFFT_NVRTC_FAILURE
+    NVJITLINK_FAILURE = CUFFT_NVJITLINK_FAILURE
+    NVSHMEM_FAILURE = CUFFT_NVSHMEM_FAILURE
+    INCOMPLETE_PARAMETER_LIST = _CUFFTRESULT_INTERNAL_LOADING_ERROR
+    PARSE_ERROR = CUFFT_INCOMPLETE_PARAMETER_LIST
+    LICENSE_ERROR = CUFFT_PARSE_ERROR

 class Type(_IntEnum):
     """See `cufftType`."""
@@ -145,6 +149,8 @@ cpdef inline check_status(int status):
         raise cuFFTError(status)


+cdef int _cufft_version = 0
+
 ###############################################################################
 # Wrapper functions
 ###############################################################################
@@ -592,13 +598,13 @@ cpdef xt_set_work_area_policy(int plan, int policy, intptr_t work_size):
     check_status(status)


-cpdef xt_set_jit_callback(int plan, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info):
+cpdef _xt_set_jit_callback(int plan, intptr_t lto_callback_symbol_name, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info):
     """See `cufftXtSetJITCallback`."""
     cdef void* _lto_callback_fatbin_ = get_buffer_pointer(lto_callback_fatbin, lto_callback_fatbin_size, readonly=True)
     cdef nullable_unique_ptr[ vector[void*] ] _caller_info_
     get_resource_ptrs[void](_caller_info_, caller_info, NULL)
     with nogil:
-        status = cufftXtSetJITCallback(plan, _lto_callback_fatbin_, lto_callback_fatbin_size, <_XtCallbackType>type, (_caller_info_.data()))
+        status = cufftXtSetJITCallback(plan, lto_callback_symbol_name, _lto_callback_fatbin_, lto_callback_fatbin_size, <_XtCallbackType>type, (_caller_info_.data()))
     check_status(status)


@@ -630,3 +636,26 @@ cpdef reset_plan_property(int plan, int property):
     with nogil:
         status = cufftResetPlanProperty(plan, <_Property>property)
     check_status(status)
+
+
+cpdef _xt_set_jit_callback_12_7(int plan, intptr_t lto_callback_symbol_name, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info):
+    """See `__cufftXtSetJITCallback_12_7`."""
+    cdef void* _lto_callback_fatbin_ = get_buffer_pointer(lto_callback_fatbin, lto_callback_fatbin_size, readonly=True)
+    cdef nullable_unique_ptr[ vector[void*] ] _caller_info_
+    get_resource_ptrs[void](_caller_info_, caller_info, NULL)
+    with nogil:
+        status = __cufftXtSetJITCallback_12_7(plan, lto_callback_symbol_name, _lto_callback_fatbin_, lto_callback_fatbin_size, <_XtCallbackType>type, (_caller_info_.data()))
+    check_status(status)
+
+cpdef xt_set_jit_callback(int plan, intptr_t lto_callback_symbol_name, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info):
+    """
+    Signature of `cufftXtSetJITCallback` changed with CTK 13.
+    This wrapper makes sure to use the __cufftXtSetJITCallback_12_7 for older CTK.
+ """ + global _cufft_version + if _cufft_version == 0: + _cufft_version = get_version() + if _cufft_version < 12000: # CTK 13 is shipped with cuFFT 12.0 + _xt_set_jit_callback_12_7(plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) + else: + _xt_set_jit_callback(plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) diff --git a/nvmath/bindings/cutensor.pxd b/nvmath/bindings/cutensor.pxd new file mode 100644 index 0000000..4b7783e --- /dev/null +++ b/nvmath/bindings/cutensor.pxd @@ -0,0 +1,97 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 2.3.1. Do not modify it directly. + +cimport cython + +from libc.stdint cimport intptr_t + +from .cycutensor cimport * + + +############################################################################### +# Types +############################################################################### + +ctypedef cutensorComputeDescriptor_t ComputeDescriptor +ctypedef cutensorOperationDescriptor_t OperationDescriptor +ctypedef cutensorPlan_t Plan +ctypedef cutensorPlanPreference_t PlanPreference +ctypedef cutensorHandle_t Handle +ctypedef cutensorTensorDescriptor_t TensorDescriptor +ctypedef cutensorBlockSparseTensorDescriptor_t BlockSparseTensorDescriptor +ctypedef cutensorLoggerCallback_t LoggerCallback + +ctypedef cudaStream_t Stream +ctypedef cudaDataType DataType +ctypedef libraryPropertyType_t LibraryPropertyType + + +############################################################################### +# Enum +############################################################################### + +ctypedef cutensorOperator_t _Operator +ctypedef cutensorStatus_t _Status +ctypedef cutensorAlgo_t _Algo +ctypedef cutensorWorksizePreference_t _WorksizePreference +ctypedef cutensorOperationDescriptorAttribute_t _OperationDescriptorAttribute +ctypedef cutensorPlanPreferenceAttribute_t _PlanPreferenceAttribute +ctypedef cutensorAutotuneMode_t _AutotuneMode +ctypedef cutensorJitMode_t _JitMode +ctypedef cutensorCacheMode_t _CacheMode +ctypedef cutensorPlanAttribute_t _PlanAttribute + + +############################################################################### +# Functions +############################################################################### + +cpdef intptr_t create() except? 0 +cpdef destroy(intptr_t handle) +cpdef handle_resize_plan_cache(intptr_t handle, uint32_t num_entries) +cpdef handle_write_plan_cache_to_file(intptr_t handle, filename) +cpdef uint32_t handle_read_plan_cache_from_file(intptr_t handle, filename) except? -1 +cpdef write_kernel_cache_to_file(intptr_t handle, filename) +cpdef read_kernel_cache_from_file(intptr_t handle, filename) +cpdef intptr_t create_tensor_descriptor(intptr_t handle, uint32_t num_modes, extent, stride, int data_type, uint32_t alignment_requirement) except? 0 +cpdef destroy_tensor_descriptor(intptr_t desc) +cpdef intptr_t create_elementwise_trinary(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_ab, int op_abc, intptr_t desc_compute) except? 
0 +cpdef elementwise_trinary_execute(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t beta, intptr_t b, intptr_t gamma, intptr_t c, intptr_t d, intptr_t stream) +cpdef intptr_t create_elementwise_binary(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_ac, intptr_t desc_compute) except? 0 +cpdef elementwise_binary_execute(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t gamma, intptr_t c, intptr_t d, intptr_t stream) +cpdef intptr_t create_permutation(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, intptr_t desc_compute) except? 0 +cpdef permute(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t b, intptr_t stream) +cpdef intptr_t create_contraction(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, intptr_t desc_compute) except? 0 +cpdef destroy_operation_descriptor(intptr_t desc) +cpdef get_operation_descriptor_attribute_dtype(int attr) +cpdef operation_descriptor_set_attribute(intptr_t handle, intptr_t desc, int attr, intptr_t buf, size_t size_in_bytes) +cpdef operation_descriptor_get_attribute(intptr_t handle, intptr_t desc, int attr, intptr_t buf, size_t size_in_bytes) +cpdef intptr_t create_plan_preference(intptr_t handle, int algo, int jit_mode) except? 0 +cpdef destroy_plan_preference(intptr_t pref) +cpdef get_plan_preference_attribute_dtype(int attr) +cpdef plan_preference_set_attribute(intptr_t handle, intptr_t pref, int attr, intptr_t buf, size_t size_in_bytes) +cpdef get_plan_attribute_dtype(int attr) +cpdef plan_get_attribute(intptr_t handle, intptr_t plan, int attr, intptr_t buf, size_t size_in_bytes) +cpdef uint64_t estimate_workspace_size(intptr_t handle, intptr_t desc, intptr_t plan_pref, int workspace_pref) except? -1 +cpdef intptr_t create_plan(intptr_t handle, intptr_t desc, intptr_t pref, uint64_t workspace_size_limit) except? 0 +cpdef destroy_plan(intptr_t plan) +cpdef contract(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t b, intptr_t beta, intptr_t c, intptr_t d, intptr_t workspace, uint64_t workspace_size, intptr_t stream) +cpdef intptr_t create_reduction(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_reduce, intptr_t desc_compute) except? 0 +cpdef reduce(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t beta, intptr_t c, intptr_t d, intptr_t workspace, uint64_t workspace_size, intptr_t stream) +cpdef intptr_t create_contraction_trinary(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_d, intptr_t desc_e, mode_e, intptr_t desc_compute) except? 0 +cpdef contract_trinary(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t b, intptr_t c, intptr_t beta, intptr_t d, intptr_t e, intptr_t workspace, uint64_t workspace_size, intptr_t stream) +cpdef intptr_t create_block_sparse_tensor_descriptor(intptr_t handle, uint32_t num_modes, uint64_t num_non_zero_blocks, num_sections_per_mode, extent, non_zero_coordinates, stride, int data_type) except? 
0 +cpdef destroy_block_sparse_tensor_descriptor(intptr_t desc) +cpdef intptr_t create_block_sparse_contraction(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, intptr_t desc_compute) except? 0 +cpdef block_sparse_contract(intptr_t handle, intptr_t plan, intptr_t alpha, a, b, intptr_t beta, c, d, intptr_t workspace, uint64_t workspace_size, intptr_t stream) +cpdef str get_error_string(int error) +cpdef size_t get_version() except? 0 +cpdef size_t get_cudart_version() except? 0 +cpdef logger_set_file(intptr_t file) +cpdef logger_open_file(log_file) +cpdef logger_set_level(int32_t level) +cpdef logger_set_mask(int32_t mask) +cpdef logger_force_disable() diff --git a/nvmath/bindings/cutensor.pyi b/nvmath/bindings/cutensor.pyi new file mode 100644 index 0000000..d1030ba --- /dev/null +++ b/nvmath/bindings/cutensor.pyi @@ -0,0 +1,239 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_4 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +block_sparse_contract: _cython_3_1_4.cython_function_or_method +check_status: _cython_3_1_4.cython_function_or_method +contract: _cython_3_1_4.cython_function_or_method +contract_trinary: _cython_3_1_4.cython_function_or_method +create: _cython_3_1_4.cython_function_or_method +create_block_sparse_contraction: _cython_3_1_4.cython_function_or_method +create_block_sparse_tensor_descriptor: _cython_3_1_4.cython_function_or_method +create_contraction: _cython_3_1_4.cython_function_or_method +create_contraction_trinary: _cython_3_1_4.cython_function_or_method +create_elementwise_binary: _cython_3_1_4.cython_function_or_method +create_elementwise_trinary: _cython_3_1_4.cython_function_or_method +create_permutation: _cython_3_1_4.cython_function_or_method +create_plan: _cython_3_1_4.cython_function_or_method +create_plan_preference: _cython_3_1_4.cython_function_or_method +create_reduction: _cython_3_1_4.cython_function_or_method +create_tensor_descriptor: _cython_3_1_4.cython_function_or_method +destroy: _cython_3_1_4.cython_function_or_method +destroy_block_sparse_tensor_descriptor: _cython_3_1_4.cython_function_or_method +destroy_operation_descriptor: _cython_3_1_4.cython_function_or_method +destroy_plan: _cython_3_1_4.cython_function_or_method +destroy_plan_preference: _cython_3_1_4.cython_function_or_method +destroy_tensor_descriptor: _cython_3_1_4.cython_function_or_method +elementwise_binary_execute: _cython_3_1_4.cython_function_or_method +elementwise_trinary_execute: _cython_3_1_4.cython_function_or_method +estimate_workspace_size: _cython_3_1_4.cython_function_or_method +get_cudart_version: _cython_3_1_4.cython_function_or_method +get_error_string: _cython_3_1_4.cython_function_or_method +get_operation_descriptor_attribute_dtype: _cython_3_1_4.cython_function_or_method +get_plan_attribute_dtype: _cython_3_1_4.cython_function_or_method +get_plan_preference_attribute_dtype: _cython_3_1_4.cython_function_or_method +get_version: _cython_3_1_4.cython_function_or_method +handle_read_plan_cache_from_file: _cython_3_1_4.cython_function_or_method +handle_resize_plan_cache: _cython_3_1_4.cython_function_or_method +handle_write_plan_cache_to_file: _cython_3_1_4.cython_function_or_method +logger_force_disable: _cython_3_1_4.cython_function_or_method +logger_open_file: _cython_3_1_4.cython_function_or_method +logger_set_file: 
_cython_3_1_4.cython_function_or_method +logger_set_level: _cython_3_1_4.cython_function_or_method +logger_set_mask: _cython_3_1_4.cython_function_or_method +operation_descriptor_get_attribute: _cython_3_1_4.cython_function_or_method +operation_descriptor_set_attribute: _cython_3_1_4.cython_function_or_method +permute: _cython_3_1_4.cython_function_or_method +plan_get_attribute: _cython_3_1_4.cython_function_or_method +plan_preference_set_attribute: _cython_3_1_4.cython_function_or_method +read_kernel_cache_from_file: _cython_3_1_4.cython_function_or_method +reduce: _cython_3_1_4.cython_function_or_method +write_kernel_cache_to_file: _cython_3_1_4.cython_function_or_method + +class Algo(enum.IntEnum): + """See `cutensorAlgo_t`.""" + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[Algo] = ... + DEFAULT_PATIENT: ClassVar[Algo] = ... + GETT: ClassVar[Algo] = ... + TGETT: ClassVar[Algo] = ... + TTGT: ClassVar[Algo] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class AutotuneMode(enum.IntEnum): + """See `cutensorAutotuneMode_t`.""" + __new__: ClassVar[Callable] = ... + INCREMENTAL: ClassVar[AutotuneMode] = ... + NONE: ClassVar[AutotuneMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class CacheMode(enum.IntEnum): + """See `cutensorCacheMode_t`.""" + __new__: ClassVar[Callable] = ... + NONE: ClassVar[CacheMode] = ... + PEDANTIC: ClassVar[CacheMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class ComputeDesc: + """See `cutensorComputeDescriptor_t`.""" + COMPUTE_16BF: ClassVar[method] = ... + COMPUTE_16F: ClassVar[method] = ... + COMPUTE_32F: ClassVar[method] = ... + COMPUTE_3XTF32: ClassVar[method] = ... + COMPUTE_64F: ClassVar[method] = ... + COMPUTE_TF32: ClassVar[method] = ... + +class JitMode(enum.IntEnum): + """See `cutensorJitMode_t`.""" + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[JitMode] = ... + NONE: ClassVar[JitMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class OperationDescriptorAttribute(enum.IntEnum): + """See `cutensorOperationDescriptorAttribute_t`.""" + __new__: ClassVar[Callable] = ... + FLOPS: ClassVar[OperationDescriptorAttribute] = ... + MOVED_BYTES: ClassVar[OperationDescriptorAttribute] = ... + PADDING_LEFT: ClassVar[OperationDescriptorAttribute] = ... + PADDING_RIGHT: ClassVar[OperationDescriptorAttribute] = ... + PADDING_VALUE: ClassVar[OperationDescriptorAttribute] = ... + SCALAR_TYPE: ClassVar[OperationDescriptorAttribute] = ... + TAG: ClassVar[OperationDescriptorAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class Operator(enum.IntEnum): + """See `cutensorOperator_t`.""" + __new__: ClassVar[Callable] = ... + OP_ABS: ClassVar[Operator] = ... 
+ OP_ACOS: ClassVar[Operator] = ... + OP_ACOSH: ClassVar[Operator] = ... + OP_ADD: ClassVar[Operator] = ... + OP_ASIN: ClassVar[Operator] = ... + OP_ASINH: ClassVar[Operator] = ... + OP_ATAN: ClassVar[Operator] = ... + OP_ATANH: ClassVar[Operator] = ... + OP_CEIL: ClassVar[Operator] = ... + OP_CONJ: ClassVar[Operator] = ... + OP_COS: ClassVar[Operator] = ... + OP_COSH: ClassVar[Operator] = ... + OP_EXP: ClassVar[Operator] = ... + OP_FLOOR: ClassVar[Operator] = ... + OP_IDENTITY: ClassVar[Operator] = ... + OP_LOG: ClassVar[Operator] = ... + OP_MAX: ClassVar[Operator] = ... + OP_MIN: ClassVar[Operator] = ... + OP_MISH: ClassVar[Operator] = ... + OP_MUL: ClassVar[Operator] = ... + OP_NEG: ClassVar[Operator] = ... + OP_RCP: ClassVar[Operator] = ... + OP_RELU: ClassVar[Operator] = ... + OP_SIGMOID: ClassVar[Operator] = ... + OP_SIN: ClassVar[Operator] = ... + OP_SINH: ClassVar[Operator] = ... + OP_SOFT_PLUS: ClassVar[Operator] = ... + OP_SOFT_SIGN: ClassVar[Operator] = ... + OP_SQRT: ClassVar[Operator] = ... + OP_SWISH: ClassVar[Operator] = ... + OP_TAN: ClassVar[Operator] = ... + OP_TANH: ClassVar[Operator] = ... + OP_UNKNOWN: ClassVar[Operator] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class PlanAttribute(enum.IntEnum): + """See `cutensorPlanAttribute_t`.""" + __new__: ClassVar[Callable] = ... + REQUIRED_WORKSPACE: ClassVar[PlanAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class PlanPreferenceAttribute(enum.IntEnum): + """See `cutensorPlanPreferenceAttribute_t`.""" + __new__: ClassVar[Callable] = ... + ALGO: ClassVar[PlanPreferenceAttribute] = ... + AUTOTUNE_MODE: ClassVar[PlanPreferenceAttribute] = ... + CACHE_MODE: ClassVar[PlanPreferenceAttribute] = ... + INCREMENTAL_COUNT: ClassVar[PlanPreferenceAttribute] = ... + JIT: ClassVar[PlanPreferenceAttribute] = ... + KERNEL_RANK: ClassVar[PlanPreferenceAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class Status(enum.IntEnum): + """See `cutensorStatus_t`.""" + __new__: ClassVar[Callable] = ... + ALLOC_FAILED: ClassVar[Status] = ... + ARCH_MISMATCH: ClassVar[Status] = ... + CUBLAS_ERROR: ClassVar[Status] = ... + CUDA_ERROR: ClassVar[Status] = ... + EXECUTION_FAILED: ClassVar[Status] = ... + INSUFFICIENT_DRIVER: ClassVar[Status] = ... + INSUFFICIENT_WORKSPACE: ClassVar[Status] = ... + INTERNAL_ERROR: ClassVar[Status] = ... + INVALID_VALUE: ClassVar[Status] = ... + IO_ERROR: ClassVar[Status] = ... + LICENSE_ERROR: ClassVar[Status] = ... + MAPPING_ERROR: ClassVar[Status] = ... + NOT_INITIALIZED: ClassVar[Status] = ... + NOT_SUPPORTED: ClassVar[Status] = ... + SUCCESS: ClassVar[Status] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class WorksizePreference(enum.IntEnum): + """See `cutensorWorksizePreference_t`.""" + __new__: ClassVar[Callable] = ... + WORKSPACE_DEFAULT: ClassVar[WorksizePreference] = ... 
+    WORKSPACE_MAX: ClassVar[WorksizePreference] = ...
+    WORKSPACE_MIN: ClassVar[WorksizePreference] = ...
+    _generate_next_value_: ClassVar[Callable] = ...
+    _member_map_: ClassVar[dict] = ...
+    _member_names_: ClassVar[list] = ...
+    _member_type_: ClassVar[type[int]] = ...
+    _value2member_map_: ClassVar[dict] = ...
+
+class cuTENSORError(Exception):
+    """cuTENSORError(status)"""
+    def __init__(self, status) -> Any:
+        """Initialize self. See help(type(self)) for accurate signature."""
+    def __reduce__(self) -> Any:
+        """cuTENSORError.__reduce__(self)"""
diff --git a/nvmath/bindings/cutensor.pyx b/nvmath/bindings/cutensor.pyx
new file mode 100644
index 0000000..a4c9894
--- /dev/null
+++ b/nvmath/bindings/cutensor.pyx
@@ -0,0 +1,1336 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated with version 2.3.1. Do not modify it directly.
+
+cimport cython # NOQA
+cimport cpython
+from libcpp.vector cimport vector
+
+from ._internal.utils cimport get_resource_ptr, get_resource_ptrs, nullable_unique_ptr
+
+from enum import IntEnum as _IntEnum
+
+import ctypes
+import threading
+import numpy as _numpy
+
+
+cdef object __symbol_lock = threading.Lock()
+_COMPUTE_DESC_INIT = False
+_COMPUTE_DESC_16F = None
+_COMPUTE_DESC_16BF = None
+_COMPUTE_DESC_TF32 = None
+_COMPUTE_DESC_3XTF32 = None
+_COMPUTE_DESC_32F = None
+_COMPUTE_DESC_64F = None
+
+def _load_cutensor_compute_descriptors():
+    global _COMPUTE_DESC_INIT
+    if _COMPUTE_DESC_INIT:
+        return
+
+    with __symbol_lock:
+        try:
+            lib = ctypes.CDLL("libcutensor.so.2")
+            global _COMPUTE_DESC_16F, _COMPUTE_DESC_16BF, _COMPUTE_DESC_TF32, _COMPUTE_DESC_3XTF32, _COMPUTE_DESC_32F, _COMPUTE_DESC_64F
+            _COMPUTE_DESC_16F = ctypes.c_void_p.in_dll(lib, "CUTENSOR_COMPUTE_DESC_16F").value
+            _COMPUTE_DESC_16BF = ctypes.c_void_p.in_dll(lib, "CUTENSOR_COMPUTE_DESC_16BF").value
+            _COMPUTE_DESC_TF32 = ctypes.c_void_p.in_dll(lib, "CUTENSOR_COMPUTE_DESC_TF32").value
+            _COMPUTE_DESC_3XTF32 = ctypes.c_void_p.in_dll(lib, "CUTENSOR_COMPUTE_DESC_3XTF32").value
+            _COMPUTE_DESC_32F = ctypes.c_void_p.in_dll(lib, "CUTENSOR_COMPUTE_DESC_32F").value
+            _COMPUTE_DESC_64F = ctypes.c_void_p.in_dll(lib, "CUTENSOR_COMPUTE_DESC_64F").value
+            _COMPUTE_DESC_INIT = True
+        except:
+            raise ImportError("Failed to load cutensor library")
+
+
+class ComputeDesc:
+    """See `cutensorComputeDescriptor_t`."""
+
+    @classmethod
+    def COMPUTE_16F(cls):
+        _load_cutensor_compute_descriptors()
+        return _COMPUTE_DESC_16F
+
+    @classmethod
+    def COMPUTE_16BF(cls):
+        _load_cutensor_compute_descriptors()
+        return _COMPUTE_DESC_16BF
+
+    @classmethod
+    def COMPUTE_TF32(cls):
+        _load_cutensor_compute_descriptors()
+        return _COMPUTE_DESC_TF32
+
+    @classmethod
+    def COMPUTE_3XTF32(cls):
+        _load_cutensor_compute_descriptors()
+        return _COMPUTE_DESC_3XTF32
+
+    @classmethod
+    def COMPUTE_32F(cls):
+        _load_cutensor_compute_descriptors()
+        return _COMPUTE_DESC_32F
+
+    @classmethod
+    def COMPUTE_64F(cls):
+        _load_cutensor_compute_descriptors()
+        return _COMPUTE_DESC_64F
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+class Operator(_IntEnum):
+    """See `cutensorOperator_t`."""
+    OP_IDENTITY = CUTENSOR_OP_IDENTITY
+    OP_SQRT = CUTENSOR_OP_SQRT
+    OP_RELU = CUTENSOR_OP_RELU
+    OP_CONJ = CUTENSOR_OP_CONJ
+    OP_RCP = CUTENSOR_OP_RCP
+    OP_SIGMOID = CUTENSOR_OP_SIGMOID
+    OP_TANH = CUTENSOR_OP_TANH
+
OP_EXP = CUTENSOR_OP_EXP + OP_LOG = CUTENSOR_OP_LOG + OP_ABS = CUTENSOR_OP_ABS + OP_NEG = CUTENSOR_OP_NEG + OP_SIN = CUTENSOR_OP_SIN + OP_COS = CUTENSOR_OP_COS + OP_TAN = CUTENSOR_OP_TAN + OP_SINH = CUTENSOR_OP_SINH + OP_COSH = CUTENSOR_OP_COSH + OP_ASIN = CUTENSOR_OP_ASIN + OP_ACOS = CUTENSOR_OP_ACOS + OP_ATAN = CUTENSOR_OP_ATAN + OP_ASINH = CUTENSOR_OP_ASINH + OP_ACOSH = CUTENSOR_OP_ACOSH + OP_ATANH = CUTENSOR_OP_ATANH + OP_CEIL = CUTENSOR_OP_CEIL + OP_FLOOR = CUTENSOR_OP_FLOOR + OP_MISH = CUTENSOR_OP_MISH + OP_SWISH = CUTENSOR_OP_SWISH + OP_SOFT_PLUS = CUTENSOR_OP_SOFT_PLUS + OP_SOFT_SIGN = CUTENSOR_OP_SOFT_SIGN + OP_ADD = CUTENSOR_OP_ADD + OP_MUL = CUTENSOR_OP_MUL + OP_MAX = CUTENSOR_OP_MAX + OP_MIN = CUTENSOR_OP_MIN + OP_UNKNOWN = CUTENSOR_OP_UNKNOWN + +class Status(_IntEnum): + """See `cutensorStatus_t`.""" + SUCCESS = CUTENSOR_STATUS_SUCCESS + NOT_INITIALIZED = CUTENSOR_STATUS_NOT_INITIALIZED + ALLOC_FAILED = CUTENSOR_STATUS_ALLOC_FAILED + INVALID_VALUE = CUTENSOR_STATUS_INVALID_VALUE + ARCH_MISMATCH = CUTENSOR_STATUS_ARCH_MISMATCH + MAPPING_ERROR = CUTENSOR_STATUS_MAPPING_ERROR + EXECUTION_FAILED = CUTENSOR_STATUS_EXECUTION_FAILED + INTERNAL_ERROR = CUTENSOR_STATUS_INTERNAL_ERROR + NOT_SUPPORTED = CUTENSOR_STATUS_NOT_SUPPORTED + LICENSE_ERROR = CUTENSOR_STATUS_LICENSE_ERROR + CUBLAS_ERROR = CUTENSOR_STATUS_CUBLAS_ERROR + CUDA_ERROR = CUTENSOR_STATUS_CUDA_ERROR + INSUFFICIENT_WORKSPACE = CUTENSOR_STATUS_INSUFFICIENT_WORKSPACE + INSUFFICIENT_DRIVER = CUTENSOR_STATUS_INSUFFICIENT_DRIVER + IO_ERROR = CUTENSOR_STATUS_IO_ERROR + +class Algo(_IntEnum): + """See `cutensorAlgo_t`.""" + DEFAULT_PATIENT = CUTENSOR_ALGO_DEFAULT_PATIENT + GETT = CUTENSOR_ALGO_GETT + TGETT = CUTENSOR_ALGO_TGETT + TTGT = CUTENSOR_ALGO_TTGT + DEFAULT = CUTENSOR_ALGO_DEFAULT + +class WorksizePreference(_IntEnum): + """See `cutensorWorksizePreference_t`.""" + WORKSPACE_MIN = CUTENSOR_WORKSPACE_MIN + WORKSPACE_DEFAULT = CUTENSOR_WORKSPACE_DEFAULT + WORKSPACE_MAX = CUTENSOR_WORKSPACE_MAX + +class OperationDescriptorAttribute(_IntEnum): + """See `cutensorOperationDescriptorAttribute_t`.""" + TAG = CUTENSOR_OPERATION_DESCRIPTOR_TAG + SCALAR_TYPE = CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE + FLOPS = CUTENSOR_OPERATION_DESCRIPTOR_FLOPS + MOVED_BYTES = CUTENSOR_OPERATION_DESCRIPTOR_MOVED_BYTES + PADDING_LEFT = CUTENSOR_OPERATION_DESCRIPTOR_PADDING_LEFT + PADDING_RIGHT = CUTENSOR_OPERATION_DESCRIPTOR_PADDING_RIGHT + PADDING_VALUE = CUTENSOR_OPERATION_DESCRIPTOR_PADDING_VALUE + +class PlanPreferenceAttribute(_IntEnum): + """See `cutensorPlanPreferenceAttribute_t`.""" + AUTOTUNE_MODE = CUTENSOR_PLAN_PREFERENCE_AUTOTUNE_MODE + CACHE_MODE = CUTENSOR_PLAN_PREFERENCE_CACHE_MODE + INCREMENTAL_COUNT = CUTENSOR_PLAN_PREFERENCE_INCREMENTAL_COUNT + ALGO = CUTENSOR_PLAN_PREFERENCE_ALGO + KERNEL_RANK = CUTENSOR_PLAN_PREFERENCE_KERNEL_RANK + JIT = CUTENSOR_PLAN_PREFERENCE_JIT + +class AutotuneMode(_IntEnum): + """See `cutensorAutotuneMode_t`.""" + NONE = CUTENSOR_AUTOTUNE_MODE_NONE + INCREMENTAL = CUTENSOR_AUTOTUNE_MODE_INCREMENTAL + +class JitMode(_IntEnum): + """See `cutensorJitMode_t`.""" + NONE = CUTENSOR_JIT_MODE_NONE + DEFAULT = CUTENSOR_JIT_MODE_DEFAULT + +class CacheMode(_IntEnum): + """See `cutensorCacheMode_t`.""" + NONE = CUTENSOR_CACHE_MODE_NONE + PEDANTIC = CUTENSOR_CACHE_MODE_PEDANTIC + +class PlanAttribute(_IntEnum): + """See `cutensorPlanAttribute_t`.""" + REQUIRED_WORKSPACE = CUTENSOR_PLAN_REQUIRED_WORKSPACE + + +############################################################################### +# Error handling 
+############################################################################### + +cdef class cuTENSORError(Exception): + + def __init__(self, status): + self.status = status + s = Status(status) + cdef str err = f"{s.name} ({s.value})" + super(cuTENSORError, self).__init__(err) + + def __reduce__(self): + return (type(self), (self.status,)) + + +@cython.profile(False) +cpdef inline check_status(int status): + if status != 0: + raise cuTENSORError(status) + + +############################################################################### +# Wrapper functions +############################################################################### + +cpdef intptr_t create() except? 0: + """Initializes the cuTENSOR library and allocates the memory for the library context. + + Returns: + intptr_t: Pointer to cutensorHandle_t. + + .. seealso:: `cutensorCreate` + """ + cdef Handle handle + with nogil: + __status__ = cutensorCreate(&handle) + check_status(__status__) + return handle + + +cpdef destroy(intptr_t handle): + """Frees all resources related to the provided library handle. + + Args: + handle (intptr_t): Pointer to cutensorHandle_t. + + .. seealso:: `cutensorDestroy` + """ + with nogil: + __status__ = cutensorDestroy(handle) + check_status(__status__) + + +cpdef handle_resize_plan_cache(intptr_t handle, uint32_t num_entries): + """Resizes the plan cache. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. The cache will be attached to the handle. + num_entries (uint32_t): Number of entries the cache will support. + + .. seealso:: `cutensorHandleResizePlanCache` + """ + with nogil: + __status__ = cutensorHandleResizePlanCache(handle, num_entries) + check_status(__status__) + + +cpdef handle_write_plan_cache_to_file(intptr_t handle, filename): + """Writes the Plan-Cache (that belongs to the provided handle) to file. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + filename (str): Specifies the filename (including the absolute path) to the file that should hold all the cache information. Warning: an existing file will be overwritten. + + .. seealso:: `cutensorHandleWritePlanCacheToFile` + """ + if not isinstance(filename, str): + raise TypeError("filename must be a Python str") + cdef bytes _temp_filename_ = (filename).encode() + cdef char* _filename_ = _temp_filename_ + with nogil: + __status__ = cutensorHandleWritePlanCacheToFile(handle, _filename_) + check_status(__status__) + + +cpdef uint32_t handle_read_plan_cache_from_file(intptr_t handle, filename) except? -1: + """Reads a Plan-Cache from file and overwrites the cachelines of the provided handle. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + filename (str): Specifies the filename (including the absolute path) to the file that holds all the cache information that have previously been written by ``cutensorHandleWritePlanCacheToFile``. + + Returns: + uint32_t: On exit, this variable will hold the number of successfully-read cachelines, if CUTENSOR_STATUS_SUCCESS is returned. Otherwise, this variable will hold the number of cachelines that are required to read all cachelines associated to the cache pointed to by ``filename``; in that case CUTENSOR_STATUS_INSUFFICIENT_WORKSPACE is returned. + + .. 
seealso:: `cutensorHandleReadPlanCacheFromFile` + """ + if not isinstance(filename, str): + raise TypeError("filename must be a Python str") + cdef bytes _temp_filename_ = (filename).encode() + cdef char* _filename_ = _temp_filename_ + cdef uint32_t num_cachelines_read + with nogil: + __status__ = cutensorHandleReadPlanCacheFromFile(handle, _filename_, &num_cachelines_read) + check_status(__status__) + return num_cachelines_read + + +cpdef write_kernel_cache_to_file(intptr_t handle, filename): + """Writes the --per library-- kernel cache to file. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + filename (str): Specifies the filename (including the absolute path) to the file that should hold all the cache information. Warning: an existing file will be overwritten. + + .. seealso:: `cutensorWriteKernelCacheToFile` + """ + if not isinstance(filename, str): + raise TypeError("filename must be a Python str") + cdef bytes _temp_filename_ = (filename).encode() + cdef char* _filename_ = _temp_filename_ + with nogil: + __status__ = cutensorWriteKernelCacheToFile(handle, _filename_) + check_status(__status__) + + +cpdef read_kernel_cache_from_file(intptr_t handle, filename): + """Reads a kernel cache from file and adds all non-existing JIT compiled kernels to the kernel cache. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + filename (str): Specifies the filename (including the absolute path) to the file that holds all the cache information that have previously been written by cutensorWriteKernelCacheToFile. + + .. seealso:: `cutensorReadKernelCacheFromFile` + """ + if not isinstance(filename, str): + raise TypeError("filename must be a Python str") + cdef bytes _temp_filename_ = (filename).encode() + cdef char* _filename_ = _temp_filename_ + with nogil: + __status__ = cutensorReadKernelCacheFromFile(handle, _filename_) + check_status(__status__) + + +cpdef intptr_t create_tensor_descriptor(intptr_t handle, uint32_t num_modes, extent, stride, int data_type, uint32_t alignment_requirement) except? 0: + """Creates a tensor descriptor. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + num_modes (uint32_t): Number of modes. + extent (object): Extent of each mode (must be larger than zero). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int64_t``. + + stride (object): stride[i] denotes the displacement (a.k.a. stride)--in elements of the base type--between two consecutive elements in the ith-mode. If stride is NULL, a packed generalized column-major memory layout is assumed (i.e., the strides increase monotonically from left to right). Each stride must be larger than zero; to be precise, a stride of zero can be achieved by omitting this mode entirely; for instance instead of writing C[a,b] = A[b,a] with strideA(a) = 0, you can write C[a,b] = A[b] directly; cuTENSOR will then automatically infer that the a-mode in A should be broadcasted. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int64_t``. + + data_type (int): Data type of the stored entries. + alignment_requirement (uint32_t): Alignment (in bytes) to the base pointer that will be used in conjunction with this tensor descriptor (e.g., ``cudaMalloc`` has a default alignment of 256 bytes). + + Returns: + intptr_t: Pointer to the address where the allocated tensor descriptor object will be stored. + + .. 
seealso:: `cutensorCreateTensorDescriptor` + """ + cdef nullable_unique_ptr[ vector[int64_t] ] _extent_ + get_resource_ptr[int64_t](_extent_, extent, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _stride_ + get_resource_ptr[int64_t](_stride_, stride, NULL) + cdef TensorDescriptor desc + with nogil: + __status__ = cutensorCreateTensorDescriptor(handle, &desc, num_modes, (_extent_.data()), (_stride_.data()), data_type, alignment_requirement) + check_status(__status__) + return desc + + +cpdef destroy_tensor_descriptor(intptr_t desc): + """Frees all resources related to the provided tensor descriptor. + + Args: + desc (intptr_t): The cutensorTensorDescriptor_t object that will be deallocated. + + .. seealso:: `cutensorDestroyTensorDescriptor` + """ + with nogil: + __status__ = cutensorDestroyTensorDescriptor(desc) + check_status(__status__) + + +cpdef intptr_t create_elementwise_trinary(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_ab, int op_abc, intptr_t desc_compute) except? 0: + """This function creates an operation descriptor that encodes an elementwise trinary operation. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc_a (intptr_t): A descriptor that holds the information about the data type, modes, and strides of A. + mode_a (object): Array (in host memory) of size desc_a->numModes that holds the names of the modes of A (e.g., if then mode_a = {'a','b','c'}). The mode_a[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to ``cutensorCreateTensorDescriptor``. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_a (Operator): Unary operator that will be applied to each element of A before it is further processed. The original data of this tensor remains unchanged. + desc_b (intptr_t): A descriptor that holds information about the data type, modes, and strides of B. + mode_b (object): Array (in host memory) of size desc_b->numModes that holds the names of the modes of B. mode_b[i] corresponds to extent[i] and stride[i] of the ``cutensorCreateTensorDescriptor``. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_b (Operator): Unary operator that will be applied to each element of B before it is further processed. The original data of this tensor remains unchanged. + desc_c (intptr_t): A descriptor that holds information about the data type, modes, and strides of C. + mode_c (object): Array (in host memory) of size desc_c->numModes that holds the names of the modes of C. The mode_c[i] corresponds to extent[i] and stride[i] of the ``cutensorCreateTensorDescriptor``. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_c (Operator): Unary operator that will be applied to each element of C before it is further processed. The original data of this tensor remains unchanged. + desc_d (intptr_t): A descriptor that holds information about the data type, modes, and strides of D. Notice that we currently request desc_d and desc_c to be identical. + mode_d (object): Array (in host memory) of size desc_d->numModes that holds the names of the modes of D. The mode_d[i] corresponds to extent[i] and stride[i] of the ``cutensorCreateTensorDescriptor``. 
It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_ab (Operator): Element-wise binary operator (see above). + op_abc (Operator): Element-wise binary operator (see above). + desc_compute (intptr_t): Determines the precision in which this operations is performed. + + Returns: + intptr_t: This opaque struct gets allocated and filled with the information that encodes the requested elementwise operation. + + .. seealso:: `cutensorCreateElementwiseTrinary` + """ + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_a_ + get_resource_ptr[int32_t](_mode_a_, mode_a, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_b_ + get_resource_ptr[int32_t](_mode_b_, mode_b, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_c_ + get_resource_ptr[int32_t](_mode_c_, mode_c, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_d_ + get_resource_ptr[int32_t](_mode_d_, mode_d, NULL) + cdef OperationDescriptor desc + with nogil: + __status__ = cutensorCreateElementwiseTrinary(handle, &desc, desc_a, (_mode_a_.data()), <_Operator>op_a, desc_b, (_mode_b_.data()), <_Operator>op_b, desc_c, (_mode_c_.data()), <_Operator>op_c, desc_d, (_mode_d_.data()), <_Operator>op_ab, <_Operator>op_abc, desc_compute) + check_status(__status__) + return desc + + +cpdef elementwise_trinary_execute(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t beta, intptr_t b, intptr_t gamma, intptr_t c, intptr_t d, intptr_t stream): + """Performs an element-wise tensor operation for three input tensors (see ``cutensorcreateElementwiseTrinary``). + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + plan (intptr_t): Opaque handle holding all information about the desired elementwise operation (created by ``cutensorcreateElementwiseTrinary`` followed by ``cutensorcreatePlan``). + alpha (intptr_t): Scaling factor for a (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE) to query the expected data type). Pointer to the host memory. If alpha is zero, a is not read and the corresponding unary operator is not applied. + a (intptr_t): Multi-mode tensor (described by ``desca`` as part of ``cutensorcreateElementwiseTrinary``). Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + beta (intptr_t): Scaling factor for b (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE) to query the expected data type). Pointer to the host memory. If beta is zero, b is not read and the corresponding unary operator is not applied. + b (intptr_t): Multi-mode tensor (described by ``descb`` as part of ``cutensorcreateElementwiseTrinary``). Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + gamma (intptr_t): Scaling factor for c (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE) to query the expected data type). Pointer to the host memory. If gamma is zero, c is not read and the corresponding unary operator is not applied. + c (intptr_t): Multi-mode tensor (described by ``descc`` as part of ``cutensorcreateElementwiseTrinary``). Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + d (intptr_t): Multi-mode tensor (described by ``descd`` as part of ``cutensorcreateElementwiseTrinary``). 
Pointer to the GPU-accessible memory (``c`` and ``d`` may be identical, if and only if ``descc == descd``). + stream (intptr_t): The cUda stream used to perform the operation. + + .. seealso:: `cutensorElementwiseTrinaryExecute` + """ + with nogil: + __status__ = cutensorElementwiseTrinaryExecute(handle, plan, alpha, a, beta, b, gamma, c, d, stream) + check_status(__status__) + + +cpdef intptr_t create_elementwise_binary(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_ac, intptr_t desc_compute) except? 0: + """This function creates an operation descriptor for an elementwise binary operation. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc_a (intptr_t): The descriptor that holds the information about the data type, modes, and strides of A. + mode_a (object): Array (in host memory) of size desc_a->numModes that holds the names of the modes of A (e.g., if A_{a,b,c} => mode_a = {'a','b','c'}). The mode_a[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to ``cutensorCreateTensorDescriptor``. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_a (Operator): Unary operator that will be applied to each element of A before it is further processed. The original data of this tensor remains unchanged. + desc_c (intptr_t): The descriptor that holds information about the data type, modes, and strides of C. + mode_c (object): Array (in host memory) of size desc_c->numModes that holds the names of the modes of C. The mode_c[i] corresponds to extent[i] and stride[i] of the ``cutensorCreateTensorDescriptor``. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_c (Operator): Unary operator that will be applied to each element of C before it is further processed. The original data of this tensor remains unchanged. + desc_d (intptr_t): The descriptor that holds information about the data type, modes, and strides of D. Notice that we currently request desc_d and desc_c to be identical. + mode_d (object): Array (in host memory) of size desc_d->numModes that holds the names of the modes of D. The mode_d[i] corresponds to extent[i] and stride[i] of the ``cutensorCreateTensorDescriptor``. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_ac (Operator): Element-wise binary operator (see above). + desc_compute (intptr_t): Determines the precision in which this operations is performed. + + Returns: + intptr_t: This opaque struct gets allocated and filled with the information that encodes the requested elementwise operation. + + .. 
seealso:: `cutensorCreateElementwiseBinary` + """ + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_a_ + get_resource_ptr[int32_t](_mode_a_, mode_a, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_c_ + get_resource_ptr[int32_t](_mode_c_, mode_c, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_d_ + get_resource_ptr[int32_t](_mode_d_, mode_d, NULL) + cdef OperationDescriptor desc + with nogil: + __status__ = cutensorCreateElementwiseBinary(handle, &desc, desc_a, (_mode_a_.data()), <_Operator>op_a, desc_c, (_mode_c_.data()), <_Operator>op_c, desc_d, (_mode_d_.data()), <_Operator>op_ac, desc_compute) + check_status(__status__) + return desc + + +cpdef elementwise_binary_execute(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t gamma, intptr_t c, intptr_t d, intptr_t stream): + """Performs an element-wise tensor operation for two input tensors (see ``cutensorcreateElementwiseBinary``). + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + plan (intptr_t): Opaque handle holding all information about the desired elementwise operation (created by ``cutensorcreateElementwiseBinary`` followed by ``cutensorcreatePlan``). + alpha (intptr_t): Scaling factor for a (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE) to query the expected data type). Pointer to the host memory. If alpha is zero, a is not read and the corresponding unary operator is not applied. + a (intptr_t): Multi-mode tensor (described by ``desca`` as part of ``cutensorcreateElementwiseBinary``). Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + gamma (intptr_t): Scaling factor for c (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE) to query the expected data type). Pointer to the host memory. If gamma is zero, c is not read and the corresponding unary operator is not applied. + c (intptr_t): Multi-mode tensor (described by ``descc`` as part of ``cutensorcreateElementwiseBinary``). Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + d (intptr_t): Multi-mode tensor (described by ``descd`` as part of ``cutensorcreateElementwiseBinary``). Pointer to the GPU-accessible memory (``c`` and ``d`` may be identical, if and only if ``descc == descd``). + stream (intptr_t): The cUda stream used to perform the operation. + + .. seealso:: `cutensorElementwiseBinaryExecute` + """ + with nogil: + __status__ = cutensorElementwiseBinaryExecute(handle, plan, alpha, a, gamma, c, d, stream) + check_status(__status__) + + +cpdef intptr_t create_permutation(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, intptr_t desc_compute) except? 0: + """This function creates an operation descriptor for a tensor permutation. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc_a (intptr_t): The descriptor that holds information about the data type, modes, and strides of A. + mode_a (object): Array of size desc_a->numModes that holds the names of the modes of A (e.g., if A_{a,b,c} => mode_a = {'a','b','c'}). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_a (Operator): Unary operator that will be applied to each element of A before it is further processed. The original data of this tensor remains unchanged. 
+ desc_b (intptr_t): The descriptor that holds information about the data type, modes, and strides of B. + mode_b (object): Array of size desc_b->numModes that holds the names of the modes of B. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + desc_compute (intptr_t): Determines the precision in which this operations is performed. + + Returns: + intptr_t: This opaque struct gets allocated and filled with the information that encodes the requested permutation. + + .. seealso:: `cutensorCreatePermutation` + """ + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_a_ + get_resource_ptr[int32_t](_mode_a_, mode_a, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_b_ + get_resource_ptr[int32_t](_mode_b_, mode_b, NULL) + cdef OperationDescriptor desc + with nogil: + __status__ = cutensorCreatePermutation(handle, &desc, desc_a, (_mode_a_.data()), <_Operator>op_a, desc_b, (_mode_b_.data()), desc_compute) + check_status(__status__) + return desc + + +cpdef permute(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t b, intptr_t stream): + """Performs the tensor permutation that is encoded by ``plan`` (see ``cutensorCreatePermutation``). + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + plan (intptr_t): Opaque handle holding all information about the desired tensor reduction (created by ``cutensorCreatePermutation`` followed by ``cutensorCreatePlan``). + alpha (intptr_t): Scaling factor for a (see cutensorOperationDescriptorGetattribute(desc, CUTENSOR_OPERaTION_SCaLaR_TYPE)). Pointer to the host memory. If alpha is zero, a is not read and the corresponding unary operator is not applied. + a (intptr_t): Multi-mode tensor of type typea with nmodea modes. Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to D. + b (intptr_t): Multi-mode tensor of type typeb with nmodeb modes. Pointer to the GPU-accessible memory. + stream (intptr_t): The CUDa stream. + + .. seealso:: `cutensorPermute` + """ + with nogil: + __status__ = cutensorPermute(handle, plan, alpha, a, b, stream) + check_status(__status__) + + +cpdef intptr_t create_contraction(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, intptr_t desc_compute) except? 0: + """This function allocates a cutensorOperationDescriptor_t object that encodes a tensor contraction of the form . + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc_a (intptr_t): The descriptor that holds the information about the data type, modes and strides of A. + mode_a (object): Array with 'nmode_a' entries that represent the modes of A. The mode_a[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_a (Operator): Unary operator that will be applied to each element of A before it is further processed. The original data of this tensor remains unchanged. + desc_b (intptr_t): The descriptor that holds information about the data type, modes, and strides of B. + mode_b (object): Array with 'nmode_b' entries that represent the modes of B. The mode_b[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. 
It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        op_b (Operator): Unary operator that will be applied to each element of B before it is further processed. The original data of this tensor remains unchanged.
+        desc_c (intptr_t): The descriptor that holds information about the data type, modes, and strides of C.
+        mode_c (object): Array with 'nmode_c' entries that represent the modes of C. The mode_c[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        op_c (Operator): Unary operator that will be applied to each element of C before it is further processed. The original data of this tensor remains unchanged.
+        desc_d (intptr_t): The descriptor that holds information about the data type, modes, and strides of D (must be identical to ``desc_c`` for now).
+        mode_d (object): Array with 'nmode_d' entries that represent the modes of D (must be identical to mode_c for now). The mode_d[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        desc_compute (intptr_t): Determines the precision in which this operation is performed.
+
+    Returns:
+        intptr_t: This opaque struct gets allocated and filled with the information that encodes the tensor contraction operation.
+
+    .. seealso:: `cutensorCreateContraction`
+    """
+    cdef nullable_unique_ptr[ vector[int32_t] ] _mode_a_
+    get_resource_ptr[int32_t](_mode_a_, mode_a, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _mode_b_
+    get_resource_ptr[int32_t](_mode_b_, mode_b, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _mode_c_
+    get_resource_ptr[int32_t](_mode_c_, mode_c, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _mode_d_
+    get_resource_ptr[int32_t](_mode_d_, mode_d, NULL)
+    cdef OperationDescriptor desc
+    with nogil:
+        __status__ = cutensorCreateContraction(handle, &desc, desc_a, (_mode_a_.data()), <_Operator>op_a, desc_b, (_mode_b_.data()), <_Operator>op_b, desc_c, (_mode_c_.data()), <_Operator>op_c, desc_d, (_mode_d_.data()), desc_compute)
+    check_status(__status__)
+    return desc
+
+
+cpdef destroy_operation_descriptor(intptr_t desc):
+    """Frees all resources related to the provided descriptor.
+
+    Args:
+        desc (intptr_t): The cutensorOperationDescriptor_t object that will be deallocated.
+
+    .. seealso:: `cutensorDestroyOperationDescriptor`
+    """
+    with nogil:
+        __status__ = cutensorDestroyOperationDescriptor(desc)
+    check_status(__status__)
+
+
+######################### Python specific utility #########################
+
+cdef dict operation_descriptor_attribute_sizes = {
+    CUTENSOR_OPERATION_DESCRIPTOR_TAG: _numpy.int32,
+    CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE: _numpy.int32,
+    CUTENSOR_OPERATION_DESCRIPTOR_FLOPS: _numpy.float32,
+    CUTENSOR_OPERATION_DESCRIPTOR_MOVED_BYTES: _numpy.float32,
+    CUTENSOR_OPERATION_DESCRIPTOR_PADDING_LEFT: _numpy.uint32,
+    CUTENSOR_OPERATION_DESCRIPTOR_PADDING_RIGHT: _numpy.uint32,
+    CUTENSOR_OPERATION_DESCRIPTOR_PADDING_VALUE: _numpy.uint64,
+}
+
+cpdef get_operation_descriptor_attribute_dtype(int attr):
+    """Get the Python data type of the corresponding OperationDescriptorAttribute attribute.
+
+    Args:
+        attr (OperationDescriptorAttribute): The attribute to query.
+ + Returns: + The data type of the queried attribute. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`operation_descriptor_get_attribute`, :func:`operation_descriptor_set_attribute`. + """ + return operation_descriptor_attribute_sizes[attr] + +########################################################################### + + +cpdef operation_descriptor_set_attribute(intptr_t handle, intptr_t desc, int attr, intptr_t buf, size_t size_in_bytes): + """Set attribute of a cutensorOperationDescriptor_t object. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc (intptr_t): Operation descriptor that will be modified. + attr (OperationDescriptorAttribute): Specifies the attribute that will be set. + buf (intptr_t): This buffer (of size ``size_in_bytes``) determines the value to which ``attr`` will be set. + size_in_bytes (size_t): Size of buf (in bytes). + + .. note:: To compute the attribute size, use the itemsize of the corresponding data + type, which can be queried using :func:`get_operation_descriptor_attribute_dtype`. + + .. seealso:: `cutensorOperationDescriptorSetAttribute` + """ + with nogil: + __status__ = cutensorOperationDescriptorSetAttribute(handle, desc, <_OperationDescriptorAttribute>attr, buf, size_in_bytes) + check_status(__status__) + + +cpdef operation_descriptor_get_attribute(intptr_t handle, intptr_t desc, int attr, intptr_t buf, size_t size_in_bytes): + """This function retrieves an attribute of the provided cutensorOperationDescriptor_t object (see cutensorOperationDescriptorAttribute_t). + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc (intptr_t): The cutensorOperationDescriptor_t object whos attribute is queried. + attr (OperationDescriptorAttribute): Specifies the attribute that will be retrieved. + buf (intptr_t): This buffer (of size size_in_bytes) will hold the requested attribute of the provided cutensorOperationDescriptor_t object. + size_in_bytes (size_t): Size of buf (in bytes); see cutensorOperationDescriptorAttribute_t for the exact size. + + .. note:: To compute the attribute size, use the itemsize of the corresponding data + type, which can be queried using :func:`get_operation_descriptor_attribute_dtype`. + + .. seealso:: `cutensorOperationDescriptorGetAttribute` + """ + with nogil: + __status__ = cutensorOperationDescriptorGetAttribute(handle, desc, <_OperationDescriptorAttribute>attr, buf, size_in_bytes) + check_status(__status__) + + +cpdef intptr_t create_plan_preference(intptr_t handle, int algo, int jit_mode) except? 0: + """Allocates the cutensorPlanPreference_t, enabling users to limit the applicable kernels for a given plan/operation. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + algo (Algo): Allows users to select a specific algorithm. CUTENSOR_ALGO_DEFAULT lets the heuristic choose the algorithm. Any value >= 0 selects a specific GEMM-like algorithm and deactivates the heuristic. If a specified algorithm is not supported CUTENSOR_STATUS_NOT_SUPPORTED is returned. See cutensorAlgo_t for additional choices. + jit_mode (JitMode): Determines if cuTENSOR is allowed to use JIT-compiled kernels (leading to a longer plan-creation phase); see cutensorJitMode_t. + + Returns: + intptr_t: Pointer to the structure holding the cutensorPlanPreference_t allocated by this function. See cutensorPlanPreference_t. + + .. 
seealso:: `cutensorCreatePlanPreference` + """ + cdef PlanPreference pref + with nogil: + __status__ = cutensorCreatePlanPreference(handle, &pref, <_Algo>algo, <_JitMode>jit_mode) + check_status(__status__) + return pref + + +cpdef destroy_plan_preference(intptr_t pref): + """Frees all resources related to the provided preference. + + Args: + pref (intptr_t): The cutensorPlanPreference_t object that will be deallocated. + + .. seealso:: `cutensorDestroyPlanPreference` + """ + with nogil: + __status__ = cutensorDestroyPlanPreference(pref) + check_status(__status__) + + +######################### Python specific utility ######################### + +cdef dict plan_preference_attribute_sizes = { + CUTENSOR_PLAN_PREFERENCE_AUTOTUNE_MODE: _numpy.int32, + CUTENSOR_PLAN_PREFERENCE_CACHE_MODE: _numpy.int32, + CUTENSOR_PLAN_PREFERENCE_INCREMENTAL_COUNT: _numpy.int32, + CUTENSOR_PLAN_PREFERENCE_ALGO: _numpy.int32, + CUTENSOR_PLAN_PREFERENCE_KERNEL_RANK: _numpy.int32, + CUTENSOR_PLAN_PREFERENCE_JIT: _numpy.int32, +} + +cpdef get_plan_preference_attribute_dtype(int attr): + """Get the Python data type of the corresponding PlanPreferenceAttribute attribute. + + Args: + attr (PlanPreferenceAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`plan_preference_set_attribute`. + """ + return plan_preference_attribute_sizes[attr] + +########################################################################### + + +cpdef plan_preference_set_attribute(intptr_t handle, intptr_t pref, int attr, intptr_t buf, size_t size_in_bytes): + """Set attribute of a cutensorPlanPreference_t object. + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + pref (intptr_t): This opaque struct restricts the search space of viable candidates. + attr (PlanPreferenceAttribute): Specifies the attribute that will be set. + buf (intptr_t): This buffer (of size size_in_bytes) determines the value to which ``attr`` will be set. + size_in_bytes (size_t): Size of buf (in bytes); see cutensorPlanPreferenceAttribute_t for the exact size. + + .. note:: To compute the attribute size, use the itemsize of the corresponding data + type, which can be queried using :func:`get_plan_preference_attribute_dtype`. + + .. seealso:: `cutensorPlanPreferenceSetAttribute` + """ + with nogil: + __status__ = cutensorPlanPreferenceSetAttribute(handle, pref, <_PlanPreferenceAttribute>attr, buf, size_in_bytes) + check_status(__status__) + + +######################### Python specific utility ######################### + +cdef dict plan_attribute_sizes = { + CUTENSOR_PLAN_REQUIRED_WORKSPACE: _numpy.uint64, +} + +cpdef get_plan_attribute_dtype(int attr): + """Get the Python data type of the corresponding PlanAttribute attribute. + + Args: + attr (PlanAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`plan_get_attribute`. + """ + return plan_attribute_sizes[attr] + +########################################################################### + + +cpdef plan_get_attribute(intptr_t handle, intptr_t plan, int attr, intptr_t buf, size_t size_in_bytes): + """Retrieves information about an already-created plan (see cutensorPlanAttribute_t). 
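+
+    For example, assuming ``handle`` and ``plan`` were created earlier in the usual way
+    (e.g., via this module's handle- and plan-creation functions), an attribute such as
+    the required workspace size (spelled here, for illustration only, as
+    ``PlanAttribute.REQUIRED_WORKSPACE``) can be read into a 1-element NumPy array:
+
+    .. code-block:: python
+
+        import numpy as np
+
+        attr = PlanAttribute.REQUIRED_WORKSPACE
+        # Allocate a host buffer with the dtype advertised for this attribute.
+        value = np.zeros(1, dtype=get_plan_attribute_dtype(attr))
+        # Query the attribute into the buffer and read it back.
+        plan_get_attribute(handle, plan, attr, value.ctypes.data, value.nbytes)
+        required_workspace = int(value[0])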
+
+    Args:
+        handle (intptr_t): Opaque handle holding cuTENSOR's library context.
+        plan (intptr_t): Denotes an already-created plan (e.g., via ``cutensorCreatePlan`` or cutensorCreatePlanAutotuned).
+        attr (PlanAttribute): Requested attribute.
+        buf (intptr_t): On successful exit: holds the information of the requested attribute.
+        size_in_bytes (size_t): Size of ``buf`` in bytes.
+
+    .. note:: To compute the attribute size, use the itemsize of the corresponding data
+        type, which can be queried using :func:`get_plan_attribute_dtype`.
+
+    .. seealso:: `cutensorPlanGetAttribute`
+    """
+    with nogil:
+        __status__ = cutensorPlanGetAttribute(handle, plan, <_PlanAttribute>attr, buf, size_in_bytes)
+    check_status(__status__)
+
+
+cpdef uint64_t estimate_workspace_size(intptr_t handle, intptr_t desc, intptr_t plan_pref, int workspace_pref) except? -1:
+    """Determines the required workspaceSize for the given operation encoded by ``desc``.
+
+    Args:
+        handle (intptr_t): Opaque handle holding cuTENSOR's library context.
+        desc (intptr_t): This opaque struct encodes the operation.
+        plan_pref (intptr_t): This opaque struct restricts the space of viable candidates.
+        workspace_pref (int): This parameter influences the size of the workspace; see cutensorWorksizePreference_t for details.
+
+    Returns:
+        uint64_t: The workspace size (in bytes) that is required for the given operation.
+
+    .. seealso:: `cutensorEstimateWorkspaceSize`
+    """
+    cdef uint64_t workspace_size_estimate
+    with nogil:
+        __status__ = cutensorEstimateWorkspaceSize(handle, desc, plan_pref, workspace_pref, &workspace_size_estimate)
+    check_status(__status__)
+    return workspace_size_estimate
+
+
+cpdef intptr_t create_plan(intptr_t handle, intptr_t desc, intptr_t pref, uint64_t workspace_size_limit) except? 0:
+    """This function allocates a cutensorPlan_t object, selects an appropriate kernel for a given operation (encoded by ``desc``) and prepares a plan that encodes the execution.
+
+    Args:
+        handle (intptr_t): Opaque handle holding cuTENSOR's library context.
+        desc (intptr_t): This opaque struct encodes the given operation (see ``cutensorCreateContraction``, ``cutensorCreateReduction``, ``cutensorCreatePermutation``, ``cutensorCreateElementwiseBinary``, ``cutensorCreateElementwiseTrinary``, or ``cutensorCreateContractionTrinary``).
+        pref (intptr_t): This opaque struct is used to restrict the space of applicable candidates/kernels (see ``cutensorCreatePlanPreference`` or cutensorPlanPreferenceAttribute_t). May be ``nullptr``, in that case default choices are assumed. Block-sparse contractions currently only support these default settings and ignore other supplied preferences.
+        workspace_size_limit (uint64_t): Denotes the maximal workspace that the corresponding operation is allowed to use (see ``cutensorEstimateWorkspaceSize``).
+
+    Returns:
+        intptr_t: Pointer to the data structure created by this function that holds all information (e.g., selected kernel) necessary to perform the desired operation.
+
+    .. seealso:: `cutensorCreatePlan`
+    """
+    cdef Plan plan
+    with nogil:
+        __status__ = cutensorCreatePlan(handle, &plan, desc, pref, workspace_size_limit)
+    check_status(__status__)
+    return plan
+
+
+cpdef destroy_plan(intptr_t plan):
+    """Frees all resources related to the provided plan.
+
+    Args:
+        plan (intptr_t): The cutensorPlan_t object that will be deallocated.
+
+    ..
seealso:: `cutensorDestroyPlan` + """ + with nogil: + __status__ = cutensorDestroyPlan(plan) + check_status(__status__) + + +cpdef contract(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t b, intptr_t beta, intptr_t c, intptr_t d, intptr_t workspace, uint64_t workspace_size, intptr_t stream): + """This routine computes the tensor contraction . + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + plan (intptr_t): Opaque handle holding the contraction execution plan (created by ``cutensorcreatecontraction`` followed by ``cutensorcreatePlan``). + alpha (intptr_t): Scaling for a*b. Its data type is determined by 'desccompute' (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE)). Pointer to the host memory. + a (intptr_t): Pointer to the data corresponding to a. Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + b (intptr_t): Pointer to the data corresponding to b. Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + beta (intptr_t): Scaling for c. Its data type is determined by 'desccompute' (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE)). Pointer to the host memory. + c (intptr_t): Pointer to the data corresponding to c. Pointer to the GPU-accessible memory. + d (intptr_t): Pointer to the data corresponding to d. Pointer to the GPU-accessible memory. + workspace (intptr_t): Optional parameter that may be NULL. This pointer provides additional workspace, in device memory, to the library for additional optimizations; the workspace must be aligned to 256 bytes (i.e., the default alignment of cudaMalloc). + workspace_size (uint64_t): Size of the workspace array in bytes; please refer to ``cutensorEstimateWorkspaceSize`` to query the required workspace. While ``cutensorcontract`` does not strictly require a workspace for the contraction, it is still recommended to provided some small workspace (e.g., 128 Mb). + stream (intptr_t): The cUda stream in which all the computation is performed. + + .. seealso:: `cutensorContract` + """ + with nogil: + __status__ = cutensorContract(handle, plan, alpha, a, b, beta, c, d, workspace, workspace_size, stream) + check_status(__status__) + + +cpdef intptr_t create_reduction(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_reduce, intptr_t desc_compute) except? 0: + """Creates a cutensorOperatorDescriptor_t object that encodes a tensor reduction of the form . + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc_a (intptr_t): The descriptor that holds the information about the data type, modes and strides of A. + mode_a (object): Array with 'nmode_a' entries that represent the modes of A. mode_a[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to ``cutensorCreateTensorDescriptor``. Modes that only appear in mode_a but not in mode_c are reduced (contracted). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_a (Operator): Unary operator that will be applied to each element of A before it is further processed. The original data of this tensor remains unchanged. + desc_c (intptr_t): The descriptor that holds the information about the data type, modes and strides of C. 
+ mode_c (object): Array with 'nmode_c' entries that represent the modes of C. mode_c[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to ``cutensorCreateTensorDescriptor``. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_c (Operator): Unary operator that will be applied to each element of C before it is further processed. The original data of this tensor remains unchanged. + desc_d (intptr_t): Must be identical to desc_c for now. + mode_d (object): Must be identical to mode_c for now. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_reduce (Operator): binary operator used to reduce elements of A. + desc_compute (intptr_t): All arithmetic is performed using this data type (i.e., it affects the accuracy and performance). + + Returns: + intptr_t: This opaque struct gets allocated and filled with the information that encodes the requested tensor reduction operation. + + .. seealso:: `cutensorCreateReduction` + """ + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_a_ + get_resource_ptr[int32_t](_mode_a_, mode_a, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_c_ + get_resource_ptr[int32_t](_mode_c_, mode_c, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_d_ + get_resource_ptr[int32_t](_mode_d_, mode_d, NULL) + cdef OperationDescriptor desc + with nogil: + __status__ = cutensorCreateReduction(handle, &desc, desc_a, (_mode_a_.data()), <_Operator>op_a, desc_c, (_mode_c_.data()), <_Operator>op_c, desc_d, (_mode_d_.data()), <_Operator>op_reduce, desc_compute) + check_status(__status__) + return desc + + +cpdef reduce(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t beta, intptr_t c, intptr_t d, intptr_t workspace, uint64_t workspace_size, intptr_t stream): + """Performs the tensor reduction that is encoded by ``plan`` (see ``cutensorcreateReduction``). + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + plan (intptr_t): Opaque handle holding the reduction execution plan (created by ``cutensorcreateReduction`` followed by ``cutensorcreatePlan``). + alpha (intptr_t): Scaling for a. Its data type is determined by 'desccompute' (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE)). Pointer to the host memory. + a (intptr_t): Pointer to the data corresponding to a in device memory. Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to d. + beta (intptr_t): Scaling for c. Its data type is determined by 'desccompute' (see cutensorOperationdescriptorGetattribute(desc, cUTENSOR_OPERaTION_ScaLaR_TYPE)). Pointer to the host memory. + c (intptr_t): Pointer to the data corresponding to c in device memory. Pointer to the GPU-accessible memory. + d (intptr_t): Pointer to the data corresponding to c in device memory. Pointer to the GPU-accessible memory. + workspace (intptr_t): Scratchpad (device) memory of size --at least-- ``workspace_size`` bytes; the workspace must be aligned to 256 bytes (i.e., the default alignment of cudaMalloc). + workspace_size (uint64_t): Please use :func:`estimate_workspace_size` to query the required workspace. + stream (intptr_t): The cUda stream in which all the computation is performed. + + .. 
seealso:: `cutensorReduce` + """ + with nogil: + __status__ = cutensorReduce(handle, plan, alpha, a, beta, c, d, workspace, workspace_size, stream) + check_status(__status__) + + +cpdef intptr_t create_contraction_trinary(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, int op_d, intptr_t desc_e, mode_e, intptr_t desc_compute) except? 0: + """This function allocates a cutensorOperationDescriptor_t object that encodes a tensor contraction of the form . + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc_a (intptr_t): The descriptor that holds the information about the data type, modes and strides of A. + mode_a (object): Array with 'nmode_a' entries that represent the modes of A. The mode_a[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_a (Operator): Unary operator that will be applied to each element of A before it is further processed. The original data of this tensor remains unchanged. + desc_b (intptr_t): The descriptor that holds information about the data type, modes, and strides of B. + mode_b (object): Array with 'nmode_b' entries that represent the modes of B. The mode_b[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_b (Operator): Unary operator that will be applied to each element of B before it is further processed. The original data of this tensor remains unchanged. + desc_c (intptr_t): The escriptor that holds information about the data type, modes, and strides of C. + mode_c (object): Array with 'nmode_c' entries that represent the modes of C. The mode_c[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_c (Operator): Unary operator that will be applied to each element of C before it is further processed. The original data of this tensor remains unchanged. + desc_d (intptr_t): The escriptor that holds information about the data type, modes, and strides of D. + mode_d (object): Array with 'nmode_d' entries that represent the modes of D. The mode_d[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_d (Operator): Unary operator that will be applied to each element of D before it is further processed. The original data of this tensor remains unchanged. + desc_e (intptr_t): Array with 'nmode_e' entries that represent the modes of E (must be identical to mode_d for now). The mode_e[i] corresponds to extent[i] and stride[i] w.r.t. the arguments provided to cutensorInitTensorDescriptor. + mode_e (object): The descriptor that holds information about the data type, modes, and strides of E (must be identical to ``desc_d`` for now). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + desc_compute (intptr_t): Determines the precision in which this operations is performed. 
+ + Returns: + intptr_t: This opaque struct gets allocated and filled with the information that encodes the tensor contraction operation. + + .. seealso:: `cutensorCreateContractionTrinary` + """ + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_a_ + get_resource_ptr[int32_t](_mode_a_, mode_a, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_b_ + get_resource_ptr[int32_t](_mode_b_, mode_b, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_c_ + get_resource_ptr[int32_t](_mode_c_, mode_c, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_d_ + get_resource_ptr[int32_t](_mode_d_, mode_d, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_e_ + get_resource_ptr[int32_t](_mode_e_, mode_e, NULL) + cdef OperationDescriptor desc + with nogil: + __status__ = cutensorCreateContractionTrinary(handle, &desc, desc_a, (_mode_a_.data()), <_Operator>op_a, desc_b, (_mode_b_.data()), <_Operator>op_b, desc_c, (_mode_c_.data()), <_Operator>op_c, desc_d, (_mode_d_.data()), <_Operator>op_d, desc_e, (_mode_e_.data()), desc_compute) + check_status(__status__) + return desc + + +cpdef contract_trinary(intptr_t handle, intptr_t plan, intptr_t alpha, intptr_t a, intptr_t b, intptr_t c, intptr_t beta, intptr_t d, intptr_t e, intptr_t workspace, uint64_t workspace_size, intptr_t stream): + """This routine computes the tensor contraction . + + Args: + handle (intptr_t): Opaque handle holding cuTeNSOR's library context. + plan (intptr_t): Opaque handle holding the contraction execution plan (created by ``cutensorcreatecontractionTrinary`` followed by ``cutensorcreatePlan``). + alpha (intptr_t): Scaling for a*b*c. Its data type is determined by 'desccompute' (see cutensorOperationdescriptorGetattribute(desc, cUTeNSOR_OPeRaTION_ScaLaR_TYPe)). Pointer to the host memory. + a (intptr_t): Pointer to the data corresponding to a. Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to e. + b (intptr_t): Pointer to the data corresponding to b. Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to e. + c (intptr_t): Pointer to the data corresponding to c. Pointer to the GPU-accessible memory. The data accessed via this pointer must not overlap with the elements written to e. + beta (intptr_t): Scaling for d. Its data type is determined by 'desccompute' (see cutensorOperationdescriptorGetattribute(desc, cUTeNSOR_OPeRaTION_ScaLaR_TYPe)). Pointer to the host memory. + d (intptr_t): Pointer to the data corresponding to d. Pointer to the GPU-accessible memory. + e (intptr_t): Pointer to the data corresponding to e. Pointer to the GPU-accessible memory. + workspace (intptr_t): Optional parameter that may be NULL. This pointer provides additional workspace, in device memory, to the library for additional optimizations; the workspace must be aligned to 256 bytes (i.e., the default alignment of cudaMalloc). + workspace_size (uint64_t): Size of the workspace array in bytes; please refer to ``cutensorestimateWorkspaceSize`` to query the required workspace. While ``cutensorcontract`` does not strictly require a workspace for the contraction, it is still recommended to provided some small workspace (e.g., 128 Mb). + stream (intptr_t): The cUda stream in which all the computation is performed. + + .. 
seealso:: `cutensorContractTrinary` + """ + with nogil: + __status__ = cutensorContractTrinary(handle, plan, alpha, a, b, c, beta, d, e, workspace, workspace_size, stream) + check_status(__status__) + + +cpdef intptr_t create_block_sparse_tensor_descriptor(intptr_t handle, uint32_t num_modes, uint64_t num_non_zero_blocks, num_sections_per_mode, extent, non_zero_coordinates, stride, int data_type) except? 0: + """Create a block-sparse tensor descriptor. + + Args: + handle (intptr_t): The library handle. + num_modes (uint32_t): The number of modes. Currently, a maximum of 8 modes is supported. + num_non_zero_blocks (uint64_t): The number of non-zero blocks in the block-sparse tensor. + num_sections_per_mode (object): The number of sections of each mode (host array of size ``num_modes``). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``uint32_t``. + + extent (object): The extents of the sections of each mode (host array of size ``\sum_i^num_modes(num_sections_per_mode[i])``). First come the extents of the sections of the first mode, then the extents of the sections of the second mode, and so forth. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int64_t``. + + non_zero_coordinates (object): Block-coordinates of each non-zero block (host array of size ``num_modes`` x ``num_non_zero_blocks`` Blocks can be specified in any order, however, that order must be consistent with stride and alignmentRequirement arrays. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + stride (object): The strides of each dense block (either nullptr or a host array of size ``num_modes`` x ``num_non_zero_blocks``). First the strides of the first block, then the strides of the second block, with the blocks in the same order as in non_zero_coordinates. Passing nullptr means contiguous column-major order for each block. Moreover, the strides need to be compatible in the following sense: Suppose you sort the strides of the first block, such that they are ascending; this sorting results in a permutation. If you apply this permutation to the strides of any other block, the result needs to be sorted as well. As an example:. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int64_t``. + + data_type (int): Data type of the stored entries. We assume the same datatype for each block. Currently, the only supported values are CUDA_C_64F, CUDA_C_32F, CUDA_R_64F, and CUDA_R_32F. + + Returns: + intptr_t: The resulting block-sparse tensor descriptor. + + .. 
seealso:: `cutensorCreateBlockSparseTensorDescriptor` + """ + cdef nullable_unique_ptr[ vector[uint32_t] ] _num_sections_per_mode_ + get_resource_ptr[uint32_t](_num_sections_per_mode_, num_sections_per_mode, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _extent_ + get_resource_ptr[int64_t](_extent_, extent, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _non_zero_coordinates_ + get_resource_ptr[int32_t](_non_zero_coordinates_, non_zero_coordinates, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _stride_ + get_resource_ptr[int64_t](_stride_, stride, NULL) + cdef BlockSparseTensorDescriptor desc + with nogil: + __status__ = cutensorCreateBlockSparseTensorDescriptor(handle, &desc, num_modes, num_non_zero_blocks, (_num_sections_per_mode_.data()), (_extent_.data()), (_non_zero_coordinates_.data()), (_stride_.data()), data_type) + check_status(__status__) + return desc + + +cpdef destroy_block_sparse_tensor_descriptor(intptr_t desc): + """Frees all resources related to the provided block-sparse tensor descriptor. + + Args: + desc (intptr_t): The cutensorBlockSparseTensorDescrptor_t object that will be deallocated. + + .. seealso:: `cutensorDestroyBlockSparseTensorDescriptor` + """ + with nogil: + __status__ = cutensorDestroyBlockSparseTensorDescriptor(desc) + check_status(__status__) + + +cpdef intptr_t create_block_sparse_contraction(intptr_t handle, intptr_t desc_a, mode_a, int op_a, intptr_t desc_b, mode_b, int op_b, intptr_t desc_c, mode_c, int op_c, intptr_t desc_d, mode_d, intptr_t desc_compute) except? 0: + """This function allocates a cutensorOperationDescriptor_t object that encodes a block-sparse tensor contraction of the form . + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + desc_a (intptr_t): The descriptor that holds the information about the data type, modes, sections, section extents, strides, and non-zero blocks of A. + mode_a (object): Array with 'nmode_a' entries that represent the modes of A. Sections, i.e., block-sizes, must match among the involved block-sparse tensors. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_a (Operator): Unary operator that will be applied to each element of A before it is further processed. The original data of this tensor remains unchanged. Currently, only CUTENSOR_OP_IDENTITY is supported. + desc_b (intptr_t): The descriptor that holds information about the the data type, modes, sections, section extents, strides, and non-zero blocks of B. + mode_b (object): Array with 'nmode_b' entries that represent the modes of B. Sections, i.e., block-sizes, must match among the involved block-sparse tensors. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_b (Operator): Unary operator that will be applied to each element of B before it is further processed. The original data of this tensor remains unchanged. Currently, only CUTENSOR_OP_IDENTITY is supported. + desc_c (intptr_t): Array with 'nmode_c' entries that represent the modes of C. Sections, i.e., block-sizes, must match among the involved block-sparse tensors. + mode_c (object): The descriptor that holds information about the data type, modes, sections, section extents, strides, and non-zero blocks of C. 
Note that the block-sparsity pattern of C (the nonZeroCoordinates[] array used to create the decriptor) of C must be identical to that of D; and it is this block-sparsity pattern that determines which parts of the results are computed; no fill-in is allocated or computed. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + op_c (Operator): Unary operator that will be applied to each element of C before it is further processed. The original data of this tensor remains unchanged. Currently, only CUTENSOR_OP_IDENTITY is supported. + desc_d (intptr_t): For now, this must be the same opaque pointer as desc_c, and the layouts of C and D must be identical. + mode_d (object): Array with 'nmode_d' entries that represent the modes of D (must be identical to mode_c for now). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + desc_compute (intptr_t): Datatype of for the intermediate computation of typeCompute T = A * B. + + Returns: + intptr_t: This opaque struct gets allocated and filled with the information that encodes the tensor contraction operation. + + .. seealso:: `cutensorCreateBlockSparseContraction` + """ + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_a_ + get_resource_ptr[int32_t](_mode_a_, mode_a, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_b_ + get_resource_ptr[int32_t](_mode_b_, mode_b, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_c_ + get_resource_ptr[int32_t](_mode_c_, mode_c, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mode_d_ + get_resource_ptr[int32_t](_mode_d_, mode_d, NULL) + cdef OperationDescriptor desc + with nogil: + __status__ = cutensorCreateBlockSparseContraction(handle, &desc, desc_a, (_mode_a_.data()), <_Operator>op_a, desc_b, (_mode_b_.data()), <_Operator>op_b, desc_c, (_mode_c_.data()), <_Operator>op_c, desc_d, (_mode_d_.data()), desc_compute) + check_status(__status__) + return desc + + +cpdef block_sparse_contract(intptr_t handle, intptr_t plan, intptr_t alpha, a, b, intptr_t beta, c, d, intptr_t workspace, uint64_t workspace_size, intptr_t stream): + """This routine computes the block-sparse tensor contraction . + + Args: + handle (intptr_t): Opaque handle holding cuTENSOR's library context. + plan (intptr_t): Opaque handle holding the contraction execution plan (created by ``cutensorcreateblockSparsecontraction`` followed by ``cutensorcreatePlan``). + alpha (intptr_t): Scaling for a*b. Its data type is determined by 'desccompute' (see ``cutensorcreateblockSparsecontraction``). Pointer to host memory. + a (object): Host-array of size numNonZeroblocks(a), containing pointers to GPU-accessible memory, corresponding the blocks of a. The data accessed via these pointers must not overlap with the elements written to d. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of :class:`int`\s (as pointer addresses). + + b (object): Host-array of size numNonZeroblocks(b), containing pointers to GPU-accessible memory, corresponding the blocks of b. The data accessed via these pointers must not overlap with the elements written to d. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of :class:`int`\s (as pointer addresses). + + beta (intptr_t): Scaling for c. Its data type is determined by 'desccompute' (see ``cutensorcreateblockSparsecontraction``). Pointer to host memory. 
+ c (object): Host-array of size numNonZeroblocks(c), containing pointers to GPU-accessible memory, corresponding the blocks of c. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of :class:`int`\s (as pointer addresses). + + d (object): Host-array of size numNonZeroblocks(d), containing pointers to GPU-accessible memory, corresponding the blocks of d. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of :class:`int`\s (as pointer addresses). + + workspace (intptr_t): This pointer provides the required workspace in device memory. The workspace must be aligned to 256 bytes (i.e., the default alignment of cudaMalloc). + workspace_size (uint64_t): Size of the workspace array in bytes; please refer to ``cutensorEstimateWorkspaceSize`` to query the required workspace. For block-sparse contractions, this estimate is exact. + stream (intptr_t): The cUda stream to which all of the computation is synchronised. + + .. seealso:: `cutensorBlockSparseContract` + """ + cdef nullable_unique_ptr[ vector[void*] ] _a_ + get_resource_ptrs[void](_a_, a, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _b_ + get_resource_ptrs[void](_b_, b, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _c_ + get_resource_ptrs[void](_c_, c, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _d_ + get_resource_ptrs[void](_d_, d, NULL) + with nogil: + __status__ = cutensorBlockSparseContract(handle, plan, alpha, (_a_.data()), (_b_.data()), beta, (_c_.data()), (_d_.data()), workspace, workspace_size, stream) + check_status(__status__) + + +cpdef str get_error_string(int error): + """Returns the description string for an error code. + + Args: + error (int): Error code to convert to string. + + .. seealso:: `cutensorGetErrorString` + """ + cdef bytes _output_ + _output_ = cutensorGetErrorString(error) + return _output_.decode() + + +cpdef size_t get_version() except? 0: + """Returns Version number of the CUTENSOR library. + + .. seealso:: `cutensorGetVersion` + """ + return cutensorGetVersion() + + +cpdef size_t get_cudart_version() except? 0: + """Returns version number of the CUDA runtime that cuTENSOR was compiled against. + + .. seealso:: `cutensorGetCudartVersion` + """ + return cutensorGetCudartVersion() + + +cpdef logger_set_file(intptr_t file): + """This function sets the logging output file. + + Args: + file (intptr_t): An open file with write permission. + + .. seealso:: `cutensorLoggerSetFile` + """ + with nogil: + __status__ = cutensorLoggerSetFile(file) + check_status(__status__) + + +cpdef logger_open_file(log_file): + """This function opens a logging output file in the given path. + + Args: + log_file (str): Path to the logging output file. + + .. seealso:: `cutensorLoggerOpenFile` + """ + if not isinstance(log_file, str): + raise TypeError("log_file must be a Python str") + cdef bytes _temp_log_file_ = (log_file).encode() + cdef char* _log_file_ = _temp_log_file_ + with nogil: + __status__ = cutensorLoggerOpenFile(_log_file_) + check_status(__status__) + + +cpdef logger_set_level(int32_t level): + """This function sets the value of the logging level. + + Args: + level (int32_t): Log level, should be one of the following:. + + .. seealso:: `cutensorLoggerSetLevel` + """ + with nogil: + __status__ = cutensorLoggerSetLevel(level) + check_status(__status__) + + +cpdef logger_set_mask(int32_t mask): + """This function sets the value of the log mask. + + Args: + mask (int32_t): Log mask, the bitwise OR of the following:. + + .. 
seealso:: `cutensorLoggerSetMask` + """ + with nogil: + __status__ = cutensorLoggerSetMask(mask) + check_status(__status__) + + +cpdef logger_force_disable(): + """This function disables logging for the entire run. + + .. seealso:: `cutensorLoggerForceDisable` + """ + with nogil: + __status__ = cutensorLoggerForceDisable() + check_status(__status__) + + +############################################################################### diff --git a/nvmath/bindings/cycublasMp.pxd b/nvmath/bindings/cycublasMp.pxd new file mode 100644 index 0000000..c9a12d6 --- /dev/null +++ b/nvmath/bindings/cycublasMp.pxd @@ -0,0 +1,162 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 0.5.0 to 0.6.0. Do not modify it directly. +# This layer exposes the C header to Cython as-is. + +from libc.stdint cimport int64_t, uint32_t, uint64_t, intptr_t + + +############################################################################### +# Types (structs, enums, ...) +############################################################################### + +# enums +ctypedef enum cublasOperation_t "cublasOperation_t": + CUBLAS_OP_N "CUBLAS_OP_N" = 0 + CUBLAS_OP_T "CUBLAS_OP_T" = 1 + CUBLAS_OP_C "CUBLAS_OP_C" = 2 + CUBLAS_OP_HERMITAN "CUBLAS_OP_HERMITAN" = 2 + CUBLAS_OP_CONJG "CUBLAS_OP_CONJG" = 3 + +ctypedef enum cublasComputeType_t "cublasComputeType_t": + CUBLAS_COMPUTE_16F "CUBLAS_COMPUTE_16F" = 64 + CUBLAS_COMPUTE_16F_PEDANTIC "CUBLAS_COMPUTE_16F_PEDANTIC" = 65 + CUBLAS_COMPUTE_32F "CUBLAS_COMPUTE_32F" = 68 + CUBLAS_COMPUTE_32F_PEDANTIC "CUBLAS_COMPUTE_32F_PEDANTIC" = 69 + CUBLAS_COMPUTE_32F_FAST_16F "CUBLAS_COMPUTE_32F_FAST_16F" = 74 + CUBLAS_COMPUTE_32F_FAST_16BF "CUBLAS_COMPUTE_32F_FAST_16BF" = 75 + CUBLAS_COMPUTE_32F_FAST_TF32 "CUBLAS_COMPUTE_32F_FAST_TF32" = 77 + CUBLAS_COMPUTE_64F "CUBLAS_COMPUTE_64F" = 70 + CUBLAS_COMPUTE_64F_PEDANTIC "CUBLAS_COMPUTE_64F_PEDANTIC" = 71 + CUBLAS_COMPUTE_32I "CUBLAS_COMPUTE_32I" = 72 + CUBLAS_COMPUTE_32I_PEDANTIC "CUBLAS_COMPUTE_32I_PEDANTIC" = 73 + +ctypedef enum cublasMpStatus_t "cublasMpStatus_t": + CUBLASMP_STATUS_SUCCESS "CUBLASMP_STATUS_SUCCESS" = 0 + CUBLASMP_STATUS_NOT_INITIALIZED "CUBLASMP_STATUS_NOT_INITIALIZED" = 1 + CUBLASMP_STATUS_ALLOCATION_FAILED "CUBLASMP_STATUS_ALLOCATION_FAILED" = 2 + CUBLASMP_STATUS_INVALID_VALUE "CUBLASMP_STATUS_INVALID_VALUE" = 3 + CUBLASMP_STATUS_ARCHITECTURE_MISMATCH "CUBLASMP_STATUS_ARCHITECTURE_MISMATCH" = 4 + CUBLASMP_STATUS_EXECUTION_FAILED "CUBLASMP_STATUS_EXECUTION_FAILED" = 5 + CUBLASMP_STATUS_INTERNAL_ERROR "CUBLASMP_STATUS_INTERNAL_ERROR" = 6 + CUBLASMP_STATUS_NOT_SUPPORTED "CUBLASMP_STATUS_NOT_SUPPORTED" = 7 + _CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR "_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR" = -42 + +ctypedef enum cublasMpGridLayout_t "cublasMpGridLayout_t": + CUBLASMP_GRID_LAYOUT_COL_MAJOR "CUBLASMP_GRID_LAYOUT_COL_MAJOR" = 0 + CUBLASMP_GRID_LAYOUT_ROW_MAJOR "CUBLASMP_GRID_LAYOUT_ROW_MAJOR" = 1 + +ctypedef enum cublasMpMatmulDescriptorAttribute_t "cublasMpMatmulDescriptorAttribute_t": + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSA "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSA" = 0 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSB "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSB" = 1 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMPUTE_TYPE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMPUTE_TYPE" = 2 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_ALGO_TYPE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_ALGO_TYPE" = 3 + 
CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMMUNICATION_SM_COUNT "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMMUNICATION_SM_COUNT" = 4 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE" = 5 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_POINTER" = 6 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_BATCH_STRIDE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_BATCH_STRIDE" = 7 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_DATA_TYPE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_BIAS_DATA_TYPE" = 8 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_POINTER" = 9 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_LD "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_LD" = 10 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_BATCH_STRIDE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_BATCH_STRIDE" = 11 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_DATA_TYPE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_DATA_TYPE" = 12 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_POINTER" = 13 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_AMAX_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_AMAX_POINTER" = 14 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_MODE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_EPILOGUE_AUX_SCALE_MODE" = 15 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_POINTER" = 16 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_MODE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_A_SCALE_MODE" = 17 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_POINTER" = 18 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_MODE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_B_SCALE_MODE" = 19 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_POINTER" = 20 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_MODE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_C_SCALE_MODE" = 21 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_POINTER" = 22 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_MODE "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_D_SCALE_MODE" = 23 + CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_AMAX_D_POINTER "CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_AMAX_D_POINTER" = 24 + +ctypedef enum cublasMpMatmulAlgoType_t "cublasMpMatmulAlgoType_t": + CUBLASMP_MATMUL_ALGO_TYPE_DEFAULT "CUBLASMP_MATMUL_ALGO_TYPE_DEFAULT" = 0 + CUBLASMP_MATMUL_ALGO_TYPE_SPLIT_P2P "CUBLASMP_MATMUL_ALGO_TYPE_SPLIT_P2P" = 1 + CUBLASMP_MATMUL_ALGO_TYPE_SPLIT_MULTICAST "CUBLASMP_MATMUL_ALGO_TYPE_SPLIT_MULTICAST" = 2 + CUBLASMP_MATMUL_ALGO_TYPE_ATOMIC_P2P "CUBLASMP_MATMUL_ALGO_TYPE_ATOMIC_P2P" = 3 + CUBLASMP_MATMUL_ALGO_TYPE_ATOMIC_MULTICAST "CUBLASMP_MATMUL_ALGO_TYPE_ATOMIC_MULTICAST" = 4 + +ctypedef enum cublasMpMatmulEpilogue_t "cublasMpMatmulEpilogue_t": + CUBLASMP_MATMUL_EPILOGUE_DEFAULT "CUBLASMP_MATMUL_EPILOGUE_DEFAULT" = 0 + CUBLASMP_MATMUL_EPILOGUE_ALLREDUCE "CUBLASMP_MATMUL_EPILOGUE_ALLREDUCE" = 1 + CUBLASMP_MATMUL_EPILOGUE_RELU "CUBLASMP_MATMUL_EPILOGUE_RELU" = 2 + CUBLASMP_MATMUL_EPILOGUE_RELU_AUX "CUBLASMP_MATMUL_EPILOGUE_RELU_AUX" = (CUBLASMP_MATMUL_EPILOGUE_RELU | 128) + CUBLASMP_MATMUL_EPILOGUE_BIAS "CUBLASMP_MATMUL_EPILOGUE_BIAS" = 4 + CUBLASMP_MATMUL_EPILOGUE_RELU_BIAS "CUBLASMP_MATMUL_EPILOGUE_RELU_BIAS" = (CUBLASMP_MATMUL_EPILOGUE_RELU | 
CUBLASMP_MATMUL_EPILOGUE_BIAS) + CUBLASMP_MATMUL_EPILOGUE_RELU_AUX_BIAS "CUBLASMP_MATMUL_EPILOGUE_RELU_AUX_BIAS" = (CUBLASMP_MATMUL_EPILOGUE_RELU_AUX | CUBLASMP_MATMUL_EPILOGUE_BIAS) + CUBLASMP_MATMUL_EPILOGUE_DRELU "CUBLASMP_MATMUL_EPILOGUE_DRELU" = (8 | 128) + CUBLASMP_MATMUL_EPILOGUE_DRELU_BGRAD "CUBLASMP_MATMUL_EPILOGUE_DRELU_BGRAD" = (CUBLASMP_MATMUL_EPILOGUE_DRELU | 16) + CUBLASMP_MATMUL_EPILOGUE_GELU "CUBLASMP_MATMUL_EPILOGUE_GELU" = 32 + CUBLASMP_MATMUL_EPILOGUE_GELU_AUX "CUBLASMP_MATMUL_EPILOGUE_GELU_AUX" = (CUBLASMP_MATMUL_EPILOGUE_GELU | 128) + CUBLASMP_MATMUL_EPILOGUE_GELU_BIAS "CUBLASMP_MATMUL_EPILOGUE_GELU_BIAS" = (CUBLASMP_MATMUL_EPILOGUE_GELU | CUBLASMP_MATMUL_EPILOGUE_BIAS) + CUBLASMP_MATMUL_EPILOGUE_GELU_AUX_BIAS "CUBLASMP_MATMUL_EPILOGUE_GELU_AUX_BIAS" = (CUBLASMP_MATMUL_EPILOGUE_GELU_AUX | CUBLASMP_MATMUL_EPILOGUE_BIAS) + CUBLASMP_MATMUL_EPILOGUE_DGELU "CUBLASMP_MATMUL_EPILOGUE_DGELU" = (64 | 128) + CUBLASMP_MATMUL_EPILOGUE_DGELU_BGRAD "CUBLASMP_MATMUL_EPILOGUE_DGELU_BGRAD" = (CUBLASMP_MATMUL_EPILOGUE_DGELU | 16) + CUBLASMP_MATMUL_EPILOGUE_BGRADA "CUBLASMP_MATMUL_EPILOGUE_BGRADA" = 256 + CUBLASMP_MATMUL_EPILOGUE_BGRADB "CUBLASMP_MATMUL_EPILOGUE_BGRADB" = 512 + +ctypedef enum cublasMpMatmulMatrixScale_t "cublasMpMatmulMatrixScale_t": + CUBLASMP_MATMUL_MATRIX_SCALE_SCALAR_FP32 "CUBLASMP_MATMUL_MATRIX_SCALE_SCALAR_FP32" = 0 + CUBLASMP_MATMUL_MATRIX_SCALE_VEC16_UE4M3 "CUBLASMP_MATMUL_MATRIX_SCALE_VEC16_UE4M3" = 1 + CUBLASMP_MATMUL_MATRIX_SCALE_VEC32_UE8M0 "CUBLASMP_MATMUL_MATRIX_SCALE_VEC32_UE8M0" = 2 + CUBLASMP_MATMUL_MATRIX_SCALE_OUTER_VEC_FP32 "CUBLASMP_MATMUL_MATRIX_SCALE_OUTER_VEC_FP32" = 3 + CUBLASMP_MATMUL_MATRIX_SCALE_VEC128_FP32 "CUBLASMP_MATMUL_MATRIX_SCALE_VEC128_FP32" = 4 + CUBLASMP_MATMUL_MATRIX_SCALE_BLK128x128_FP32 "CUBLASMP_MATMUL_MATRIX_SCALE_BLK128x128_FP32" = 5 + + +# types +cdef extern from *: + """ + #include + #include + #include + """ + ctypedef void* cudaStream_t 'cudaStream_t' + ctypedef int cudaDataType_t 'cudaDataType_t' + ctypedef int cudaDataType 'cudaDataType' + ctypedef int libraryPropertyType_t 'libraryPropertyType_t' + ctypedef int libraryPropertyType 'libraryPropertyType' + + ctypedef struct cuComplex: + pass + ctypedef struct cuDoubleComplex: + pass + + +ctypedef void* ncclComm_t 'ncclComm_t' +ctypedef void* cublasMpHandle_t 'cublasMpHandle_t' +ctypedef void* cublasMpGrid_t 'cublasMpGrid_t' +ctypedef void* cublasMpMatrixDescriptor_t 'cublasMpMatrixDescriptor_t' +ctypedef void* cublasMpMatmulDescriptor_t 'cublasMpMatmulDescriptor_t' +ctypedef void (*cublasMpLoggerCallback_t 'cublasMpLoggerCallback_t')( + int logLevel, + const char* functionName, + const char* message +) + + +############################################################################### +# Functions +############################################################################### + +cdef cublasMpStatus_t cublasMpCreate(cublasMpHandle_t* handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpDestroy(cublasMpHandle_t handle) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpStreamSet(cublasMpHandle_t handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpGetVersion(int* version) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpGridCreate(int64_t nprow, int64_t npcol, cublasMpGridLayout_t layout, ncclComm_t comm, cublasMpGrid_t* grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil 
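+# Note: cublasMpGridCreate maps an nprow x npcol process grid onto the ranks of the
+# given NCCL communicator (rank ordering per cublasMpGridLayout_t); the resulting
+# cublasMpGrid_t is consumed by cublasMpMatrixDescriptorCreate below and is released
+# with cublasMpGridDestroy.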
+cdef cublasMpStatus_t cublasMpGridDestroy(cublasMpGrid_t grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatrixDescriptorCreate(int64_t m, int64_t n, int64_t mb, int64_t nb, int64_t rsrc, int64_t csrc, int64_t lld, cudaDataType_t type, cublasMpGrid_t grid, cublasMpMatrixDescriptor_t* desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatrixDescriptorDestroy(cublasMpMatrixDescriptor_t desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatmulDescriptorCreate(cublasMpMatmulDescriptor_t* matmulDesc, cublasComputeType_t computeType) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatmulDescriptorDestroy(cublasMpMatmulDescriptor_t matmulDesc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatmulDescriptorAttributeSet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatmulDescriptorAttributeGet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, void* buf, size_t sizeInBytes, size_t* sizeWritten) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatmul_bufferSize(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, size_t* workspaceSizeInBytesOnDevice, size_t* workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasMpStatus_t cublasMpMatmul(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, void* d_work, size_t workspaceSizeInBytesOnDevice, void* h_work, size_t workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef int64_t cublasMpNumroc(int64_t n, int64_t nb, uint32_t iproc, uint32_t isrcproc, uint32_t nprocs) except?-42 nogil diff --git a/nvmath/bindings/cycublasMp.pyx b/nvmath/bindings/cycublasMp.pyx new file mode 100644 index 0000000..234907e --- /dev/null +++ b/nvmath/bindings/cycublasMp.pyx @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 0.5.0 to 0.6.0. Do not modify it directly. 
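+#
+# Each wrapper below forwards its arguments unchanged to the matching function in
+# ``._internal.cublasMp`` and returns its status; all wrappers are ``nogil`` and use
+# ``_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR`` as their ``except?`` sentinel value.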
+ +from ._internal cimport cublasMp as _cublasMp + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cublasMpStatus_t cublasMpCreate(cublasMpHandle_t* handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpCreate(handle, stream) + + +cdef cublasMpStatus_t cublasMpDestroy(cublasMpHandle_t handle) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpDestroy(handle) + + +cdef cublasMpStatus_t cublasMpStreamSet(cublasMpHandle_t handle, cudaStream_t stream) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpStreamSet(handle, stream) + + +cdef cublasMpStatus_t cublasMpGetVersion(int* version) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpGetVersion(version) + + +cdef cublasMpStatus_t cublasMpGridCreate(int64_t nprow, int64_t npcol, cublasMpGridLayout_t layout, ncclComm_t comm, cublasMpGrid_t* grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpGridCreate(nprow, npcol, layout, comm, grid) + + +cdef cublasMpStatus_t cublasMpGridDestroy(cublasMpGrid_t grid) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpGridDestroy(grid) + + +cdef cublasMpStatus_t cublasMpMatrixDescriptorCreate(int64_t m, int64_t n, int64_t mb, int64_t nb, int64_t rsrc, int64_t csrc, int64_t lld, cudaDataType_t type, cublasMpGrid_t grid, cublasMpMatrixDescriptor_t* desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatrixDescriptorCreate(m, n, mb, nb, rsrc, csrc, lld, type, grid, desc) + + +cdef cublasMpStatus_t cublasMpMatrixDescriptorDestroy(cublasMpMatrixDescriptor_t desc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatrixDescriptorDestroy(desc) + + +cdef cublasMpStatus_t cublasMpMatmulDescriptorCreate(cublasMpMatmulDescriptor_t* matmulDesc, cublasComputeType_t computeType) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatmulDescriptorCreate(matmulDesc, computeType) + + +cdef cublasMpStatus_t cublasMpMatmulDescriptorDestroy(cublasMpMatmulDescriptor_t matmulDesc) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatmulDescriptorDestroy(matmulDesc) + + +cdef cublasMpStatus_t cublasMpMatmulDescriptorAttributeSet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatmulDescriptorAttributeSet(matmulDesc, attr, buf, sizeInBytes) + + +cdef cublasMpStatus_t cublasMpMatmulDescriptorAttributeGet(cublasMpMatmulDescriptor_t matmulDesc, cublasMpMatmulDescriptorAttribute_t attr, void* buf, size_t sizeInBytes, size_t* sizeWritten) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatmulDescriptorAttributeGet(matmulDesc, attr, buf, sizeInBytes, sizeWritten) + + +cdef cublasMpStatus_t cublasMpMatmul_bufferSize(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, 
void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, size_t* workspaceSizeInBytesOnDevice, size_t* workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatmul_bufferSize(handle, matmulDesc, m, n, k, alpha, a, ia, ja, descA, b, ib, jb, descB, beta, c, ic, jc, descC, d, id, jd, descD, workspaceSizeInBytesOnDevice, workspaceSizeInBytesOnHost) + + +cdef cublasMpStatus_t cublasMpMatmul(cublasMpHandle_t handle, cublasMpMatmulDescriptor_t matmulDesc, int64_t m, int64_t n, int64_t k, const void* alpha, const void* a, int64_t ia, int64_t ja, cublasMpMatrixDescriptor_t descA, const void* b, int64_t ib, int64_t jb, cublasMpMatrixDescriptor_t descB, const void* beta, const void* c, int64_t ic, int64_t jc, cublasMpMatrixDescriptor_t descC, void* d, int64_t id, int64_t jd, cublasMpMatrixDescriptor_t descD, void* d_work, size_t workspaceSizeInBytesOnDevice, void* h_work, size_t workspaceSizeInBytesOnHost) except?_CUBLASMPSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublasMp._cublasMpMatmul(handle, matmulDesc, m, n, k, alpha, a, ia, ja, descA, b, ib, jb, descB, beta, c, ic, jc, descC, d, id, jd, descD, d_work, workspaceSizeInBytesOnDevice, h_work, workspaceSizeInBytesOnHost) + + +cdef int64_t cublasMpNumroc(int64_t n, int64_t nb, uint32_t iproc, uint32_t isrcproc, uint32_t nprocs) except?-42 nogil: + return _cublasMp._cublasMpNumroc(n, nb, iproc, isrcproc, nprocs) diff --git a/nvmath/bindings/cycudss.pxd b/nvmath/bindings/cycudss.pxd index 234d53c..e09b414 100644 --- a/nvmath/bindings/cycudss.pxd +++ b/nvmath/bindings/cycudss.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.5.0. Do not modify it directly. +# This code was automatically generated with version 0.7.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. 
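`cublasMpNumroc`, bound just above, follows the ScaLAPACK-style `numroc` convention: it reports how many rows or columns of a block-cyclically distributed dimension land on a given process. A minimal pure-Python sketch of that conventional formula, for illustration only (not the library's implementation):

```python
def numroc(n: int, nb: int, iproc: int, isrcproc: int, nprocs: int) -> int:
    """Count of elements of an n-long dimension, split into nb-sized blocks and
    dealt round-robin over nprocs processes, that land on process iproc."""
    mydist = (nprocs + iproc - isrcproc) % nprocs  # distance from the source process
    nblocks = n // nb                              # number of full blocks
    num = (nblocks // nprocs) * nb                 # complete rounds of blocks
    extra = nblocks % nprocs                       # processes holding one extra full block
    if mydist < extra:
        num += nb
    elif mydist == extra:
        num += n % nb                              # trailing partial block, if any
    return num

# Example: 10 columns in blocks of 2 across a 2-process grid dimension.
assert numroc(10, 2, 0, 0, 2) == 6
assert numroc(10, 2, 1, 0, 2) == 4
```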
from libc.stdint cimport int64_t @@ -22,7 +22,8 @@ ctypedef enum cudssConfigParam_t "cudssConfigParam_t": CUDSS_CONFIG_REORDERING_ALG "CUDSS_CONFIG_REORDERING_ALG" CUDSS_CONFIG_FACTORIZATION_ALG "CUDSS_CONFIG_FACTORIZATION_ALG" CUDSS_CONFIG_SOLVE_ALG "CUDSS_CONFIG_SOLVE_ALG" - CUDSS_CONFIG_MATCHING_TYPE "CUDSS_CONFIG_MATCHING_TYPE" + CUDSS_CONFIG_USE_MATCHING "CUDSS_CONFIG_USE_MATCHING" + CUDSS_CONFIG_MATCHING_ALG "CUDSS_CONFIG_MATCHING_ALG" CUDSS_CONFIG_SOLVE_MODE "CUDSS_CONFIG_SOLVE_MODE" CUDSS_CONFIG_IR_N_STEPS "CUDSS_CONFIG_IR_N_STEPS" CUDSS_CONFIG_IR_TOL "CUDSS_CONFIG_IR_TOL" @@ -36,6 +37,14 @@ ctypedef enum cudssConfigParam_t "cudssConfigParam_t": CUDSS_CONFIG_HOST_NTHREADS "CUDSS_CONFIG_HOST_NTHREADS" CUDSS_CONFIG_HYBRID_EXECUTE_MODE "CUDSS_CONFIG_HYBRID_EXECUTE_MODE" CUDSS_CONFIG_PIVOT_EPSILON_ALG "CUDSS_CONFIG_PIVOT_EPSILON_ALG" + CUDSS_CONFIG_ND_NLEVELS "CUDSS_CONFIG_ND_NLEVELS" + CUDSS_CONFIG_UBATCH_SIZE "CUDSS_CONFIG_UBATCH_SIZE" + CUDSS_CONFIG_UBATCH_INDEX "CUDSS_CONFIG_UBATCH_INDEX" + CUDSS_CONFIG_USE_SUPERPANELS "CUDSS_CONFIG_USE_SUPERPANELS" + CUDSS_CONFIG_DEVICE_COUNT "CUDSS_CONFIG_DEVICE_COUNT" + CUDSS_CONFIG_DEVICE_INDICES "CUDSS_CONFIG_DEVICE_INDICES" + CUDSS_CONFIG_SCHUR_MODE "CUDSS_CONFIG_SCHUR_MODE" + CUDSS_CONFIG_DETERMINISTIC_MODE "CUDSS_CONFIG_DETERMINISTIC_MODE" ctypedef enum cudssDataParam_t "cudssDataParam_t": CUDSS_DATA_INFO "CUDSS_DATA_INFO" @@ -51,15 +60,30 @@ ctypedef enum cudssDataParam_t "cudssDataParam_t": CUDSS_DATA_HYBRID_DEVICE_MEMORY_MIN "CUDSS_DATA_HYBRID_DEVICE_MEMORY_MIN" CUDSS_DATA_COMM "CUDSS_DATA_COMM" CUDSS_DATA_MEMORY_ESTIMATES "CUDSS_DATA_MEMORY_ESTIMATES" + CUDSS_DATA_PERM_MATCHING "CUDSS_DATA_PERM_MATCHING" + CUDSS_DATA_SCALE_ROW "CUDSS_DATA_SCALE_ROW" + CUDSS_DATA_SCALE_COL "CUDSS_DATA_SCALE_COL" + CUDSS_DATA_NSUPERPANELS "CUDSS_DATA_NSUPERPANELS" + CUDSS_DATA_USER_SCHUR_INDICES "CUDSS_DATA_USER_SCHUR_INDICES" + CUDSS_DATA_SCHUR_SHAPE "CUDSS_DATA_SCHUR_SHAPE" + CUDSS_DATA_SCHUR_MATRIX "CUDSS_DATA_SCHUR_MATRIX" + CUDSS_DATA_USER_ELIMINATION_TREE "CUDSS_DATA_USER_ELIMINATION_TREE" + CUDSS_DATA_ELIMINATION_TREE "CUDSS_DATA_ELIMINATION_TREE" + CUDSS_DATA_USER_HOST_INTERRUPT "CUDSS_DATA_USER_HOST_INTERRUPT" ctypedef enum cudssPhase_t "cudssPhase_t": - CUDSS_PHASE_ANALYSIS "CUDSS_PHASE_ANALYSIS" = 1 - CUDSS_PHASE_FACTORIZATION "CUDSS_PHASE_FACTORIZATION" = 2 - CUDSS_PHASE_REFACTORIZATION "CUDSS_PHASE_REFACTORIZATION" = 4 - CUDSS_PHASE_SOLVE "CUDSS_PHASE_SOLVE" = 8 - CUDSS_PHASE_SOLVE_FWD "CUDSS_PHASE_SOLVE_FWD" = 16 - CUDSS_PHASE_SOLVE_DIAG "CUDSS_PHASE_SOLVE_DIAG" = 32 - CUDSS_PHASE_SOLVE_BWD "CUDSS_PHASE_SOLVE_BWD" = 64 + CUDSS_PHASE_REORDERING "CUDSS_PHASE_REORDERING" = (1 << 0) + CUDSS_PHASE_SYMBOLIC_FACTORIZATION "CUDSS_PHASE_SYMBOLIC_FACTORIZATION" = (1 << 1) + CUDSS_PHASE_ANALYSIS "CUDSS_PHASE_ANALYSIS" = (CUDSS_PHASE_REORDERING | CUDSS_PHASE_SYMBOLIC_FACTORIZATION) + CUDSS_PHASE_FACTORIZATION "CUDSS_PHASE_FACTORIZATION" = (1 << 2) + CUDSS_PHASE_REFACTORIZATION "CUDSS_PHASE_REFACTORIZATION" = (1 << 3) + CUDSS_PHASE_SOLVE_FWD_PERM "CUDSS_PHASE_SOLVE_FWD_PERM" = (1 << 4) + CUDSS_PHASE_SOLVE_FWD "CUDSS_PHASE_SOLVE_FWD" = (1 << 5) + CUDSS_PHASE_SOLVE_DIAG "CUDSS_PHASE_SOLVE_DIAG" = (1 << 6) + CUDSS_PHASE_SOLVE_BWD "CUDSS_PHASE_SOLVE_BWD" = (1 << 7) + CUDSS_PHASE_SOLVE_BWD_PERM "CUDSS_PHASE_SOLVE_BWD_PERM" = (1 << 8) + CUDSS_PHASE_SOLVE_REFINEMENT "CUDSS_PHASE_SOLVE_REFINEMENT" = (1 << 9) + CUDSS_PHASE_SOLVE "CUDSS_PHASE_SOLVE" = (((((CUDSS_PHASE_SOLVE_FWD_PERM | CUDSS_PHASE_SOLVE_FWD) | CUDSS_PHASE_SOLVE_DIAG) | CUDSS_PHASE_SOLVE_BWD) | 
CUDSS_PHASE_SOLVE_BWD_PERM) | CUDSS_PHASE_SOLVE_REFINEMENT) ctypedef enum cudssStatus_t "cudssStatus_t": CUDSS_STATUS_SUCCESS "CUDSS_STATUS_SUCCESS" = 0 @@ -96,6 +120,8 @@ ctypedef enum cudssAlgType_t "cudssAlgType_t": CUDSS_ALG_1 "CUDSS_ALG_1" CUDSS_ALG_2 "CUDSS_ALG_2" CUDSS_ALG_3 "CUDSS_ALG_3" + CUDSS_ALG_4 "CUDSS_ALG_4" + CUDSS_ALG_5 "CUDSS_ALG_5" ctypedef enum cudssPivotType_t "cudssPivotType_t": CUDSS_PIVOT_COL "CUDSS_PIVOT_COL" @@ -106,6 +132,7 @@ ctypedef enum cudssMatrixFormat_t "cudssMatrixFormat_t": CUDSS_MFORMAT_DENSE "CUDSS_MFORMAT_DENSE" = 1 CUDSS_MFORMAT_CSR "CUDSS_MFORMAT_CSR" = 2 CUDSS_MFORMAT_BATCH "CUDSS_MFORMAT_BATCH" = 4 + CUDSS_MFORMAT_DISTRIBUTED "CUDSS_MFORMAT_DISTRIBUTED" = 8 # types @@ -162,7 +189,7 @@ cdef cudssStatus_t cudssConfigSet(cudssConfig_t config, cudssConfigParam_t param cdef cudssStatus_t cudssConfigGet(cudssConfig_t config, cudssConfigParam_t param, void* value, size_t sizeInBytes, size_t* sizeWritten) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssDataSet(cudssHandle_t handle, cudssData_t data, cudssDataParam_t param, void* value, size_t sizeInBytes) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssDataGet(cudssHandle_t handle, cudssData_t data, cudssDataParam_t param, void* value, size_t sizeInBytes, size_t* sizeWritten) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil -cdef cudssStatus_t cudssExecute(cudssHandle_t handle, cudssPhase_t phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t cudssExecute(cudssHandle_t handle, int phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssSetStream(cudssHandle_t handle, cudaStream_t stream) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssSetCommLayer(cudssHandle_t handle, const char* commLibFileName) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssSetThreadingLayer(cudssHandle_t handle, const char* thrLibFileName) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil @@ -171,6 +198,7 @@ cdef cudssStatus_t cudssConfigDestroy(cudssConfig_t solverConfig) except?_CUDSSS cdef cudssStatus_t cudssDataCreate(cudssHandle_t handle, cudssData_t* solverData) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssDataDestroy(cudssHandle_t handle, cudssData_t solverData) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssCreate(cudssHandle_t* handle) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t cudssCreateMg(cudssHandle_t* handle_pt, int device_count, int* device_indices) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssDestroy(cudssHandle_t handle) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssGetProperty(libraryPropertyType propertyType, int* value) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssMatrixCreateDn(cudssMatrix_t* matrix, int64_t nrows, int64_t ncols, int64_t ld, void* values, cudaDataType_t valueType, cudssLayout_t layout) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil @@ -187,5 +215,7 @@ cdef cudssStatus_t cudssMatrixGetBatchCsr(cudssMatrix_t matrix, int64_t* batchCo cdef cudssStatus_t cudssMatrixSetBatchValues(cudssMatrix_t matrix, void** values) 
except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssMatrixSetBatchCsrPointers(cudssMatrix_t matrix, void** rowOffsets, void** rowEnd, void** colIndices, void** values) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssMatrixGetFormat(cudssMatrix_t matrix, int* format) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t cudssMatrixSetDistributionRow1d(cudssMatrix_t matrix, int64_t first_row, int64_t last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cudssStatus_t cudssMatrixGetDistributionRow1d(cudssMatrix_t matrix, int64_t* first_row, int64_t* last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssGetDeviceMemHandler(cudssHandle_t handle, cudssDeviceMemHandler_t* handler) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cudssStatus_t cudssSetDeviceMemHandler(cudssHandle_t handle, const cudssDeviceMemHandler_t* handler) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/cycudss.pyx b/nvmath/bindings/cycudss.pyx index e1c0a5a..89f0266 100644 --- a/nvmath/bindings/cycudss.pyx +++ b/nvmath/bindings/cycudss.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.5.0. Do not modify it directly. +# This code was automatically generated with version 0.7.0. Do not modify it directly. from ._internal cimport cudss as _cudss @@ -27,7 +27,7 @@ cdef cudssStatus_t cudssDataGet(cudssHandle_t handle, cudssData_t data, cudssDat return _cudss._cudssDataGet(handle, data, param, value, sizeInBytes, sizeWritten) -cdef cudssStatus_t cudssExecute(cudssHandle_t handle, cudssPhase_t phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: +cdef cudssStatus_t cudssExecute(cudssHandle_t handle, int phase, cudssConfig_t solverConfig, cudssData_t solverData, cudssMatrix_t inputMatrix, cudssMatrix_t solution, cudssMatrix_t rhs) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: return _cudss._cudssExecute(handle, phase, solverConfig, solverData, inputMatrix, solution, rhs) @@ -63,6 +63,10 @@ cdef cudssStatus_t cudssCreate(cudssHandle_t* handle) except?_CUDSSSTATUS_T_INTE return _cudss._cudssCreate(handle) +cdef cudssStatus_t cudssCreateMg(cudssHandle_t* handle_pt, int device_count, int* device_indices) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cudss._cudssCreateMg(handle_pt, device_count, device_indices) + + cdef cudssStatus_t cudssDestroy(cudssHandle_t handle) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: return _cudss._cudssDestroy(handle) @@ -127,6 +131,14 @@ cdef cudssStatus_t cudssMatrixGetFormat(cudssMatrix_t matrix, int* format) excep return _cudss._cudssMatrixGetFormat(matrix, format) +cdef cudssStatus_t cudssMatrixSetDistributionRow1d(cudssMatrix_t matrix, int64_t first_row, int64_t last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cudss._cudssMatrixSetDistributionRow1d(matrix, first_row, last_row) + + +cdef cudssStatus_t cudssMatrixGetDistributionRow1d(cudssMatrix_t matrix, int64_t* first_row, int64_t* last_row) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cudss._cudssMatrixGetDistributionRow1d(matrix, first_row, last_row) + + cdef cudssStatus_t cudssGetDeviceMemHandler(cudssHandle_t handle, cudssDeviceMemHandler_t* handler) except?_CUDSSSTATUS_T_INTERNAL_LOADING_ERROR nogil: return 
_cudss._cudssGetDeviceMemHandler(handle, handler) diff --git a/nvmath/bindings/cycufft.pxd b/nvmath/bindings/cycufft.pxd index b8dd2fe..3799763 100644 --- a/nvmath/bindings/cycufft.pxd +++ b/nvmath/bindings/cycufft.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.1. Do not modify it directly. # This layer exposes the C header to Cython as-is. ############################################################################### @@ -25,14 +25,18 @@ ctypedef enum cufftResult "cufftResult": CUFFT_SETUP_FAILED "CUFFT_SETUP_FAILED" = 0x7 CUFFT_INVALID_SIZE "CUFFT_INVALID_SIZE" = 0x8 CUFFT_UNALIGNED_DATA "CUFFT_UNALIGNED_DATA" = 0x9 - CUFFT_INCOMPLETE_PARAMETER_LIST "CUFFT_INCOMPLETE_PARAMETER_LIST" = 0xA CUFFT_INVALID_DEVICE "CUFFT_INVALID_DEVICE" = 0xB - CUFFT_PARSE_ERROR "CUFFT_PARSE_ERROR" = 0xC CUFFT_NO_WORKSPACE "CUFFT_NO_WORKSPACE" = 0xD CUFFT_NOT_IMPLEMENTED "CUFFT_NOT_IMPLEMENTED" = 0xE - CUFFT_LICENSE_ERROR "CUFFT_LICENSE_ERROR" = 0x0F CUFFT_NOT_SUPPORTED "CUFFT_NOT_SUPPORTED" = 0x10 + CUFFT_MISSING_DEPENDENCY "CUFFT_MISSING_DEPENDENCY" = 0x11 + CUFFT_NVRTC_FAILURE "CUFFT_NVRTC_FAILURE" = 0x12 + CUFFT_NVJITLINK_FAILURE "CUFFT_NVJITLINK_FAILURE" = 0x13 + CUFFT_NVSHMEM_FAILURE "CUFFT_NVSHMEM_FAILURE" = 0x14 _CUFFTRESULT_INTERNAL_LOADING_ERROR "_CUFFTRESULT_INTERNAL_LOADING_ERROR" = -42 + CUFFT_INCOMPLETE_PARAMETER_LIST "CUFFT_INCOMPLETE_PARAMETER_LIST" = 0xA + CUFFT_PARSE_ERROR "CUFFT_PARSE_ERROR" = 0xC + CUFFT_LICENSE_ERROR "CUFFT_LICENSE_ERROR" = 0x0F ctypedef enum cufftType "cufftType": CUFFT_R2C "CUFFT_R2C" = 0x2a @@ -313,8 +317,9 @@ cdef cufftResult cufftXtGetSizeMany(cufftHandle plan, int rank, long long int* n cdef cufftResult cufftXtExec(cufftHandle plan, void* input, void* output, int direction) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult cufftXtExecDescriptor(cufftHandle plan, cudaLibXtDesc* input, cudaLibXtDesc* output, int direction) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t* workSize) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil -cdef cufftResult cufftXtSetJITCallback(cufftHandle plan, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil +cdef cufftResult cufftXtSetJITCallback(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult cufftSetPlanPropertyInt64(cufftHandle plan, cufftProperty property, const long long int inputValueInt) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty property, long long int* returnPtrValue) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil cdef cufftResult cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil +cdef cufftResult __cufftXtSetJITCallback_12_7(cufftHandle plan, const char* lto_callback_symbol_name, const void* 
lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/cycufft.pyx b/nvmath/bindings/cycufft.pyx index e9047b3..43c0372 100644 --- a/nvmath/bindings/cycufft.pyx +++ b/nvmath/bindings/cycufft.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.1. Do not modify it directly. from ._internal cimport cufft as _cufft @@ -215,8 +215,8 @@ cdef cufftResult cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolic return _cufft._cufftXtSetWorkAreaPolicy(plan, policy, workSize) -cdef cufftResult cufftXtSetJITCallback(cufftHandle plan, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: - return _cufft._cufftXtSetJITCallback(plan, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) +cdef cufftResult cufftXtSetJITCallback(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: + return _cufft._cufftXtSetJITCallback(plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) cdef cufftResult cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: @@ -233,3 +233,7 @@ cdef cufftResult cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty prope cdef cufftResult cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: return _cufft._cufftResetPlanProperty(plan, property) + + +cdef cufftResult __cufftXtSetJITCallback_12_7(cufftHandle plan, const char* lto_callback_symbol_name, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except?_CUFFTRESULT_INTERNAL_LOADING_ERROR nogil: + return _cufft.___cufftXtSetJITCallback_12_7(plan, lto_callback_symbol_name, lto_callback_fatbin, lto_callback_fatbin_size, type, caller_info) diff --git a/nvmath/bindings/cycutensor.pxd b/nvmath/bindings/cycutensor.pxd new file mode 100644 index 0000000..75e5894 --- /dev/null +++ b/nvmath/bindings/cycutensor.pxd @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 2.3.1. Do not modify it directly. +# This layer exposes the C header to Cython as-is. + +from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t +from libc.stdio cimport FILE + + +############################################################################### +# Types (structs, enums, ...) 
+############################################################################### + +# enums +ctypedef enum cutensorOperator_t "cutensorOperator_t": + CUTENSOR_OP_IDENTITY "CUTENSOR_OP_IDENTITY" = 1 + CUTENSOR_OP_SQRT "CUTENSOR_OP_SQRT" = 2 + CUTENSOR_OP_RELU "CUTENSOR_OP_RELU" = 8 + CUTENSOR_OP_CONJ "CUTENSOR_OP_CONJ" = 9 + CUTENSOR_OP_RCP "CUTENSOR_OP_RCP" = 10 + CUTENSOR_OP_SIGMOID "CUTENSOR_OP_SIGMOID" = 11 + CUTENSOR_OP_TANH "CUTENSOR_OP_TANH" = 12 + CUTENSOR_OP_EXP "CUTENSOR_OP_EXP" = 22 + CUTENSOR_OP_LOG "CUTENSOR_OP_LOG" = 23 + CUTENSOR_OP_ABS "CUTENSOR_OP_ABS" = 24 + CUTENSOR_OP_NEG "CUTENSOR_OP_NEG" = 25 + CUTENSOR_OP_SIN "CUTENSOR_OP_SIN" = 26 + CUTENSOR_OP_COS "CUTENSOR_OP_COS" = 27 + CUTENSOR_OP_TAN "CUTENSOR_OP_TAN" = 28 + CUTENSOR_OP_SINH "CUTENSOR_OP_SINH" = 29 + CUTENSOR_OP_COSH "CUTENSOR_OP_COSH" = 30 + CUTENSOR_OP_ASIN "CUTENSOR_OP_ASIN" = 31 + CUTENSOR_OP_ACOS "CUTENSOR_OP_ACOS" = 32 + CUTENSOR_OP_ATAN "CUTENSOR_OP_ATAN" = 33 + CUTENSOR_OP_ASINH "CUTENSOR_OP_ASINH" = 34 + CUTENSOR_OP_ACOSH "CUTENSOR_OP_ACOSH" = 35 + CUTENSOR_OP_ATANH "CUTENSOR_OP_ATANH" = 36 + CUTENSOR_OP_CEIL "CUTENSOR_OP_CEIL" = 37 + CUTENSOR_OP_FLOOR "CUTENSOR_OP_FLOOR" = 38 + CUTENSOR_OP_MISH "CUTENSOR_OP_MISH" = 39 + CUTENSOR_OP_SWISH "CUTENSOR_OP_SWISH" = 40 + CUTENSOR_OP_SOFT_PLUS "CUTENSOR_OP_SOFT_PLUS" = 41 + CUTENSOR_OP_SOFT_SIGN "CUTENSOR_OP_SOFT_SIGN" = 42 + CUTENSOR_OP_ADD "CUTENSOR_OP_ADD" = 3 + CUTENSOR_OP_MUL "CUTENSOR_OP_MUL" = 5 + CUTENSOR_OP_MAX "CUTENSOR_OP_MAX" = 6 + CUTENSOR_OP_MIN "CUTENSOR_OP_MIN" = 7 + CUTENSOR_OP_UNKNOWN "CUTENSOR_OP_UNKNOWN" = 126 + +ctypedef enum cutensorStatus_t "cutensorStatus_t": + CUTENSOR_STATUS_SUCCESS "CUTENSOR_STATUS_SUCCESS" = 0 + CUTENSOR_STATUS_NOT_INITIALIZED "CUTENSOR_STATUS_NOT_INITIALIZED" = 1 + CUTENSOR_STATUS_ALLOC_FAILED "CUTENSOR_STATUS_ALLOC_FAILED" = 3 + CUTENSOR_STATUS_INVALID_VALUE "CUTENSOR_STATUS_INVALID_VALUE" = 7 + CUTENSOR_STATUS_ARCH_MISMATCH "CUTENSOR_STATUS_ARCH_MISMATCH" = 8 + CUTENSOR_STATUS_MAPPING_ERROR "CUTENSOR_STATUS_MAPPING_ERROR" = 11 + CUTENSOR_STATUS_EXECUTION_FAILED "CUTENSOR_STATUS_EXECUTION_FAILED" = 13 + CUTENSOR_STATUS_INTERNAL_ERROR "CUTENSOR_STATUS_INTERNAL_ERROR" = 14 + CUTENSOR_STATUS_NOT_SUPPORTED "CUTENSOR_STATUS_NOT_SUPPORTED" = 15 + CUTENSOR_STATUS_LICENSE_ERROR "CUTENSOR_STATUS_LICENSE_ERROR" = 16 + CUTENSOR_STATUS_CUBLAS_ERROR "CUTENSOR_STATUS_CUBLAS_ERROR" = 17 + CUTENSOR_STATUS_CUDA_ERROR "CUTENSOR_STATUS_CUDA_ERROR" = 18 + CUTENSOR_STATUS_INSUFFICIENT_WORKSPACE "CUTENSOR_STATUS_INSUFFICIENT_WORKSPACE" = 19 + CUTENSOR_STATUS_INSUFFICIENT_DRIVER "CUTENSOR_STATUS_INSUFFICIENT_DRIVER" = 20 + CUTENSOR_STATUS_IO_ERROR "CUTENSOR_STATUS_IO_ERROR" = 21 + _CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR "_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR" = -42 + +ctypedef enum cutensorAlgo_t "cutensorAlgo_t": + CUTENSOR_ALGO_DEFAULT_PATIENT "CUTENSOR_ALGO_DEFAULT_PATIENT" = -(6) + CUTENSOR_ALGO_GETT "CUTENSOR_ALGO_GETT" = -(4) + CUTENSOR_ALGO_TGETT "CUTENSOR_ALGO_TGETT" = -(3) + CUTENSOR_ALGO_TTGT "CUTENSOR_ALGO_TTGT" = -(2) + CUTENSOR_ALGO_DEFAULT "CUTENSOR_ALGO_DEFAULT" = -(1) + +ctypedef enum cutensorWorksizePreference_t "cutensorWorksizePreference_t": + CUTENSOR_WORKSPACE_MIN "CUTENSOR_WORKSPACE_MIN" = 1 + CUTENSOR_WORKSPACE_DEFAULT "CUTENSOR_WORKSPACE_DEFAULT" = 2 + CUTENSOR_WORKSPACE_MAX "CUTENSOR_WORKSPACE_MAX" = 3 + +ctypedef enum cutensorOperationDescriptorAttribute_t "cutensorOperationDescriptorAttribute_t": + CUTENSOR_OPERATION_DESCRIPTOR_TAG "CUTENSOR_OPERATION_DESCRIPTOR_TAG" = 0 + 
CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE "CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE" = 1 + CUTENSOR_OPERATION_DESCRIPTOR_FLOPS "CUTENSOR_OPERATION_DESCRIPTOR_FLOPS" = 2 + CUTENSOR_OPERATION_DESCRIPTOR_MOVED_BYTES "CUTENSOR_OPERATION_DESCRIPTOR_MOVED_BYTES" = 3 + CUTENSOR_OPERATION_DESCRIPTOR_PADDING_LEFT "CUTENSOR_OPERATION_DESCRIPTOR_PADDING_LEFT" = 4 + CUTENSOR_OPERATION_DESCRIPTOR_PADDING_RIGHT "CUTENSOR_OPERATION_DESCRIPTOR_PADDING_RIGHT" = 5 + CUTENSOR_OPERATION_DESCRIPTOR_PADDING_VALUE "CUTENSOR_OPERATION_DESCRIPTOR_PADDING_VALUE" = 6 + +ctypedef enum cutensorPlanPreferenceAttribute_t "cutensorPlanPreferenceAttribute_t": + CUTENSOR_PLAN_PREFERENCE_AUTOTUNE_MODE "CUTENSOR_PLAN_PREFERENCE_AUTOTUNE_MODE" = 0 + CUTENSOR_PLAN_PREFERENCE_CACHE_MODE "CUTENSOR_PLAN_PREFERENCE_CACHE_MODE" = 1 + CUTENSOR_PLAN_PREFERENCE_INCREMENTAL_COUNT "CUTENSOR_PLAN_PREFERENCE_INCREMENTAL_COUNT" = 2 + CUTENSOR_PLAN_PREFERENCE_ALGO "CUTENSOR_PLAN_PREFERENCE_ALGO" = 3 + CUTENSOR_PLAN_PREFERENCE_KERNEL_RANK "CUTENSOR_PLAN_PREFERENCE_KERNEL_RANK" = 4 + CUTENSOR_PLAN_PREFERENCE_JIT "CUTENSOR_PLAN_PREFERENCE_JIT" = 5 + +ctypedef enum cutensorAutotuneMode_t "cutensorAutotuneMode_t": + CUTENSOR_AUTOTUNE_MODE_NONE "CUTENSOR_AUTOTUNE_MODE_NONE" = 0 + CUTENSOR_AUTOTUNE_MODE_INCREMENTAL "CUTENSOR_AUTOTUNE_MODE_INCREMENTAL" = 1 + +ctypedef enum cutensorJitMode_t "cutensorJitMode_t": + CUTENSOR_JIT_MODE_NONE "CUTENSOR_JIT_MODE_NONE" = 0 + CUTENSOR_JIT_MODE_DEFAULT "CUTENSOR_JIT_MODE_DEFAULT" = 1 + +ctypedef enum cutensorCacheMode_t "cutensorCacheMode_t": + CUTENSOR_CACHE_MODE_NONE "CUTENSOR_CACHE_MODE_NONE" = 0 + CUTENSOR_CACHE_MODE_PEDANTIC "CUTENSOR_CACHE_MODE_PEDANTIC" = 1 + +ctypedef enum cutensorPlanAttribute_t "cutensorPlanAttribute_t": + CUTENSOR_PLAN_REQUIRED_WORKSPACE "CUTENSOR_PLAN_REQUIRED_WORKSPACE" = 0 + + +# types +cdef extern from *: + """ + #include + #include + #include + """ + ctypedef void* cudaStream_t 'cudaStream_t' + ctypedef int cudaDataType_t 'cudaDataType_t' + ctypedef int cudaDataType 'cudaDataType' + ctypedef int libraryPropertyType_t 'libraryPropertyType_t' + ctypedef int libraryPropertyType 'libraryPropertyType' + + +ctypedef cudaDataType_t cutensorDataType_t 'cutensorDataType_t' +ctypedef void* cutensorComputeDescriptor_t 'cutensorComputeDescriptor_t' +ctypedef void* cutensorOperationDescriptor_t 'cutensorOperationDescriptor_t' +ctypedef void* cutensorPlan_t 'cutensorPlan_t' +ctypedef void* cutensorPlanPreference_t 'cutensorPlanPreference_t' +ctypedef void* cutensorHandle_t 'cutensorHandle_t' +ctypedef void* cutensorTensorDescriptor_t 'cutensorTensorDescriptor_t' +ctypedef void* cutensorBlockSparseTensorDescriptor_t 'cutensorBlockSparseTensorDescriptor_t' +ctypedef void (*cutensorLoggerCallback_t 'cutensorLoggerCallback_t')( + int32_t logLevel, + const char* functionName, + const char* message +) + + +############################################################################### +# Functions +############################################################################### + +cdef cutensorStatus_t cutensorCreate(cutensorHandle_t* handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorDestroy(cutensorHandle_t handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorHandleResizePlanCache(cutensorHandle_t handle, const uint32_t numEntries) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorHandleWritePlanCacheToFile(const cutensorHandle_t handle, const char filename[]) 
except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorHandleReadPlanCacheFromFile(cutensorHandle_t handle, const char filename[], uint32_t* numCachelinesRead) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorWriteKernelCacheToFile(const cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorReadKernelCacheFromFile(cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreateTensorDescriptor(const cutensorHandle_t handle, cutensorTensorDescriptor_t* desc, const uint32_t numModes, const int64_t extent[], const int64_t stride[], cudaDataType_t dataType, uint32_t alignmentRequirement) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorDestroyTensorDescriptor(cutensorTensorDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreateElementwiseTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAB, cutensorOperator_t opABC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorElementwiseTrinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* beta, const void* B, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreateElementwiseBinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorElementwiseBinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreatePermutation(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorPermute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, void* B, const cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreateContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const 
cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorDestroyOperationDescriptor(cutensorOperationDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorOperationDescriptorSetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorOperationDescriptorGetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreatePlanPreference(const cutensorHandle_t handle, cutensorPlanPreference_t* pref, cutensorAlgo_t algo, cutensorJitMode_t jitMode) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorDestroyPlanPreference(cutensorPlanPreference_t pref) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorPlanPreferenceSetAttribute(const cutensorHandle_t handle, cutensorPlanPreference_t pref, cutensorPlanPreferenceAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorPlanGetAttribute(const cutensorHandle_t handle, const cutensorPlan_t plan, cutensorPlanAttribute_t attr, void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorEstimateWorkspaceSize(const cutensorHandle_t handle, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t planPref, const cutensorWorksizePreference_t workspacePref, uint64_t* workspaceSizeEstimate) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreatePlan(const cutensorHandle_t handle, cutensorPlan_t* plan, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t pref, uint64_t workspaceSizeLimit) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorDestroyPlan(cutensorPlan_t plan) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreateReduction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opReduce, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorReduce(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t 
cutensorCreateContractionTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opD, const cutensorTensorDescriptor_t descE, const int32_t modeE[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorContractTrinary(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* C, const void* beta, const void* D, void* E, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreateBlockSparseTensorDescriptor(cutensorHandle_t handle, cutensorBlockSparseTensorDescriptor_t* desc, const uint32_t numModes, const uint64_t numNonZeroBlocks, const uint32_t numSectionsPerMode[], const int64_t extent[], const int32_t nonZeroCoordinates[], const int64_t stride[], cudaDataType_t dataType) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorDestroyBlockSparseTensorDescriptor(cutensorBlockSparseTensorDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorCreateBlockSparseContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorBlockSparseTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorBlockSparseTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorBlockSparseTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorBlockSparseTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorBlockSparseContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* const A[], const void* const B[], const void* beta, const void* const C[], void* const D[], void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef const char* cutensorGetErrorString(const cutensorStatus_t error) except?NULL nogil +cdef size_t cutensorGetVersion() except?0 nogil +cdef size_t cutensorGetCudartVersion() except?0 nogil +cdef cutensorStatus_t cutensorLoggerSetCallback(cutensorLoggerCallback_t callback) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorLoggerSetFile(FILE* file) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorLoggerOpenFile(const char* logFile) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorLoggerSetLevel(int32_t level) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorLoggerSetMask(int32_t mask) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cutensorStatus_t cutensorLoggerForceDisable() except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/cycutensor.pyx b/nvmath/bindings/cycutensor.pyx new file mode 100644 index 0000000..df39ec6 --- /dev/null +++ b/nvmath/bindings/cycutensor.pyx @@ -0,0 +1,187 @@ +# Copyright (c) 2025, 
NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 2.3.1. Do not modify it directly. + +from ._internal cimport cutensor as _cutensor + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cutensorStatus_t cutensorCreate(cutensorHandle_t* handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreate(handle) + + +cdef cutensorStatus_t cutensorDestroy(cutensorHandle_t handle) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorDestroy(handle) + + +cdef cutensorStatus_t cutensorHandleResizePlanCache(cutensorHandle_t handle, const uint32_t numEntries) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorHandleResizePlanCache(handle, numEntries) + + +cdef cutensorStatus_t cutensorHandleWritePlanCacheToFile(const cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorHandleWritePlanCacheToFile(handle, filename) + + +cdef cutensorStatus_t cutensorHandleReadPlanCacheFromFile(cutensorHandle_t handle, const char filename[], uint32_t* numCachelinesRead) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorHandleReadPlanCacheFromFile(handle, filename, numCachelinesRead) + + +cdef cutensorStatus_t cutensorWriteKernelCacheToFile(const cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorWriteKernelCacheToFile(handle, filename) + + +cdef cutensorStatus_t cutensorReadKernelCacheFromFile(cutensorHandle_t handle, const char filename[]) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorReadKernelCacheFromFile(handle, filename) + + +cdef cutensorStatus_t cutensorCreateTensorDescriptor(const cutensorHandle_t handle, cutensorTensorDescriptor_t* desc, const uint32_t numModes, const int64_t extent[], const int64_t stride[], cudaDataType_t dataType, uint32_t alignmentRequirement) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateTensorDescriptor(handle, desc, numModes, extent, stride, dataType, alignmentRequirement) + + +cdef cutensorStatus_t cutensorDestroyTensorDescriptor(cutensorTensorDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorDestroyTensorDescriptor(desc) + + +cdef cutensorStatus_t cutensorCreateElementwiseTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAB, cutensorOperator_t opABC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateElementwiseTrinary(handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, opAB, opABC, descCompute) + + +cdef cutensorStatus_t cutensorElementwiseTrinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const 
void* beta, const void* B, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorElementwiseTrinaryExecute(handle, plan, alpha, A, beta, B, gamma, C, D, stream) + + +cdef cutensorStatus_t cutensorCreateElementwiseBinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opAC, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateElementwiseBinary(handle, desc, descA, modeA, opA, descC, modeC, opC, descD, modeD, opAC, descCompute) + + +cdef cutensorStatus_t cutensorElementwiseBinaryExecute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* gamma, const void* C, void* D, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorElementwiseBinaryExecute(handle, plan, alpha, A, gamma, C, D, stream) + + +cdef cutensorStatus_t cutensorCreatePermutation(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreatePermutation(handle, desc, descA, modeA, opA, descB, modeB, descCompute) + + +cdef cutensorStatus_t cutensorPermute(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, void* B, const cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorPermute(handle, plan, alpha, A, B, stream) + + +cdef cutensorStatus_t cutensorCreateContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateContraction(handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, descCompute) + + +cdef cutensorStatus_t cutensorDestroyOperationDescriptor(cutensorOperationDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorDestroyOperationDescriptor(desc) + + +cdef cutensorStatus_t cutensorOperationDescriptorSetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorOperationDescriptorSetAttribute(handle, desc, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t cutensorOperationDescriptorGetAttribute(const cutensorHandle_t handle, cutensorOperationDescriptor_t desc, cutensorOperationDescriptorAttribute_t attr, void* buf, size_t sizeInBytes) 
except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorOperationDescriptorGetAttribute(handle, desc, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t cutensorCreatePlanPreference(const cutensorHandle_t handle, cutensorPlanPreference_t* pref, cutensorAlgo_t algo, cutensorJitMode_t jitMode) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreatePlanPreference(handle, pref, algo, jitMode) + + +cdef cutensorStatus_t cutensorDestroyPlanPreference(cutensorPlanPreference_t pref) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorDestroyPlanPreference(pref) + + +cdef cutensorStatus_t cutensorPlanPreferenceSetAttribute(const cutensorHandle_t handle, cutensorPlanPreference_t pref, cutensorPlanPreferenceAttribute_t attr, const void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorPlanPreferenceSetAttribute(handle, pref, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t cutensorPlanGetAttribute(const cutensorHandle_t handle, const cutensorPlan_t plan, cutensorPlanAttribute_t attr, void* buf, size_t sizeInBytes) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorPlanGetAttribute(handle, plan, attr, buf, sizeInBytes) + + +cdef cutensorStatus_t cutensorEstimateWorkspaceSize(const cutensorHandle_t handle, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t planPref, const cutensorWorksizePreference_t workspacePref, uint64_t* workspaceSizeEstimate) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorEstimateWorkspaceSize(handle, desc, planPref, workspacePref, workspaceSizeEstimate) + + +cdef cutensorStatus_t cutensorCreatePlan(const cutensorHandle_t handle, cutensorPlan_t* plan, const cutensorOperationDescriptor_t desc, const cutensorPlanPreference_t pref, uint64_t workspaceSizeLimit) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreatePlan(handle, plan, desc, pref, workspaceSizeLimit) + + +cdef cutensorStatus_t cutensorDestroyPlan(cutensorPlan_t plan) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorDestroyPlan(plan) + + +cdef cutensorStatus_t cutensorContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorContract(handle, plan, alpha, A, B, beta, C, D, workspace, workspaceSize, stream) + + +cdef cutensorStatus_t cutensorCreateReduction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opReduce, const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateReduction(handle, desc, descA, modeA, opA, descC, modeC, opC, descD, modeD, opReduce, descCompute) + + +cdef cutensorStatus_t cutensorReduce(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* beta, const void* C, void* D, void* workspace, uint64_t workspaceSize, cudaStream_t stream) 
except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorReduce(handle, plan, alpha, A, beta, C, D, workspace, workspaceSize, stream) + + +cdef cutensorStatus_t cutensorCreateContractionTrinary(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorTensorDescriptor_t descD, const int32_t modeD[], cutensorOperator_t opD, const cutensorTensorDescriptor_t descE, const int32_t modeE[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateContractionTrinary(handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, opD, descE, modeE, descCompute) + + +cdef cutensorStatus_t cutensorContractTrinary(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* A, const void* B, const void* C, const void* beta, const void* D, void* E, void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorContractTrinary(handle, plan, alpha, A, B, C, beta, D, E, workspace, workspaceSize, stream) + + +cdef cutensorStatus_t cutensorCreateBlockSparseTensorDescriptor(cutensorHandle_t handle, cutensorBlockSparseTensorDescriptor_t* desc, const uint32_t numModes, const uint64_t numNonZeroBlocks, const uint32_t numSectionsPerMode[], const int64_t extent[], const int32_t nonZeroCoordinates[], const int64_t stride[], cudaDataType_t dataType) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateBlockSparseTensorDescriptor(handle, desc, numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, stride, dataType) + + +cdef cutensorStatus_t cutensorDestroyBlockSparseTensorDescriptor(cutensorBlockSparseTensorDescriptor_t desc) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorDestroyBlockSparseTensorDescriptor(desc) + + +cdef cutensorStatus_t cutensorCreateBlockSparseContraction(const cutensorHandle_t handle, cutensorOperationDescriptor_t* desc, const cutensorBlockSparseTensorDescriptor_t descA, const int32_t modeA[], cutensorOperator_t opA, const cutensorBlockSparseTensorDescriptor_t descB, const int32_t modeB[], cutensorOperator_t opB, const cutensorBlockSparseTensorDescriptor_t descC, const int32_t modeC[], cutensorOperator_t opC, const cutensorBlockSparseTensorDescriptor_t descD, const int32_t modeD[], const cutensorComputeDescriptor_t descCompute) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorCreateBlockSparseContraction(handle, desc, descA, modeA, opA, descB, modeB, opB, descC, modeC, opC, descD, modeD, descCompute) + + +cdef cutensorStatus_t cutensorBlockSparseContract(const cutensorHandle_t handle, const cutensorPlan_t plan, const void* alpha, const void* const A[], const void* const B[], const void* beta, const void* const C[], void* const D[], void* workspace, uint64_t workspaceSize, cudaStream_t stream) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorBlockSparseContract(handle, plan, alpha, A, B, beta, C, D, workspace, workspaceSize, stream) + + +cdef const char* cutensorGetErrorString(const cutensorStatus_t error) except?NULL nogil: 
+ return _cutensor._cutensorGetErrorString(error) + + +cdef size_t cutensorGetVersion() except?0 nogil: + return _cutensor._cutensorGetVersion() + + +cdef size_t cutensorGetCudartVersion() except?0 nogil: + return _cutensor._cutensorGetCudartVersion() + + +cdef cutensorStatus_t cutensorLoggerSetCallback(cutensorLoggerCallback_t callback) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorLoggerSetCallback(callback) + + +cdef cutensorStatus_t cutensorLoggerSetFile(FILE* file) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorLoggerSetFile(file) + + +cdef cutensorStatus_t cutensorLoggerOpenFile(const char* logFile) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorLoggerOpenFile(logFile) + + +cdef cutensorStatus_t cutensorLoggerSetLevel(int32_t level) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorLoggerSetLevel(level) + + +cdef cutensorStatus_t cutensorLoggerSetMask(int32_t mask) except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorLoggerSetMask(mask) + + +cdef cutensorStatus_t cutensorLoggerForceDisable() except?_CUTENSORSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cutensor._cutensorLoggerForceDisable() diff --git a/nvmath/bindings/cymathdx.pxd b/nvmath/bindings/cymathdx.pxd index df0f549..0384b2a 100644 --- a/nvmath/bindings/cymathdx.pxd +++ b/nvmath/bindings/cymathdx.pxd @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.3. Do not modify it directly. +# This code was automatically generated across versions from 0.2.3 to 0.3.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. from libc.stdint cimport int64_t @@ -152,6 +152,8 @@ ctypedef enum cublasdxTensorTrait "cublasdxTensorTrait": CUBLASDX_TENSOR_TRAIT_ALIGNMENT_BYTES "CUBLASDX_TENSOR_TRAIT_ALIGNMENT_BYTES" = 1 CUBLASDX_TENSOR_TRAIT_UID "CUBLASDX_TENSOR_TRAIT_UID" = 2 CUBLASDX_TENSOR_TRAIT_OPAQUE_NAME "CUBLASDX_TENSOR_TRAIT_OPAQUE_NAME" = 4 + CUBLASDX_TENSOR_TRAIT_LOGICAL_SIZE "CUBLASDX_TENSOR_TRAIT_LOGICAL_SIZE" = 5 + CUBLASDX_TENSOR_TRAIT_MEMORY_SPACE "CUBLASDX_TENSOR_TRAIT_MEMORY_SPACE" = 6 ctypedef enum cublasdxDeviceFunctionTrait "cublasdxDeviceFunctionTrait": CUBLASDX_DEVICE_FUNCTION_TRAIT_SYMBOL "CUBLASDX_DEVICE_FUNCTION_TRAIT_SYMBOL" = 1 @@ -165,6 +167,12 @@ ctypedef enum cublasdxDeviceFunctionType "cublasdxDeviceFunctionType": CUBLASDX_DEVICE_FUNCTION_COPY_WAIT "CUBLASDX_DEVICE_FUNCTION_COPY_WAIT" = 2 CUBLASDX_DEVICE_FUNCTION_CLEAR "CUBLASDX_DEVICE_FUNCTION_CLEAR" = 3 CUBLASDX_DEVICE_FUNCTION_AXPBY "CUBLASDX_DEVICE_FUNCTION_AXPBY" = 4 + CUBLASDX_DEVICE_FUNCTION_MAP_IDX2CRD "CUBLASDX_DEVICE_FUNCTION_MAP_IDX2CRD" = 5 + CUBLASDX_DEVICE_FUNCTION_MAP_IDX2CRD_PARTITIONER "CUBLASDX_DEVICE_FUNCTION_MAP_IDX2CRD_PARTITIONER" = 6 + CUBLASDX_DEVICE_FUNCTION_MAP_CRD2IDX "CUBLASDX_DEVICE_FUNCTION_MAP_CRD2IDX" = 7 + CUBLASDX_DEVICE_FUNCTION_IS_THREAD_ACTIVE "CUBLASDX_DEVICE_FUNCTION_IS_THREAD_ACTIVE" = 8 + CUBLASDX_DEVICE_FUNCTION_IS_PREDICATED "CUBLASDX_DEVICE_FUNCTION_IS_PREDICATED" = 9 + CUBLASDX_DEVICE_FUNCTION_IS_INDEX_IN_BOUNDS "CUBLASDX_DEVICE_FUNCTION_IS_INDEX_IN_BOUNDS" = 10 ctypedef enum cufftdxApi "cufftdxApi": CUFFTDX_API_LMEM "CUFFTDX_API_LMEM" = 0 @@ -277,6 +285,11 @@ ctypedef enum cusolverdxTraitType "cusolverdxTraitType": CUSOLVERDX_TRAIT_SHARED_MEMORY_SIZE "CUSOLVERDX_TRAIT_SHARED_MEMORY_SIZE" = 1 CUSOLVERDX_TRAIT_SYMBOL_NAME "CUSOLVERDX_TRAIT_SYMBOL_NAME" = 2 +ctypedef enum cublasdxMemorySpace 
"cublasdxMemorySpace": + CUBLASDX_MEMORY_SPACE_RMEM "CUBLASDX_MEMORY_SPACE_RMEM" = 0 + CUBLASDX_MEMORY_SPACE_SMEM "CUBLASDX_MEMORY_SPACE_SMEM" = 1 + CUBLASDX_MEMORY_SPACE_GMEM "CUBLASDX_MEMORY_SPACE_GMEM" = 2 + # types ctypedef long long int commondxCode 'commondxCode' ctypedef long long int cublasdxDescriptor 'cublasdxDescriptor' @@ -310,11 +323,11 @@ cdef commondxStatusType cublasdxSetOperatorInt64(cublasdxDescriptor handle, cubl cdef commondxStatusType cublasdxSetOperatorInt64s(cublasdxDescriptor handle, cublasdxOperatorType op, size_t count, const long long int* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxBindTensor(cublasdxDescriptor handle, cublasdxTensorType tensor_type, cublasdxTensor* tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxSetTensorOptionInt64(cublasdxTensor tensor, cublasdxTensorOption option, long long int value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil -cdef commondxStatusType cublasdxFinalizeTensors(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType cublasdxFinalizeTensorsNew(size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxGetTensorTraitInt64(cublasdxTensor tensor, cublasdxTensorTrait trait, long long int* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxGetTensorTraitStrSize(cublasdxTensor tensor, cublasdxTensorTrait trait, size_t* size) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxGetTensorTraitStr(cublasdxTensor tensor, cublasdxTensorTrait trait, size_t size, char* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil -cdef commondxStatusType cublasdxBindDeviceFunction(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType cublasdxCreateDeviceFunctionOld(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxFinalizeDeviceFunctions(commondxCode code, size_t count, const cublasdxDeviceFunction* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxGetDeviceFunctionTraitStrSize(cublasdxDeviceFunction device_function, cublasdxDeviceFunctionTrait trait, size_t* size) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef commondxStatusType cublasdxGetDeviceFunctionTraitStr(cublasdxDeviceFunction device_function, cublasdxDeviceFunctionTrait trait, size_t size, char* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil @@ -360,3 +373,10 @@ cdef commondxStatusType cusolverdxFinalizeCode(commondxCode code, cusolverdxDesc cdef commondxStatusType cusolverdxDestroyDescriptor(cusolverdxDescriptor handle) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil cdef const char* cusolverdxOperatorTypeToStr(cusolverdxOperatorType op) except?NULL nogil cdef const char* cusolverdxTraitTypeToStr(cusolverdxTraitType trait) except?NULL nogil +cdef commondxStatusType cublasdxCreateTensorNew(cublasdxDescriptor handle, cublasdxTensorType tensor_type, cublasdxTensor* tensor) 
except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType cublasdxMakeTensorLike(cublasdxTensor input, commondxValueType value_type, cublasdxTensor* output) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType cublasdxDestroyTensorNew(cublasdxTensor tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType cublasdxCreateDeviceFunctionNew(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil +cdef commondxStatusType cublasdxDestroyDeviceFunctionNew(cublasdxDeviceFunction device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil + +cdef commondxStatusType cublasdxFinalizeTensors203(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/cymathdx.pyx b/nvmath/bindings/cymathdx.pyx index 728cdd3..07d6c89 100644 --- a/nvmath/bindings/cymathdx.pyx +++ b/nvmath/bindings/cymathdx.pyx @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.3. Do not modify it directly. +# This code was automatically generated across versions from 0.2.3 to 0.3.0. Do not modify it directly. from ._internal cimport mathdx as _mathdx @@ -87,8 +87,8 @@ cdef commondxStatusType cublasdxSetTensorOptionInt64(cublasdxTensor tensor, cubl return _mathdx._cublasdxSetTensorOptionInt64(tensor, option, value) -cdef commondxStatusType cublasdxFinalizeTensors(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: - return _mathdx._cublasdxFinalizeTensors(handle, count, array) +cdef commondxStatusType cublasdxFinalizeTensorsNew(size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxFinalizeTensorsNew(count, array) cdef commondxStatusType cublasdxGetTensorTraitInt64(cublasdxTensor tensor, cublasdxTensorTrait trait, long long int* value) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: @@ -103,8 +103,8 @@ cdef commondxStatusType cublasdxGetTensorTraitStr(cublasdxTensor tensor, cublasd return _mathdx._cublasdxGetTensorTraitStr(tensor, trait, size, value) -cdef commondxStatusType cublasdxBindDeviceFunction(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: - return _mathdx._cublasdxBindDeviceFunction(handle, device_function_type, count, array, device_function) +cdef commondxStatusType cublasdxCreateDeviceFunctionOld(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxCreateDeviceFunctionOld(handle, device_function_type, count, array, device_function) cdef commondxStatusType cublasdxFinalizeDeviceFunctions(commondxCode code, size_t count, const cublasdxDeviceFunction* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: @@ -285,3 +285,26 @@ cdef const char* cusolverdxOperatorTypeToStr(cusolverdxOperatorType op) except?N cdef const char* cusolverdxTraitTypeToStr(cusolverdxTraitType trait) except?NULL nogil: return _mathdx._cusolverdxTraitTypeToStr(trait) + + 
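For orientation on the renamed entry points: the `...203` variants preserve the libmathdx 0.2.x calling convention (tensor finalization takes the descriptor handle), while the `...New` variants follow 0.3.0, where that argument is dropped (compare the two `cublasdxFinalizeTensors*` declarations above). A minimal Cython sketch of the difference, assuming a built nvmath-python tree and using only declarations from `cymathdx.pxd`; the helper name `finalize_tensors_any_version` is illustrative:

```cython
# Illustrative sketch, not part of the generated bindings.
from nvmath.bindings cimport cymathdx

cdef cymathdx.commondxStatusType finalize_tensors_any_version(
        cymathdx.cublasdxDescriptor handle,
        size_t count,
        const cymathdx.cublasdxTensor* tensors,
        bint pre_030) nogil:
    if pre_030:
        # libmathdx 0.2.x: finalization is tied to a descriptor handle.
        return cymathdx.cublasdxFinalizeTensors203(handle, count, tensors)
    # libmathdx 0.3.0+: the handle argument was dropped.
    return cymathdx.cublasdxFinalizeTensorsNew(count, tensors)
```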
+cdef commondxStatusType cublasdxCreateTensorNew(cublasdxDescriptor handle, cublasdxTensorType tensor_type, cublasdxTensor* tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxCreateTensorNew(handle, tensor_type, tensor) + + +cdef commondxStatusType cublasdxMakeTensorLike(cublasdxTensor input, commondxValueType value_type, cublasdxTensor* output) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxMakeTensorLike(input, value_type, output) + + +cdef commondxStatusType cublasdxDestroyTensorNew(cublasdxTensor tensor) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxDestroyTensorNew(tensor) + + +cdef commondxStatusType cublasdxCreateDeviceFunctionNew(cublasdxDescriptor handle, cublasdxDeviceFunctionType device_function_type, size_t count, const cublasdxTensor* array, cublasdxDeviceFunction* device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxCreateDeviceFunctionNew(handle, device_function_type, count, array, device_function) + + +cdef commondxStatusType cublasdxDestroyDeviceFunctionNew(cublasdxDeviceFunction device_function) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxDestroyDeviceFunctionNew(device_function) + +cdef commondxStatusType cublasdxFinalizeTensors203(cublasdxDescriptor handle, size_t count, const cublasdxTensor* array) except?_COMMONDXSTATUSTYPE_INTERNAL_LOADING_ERROR nogil: + return _mathdx._cublasdxFinalizeTensors203(handle, count, array) diff --git a/nvmath/bindings/cynccl.pxd b/nvmath/bindings/cynccl.pxd new file mode 100644 index 0000000..c4a9639 --- /dev/null +++ b/nvmath/bindings/cynccl.pxd @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 2.11.4 to 2.28.3. Do not modify it directly. + + +from libc.stdint cimport int64_t + + +############################################################################### +# Types (structs, enums, ...) 
+############################################################################### + +# enums +ctypedef enum ncclResult_t "ncclResult_t": + ncclSuccess "ncclSuccess" = 0 + ncclUnhandledCudaError "ncclUnhandledCudaError" = 1 + ncclSystemError "ncclSystemError" = 2 + ncclInternalError "ncclInternalError" = 3 + ncclInvalidArgument "ncclInvalidArgument" = 4 + ncclInvalidUsage "ncclInvalidUsage" = 5 + ncclRemoteError "ncclRemoteError" = 6 + ncclInProgress "ncclInProgress" = 7 + ncclNumResults "ncclNumResults" = 8 + _NCCLRESULT_T_INTERNAL_LOADING_ERROR "_NCCLRESULT_T_INTERNAL_LOADING_ERROR" = -42 + + +# types +cdef extern from *: + """ + #include + #include + #include + """ + ctypedef void* cudaStream_t 'cudaStream_t' + + +ctypedef void* ncclComm_t 'ncclComm_t' +ctypedef struct ncclUniqueId 'ncclUniqueId': + char internal[128] + + +############################################################################### +# Functions +############################################################################### + +cdef ncclResult_t ncclGetVersion(int* version) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef ncclResult_t ncclCommDestroy(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef ncclResult_t ncclCommAbort(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef const char* ncclGetErrorString(ncclResult_t result) except?NULL nogil +cdef ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil +cdef const char* ncclGetLastError(ncclComm_t comm) except?NULL nogil +cdef ncclResult_t ncclCommFinalize(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/cynccl.pyx b/nvmath/bindings/cynccl.pyx new file mode 100644 index 0000000..7418611 --- /dev/null +++ b/nvmath/bindings/cynccl.pyx @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 2.11.4 to 2.28.3. Do not modify it directly. 
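Since the declarations in `cynccl.pxd` above map one-to-one onto the NCCL C API, a single-rank smoke test of the new bindings could look roughly like the following. This is a sketch under the assumption that nvmath-python has been built with these bindings compiled; the function and enum names come from the declarations above, everything else is illustrative:

```cython
# Illustrative sketch, not part of the generated bindings.
from nvmath.bindings cimport cynccl

def nccl_smoke_test():
    """Query the NCCL version and create/destroy a one-rank communicator."""
    cdef int version = 0
    if cynccl.ncclGetVersion(&version) != cynccl.ncclSuccess:
        raise RuntimeError("ncclGetVersion failed")

    cdef cynccl.ncclUniqueId uid
    if cynccl.ncclGetUniqueId(&uid) != cynccl.ncclSuccess:
        raise RuntimeError("ncclGetUniqueId failed")

    # One rank with rank id 0: enough to exercise init/finalize/destroy.
    cdef cynccl.ncclComm_t comm
    if cynccl.ncclCommInitRank(&comm, 1, uid, 0) != cynccl.ncclSuccess:
        raise RuntimeError("ncclCommInitRank failed")
    cynccl.ncclCommFinalize(comm)
    cynccl.ncclCommDestroy(comm)
    return version
```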
+ +from ._internal cimport nccl as _nccl + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef ncclResult_t ncclGetVersion(int* version) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclGetVersion(version) + + +cdef ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclGetUniqueId(uniqueId) + + +cdef ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclCommInitRank(comm, nranks, commId, rank) + + +cdef ncclResult_t ncclCommDestroy(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclCommDestroy(comm) + + +cdef ncclResult_t ncclCommAbort(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclCommAbort(comm) + + +cdef const char* ncclGetErrorString(ncclResult_t result) except?NULL nogil: + return _nccl._ncclGetErrorString(result) + + +cdef ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclCommCount(comm, count) + + +cdef ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclCommCuDevice(comm, device) + + +cdef ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclCommUserRank(comm, rank) + + +cdef const char* ncclGetLastError(ncclComm_t comm) except?NULL nogil: + return _nccl._ncclGetLastError(comm) + + +cdef ncclResult_t ncclCommFinalize(ncclComm_t comm) except?_NCCLRESULT_T_INTERNAL_LOADING_ERROR nogil: + return _nccl._ncclCommFinalize(comm) diff --git a/nvmath/bindings/mathdx.pxd b/nvmath/bindings/mathdx.pxd index 0169d0e..a3e4c27 100644 --- a/nvmath/bindings/mathdx.pxd +++ b/nvmath/bindings/mathdx.pxd @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.3. Do not modify it directly. +# This code was automatically generated across versions from 0.2.3 to 0.3.0. Do not modify it directly. from libc.stdint cimport intptr_t @@ -53,6 +53,7 @@ ctypedef cusolverdxSide _CusolverdxSide ctypedef cusolverdxDiag _CusolverdxDiag ctypedef cusolverdxOperatorType _CusolverdxOperatorType ctypedef cusolverdxTraitType _CusolverdxTraitType +ctypedef cublasdxMemorySpace _CublasdxMemorySpace ############################################################################### @@ -79,11 +80,11 @@ cpdef cublasdx_set_operator_int64(long long int handle, int op, long long int va cpdef cublasdx_set_operator_int64s(long long int handle, int op, size_t count, array) cpdef long long int cublasdx_bind_tensor(long long int handle, int tensor_type) except? 0 cpdef cublasdx_set_tensor_option_int64(long long int tensor, int option, long long int value) -cpdef cublasdx_finalize_tensors(long long int handle, size_t count, array) +cpdef cublasdx_finalize_tensors_new(size_t count, array) cpdef long long int cublasdx_get_tensor_trait_int64(long long int tensor, int trait) except? 0 cpdef size_t cublasdx_get_tensor_trait_str_size(long long int tensor, int trait) except? 
0 cpdef cublasdx_get_tensor_trait_str(long long int tensor, int trait, size_t size, value) -cpdef long long int cublasdx_bind_device_function(long long int handle, int device_function_type, size_t count, array) except? 0 +cpdef long long int cublasdx_create_device_function_old(long long int handle, int device_function_type, size_t count, array) except? 0 cpdef cublasdx_finalize_device_functions(long long int code, size_t count, array) cpdef size_t cublasdx_get_device_function_trait_str_size(long long int device_function, int trait) except? 0 cpdef cublasdx_get_device_function_trait_str(long long int device_function, int trait, size_t size, value) @@ -129,3 +130,15 @@ cpdef cusolverdx_finalize_code(long long int code, long long int handle) cpdef cusolverdx_destroy_descriptor(long long int handle) cpdef str cusolverdx_operator_type_to_str(int op) cpdef str cusolverdx_trait_type_to_str(int trait) +cpdef long long int cublasdx_create_tensor_new(long long int handle, int tensor_type) except? 0 +cpdef long long int cublasdx_make_tensor_like(long long int input, int value_type) except? 0 +cpdef cublasdx_destroy_tensor_new(long long int tensor) +cpdef long long int cublasdx_create_device_function_new(long long int handle, int device_function_type, size_t count, array) except? 0 +cpdef cublasdx_destroy_device_function_new(long long int device_function) + +# 0.2 - 0.3 compatibility layer +cpdef cublasdx_finalize_tensors203(long long int handle, size_t count, array) +cpdef long long int cublasdx_create_device_function(long long int handle, int device_function_type, size_t count, array) except? 0 +cpdef cublasdx_destroy_device_function(long long int device_function) +cpdef cublasdx_destroy_tensor(long long int tensor) +cpdef long long int cublasdx_create_tensor(long long int handle, int tensor_type) except? 
0 diff --git a/nvmath/bindings/mathdx.pyi b/nvmath/bindings/mathdx.pyi index 3ccc465..c8650ea 100644 --- a/nvmath/bindings/mathdx.pyi +++ b/nvmath/bindings/mathdx.pyi @@ -1,80 +1,83 @@ -import _cython_3_1_3 +import _cython_3_1_4 import enum from typing import Any, Callable, ClassVar __pyx_capi__: dict __test__: dict -check_status: _cython_3_1_3.cython_function_or_method -commondx_create_code: _cython_3_1_3.cython_function_or_method -commondx_destroy_code: _cython_3_1_3.cython_function_or_method -commondx_get_code_ltoir: _cython_3_1_3.cython_function_or_method -commondx_get_code_ltoir_size: _cython_3_1_3.cython_function_or_method -commondx_get_code_ltoir_sizes: _cython_3_1_3.cython_function_or_method -commondx_get_code_ltoirs: _cython_3_1_3.cython_function_or_method -commondx_get_code_num_ltoirs: _cython_3_1_3.cython_function_or_method -commondx_get_code_option_int64: _cython_3_1_3.cython_function_or_method -commondx_get_code_options_int64s: _cython_3_1_3.cython_function_or_method -commondx_set_code_option_int64: _cython_3_1_3.cython_function_or_method -commondx_set_code_option_str: _cython_3_1_3.cython_function_or_method -commondx_status_to_str: _cython_3_1_3.cython_function_or_method -cublasdx_bind_device_function: _cython_3_1_3.cython_function_or_method -cublasdx_bind_tensor: _cython_3_1_3.cython_function_or_method -cublasdx_create_descriptor: _cython_3_1_3.cython_function_or_method -cublasdx_destroy_descriptor: _cython_3_1_3.cython_function_or_method -cublasdx_finalize_code: _cython_3_1_3.cython_function_or_method -cublasdx_finalize_device_functions: _cython_3_1_3.cython_function_or_method -cublasdx_finalize_tensors: _cython_3_1_3.cython_function_or_method -cublasdx_get_device_function_trait_str: _cython_3_1_3.cython_function_or_method -cublasdx_get_device_function_trait_str_size: _cython_3_1_3.cython_function_or_method -cublasdx_get_ltoir: _cython_3_1_3.cython_function_or_method -cublasdx_get_ltoir_size: _cython_3_1_3.cython_function_or_method -cublasdx_get_tensor_trait_int64: _cython_3_1_3.cython_function_or_method -cublasdx_get_tensor_trait_str: _cython_3_1_3.cython_function_or_method -cublasdx_get_tensor_trait_str_size: _cython_3_1_3.cython_function_or_method -cublasdx_get_trait_int64: _cython_3_1_3.cython_function_or_method -cublasdx_get_trait_int64s: _cython_3_1_3.cython_function_or_method -cublasdx_get_trait_str: _cython_3_1_3.cython_function_or_method -cublasdx_get_trait_str_size: _cython_3_1_3.cython_function_or_method -cublasdx_operator_type_to_str: _cython_3_1_3.cython_function_or_method -cublasdx_set_operator_int64: _cython_3_1_3.cython_function_or_method -cublasdx_set_operator_int64s: _cython_3_1_3.cython_function_or_method -cublasdx_set_option_str: _cython_3_1_3.cython_function_or_method -cublasdx_set_tensor_option_int64: _cython_3_1_3.cython_function_or_method -cublasdx_trait_type_to_str: _cython_3_1_3.cython_function_or_method -cufftdx_create_descriptor: _cython_3_1_3.cython_function_or_method -cufftdx_destroy_descriptor: _cython_3_1_3.cython_function_or_method -cufftdx_finalize_code: _cython_3_1_3.cython_function_or_method -cufftdx_get_knob_int64s: _cython_3_1_3.cython_function_or_method -cufftdx_get_knob_int64size: _cython_3_1_3.cython_function_or_method -cufftdx_get_ltoir: _cython_3_1_3.cython_function_or_method -cufftdx_get_ltoir_size: _cython_3_1_3.cython_function_or_method -cufftdx_get_trait_commondx_data_type: _cython_3_1_3.cython_function_or_method -cufftdx_get_trait_int64: _cython_3_1_3.cython_function_or_method -cufftdx_get_trait_int64s: 
_cython_3_1_3.cython_function_or_method -cufftdx_get_trait_str: _cython_3_1_3.cython_function_or_method -cufftdx_get_trait_str_size: _cython_3_1_3.cython_function_or_method -cufftdx_operator_type_to_str: _cython_3_1_3.cython_function_or_method -cufftdx_set_operator_int64: _cython_3_1_3.cython_function_or_method -cufftdx_set_operator_int64s: _cython_3_1_3.cython_function_or_method -cufftdx_set_option_str: _cython_3_1_3.cython_function_or_method -cufftdx_trait_type_to_str: _cython_3_1_3.cython_function_or_method -cusolverdx_create_descriptor: _cython_3_1_3.cython_function_or_method -cusolverdx_destroy_descriptor: _cython_3_1_3.cython_function_or_method -cusolverdx_finalize_code: _cython_3_1_3.cython_function_or_method -cusolverdx_get_ltoir: _cython_3_1_3.cython_function_or_method -cusolverdx_get_ltoir_size: _cython_3_1_3.cython_function_or_method -cusolverdx_get_trait_int64: _cython_3_1_3.cython_function_or_method -cusolverdx_get_trait_str: _cython_3_1_3.cython_function_or_method -cusolverdx_get_trait_str_size: _cython_3_1_3.cython_function_or_method -cusolverdx_get_universal_fatbin: _cython_3_1_3.cython_function_or_method -cusolverdx_get_universal_fatbin_size: _cython_3_1_3.cython_function_or_method -cusolverdx_operator_type_to_str: _cython_3_1_3.cython_function_or_method -cusolverdx_set_operator_int64: _cython_3_1_3.cython_function_or_method -cusolverdx_set_operator_int64s: _cython_3_1_3.cython_function_or_method -cusolverdx_set_option_str: _cython_3_1_3.cython_function_or_method -cusolverdx_trait_type_to_str: _cython_3_1_3.cython_function_or_method -get_version: _cython_3_1_3.cython_function_or_method -get_version_ex: _cython_3_1_3.cython_function_or_method +check_status: _cython_3_1_4.cython_function_or_method +commondx_create_code: _cython_3_1_4.cython_function_or_method +commondx_destroy_code: _cython_3_1_4.cython_function_or_method +commondx_get_code_ltoir: _cython_3_1_4.cython_function_or_method +commondx_get_code_ltoir_size: _cython_3_1_4.cython_function_or_method +commondx_get_code_ltoir_sizes: _cython_3_1_4.cython_function_or_method +commondx_get_code_ltoirs: _cython_3_1_4.cython_function_or_method +commondx_get_code_num_ltoirs: _cython_3_1_4.cython_function_or_method +commondx_get_code_option_int64: _cython_3_1_4.cython_function_or_method +commondx_get_code_options_int64s: _cython_3_1_4.cython_function_or_method +commondx_set_code_option_int64: _cython_3_1_4.cython_function_or_method +commondx_set_code_option_str: _cython_3_1_4.cython_function_or_method +commondx_status_to_str: _cython_3_1_4.cython_function_or_method +cublasdx_create_descriptor: _cython_3_1_4.cython_function_or_method +cublasdx_create_device_function: _cython_3_1_4.cython_function_or_method +cublasdx_create_tensor: _cython_3_1_4.cython_function_or_method +cublasdx_destroy_descriptor: _cython_3_1_4.cython_function_or_method +cublasdx_destroy_device_function: _cython_3_1_4.cython_function_or_method +cublasdx_destroy_tensor: _cython_3_1_4.cython_function_or_method +cublasdx_finalize_code: _cython_3_1_4.cython_function_or_method +cublasdx_finalize_device_functions: _cython_3_1_4.cython_function_or_method +cublasdx_finalize_tensors: _cython_3_1_4.cython_function_or_method +cublasdx_get_device_function_trait_str: _cython_3_1_4.cython_function_or_method +cublasdx_get_device_function_trait_str_size: _cython_3_1_4.cython_function_or_method +cublasdx_get_ltoir: _cython_3_1_4.cython_function_or_method +cublasdx_get_ltoir_size: _cython_3_1_4.cython_function_or_method +cublasdx_get_tensor_trait_int64: 
_cython_3_1_4.cython_function_or_method +cublasdx_get_tensor_trait_str: _cython_3_1_4.cython_function_or_method +cublasdx_get_tensor_trait_str_size: _cython_3_1_4.cython_function_or_method +cublasdx_get_trait_int64: _cython_3_1_4.cython_function_or_method +cublasdx_get_trait_int64s: _cython_3_1_4.cython_function_or_method +cublasdx_get_trait_str: _cython_3_1_4.cython_function_or_method +cublasdx_get_trait_str_size: _cython_3_1_4.cython_function_or_method +cublasdx_make_tensor_like: _cython_3_1_4.cython_function_or_method +cublasdx_operator_type_to_str: _cython_3_1_4.cython_function_or_method +cublasdx_set_operator_int64: _cython_3_1_4.cython_function_or_method +cublasdx_set_operator_int64s: _cython_3_1_4.cython_function_or_method +cublasdx_set_option_str: _cython_3_1_4.cython_function_or_method +cublasdx_set_tensor_option_int64: _cython_3_1_4.cython_function_or_method +cublasdx_trait_type_to_str: _cython_3_1_4.cython_function_or_method +cufftdx_create_descriptor: _cython_3_1_4.cython_function_or_method +cufftdx_destroy_descriptor: _cython_3_1_4.cython_function_or_method +cufftdx_finalize_code: _cython_3_1_4.cython_function_or_method +cufftdx_get_knob_int64s: _cython_3_1_4.cython_function_or_method +cufftdx_get_knob_int64size: _cython_3_1_4.cython_function_or_method +cufftdx_get_ltoir: _cython_3_1_4.cython_function_or_method +cufftdx_get_ltoir_size: _cython_3_1_4.cython_function_or_method +cufftdx_get_trait_commondx_data_type: _cython_3_1_4.cython_function_or_method +cufftdx_get_trait_int64: _cython_3_1_4.cython_function_or_method +cufftdx_get_trait_int64s: _cython_3_1_4.cython_function_or_method +cufftdx_get_trait_str: _cython_3_1_4.cython_function_or_method +cufftdx_get_trait_str_size: _cython_3_1_4.cython_function_or_method +cufftdx_operator_type_to_str: _cython_3_1_4.cython_function_or_method +cufftdx_set_operator_int64: _cython_3_1_4.cython_function_or_method +cufftdx_set_operator_int64s: _cython_3_1_4.cython_function_or_method +cufftdx_set_option_str: _cython_3_1_4.cython_function_or_method +cufftdx_trait_type_to_str: _cython_3_1_4.cython_function_or_method +cusolverdx_create_descriptor: _cython_3_1_4.cython_function_or_method +cusolverdx_destroy_descriptor: _cython_3_1_4.cython_function_or_method +cusolverdx_finalize_code: _cython_3_1_4.cython_function_or_method +cusolverdx_get_ltoir: _cython_3_1_4.cython_function_or_method +cusolverdx_get_ltoir_size: _cython_3_1_4.cython_function_or_method +cusolverdx_get_trait_int64: _cython_3_1_4.cython_function_or_method +cusolverdx_get_trait_str: _cython_3_1_4.cython_function_or_method +cusolverdx_get_trait_str_size: _cython_3_1_4.cython_function_or_method +cusolverdx_get_universal_fatbin: _cython_3_1_4.cython_function_or_method +cusolverdx_get_universal_fatbin_size: _cython_3_1_4.cython_function_or_method +cusolverdx_operator_type_to_str: _cython_3_1_4.cython_function_or_method +cusolverdx_set_operator_int64: _cython_3_1_4.cython_function_or_method +cusolverdx_set_operator_int64s: _cython_3_1_4.cython_function_or_method +cusolverdx_set_option_str: _cython_3_1_4.cython_function_or_method +cusolverdx_trait_type_to_str: _cython_3_1_4.cython_function_or_method +get_version: _cython_3_1_4.cython_function_or_method +get_version_ex: _cython_3_1_4.cython_function_or_method class CommondxCodeContainer(enum.IntEnum): __new__: ClassVar[Callable] = ... @@ -244,7 +247,6 @@ class CublasdxDeviceFunctionOption(enum.IntEnum): class CublasdxDeviceFunctionTrait(enum.IntEnum): __new__: ClassVar[Callable] = ... - NAME: ClassVar[CublasdxDeviceFunctionTrait] = ... 
SYMBOL: ClassVar[CublasdxDeviceFunctionTrait] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... @@ -262,6 +264,12 @@ class CublasdxDeviceFunctionType(enum.IntEnum): COPY: ClassVar[CublasdxDeviceFunctionType] = ... COPY_WAIT: ClassVar[CublasdxDeviceFunctionType] = ... EXECUTE: ClassVar[CublasdxDeviceFunctionType] = ... + IS_INDEX_IN_BOUNDS: ClassVar[CublasdxDeviceFunctionType] = ... + IS_PREDICATED: ClassVar[CublasdxDeviceFunctionType] = ... + IS_THREAD_ACTIVE: ClassVar[CublasdxDeviceFunctionType] = ... + MAP_CRD2IDX: ClassVar[CublasdxDeviceFunctionType] = ... + MAP_IDX2CRD: ClassVar[CublasdxDeviceFunctionType] = ... + MAP_IDX2CRD_PARTITIONER: ClassVar[CublasdxDeviceFunctionType] = ... _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... @@ -283,6 +291,20 @@ class CublasdxFunction(enum.IntEnum): _value2member_map_: ClassVar[dict] = ... def __format__(self, *args, **kwargs) -> str: ... +class CublasdxMemorySpace(enum.IntEnum): + __new__: ClassVar[Callable] = ... + GMEM: ClassVar[CublasdxMemorySpace] = ... + RMEM: ClassVar[CublasdxMemorySpace] = ... + SMEM: ClassVar[CublasdxMemorySpace] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + class CublasdxOperatorType(enum.IntEnum): __new__: ClassVar[Callable] = ... ALIGNMENT: ClassVar[CublasdxOperatorType] = ... @@ -322,6 +344,8 @@ class CublasdxTensorOption(enum.IntEnum): class CublasdxTensorTrait(enum.IntEnum): __new__: ClassVar[Callable] = ... ALIGNMENT_BYTES: ClassVar[CublasdxTensorTrait] = ... + LOGICAL_SIZE: ClassVar[CublasdxTensorTrait] = ... + MEMORY_SPACE: ClassVar[CublasdxTensorTrait] = ... OPAQUE_NAME: ClassVar[CublasdxTensorTrait] = ... STORAGE_BYTES: ClassVar[CublasdxTensorTrait] = ... UID: ClassVar[CublasdxTensorTrait] = ... diff --git a/nvmath/bindings/mathdx.pyx b/nvmath/bindings/mathdx.pyx index f86a674..cd6f05e 100644 --- a/nvmath/bindings/mathdx.pyx +++ b/nvmath/bindings/mathdx.pyx @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.3. Do not modify it directly. +# This code was automatically generated across versions from 0.2.3 to 0.3.0. Do not modify it directly. 
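A minimal sketch of how the Python-level wrappers in this module fit together with the 0.2/0.3 compatibility layer added at the end of the file. It assumes a descriptor whose operators have already been set; the helper name `finalize_and_query` and the tensor count of 1 are illustrative:

```python
# Illustrative sketch, not part of the generated bindings.
from nvmath.bindings import mathdx

def finalize_and_query(handle, tensor_type, trait):
    # `handle` comes from mathdx.cublasdx_create_descriptor(), with operators
    # set via mathdx.cublasdx_set_operator_int64(s).
    tensor = mathdx.cublasdx_bind_tensor(handle, tensor_type)
    # The compatibility wrapper dispatches on mathdx.get_version_ex(): the
    # 0.2.x path passes the handle, the 0.3.0+ path does not.
    mathdx.cublasdx_finalize_tensors(handle, 1, [tensor])
    return mathdx.cublasdx_get_tensor_trait_int64(tensor, trait)
```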
cimport cython # NOQA @@ -176,6 +176,8 @@ class CublasdxTensorTrait(_IntEnum): ALIGNMENT_BYTES = CUBLASDX_TENSOR_TRAIT_ALIGNMENT_BYTES UID = CUBLASDX_TENSOR_TRAIT_UID OPAQUE_NAME = CUBLASDX_TENSOR_TRAIT_OPAQUE_NAME + LOGICAL_SIZE = CUBLASDX_TENSOR_TRAIT_LOGICAL_SIZE + MEMORY_SPACE = CUBLASDX_TENSOR_TRAIT_MEMORY_SPACE class CublasdxDeviceFunctionTrait(_IntEnum): """See `cublasdxDeviceFunctionTrait`.""" @@ -192,6 +194,12 @@ class CublasdxDeviceFunctionType(_IntEnum): COPY_WAIT = CUBLASDX_DEVICE_FUNCTION_COPY_WAIT CLEAR = CUBLASDX_DEVICE_FUNCTION_CLEAR AXPBY = CUBLASDX_DEVICE_FUNCTION_AXPBY + MAP_IDX2CRD = CUBLASDX_DEVICE_FUNCTION_MAP_IDX2CRD + MAP_IDX2CRD_PARTITIONER = CUBLASDX_DEVICE_FUNCTION_MAP_IDX2CRD_PARTITIONER + MAP_CRD2IDX = CUBLASDX_DEVICE_FUNCTION_MAP_CRD2IDX + IS_THREAD_ACTIVE = CUBLASDX_DEVICE_FUNCTION_IS_THREAD_ACTIVE + IS_PREDICATED = CUBLASDX_DEVICE_FUNCTION_IS_PREDICATED + IS_INDEX_IN_BOUNDS = CUBLASDX_DEVICE_FUNCTION_IS_INDEX_IN_BOUNDS class CufftdxApi(_IntEnum): """See `cufftdxApi`.""" @@ -322,6 +330,12 @@ class CusolverdxTraitType(_IntEnum): SHARED_MEMORY_SIZE = CUSOLVERDX_TRAIT_SHARED_MEMORY_SIZE SYMBOL_NAME = CUSOLVERDX_TRAIT_SYMBOL_NAME +class CublasdxMemorySpace(_IntEnum): + """See `cublasdxMemorySpace`.""" + RMEM = CUBLASDX_MEMORY_SPACE_RMEM + SMEM = CUBLASDX_MEMORY_SPACE_SMEM + GMEM = CUBLASDX_MEMORY_SPACE_GMEM + ############################################################################### # Error handling @@ -365,8 +379,8 @@ cpdef long long int commondx_create_code() except? 0: """ cdef commondxCode code with nogil: - status = commondxCreateCode(&code) - check_status(status) + __status__ = commondxCreateCode(&code) + check_status(__status__) return code @@ -381,8 +395,8 @@ cpdef commondx_set_code_option_int64(long long int code, int option, long long i .. seealso:: `commondxSetCodeOptionInt64` """ with nogil: - status = commondxSetCodeOptionInt64(code, <_CommondxOption>option, value) - check_status(status) + __status__ = commondxSetCodeOptionInt64(code, <_CommondxOption>option, value) + check_status(__status__) cpdef commondx_set_code_option_str(long long int code, int option, value): @@ -400,8 +414,8 @@ cpdef commondx_set_code_option_str(long long int code, int option, value): cdef bytes _temp_value_ = (value).encode() cdef char* _value_ = _temp_value_ with nogil: - status = commondxSetCodeOptionStr(code, <_CommondxOption>option, _value_) - check_status(status) + __status__ = commondxSetCodeOptionStr(code, <_CommondxOption>option, _value_) + check_status(__status__) cpdef long long int commondx_get_code_option_int64(long long int code, int option) except? 0: @@ -418,8 +432,8 @@ cpdef long long int commondx_get_code_option_int64(long long int code, int optio """ cdef long long int value with nogil: - status = commondxGetCodeOptionInt64(code, <_CommondxOption>option, &value) - check_status(status) + __status__ = commondxGetCodeOptionInt64(code, <_CommondxOption>option, &value) + check_status(__status__) return value @@ -441,8 +455,8 @@ cpdef commondx_get_code_options_int64s(long long int code, int option, size_t si cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = commondxGetCodeOptionsInt64s(code, <_CommondxOption>option, size, (_array_.data())) - check_status(status) + __status__ = commondxGetCodeOptionsInt64s(code, <_CommondxOption>option, size, (_array_.data())) + check_status(__status__) cpdef size_t commondx_get_code_ltoir_size(long long int code) except? 
0: @@ -458,8 +472,8 @@ cpdef size_t commondx_get_code_ltoir_size(long long int code) except? 0: """ cdef size_t size with nogil: - status = commondxGetCodeLTOIRSize(code, &size) - check_status(status) + __status__ = commondxGetCodeLTOIRSize(code, &size) + check_status(__status__) return size @@ -475,8 +489,8 @@ cpdef commondx_get_code_ltoir(long long int code, size_t size, out): """ cdef void* _out_ = get_buffer_pointer(out, size, readonly=False) with nogil: - status = commondxGetCodeLTOIR(code, size, _out_) - check_status(status) + __status__ = commondxGetCodeLTOIR(code, size, _out_) + check_status(__status__) cpdef size_t commondx_get_code_num_ltoirs(long long int code) except? 0: @@ -492,8 +506,8 @@ cpdef size_t commondx_get_code_num_ltoirs(long long int code) except? 0: """ cdef size_t size with nogil: - status = commondxGetCodeNumLTOIRs(code, &size) - check_status(status) + __status__ = commondxGetCodeNumLTOIRs(code, &size) + check_status(__status__) return size @@ -514,8 +528,8 @@ cpdef commondx_get_code_ltoir_sizes(long long int code, size_t size, out): cdef nullable_unique_ptr[ vector[size_t] ] _out_ get_resource_ptr[size_t](_out_, out, NULL) with nogil: - status = commondxGetCodeLTOIRSizes(code, size, (_out_.data())) - check_status(status) + __status__ = commondxGetCodeLTOIRSizes(code, size, (_out_.data())) + check_status(__status__) cpdef commondx_get_code_ltoirs(long long int code, size_t size, out): @@ -535,8 +549,8 @@ cpdef commondx_get_code_ltoirs(long long int code, size_t size, out): cdef nullable_unique_ptr[ vector[void*] ] _out_ get_resource_ptrs[void](_out_, out, NULL) with nogil: - status = commondxGetCodeLTOIRs(code, size, (_out_.data())) - check_status(status) + __status__ = commondxGetCodeLTOIRs(code, size, (_out_.data())) + check_status(__status__) cpdef commondx_destroy_code(long long int code): @@ -548,8 +562,8 @@ cpdef commondx_destroy_code(long long int code): .. seealso:: `commondxDestroyCode` """ with nogil: - status = commondxDestroyCode(code) - check_status(status) + __status__ = commondxDestroyCode(code) + check_status(__status__) cpdef str commondx_status_to_str(int status): @@ -575,8 +589,8 @@ cpdef int get_version() except? 0: """ cdef int version with nogil: - status = mathdxGetVersion(&version) - check_status(status) + __status__ = mathdxGetVersion(&version) + check_status(__status__) return version @@ -596,8 +610,8 @@ cpdef tuple get_version_ex(): cdef int minor cdef int patch with nogil: - status = mathdxGetVersionEx(&major, &minor, &patch) - check_status(status) + __status__ = mathdxGetVersionEx(&major, &minor, &patch) + check_status(__status__) return (major, minor, patch) @@ -611,8 +625,8 @@ cpdef long long int cublasdx_create_descriptor() except? 0: """ cdef cublasdxDescriptor handle with nogil: - status = cublasdxCreateDescriptor(&handle) - check_status(status) + __status__ = cublasdxCreateDescriptor(&handle) + check_status(__status__) return handle @@ -631,8 +645,8 @@ cpdef cublasdx_set_option_str(long long int handle, int option, value): cdef bytes _temp_value_ = (value).encode() cdef char* _value_ = _temp_value_ with nogil: - status = cublasdxSetOptionStr(handle, <_CommondxOption>option, _value_) - check_status(status) + __status__ = cublasdxSetOptionStr(handle, <_CommondxOption>option, _value_) + check_status(__status__) cpdef cublasdx_set_operator_int64(long long int handle, int op, long long int value): @@ -646,8 +660,8 @@ cpdef cublasdx_set_operator_int64(long long int handle, int op, long long int va .. 
seealso:: `cublasdxSetOperatorInt64` """ with nogil: - status = cublasdxSetOperatorInt64(handle, <_CublasdxOperatorType>op, value) - check_status(status) + __status__ = cublasdxSetOperatorInt64(handle, <_CublasdxOperatorType>op, value) + check_status(__status__) cpdef cublasdx_set_operator_int64s(long long int handle, int op, size_t count, array): @@ -668,8 +682,8 @@ cpdef cublasdx_set_operator_int64s(long long int handle, int op, size_t count, a cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = cublasdxSetOperatorInt64s(handle, <_CublasdxOperatorType>op, count, (_array_.data())) - check_status(status) + __status__ = cublasdxSetOperatorInt64s(handle, <_CublasdxOperatorType>op, count, (_array_.data())) + check_status(__status__) cpdef long long int cublasdx_bind_tensor(long long int handle, int tensor_type) except? 0: @@ -686,8 +700,8 @@ cpdef long long int cublasdx_bind_tensor(long long int handle, int tensor_type) """ cdef cublasdxTensor tensor with nogil: - status = cublasdxBindTensor(handle, <_CublasdxTensorType>tensor_type, &tensor) - check_status(status) + __status__ = cublasdxBindTensor(handle, <_CublasdxTensorType>tensor_type, &tensor) + check_status(__status__) return tensor @@ -695,43 +709,30 @@ cpdef cublasdx_set_tensor_option_int64(long long int tensor, int option, long lo """Set an option on a tensor. This must be called before the tensor is finalized. Args: - tensor (long long int): A cuBLASDx tensor, output of cublasdxBindTensor. + tensor (long long int): A cuBLASDx tensor, output of cublasdxCreateTensor. option (CublasdxTensorOption): The option to set on the tensor. value (long long int): A value for the option. .. seealso:: `cublasdxSetTensorOptionInt64` """ with nogil: - status = cublasdxSetTensorOptionInt64(tensor, <_CublasdxTensorOption>option, value) - check_status(status) - - -cpdef cublasdx_finalize_tensors(long long int handle, size_t count, array): - """Finalize the tensors. This is required before traits can be queried. - - Args: - handle (long long int): A cuBLASDx descriptor, output of cublasdxCreateDescriptor. - count (size_t): The number of tensors to finalized. - array (object): The array of tensors. It can be: - - - an :class:`int` as the pointer address to the array, or - - a Python sequence of ``cublasdxTensor``. + __status__ = cublasdxSetTensorOptionInt64(tensor, <_CublasdxTensorOption>option, value) + check_status(__status__) - .. seealso:: `cublasdxFinalizeTensors` - """ +cpdef cublasdx_finalize_tensors_new(size_t count, array): cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = cublasdxFinalizeTensors(handle, count, (_array_.data())) - check_status(status) + __status__ = cublasdxFinalizeTensorsNew(count, (_array_.data())) + check_status(__status__) cpdef long long int cublasdx_get_tensor_trait_int64(long long int tensor, int trait) except? 0: """Query an integer trait value from a finalized tensor. Args: - tensor (long long int): A finalized tensor handle, output of cublasdxBindTensor. + tensor (long long int): A finalized tensor handle, output of cublasdxCreateTensor. trait (CublasdxTensorTrait): The trait to query. 
Returns: @@ -741,8 +742,8 @@ cpdef long long int cublasdx_get_tensor_trait_int64(long long int tensor, int tr """ cdef long long int value with nogil: - status = cublasdxGetTensorTraitInt64(tensor, <_CublasdxTensorTrait>trait, &value) - check_status(status) + __status__ = cublasdxGetTensorTraitInt64(tensor, <_CublasdxTensorTrait>trait, &value) + check_status(__status__) return value @@ -750,7 +751,7 @@ cpdef size_t cublasdx_get_tensor_trait_str_size(long long int tensor, int trait) """Query an C-string trait's size from a finalized tensor. Args: - tensor (long long int): A finalized tensor handle, output of cublasdxBindTensor. + tensor (long long int): A finalized tensor handle, output of cublasdxCreateTensor. trait (CublasdxTensorTrait): The trait to query. Returns: @@ -760,8 +761,8 @@ cpdef size_t cublasdx_get_tensor_trait_str_size(long long int tensor, int trait) """ cdef size_t size with nogil: - status = cublasdxGetTensorTraitStrSize(tensor, <_CublasdxTensorTrait>trait, &size) - check_status(status) + __status__ = cublasdxGetTensorTraitStrSize(tensor, <_CublasdxTensorTrait>trait, &size) + check_status(__status__) return size @@ -769,7 +770,7 @@ cpdef cublasdx_get_tensor_trait_str(long long int tensor, int trait, size_t size """Query a C-string trait value from a finalized tensor. Args: - tensor (long long int): A finalized tensor handle, output of cublasdxBindTensor. + tensor (long long int): A finalized tensor handle, output of cublasdxCreateTensor. trait (CublasdxTensorTrait): The trait to query. size (size_t): The C-string size, as returned by cublasdxGetTensorTraitStrSize. value (bytes): The C-string trait value. @@ -778,34 +779,17 @@ cpdef cublasdx_get_tensor_trait_str(long long int tensor, int trait, size_t size """ cdef void* _value_ = get_buffer_pointer(value, size, readonly=False) with nogil: - status = cublasdxGetTensorTraitStr(tensor, <_CublasdxTensorTrait>trait, size, _value_) - check_status(status) + __status__ = cublasdxGetTensorTraitStr(tensor, <_CublasdxTensorTrait>trait, size, _value_) + check_status(__status__) -cpdef long long int cublasdx_bind_device_function(long long int handle, int device_function_type, size_t count, array) except? 0: - """Binds (aka create) a device function from a set of tensor. - - Args: - handle (long long int): A cuBLASDx descriptor, output of cublasdxCreateDescriptor. - device_function_type (CublasdxDeviceFunctionType): The device function to create. - count (size_t): The number of input & output tensors to the device function. - array (object): The array of input & output tensors. It can be: - - - an :class:`int` as the pointer address to the array, or - - a Python sequence of ``cublasdxTensor``. - - - Returns: - long long int: The device function. - - .. seealso:: `cublasdxBindDeviceFunction` - """ +cpdef long long int cublasdx_create_device_function_old(long long int handle, int device_function_type, size_t count, array) except? 
0: cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) cdef cublasdxDeviceFunction device_function with nogil: - status = cublasdxBindDeviceFunction(handle, <_CublasdxDeviceFunctionType>device_function_type, count, (_array_.data()), &device_function) - check_status(status) + __status__ = cublasdxCreateDeviceFunctionOld(handle, <_CublasdxDeviceFunctionType>device_function_type, count, (_array_.data()), &device_function) + check_status(__status__) return device_function @@ -826,8 +810,8 @@ cpdef cublasdx_finalize_device_functions(long long int code, size_t count, array cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = cublasdxFinalizeDeviceFunctions(code, count, (_array_.data())) - check_status(status) + __status__ = cublasdxFinalizeDeviceFunctions(code, count, (_array_.data())) + check_status(__status__) cpdef size_t cublasdx_get_device_function_trait_str_size(long long int device_function, int trait) except? 0: @@ -844,8 +828,8 @@ cpdef size_t cublasdx_get_device_function_trait_str_size(long long int device_fu """ cdef size_t size with nogil: - status = cublasdxGetDeviceFunctionTraitStrSize(device_function, <_CublasdxDeviceFunctionTrait>trait, &size) - check_status(status) + __status__ = cublasdxGetDeviceFunctionTraitStrSize(device_function, <_CublasdxDeviceFunctionTrait>trait, &size) + check_status(__status__) return size @@ -862,8 +846,8 @@ cpdef cublasdx_get_device_function_trait_str(long long int device_function, int """ cdef void* _value_ = get_buffer_pointer(value, size, readonly=False) with nogil: - status = cublasdxGetDeviceFunctionTraitStr(device_function, <_CublasdxDeviceFunctionTrait>trait, size, _value_) - check_status(status) + __status__ = cublasdxGetDeviceFunctionTraitStr(device_function, <_CublasdxDeviceFunctionTrait>trait, size, _value_) + check_status(__status__) cpdef size_t cublasdx_get_ltoir_size(long long int handle) except? 0: @@ -879,8 +863,8 @@ cpdef size_t cublasdx_get_ltoir_size(long long int handle) except? 0: """ cdef size_t lto_size with nogil: - status = cublasdxGetLTOIRSize(handle, <o_size) - check_status(status) + __status__ = cublasdxGetLTOIRSize(handle, <o_size) + check_status(__status__) return lto_size @@ -896,8 +880,8 @@ cpdef cublasdx_get_ltoir(long long int handle, size_t size, lto): """ cdef void* _lto_ = get_buffer_pointer(lto, size, readonly=False) with nogil: - status = cublasdxGetLTOIR(handle, size, _lto_) - check_status(status) + __status__ = cublasdxGetLTOIR(handle, size, _lto_) + check_status(__status__) cpdef size_t cublasdx_get_trait_str_size(long long int handle, int trait) except? 0: @@ -914,8 +898,8 @@ cpdef size_t cublasdx_get_trait_str_size(long long int handle, int trait) except """ cdef size_t size with nogil: - status = cublasdxGetTraitStrSize(handle, <_CublasdxTraitType>trait, &size) - check_status(status) + __status__ = cublasdxGetTraitStrSize(handle, <_CublasdxTraitType>trait, &size) + check_status(__status__) return size @@ -932,8 +916,8 @@ cpdef cublasdx_get_trait_str(long long int handle, int trait, size_t size, value """ cdef void* _value_ = get_buffer_pointer(value, size, readonly=False) with nogil: - status = cublasdxGetTraitStr(handle, <_CublasdxTraitType>trait, size, _value_) - check_status(status) + __status__ = cublasdxGetTraitStr(handle, <_CublasdxTraitType>trait, size, _value_) + check_status(__status__) cpdef long long int cublasdx_get_trait_int64(long long int handle, int trait) except? 
0: @@ -950,8 +934,8 @@ cpdef long long int cublasdx_get_trait_int64(long long int handle, int trait) ex """ cdef long long int value with nogil: - status = cublasdxGetTraitInt64(handle, <_CublasdxTraitType>trait, &value) - check_status(status) + __status__ = cublasdxGetTraitInt64(handle, <_CublasdxTraitType>trait, &value) + check_status(__status__) return value @@ -973,8 +957,8 @@ cpdef cublasdx_get_trait_int64s(long long int handle, int trait, size_t count, a cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = cublasdxGetTraitInt64s(handle, <_CublasdxTraitType>trait, count, (_array_.data())) - check_status(status) + __status__ = cublasdxGetTraitInt64s(handle, <_CublasdxTraitType>trait, count, (_array_.data())) + check_status(__status__) cpdef str cublasdx_operator_type_to_str(int op): @@ -1013,8 +997,8 @@ cpdef cublasdx_finalize_code(long long int code, long long int handle): .. seealso:: `cublasdxFinalizeCode` """ with nogil: - status = cublasdxFinalizeCode(code, handle) - check_status(status) + __status__ = cublasdxFinalizeCode(code, handle) + check_status(__status__) cpdef cublasdx_destroy_descriptor(long long int handle): @@ -1026,8 +1010,8 @@ cpdef cublasdx_destroy_descriptor(long long int handle): .. seealso:: `cublasdxDestroyDescriptor` """ with nogil: - status = cublasdxDestroyDescriptor(handle) - check_status(status) + __status__ = cublasdxDestroyDescriptor(handle) + check_status(__status__) cpdef long long int cufftdx_create_descriptor() except? 0: @@ -1040,8 +1024,8 @@ cpdef long long int cufftdx_create_descriptor() except? 0: """ cdef cufftdxDescriptor handle with nogil: - status = cufftdxCreateDescriptor(&handle) - check_status(status) + __status__ = cufftdxCreateDescriptor(&handle) + check_status(__status__) return handle @@ -1060,8 +1044,8 @@ cpdef cufftdx_set_option_str(long long int handle, int opt, value): cdef bytes _temp_value_ = (value).encode() cdef char* _value_ = _temp_value_ with nogil: - status = cufftdxSetOptionStr(handle, <_CommondxOption>opt, _value_) - check_status(status) + __status__ = cufftdxSetOptionStr(handle, <_CommondxOption>opt, _value_) + check_status(__status__) cpdef size_t cufftdx_get_knob_int64size(long long int handle, size_t num_knobs, knobs_ptr) except? 0: @@ -1085,8 +1069,8 @@ cpdef size_t cufftdx_get_knob_int64size(long long int handle, size_t num_knobs, get_resource_ptr[int](_knobs_ptr_, knobs_ptr, NULL) cdef size_t size with nogil: - status = cufftdxGetKnobInt64Size(handle, num_knobs, <_CufftdxKnobType*>(_knobs_ptr_.data()), &size) - check_status(status) + __status__ = cufftdxGetKnobInt64Size(handle, num_knobs, <_CufftdxKnobType*>(_knobs_ptr_.data()), &size) + check_status(__status__) return size @@ -1109,8 +1093,8 @@ cpdef cufftdx_get_knob_int64s(long long int handle, size_t num_knobs, knobs_ptr, cdef nullable_unique_ptr[ vector[int] ] _knobs_ptr_ get_resource_ptr[int](_knobs_ptr_, knobs_ptr, NULL) with nogil: - status = cufftdxGetKnobInt64s(handle, num_knobs, <_CufftdxKnobType*>(_knobs_ptr_.data()), size, values) - check_status(status) + __status__ = cufftdxGetKnobInt64s(handle, num_knobs, <_CufftdxKnobType*>(_knobs_ptr_.data()), size, values) + check_status(__status__) cpdef cufftdx_set_operator_int64(long long int handle, int op, long long int value): @@ -1124,8 +1108,8 @@ cpdef cufftdx_set_operator_int64(long long int handle, int op, long long int val .. 
seealso:: `cufftdxSetOperatorInt64` """ with nogil: - status = cufftdxSetOperatorInt64(handle, <_CufftdxOperatorType>op, value) - check_status(status) + __status__ = cufftdxSetOperatorInt64(handle, <_CufftdxOperatorType>op, value) + check_status(__status__) cpdef cufftdx_set_operator_int64s(long long int handle, int op, size_t count, array): @@ -1146,8 +1130,8 @@ cpdef cufftdx_set_operator_int64s(long long int handle, int op, size_t count, ar cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = cufftdxSetOperatorInt64s(handle, <_CufftdxOperatorType>op, count, (_array_.data())) - check_status(status) + __status__ = cufftdxSetOperatorInt64s(handle, <_CufftdxOperatorType>op, count, (_array_.data())) + check_status(__status__) cpdef size_t cufftdx_get_ltoir_size(long long int handle) except? 0: @@ -1163,8 +1147,8 @@ cpdef size_t cufftdx_get_ltoir_size(long long int handle) except? 0: """ cdef size_t lto_size with nogil: - status = cufftdxGetLTOIRSize(handle, <o_size) - check_status(status) + __status__ = cufftdxGetLTOIRSize(handle, <o_size) + check_status(__status__) return lto_size @@ -1180,8 +1164,8 @@ cpdef cufftdx_get_ltoir(long long int handle, size_t size, lto): """ cdef void* _lto_ = get_buffer_pointer(lto, size, readonly=False) with nogil: - status = cufftdxGetLTOIR(handle, size, _lto_) - check_status(status) + __status__ = cufftdxGetLTOIR(handle, size, _lto_) + check_status(__status__) cpdef size_t cufftdx_get_trait_str_size(long long int handle, int trait) except? 0: @@ -1198,8 +1182,8 @@ cpdef size_t cufftdx_get_trait_str_size(long long int handle, int trait) except? """ cdef size_t size with nogil: - status = cufftdxGetTraitStrSize(handle, <_CufftdxTraitType>trait, &size) - check_status(status) + __status__ = cufftdxGetTraitStrSize(handle, <_CufftdxTraitType>trait, &size) + check_status(__status__) return size @@ -1216,8 +1200,8 @@ cpdef cufftdx_get_trait_str(long long int handle, int trait, size_t size, value) """ cdef void* _value_ = get_buffer_pointer(value, size, readonly=False) with nogil: - status = cufftdxGetTraitStr(handle, <_CufftdxTraitType>trait, size, _value_) - check_status(status) + __status__ = cufftdxGetTraitStr(handle, <_CufftdxTraitType>trait, size, _value_) + check_status(__status__) cpdef long long int cufftdx_get_trait_int64(long long int handle, int trait) except? 0: @@ -1234,8 +1218,8 @@ cpdef long long int cufftdx_get_trait_int64(long long int handle, int trait) exc """ cdef long long int value with nogil: - status = cufftdxGetTraitInt64(handle, <_CufftdxTraitType>trait, &value) - check_status(status) + __status__ = cufftdxGetTraitInt64(handle, <_CufftdxTraitType>trait, &value) + check_status(__status__) return value @@ -1257,8 +1241,8 @@ cpdef cufftdx_get_trait_int64s(long long int handle, int trait, size_t count, ar cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = cufftdxGetTraitInt64s(handle, <_CufftdxTraitType>trait, count, (_array_.data())) - check_status(status) + __status__ = cufftdxGetTraitInt64s(handle, <_CufftdxTraitType>trait, count, (_array_.data())) + check_status(__status__) cpdef int cufftdx_get_trait_commondx_data_type(long long int handle, int trait) except? 
-1: @@ -1275,8 +1259,8 @@ cpdef int cufftdx_get_trait_commondx_data_type(long long int handle, int trait) """ cdef _CommondxValueType value with nogil: - status = cufftdxGetTraitCommondxDataType(handle, <_CufftdxTraitType>trait, &value) - check_status(status) + __status__ = cufftdxGetTraitCommondxDataType(handle, <_CufftdxTraitType>trait, &value) + check_status(__status__) return value @@ -1290,8 +1274,8 @@ cpdef cufftdx_finalize_code(long long int code, long long int handle): .. seealso:: `cufftdxFinalizeCode` """ with nogil: - status = cufftdxFinalizeCode(code, handle) - check_status(status) + __status__ = cufftdxFinalizeCode(code, handle) + check_status(__status__) cpdef cufftdx_destroy_descriptor(long long int handle): @@ -1303,8 +1287,8 @@ cpdef cufftdx_destroy_descriptor(long long int handle): .. seealso:: `cufftdxDestroyDescriptor` """ with nogil: - status = cufftdxDestroyDescriptor(handle) - check_status(status) + __status__ = cufftdxDestroyDescriptor(handle) + check_status(__status__) cpdef str cufftdx_operator_type_to_str(int op): @@ -1343,8 +1327,8 @@ cpdef long long int cusolverdx_create_descriptor() except? 0: """ cdef cusolverdxDescriptor handle with nogil: - status = cusolverdxCreateDescriptor(&handle) - check_status(status) + __status__ = cusolverdxCreateDescriptor(&handle) + check_status(__status__) return handle @@ -1363,8 +1347,8 @@ cpdef cusolverdx_set_option_str(long long int handle, int opt, value): cdef bytes _temp_value_ = (value).encode() cdef char* _value_ = _temp_value_ with nogil: - status = cusolverdxSetOptionStr(handle, <_CommondxOption>opt, _value_) - check_status(status) + __status__ = cusolverdxSetOptionStr(handle, <_CommondxOption>opt, _value_) + check_status(__status__) cpdef cusolverdx_set_operator_int64(long long int handle, int op, long long int value): @@ -1378,8 +1362,8 @@ cpdef cusolverdx_set_operator_int64(long long int handle, int op, long long int .. seealso:: `cusolverdxSetOperatorInt64` """ with nogil: - status = cusolverdxSetOperatorInt64(handle, <_CusolverdxOperatorType>op, value) - check_status(status) + __status__ = cusolverdxSetOperatorInt64(handle, <_CusolverdxOperatorType>op, value) + check_status(__status__) cpdef cusolverdx_set_operator_int64s(long long int handle, int op, size_t count, array): @@ -1400,8 +1384,8 @@ cpdef cusolverdx_set_operator_int64s(long long int handle, int op, size_t count, cdef nullable_unique_ptr[ vector[int64_t] ] _array_ get_resource_ptr[int64_t](_array_, array, NULL) with nogil: - status = cusolverdxSetOperatorInt64s(handle, <_CusolverdxOperatorType>op, count, (_array_.data())) - check_status(status) + __status__ = cusolverdxSetOperatorInt64s(handle, <_CusolverdxOperatorType>op, count, (_array_.data())) + check_status(__status__) cpdef size_t cusolverdx_get_ltoir_size(long long int handle) except? 0: @@ -1417,8 +1401,8 @@ cpdef size_t cusolverdx_get_ltoir_size(long long int handle) except? 0: """ cdef size_t lto_size with nogil: - status = cusolverdxGetLTOIRSize(handle, <o_size) - check_status(status) + __status__ = cusolverdxGetLTOIRSize(handle, <o_size) + check_status(__status__) return lto_size @@ -1434,8 +1418,8 @@ cpdef cusolverdx_get_ltoir(long long int handle, size_t size, lto): """ cdef void* _lto_ = get_buffer_pointer(lto, size, readonly=False) with nogil: - status = cusolverdxGetLTOIR(handle, size, _lto_) - check_status(status) + __status__ = cusolverdxGetLTOIR(handle, size, _lto_) + check_status(__status__) cpdef size_t cusolverdx_get_universal_fatbin_size(long long int handle) except? 
0: @@ -1451,8 +1435,8 @@ cpdef size_t cusolverdx_get_universal_fatbin_size(long long int handle) except? """ cdef size_t fatbin_size with nogil: - status = cusolverdxGetUniversalFATBINSize(handle, &fatbin_size) - check_status(status) + __status__ = cusolverdxGetUniversalFATBINSize(handle, &fatbin_size) + check_status(__status__) return fatbin_size @@ -1468,8 +1452,8 @@ cpdef cusolverdx_get_universal_fatbin(long long int handle, size_t fatbin_size, """ cdef void* _fatbin_ = get_buffer_pointer(fatbin, fatbin_size, readonly=False) with nogil: - status = cusolverdxGetUniversalFATBIN(handle, fatbin_size, _fatbin_) - check_status(status) + __status__ = cusolverdxGetUniversalFATBIN(handle, fatbin_size, _fatbin_) + check_status(__status__) cpdef size_t cusolverdx_get_trait_str_size(long long int handle, int trait) except? 0: @@ -1486,8 +1470,8 @@ cpdef size_t cusolverdx_get_trait_str_size(long long int handle, int trait) exce """ cdef size_t size with nogil: - status = cusolverdxGetTraitStrSize(handle, <_CusolverdxTraitType>trait, &size) - check_status(status) + __status__ = cusolverdxGetTraitStrSize(handle, <_CusolverdxTraitType>trait, &size) + check_status(__status__) return size @@ -1504,8 +1488,8 @@ cpdef cusolverdx_get_trait_str(long long int handle, int trait, size_t size, val """ cdef void* _value_ = get_buffer_pointer(value, size, readonly=False) with nogil: - status = cusolverdxGetTraitStr(handle, <_CusolverdxTraitType>trait, size, _value_) - check_status(status) + __status__ = cusolverdxGetTraitStr(handle, <_CusolverdxTraitType>trait, size, _value_) + check_status(__status__) cpdef long long int cusolverdx_get_trait_int64(long long int handle, int trait) except? 0: @@ -1522,8 +1506,8 @@ cpdef long long int cusolverdx_get_trait_int64(long long int handle, int trait) """ cdef long long int value with nogil: - status = cusolverdxGetTraitInt64(handle, <_CusolverdxTraitType>trait, &value) - check_status(status) + __status__ = cusolverdxGetTraitInt64(handle, <_CusolverdxTraitType>trait, &value) + check_status(__status__) return value @@ -1537,8 +1521,8 @@ cpdef cusolverdx_finalize_code(long long int code, long long int handle): .. seealso:: `cusolverdxFinalizeCode` """ with nogil: - status = cusolverdxFinalizeCode(code, handle) - check_status(status) + __status__ = cusolverdxFinalizeCode(code, handle) + check_status(__status__) cpdef cusolverdx_destroy_descriptor(long long int handle): @@ -1550,8 +1534,8 @@ cpdef cusolverdx_destroy_descriptor(long long int handle): .. seealso:: `cusolverdxDestroyDescriptor` """ with nogil: - status = cusolverdxDestroyDescriptor(handle) - check_status(status) + __status__ = cusolverdxDestroyDescriptor(handle) + check_status(__status__) cpdef str cusolverdx_operator_type_to_str(int op): @@ -1578,3 +1562,146 @@ cpdef str cusolverdx_trait_type_to_str(int trait): cdef bytes _output_ _output_ = cusolverdxTraitTypeToStr(<_CusolverdxTraitType>trait) return _output_.decode() + + +cpdef long long int cublasdx_create_tensor_new(long long int handle, int tensor_type) except? 0: + cdef cublasdxTensor tensor + with nogil: + __status__ = cublasdxCreateTensorNew(handle, <_CublasdxTensorType>tensor_type, &tensor) + check_status(__status__) + return tensor + + +cpdef long long int cublasdx_make_tensor_like(long long int input, int value_type) except? 0: + """Create an opaque tensor with a identical layout (smem/gmem) or partitioner (rmem), but with a different datatype. + + Args: + input (long long int): An opaque tensors. + value_type (CommondxValueType): The new datatype. 
+ + Returns: + long long int: The output tensor. + + .. seealso:: `cublasdxMakeTensorLike` + """ + cdef cublasdxTensor output + with nogil: + __status__ = cublasdxMakeTensorLike(input, <_CommondxValueType>value_type, &output) + check_status(__status__) + return output + + +cpdef cublasdx_destroy_tensor_new(long long int tensor): + with nogil: + __status__ = cublasdxDestroyTensorNew(tensor) + check_status(__status__) + + +cpdef long long int cublasdx_create_device_function_new(long long int handle, int device_function_type, size_t count, array) except? 0: + cdef nullable_unique_ptr[ vector[int64_t] ] _array_ + get_resource_ptr[int64_t](_array_, array, NULL) + cdef cublasdxDeviceFunction device_function + with nogil: + __status__ = cublasdxCreateDeviceFunctionNew(handle, <_CublasdxDeviceFunctionType>device_function_type, count, (_array_.data()), &device_function) + check_status(__status__) + return device_function + + +cpdef cublasdx_destroy_device_function_new(long long int device_function): + with nogil: + __status__ = cublasdxDestroyDeviceFunctionNew(device_function) + check_status(__status__) + +cpdef cublasdx_finalize_tensors203(long long int handle, size_t count, array): + """Finalize the tensors. This is required before traits can be queried. + + Args: + handle (long long int): A cuBLASDx descriptor, output of cublasdxCreateDescriptor. + count (size_t): The number of tensors to finalized. + array (object): The array of tensors. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``cublasdxTensor``. + + + .. seealso:: `cublasdxFinalizeTensors` + """ + cdef nullable_unique_ptr[ vector[int64_t] ] _array_ + get_resource_ptr[int64_t](_array_, array, NULL) + with nogil: + __status__ = cublasdxFinalizeTensors203(handle, count, (_array_.data())) + check_status(__status__) + +cpdef cublasdx_finalize_tensors(long long int handle, size_t count, array): + """Finalize the tensors. This is required before traits can be queried. + + Args: + handle (long long int): A cuBLASDx descriptor, output of cublasdxCreateDescriptor. + count (size_t): The number of tensors to finalized. + array (object): The array of tensors. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``cublasdxTensor``. + + + .. seealso:: `cublasdxFinalizeTensors` + """ + if get_version_ex() < (0, 3, 0): + return cublasdx_finalize_tensors203(handle, count, array) + else: + return cublasdx_finalize_tensors_new(count, array) + +cpdef long long int cublasdx_create_device_function(long long int handle, int device_function_type, size_t count, array) except? 0: + """Binds (aka create) a device function from a set of tensor. + + Args: + handle (long long int): A cuBLASDx descriptor, output of cublasdxCreateDescriptor. + device_function_type (CublasdxDeviceFunctionType): The device function to create. + count (size_t): The number of input & output tensors to the device function. + array (object): The array of input & output tensors. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``cublasdxTensor``. + + + Returns: + long long int: The device function. + + .. 
seealso:: `cublasdxCreateDeviceFunction` + """ + if get_version_ex() < (0, 3, 0): + return cublasdx_create_device_function_old(handle, device_function_type, count, array) + else: + return cublasdx_create_device_function_new(handle, device_function_type, count, array) + +cpdef cublasdx_destroy_device_function(long long int device_function): + if get_version_ex() >= (0, 3, 0): + cublasdx_destroy_device_function_new(device_function) + +cpdef cublasdx_destroy_tensor(long long int tensor): + """Destroys a tensor handle created using cublasdxCreateTensor or cublasdxMakeTensorLike. + + Args: + tensor (long long int): The tensor to destroy. + + .. seealso:: `cublasdxDestroyTensor` + """ + if get_version_ex() >= (0, 3, 0): + cublasdx_destroy_tensor_new(tensor) + +cpdef long long int cublasdx_create_tensor(long long int handle, int tensor_type) except? 0: + """Create a tensor handle. + + Args: + handle (long long int): A cuBLASDx descriptor, output of cublasdxCreateDescriptor. + tensor_type (CublasdxTensorType): The tensor type to bind to the handle. + + Returns: + long long int: A valid tensor handle. + + .. seealso:: `cublasdxCreateTensor` + """ + if get_version_ex() < (0, 3, 0): + return cublasdx_bind_tensor(handle, tensor_type) + else: + return cublasdx_create_tensor_new(handle, tensor_type) diff --git a/nvmath/bindings/nccl.pxd b/nvmath/bindings/nccl.pxd new file mode 100644 index 0000000..d62b31c --- /dev/null +++ b/nvmath/bindings/nccl.pxd @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 2.11.4 to 2.28.3. Do not modify it directly. + +from libc.stdint cimport intptr_t + +from .cynccl cimport * + + +############################################################################### +# Types +############################################################################### + +ctypedef ncclComm_t Comm + +ctypedef cudaStream_t Stream + + +############################################################################### +# Enum +############################################################################### + +ctypedef ncclResult_t _Result + + +############################################################################### +# Functions +############################################################################### + +cpdef int get_version() except? -1 +cpdef get_unique_id(intptr_t unique_id) +cpdef intptr_t comm_init_rank(int nranks, comm_id, int rank) except? 0 +cpdef comm_destroy(intptr_t comm) +cpdef comm_abort(intptr_t comm) +cpdef str get_error_string(int result) +cpdef int comm_count(intptr_t comm) except? -1 +cpdef int comm_cu_device(intptr_t comm) except? -1 +cpdef int comm_user_rank(intptr_t comm) except? -1 +cpdef str get_last_error(intptr_t comm) +cpdef comm_finalize(intptr_t comm) diff --git a/nvmath/bindings/nccl.pyx b/nvmath/bindings/nccl.pyx new file mode 100644 index 0000000..3b5e4a8 --- /dev/null +++ b/nvmath/bindings/nccl.pyx @@ -0,0 +1,251 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated across versions from 2.11.4 to 2.28.3. Do not modify it directly. 
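+#
+# NOTE: a minimal usage sketch of these wrappers (illustrative only; it assumes
+# nvmath-python is installed so this module is importable as
+# ``nvmath.bindings.nccl`` and that an NCCL shared library can be loaded):
+#
+#     from nvmath.bindings import nccl
+#     print(nccl.get_version())          # NCCL version encoded as an int
+#     uid = nccl.UniqueId()              # empty ncclUniqueId struct wrapper
+#     nccl.get_unique_id(uid.ptr)        # fill it in place, for ncclCommInitRank
+#     print(nccl.get_error_string(nccl.Result.SystemError.value))
+#
+# Non-zero result codes are raised as NCCLError exceptions via check_status() below.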
+ +cimport cython # NOQA +from cpython cimport buffer as _buffer +from cpython.memoryview cimport PyMemoryView_FromMemory + +from ._internal.utils cimport get_buffer_pointer + +from enum import IntEnum as _IntEnum + +import numpy as _numpy + + +############################################################################### +# POD +############################################################################### + +unique_id_dtype = _numpy.dtype([ + ("internal", _numpy.int8, (128,)), + ], align=True) + + +cdef class UniqueId: + """Empty-initialize an array of `ncclUniqueId`. + + The resulting object is of length `size` and of dtype `unique_id_dtype`. + If default-constructed, the instance represents a single struct. + + Args: + size (int): number of structs, default=1. + + + .. seealso:: `ncclUniqueId` + """ + cdef: + readonly object _data + + def __init__(self, size=1): + arr = _numpy.empty(size, dtype=unique_id_dtype) + self._data = arr.view(_numpy.recarray) + assert self._data.itemsize == sizeof(ncclUniqueId), \ + f"itemsize {self._data.itemsize} mismatches struct size {sizeof(ncclUniqueId)}" + + def __repr__(self): + if self._data.size > 1: + return f"<{__name__}.UniqueId_Array_{self._data.size} object at {hex(id(self))}>" + else: + return f"<{__name__}.UniqueId object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return self._data.ctypes.data + + def __int__(self): + if self._data.size > 1: + raise TypeError("int() argument must be a bytes-like object of size 1. " + "To get the pointer address of an array, use .ptr") + return self._data.ctypes.data + + def __len__(self): + return self._data.size + + def __eq__(self, other): + if not isinstance(other, UniqueId): + return False + if self._data.size != other._data.size: + return False + if self._data.dtype != other._data.dtype: + return False + return bool((self._data == other._data).all()) + + def __getitem__(self, key): + if isinstance(key, int): + size = self._data.size + if key >= size or key <= -(size+1): + raise IndexError("index is out of bounds") + if key < 0: + key += size + return UniqueId.from_data(self._data[key:key+1]) + out = self._data[key] + if isinstance(out, _numpy.recarray) and out.dtype == unique_id_dtype: + return UniqueId.from_data(out) + return out + + def __setitem__(self, key, val): + self._data[key] = val + + @staticmethod + def from_data(data): + """Create an UniqueId instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a 1D array of dtype `unique_id_dtype` holding the data. + """ + cdef UniqueId obj = UniqueId.__new__(UniqueId) + if not isinstance(data, (_numpy.ndarray, _numpy.recarray)): + raise TypeError("data argument must be a NumPy ndarray") + if data.ndim != 1: + raise ValueError("data array must be 1D") + if data.dtype != unique_id_dtype: + raise ValueError("data array must be of dtype unique_id_dtype") + obj._data = data.view(_numpy.recarray) + + return obj + + @staticmethod + def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False): + """Create an UniqueId instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + size (int): number of structs, default=1. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef UniqueId obj = UniqueId.__new__(UniqueId) + cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE + cdef object buf = PyMemoryView_FromMemory( + ptr, sizeof(ncclUniqueId) * size, flag) + data = _numpy.ndarray((size,), buffer=buf, + dtype=unique_id_dtype) + obj._data = data.view(_numpy.recarray) + + return obj + + + +############################################################################### +# Enum +############################################################################### + +class Result(_IntEnum): + """See `ncclResult_t`.""" + Success = ncclSuccess + UnhandledCudaError = ncclUnhandledCudaError + SystemError = ncclSystemError + InternalError = ncclInternalError + InvalidArgument = ncclInvalidArgument + InvalidUsage = ncclInvalidUsage + RemoteError = ncclRemoteError + InProgress = ncclInProgress + NumResults = ncclNumResults + + +############################################################################### +# Error handling +############################################################################### + +class NCCLError(Exception): + + def __init__(self, status): + self.status = status + s = Result(status) + cdef str err = f"{s.name} ({s.value}): {get_error_string(status)}" + super(NCCLError, self).__init__(err) + + def __reduce__(self): + return (type(self), (self.status,)) + + +@cython.profile(False) +cpdef inline check_status(int status): + if status != 0: + raise NCCLError(status) + + +############################################################################### +# Wrapper functions +############################################################################### + +cpdef int get_version() except? -1: + cdef int version + with nogil: + status = ncclGetVersion(&version) + check_status(status) + return version + + +cpdef get_unique_id(intptr_t unique_id): + with nogil: + status = ncclGetUniqueId(unique_id) + check_status(status) + + +cpdef intptr_t comm_init_rank(int nranks, comm_id, int rank) except? 0: + cdef void* _comm_id_ = get_buffer_pointer(comm_id, -1, readonly=False) + cdef Comm comm + with nogil: + status = ncclCommInitRank(&comm, nranks, ((_comm_id_))[0], rank) + check_status(status) + return comm + + +cpdef comm_destroy(intptr_t comm): + with nogil: + status = ncclCommDestroy(comm) + check_status(status) + + +cpdef comm_abort(intptr_t comm): + with nogil: + status = ncclCommAbort(comm) + check_status(status) + + +cpdef str get_error_string(int result): + cdef bytes _output_ + _output_ = ncclGetErrorString(<_Result>result) + return _output_.decode() + + +cpdef int comm_count(intptr_t comm) except? -1: + cdef int count + with nogil: + status = ncclCommCount(comm, &count) + check_status(status) + return count + + +cpdef int comm_cu_device(intptr_t comm) except? -1: + cdef int device + with nogil: + status = ncclCommCuDevice(comm, &device) + check_status(status) + return device + + +cpdef int comm_user_rank(intptr_t comm) except? 
-1: + cdef int rank + with nogil: + status = ncclCommUserRank(comm, &rank) + check_status(status) + return rank + + +cpdef str get_last_error(intptr_t comm): + cdef bytes _output_ + _output_ = ncclGetLastError(comm) + return _output_.decode() + + +cpdef comm_finalize(intptr_t comm): + with nogil: + status = ncclCommFinalize(comm) + check_status(status) diff --git a/nvmath/bindings/nvpl/__init__.py b/nvmath/bindings/nvpl/__init__.py index 6172ecb..1d78271 100644 --- a/nvmath/bindings/nvpl/__init__.py +++ b/nvmath/bindings/nvpl/__init__.py @@ -5,7 +5,9 @@ # type: ignore from . import fft +from . import blas __all__ = [ "fft", + "blas", ] diff --git a/nvmath/bindings/nvpl/_internal/blas.pxd b/nvmath/bindings/nvpl/_internal/blas.pxd new file mode 100644 index 0000000..219506a --- /dev/null +++ b/nvmath/bindings/nvpl/_internal/blas.pxd @@ -0,0 +1,125 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 0.4.1. Do not modify it directly. + +from ..cyblas cimport * + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef int _MKL_mkl_set_num_threads_local(int nth) except?-42 nogil +cdef void _MKL_mkl_set_num_threads(int nth) except* nogil +cdef void _openblas_openblas_set_num_threads(int num_threads) except* nogil +cdef int _openblas_openblas_set_num_threads_local(int num_threads) except?-42 nogil +cdef int _nvpl_blas_get_version() except?-42 nogil +cdef int _nvpl_blas_get_max_threads() except?-42 nogil +cdef void _nvpl_blas_set_num_threads(int nthr) except* nogil +cdef int _nvpl_blas_set_num_threads_local(int nthr_local) except?-42 nogil +cdef void _cblas_sgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_sgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_strmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_stbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_stpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_strsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_stbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, 
float* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_stpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_dgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_dgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_dtrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_dtbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_dtpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_dtrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_dtbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_dtpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_cgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_cgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_ctrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ctbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ctpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void 
_cblas_ctrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ctbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ctpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_zgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_zgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_ztrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ztbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ztpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ztrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ztbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ztpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void _cblas_ssymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_ssbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_sspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* Ap, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_sger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, 
const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_ssyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_sspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* Ap) except* nogil +cdef void _cblas_ssyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_sspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A) except* nogil +cdef void _cblas_dsymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_dsbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_dspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* Ap, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_dger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_dsyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_dspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* Ap) except* nogil +cdef void _cblas_dsyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_dspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A) except* nogil +cdef void _cblas_chemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_chbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_chpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_cgeru(const CBLAS_ORDER order, const nvpl_int_t M, const 
nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_cgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_cher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_chpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil +cdef void _cblas_cher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_chpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil +cdef void _cblas_zhemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_zhbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_zhpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void _cblas_zgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_zgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_zher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_zhpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil +cdef void _cblas_zher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void _cblas_zhpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil +cdef void _cblas_sgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void 
_cblas_ssymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_ssyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_ssyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_strmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_strsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_dgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_dsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_dsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_dsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_dtrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_dtrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_cgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef 
void _cblas_csymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_csyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_csyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_ctrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_ctrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_zgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_zsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_zsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_zsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_ztrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_ztrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void _cblas_chemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_cherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const 
CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const void* A, const nvpl_int_t lda, const float beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_cher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const float beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_zhemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_zherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const void* A, const nvpl_int_t lda, const double beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_zher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const double beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void _cblas_sgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const float* alpha_array, const float** A_array, nvpl_int_t* lda_array, const float** B_array, nvpl_int_t* ldb_array, const float* beta_array, float** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void _cblas_dgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const double* alpha_array, const double** A_array, nvpl_int_t* lda_array, const double** B_array, nvpl_int_t* ldb_array, const double* beta_array, double** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void _cblas_cgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void _cblas_zgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void _cblas_sgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const nvpl_int_t stridea, const float* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const float beta, float* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil +cdef void _cblas_dgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const 
CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const nvpl_int_t stridea, const double* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const double beta, double* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil +cdef void _cblas_cgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil +cdef void _cblas_zgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil diff --git a/nvmath/bindings/nvpl/_internal/blas_linux.pyx b/nvmath/bindings/nvpl/_internal/blas_linux.pyx new file mode 100644 index 0000000..e921896 --- /dev/null +++ b/nvmath/bindings/nvpl/_internal/blas_linux.pyx @@ -0,0 +1,2594 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 0.4.1. Do not modify it directly. + +import os + +cimport cython +from libc.stdint cimport intptr_t + +from ..._internal.utils import FunctionNotFoundError, NotSupportedError + +import threading + +############################################################################### +# Extern +############################################################################### + +cdef extern from "" nogil: + void* dlopen(const char*, int) + char* dlerror() + void* dlsym(void*, const char*) + int dlclose(void*) + + enum: + RTLD_LAZY + RTLD_NOW + RTLD_GLOBAL + RTLD_LOCAL + + const void* RTLD_DEFAULT 'RTLD_DEFAULT' + + +############################################################################### +# Wrapper init +############################################################################### + +cdef object __symbol_lock = threading.Lock() +cdef bint __py_nvpl_blas_init = False +cdef str __current_so_name = "" +cdef tuple __lib_so_names = ("libnvpl_blas_ilp64_gomp.so.0", "libmkl_rt.so.2", "libopenblas.so.0",) + + +cdef void* __MKL_Set_Num_Threads_Local = NULL +cdef void* __MKL_Set_Num_Threads = NULL +cdef void* __openblas_set_num_threads = NULL +cdef void* __openblas_set_num_threads_local = NULL +cdef void* __nvpl_blas_get_version = NULL +cdef void* __nvpl_blas_get_max_threads = NULL +cdef void* __nvpl_blas_set_num_threads = NULL +cdef void* __nvpl_blas_set_num_threads_local = NULL +cdef void* __cblas_sgemv = NULL +cdef void* __cblas_sgbmv = NULL +cdef void* __cblas_strmv = NULL +cdef void* __cblas_stbmv = NULL +cdef void* __cblas_stpmv = NULL +cdef void* __cblas_strsv = NULL +cdef void* __cblas_stbsv = NULL +cdef void* __cblas_stpsv = NULL +cdef void* __cblas_dgemv = NULL +cdef void* __cblas_dgbmv = NULL +cdef void* __cblas_dtrmv = NULL +cdef void* __cblas_dtbmv = NULL +cdef void* __cblas_dtpmv = NULL +cdef void* __cblas_dtrsv = NULL +cdef void* __cblas_dtbsv = NULL 
+cdef void* __cblas_dtpsv = NULL +cdef void* __cblas_cgemv = NULL +cdef void* __cblas_cgbmv = NULL +cdef void* __cblas_ctrmv = NULL +cdef void* __cblas_ctbmv = NULL +cdef void* __cblas_ctpmv = NULL +cdef void* __cblas_ctrsv = NULL +cdef void* __cblas_ctbsv = NULL +cdef void* __cblas_ctpsv = NULL +cdef void* __cblas_zgemv = NULL +cdef void* __cblas_zgbmv = NULL +cdef void* __cblas_ztrmv = NULL +cdef void* __cblas_ztbmv = NULL +cdef void* __cblas_ztpmv = NULL +cdef void* __cblas_ztrsv = NULL +cdef void* __cblas_ztbsv = NULL +cdef void* __cblas_ztpsv = NULL +cdef void* __cblas_ssymv = NULL +cdef void* __cblas_ssbmv = NULL +cdef void* __cblas_sspmv = NULL +cdef void* __cblas_sger = NULL +cdef void* __cblas_ssyr = NULL +cdef void* __cblas_sspr = NULL +cdef void* __cblas_ssyr2 = NULL +cdef void* __cblas_sspr2 = NULL +cdef void* __cblas_dsymv = NULL +cdef void* __cblas_dsbmv = NULL +cdef void* __cblas_dspmv = NULL +cdef void* __cblas_dger = NULL +cdef void* __cblas_dsyr = NULL +cdef void* __cblas_dspr = NULL +cdef void* __cblas_dsyr2 = NULL +cdef void* __cblas_dspr2 = NULL +cdef void* __cblas_chemv = NULL +cdef void* __cblas_chbmv = NULL +cdef void* __cblas_chpmv = NULL +cdef void* __cblas_cgeru = NULL +cdef void* __cblas_cgerc = NULL +cdef void* __cblas_cher = NULL +cdef void* __cblas_chpr = NULL +cdef void* __cblas_cher2 = NULL +cdef void* __cblas_chpr2 = NULL +cdef void* __cblas_zhemv = NULL +cdef void* __cblas_zhbmv = NULL +cdef void* __cblas_zhpmv = NULL +cdef void* __cblas_zgeru = NULL +cdef void* __cblas_zgerc = NULL +cdef void* __cblas_zher = NULL +cdef void* __cblas_zhpr = NULL +cdef void* __cblas_zher2 = NULL +cdef void* __cblas_zhpr2 = NULL +cdef void* __cblas_sgemm = NULL +cdef void* __cblas_ssymm = NULL +cdef void* __cblas_ssyrk = NULL +cdef void* __cblas_ssyr2k = NULL +cdef void* __cblas_strmm = NULL +cdef void* __cblas_strsm = NULL +cdef void* __cblas_dgemm = NULL +cdef void* __cblas_dsymm = NULL +cdef void* __cblas_dsyrk = NULL +cdef void* __cblas_dsyr2k = NULL +cdef void* __cblas_dtrmm = NULL +cdef void* __cblas_dtrsm = NULL +cdef void* __cblas_cgemm = NULL +cdef void* __cblas_csymm = NULL +cdef void* __cblas_csyrk = NULL +cdef void* __cblas_csyr2k = NULL +cdef void* __cblas_ctrmm = NULL +cdef void* __cblas_ctrsm = NULL +cdef void* __cblas_zgemm = NULL +cdef void* __cblas_zsymm = NULL +cdef void* __cblas_zsyrk = NULL +cdef void* __cblas_zsyr2k = NULL +cdef void* __cblas_ztrmm = NULL +cdef void* __cblas_ztrsm = NULL +cdef void* __cblas_chemm = NULL +cdef void* __cblas_cherk = NULL +cdef void* __cblas_cher2k = NULL +cdef void* __cblas_zhemm = NULL +cdef void* __cblas_zherk = NULL +cdef void* __cblas_zher2k = NULL +cdef void* __cblas_sgemm_batch = NULL +cdef void* __cblas_dgemm_batch = NULL +cdef void* __cblas_cgemm_batch = NULL +cdef void* __cblas_zgemm_batch = NULL +cdef void* __cblas_sgemm_batch_strided = NULL +cdef void* __cblas_dgemm_batch_strided = NULL +cdef void* __cblas_cgemm_batch_strided = NULL +cdef void* __cblas_zgemm_batch_strided = NULL + + +cdef void* load_library() except* with gil: + cdef void* handle + cdef str all_err_msg = "" + cdef str env_lib_so_name = os.getenv("NVMATH_BLAS_CPU_LIBRARY", "") + + if env_lib_so_name != "": + handle = dlopen(env_lib_so_name.encode(), RTLD_NOW | RTLD_GLOBAL) + if handle != NULL: + global __current_so_name + __current_so_name = env_lib_so_name + else: + error_msg = dlerror() + raise RuntimeError( + f"Failed to dlopen NVMATH_BLAS_CPU_LIBRARY={env_lib_so_name}. 
" + f"Please check that NVMATH_BLAS_CPU_LIBRARY is the name of a lib on the LD_LIBRARY_PATH. {error_msg.decode()}" + ) + + if len(__lib_so_names) == 0: + raise RuntimeError("Cannot load BLAS-compatible library. No lib names were specified.") + for so_name in __lib_so_names: + handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) + if handle != NULL: + global __current_so_name + __current_so_name = so_name + break # stop at first successful open + else: + error_msg = dlerror() + all_err_msg += f"\n{error_msg.decode()}" + else: + all_libs = ", ".join(__lib_so_names) + raise RuntimeError( + f"Failed to dlopen all of the following libraries: {all_libs}. " + "Install/add one of these libraries to LD_LIBRARY_PATH or" + f"use environment variable NVMATH_BLAS_CPU_LIBRARY to name a lib on the LD_LIBRARY_PATH. {all_err_msg}" + ) + + +cdef int _check_or_init_nvpl_blas() except -1 nogil: + global __py_nvpl_blas_init + if __py_nvpl_blas_init: + return 0 + + cdef void* handle = NULL + + with gil, __symbol_lock: + # Load function + global __MKL_Set_Num_Threads_Local + __MKL_Set_Num_Threads_Local = dlsym(RTLD_DEFAULT, 'MKL_Set_Num_Threads_Local') + if __MKL_Set_Num_Threads_Local == NULL: + if handle == NULL: + handle = load_library() + __MKL_Set_Num_Threads_Local = dlsym(handle, 'MKL_Set_Num_Threads_Local') + + global __MKL_Set_Num_Threads + __MKL_Set_Num_Threads = dlsym(RTLD_DEFAULT, 'MKL_Set_Num_Threads') + if __MKL_Set_Num_Threads == NULL: + if handle == NULL: + handle = load_library() + __MKL_Set_Num_Threads = dlsym(handle, 'MKL_Set_Num_Threads') + + global __openblas_set_num_threads + __openblas_set_num_threads = dlsym(RTLD_DEFAULT, 'openblas_set_num_threads') + if __openblas_set_num_threads == NULL: + if handle == NULL: + handle = load_library() + __openblas_set_num_threads = dlsym(handle, 'openblas_set_num_threads') + + global __openblas_set_num_threads_local + __openblas_set_num_threads_local = dlsym(RTLD_DEFAULT, 'openblas_set_num_threads_local') + if __openblas_set_num_threads_local == NULL: + if handle == NULL: + handle = load_library() + __openblas_set_num_threads_local = dlsym(handle, 'openblas_set_num_threads_local') + + global __nvpl_blas_get_version + __nvpl_blas_get_version = dlsym(RTLD_DEFAULT, 'nvpl_blas_get_version') + if __nvpl_blas_get_version == NULL: + if handle == NULL: + handle = load_library() + __nvpl_blas_get_version = dlsym(handle, 'nvpl_blas_get_version') + + global __nvpl_blas_get_max_threads + __nvpl_blas_get_max_threads = dlsym(RTLD_DEFAULT, 'nvpl_blas_get_max_threads') + if __nvpl_blas_get_max_threads == NULL: + if handle == NULL: + handle = load_library() + __nvpl_blas_get_max_threads = dlsym(handle, 'nvpl_blas_get_max_threads') + + global __nvpl_blas_set_num_threads + __nvpl_blas_set_num_threads = dlsym(RTLD_DEFAULT, 'nvpl_blas_set_num_threads') + if __nvpl_blas_set_num_threads == NULL: + if handle == NULL: + handle = load_library() + __nvpl_blas_set_num_threads = dlsym(handle, 'nvpl_blas_set_num_threads') + + global __nvpl_blas_set_num_threads_local + __nvpl_blas_set_num_threads_local = dlsym(RTLD_DEFAULT, 'nvpl_blas_set_num_threads_local') + if __nvpl_blas_set_num_threads_local == NULL: + if handle == NULL: + handle = load_library() + __nvpl_blas_set_num_threads_local = dlsym(handle, 'nvpl_blas_set_num_threads_local') + + global __cblas_sgemv + __cblas_sgemv = dlsym(RTLD_DEFAULT, 'cblas_sgemv') + if __cblas_sgemv == NULL: + if handle == NULL: + handle = load_library() + __cblas_sgemv = dlsym(handle, 'cblas_sgemv') + + global __cblas_sgbmv + __cblas_sgbmv = 
dlsym(RTLD_DEFAULT, 'cblas_sgbmv') + if __cblas_sgbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_sgbmv = dlsym(handle, 'cblas_sgbmv') + + global __cblas_strmv + __cblas_strmv = dlsym(RTLD_DEFAULT, 'cblas_strmv') + if __cblas_strmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_strmv = dlsym(handle, 'cblas_strmv') + + global __cblas_stbmv + __cblas_stbmv = dlsym(RTLD_DEFAULT, 'cblas_stbmv') + if __cblas_stbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_stbmv = dlsym(handle, 'cblas_stbmv') + + global __cblas_stpmv + __cblas_stpmv = dlsym(RTLD_DEFAULT, 'cblas_stpmv') + if __cblas_stpmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_stpmv = dlsym(handle, 'cblas_stpmv') + + global __cblas_strsv + __cblas_strsv = dlsym(RTLD_DEFAULT, 'cblas_strsv') + if __cblas_strsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_strsv = dlsym(handle, 'cblas_strsv') + + global __cblas_stbsv + __cblas_stbsv = dlsym(RTLD_DEFAULT, 'cblas_stbsv') + if __cblas_stbsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_stbsv = dlsym(handle, 'cblas_stbsv') + + global __cblas_stpsv + __cblas_stpsv = dlsym(RTLD_DEFAULT, 'cblas_stpsv') + if __cblas_stpsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_stpsv = dlsym(handle, 'cblas_stpsv') + + global __cblas_dgemv + __cblas_dgemv = dlsym(RTLD_DEFAULT, 'cblas_dgemv') + if __cblas_dgemv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dgemv = dlsym(handle, 'cblas_dgemv') + + global __cblas_dgbmv + __cblas_dgbmv = dlsym(RTLD_DEFAULT, 'cblas_dgbmv') + if __cblas_dgbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dgbmv = dlsym(handle, 'cblas_dgbmv') + + global __cblas_dtrmv + __cblas_dtrmv = dlsym(RTLD_DEFAULT, 'cblas_dtrmv') + if __cblas_dtrmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtrmv = dlsym(handle, 'cblas_dtrmv') + + global __cblas_dtbmv + __cblas_dtbmv = dlsym(RTLD_DEFAULT, 'cblas_dtbmv') + if __cblas_dtbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtbmv = dlsym(handle, 'cblas_dtbmv') + + global __cblas_dtpmv + __cblas_dtpmv = dlsym(RTLD_DEFAULT, 'cblas_dtpmv') + if __cblas_dtpmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtpmv = dlsym(handle, 'cblas_dtpmv') + + global __cblas_dtrsv + __cblas_dtrsv = dlsym(RTLD_DEFAULT, 'cblas_dtrsv') + if __cblas_dtrsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtrsv = dlsym(handle, 'cblas_dtrsv') + + global __cblas_dtbsv + __cblas_dtbsv = dlsym(RTLD_DEFAULT, 'cblas_dtbsv') + if __cblas_dtbsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtbsv = dlsym(handle, 'cblas_dtbsv') + + global __cblas_dtpsv + __cblas_dtpsv = dlsym(RTLD_DEFAULT, 'cblas_dtpsv') + if __cblas_dtpsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtpsv = dlsym(handle, 'cblas_dtpsv') + + global __cblas_cgemv + __cblas_cgemv = dlsym(RTLD_DEFAULT, 'cblas_cgemv') + if __cblas_cgemv == NULL: + if handle == NULL: + handle = load_library() + __cblas_cgemv = dlsym(handle, 'cblas_cgemv') + + global __cblas_cgbmv + __cblas_cgbmv = dlsym(RTLD_DEFAULT, 'cblas_cgbmv') + if __cblas_cgbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_cgbmv = dlsym(handle, 'cblas_cgbmv') + + global __cblas_ctrmv + __cblas_ctrmv = dlsym(RTLD_DEFAULT, 'cblas_ctrmv') + if __cblas_ctrmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctrmv = dlsym(handle, 
'cblas_ctrmv') + + global __cblas_ctbmv + __cblas_ctbmv = dlsym(RTLD_DEFAULT, 'cblas_ctbmv') + if __cblas_ctbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctbmv = dlsym(handle, 'cblas_ctbmv') + + global __cblas_ctpmv + __cblas_ctpmv = dlsym(RTLD_DEFAULT, 'cblas_ctpmv') + if __cblas_ctpmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctpmv = dlsym(handle, 'cblas_ctpmv') + + global __cblas_ctrsv + __cblas_ctrsv = dlsym(RTLD_DEFAULT, 'cblas_ctrsv') + if __cblas_ctrsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctrsv = dlsym(handle, 'cblas_ctrsv') + + global __cblas_ctbsv + __cblas_ctbsv = dlsym(RTLD_DEFAULT, 'cblas_ctbsv') + if __cblas_ctbsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctbsv = dlsym(handle, 'cblas_ctbsv') + + global __cblas_ctpsv + __cblas_ctpsv = dlsym(RTLD_DEFAULT, 'cblas_ctpsv') + if __cblas_ctpsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctpsv = dlsym(handle, 'cblas_ctpsv') + + global __cblas_zgemv + __cblas_zgemv = dlsym(RTLD_DEFAULT, 'cblas_zgemv') + if __cblas_zgemv == NULL: + if handle == NULL: + handle = load_library() + __cblas_zgemv = dlsym(handle, 'cblas_zgemv') + + global __cblas_zgbmv + __cblas_zgbmv = dlsym(RTLD_DEFAULT, 'cblas_zgbmv') + if __cblas_zgbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_zgbmv = dlsym(handle, 'cblas_zgbmv') + + global __cblas_ztrmv + __cblas_ztrmv = dlsym(RTLD_DEFAULT, 'cblas_ztrmv') + if __cblas_ztrmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztrmv = dlsym(handle, 'cblas_ztrmv') + + global __cblas_ztbmv + __cblas_ztbmv = dlsym(RTLD_DEFAULT, 'cblas_ztbmv') + if __cblas_ztbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztbmv = dlsym(handle, 'cblas_ztbmv') + + global __cblas_ztpmv + __cblas_ztpmv = dlsym(RTLD_DEFAULT, 'cblas_ztpmv') + if __cblas_ztpmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztpmv = dlsym(handle, 'cblas_ztpmv') + + global __cblas_ztrsv + __cblas_ztrsv = dlsym(RTLD_DEFAULT, 'cblas_ztrsv') + if __cblas_ztrsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztrsv = dlsym(handle, 'cblas_ztrsv') + + global __cblas_ztbsv + __cblas_ztbsv = dlsym(RTLD_DEFAULT, 'cblas_ztbsv') + if __cblas_ztbsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztbsv = dlsym(handle, 'cblas_ztbsv') + + global __cblas_ztpsv + __cblas_ztpsv = dlsym(RTLD_DEFAULT, 'cblas_ztpsv') + if __cblas_ztpsv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztpsv = dlsym(handle, 'cblas_ztpsv') + + global __cblas_ssymv + __cblas_ssymv = dlsym(RTLD_DEFAULT, 'cblas_ssymv') + if __cblas_ssymv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ssymv = dlsym(handle, 'cblas_ssymv') + + global __cblas_ssbmv + __cblas_ssbmv = dlsym(RTLD_DEFAULT, 'cblas_ssbmv') + if __cblas_ssbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_ssbmv = dlsym(handle, 'cblas_ssbmv') + + global __cblas_sspmv + __cblas_sspmv = dlsym(RTLD_DEFAULT, 'cblas_sspmv') + if __cblas_sspmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_sspmv = dlsym(handle, 'cblas_sspmv') + + global __cblas_sger + __cblas_sger = dlsym(RTLD_DEFAULT, 'cblas_sger') + if __cblas_sger == NULL: + if handle == NULL: + handle = load_library() + __cblas_sger = dlsym(handle, 'cblas_sger') + + global __cblas_ssyr + __cblas_ssyr = dlsym(RTLD_DEFAULT, 'cblas_ssyr') + if __cblas_ssyr == NULL: + if handle == NULL: + handle = 
load_library() + __cblas_ssyr = dlsym(handle, 'cblas_ssyr') + + global __cblas_sspr + __cblas_sspr = dlsym(RTLD_DEFAULT, 'cblas_sspr') + if __cblas_sspr == NULL: + if handle == NULL: + handle = load_library() + __cblas_sspr = dlsym(handle, 'cblas_sspr') + + global __cblas_ssyr2 + __cblas_ssyr2 = dlsym(RTLD_DEFAULT, 'cblas_ssyr2') + if __cblas_ssyr2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_ssyr2 = dlsym(handle, 'cblas_ssyr2') + + global __cblas_sspr2 + __cblas_sspr2 = dlsym(RTLD_DEFAULT, 'cblas_sspr2') + if __cblas_sspr2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_sspr2 = dlsym(handle, 'cblas_sspr2') + + global __cblas_dsymv + __cblas_dsymv = dlsym(RTLD_DEFAULT, 'cblas_dsymv') + if __cblas_dsymv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dsymv = dlsym(handle, 'cblas_dsymv') + + global __cblas_dsbmv + __cblas_dsbmv = dlsym(RTLD_DEFAULT, 'cblas_dsbmv') + if __cblas_dsbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dsbmv = dlsym(handle, 'cblas_dsbmv') + + global __cblas_dspmv + __cblas_dspmv = dlsym(RTLD_DEFAULT, 'cblas_dspmv') + if __cblas_dspmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_dspmv = dlsym(handle, 'cblas_dspmv') + + global __cblas_dger + __cblas_dger = dlsym(RTLD_DEFAULT, 'cblas_dger') + if __cblas_dger == NULL: + if handle == NULL: + handle = load_library() + __cblas_dger = dlsym(handle, 'cblas_dger') + + global __cblas_dsyr + __cblas_dsyr = dlsym(RTLD_DEFAULT, 'cblas_dsyr') + if __cblas_dsyr == NULL: + if handle == NULL: + handle = load_library() + __cblas_dsyr = dlsym(handle, 'cblas_dsyr') + + global __cblas_dspr + __cblas_dspr = dlsym(RTLD_DEFAULT, 'cblas_dspr') + if __cblas_dspr == NULL: + if handle == NULL: + handle = load_library() + __cblas_dspr = dlsym(handle, 'cblas_dspr') + + global __cblas_dsyr2 + __cblas_dsyr2 = dlsym(RTLD_DEFAULT, 'cblas_dsyr2') + if __cblas_dsyr2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_dsyr2 = dlsym(handle, 'cblas_dsyr2') + + global __cblas_dspr2 + __cblas_dspr2 = dlsym(RTLD_DEFAULT, 'cblas_dspr2') + if __cblas_dspr2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_dspr2 = dlsym(handle, 'cblas_dspr2') + + global __cblas_chemv + __cblas_chemv = dlsym(RTLD_DEFAULT, 'cblas_chemv') + if __cblas_chemv == NULL: + if handle == NULL: + handle = load_library() + __cblas_chemv = dlsym(handle, 'cblas_chemv') + + global __cblas_chbmv + __cblas_chbmv = dlsym(RTLD_DEFAULT, 'cblas_chbmv') + if __cblas_chbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_chbmv = dlsym(handle, 'cblas_chbmv') + + global __cblas_chpmv + __cblas_chpmv = dlsym(RTLD_DEFAULT, 'cblas_chpmv') + if __cblas_chpmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_chpmv = dlsym(handle, 'cblas_chpmv') + + global __cblas_cgeru + __cblas_cgeru = dlsym(RTLD_DEFAULT, 'cblas_cgeru') + if __cblas_cgeru == NULL: + if handle == NULL: + handle = load_library() + __cblas_cgeru = dlsym(handle, 'cblas_cgeru') + + global __cblas_cgerc + __cblas_cgerc = dlsym(RTLD_DEFAULT, 'cblas_cgerc') + if __cblas_cgerc == NULL: + if handle == NULL: + handle = load_library() + __cblas_cgerc = dlsym(handle, 'cblas_cgerc') + + global __cblas_cher + __cblas_cher = dlsym(RTLD_DEFAULT, 'cblas_cher') + if __cblas_cher == NULL: + if handle == NULL: + handle = load_library() + __cblas_cher = dlsym(handle, 'cblas_cher') + + global __cblas_chpr + __cblas_chpr = dlsym(RTLD_DEFAULT, 'cblas_chpr') + if __cblas_chpr == NULL: + if handle == NULL: + 
handle = load_library() + __cblas_chpr = dlsym(handle, 'cblas_chpr') + + global __cblas_cher2 + __cblas_cher2 = dlsym(RTLD_DEFAULT, 'cblas_cher2') + if __cblas_cher2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_cher2 = dlsym(handle, 'cblas_cher2') + + global __cblas_chpr2 + __cblas_chpr2 = dlsym(RTLD_DEFAULT, 'cblas_chpr2') + if __cblas_chpr2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_chpr2 = dlsym(handle, 'cblas_chpr2') + + global __cblas_zhemv + __cblas_zhemv = dlsym(RTLD_DEFAULT, 'cblas_zhemv') + if __cblas_zhemv == NULL: + if handle == NULL: + handle = load_library() + __cblas_zhemv = dlsym(handle, 'cblas_zhemv') + + global __cblas_zhbmv + __cblas_zhbmv = dlsym(RTLD_DEFAULT, 'cblas_zhbmv') + if __cblas_zhbmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_zhbmv = dlsym(handle, 'cblas_zhbmv') + + global __cblas_zhpmv + __cblas_zhpmv = dlsym(RTLD_DEFAULT, 'cblas_zhpmv') + if __cblas_zhpmv == NULL: + if handle == NULL: + handle = load_library() + __cblas_zhpmv = dlsym(handle, 'cblas_zhpmv') + + global __cblas_zgeru + __cblas_zgeru = dlsym(RTLD_DEFAULT, 'cblas_zgeru') + if __cblas_zgeru == NULL: + if handle == NULL: + handle = load_library() + __cblas_zgeru = dlsym(handle, 'cblas_zgeru') + + global __cblas_zgerc + __cblas_zgerc = dlsym(RTLD_DEFAULT, 'cblas_zgerc') + if __cblas_zgerc == NULL: + if handle == NULL: + handle = load_library() + __cblas_zgerc = dlsym(handle, 'cblas_zgerc') + + global __cblas_zher + __cblas_zher = dlsym(RTLD_DEFAULT, 'cblas_zher') + if __cblas_zher == NULL: + if handle == NULL: + handle = load_library() + __cblas_zher = dlsym(handle, 'cblas_zher') + + global __cblas_zhpr + __cblas_zhpr = dlsym(RTLD_DEFAULT, 'cblas_zhpr') + if __cblas_zhpr == NULL: + if handle == NULL: + handle = load_library() + __cblas_zhpr = dlsym(handle, 'cblas_zhpr') + + global __cblas_zher2 + __cblas_zher2 = dlsym(RTLD_DEFAULT, 'cblas_zher2') + if __cblas_zher2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_zher2 = dlsym(handle, 'cblas_zher2') + + global __cblas_zhpr2 + __cblas_zhpr2 = dlsym(RTLD_DEFAULT, 'cblas_zhpr2') + if __cblas_zhpr2 == NULL: + if handle == NULL: + handle = load_library() + __cblas_zhpr2 = dlsym(handle, 'cblas_zhpr2') + + global __cblas_sgemm + __cblas_sgemm = dlsym(RTLD_DEFAULT, 'cblas_sgemm') + if __cblas_sgemm == NULL: + if handle == NULL: + handle = load_library() + __cblas_sgemm = dlsym(handle, 'cblas_sgemm') + + global __cblas_ssymm + __cblas_ssymm = dlsym(RTLD_DEFAULT, 'cblas_ssymm') + if __cblas_ssymm == NULL: + if handle == NULL: + handle = load_library() + __cblas_ssymm = dlsym(handle, 'cblas_ssymm') + + global __cblas_ssyrk + __cblas_ssyrk = dlsym(RTLD_DEFAULT, 'cblas_ssyrk') + if __cblas_ssyrk == NULL: + if handle == NULL: + handle = load_library() + __cblas_ssyrk = dlsym(handle, 'cblas_ssyrk') + + global __cblas_ssyr2k + __cblas_ssyr2k = dlsym(RTLD_DEFAULT, 'cblas_ssyr2k') + if __cblas_ssyr2k == NULL: + if handle == NULL: + handle = load_library() + __cblas_ssyr2k = dlsym(handle, 'cblas_ssyr2k') + + global __cblas_strmm + __cblas_strmm = dlsym(RTLD_DEFAULT, 'cblas_strmm') + if __cblas_strmm == NULL: + if handle == NULL: + handle = load_library() + __cblas_strmm = dlsym(handle, 'cblas_strmm') + + global __cblas_strsm + __cblas_strsm = dlsym(RTLD_DEFAULT, 'cblas_strsm') + if __cblas_strsm == NULL: + if handle == NULL: + handle = load_library() + __cblas_strsm = dlsym(handle, 'cblas_strsm') + + global __cblas_dgemm + __cblas_dgemm = dlsym(RTLD_DEFAULT, 'cblas_dgemm') + if 
__cblas_dgemm == NULL: + if handle == NULL: + handle = load_library() + __cblas_dgemm = dlsym(handle, 'cblas_dgemm') + + global __cblas_dsymm + __cblas_dsymm = dlsym(RTLD_DEFAULT, 'cblas_dsymm') + if __cblas_dsymm == NULL: + if handle == NULL: + handle = load_library() + __cblas_dsymm = dlsym(handle, 'cblas_dsymm') + + global __cblas_dsyrk + __cblas_dsyrk = dlsym(RTLD_DEFAULT, 'cblas_dsyrk') + if __cblas_dsyrk == NULL: + if handle == NULL: + handle = load_library() + __cblas_dsyrk = dlsym(handle, 'cblas_dsyrk') + + global __cblas_dsyr2k + __cblas_dsyr2k = dlsym(RTLD_DEFAULT, 'cblas_dsyr2k') + if __cblas_dsyr2k == NULL: + if handle == NULL: + handle = load_library() + __cblas_dsyr2k = dlsym(handle, 'cblas_dsyr2k') + + global __cblas_dtrmm + __cblas_dtrmm = dlsym(RTLD_DEFAULT, 'cblas_dtrmm') + if __cblas_dtrmm == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtrmm = dlsym(handle, 'cblas_dtrmm') + + global __cblas_dtrsm + __cblas_dtrsm = dlsym(RTLD_DEFAULT, 'cblas_dtrsm') + if __cblas_dtrsm == NULL: + if handle == NULL: + handle = load_library() + __cblas_dtrsm = dlsym(handle, 'cblas_dtrsm') + + global __cblas_cgemm + __cblas_cgemm = dlsym(RTLD_DEFAULT, 'cblas_cgemm') + if __cblas_cgemm == NULL: + if handle == NULL: + handle = load_library() + __cblas_cgemm = dlsym(handle, 'cblas_cgemm') + + global __cblas_csymm + __cblas_csymm = dlsym(RTLD_DEFAULT, 'cblas_csymm') + if __cblas_csymm == NULL: + if handle == NULL: + handle = load_library() + __cblas_csymm = dlsym(handle, 'cblas_csymm') + + global __cblas_csyrk + __cblas_csyrk = dlsym(RTLD_DEFAULT, 'cblas_csyrk') + if __cblas_csyrk == NULL: + if handle == NULL: + handle = load_library() + __cblas_csyrk = dlsym(handle, 'cblas_csyrk') + + global __cblas_csyr2k + __cblas_csyr2k = dlsym(RTLD_DEFAULT, 'cblas_csyr2k') + if __cblas_csyr2k == NULL: + if handle == NULL: + handle = load_library() + __cblas_csyr2k = dlsym(handle, 'cblas_csyr2k') + + global __cblas_ctrmm + __cblas_ctrmm = dlsym(RTLD_DEFAULT, 'cblas_ctrmm') + if __cblas_ctrmm == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctrmm = dlsym(handle, 'cblas_ctrmm') + + global __cblas_ctrsm + __cblas_ctrsm = dlsym(RTLD_DEFAULT, 'cblas_ctrsm') + if __cblas_ctrsm == NULL: + if handle == NULL: + handle = load_library() + __cblas_ctrsm = dlsym(handle, 'cblas_ctrsm') + + global __cblas_zgemm + __cblas_zgemm = dlsym(RTLD_DEFAULT, 'cblas_zgemm') + if __cblas_zgemm == NULL: + if handle == NULL: + handle = load_library() + __cblas_zgemm = dlsym(handle, 'cblas_zgemm') + + global __cblas_zsymm + __cblas_zsymm = dlsym(RTLD_DEFAULT, 'cblas_zsymm') + if __cblas_zsymm == NULL: + if handle == NULL: + handle = load_library() + __cblas_zsymm = dlsym(handle, 'cblas_zsymm') + + global __cblas_zsyrk + __cblas_zsyrk = dlsym(RTLD_DEFAULT, 'cblas_zsyrk') + if __cblas_zsyrk == NULL: + if handle == NULL: + handle = load_library() + __cblas_zsyrk = dlsym(handle, 'cblas_zsyrk') + + global __cblas_zsyr2k + __cblas_zsyr2k = dlsym(RTLD_DEFAULT, 'cblas_zsyr2k') + if __cblas_zsyr2k == NULL: + if handle == NULL: + handle = load_library() + __cblas_zsyr2k = dlsym(handle, 'cblas_zsyr2k') + + global __cblas_ztrmm + __cblas_ztrmm = dlsym(RTLD_DEFAULT, 'cblas_ztrmm') + if __cblas_ztrmm == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztrmm = dlsym(handle, 'cblas_ztrmm') + + global __cblas_ztrsm + __cblas_ztrsm = dlsym(RTLD_DEFAULT, 'cblas_ztrsm') + if __cblas_ztrsm == NULL: + if handle == NULL: + handle = load_library() + __cblas_ztrsm = dlsym(handle, 'cblas_ztrsm') + + global 
__cblas_chemm + __cblas_chemm = dlsym(RTLD_DEFAULT, 'cblas_chemm') + if __cblas_chemm == NULL: + if handle == NULL: + handle = load_library() + __cblas_chemm = dlsym(handle, 'cblas_chemm') + + global __cblas_cherk + __cblas_cherk = dlsym(RTLD_DEFAULT, 'cblas_cherk') + if __cblas_cherk == NULL: + if handle == NULL: + handle = load_library() + __cblas_cherk = dlsym(handle, 'cblas_cherk') + + global __cblas_cher2k + __cblas_cher2k = dlsym(RTLD_DEFAULT, 'cblas_cher2k') + if __cblas_cher2k == NULL: + if handle == NULL: + handle = load_library() + __cblas_cher2k = dlsym(handle, 'cblas_cher2k') + + global __cblas_zhemm + __cblas_zhemm = dlsym(RTLD_DEFAULT, 'cblas_zhemm') + if __cblas_zhemm == NULL: + if handle == NULL: + handle = load_library() + __cblas_zhemm = dlsym(handle, 'cblas_zhemm') + + global __cblas_zherk + __cblas_zherk = dlsym(RTLD_DEFAULT, 'cblas_zherk') + if __cblas_zherk == NULL: + if handle == NULL: + handle = load_library() + __cblas_zherk = dlsym(handle, 'cblas_zherk') + + global __cblas_zher2k + __cblas_zher2k = dlsym(RTLD_DEFAULT, 'cblas_zher2k') + if __cblas_zher2k == NULL: + if handle == NULL: + handle = load_library() + __cblas_zher2k = dlsym(handle, 'cblas_zher2k') + + global __cblas_sgemm_batch + __cblas_sgemm_batch = dlsym(RTLD_DEFAULT, 'cblas_sgemm_batch') + if __cblas_sgemm_batch == NULL: + if handle == NULL: + handle = load_library() + __cblas_sgemm_batch = dlsym(handle, 'cblas_sgemm_batch') + + global __cblas_dgemm_batch + __cblas_dgemm_batch = dlsym(RTLD_DEFAULT, 'cblas_dgemm_batch') + if __cblas_dgemm_batch == NULL: + if handle == NULL: + handle = load_library() + __cblas_dgemm_batch = dlsym(handle, 'cblas_dgemm_batch') + + global __cblas_cgemm_batch + __cblas_cgemm_batch = dlsym(RTLD_DEFAULT, 'cblas_cgemm_batch') + if __cblas_cgemm_batch == NULL: + if handle == NULL: + handle = load_library() + __cblas_cgemm_batch = dlsym(handle, 'cblas_cgemm_batch') + + global __cblas_zgemm_batch + __cblas_zgemm_batch = dlsym(RTLD_DEFAULT, 'cblas_zgemm_batch') + if __cblas_zgemm_batch == NULL: + if handle == NULL: + handle = load_library() + __cblas_zgemm_batch = dlsym(handle, 'cblas_zgemm_batch') + + global __cblas_sgemm_batch_strided + __cblas_sgemm_batch_strided = dlsym(RTLD_DEFAULT, 'cblas_sgemm_batch_strided') + if __cblas_sgemm_batch_strided == NULL: + if handle == NULL: + handle = load_library() + __cblas_sgemm_batch_strided = dlsym(handle, 'cblas_sgemm_batch_strided') + + global __cblas_dgemm_batch_strided + __cblas_dgemm_batch_strided = dlsym(RTLD_DEFAULT, 'cblas_dgemm_batch_strided') + if __cblas_dgemm_batch_strided == NULL: + if handle == NULL: + handle = load_library() + __cblas_dgemm_batch_strided = dlsym(handle, 'cblas_dgemm_batch_strided') + + global __cblas_cgemm_batch_strided + __cblas_cgemm_batch_strided = dlsym(RTLD_DEFAULT, 'cblas_cgemm_batch_strided') + if __cblas_cgemm_batch_strided == NULL: + if handle == NULL: + handle = load_library() + __cblas_cgemm_batch_strided = dlsym(handle, 'cblas_cgemm_batch_strided') + + global __cblas_zgemm_batch_strided + __cblas_zgemm_batch_strided = dlsym(RTLD_DEFAULT, 'cblas_zgemm_batch_strided') + if __cblas_zgemm_batch_strided == NULL: + if handle == NULL: + handle = load_library() + __cblas_zgemm_batch_strided = dlsym(handle, 'cblas_zgemm_batch_strided') + + __py_nvpl_blas_init = True + return 0 + + +cdef dict func_ptrs = None + + +cpdef void _set_lib_so_names(tuple lib_so_names): + global __lib_so_names + __lib_so_names = lib_so_names + + +cpdef tuple _get_lib_so_names(): + global __lib_so_names + return __lib_so_names + 
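The cpdef helpers above (`_set_lib_so_names`, `_get_lib_so_names`, `_get_current_lib_so_name`) let callers steer which shared library `load_library()` will try to `dlopen` and inspect which candidate actually won. A minimal usage sketch follows; the import path `nvmath.bindings._internal.nvpl_blas` and the soname used here are illustrative assumptions, not something this patch establishes.

```python
# Hypothetical sketch: steering the lazy CPU-BLAS loader from Python.
# The module path and soname below are assumptions, not part of this diff.
from nvmath.bindings._internal import nvpl_blas as cpu_blas

# Restrict the dlopen search order to a single candidate before first use;
# the NVMATH_BLAS_CPU_LIBRARY environment variable is the other way to
# point the loader at a specific library on LD_LIBRARY_PATH.
cpu_blas._set_lib_so_names(("libnvpl_blas_lp64_gomp.so.0",))
print(cpu_blas._get_lib_so_names())

# Loading happens lazily on the first wrapped BLAS call; after that,
# this reports the soname that was successfully dlopen'ed.
print(cpu_blas._get_current_lib_so_name())
```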
+ +cpdef str _get_current_lib_so_name(): + global __current_so_name + return __current_so_name + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_nvpl_blas() + cdef dict data = {} + + global __MKL_Set_Num_Threads_Local + data["__MKL_Set_Num_Threads_Local"] = __MKL_Set_Num_Threads_Local + + global __MKL_Set_Num_Threads + data["__MKL_Set_Num_Threads"] = __MKL_Set_Num_Threads + + global __openblas_set_num_threads + data["__openblas_set_num_threads"] = __openblas_set_num_threads + + global __openblas_set_num_threads_local + data["__openblas_set_num_threads_local"] = __openblas_set_num_threads_local + + global __nvpl_blas_get_version + data["__nvpl_blas_get_version"] = __nvpl_blas_get_version + + global __nvpl_blas_get_max_threads + data["__nvpl_blas_get_max_threads"] = __nvpl_blas_get_max_threads + + global __nvpl_blas_set_num_threads + data["__nvpl_blas_set_num_threads"] = __nvpl_blas_set_num_threads + + global __nvpl_blas_set_num_threads_local + data["__nvpl_blas_set_num_threads_local"] = __nvpl_blas_set_num_threads_local + + global __cblas_sgemv + data["__cblas_sgemv"] = __cblas_sgemv + + global __cblas_sgbmv + data["__cblas_sgbmv"] = __cblas_sgbmv + + global __cblas_strmv + data["__cblas_strmv"] = __cblas_strmv + + global __cblas_stbmv + data["__cblas_stbmv"] = __cblas_stbmv + + global __cblas_stpmv + data["__cblas_stpmv"] = __cblas_stpmv + + global __cblas_strsv + data["__cblas_strsv"] = __cblas_strsv + + global __cblas_stbsv + data["__cblas_stbsv"] = __cblas_stbsv + + global __cblas_stpsv + data["__cblas_stpsv"] = __cblas_stpsv + + global __cblas_dgemv + data["__cblas_dgemv"] = __cblas_dgemv + + global __cblas_dgbmv + data["__cblas_dgbmv"] = __cblas_dgbmv + + global __cblas_dtrmv + data["__cblas_dtrmv"] = __cblas_dtrmv + + global __cblas_dtbmv + data["__cblas_dtbmv"] = __cblas_dtbmv + + global __cblas_dtpmv + data["__cblas_dtpmv"] = __cblas_dtpmv + + global __cblas_dtrsv + data["__cblas_dtrsv"] = __cblas_dtrsv + + global __cblas_dtbsv + data["__cblas_dtbsv"] = __cblas_dtbsv + + global __cblas_dtpsv + data["__cblas_dtpsv"] = __cblas_dtpsv + + global __cblas_cgemv + data["__cblas_cgemv"] = __cblas_cgemv + + global __cblas_cgbmv + data["__cblas_cgbmv"] = __cblas_cgbmv + + global __cblas_ctrmv + data["__cblas_ctrmv"] = __cblas_ctrmv + + global __cblas_ctbmv + data["__cblas_ctbmv"] = __cblas_ctbmv + + global __cblas_ctpmv + data["__cblas_ctpmv"] = __cblas_ctpmv + + global __cblas_ctrsv + data["__cblas_ctrsv"] = __cblas_ctrsv + + global __cblas_ctbsv + data["__cblas_ctbsv"] = __cblas_ctbsv + + global __cblas_ctpsv + data["__cblas_ctpsv"] = __cblas_ctpsv + + global __cblas_zgemv + data["__cblas_zgemv"] = __cblas_zgemv + + global __cblas_zgbmv + data["__cblas_zgbmv"] = __cblas_zgbmv + + global __cblas_ztrmv + data["__cblas_ztrmv"] = __cblas_ztrmv + + global __cblas_ztbmv + data["__cblas_ztbmv"] = __cblas_ztbmv + + global __cblas_ztpmv + data["__cblas_ztpmv"] = __cblas_ztpmv + + global __cblas_ztrsv + data["__cblas_ztrsv"] = __cblas_ztrsv + + global __cblas_ztbsv + data["__cblas_ztbsv"] = __cblas_ztbsv + + global __cblas_ztpsv + data["__cblas_ztpsv"] = __cblas_ztpsv + + global __cblas_ssymv + data["__cblas_ssymv"] = __cblas_ssymv + + global __cblas_ssbmv + data["__cblas_ssbmv"] = __cblas_ssbmv + + global __cblas_sspmv + data["__cblas_sspmv"] = __cblas_sspmv + + global __cblas_sger + data["__cblas_sger"] = __cblas_sger + + global __cblas_ssyr + data["__cblas_ssyr"] = __cblas_ssyr + + global __cblas_sspr + 
data["__cblas_sspr"] = __cblas_sspr + + global __cblas_ssyr2 + data["__cblas_ssyr2"] = __cblas_ssyr2 + + global __cblas_sspr2 + data["__cblas_sspr2"] = __cblas_sspr2 + + global __cblas_dsymv + data["__cblas_dsymv"] = __cblas_dsymv + + global __cblas_dsbmv + data["__cblas_dsbmv"] = __cblas_dsbmv + + global __cblas_dspmv + data["__cblas_dspmv"] = __cblas_dspmv + + global __cblas_dger + data["__cblas_dger"] = __cblas_dger + + global __cblas_dsyr + data["__cblas_dsyr"] = __cblas_dsyr + + global __cblas_dspr + data["__cblas_dspr"] = __cblas_dspr + + global __cblas_dsyr2 + data["__cblas_dsyr2"] = __cblas_dsyr2 + + global __cblas_dspr2 + data["__cblas_dspr2"] = __cblas_dspr2 + + global __cblas_chemv + data["__cblas_chemv"] = __cblas_chemv + + global __cblas_chbmv + data["__cblas_chbmv"] = __cblas_chbmv + + global __cblas_chpmv + data["__cblas_chpmv"] = __cblas_chpmv + + global __cblas_cgeru + data["__cblas_cgeru"] = __cblas_cgeru + + global __cblas_cgerc + data["__cblas_cgerc"] = __cblas_cgerc + + global __cblas_cher + data["__cblas_cher"] = __cblas_cher + + global __cblas_chpr + data["__cblas_chpr"] = __cblas_chpr + + global __cblas_cher2 + data["__cblas_cher2"] = __cblas_cher2 + + global __cblas_chpr2 + data["__cblas_chpr2"] = __cblas_chpr2 + + global __cblas_zhemv + data["__cblas_zhemv"] = __cblas_zhemv + + global __cblas_zhbmv + data["__cblas_zhbmv"] = __cblas_zhbmv + + global __cblas_zhpmv + data["__cblas_zhpmv"] = __cblas_zhpmv + + global __cblas_zgeru + data["__cblas_zgeru"] = __cblas_zgeru + + global __cblas_zgerc + data["__cblas_zgerc"] = __cblas_zgerc + + global __cblas_zher + data["__cblas_zher"] = __cblas_zher + + global __cblas_zhpr + data["__cblas_zhpr"] = __cblas_zhpr + + global __cblas_zher2 + data["__cblas_zher2"] = __cblas_zher2 + + global __cblas_zhpr2 + data["__cblas_zhpr2"] = __cblas_zhpr2 + + global __cblas_sgemm + data["__cblas_sgemm"] = __cblas_sgemm + + global __cblas_ssymm + data["__cblas_ssymm"] = __cblas_ssymm + + global __cblas_ssyrk + data["__cblas_ssyrk"] = __cblas_ssyrk + + global __cblas_ssyr2k + data["__cblas_ssyr2k"] = __cblas_ssyr2k + + global __cblas_strmm + data["__cblas_strmm"] = __cblas_strmm + + global __cblas_strsm + data["__cblas_strsm"] = __cblas_strsm + + global __cblas_dgemm + data["__cblas_dgemm"] = __cblas_dgemm + + global __cblas_dsymm + data["__cblas_dsymm"] = __cblas_dsymm + + global __cblas_dsyrk + data["__cblas_dsyrk"] = __cblas_dsyrk + + global __cblas_dsyr2k + data["__cblas_dsyr2k"] = __cblas_dsyr2k + + global __cblas_dtrmm + data["__cblas_dtrmm"] = __cblas_dtrmm + + global __cblas_dtrsm + data["__cblas_dtrsm"] = __cblas_dtrsm + + global __cblas_cgemm + data["__cblas_cgemm"] = __cblas_cgemm + + global __cblas_csymm + data["__cblas_csymm"] = __cblas_csymm + + global __cblas_csyrk + data["__cblas_csyrk"] = __cblas_csyrk + + global __cblas_csyr2k + data["__cblas_csyr2k"] = __cblas_csyr2k + + global __cblas_ctrmm + data["__cblas_ctrmm"] = __cblas_ctrmm + + global __cblas_ctrsm + data["__cblas_ctrsm"] = __cblas_ctrsm + + global __cblas_zgemm + data["__cblas_zgemm"] = __cblas_zgemm + + global __cblas_zsymm + data["__cblas_zsymm"] = __cblas_zsymm + + global __cblas_zsyrk + data["__cblas_zsyrk"] = __cblas_zsyrk + + global __cblas_zsyr2k + data["__cblas_zsyr2k"] = __cblas_zsyr2k + + global __cblas_ztrmm + data["__cblas_ztrmm"] = __cblas_ztrmm + + global __cblas_ztrsm + data["__cblas_ztrsm"] = __cblas_ztrsm + + global __cblas_chemm + data["__cblas_chemm"] = __cblas_chemm + + global __cblas_cherk + data["__cblas_cherk"] = __cblas_cherk + + global 
__cblas_cher2k + data["__cblas_cher2k"] = __cblas_cher2k + + global __cblas_zhemm + data["__cblas_zhemm"] = __cblas_zhemm + + global __cblas_zherk + data["__cblas_zherk"] = __cblas_zherk + + global __cblas_zher2k + data["__cblas_zher2k"] = __cblas_zher2k + + global __cblas_sgemm_batch + data["__cblas_sgemm_batch"] = __cblas_sgemm_batch + + global __cblas_dgemm_batch + data["__cblas_dgemm_batch"] = __cblas_dgemm_batch + + global __cblas_cgemm_batch + data["__cblas_cgemm_batch"] = __cblas_cgemm_batch + + global __cblas_zgemm_batch + data["__cblas_zgemm_batch"] = __cblas_zgemm_batch + + global __cblas_sgemm_batch_strided + data["__cblas_sgemm_batch_strided"] = __cblas_sgemm_batch_strided + + global __cblas_dgemm_batch_strided + data["__cblas_dgemm_batch_strided"] = __cblas_dgemm_batch_strided + + global __cblas_cgemm_batch_strided + data["__cblas_cgemm_batch_strided"] = __cblas_cgemm_batch_strided + + global __cblas_zgemm_batch_strided + data["__cblas_zgemm_batch_strided"] = __cblas_zgemm_batch_strided + + func_ptrs = data + return data + + +cpdef _inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef int _MKL_mkl_set_num_threads_local(int nth) except?-42 nogil: + global __MKL_Set_Num_Threads_Local + _check_or_init_nvpl_blas() + if __MKL_Set_Num_Threads_Local == NULL: + with gil: + raise FunctionNotFoundError("function MKL_Set_Num_Threads_Local is not found") + return (__MKL_Set_Num_Threads_Local)( + nth) + + +@cython.show_performance_hints(False) +cdef void _MKL_mkl_set_num_threads(int nth) except* nogil: + global __MKL_Set_Num_Threads + _check_or_init_nvpl_blas() + if __MKL_Set_Num_Threads == NULL: + with gil: + raise FunctionNotFoundError("function MKL_Set_Num_Threads is not found") + (__MKL_Set_Num_Threads)( + nth) + + +@cython.show_performance_hints(False) +cdef void _openblas_openblas_set_num_threads(int num_threads) except* nogil: + global __openblas_set_num_threads + _check_or_init_nvpl_blas() + if __openblas_set_num_threads == NULL: + with gil: + raise FunctionNotFoundError("function openblas_set_num_threads is not found") + (__openblas_set_num_threads)( + num_threads) + + +cdef int _openblas_openblas_set_num_threads_local(int num_threads) except?-42 nogil: + global __openblas_set_num_threads_local + _check_or_init_nvpl_blas() + if __openblas_set_num_threads_local == NULL: + with gil: + raise FunctionNotFoundError("function openblas_set_num_threads_local is not found") + return (__openblas_set_num_threads_local)( + num_threads) + + +cdef int _nvpl_blas_get_version() except?-42 nogil: + global __nvpl_blas_get_version + _check_or_init_nvpl_blas() + if __nvpl_blas_get_version == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_get_version is not found") + return (__nvpl_blas_get_version)( + ) + + +cdef int _nvpl_blas_get_max_threads() except?-42 nogil: + global __nvpl_blas_get_max_threads + _check_or_init_nvpl_blas() + if __nvpl_blas_get_max_threads == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_get_max_threads is not found") + return (__nvpl_blas_get_max_threads)( + ) + + +@cython.show_performance_hints(False) +cdef void _nvpl_blas_set_num_threads(int nthr) except* nogil: + global __nvpl_blas_set_num_threads + _check_or_init_nvpl_blas() + if 
__nvpl_blas_set_num_threads == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_set_num_threads is not found") + (__nvpl_blas_set_num_threads)( + nthr) + + +cdef int _nvpl_blas_set_num_threads_local(int nthr_local) except?-42 nogil: + global __nvpl_blas_set_num_threads_local + _check_or_init_nvpl_blas() + if __nvpl_blas_set_num_threads_local == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_set_num_threads_local is not found") + return (__nvpl_blas_set_num_threads_local)( + nthr_local) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_sgemv + _check_or_init_nvpl_blas() + if __cblas_sgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemv is not found") + (__cblas_sgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_sgbmv + _check_or_init_nvpl_blas() + if __cblas_sgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgbmv is not found") + (__cblas_sgbmv)( + order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_strmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_strmv + _check_or_init_nvpl_blas() + if __cblas_strmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_strmv is not found") + (__cblas_strmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stbmv + _check_or_init_nvpl_blas() + if __cblas_stbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stbmv is not found") + (__cblas_stbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stpmv + _check_or_init_nvpl_blas() + if __cblas_stpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stpmv is not found") + (__cblas_stpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_strsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_strsv + _check_or_init_nvpl_blas() + if __cblas_strsv 
== NULL: + with gil: + raise FunctionNotFoundError("function cblas_strsv is not found") + (__cblas_strsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stbsv + _check_or_init_nvpl_blas() + if __cblas_stbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stbsv is not found") + (__cblas_stbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stpsv + _check_or_init_nvpl_blas() + if __cblas_stpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stpsv is not found") + (__cblas_stpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dgemv + _check_or_init_nvpl_blas() + if __cblas_dgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemv is not found") + (__cblas_dgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dgbmv + _check_or_init_nvpl_blas() + if __cblas_dgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgbmv is not found") + (__cblas_dgbmv)( + order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtrmv + _check_or_init_nvpl_blas() + if __cblas_dtrmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrmv is not found") + (__cblas_dtrmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtbmv + _check_or_init_nvpl_blas() + if __cblas_dtbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtbmv is not found") + (__cblas_dtbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const 
nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtpmv + _check_or_init_nvpl_blas() + if __cblas_dtpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtpmv is not found") + (__cblas_dtpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtrsv + _check_or_init_nvpl_blas() + if __cblas_dtrsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrsv is not found") + (__cblas_dtrsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtbsv + _check_or_init_nvpl_blas() + if __cblas_dtbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtbsv is not found") + (__cblas_dtbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtpsv + _check_or_init_nvpl_blas() + if __cblas_dtpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtpsv is not found") + (__cblas_dtpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_cgemv + _check_or_init_nvpl_blas() + if __cblas_cgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgemv is not found") + (__cblas_cgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_cgbmv + _check_or_init_nvpl_blas() + if __cblas_cgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgbmv is not found") + (__cblas_cgbmv)( + order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctrmv + _check_or_init_nvpl_blas() + if __cblas_ctrmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrmv is not found") + (__cblas_ctrmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctbmv(const 
CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctbmv + _check_or_init_nvpl_blas() + if __cblas_ctbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctbmv is not found") + (__cblas_ctbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctpmv + _check_or_init_nvpl_blas() + if __cblas_ctpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctpmv is not found") + (__cblas_ctpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctrsv + _check_or_init_nvpl_blas() + if __cblas_ctrsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrsv is not found") + (__cblas_ctrsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctbsv + _check_or_init_nvpl_blas() + if __cblas_ctbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctbsv is not found") + (__cblas_ctbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctpsv + _check_or_init_nvpl_blas() + if __cblas_ctpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctpsv is not found") + (__cblas_ctpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zgemv + _check_or_init_nvpl_blas() + if __cblas_zgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgemv is not found") + (__cblas_zgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zgbmv + _check_or_init_nvpl_blas() + if __cblas_zgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgbmv is not found") + (__cblas_zgbmv)( + order, TransA, M, N, KL, KU, 
alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztrmv + _check_or_init_nvpl_blas() + if __cblas_ztrmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrmv is not found") + (__cblas_ztrmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztbmv + _check_or_init_nvpl_blas() + if __cblas_ztbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztbmv is not found") + (__cblas_ztbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztpmv + _check_or_init_nvpl_blas() + if __cblas_ztpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztpmv is not found") + (__cblas_ztpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztrsv + _check_or_init_nvpl_blas() + if __cblas_ztrsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrsv is not found") + (__cblas_ztrsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztbsv + _check_or_init_nvpl_blas() + if __cblas_ztbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztbsv is not found") + (__cblas_ztbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztpsv + _check_or_init_nvpl_blas() + if __cblas_ztpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztpsv is not found") + (__cblas_ztpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_ssymv + _check_or_init_nvpl_blas() + if __cblas_ssymv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssymv is not found") + (__cblas_ssymv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, 
incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_ssbmv + _check_or_init_nvpl_blas() + if __cblas_ssbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssbmv is not found") + (__cblas_ssbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_sspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* Ap, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_sspmv + _check_or_init_nvpl_blas() + if __cblas_sspmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sspmv is not found") + (__cblas_sspmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_sger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil: + global __cblas_sger + _check_or_init_nvpl_blas() + if __cblas_sger == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sger is not found") + (__cblas_sger)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* A, const nvpl_int_t lda) except* nogil: + global __cblas_ssyr + _check_or_init_nvpl_blas() + if __cblas_ssyr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyr is not found") + (__cblas_ssyr)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_sspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* Ap) except* nogil: + global __cblas_sspr + _check_or_init_nvpl_blas() + if __cblas_sspr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sspr is not found") + (__cblas_sspr)( + order, Uplo, N, alpha, X, incX, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil: + global __cblas_ssyr2 + _check_or_init_nvpl_blas() + if __cblas_ssyr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyr2 is not found") + (__cblas_ssyr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_sspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A) except* nogil: + global __cblas_sspr2 + _check_or_init_nvpl_blas() + if __cblas_sspr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sspr2 is not found") + (__cblas_sspr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, 
const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dsymv + _check_or_init_nvpl_blas() + if __cblas_dsymv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsymv is not found") + (__cblas_dsymv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dsbmv + _check_or_init_nvpl_blas() + if __cblas_dsbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsbmv is not found") + (__cblas_dsbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* Ap, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dspmv + _check_or_init_nvpl_blas() + if __cblas_dspmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dspmv is not found") + (__cblas_dspmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil: + global __cblas_dger + _check_or_init_nvpl_blas() + if __cblas_dger == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dger is not found") + (__cblas_dger)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* A, const nvpl_int_t lda) except* nogil: + global __cblas_dsyr + _check_or_init_nvpl_blas() + if __cblas_dsyr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyr is not found") + (__cblas_dsyr)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_dspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* Ap) except* nogil: + global __cblas_dspr + _check_or_init_nvpl_blas() + if __cblas_dspr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dspr is not found") + (__cblas_dspr)( + order, Uplo, N, alpha, X, incX, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil: + global __cblas_dsyr2 + _check_or_init_nvpl_blas() + if __cblas_dsyr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyr2 is not found") + (__cblas_dsyr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_dspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, 
const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A) except* nogil: + global __cblas_dspr2 + _check_or_init_nvpl_blas() + if __cblas_dspr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dspr2 is not found") + (__cblas_dspr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_chemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_chemv + _check_or_init_nvpl_blas() + if __cblas_chemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chemv is not found") + (__cblas_chemv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_chbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_chbmv + _check_or_init_nvpl_blas() + if __cblas_chbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chbmv is not found") + (__cblas_chbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_chpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_chpmv + _check_or_init_nvpl_blas() + if __cblas_chpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chpmv is not found") + (__cblas_chpmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_cgeru + _check_or_init_nvpl_blas() + if __cblas_cgeru == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgeru is not found") + (__cblas_cgeru)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_cgerc + _check_or_init_nvpl_blas() + if __cblas_cgerc == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgerc is not found") + (__cblas_cgerc)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_cher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_cher + _check_or_init_nvpl_blas() + if __cblas_cher == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cher is not found") + (__cblas_cher)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_chpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t 
incX, void* A) except* nogil: + global __cblas_chpr + _check_or_init_nvpl_blas() + if __cblas_chpr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chpr is not found") + (__cblas_chpr)( + order, Uplo, N, alpha, X, incX, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_cher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_cher2 + _check_or_init_nvpl_blas() + if __cblas_cher2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cher2 is not found") + (__cblas_cher2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_chpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil: + global __cblas_chpr2 + _check_or_init_nvpl_blas() + if __cblas_chpr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chpr2 is not found") + (__cblas_chpr2)( + order, Uplo, N, alpha, X, incX, Y, incY, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zhemv + _check_or_init_nvpl_blas() + if __cblas_zhemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhemv is not found") + (__cblas_zhemv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zhbmv + _check_or_init_nvpl_blas() + if __cblas_zhbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhbmv is not found") + (__cblas_zhbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zhpmv + _check_or_init_nvpl_blas() + if __cblas_zhpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhpmv is not found") + (__cblas_zhpmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_zgeru + _check_or_init_nvpl_blas() + if __cblas_zgeru == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgeru is not found") + (__cblas_zgeru)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const 
nvpl_int_t lda) except* nogil: + global __cblas_zgerc + _check_or_init_nvpl_blas() + if __cblas_zgerc == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgerc is not found") + (__cblas_zgerc)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_zher + _check_or_init_nvpl_blas() + if __cblas_zher == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zher is not found") + (__cblas_zher)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil: + global __cblas_zhpr + _check_or_init_nvpl_blas() + if __cblas_zhpr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhpr is not found") + (__cblas_zhpr)( + order, Uplo, N, alpha, X, incX, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_zher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_zher2 + _check_or_init_nvpl_blas() + if __cblas_zher2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zher2 is not found") + (__cblas_zher2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil: + global __cblas_zhpr2 + _check_or_init_nvpl_blas() + if __cblas_zhpr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhpr2 is not found") + (__cblas_zhpr2)( + order, Uplo, N, alpha, X, incX, Y, incY, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_sgemm + _check_or_init_nvpl_blas() + if __cblas_sgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemm is not found") + (__cblas_sgemm)( + Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_ssymm + _check_or_init_nvpl_blas() + if __cblas_ssymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssymm is not found") + (__cblas_ssymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const 
nvpl_int_t lda, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_ssyrk + _check_or_init_nvpl_blas() + if __cblas_ssyrk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyrk is not found") + (__cblas_ssyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_ssyr2k + _check_or_init_nvpl_blas() + if __cblas_ssyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyr2k is not found") + (__cblas_ssyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_strmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil: + global __cblas_strmm + _check_or_init_nvpl_blas() + if __cblas_strmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_strmm is not found") + (__cblas_strmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_strsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil: + global __cblas_strsm + _check_or_init_nvpl_blas() + if __cblas_strsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_strsm is not found") + (__cblas_strsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + global __cblas_dgemm + _check_or_init_nvpl_blas() + if __cblas_dgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemm is not found") + (__cblas_dgemm)( + Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + global __cblas_dsymm + _check_or_init_nvpl_blas() + if __cblas_dsymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsymm is not found") + (__cblas_dsymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double beta, double* C, const 
nvpl_int_t ldc) except* nogil: + global __cblas_dsyrk + _check_or_init_nvpl_blas() + if __cblas_dsyrk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyrk is not found") + (__cblas_dsyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + global __cblas_dsyr2k + _check_or_init_nvpl_blas() + if __cblas_dsyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyr2k is not found") + (__cblas_dsyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil: + global __cblas_dtrmm + _check_or_init_nvpl_blas() + if __cblas_dtrmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrmm is not found") + (__cblas_dtrmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil: + global __cblas_dtrsm + _check_or_init_nvpl_blas() + if __cblas_dtrsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrsm is not found") + (__cblas_dtrsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_cgemm + _check_or_init_nvpl_blas() + if __cblas_cgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgemm is not found") + (__cblas_cgemm)( + Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_csymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_csymm + _check_or_init_nvpl_blas() + if __cblas_csymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_csymm is not found") + (__cblas_csymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_csyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_csyrk + 
_check_or_init_nvpl_blas() + if __cblas_csyrk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_csyrk is not found") + (__cblas_csyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_csyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_csyr2k + _check_or_init_nvpl_blas() + if __cblas_csyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_csyr2k is not found") + (__cblas_csyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ctrmm + _check_or_init_nvpl_blas() + if __cblas_ctrmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrmm is not found") + (__cblas_ctrmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ctrsm + _check_or_init_nvpl_blas() + if __cblas_ctrsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrsm is not found") + (__cblas_ctrsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zgemm + _check_or_init_nvpl_blas() + if __cblas_zgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgemm is not found") + (__cblas_zgemm)( + Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zsymm + _check_or_init_nvpl_blas() + if __cblas_zsymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zsymm is not found") + (__cblas_zsymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zsyrk + _check_or_init_nvpl_blas() + if __cblas_zsyrk == NULL: + with gil: + raise 
FunctionNotFoundError("function cblas_zsyrk is not found") + (__cblas_zsyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zsyr2k + _check_or_init_nvpl_blas() + if __cblas_zsyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zsyr2k is not found") + (__cblas_zsyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ztrmm + _check_or_init_nvpl_blas() + if __cblas_ztrmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrmm is not found") + (__cblas_ztrmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ztrsm + _check_or_init_nvpl_blas() + if __cblas_ztrsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrsm is not found") + (__cblas_ztrsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_chemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_chemm + _check_or_init_nvpl_blas() + if __cblas_chemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chemm is not found") + (__cblas_chemm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_cherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const void* A, const nvpl_int_t lda, const float beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_cherk + _check_or_init_nvpl_blas() + if __cblas_cherk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cherk is not found") + (__cblas_cherk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_cher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const float beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_cher2k + _check_or_init_nvpl_blas() + if __cblas_cher2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cher2k is not found") + (__cblas_cher2k)( + Order, Uplo, Trans, N, K, alpha, A, 
lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zhemm + _check_or_init_nvpl_blas() + if __cblas_zhemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhemm is not found") + (__cblas_zhemm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const void* A, const nvpl_int_t lda, const double beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zherk + _check_or_init_nvpl_blas() + if __cblas_zherk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zherk is not found") + (__cblas_zherk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const double beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zher2k + _check_or_init_nvpl_blas() + if __cblas_zher2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zher2k is not found") + (__cblas_zher2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const float* alpha_array, const float** A_array, nvpl_int_t* lda_array, const float** B_array, nvpl_int_t* ldb_array, const float* beta_array, float** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_sgemm_batch + _check_or_init_nvpl_blas() + if __cblas_sgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemm_batch is not found") + (__cblas_sgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const double* alpha_array, const double** A_array, nvpl_int_t* lda_array, const double** B_array, nvpl_int_t* ldb_array, const double* beta_array, double** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_dgemm_batch + _check_or_init_nvpl_blas() + if __cblas_dgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemm_batch is not found") + (__cblas_dgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, 
CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_cgemm_batch + _check_or_init_nvpl_blas() + if __cblas_cgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgemm_batch is not found") + (__cblas_cgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_zgemm_batch + _check_or_init_nvpl_blas() + if __cblas_zgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgemm_batch is not found") + (__cblas_zgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const nvpl_int_t stridea, const float* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const float beta, float* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + global __cblas_sgemm_batch_strided + _check_or_init_nvpl_blas() + if __cblas_sgemm_batch_strided == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemm_batch_strided is not found") + (__cblas_sgemm_batch_strided)( + Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const nvpl_int_t stridea, const double* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const double beta, double* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + global __cblas_dgemm_batch_strided + _check_or_init_nvpl_blas() + if __cblas_dgemm_batch_strided == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemm_batch_strided is not found") + (__cblas_dgemm_batch_strided)( + Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const 
nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil:
+    global __cblas_cgemm_batch_strided
+    _check_or_init_nvpl_blas()
+    if __cblas_cgemm_batch_strided == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cblas_cgemm_batch_strided is not found")
+    (__cblas_cgemm_batch_strided)(
+        Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size)
+
+
+@cython.show_performance_hints(False)
+cdef void _cblas_zgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil:
+    global __cblas_zgemm_batch_strided
+    _check_or_init_nvpl_blas()
+    if __cblas_zgemm_batch_strided == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cblas_zgemm_batch_strided is not found")
+    (__cblas_zgemm_batch_strided)(
+        Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size)
diff --git a/nvmath/bindings/nvpl/_internal/blas_windows.pyx b/nvmath/bindings/nvpl/_internal/blas_windows.pyx
new file mode 100644
index 0000000..b725bdb
--- /dev/null
+++ b/nvmath/bindings/nvpl/_internal/blas_windows.pyx
@@ -0,0 +1,2224 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated with version 0.4.1. Do not modify it directly.
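+#
+# Loader overview: _check_or_init_nvpl_blas() lazily loads a CBLAS-compatible DLL
+# (mkl_rt.2.dll or openblas.dll by default, or the single DLL named by the
+# NVMATH_BLAS_CPU_LIBRARY environment variable) and resolves every cblas_* symbol
+# via GetProcAddress; each cdef wrapper below checks its function pointer and
+# raises FunctionNotFoundError if the loaded DLL does not export that symbol.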
+ +cimport cython +from libc.stdint cimport intptr_t + +import os +import site +import threading +import win32api + +from ..._internal.utils import FunctionNotFoundError, NotSupportedError + +from libc.stddef cimport wchar_t +from libc.stdint cimport uintptr_t +from cpython cimport PyUnicode_AsWideCharString, PyMem_Free + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "windows.h" nogil: + ctypedef void* HMODULE + ctypedef void* HANDLE + ctypedef void* FARPROC + ctypedef unsigned long DWORD + ctypedef const wchar_t *LPCWSTR + ctypedef const char *LPCSTR + + cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 + cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 + + HMODULE _LoadLibraryExW "LoadLibraryExW"( + LPCWSTR lpLibFileName, + HANDLE hFile, + DWORD dwFlags + ) + + FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName) + +cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags): + cdef uintptr_t result + cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL) + with nogil: + result = _LoadLibraryExW( + wpath, + hFile, + dwFlags + ) + PyMem_Free(wpath) + return result + +cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil: + return _GetProcAddress(hModule, lpProcName) + +cdef int get_cuda_version(): + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32) + if handle == 0: + raise NotSupportedError('CUDA driver is not found') + cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion') + if cuDriverGetVersion == NULL: + raise RuntimeError('something went wrong') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError('something went wrong') + + return driver_ver + + +############################################################################### +# Wrapper init +############################################################################### + +cdef object __symbol_lock = threading.Lock() +cdef bint __py_nvpl_blas_init = False +cdef str __current_dll_name = "" +cdef tuple __lib_dll_names = ("mkl_rt.2.dll", "openblas.dll",) + +cdef void* __MKL_Set_Num_Threads_Local = NULL +cdef void* __MKL_Set_Num_Threads = NULL +cdef void* __openblas_set_num_threads = NULL +cdef void* __openblas_set_num_threads_local = NULL +cdef void* __nvpl_blas_get_version = NULL +cdef void* __nvpl_blas_get_max_threads = NULL +cdef void* __nvpl_blas_set_num_threads = NULL +cdef void* __nvpl_blas_set_num_threads_local = NULL +cdef void* __cblas_sgemv = NULL +cdef void* __cblas_sgbmv = NULL +cdef void* __cblas_strmv = NULL +cdef void* __cblas_stbmv = NULL +cdef void* __cblas_stpmv = NULL +cdef void* __cblas_strsv = NULL +cdef void* __cblas_stbsv = NULL +cdef void* __cblas_stpsv = NULL +cdef void* __cblas_dgemv = NULL +cdef void* __cblas_dgbmv = NULL +cdef void* __cblas_dtrmv = NULL +cdef void* __cblas_dtbmv = NULL +cdef void* __cblas_dtpmv = NULL +cdef void* __cblas_dtrsv = NULL +cdef void* __cblas_dtbsv = NULL +cdef void* __cblas_dtpsv = NULL +cdef void* __cblas_cgemv = NULL +cdef void* __cblas_cgbmv = NULL +cdef void* __cblas_ctrmv = NULL +cdef void* __cblas_ctbmv = NULL +cdef void* __cblas_ctpmv = NULL +cdef void* __cblas_ctrsv = NULL +cdef void* __cblas_ctbsv = NULL +cdef void* __cblas_ctpsv = NULL +cdef void* __cblas_zgemv = NULL +cdef void* __cblas_zgbmv = NULL +cdef void* __cblas_ztrmv = NULL +cdef void* 
__cblas_ztbmv = NULL +cdef void* __cblas_ztpmv = NULL +cdef void* __cblas_ztrsv = NULL +cdef void* __cblas_ztbsv = NULL +cdef void* __cblas_ztpsv = NULL +cdef void* __cblas_ssymv = NULL +cdef void* __cblas_ssbmv = NULL +cdef void* __cblas_sspmv = NULL +cdef void* __cblas_sger = NULL +cdef void* __cblas_ssyr = NULL +cdef void* __cblas_sspr = NULL +cdef void* __cblas_ssyr2 = NULL +cdef void* __cblas_sspr2 = NULL +cdef void* __cblas_dsymv = NULL +cdef void* __cblas_dsbmv = NULL +cdef void* __cblas_dspmv = NULL +cdef void* __cblas_dger = NULL +cdef void* __cblas_dsyr = NULL +cdef void* __cblas_dspr = NULL +cdef void* __cblas_dsyr2 = NULL +cdef void* __cblas_dspr2 = NULL +cdef void* __cblas_chemv = NULL +cdef void* __cblas_chbmv = NULL +cdef void* __cblas_chpmv = NULL +cdef void* __cblas_cgeru = NULL +cdef void* __cblas_cgerc = NULL +cdef void* __cblas_cher = NULL +cdef void* __cblas_chpr = NULL +cdef void* __cblas_cher2 = NULL +cdef void* __cblas_chpr2 = NULL +cdef void* __cblas_zhemv = NULL +cdef void* __cblas_zhbmv = NULL +cdef void* __cblas_zhpmv = NULL +cdef void* __cblas_zgeru = NULL +cdef void* __cblas_zgerc = NULL +cdef void* __cblas_zher = NULL +cdef void* __cblas_zhpr = NULL +cdef void* __cblas_zher2 = NULL +cdef void* __cblas_zhpr2 = NULL +cdef void* __cblas_sgemm = NULL +cdef void* __cblas_ssymm = NULL +cdef void* __cblas_ssyrk = NULL +cdef void* __cblas_ssyr2k = NULL +cdef void* __cblas_strmm = NULL +cdef void* __cblas_strsm = NULL +cdef void* __cblas_dgemm = NULL +cdef void* __cblas_dsymm = NULL +cdef void* __cblas_dsyrk = NULL +cdef void* __cblas_dsyr2k = NULL +cdef void* __cblas_dtrmm = NULL +cdef void* __cblas_dtrsm = NULL +cdef void* __cblas_cgemm = NULL +cdef void* __cblas_csymm = NULL +cdef void* __cblas_csyrk = NULL +cdef void* __cblas_csyr2k = NULL +cdef void* __cblas_ctrmm = NULL +cdef void* __cblas_ctrsm = NULL +cdef void* __cblas_zgemm = NULL +cdef void* __cblas_zsymm = NULL +cdef void* __cblas_zsyrk = NULL +cdef void* __cblas_zsyr2k = NULL +cdef void* __cblas_ztrmm = NULL +cdef void* __cblas_ztrsm = NULL +cdef void* __cblas_chemm = NULL +cdef void* __cblas_cherk = NULL +cdef void* __cblas_cher2k = NULL +cdef void* __cblas_zhemm = NULL +cdef void* __cblas_zherk = NULL +cdef void* __cblas_zher2k = NULL +cdef void* __cblas_sgemm_batch = NULL +cdef void* __cblas_dgemm_batch = NULL +cdef void* __cblas_cgemm_batch = NULL +cdef void* __cblas_zgemm_batch = NULL +cdef void* __cblas_sgemm_batch_strided = NULL +cdef void* __cblas_dgemm_batch_strided = NULL +cdef void* __cblas_cgemm_batch_strided = NULL +cdef void* __cblas_zgemm_batch_strided = NULL + + +cdef inline list get_site_packages(): + return [site.getusersitepackages()] + site.getsitepackages() + + +cdef void* load_library() except* with gil: + handle = 0 + cdef str all_err_msg = "" + cdef str env_lib_dll_name = os.getenv("NVMATH_BLAS_CPU_LIBRARY", "") + + if env_lib_dll_name != "": + try: + handle = win32api.GetModuleHandle(env_lib_dll_name) + except Exception as e: + raise RuntimeError( + f"Failed to dlopen NVMATH_BLAS_CPU_LIBRARY={env_lib_dll_name}. " + f"Please check that NVMATH_BLAS_CPU_LIBRARY is the name of a DLL on the PATH. {e}" + ) + global __current_dll_name + __current_dll_name = env_lib_dll_name + assert handle != 0 + return handle + + if len(__lib_dll_names) == 0: + raise RuntimeError("Cannot load a BLAS-compatible library. 
No DLL names were specified.") + for dll_name in __lib_dll_names: + + # First check if the DLL has been loaded by 3rd parties + try: + handle = win32api.GetModuleHandle(dll_name) + except Exception as e: + all_err_msg += f"\n{e}" + pass + else: + break # stop at first successful open + + # Next, check if DLLs are installed via pip + for sp in get_site_packages(): + mod_path = os.path.join(sp, "..", "..", "Library", "bin") + if not os.path.isdir(mod_path): + continue + os.add_dll_directory(mod_path) + try: + handle = win32api.LoadLibraryEx( + # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... + os.path.join(mod_path, dll_name), + 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) + except Exception as e: + all_err_msg += f"\n{e}" + pass + else: + break # stop at first successful open + + # Finally, try default search + try: + handle = win32api.LoadLibrary(dll_name) + except Exception as e: + all_err_msg += f"\n{e}" + pass + else: + break # stop at first successful open + else: + all_libs = ", ".join(__lib_dll_names) + raise RuntimeError( + f"Failed to dlopen all of the following libraries: {all_libs}. " + "Install/add one of these libraries to the PATH or " + f"use environment variable NVMATH_BLAS_CPU_LIBRARY to name a DLL on the PATH. {all_err_msg}" + ) + + global __current_dll_name + __current_dll_name = dll_name + + assert handle != 0 + return handle + + +cdef int _check_or_init_nvpl_blas() except -1 nogil: + global __py_nvpl_blas_init + if __py_nvpl_blas_init: + return 0 + + with gil, __symbol_lock: + # Load library + handle = load_library() + + # Load function + global __MKL_Set_Num_Threads_Local + __MKL_Set_Num_Threads_Local = GetProcAddress(handle, 'MKL_Set_Num_Threads_Local') + + global __MKL_Set_Num_Threads + __MKL_Set_Num_Threads = GetProcAddress(handle, 'MKL_Set_Num_Threads') + + global __openblas_set_num_threads + __openblas_set_num_threads = GetProcAddress(handle, 'openblas_set_num_threads') + + global __openblas_set_num_threads_local + __openblas_set_num_threads_local = GetProcAddress(handle, 'openblas_set_num_threads_local') + + global __nvpl_blas_get_version + __nvpl_blas_get_version = GetProcAddress(handle, 'nvpl_blas_get_version') + + global __nvpl_blas_get_max_threads + __nvpl_blas_get_max_threads = GetProcAddress(handle, 'nvpl_blas_get_max_threads') + + global __nvpl_blas_set_num_threads + __nvpl_blas_set_num_threads = GetProcAddress(handle, 'nvpl_blas_set_num_threads') + + global __nvpl_blas_set_num_threads_local + __nvpl_blas_set_num_threads_local = GetProcAddress(handle, 'nvpl_blas_set_num_threads_local') + + global __cblas_sgemv + __cblas_sgemv = GetProcAddress(handle, 'cblas_sgemv') + + global __cblas_sgbmv + __cblas_sgbmv = GetProcAddress(handle, 'cblas_sgbmv') + + global __cblas_strmv + __cblas_strmv = GetProcAddress(handle, 'cblas_strmv') + + global __cblas_stbmv + __cblas_stbmv = GetProcAddress(handle, 'cblas_stbmv') + + global __cblas_stpmv + __cblas_stpmv = GetProcAddress(handle, 'cblas_stpmv') + + global __cblas_strsv + __cblas_strsv = GetProcAddress(handle, 'cblas_strsv') + + global __cblas_stbsv + __cblas_stbsv = GetProcAddress(handle, 'cblas_stbsv') + + global __cblas_stpsv + __cblas_stpsv = GetProcAddress(handle, 'cblas_stpsv') + + global __cblas_dgemv + __cblas_dgemv = GetProcAddress(handle, 'cblas_dgemv') + + global __cblas_dgbmv + __cblas_dgbmv = GetProcAddress(handle, 'cblas_dgbmv') + + global __cblas_dtrmv + __cblas_dtrmv = GetProcAddress(handle, 'cblas_dtrmv') + + global __cblas_dtbmv + __cblas_dtbmv = 
GetProcAddress(handle, 'cblas_dtbmv') + + global __cblas_dtpmv + __cblas_dtpmv = GetProcAddress(handle, 'cblas_dtpmv') + + global __cblas_dtrsv + __cblas_dtrsv = GetProcAddress(handle, 'cblas_dtrsv') + + global __cblas_dtbsv + __cblas_dtbsv = GetProcAddress(handle, 'cblas_dtbsv') + + global __cblas_dtpsv + __cblas_dtpsv = GetProcAddress(handle, 'cblas_dtpsv') + + global __cblas_cgemv + __cblas_cgemv = GetProcAddress(handle, 'cblas_cgemv') + + global __cblas_cgbmv + __cblas_cgbmv = GetProcAddress(handle, 'cblas_cgbmv') + + global __cblas_ctrmv + __cblas_ctrmv = GetProcAddress(handle, 'cblas_ctrmv') + + global __cblas_ctbmv + __cblas_ctbmv = GetProcAddress(handle, 'cblas_ctbmv') + + global __cblas_ctpmv + __cblas_ctpmv = GetProcAddress(handle, 'cblas_ctpmv') + + global __cblas_ctrsv + __cblas_ctrsv = GetProcAddress(handle, 'cblas_ctrsv') + + global __cblas_ctbsv + __cblas_ctbsv = GetProcAddress(handle, 'cblas_ctbsv') + + global __cblas_ctpsv + __cblas_ctpsv = GetProcAddress(handle, 'cblas_ctpsv') + + global __cblas_zgemv + __cblas_zgemv = GetProcAddress(handle, 'cblas_zgemv') + + global __cblas_zgbmv + __cblas_zgbmv = GetProcAddress(handle, 'cblas_zgbmv') + + global __cblas_ztrmv + __cblas_ztrmv = GetProcAddress(handle, 'cblas_ztrmv') + + global __cblas_ztbmv + __cblas_ztbmv = GetProcAddress(handle, 'cblas_ztbmv') + + global __cblas_ztpmv + __cblas_ztpmv = GetProcAddress(handle, 'cblas_ztpmv') + + global __cblas_ztrsv + __cblas_ztrsv = GetProcAddress(handle, 'cblas_ztrsv') + + global __cblas_ztbsv + __cblas_ztbsv = GetProcAddress(handle, 'cblas_ztbsv') + + global __cblas_ztpsv + __cblas_ztpsv = GetProcAddress(handle, 'cblas_ztpsv') + + global __cblas_ssymv + __cblas_ssymv = GetProcAddress(handle, 'cblas_ssymv') + + global __cblas_ssbmv + __cblas_ssbmv = GetProcAddress(handle, 'cblas_ssbmv') + + global __cblas_sspmv + __cblas_sspmv = GetProcAddress(handle, 'cblas_sspmv') + + global __cblas_sger + __cblas_sger = GetProcAddress(handle, 'cblas_sger') + + global __cblas_ssyr + __cblas_ssyr = GetProcAddress(handle, 'cblas_ssyr') + + global __cblas_sspr + __cblas_sspr = GetProcAddress(handle, 'cblas_sspr') + + global __cblas_ssyr2 + __cblas_ssyr2 = GetProcAddress(handle, 'cblas_ssyr2') + + global __cblas_sspr2 + __cblas_sspr2 = GetProcAddress(handle, 'cblas_sspr2') + + global __cblas_dsymv + __cblas_dsymv = GetProcAddress(handle, 'cblas_dsymv') + + global __cblas_dsbmv + __cblas_dsbmv = GetProcAddress(handle, 'cblas_dsbmv') + + global __cblas_dspmv + __cblas_dspmv = GetProcAddress(handle, 'cblas_dspmv') + + global __cblas_dger + __cblas_dger = GetProcAddress(handle, 'cblas_dger') + + global __cblas_dsyr + __cblas_dsyr = GetProcAddress(handle, 'cblas_dsyr') + + global __cblas_dspr + __cblas_dspr = GetProcAddress(handle, 'cblas_dspr') + + global __cblas_dsyr2 + __cblas_dsyr2 = GetProcAddress(handle, 'cblas_dsyr2') + + global __cblas_dspr2 + __cblas_dspr2 = GetProcAddress(handle, 'cblas_dspr2') + + global __cblas_chemv + __cblas_chemv = GetProcAddress(handle, 'cblas_chemv') + + global __cblas_chbmv + __cblas_chbmv = GetProcAddress(handle, 'cblas_chbmv') + + global __cblas_chpmv + __cblas_chpmv = GetProcAddress(handle, 'cblas_chpmv') + + global __cblas_cgeru + __cblas_cgeru = GetProcAddress(handle, 'cblas_cgeru') + + global __cblas_cgerc + __cblas_cgerc = GetProcAddress(handle, 'cblas_cgerc') + + global __cblas_cher + __cblas_cher = GetProcAddress(handle, 'cblas_cher') + + global __cblas_chpr + __cblas_chpr = GetProcAddress(handle, 'cblas_chpr') + + global __cblas_cher2 + __cblas_cher2 = 
GetProcAddress(handle, 'cblas_cher2') + + global __cblas_chpr2 + __cblas_chpr2 = GetProcAddress(handle, 'cblas_chpr2') + + global __cblas_zhemv + __cblas_zhemv = GetProcAddress(handle, 'cblas_zhemv') + + global __cblas_zhbmv + __cblas_zhbmv = GetProcAddress(handle, 'cblas_zhbmv') + + global __cblas_zhpmv + __cblas_zhpmv = GetProcAddress(handle, 'cblas_zhpmv') + + global __cblas_zgeru + __cblas_zgeru = GetProcAddress(handle, 'cblas_zgeru') + + global __cblas_zgerc + __cblas_zgerc = GetProcAddress(handle, 'cblas_zgerc') + + global __cblas_zher + __cblas_zher = GetProcAddress(handle, 'cblas_zher') + + global __cblas_zhpr + __cblas_zhpr = GetProcAddress(handle, 'cblas_zhpr') + + global __cblas_zher2 + __cblas_zher2 = GetProcAddress(handle, 'cblas_zher2') + + global __cblas_zhpr2 + __cblas_zhpr2 = GetProcAddress(handle, 'cblas_zhpr2') + + global __cblas_sgemm + __cblas_sgemm = GetProcAddress(handle, 'cblas_sgemm') + + global __cblas_ssymm + __cblas_ssymm = GetProcAddress(handle, 'cblas_ssymm') + + global __cblas_ssyrk + __cblas_ssyrk = GetProcAddress(handle, 'cblas_ssyrk') + + global __cblas_ssyr2k + __cblas_ssyr2k = GetProcAddress(handle, 'cblas_ssyr2k') + + global __cblas_strmm + __cblas_strmm = GetProcAddress(handle, 'cblas_strmm') + + global __cblas_strsm + __cblas_strsm = GetProcAddress(handle, 'cblas_strsm') + + global __cblas_dgemm + __cblas_dgemm = GetProcAddress(handle, 'cblas_dgemm') + + global __cblas_dsymm + __cblas_dsymm = GetProcAddress(handle, 'cblas_dsymm') + + global __cblas_dsyrk + __cblas_dsyrk = GetProcAddress(handle, 'cblas_dsyrk') + + global __cblas_dsyr2k + __cblas_dsyr2k = GetProcAddress(handle, 'cblas_dsyr2k') + + global __cblas_dtrmm + __cblas_dtrmm = GetProcAddress(handle, 'cblas_dtrmm') + + global __cblas_dtrsm + __cblas_dtrsm = GetProcAddress(handle, 'cblas_dtrsm') + + global __cblas_cgemm + __cblas_cgemm = GetProcAddress(handle, 'cblas_cgemm') + + global __cblas_csymm + __cblas_csymm = GetProcAddress(handle, 'cblas_csymm') + + global __cblas_csyrk + __cblas_csyrk = GetProcAddress(handle, 'cblas_csyrk') + + global __cblas_csyr2k + __cblas_csyr2k = GetProcAddress(handle, 'cblas_csyr2k') + + global __cblas_ctrmm + __cblas_ctrmm = GetProcAddress(handle, 'cblas_ctrmm') + + global __cblas_ctrsm + __cblas_ctrsm = GetProcAddress(handle, 'cblas_ctrsm') + + global __cblas_zgemm + __cblas_zgemm = GetProcAddress(handle, 'cblas_zgemm') + + global __cblas_zsymm + __cblas_zsymm = GetProcAddress(handle, 'cblas_zsymm') + + global __cblas_zsyrk + __cblas_zsyrk = GetProcAddress(handle, 'cblas_zsyrk') + + global __cblas_zsyr2k + __cblas_zsyr2k = GetProcAddress(handle, 'cblas_zsyr2k') + + global __cblas_ztrmm + __cblas_ztrmm = GetProcAddress(handle, 'cblas_ztrmm') + + global __cblas_ztrsm + __cblas_ztrsm = GetProcAddress(handle, 'cblas_ztrsm') + + global __cblas_chemm + __cblas_chemm = GetProcAddress(handle, 'cblas_chemm') + + global __cblas_cherk + __cblas_cherk = GetProcAddress(handle, 'cblas_cherk') + + global __cblas_cher2k + __cblas_cher2k = GetProcAddress(handle, 'cblas_cher2k') + + global __cblas_zhemm + __cblas_zhemm = GetProcAddress(handle, 'cblas_zhemm') + + global __cblas_zherk + __cblas_zherk = GetProcAddress(handle, 'cblas_zherk') + + global __cblas_zher2k + __cblas_zher2k = GetProcAddress(handle, 'cblas_zher2k') + + global __cblas_sgemm_batch + __cblas_sgemm_batch = GetProcAddress(handle, 'cblas_sgemm_batch') + + global __cblas_dgemm_batch + __cblas_dgemm_batch = GetProcAddress(handle, 'cblas_dgemm_batch') + + global __cblas_cgemm_batch + __cblas_cgemm_batch = 
GetProcAddress(handle, 'cblas_cgemm_batch') + + global __cblas_zgemm_batch + __cblas_zgemm_batch = GetProcAddress(handle, 'cblas_zgemm_batch') + + global __cblas_sgemm_batch_strided + __cblas_sgemm_batch_strided = GetProcAddress(handle, 'cblas_sgemm_batch_strided') + + global __cblas_dgemm_batch_strided + __cblas_dgemm_batch_strided = GetProcAddress(handle, 'cblas_dgemm_batch_strided') + + global __cblas_cgemm_batch_strided + __cblas_cgemm_batch_strided = GetProcAddress(handle, 'cblas_cgemm_batch_strided') + + global __cblas_zgemm_batch_strided + __cblas_zgemm_batch_strided = GetProcAddress(handle, 'cblas_zgemm_batch_strided') + + __py_nvpl_blas_init = True + return 0 + + +cdef dict func_ptrs = None + + +cpdef void _set_lib_so_names(tuple lib_so_names): + global __lib_dll_names + __lib_dll_names = lib_so_names + + +cpdef tuple _get_lib_so_names(): + global __lib_dll_names + return __lib_dll_names + + +cpdef str _get_current_lib_so_name(): + global __current_dll_name + return __current_dll_name + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_nvpl_blas() + cdef dict data = {} + + global __MKL_Set_Num_Threads_Local + data["__MKL_Set_Num_Threads_Local"] = __MKL_Set_Num_Threads_Local + + global __MKL_Set_Num_Threads + data["__MKL_Set_Num_Threads"] = __MKL_Set_Num_Threads + + global __openblas_set_num_threads + data["__openblas_set_num_threads"] = __openblas_set_num_threads + + global __openblas_set_num_threads_local + data["__openblas_set_num_threads_local"] = __openblas_set_num_threads_local + + global __nvpl_blas_get_version + data["__nvpl_blas_get_version"] = __nvpl_blas_get_version + + global __nvpl_blas_get_max_threads + data["__nvpl_blas_get_max_threads"] = __nvpl_blas_get_max_threads + + global __nvpl_blas_set_num_threads + data["__nvpl_blas_set_num_threads"] = __nvpl_blas_set_num_threads + + global __nvpl_blas_set_num_threads_local + data["__nvpl_blas_set_num_threads_local"] = __nvpl_blas_set_num_threads_local + + global __cblas_sgemv + data["__cblas_sgemv"] = __cblas_sgemv + + global __cblas_sgbmv + data["__cblas_sgbmv"] = __cblas_sgbmv + + global __cblas_strmv + data["__cblas_strmv"] = __cblas_strmv + + global __cblas_stbmv + data["__cblas_stbmv"] = __cblas_stbmv + + global __cblas_stpmv + data["__cblas_stpmv"] = __cblas_stpmv + + global __cblas_strsv + data["__cblas_strsv"] = __cblas_strsv + + global __cblas_stbsv + data["__cblas_stbsv"] = __cblas_stbsv + + global __cblas_stpsv + data["__cblas_stpsv"] = __cblas_stpsv + + global __cblas_dgemv + data["__cblas_dgemv"] = __cblas_dgemv + + global __cblas_dgbmv + data["__cblas_dgbmv"] = __cblas_dgbmv + + global __cblas_dtrmv + data["__cblas_dtrmv"] = __cblas_dtrmv + + global __cblas_dtbmv + data["__cblas_dtbmv"] = __cblas_dtbmv + + global __cblas_dtpmv + data["__cblas_dtpmv"] = __cblas_dtpmv + + global __cblas_dtrsv + data["__cblas_dtrsv"] = __cblas_dtrsv + + global __cblas_dtbsv + data["__cblas_dtbsv"] = __cblas_dtbsv + + global __cblas_dtpsv + data["__cblas_dtpsv"] = __cblas_dtpsv + + global __cblas_cgemv + data["__cblas_cgemv"] = __cblas_cgemv + + global __cblas_cgbmv + data["__cblas_cgbmv"] = __cblas_cgbmv + + global __cblas_ctrmv + data["__cblas_ctrmv"] = __cblas_ctrmv + + global __cblas_ctbmv + data["__cblas_ctbmv"] = __cblas_ctbmv + + global __cblas_ctpmv + data["__cblas_ctpmv"] = __cblas_ctpmv + + global __cblas_ctrsv + data["__cblas_ctrsv"] = __cblas_ctrsv + + global __cblas_ctbsv + data["__cblas_ctbsv"] = __cblas_ctbsv + + global __cblas_ctpsv + 
data["__cblas_ctpsv"] = __cblas_ctpsv + + global __cblas_zgemv + data["__cblas_zgemv"] = __cblas_zgemv + + global __cblas_zgbmv + data["__cblas_zgbmv"] = __cblas_zgbmv + + global __cblas_ztrmv + data["__cblas_ztrmv"] = __cblas_ztrmv + + global __cblas_ztbmv + data["__cblas_ztbmv"] = __cblas_ztbmv + + global __cblas_ztpmv + data["__cblas_ztpmv"] = __cblas_ztpmv + + global __cblas_ztrsv + data["__cblas_ztrsv"] = __cblas_ztrsv + + global __cblas_ztbsv + data["__cblas_ztbsv"] = __cblas_ztbsv + + global __cblas_ztpsv + data["__cblas_ztpsv"] = __cblas_ztpsv + + global __cblas_ssymv + data["__cblas_ssymv"] = __cblas_ssymv + + global __cblas_ssbmv + data["__cblas_ssbmv"] = __cblas_ssbmv + + global __cblas_sspmv + data["__cblas_sspmv"] = __cblas_sspmv + + global __cblas_sger + data["__cblas_sger"] = __cblas_sger + + global __cblas_ssyr + data["__cblas_ssyr"] = __cblas_ssyr + + global __cblas_sspr + data["__cblas_sspr"] = __cblas_sspr + + global __cblas_ssyr2 + data["__cblas_ssyr2"] = __cblas_ssyr2 + + global __cblas_sspr2 + data["__cblas_sspr2"] = __cblas_sspr2 + + global __cblas_dsymv + data["__cblas_dsymv"] = __cblas_dsymv + + global __cblas_dsbmv + data["__cblas_dsbmv"] = __cblas_dsbmv + + global __cblas_dspmv + data["__cblas_dspmv"] = __cblas_dspmv + + global __cblas_dger + data["__cblas_dger"] = __cblas_dger + + global __cblas_dsyr + data["__cblas_dsyr"] = __cblas_dsyr + + global __cblas_dspr + data["__cblas_dspr"] = __cblas_dspr + + global __cblas_dsyr2 + data["__cblas_dsyr2"] = __cblas_dsyr2 + + global __cblas_dspr2 + data["__cblas_dspr2"] = __cblas_dspr2 + + global __cblas_chemv + data["__cblas_chemv"] = __cblas_chemv + + global __cblas_chbmv + data["__cblas_chbmv"] = __cblas_chbmv + + global __cblas_chpmv + data["__cblas_chpmv"] = __cblas_chpmv + + global __cblas_cgeru + data["__cblas_cgeru"] = __cblas_cgeru + + global __cblas_cgerc + data["__cblas_cgerc"] = __cblas_cgerc + + global __cblas_cher + data["__cblas_cher"] = __cblas_cher + + global __cblas_chpr + data["__cblas_chpr"] = __cblas_chpr + + global __cblas_cher2 + data["__cblas_cher2"] = __cblas_cher2 + + global __cblas_chpr2 + data["__cblas_chpr2"] = __cblas_chpr2 + + global __cblas_zhemv + data["__cblas_zhemv"] = __cblas_zhemv + + global __cblas_zhbmv + data["__cblas_zhbmv"] = __cblas_zhbmv + + global __cblas_zhpmv + data["__cblas_zhpmv"] = __cblas_zhpmv + + global __cblas_zgeru + data["__cblas_zgeru"] = __cblas_zgeru + + global __cblas_zgerc + data["__cblas_zgerc"] = __cblas_zgerc + + global __cblas_zher + data["__cblas_zher"] = __cblas_zher + + global __cblas_zhpr + data["__cblas_zhpr"] = __cblas_zhpr + + global __cblas_zher2 + data["__cblas_zher2"] = __cblas_zher2 + + global __cblas_zhpr2 + data["__cblas_zhpr2"] = __cblas_zhpr2 + + global __cblas_sgemm + data["__cblas_sgemm"] = __cblas_sgemm + + global __cblas_ssymm + data["__cblas_ssymm"] = __cblas_ssymm + + global __cblas_ssyrk + data["__cblas_ssyrk"] = __cblas_ssyrk + + global __cblas_ssyr2k + data["__cblas_ssyr2k"] = __cblas_ssyr2k + + global __cblas_strmm + data["__cblas_strmm"] = __cblas_strmm + + global __cblas_strsm + data["__cblas_strsm"] = __cblas_strsm + + global __cblas_dgemm + data["__cblas_dgemm"] = __cblas_dgemm + + global __cblas_dsymm + data["__cblas_dsymm"] = __cblas_dsymm + + global __cblas_dsyrk + data["__cblas_dsyrk"] = __cblas_dsyrk + + global __cblas_dsyr2k + data["__cblas_dsyr2k"] = __cblas_dsyr2k + + global __cblas_dtrmm + data["__cblas_dtrmm"] = __cblas_dtrmm + + global __cblas_dtrsm + data["__cblas_dtrsm"] = __cblas_dtrsm + + global __cblas_cgemm + 
data["__cblas_cgemm"] = __cblas_cgemm + + global __cblas_csymm + data["__cblas_csymm"] = __cblas_csymm + + global __cblas_csyrk + data["__cblas_csyrk"] = __cblas_csyrk + + global __cblas_csyr2k + data["__cblas_csyr2k"] = __cblas_csyr2k + + global __cblas_ctrmm + data["__cblas_ctrmm"] = __cblas_ctrmm + + global __cblas_ctrsm + data["__cblas_ctrsm"] = __cblas_ctrsm + + global __cblas_zgemm + data["__cblas_zgemm"] = __cblas_zgemm + + global __cblas_zsymm + data["__cblas_zsymm"] = __cblas_zsymm + + global __cblas_zsyrk + data["__cblas_zsyrk"] = __cblas_zsyrk + + global __cblas_zsyr2k + data["__cblas_zsyr2k"] = __cblas_zsyr2k + + global __cblas_ztrmm + data["__cblas_ztrmm"] = __cblas_ztrmm + + global __cblas_ztrsm + data["__cblas_ztrsm"] = __cblas_ztrsm + + global __cblas_chemm + data["__cblas_chemm"] = __cblas_chemm + + global __cblas_cherk + data["__cblas_cherk"] = __cblas_cherk + + global __cblas_cher2k + data["__cblas_cher2k"] = __cblas_cher2k + + global __cblas_zhemm + data["__cblas_zhemm"] = __cblas_zhemm + + global __cblas_zherk + data["__cblas_zherk"] = __cblas_zherk + + global __cblas_zher2k + data["__cblas_zher2k"] = __cblas_zher2k + + global __cblas_sgemm_batch + data["__cblas_sgemm_batch"] = __cblas_sgemm_batch + + global __cblas_dgemm_batch + data["__cblas_dgemm_batch"] = __cblas_dgemm_batch + + global __cblas_cgemm_batch + data["__cblas_cgemm_batch"] = __cblas_cgemm_batch + + global __cblas_zgemm_batch + data["__cblas_zgemm_batch"] = __cblas_zgemm_batch + + global __cblas_sgemm_batch_strided + data["__cblas_sgemm_batch_strided"] = __cblas_sgemm_batch_strided + + global __cblas_dgemm_batch_strided + data["__cblas_dgemm_batch_strided"] = __cblas_dgemm_batch_strided + + global __cblas_cgemm_batch_strided + data["__cblas_cgemm_batch_strided"] = __cblas_cgemm_batch_strided + + global __cblas_zgemm_batch_strided + data["__cblas_zgemm_batch_strided"] = __cblas_zgemm_batch_strided + + func_ptrs = data + return data + + +cpdef _inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef int _MKL_mkl_set_num_threads_local(int nth) except?-42 nogil: + global __MKL_Set_Num_Threads_Local + _check_or_init_nvpl_blas() + if __MKL_Set_Num_Threads_Local == NULL: + with gil: + raise FunctionNotFoundError("function MKL_Set_Num_Threads_Local is not found") + return (__MKL_Set_Num_Threads_Local)( + nth) + + +@cython.show_performance_hints(False) +cdef void _MKL_mkl_set_num_threads(int nth) except* nogil: + global __MKL_Set_Num_Threads + _check_or_init_nvpl_blas() + if __MKL_Set_Num_Threads == NULL: + with gil: + raise FunctionNotFoundError("function MKL_Set_Num_Threads is not found") + (__MKL_Set_Num_Threads)( + nth) + + +@cython.show_performance_hints(False) +cdef void _openblas_openblas_set_num_threads(int num_threads) except* nogil: + global __openblas_set_num_threads + _check_or_init_nvpl_blas() + if __openblas_set_num_threads == NULL: + with gil: + raise FunctionNotFoundError("function openblas_set_num_threads is not found") + (__openblas_set_num_threads)( + num_threads) + + +cdef int _openblas_openblas_set_num_threads_local(int num_threads) except?-42 nogil: + global __openblas_set_num_threads_local + _check_or_init_nvpl_blas() + if __openblas_set_num_threads_local == NULL: + with gil: + raise 
FunctionNotFoundError("function openblas_set_num_threads_local is not found") + return (__openblas_set_num_threads_local)( + num_threads) + + +cdef int _nvpl_blas_get_version() except?-42 nogil: + global __nvpl_blas_get_version + _check_or_init_nvpl_blas() + if __nvpl_blas_get_version == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_get_version is not found") + return (__nvpl_blas_get_version)( + ) + + +cdef int _nvpl_blas_get_max_threads() except?-42 nogil: + global __nvpl_blas_get_max_threads + _check_or_init_nvpl_blas() + if __nvpl_blas_get_max_threads == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_get_max_threads is not found") + return (__nvpl_blas_get_max_threads)( + ) + + +@cython.show_performance_hints(False) +cdef void _nvpl_blas_set_num_threads(int nthr) except* nogil: + global __nvpl_blas_set_num_threads + _check_or_init_nvpl_blas() + if __nvpl_blas_set_num_threads == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_set_num_threads is not found") + (__nvpl_blas_set_num_threads)( + nthr) + + +cdef int _nvpl_blas_set_num_threads_local(int nthr_local) except?-42 nogil: + global __nvpl_blas_set_num_threads_local + _check_or_init_nvpl_blas() + if __nvpl_blas_set_num_threads_local == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_blas_set_num_threads_local is not found") + return (__nvpl_blas_set_num_threads_local)( + nthr_local) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_sgemv + _check_or_init_nvpl_blas() + if __cblas_sgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemv is not found") + (__cblas_sgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_sgbmv + _check_or_init_nvpl_blas() + if __cblas_sgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgbmv is not found") + (__cblas_sgbmv)( + order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_strmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_strmv + _check_or_init_nvpl_blas() + if __cblas_strmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_strmv is not found") + (__cblas_strmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stbmv + _check_or_init_nvpl_blas() + if __cblas_stbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stbmv 
is not found") + (__cblas_stbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stpmv + _check_or_init_nvpl_blas() + if __cblas_stpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stpmv is not found") + (__cblas_stpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_strsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_strsv + _check_or_init_nvpl_blas() + if __cblas_strsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_strsv is not found") + (__cblas_strsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stbsv + _check_or_init_nvpl_blas() + if __cblas_stbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stbsv is not found") + (__cblas_stbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_stpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil: + global __cblas_stpsv + _check_or_init_nvpl_blas() + if __cblas_stpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_stpsv is not found") + (__cblas_stpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dgemv + _check_or_init_nvpl_blas() + if __cblas_dgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemv is not found") + (__cblas_dgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dgbmv + _check_or_init_nvpl_blas() + if __cblas_dgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgbmv is not found") + (__cblas_dgbmv)( + order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtrmv + 
_check_or_init_nvpl_blas() + if __cblas_dtrmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrmv is not found") + (__cblas_dtrmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtbmv + _check_or_init_nvpl_blas() + if __cblas_dtbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtbmv is not found") + (__cblas_dtbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtpmv + _check_or_init_nvpl_blas() + if __cblas_dtpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtpmv is not found") + (__cblas_dtpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtrsv + _check_or_init_nvpl_blas() + if __cblas_dtrsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrsv is not found") + (__cblas_dtrsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtbsv + _check_or_init_nvpl_blas() + if __cblas_dtbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtbsv is not found") + (__cblas_dtbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil: + global __cblas_dtpsv + _check_or_init_nvpl_blas() + if __cblas_dtpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtpsv is not found") + (__cblas_dtpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_cgemv + _check_or_init_nvpl_blas() + if __cblas_cgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgemv is not found") + (__cblas_cgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const 
nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_cgbmv + _check_or_init_nvpl_blas() + if __cblas_cgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgbmv is not found") + (__cblas_cgbmv)( + order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctrmv + _check_or_init_nvpl_blas() + if __cblas_ctrmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrmv is not found") + (__cblas_ctrmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctbmv + _check_or_init_nvpl_blas() + if __cblas_ctbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctbmv is not found") + (__cblas_ctbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctpmv + _check_or_init_nvpl_blas() + if __cblas_ctpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctpmv is not found") + (__cblas_ctpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctrsv + _check_or_init_nvpl_blas() + if __cblas_ctrsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrsv is not found") + (__cblas_ctrsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctbsv + _check_or_init_nvpl_blas() + if __cblas_ctbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctbsv is not found") + (__cblas_ctbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ctpsv + _check_or_init_nvpl_blas() + if __cblas_ctpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctpsv is not found") + (__cblas_ctpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, 
const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zgemv + _check_or_init_nvpl_blas() + if __cblas_zgemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgemv is not found") + (__cblas_zgemv)( + order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zgbmv + _check_or_init_nvpl_blas() + if __cblas_zgbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgbmv is not found") + (__cblas_zgbmv)( + order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztrmv + _check_or_init_nvpl_blas() + if __cblas_ztrmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrmv is not found") + (__cblas_ztrmv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztbmv + _check_or_init_nvpl_blas() + if __cblas_ztbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztbmv is not found") + (__cblas_ztbmv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztpmv + _check_or_init_nvpl_blas() + if __cblas_ztpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztpmv is not found") + (__cblas_ztpmv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztrsv + _check_or_init_nvpl_blas() + if __cblas_ztrsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrsv is not found") + (__cblas_ztrsv)( + order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztbsv + _check_or_init_nvpl_blas() + if __cblas_ztbsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztbsv is not found") + (__cblas_ztbsv)( + order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztpsv(const CBLAS_ORDER order, const 
CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + global __cblas_ztpsv + _check_or_init_nvpl_blas() + if __cblas_ztpsv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztpsv is not found") + (__cblas_ztpsv)( + order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_ssymv + _check_or_init_nvpl_blas() + if __cblas_ssymv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssymv is not found") + (__cblas_ssymv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_ssbmv + _check_or_init_nvpl_blas() + if __cblas_ssbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssbmv is not found") + (__cblas_ssbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_sspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* Ap, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + global __cblas_sspmv + _check_or_init_nvpl_blas() + if __cblas_sspmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sspmv is not found") + (__cblas_sspmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_sger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil: + global __cblas_sger + _check_or_init_nvpl_blas() + if __cblas_sger == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sger is not found") + (__cblas_sger)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* A, const nvpl_int_t lda) except* nogil: + global __cblas_ssyr + _check_or_init_nvpl_blas() + if __cblas_ssyr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyr is not found") + (__cblas_ssyr)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_sspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* Ap) except* nogil: + global __cblas_sspr + _check_or_init_nvpl_blas() + if __cblas_sspr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sspr is not found") + (__cblas_sspr)( + order, Uplo, N, alpha, X, incX, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const 
float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil: + global __cblas_ssyr2 + _check_or_init_nvpl_blas() + if __cblas_ssyr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyr2 is not found") + (__cblas_ssyr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_sspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A) except* nogil: + global __cblas_sspr2 + _check_or_init_nvpl_blas() + if __cblas_sspr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sspr2 is not found") + (__cblas_sspr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dsymv + _check_or_init_nvpl_blas() + if __cblas_dsymv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsymv is not found") + (__cblas_dsymv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dsbmv + _check_or_init_nvpl_blas() + if __cblas_dsbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsbmv is not found") + (__cblas_dsbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* Ap, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + global __cblas_dspmv + _check_or_init_nvpl_blas() + if __cblas_dspmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dspmv is not found") + (__cblas_dspmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_dger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil: + global __cblas_dger + _check_or_init_nvpl_blas() + if __cblas_dger == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dger is not found") + (__cblas_dger)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* A, const nvpl_int_t lda) except* nogil: + global __cblas_dsyr + _check_or_init_nvpl_blas() + if __cblas_dsyr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyr is not found") + (__cblas_dsyr)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_dspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, 
double* Ap) except* nogil: + global __cblas_dspr + _check_or_init_nvpl_blas() + if __cblas_dspr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dspr is not found") + (__cblas_dspr)( + order, Uplo, N, alpha, X, incX, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil: + global __cblas_dsyr2 + _check_or_init_nvpl_blas() + if __cblas_dsyr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyr2 is not found") + (__cblas_dsyr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_dspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A) except* nogil: + global __cblas_dspr2 + _check_or_init_nvpl_blas() + if __cblas_dspr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dspr2 is not found") + (__cblas_dspr2)( + order, Uplo, N, alpha, X, incX, Y, incY, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_chemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_chemv + _check_or_init_nvpl_blas() + if __cblas_chemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chemv is not found") + (__cblas_chemv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_chbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_chbmv + _check_or_init_nvpl_blas() + if __cblas_chbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chbmv is not found") + (__cblas_chbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_chpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_chpmv + _check_or_init_nvpl_blas() + if __cblas_chpmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chpmv is not found") + (__cblas_chpmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_cgeru + _check_or_init_nvpl_blas() + if __cblas_cgeru == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgeru is not found") + (__cblas_cgeru)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* 
A, const nvpl_int_t lda) except* nogil: + global __cblas_cgerc + _check_or_init_nvpl_blas() + if __cblas_cgerc == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgerc is not found") + (__cblas_cgerc)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_cher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_cher + _check_or_init_nvpl_blas() + if __cblas_cher == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cher is not found") + (__cblas_cher)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_chpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil: + global __cblas_chpr + _check_or_init_nvpl_blas() + if __cblas_chpr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chpr is not found") + (__cblas_chpr)( + order, Uplo, N, alpha, X, incX, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_cher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_cher2 + _check_or_init_nvpl_blas() + if __cblas_cher2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cher2 is not found") + (__cblas_cher2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_chpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil: + global __cblas_chpr2 + _check_or_init_nvpl_blas() + if __cblas_chpr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chpr2 is not found") + (__cblas_chpr2)( + order, Uplo, N, alpha, X, incX, Y, incY, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zhemv + _check_or_init_nvpl_blas() + if __cblas_zhemv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhemv is not found") + (__cblas_zhemv)( + order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zhbmv + _check_or_init_nvpl_blas() + if __cblas_zhbmv == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhbmv is not found") + (__cblas_zhbmv)( + order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + global __cblas_zhpmv + _check_or_init_nvpl_blas() + if __cblas_zhpmv 
== NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhpmv is not found") + (__cblas_zhpmv)( + order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_zgeru + _check_or_init_nvpl_blas() + if __cblas_zgeru == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgeru is not found") + (__cblas_zgeru)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_zgerc + _check_or_init_nvpl_blas() + if __cblas_zgerc == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgerc is not found") + (__cblas_zgerc)( + order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_zher + _check_or_init_nvpl_blas() + if __cblas_zher == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zher is not found") + (__cblas_zher)( + order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil: + global __cblas_zhpr + _check_or_init_nvpl_blas() + if __cblas_zhpr == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhpr is not found") + (__cblas_zhpr)( + order, Uplo, N, alpha, X, incX, A) + + +@cython.show_performance_hints(False) +cdef void _cblas_zher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + global __cblas_zher2 + _check_or_init_nvpl_blas() + if __cblas_zher2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zher2 is not found") + (__cblas_zher2)( + order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil: + global __cblas_zhpr2 + _check_or_init_nvpl_blas() + if __cblas_zhpr2 == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhpr2 is not found") + (__cblas_zhpr2)( + order, Uplo, N, alpha, X, incX, Y, incY, Ap) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_sgemm + _check_or_init_nvpl_blas() + if __cblas_sgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemm is not found") 
+ (__cblas_sgemm)( + Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_ssymm + _check_or_init_nvpl_blas() + if __cblas_ssymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssymm is not found") + (__cblas_ssymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_ssyrk + _check_or_init_nvpl_blas() + if __cblas_ssyrk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyrk is not found") + (__cblas_ssyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ssyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + global __cblas_ssyr2k + _check_or_init_nvpl_blas() + if __cblas_ssyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ssyr2k is not found") + (__cblas_ssyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_strmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil: + global __cblas_strmm + _check_or_init_nvpl_blas() + if __cblas_strmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_strmm is not found") + (__cblas_strmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_strsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil: + global __cblas_strsm + _check_or_init_nvpl_blas() + if __cblas_strsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_strsm is not found") + (__cblas_strsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + global __cblas_dgemm + _check_or_init_nvpl_blas() + if __cblas_dgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemm is not found") + (__cblas_dgemm)( + Order, TransA, TransB, M, N, K, alpha, 
A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + global __cblas_dsymm + _check_or_init_nvpl_blas() + if __cblas_dsymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsymm is not found") + (__cblas_dsymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + global __cblas_dsyrk + _check_or_init_nvpl_blas() + if __cblas_dsyrk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyrk is not found") + (__cblas_dsyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + global __cblas_dsyr2k + _check_or_init_nvpl_blas() + if __cblas_dsyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dsyr2k is not found") + (__cblas_dsyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil: + global __cblas_dtrmm + _check_or_init_nvpl_blas() + if __cblas_dtrmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrmm is not found") + (__cblas_dtrmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_dtrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil: + global __cblas_dtrsm + _check_or_init_nvpl_blas() + if __cblas_dtrsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dtrsm is not found") + (__cblas_dtrsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_cgemm + _check_or_init_nvpl_blas() + if __cblas_cgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgemm is not found") + (__cblas_cgemm)( + Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + 
+@cython.show_performance_hints(False) +cdef void _cblas_csymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_csymm + _check_or_init_nvpl_blas() + if __cblas_csymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_csymm is not found") + (__cblas_csymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_csyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_csyrk + _check_or_init_nvpl_blas() + if __cblas_csyrk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_csyrk is not found") + (__cblas_csyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_csyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_csyr2k + _check_or_init_nvpl_blas() + if __cblas_csyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_csyr2k is not found") + (__cblas_csyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ctrmm + _check_or_init_nvpl_blas() + if __cblas_ctrmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrmm is not found") + (__cblas_ctrmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_ctrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ctrsm + _check_or_init_nvpl_blas() + if __cblas_ctrsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ctrsm is not found") + (__cblas_ctrsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zgemm + _check_or_init_nvpl_blas() + if __cblas_zgemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgemm is not found") + (__cblas_zgemm)( + Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zsymm(const CBLAS_ORDER 
Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zsymm + _check_or_init_nvpl_blas() + if __cblas_zsymm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zsymm is not found") + (__cblas_zsymm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zsyrk + _check_or_init_nvpl_blas() + if __cblas_zsyrk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zsyrk is not found") + (__cblas_zsyrk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zsyr2k + _check_or_init_nvpl_blas() + if __cblas_zsyr2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zsyr2k is not found") + (__cblas_zsyr2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ztrmm + _check_or_init_nvpl_blas() + if __cblas_ztrmm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrmm is not found") + (__cblas_ztrmm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_ztrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + global __cblas_ztrsm + _check_or_init_nvpl_blas() + if __cblas_ztrsm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_ztrsm is not found") + (__cblas_ztrsm)( + Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void _cblas_chemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_chemm + _check_or_init_nvpl_blas() + if __cblas_chemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_chemm is not found") + (__cblas_chemm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_cherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, 
const void* A, const nvpl_int_t lda, const float beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_cherk + _check_or_init_nvpl_blas() + if __cblas_cherk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cherk is not found") + (__cblas_cherk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_cher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const float beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_cher2k + _check_or_init_nvpl_blas() + if __cblas_cher2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cher2k is not found") + (__cblas_cher2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zhemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zhemm + _check_or_init_nvpl_blas() + if __cblas_zhemm == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zhemm is not found") + (__cblas_zhemm)( + Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const void* A, const nvpl_int_t lda, const double beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zherk + _check_or_init_nvpl_blas() + if __cblas_zherk == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zherk is not found") + (__cblas_zherk)( + Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_zher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const double beta, void* C, const nvpl_int_t ldc) except* nogil: + global __cblas_zher2k + _check_or_init_nvpl_blas() + if __cblas_zher2k == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zher2k is not found") + (__cblas_zher2k)( + Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const float* alpha_array, const float** A_array, nvpl_int_t* lda_array, const float** B_array, nvpl_int_t* ldb_array, const float* beta_array, float** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_sgemm_batch + _check_or_init_nvpl_blas() + if __cblas_sgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemm_batch is not found") + (__cblas_sgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void 
_cblas_dgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const double* alpha_array, const double** A_array, nvpl_int_t* lda_array, const double** B_array, nvpl_int_t* ldb_array, const double* beta_array, double** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_dgemm_batch + _check_or_init_nvpl_blas() + if __cblas_dgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemm_batch is not found") + (__cblas_dgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_cgemm_batch + _check_or_init_nvpl_blas() + if __cblas_cgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgemm_batch is not found") + (__cblas_cgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + global __cblas_zgemm_batch + _check_or_init_nvpl_blas() + if __cblas_zgemm_batch == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgemm_batch is not found") + (__cblas_zgemm_batch)( + Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_sgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const nvpl_int_t stridea, const float* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const float beta, float* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + global __cblas_sgemm_batch_strided + _check_or_init_nvpl_blas() + if __cblas_sgemm_batch_strided == NULL: + with gil: + raise FunctionNotFoundError("function cblas_sgemm_batch_strided is not found") + (__cblas_sgemm_batch_strided)( + Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_dgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* 
A, const nvpl_int_t lda, const nvpl_int_t stridea, const double* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const double beta, double* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + global __cblas_dgemm_batch_strided + _check_or_init_nvpl_blas() + if __cblas_dgemm_batch_strided == NULL: + with gil: + raise FunctionNotFoundError("function cblas_dgemm_batch_strided is not found") + (__cblas_dgemm_batch_strided)( + Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_cgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + global __cblas_cgemm_batch_strided + _check_or_init_nvpl_blas() + if __cblas_cgemm_batch_strided == NULL: + with gil: + raise FunctionNotFoundError("function cblas_cgemm_batch_strided is not found") + (__cblas_cgemm_batch_strided)( + Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void _cblas_zgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + global __cblas_zgemm_batch_strided + _check_or_init_nvpl_blas() + if __cblas_zgemm_batch_strided == NULL: + with gil: + raise FunctionNotFoundError("function cblas_zgemm_batch_strided is not found") + (__cblas_zgemm_batch_strided)( + Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) diff --git a/nvmath/bindings/nvpl/_internal/fft.pxd b/nvmath/bindings/nvpl/_internal/fft.pxd index 198b8d3..07082ed 100644 --- a/nvmath/bindings/nvpl/_internal/fft.pxd +++ b/nvmath/bindings/nvpl/_internal/fft.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.3.0. Do not modify it directly. +# This code was automatically generated with version 0.4.2. Do not modify it directly. from ..cyfft cimport * diff --git a/nvmath/bindings/nvpl/_internal/fft_linux.pyx b/nvmath/bindings/nvpl/_internal/fft_linux.pyx index a4b77a4..3bc7ec3 100644 --- a/nvmath/bindings/nvpl/_internal/fft_linux.pyx +++ b/nvmath/bindings/nvpl/_internal/fft_linux.pyx @@ -2,13 +2,14 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.3.0. Do not modify it directly. +# This code was automatically generated with version 0.4.2. Do not modify it directly. 
cimport cython from libc.stdint cimport intptr_t from ..._internal.utils import FunctionNotFoundError, NotSupportedError +import threading ############################################################################### # Extern @@ -33,6 +34,7 @@ cdef extern from "" nogil: # Wrapper init ############################################################################### +cdef object __symbol_lock = threading.Lock() cdef bint __py_nvpl_fft_init = False cdef str __current_so_name = "" cdef tuple __lib_so_names = ("libnvpl_fftw.so.0", "libmkl_rt.so.2",) @@ -89,171 +91,173 @@ cdef int _check_or_init_nvpl_fft() except -1 nogil: if __py_nvpl_fft_init: return 0 - # Load function cdef void* handle = NULL - global __nvpl_fft_get_version - __nvpl_fft_get_version = dlsym(RTLD_DEFAULT, 'nvpl_fft_get_version') - if __nvpl_fft_get_version == NULL: - if handle == NULL: - handle = load_library() - __nvpl_fft_get_version = dlsym(handle, 'nvpl_fft_get_version') - - global __fftw_plan_many_dft - __fftw_plan_many_dft = dlsym(RTLD_DEFAULT, 'fftw_plan_many_dft') - if __fftw_plan_many_dft == NULL: - if handle == NULL: - handle = load_library() - __fftw_plan_many_dft = dlsym(handle, 'fftw_plan_many_dft') - - global __fftw_plan_many_dft_r2c - __fftw_plan_many_dft_r2c = dlsym(RTLD_DEFAULT, 'fftw_plan_many_dft_r2c') - if __fftw_plan_many_dft_r2c == NULL: - if handle == NULL: - handle = load_library() - __fftw_plan_many_dft_r2c = dlsym(handle, 'fftw_plan_many_dft_r2c') - - global __fftw_plan_many_dft_c2r - __fftw_plan_many_dft_c2r = dlsym(RTLD_DEFAULT, 'fftw_plan_many_dft_c2r') - if __fftw_plan_many_dft_c2r == NULL: - if handle == NULL: - handle = load_library() - __fftw_plan_many_dft_c2r = dlsym(handle, 'fftw_plan_many_dft_c2r') - - global __fftw_execute_dft - __fftw_execute_dft = dlsym(RTLD_DEFAULT, 'fftw_execute_dft') - if __fftw_execute_dft == NULL: - if handle == NULL: - handle = load_library() - __fftw_execute_dft = dlsym(handle, 'fftw_execute_dft') - - global __fftw_execute_dft_r2c - __fftw_execute_dft_r2c = dlsym(RTLD_DEFAULT, 'fftw_execute_dft_r2c') - if __fftw_execute_dft_r2c == NULL: - if handle == NULL: - handle = load_library() - __fftw_execute_dft_r2c = dlsym(handle, 'fftw_execute_dft_r2c') - - global __fftw_execute_dft_c2r - __fftw_execute_dft_c2r = dlsym(RTLD_DEFAULT, 'fftw_execute_dft_c2r') - if __fftw_execute_dft_c2r == NULL: - if handle == NULL: - handle = load_library() - __fftw_execute_dft_c2r = dlsym(handle, 'fftw_execute_dft_c2r') - - global __fftwf_plan_many_dft - __fftwf_plan_many_dft = dlsym(RTLD_DEFAULT, 'fftwf_plan_many_dft') - if __fftwf_plan_many_dft == NULL: - if handle == NULL: - handle = load_library() - __fftwf_plan_many_dft = dlsym(handle, 'fftwf_plan_many_dft') - - global __fftwf_plan_many_dft_r2c - __fftwf_plan_many_dft_r2c = dlsym(RTLD_DEFAULT, 'fftwf_plan_many_dft_r2c') - if __fftwf_plan_many_dft_r2c == NULL: - if handle == NULL: - handle = load_library() - __fftwf_plan_many_dft_r2c = dlsym(handle, 'fftwf_plan_many_dft_r2c') - - global __fftwf_plan_many_dft_c2r - __fftwf_plan_many_dft_c2r = dlsym(RTLD_DEFAULT, 'fftwf_plan_many_dft_c2r') - if __fftwf_plan_many_dft_c2r == NULL: - if handle == NULL: - handle = load_library() - __fftwf_plan_many_dft_c2r = dlsym(handle, 'fftwf_plan_many_dft_c2r') - - global __fftwf_execute_dft - __fftwf_execute_dft = dlsym(RTLD_DEFAULT, 'fftwf_execute_dft') - if __fftwf_execute_dft == NULL: - if handle == NULL: - handle = load_library() - __fftwf_execute_dft = dlsym(handle, 'fftwf_execute_dft') - - global __fftwf_execute_dft_r2c - __fftwf_execute_dft_r2c 
= dlsym(RTLD_DEFAULT, 'fftwf_execute_dft_r2c') - if __fftwf_execute_dft_r2c == NULL: - if handle == NULL: - handle = load_library() - __fftwf_execute_dft_r2c = dlsym(handle, 'fftwf_execute_dft_r2c') - global __fftwf_execute_dft_c2r - __fftwf_execute_dft_c2r = dlsym(RTLD_DEFAULT, 'fftwf_execute_dft_c2r') - if __fftwf_execute_dft_c2r == NULL: - if handle == NULL: - handle = load_library() - __fftwf_execute_dft_c2r = dlsym(handle, 'fftwf_execute_dft_c2r') - - global __fftw_init_threads - __fftw_init_threads = dlsym(RTLD_DEFAULT, 'fftw_init_threads') - if __fftw_init_threads == NULL: - if handle == NULL: - handle = load_library() - __fftw_init_threads = dlsym(handle, 'fftw_init_threads') - - global __fftwf_init_threads - __fftwf_init_threads = dlsym(RTLD_DEFAULT, 'fftwf_init_threads') - if __fftwf_init_threads == NULL: - if handle == NULL: - handle = load_library() - __fftwf_init_threads = dlsym(handle, 'fftwf_init_threads') - - global __fftw_plan_with_nthreads - __fftw_plan_with_nthreads = dlsym(RTLD_DEFAULT, 'fftw_plan_with_nthreads') - if __fftw_plan_with_nthreads == NULL: - if handle == NULL: - handle = load_library() - __fftw_plan_with_nthreads = dlsym(handle, 'fftw_plan_with_nthreads') - - global __fftwf_plan_with_nthreads - __fftwf_plan_with_nthreads = dlsym(RTLD_DEFAULT, 'fftwf_plan_with_nthreads') - if __fftwf_plan_with_nthreads == NULL: - if handle == NULL: - handle = load_library() - __fftwf_plan_with_nthreads = dlsym(handle, 'fftwf_plan_with_nthreads') - - global __fftw_planner_nthreads - __fftw_planner_nthreads = dlsym(RTLD_DEFAULT, 'fftw_planner_nthreads') - if __fftw_planner_nthreads == NULL: - if handle == NULL: - handle = load_library() - __fftw_planner_nthreads = dlsym(handle, 'fftw_planner_nthreads') - - global __fftwf_planner_nthreads - __fftwf_planner_nthreads = dlsym(RTLD_DEFAULT, 'fftwf_planner_nthreads') - if __fftwf_planner_nthreads == NULL: - if handle == NULL: - handle = load_library() - __fftwf_planner_nthreads = dlsym(handle, 'fftwf_planner_nthreads') - - global __fftw_cleanup_threads - __fftw_cleanup_threads = dlsym(RTLD_DEFAULT, 'fftw_cleanup_threads') - if __fftw_cleanup_threads == NULL: - if handle == NULL: - handle = load_library() - __fftw_cleanup_threads = dlsym(handle, 'fftw_cleanup_threads') - - global __fftwf_cleanup_threads - __fftwf_cleanup_threads = dlsym(RTLD_DEFAULT, 'fftwf_cleanup_threads') - if __fftwf_cleanup_threads == NULL: - if handle == NULL: - handle = load_library() - __fftwf_cleanup_threads = dlsym(handle, 'fftwf_cleanup_threads') - - global __fftw_destroy_plan - __fftw_destroy_plan = dlsym(RTLD_DEFAULT, 'fftw_destroy_plan') - if __fftw_destroy_plan == NULL: - if handle == NULL: - handle = load_library() - __fftw_destroy_plan = dlsym(handle, 'fftw_destroy_plan') - - global __fftwf_destroy_plan - __fftwf_destroy_plan = dlsym(RTLD_DEFAULT, 'fftwf_destroy_plan') - if __fftwf_destroy_plan == NULL: - if handle == NULL: - handle = load_library() - __fftwf_destroy_plan = dlsym(handle, 'fftwf_destroy_plan') - - __py_nvpl_fft_init = True - return 0 + with gil, __symbol_lock: + # Load function + global __nvpl_fft_get_version + __nvpl_fft_get_version = dlsym(RTLD_DEFAULT, 'nvpl_fft_get_version') + if __nvpl_fft_get_version == NULL: + if handle == NULL: + handle = load_library() + __nvpl_fft_get_version = dlsym(handle, 'nvpl_fft_get_version') + + global __fftw_plan_many_dft + __fftw_plan_many_dft = dlsym(RTLD_DEFAULT, 'fftw_plan_many_dft') + if __fftw_plan_many_dft == NULL: + if handle == NULL: + handle = load_library() + __fftw_plan_many_dft = 
dlsym(handle, 'fftw_plan_many_dft') + + global __fftw_plan_many_dft_r2c + __fftw_plan_many_dft_r2c = dlsym(RTLD_DEFAULT, 'fftw_plan_many_dft_r2c') + if __fftw_plan_many_dft_r2c == NULL: + if handle == NULL: + handle = load_library() + __fftw_plan_many_dft_r2c = dlsym(handle, 'fftw_plan_many_dft_r2c') + + global __fftw_plan_many_dft_c2r + __fftw_plan_many_dft_c2r = dlsym(RTLD_DEFAULT, 'fftw_plan_many_dft_c2r') + if __fftw_plan_many_dft_c2r == NULL: + if handle == NULL: + handle = load_library() + __fftw_plan_many_dft_c2r = dlsym(handle, 'fftw_plan_many_dft_c2r') + + global __fftw_execute_dft + __fftw_execute_dft = dlsym(RTLD_DEFAULT, 'fftw_execute_dft') + if __fftw_execute_dft == NULL: + if handle == NULL: + handle = load_library() + __fftw_execute_dft = dlsym(handle, 'fftw_execute_dft') + + global __fftw_execute_dft_r2c + __fftw_execute_dft_r2c = dlsym(RTLD_DEFAULT, 'fftw_execute_dft_r2c') + if __fftw_execute_dft_r2c == NULL: + if handle == NULL: + handle = load_library() + __fftw_execute_dft_r2c = dlsym(handle, 'fftw_execute_dft_r2c') + + global __fftw_execute_dft_c2r + __fftw_execute_dft_c2r = dlsym(RTLD_DEFAULT, 'fftw_execute_dft_c2r') + if __fftw_execute_dft_c2r == NULL: + if handle == NULL: + handle = load_library() + __fftw_execute_dft_c2r = dlsym(handle, 'fftw_execute_dft_c2r') + + global __fftwf_plan_many_dft + __fftwf_plan_many_dft = dlsym(RTLD_DEFAULT, 'fftwf_plan_many_dft') + if __fftwf_plan_many_dft == NULL: + if handle == NULL: + handle = load_library() + __fftwf_plan_many_dft = dlsym(handle, 'fftwf_plan_many_dft') + + global __fftwf_plan_many_dft_r2c + __fftwf_plan_many_dft_r2c = dlsym(RTLD_DEFAULT, 'fftwf_plan_many_dft_r2c') + if __fftwf_plan_many_dft_r2c == NULL: + if handle == NULL: + handle = load_library() + __fftwf_plan_many_dft_r2c = dlsym(handle, 'fftwf_plan_many_dft_r2c') + + global __fftwf_plan_many_dft_c2r + __fftwf_plan_many_dft_c2r = dlsym(RTLD_DEFAULT, 'fftwf_plan_many_dft_c2r') + if __fftwf_plan_many_dft_c2r == NULL: + if handle == NULL: + handle = load_library() + __fftwf_plan_many_dft_c2r = dlsym(handle, 'fftwf_plan_many_dft_c2r') + + global __fftwf_execute_dft + __fftwf_execute_dft = dlsym(RTLD_DEFAULT, 'fftwf_execute_dft') + if __fftwf_execute_dft == NULL: + if handle == NULL: + handle = load_library() + __fftwf_execute_dft = dlsym(handle, 'fftwf_execute_dft') + + global __fftwf_execute_dft_r2c + __fftwf_execute_dft_r2c = dlsym(RTLD_DEFAULT, 'fftwf_execute_dft_r2c') + if __fftwf_execute_dft_r2c == NULL: + if handle == NULL: + handle = load_library() + __fftwf_execute_dft_r2c = dlsym(handle, 'fftwf_execute_dft_r2c') + + global __fftwf_execute_dft_c2r + __fftwf_execute_dft_c2r = dlsym(RTLD_DEFAULT, 'fftwf_execute_dft_c2r') + if __fftwf_execute_dft_c2r == NULL: + if handle == NULL: + handle = load_library() + __fftwf_execute_dft_c2r = dlsym(handle, 'fftwf_execute_dft_c2r') + + global __fftw_init_threads + __fftw_init_threads = dlsym(RTLD_DEFAULT, 'fftw_init_threads') + if __fftw_init_threads == NULL: + if handle == NULL: + handle = load_library() + __fftw_init_threads = dlsym(handle, 'fftw_init_threads') + + global __fftwf_init_threads + __fftwf_init_threads = dlsym(RTLD_DEFAULT, 'fftwf_init_threads') + if __fftwf_init_threads == NULL: + if handle == NULL: + handle = load_library() + __fftwf_init_threads = dlsym(handle, 'fftwf_init_threads') + + global __fftw_plan_with_nthreads + __fftw_plan_with_nthreads = dlsym(RTLD_DEFAULT, 'fftw_plan_with_nthreads') + if __fftw_plan_with_nthreads == NULL: + if handle == NULL: + handle = load_library() + 
__fftw_plan_with_nthreads = dlsym(handle, 'fftw_plan_with_nthreads') + + global __fftwf_plan_with_nthreads + __fftwf_plan_with_nthreads = dlsym(RTLD_DEFAULT, 'fftwf_plan_with_nthreads') + if __fftwf_plan_with_nthreads == NULL: + if handle == NULL: + handle = load_library() + __fftwf_plan_with_nthreads = dlsym(handle, 'fftwf_plan_with_nthreads') + + global __fftw_planner_nthreads + __fftw_planner_nthreads = dlsym(RTLD_DEFAULT, 'fftw_planner_nthreads') + if __fftw_planner_nthreads == NULL: + if handle == NULL: + handle = load_library() + __fftw_planner_nthreads = dlsym(handle, 'fftw_planner_nthreads') + + global __fftwf_planner_nthreads + __fftwf_planner_nthreads = dlsym(RTLD_DEFAULT, 'fftwf_planner_nthreads') + if __fftwf_planner_nthreads == NULL: + if handle == NULL: + handle = load_library() + __fftwf_planner_nthreads = dlsym(handle, 'fftwf_planner_nthreads') + + global __fftw_cleanup_threads + __fftw_cleanup_threads = dlsym(RTLD_DEFAULT, 'fftw_cleanup_threads') + if __fftw_cleanup_threads == NULL: + if handle == NULL: + handle = load_library() + __fftw_cleanup_threads = dlsym(handle, 'fftw_cleanup_threads') + + global __fftwf_cleanup_threads + __fftwf_cleanup_threads = dlsym(RTLD_DEFAULT, 'fftwf_cleanup_threads') + if __fftwf_cleanup_threads == NULL: + if handle == NULL: + handle = load_library() + __fftwf_cleanup_threads = dlsym(handle, 'fftwf_cleanup_threads') + + global __fftw_destroy_plan + __fftw_destroy_plan = dlsym(RTLD_DEFAULT, 'fftw_destroy_plan') + if __fftw_destroy_plan == NULL: + if handle == NULL: + handle = load_library() + __fftw_destroy_plan = dlsym(handle, 'fftw_destroy_plan') + + global __fftwf_destroy_plan + __fftwf_destroy_plan = dlsym(RTLD_DEFAULT, 'fftwf_destroy_plan') + if __fftwf_destroy_plan == NULL: + if handle == NULL: + handle = load_library() + __fftwf_destroy_plan = dlsym(handle, 'fftwf_destroy_plan') + + __py_nvpl_fft_init = True + return 0 cdef dict func_ptrs = None diff --git a/nvmath/bindings/nvpl/_internal/fft_windows.pyx b/nvmath/bindings/nvpl/_internal/fft_windows.pyx new file mode 100644 index 0000000..ecee6ff --- /dev/null +++ b/nvmath/bindings/nvpl/_internal/fft_windows.pyx @@ -0,0 +1,616 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 0.4.2. Do not modify it directly. 
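The hunk above wraps the generated symbol loading in `with gil, __symbol_lock:` so that the first call to `_check_or_init_nvpl_fft()` resolves the FFTW-style entry points exactly once even when several threads hit it concurrently; the new Windows loader below follows the same pattern with `GetProcAddress`. The following is a minimal pure-Python sketch of that lazy, lock-protected initialization, for orientation only: the names, the `ctypes` loading, and the re-check inside the lock are illustrative stand-ins for the generated Cython `dlsym` code, not the code itself.

```python
import ctypes
import threading

_symbol_lock = threading.Lock()
_initialized = False
_symbols = {}  # symbol name -> resolved function


def _check_or_init(lib_names=("libnvpl_fftw.so.0", "libmkl_rt.so.2")):
    """Resolve the FFT entry points exactly once, even under concurrent first calls."""
    global _initialized
    if _initialized:          # fast path once loading has completed
        return
    with _symbol_lock:        # serialize the one-time load
        if _initialized:      # re-check inside the lock (sketch-only guard)
            return
        lib = None
        for name in lib_names:   # stop at the first library that opens
            try:
                lib = ctypes.CDLL(name)
                break
            except OSError:
                continue
        if lib is None:
            raise RuntimeError(f"failed to load any of: {', '.join(lib_names)}")
        for sym in ("fftw_plan_many_dft", "fftw_execute_dft", "fftw_destroy_plan"):
            _symbols[sym] = getattr(lib, sym)
        _initialized = True
```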
+ +cimport cython +from libc.stdint cimport intptr_t + +import os +import site +import threading +import win32api + +from ..._internal.utils import FunctionNotFoundError, NotSupportedError + +from libc.stddef cimport wchar_t +from libc.stdint cimport uintptr_t +from cpython cimport PyUnicode_AsWideCharString, PyMem_Free + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "windows.h" nogil: + ctypedef void* HMODULE + ctypedef void* HANDLE + ctypedef void* FARPROC + ctypedef unsigned long DWORD + ctypedef const wchar_t *LPCWSTR + ctypedef const char *LPCSTR + + cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 + cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 + + HMODULE _LoadLibraryExW "LoadLibraryExW"( + LPCWSTR lpLibFileName, + HANDLE hFile, + DWORD dwFlags + ) + + FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName) + +cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags): + cdef uintptr_t result + cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL) + with nogil: + result = _LoadLibraryExW( + wpath, + hFile, + dwFlags + ) + PyMem_Free(wpath) + return result + +cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil: + return _GetProcAddress(hModule, lpProcName) + +cdef int get_cuda_version(): + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32) + if handle == 0: + raise NotSupportedError('CUDA driver is not found') + cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion') + if cuDriverGetVersion == NULL: + raise RuntimeError('something went wrong') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError('something went wrong') + + return driver_ver + + +############################################################################### +# Wrapper init +############################################################################### + +cdef object __symbol_lock = threading.Lock() +cdef bint __py_nvpl_fft_init = False +cdef str __current_dll_name = "" +cdef tuple __lib_dll_names = ("mkl_rt.2.dll", ) + +cdef void* __nvpl_fft_get_version = NULL +cdef void* __fftw_plan_many_dft = NULL +cdef void* __fftw_plan_many_dft_r2c = NULL +cdef void* __fftw_plan_many_dft_c2r = NULL +cdef void* __fftw_execute_dft = NULL +cdef void* __fftw_execute_dft_r2c = NULL +cdef void* __fftw_execute_dft_c2r = NULL +cdef void* __fftwf_plan_many_dft = NULL +cdef void* __fftwf_plan_many_dft_r2c = NULL +cdef void* __fftwf_plan_many_dft_c2r = NULL +cdef void* __fftwf_execute_dft = NULL +cdef void* __fftwf_execute_dft_r2c = NULL +cdef void* __fftwf_execute_dft_c2r = NULL +cdef void* __fftw_init_threads = NULL +cdef void* __fftwf_init_threads = NULL +cdef void* __fftw_plan_with_nthreads = NULL +cdef void* __fftwf_plan_with_nthreads = NULL +cdef void* __fftw_planner_nthreads = NULL +cdef void* __fftwf_planner_nthreads = NULL +cdef void* __fftw_cleanup_threads = NULL +cdef void* __fftwf_cleanup_threads = NULL +cdef void* __fftw_destroy_plan = NULL +cdef void* __fftwf_destroy_plan = NULL + + +cdef inline list get_site_packages(): + return [site.getusersitepackages()] + site.getsitepackages() + + +cdef void* load_library() except* with gil: + handle = 0 + cdef str all_err_msg = "" + cdef str env_lib_dll_name = os.getenv("NVMATH_FFT_CPU_LIBRARY", "") + + if env_lib_dll_name != "": + try: + handle = 
win32api.GetModuleHandle(env_lib_dll_name) + except Exception as e: + raise RuntimeError( + f"Failed to dlopen NVMATH_FFT_CPU_LIBRARY={env_lib_dll_name}. " + f"Please check that NVMATH_FFT_CPU_LIBRARY is the name of a DLL on the PATH. {e}" + ) + global __current_dll_name + __current_dll_name = env_lib_dll_name + assert handle != 0 + return handle + + if len(__lib_dll_names) == 0: + raise RuntimeError("Cannot load a FFT-compatible library. No DLL names were specified.") + for dll_name in __lib_dll_names: + + # First check if the DLL has been loaded by 3rd parties + try: + handle = win32api.GetModuleHandle(dll_name) + except Exception as e: + all_err_msg += f"\n{e}" + pass + else: + break # stop at first successful open + + # Next, check if DLLs are installed via pip + for sp in get_site_packages(): + mod_path = os.path.join(sp, "..", "..", "Library", "bin") + if not os.path.isdir(mod_path): + continue + os.add_dll_directory(mod_path) + try: + handle = win32api.LoadLibraryEx( + # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... + os.path.join(mod_path, dll_name), + 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) + except Exception as e: + all_err_msg += f"\n{e}" + pass + else: + break # stop at first successful open + + # Finally, try default search + try: + handle = win32api.LoadLibrary(dll_name) + except Exception as e: + all_err_msg += f"\n{e}" + pass + else: + break # stop at first successful open + else: + all_libs = ", ".join(__lib_dll_names) + raise RuntimeError( + f"Failed to dlopen all of the following libraries: {all_libs}. " + "Install/add one of these libraries to the PATH or " + f"use environment variable NVMATH_FFT_CPU_LIBRARY to name a DLL on the PATH. {all_err_msg}" + ) + + global __current_dll_name + __current_dll_name = dll_name + + assert handle != 0 + return handle + + +cdef int _check_or_init_nvpl_fft() except -1 nogil: + global __py_nvpl_fft_init + if __py_nvpl_fft_init: + return 0 + + with gil, __symbol_lock: + # Load library + handle = load_library() + + # Load function + global __nvpl_fft_get_version + __nvpl_fft_get_version = GetProcAddress(handle, 'nvpl_fft_get_version') + + global __fftw_plan_many_dft + __fftw_plan_many_dft = GetProcAddress(handle, 'fftw_plan_many_dft') + + global __fftw_plan_many_dft_r2c + __fftw_plan_many_dft_r2c = GetProcAddress(handle, 'fftw_plan_many_dft_r2c') + + global __fftw_plan_many_dft_c2r + __fftw_plan_many_dft_c2r = GetProcAddress(handle, 'fftw_plan_many_dft_c2r') + + global __fftw_execute_dft + __fftw_execute_dft = GetProcAddress(handle, 'fftw_execute_dft') + + global __fftw_execute_dft_r2c + __fftw_execute_dft_r2c = GetProcAddress(handle, 'fftw_execute_dft_r2c') + + global __fftw_execute_dft_c2r + __fftw_execute_dft_c2r = GetProcAddress(handle, 'fftw_execute_dft_c2r') + + global __fftwf_plan_many_dft + __fftwf_plan_many_dft = GetProcAddress(handle, 'fftwf_plan_many_dft') + + global __fftwf_plan_many_dft_r2c + __fftwf_plan_many_dft_r2c = GetProcAddress(handle, 'fftwf_plan_many_dft_r2c') + + global __fftwf_plan_many_dft_c2r + __fftwf_plan_many_dft_c2r = GetProcAddress(handle, 'fftwf_plan_many_dft_c2r') + + global __fftwf_execute_dft + __fftwf_execute_dft = GetProcAddress(handle, 'fftwf_execute_dft') + + global __fftwf_execute_dft_r2c + __fftwf_execute_dft_r2c = GetProcAddress(handle, 'fftwf_execute_dft_r2c') + + global __fftwf_execute_dft_c2r + __fftwf_execute_dft_c2r = GetProcAddress(handle, 'fftwf_execute_dft_c2r') + + global __fftw_init_threads + __fftw_init_threads = GetProcAddress(handle, 
'fftw_init_threads') + + global __fftwf_init_threads + __fftwf_init_threads = GetProcAddress(handle, 'fftwf_init_threads') + + global __fftw_plan_with_nthreads + __fftw_plan_with_nthreads = GetProcAddress(handle, 'fftw_plan_with_nthreads') + + global __fftwf_plan_with_nthreads + __fftwf_plan_with_nthreads = GetProcAddress(handle, 'fftwf_plan_with_nthreads') + + global __fftw_planner_nthreads + __fftw_planner_nthreads = GetProcAddress(handle, 'fftw_planner_nthreads') + + global __fftwf_planner_nthreads + __fftwf_planner_nthreads = GetProcAddress(handle, 'fftwf_planner_nthreads') + + global __fftw_cleanup_threads + __fftw_cleanup_threads = GetProcAddress(handle, 'fftw_cleanup_threads') + + global __fftwf_cleanup_threads + __fftwf_cleanup_threads = GetProcAddress(handle, 'fftwf_cleanup_threads') + + global __fftw_destroy_plan + __fftw_destroy_plan = GetProcAddress(handle, 'fftw_destroy_plan') + + global __fftwf_destroy_plan + __fftwf_destroy_plan = GetProcAddress(handle, 'fftwf_destroy_plan') + + __py_nvpl_fft_init = True + return 0 + + +cdef dict func_ptrs = None + + +cpdef void _set_lib_so_names(tuple lib_so_names): + global __lib_dll_names + __lib_dll_names = lib_so_names + + +cpdef tuple _get_lib_so_names(): + global __lib_dll_names + return __lib_dll_names + + +cpdef str _get_current_lib_so_name(): + global __current_dll_name + return __current_dll_name + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_nvpl_fft() + cdef dict data = {} + + global __nvpl_fft_get_version + data["__nvpl_fft_get_version"] = __nvpl_fft_get_version + + global __fftw_plan_many_dft + data["__fftw_plan_many_dft"] = __fftw_plan_many_dft + + global __fftw_plan_many_dft_r2c + data["__fftw_plan_many_dft_r2c"] = __fftw_plan_many_dft_r2c + + global __fftw_plan_many_dft_c2r + data["__fftw_plan_many_dft_c2r"] = __fftw_plan_many_dft_c2r + + global __fftw_execute_dft + data["__fftw_execute_dft"] = __fftw_execute_dft + + global __fftw_execute_dft_r2c + data["__fftw_execute_dft_r2c"] = __fftw_execute_dft_r2c + + global __fftw_execute_dft_c2r + data["__fftw_execute_dft_c2r"] = __fftw_execute_dft_c2r + + global __fftwf_plan_many_dft + data["__fftwf_plan_many_dft"] = __fftwf_plan_many_dft + + global __fftwf_plan_many_dft_r2c + data["__fftwf_plan_many_dft_r2c"] = __fftwf_plan_many_dft_r2c + + global __fftwf_plan_many_dft_c2r + data["__fftwf_plan_many_dft_c2r"] = __fftwf_plan_many_dft_c2r + + global __fftwf_execute_dft + data["__fftwf_execute_dft"] = __fftwf_execute_dft + + global __fftwf_execute_dft_r2c + data["__fftwf_execute_dft_r2c"] = __fftwf_execute_dft_r2c + + global __fftwf_execute_dft_c2r + data["__fftwf_execute_dft_c2r"] = __fftwf_execute_dft_c2r + + global __fftw_init_threads + data["__fftw_init_threads"] = __fftw_init_threads + + global __fftwf_init_threads + data["__fftwf_init_threads"] = __fftwf_init_threads + + global __fftw_plan_with_nthreads + data["__fftw_plan_with_nthreads"] = __fftw_plan_with_nthreads + + global __fftwf_plan_with_nthreads + data["__fftwf_plan_with_nthreads"] = __fftwf_plan_with_nthreads + + global __fftw_planner_nthreads + data["__fftw_planner_nthreads"] = __fftw_planner_nthreads + + global __fftwf_planner_nthreads + data["__fftwf_planner_nthreads"] = __fftwf_planner_nthreads + + global __fftw_cleanup_threads + data["__fftw_cleanup_threads"] = __fftw_cleanup_threads + + global __fftwf_cleanup_threads + data["__fftwf_cleanup_threads"] = __fftwf_cleanup_threads + + global __fftw_destroy_plan + 
data["__fftw_destroy_plan"] = __fftw_destroy_plan + + global __fftwf_destroy_plan + data["__fftwf_destroy_plan"] = __fftwf_destroy_plan + + func_ptrs = data + return data + + +cpdef _inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef int _nvpl_fft_get_version() except?-42 nogil: + global __nvpl_fft_get_version + _check_or_init_nvpl_fft() + if __nvpl_fft_get_version == NULL: + with gil: + raise FunctionNotFoundError("function nvpl_fft_get_version is not found") + return (__nvpl_fft_get_version)( + ) + + +cdef fftw_plan _fftw_plan_many_dft(int rank, const int* n, int batch, fftw_complex* in_, const int* inembed, int istride, int idist, fftw_complex* out, const int* onembed, int ostride, int odist, int sign, unsigned flags) except?NULL nogil: + global __fftw_plan_many_dft + _check_or_init_nvpl_fft() + if __fftw_plan_many_dft == NULL: + with gil: + raise FunctionNotFoundError("function fftw_plan_many_dft is not found") + return (__fftw_plan_many_dft)( + rank, n, batch, in_, inembed, istride, idist, out, onembed, ostride, odist, sign, flags) + + +cdef fftw_plan _fftw_plan_many_dft_r2c(int rank, const int* n, int batch, double* in_, const int* inembed, int istride, int idist, fftw_complex* out, const int* onembed, int ostride, int odist, unsigned flags) except?NULL nogil: + global __fftw_plan_many_dft_r2c + _check_or_init_nvpl_fft() + if __fftw_plan_many_dft_r2c == NULL: + with gil: + raise FunctionNotFoundError("function fftw_plan_many_dft_r2c is not found") + return (__fftw_plan_many_dft_r2c)( + rank, n, batch, in_, inembed, istride, idist, out, onembed, ostride, odist, flags) + + +cdef fftw_plan _fftw_plan_many_dft_c2r(int rank, const int* n, int batch, fftw_complex* in_, const int* inembed, int istride, int idist, double* out, const int* onembed, int ostride, int odist, unsigned flags) except?NULL nogil: + global __fftw_plan_many_dft_c2r + _check_or_init_nvpl_fft() + if __fftw_plan_many_dft_c2r == NULL: + with gil: + raise FunctionNotFoundError("function fftw_plan_many_dft_c2r is not found") + return (__fftw_plan_many_dft_c2r)( + rank, n, batch, in_, inembed, istride, idist, out, onembed, ostride, odist, flags) + + +@cython.show_performance_hints(False) +cdef void _fftw_execute_dft(const fftw_plan plan, fftw_complex* idata, fftw_complex* odata) except* nogil: + global __fftw_execute_dft + _check_or_init_nvpl_fft() + if __fftw_execute_dft == NULL: + with gil: + raise FunctionNotFoundError("function fftw_execute_dft is not found") + (__fftw_execute_dft)( + plan, idata, odata) + + +@cython.show_performance_hints(False) +cdef void _fftw_execute_dft_r2c(const fftw_plan plan, double* idata, fftw_complex* odata) except* nogil: + global __fftw_execute_dft_r2c + _check_or_init_nvpl_fft() + if __fftw_execute_dft_r2c == NULL: + with gil: + raise FunctionNotFoundError("function fftw_execute_dft_r2c is not found") + (__fftw_execute_dft_r2c)( + plan, idata, odata) + + +@cython.show_performance_hints(False) +cdef void _fftw_execute_dft_c2r(const fftw_plan plan, fftw_complex* idata, double* odata) except* nogil: + global __fftw_execute_dft_c2r + _check_or_init_nvpl_fft() + if __fftw_execute_dft_c2r == NULL: + with gil: + raise FunctionNotFoundError("function fftw_execute_dft_c2r is not found") + (__fftw_execute_dft_c2r)( 
+ plan, idata, odata) + + +cdef fftwf_plan _fftwf_plan_many_dft(int rank, const int* n, int batch, fftwf_complex* in_, const int* inembed, int istride, int idist, fftwf_complex* out, const int* onembed, int ostride, int odist, int sign, unsigned flags) except?NULL nogil: + global __fftwf_plan_many_dft + _check_or_init_nvpl_fft() + if __fftwf_plan_many_dft == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_plan_many_dft is not found") + return (__fftwf_plan_many_dft)( + rank, n, batch, in_, inembed, istride, idist, out, onembed, ostride, odist, sign, flags) + + +cdef fftwf_plan _fftwf_plan_many_dft_r2c(int rank, const int* n, int batch, float* in_, const int* inembed, int istride, int idist, fftwf_complex* out, const int* onembed, int ostride, int odist, unsigned flags) except?NULL nogil: + global __fftwf_plan_many_dft_r2c + _check_or_init_nvpl_fft() + if __fftwf_plan_many_dft_r2c == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_plan_many_dft_r2c is not found") + return (__fftwf_plan_many_dft_r2c)( + rank, n, batch, in_, inembed, istride, idist, out, onembed, ostride, odist, flags) + + +cdef fftwf_plan _fftwf_plan_many_dft_c2r(int rank, const int* n, int batch, fftwf_complex* in_, const int* inembed, int istride, int idist, float* out, const int* onembed, int ostride, int odist, unsigned flags) except?NULL nogil: + global __fftwf_plan_many_dft_c2r + _check_or_init_nvpl_fft() + if __fftwf_plan_many_dft_c2r == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_plan_many_dft_c2r is not found") + return (__fftwf_plan_many_dft_c2r)( + rank, n, batch, in_, inembed, istride, idist, out, onembed, ostride, odist, flags) + + +@cython.show_performance_hints(False) +cdef void _fftwf_execute_dft(const fftwf_plan plan, fftwf_complex* idata, fftwf_complex* odata) except* nogil: + global __fftwf_execute_dft + _check_or_init_nvpl_fft() + if __fftwf_execute_dft == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_execute_dft is not found") + (__fftwf_execute_dft)( + plan, idata, odata) + + +@cython.show_performance_hints(False) +cdef void _fftwf_execute_dft_r2c(const fftwf_plan plan, float* idata, fftwf_complex* odata) except* nogil: + global __fftwf_execute_dft_r2c + _check_or_init_nvpl_fft() + if __fftwf_execute_dft_r2c == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_execute_dft_r2c is not found") + (__fftwf_execute_dft_r2c)( + plan, idata, odata) + + +@cython.show_performance_hints(False) +cdef void _fftwf_execute_dft_c2r(const fftwf_plan plan, fftwf_complex* idata, float* odata) except* nogil: + global __fftwf_execute_dft_c2r + _check_or_init_nvpl_fft() + if __fftwf_execute_dft_c2r == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_execute_dft_c2r is not found") + (__fftwf_execute_dft_c2r)( + plan, idata, odata) + + +cdef int _fftw_init_threads() except?-42 nogil: + global __fftw_init_threads + _check_or_init_nvpl_fft() + if __fftw_init_threads == NULL: + with gil: + raise FunctionNotFoundError("function fftw_init_threads is not found") + return (__fftw_init_threads)( + ) + + +cdef int _fftwf_init_threads() except?-42 nogil: + global __fftwf_init_threads + _check_or_init_nvpl_fft() + if __fftwf_init_threads == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_init_threads is not found") + return (__fftwf_init_threads)( + ) + + +@cython.show_performance_hints(False) +cdef void _fftw_plan_with_nthreads(int nthreads) except* nogil: + global __fftw_plan_with_nthreads + _check_or_init_nvpl_fft() + if 
__fftw_plan_with_nthreads == NULL: + with gil: + raise FunctionNotFoundError("function fftw_plan_with_nthreads is not found") + (__fftw_plan_with_nthreads)( + nthreads) + + +@cython.show_performance_hints(False) +cdef void _fftwf_plan_with_nthreads(int nthreads) except* nogil: + global __fftwf_plan_with_nthreads + _check_or_init_nvpl_fft() + if __fftwf_plan_with_nthreads == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_plan_with_nthreads is not found") + (__fftwf_plan_with_nthreads)( + nthreads) + + +cdef int _fftw_planner_nthreads() except?-42 nogil: + global __fftw_planner_nthreads + _check_or_init_nvpl_fft() + if __fftw_planner_nthreads == NULL: + with gil: + raise FunctionNotFoundError("function fftw_planner_nthreads is not found") + return (__fftw_planner_nthreads)( + ) + + +cdef int _fftwf_planner_nthreads() except?-42 nogil: + global __fftwf_planner_nthreads + _check_or_init_nvpl_fft() + if __fftwf_planner_nthreads == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_planner_nthreads is not found") + return (__fftwf_planner_nthreads)( + ) + + +@cython.show_performance_hints(False) +cdef void _fftw_cleanup_threads() except* nogil: + global __fftw_cleanup_threads + _check_or_init_nvpl_fft() + if __fftw_cleanup_threads == NULL: + with gil: + raise FunctionNotFoundError("function fftw_cleanup_threads is not found") + (__fftw_cleanup_threads)( + ) + + +@cython.show_performance_hints(False) +cdef void _fftwf_cleanup_threads() except* nogil: + global __fftwf_cleanup_threads + _check_or_init_nvpl_fft() + if __fftwf_cleanup_threads == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_cleanup_threads is not found") + (__fftwf_cleanup_threads)( + ) + + +@cython.show_performance_hints(False) +cdef void _fftw_destroy_plan(fftw_plan plan) except* nogil: + global __fftw_destroy_plan + _check_or_init_nvpl_fft() + if __fftw_destroy_plan == NULL: + with gil: + raise FunctionNotFoundError("function fftw_destroy_plan is not found") + (__fftw_destroy_plan)( + plan) + + +@cython.show_performance_hints(False) +cdef void _fftwf_destroy_plan(fftwf_plan plan) except* nogil: + global __fftwf_destroy_plan + _check_or_init_nvpl_fft() + if __fftwf_destroy_plan == NULL: + with gil: + raise FunctionNotFoundError("function fftwf_destroy_plan is not found") + (__fftwf_destroy_plan)( + plan) diff --git a/nvmath/bindings/nvpl/blas.pxd b/nvmath/bindings/nvpl/blas.pxd new file mode 100644 index 0000000..3c6a757 --- /dev/null +++ b/nvmath/bindings/nvpl/blas.pxd @@ -0,0 +1,85 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 0.4.1. Do not modify it directly. 
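`load_library()` in the new `fft_windows.pyx` above resolves the CPU FFT backend in order: the DLL named by `NVMATH_FFT_CPU_LIBRARY`, any module already loaded into the process, DLLs shipped in a pip package's `Library\bin` directory, and finally a default search for `mkl_rt.2.dll`. A hedged usage sketch follows; it assumes that `nvmath.fft.fft` dispatches NumPy (host) inputs to this CPU backend and that the named DLL is resolvable, as the loader's error message requires.

```python
import os

# Must be set before nvmath first loads the CPU FFT backend; the DLL name is an
# example and must be resolvable (already loaded, or on the PATH).
os.environ["NVMATH_FFT_CPU_LIBRARY"] = "mkl_rt.2.dll"

import numpy as np
import nvmath

a = np.random.rand(4, 1024) + 1j * np.random.rand(4, 1024)
# Host (CPU) execution backed by the DLL selected above (assumed dispatch).
r = nvmath.fft.fft(a, axes=[-1])
print(r.shape, r.dtype)
```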
+ +from libc.stdint cimport intptr_t + +from .cyblas cimport * + + +############################################################################### +# Types +############################################################################### + +ctypedef nvpl_scomplex_t scomplex +ctypedef nvpl_dcomplex_t dcomplex + + +############################################################################### +# Enum +############################################################################### + +ctypedef CBLAS_ORDER _ORDER +ctypedef CBLAS_TRANSPOSE _TRANSPOSE +ctypedef CBLAS_UPLO _UPLO +ctypedef CBLAS_DIAG _DIAG +ctypedef CBLAS_SIDE _SIDE + + +############################################################################### +# Convenience wrappers/adapters +############################################################################### + + +############################################################################### +# Functions +############################################################################### + +cpdef int mkl_set_num_threads_local(int nth) except? -1 +cpdef void mkl_set_num_threads(int nth) except* +cpdef void openblas_set_num_threads(int num_threads) except* +cpdef int openblas_set_num_threads_local(int num_threads) except? -1 +cpdef int get_version() except? -1 +cpdef int get_max_threads() except? -1 +cpdef void set_num_threads(int nthr) except* +cpdef int set_num_threads_local(int nthr_local) except? -1 +cpdef void sgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void ssymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void ssyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, float beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void ssyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void strmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void strsm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void dgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void dsymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void dsyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, double beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void dsyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void dtrmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, double alpha, 
intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void dtrsm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void cgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void csymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void csyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void csyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void ctrmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void ctrsm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void zgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void zsymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void zsyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void zsyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void ztrmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void ztrsm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except* +cpdef void chemm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void cherk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, float beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void cher2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void zhemm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void zherk(int order, int uplo, int trans, nvpl_int64_t n, 
nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, double beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void zher2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except* +cpdef void sgemm_batch(int order, intptr_t trans_a_array, intptr_t trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except* +cpdef void dgemm_batch(int order, intptr_t trans_a_array, intptr_t trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except* +cpdef void cgemm_batch(int order, intptr_t trans_a_array, intptr_t trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except* +cpdef void zgemm_batch(int order, intptr_t trans_a_array, intptr_t trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except* +cpdef void sgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, float beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except* +cpdef void dgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, double beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except* +cpdef void cgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, intptr_t beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except* +cpdef void zgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, intptr_t beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except* diff --git a/nvmath/bindings/nvpl/blas.pyi b/nvmath/bindings/nvpl/blas.pyi new file mode 100644 index 0000000..8ae075c --- /dev/null +++ b/nvmath/bindings/nvpl/blas.pyi @@ -0,0 +1,122 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_4 +import enum +from typing import Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +cgemm: _cython_3_1_4.cython_function_or_method +cgemm_batch: _cython_3_1_4.cython_function_or_method +cgemm_batch_strided: _cython_3_1_4.cython_function_or_method +chemm: _cython_3_1_4.cython_function_or_method +chemm_batch_strided: _cython_3_1_4.cython_function_or_method +cher2k: _cython_3_1_4.cython_function_or_method +cherk: _cython_3_1_4.cython_function_or_method +csymm: _cython_3_1_4.cython_function_or_method +csymm_batch_strided: _cython_3_1_4.cython_function_or_method +csyr2k: _cython_3_1_4.cython_function_or_method +csyrk: _cython_3_1_4.cython_function_or_method +ctrmm: _cython_3_1_4.cython_function_or_method +ctrmm_batch_strided: _cython_3_1_4.cython_function_or_method +ctrsm: _cython_3_1_4.cython_function_or_method +dgemm: _cython_3_1_4.cython_function_or_method +dgemm_batch: _cython_3_1_4.cython_function_or_method +dgemm_batch_strided: _cython_3_1_4.cython_function_or_method +dsymm: _cython_3_1_4.cython_function_or_method +dsymm_batch_strided: _cython_3_1_4.cython_function_or_method +dsyr2k: _cython_3_1_4.cython_function_or_method +dsyrk: _cython_3_1_4.cython_function_or_method +dtrmm: _cython_3_1_4.cython_function_or_method +dtrmm_batch_strided: _cython_3_1_4.cython_function_or_method +dtrsm: _cython_3_1_4.cython_function_or_method +get_max_threads: _cython_3_1_4.cython_function_or_method +get_version: _cython_3_1_4.cython_function_or_method +mkl_set_num_threads: _cython_3_1_4.cython_function_or_method +mkl_set_num_threads_local: _cython_3_1_4.cython_function_or_method +openblas_set_num_threads: _cython_3_1_4.cython_function_or_method +openblas_set_num_threads_local: _cython_3_1_4.cython_function_or_method +set_num_threads: _cython_3_1_4.cython_function_or_method +set_num_threads_local: _cython_3_1_4.cython_function_or_method +sgemm: _cython_3_1_4.cython_function_or_method +sgemm_batch: _cython_3_1_4.cython_function_or_method +sgemm_batch_strided: _cython_3_1_4.cython_function_or_method +ssymm: _cython_3_1_4.cython_function_or_method +ssymm_batch_strided: _cython_3_1_4.cython_function_or_method +ssyr2k: _cython_3_1_4.cython_function_or_method +ssyrk: _cython_3_1_4.cython_function_or_method +strmm: _cython_3_1_4.cython_function_or_method +strmm_batch_strided: _cython_3_1_4.cython_function_or_method +strsm: _cython_3_1_4.cython_function_or_method +zgemm: _cython_3_1_4.cython_function_or_method +zgemm_batch: _cython_3_1_4.cython_function_or_method +zgemm_batch_strided: _cython_3_1_4.cython_function_or_method +zhemm: _cython_3_1_4.cython_function_or_method +zhemm_batch_strided: _cython_3_1_4.cython_function_or_method +zher2k: _cython_3_1_4.cython_function_or_method +zherk: _cython_3_1_4.cython_function_or_method +zsymm: _cython_3_1_4.cython_function_or_method +zsymm_batch_strided: _cython_3_1_4.cython_function_or_method +zsyr2k: _cython_3_1_4.cython_function_or_method +zsyrk: _cython_3_1_4.cython_function_or_method +ztrmm: _cython_3_1_4.cython_function_or_method +ztrmm_batch_strided: _cython_3_1_4.cython_function_or_method +ztrsm: _cython_3_1_4.cython_function_or_method + +class DIAG(enum.IntEnum): + """See `CBLAS_DIAG`.""" + __new__: ClassVar[Callable] = ... + NonUnit: ClassVar[DIAG] = ... + Unit: ClassVar[DIAG] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... 
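Among the stubs above are the thread-control entry points (`get_version`, `get_max_threads`, `set_num_threads`, `set_num_threads_local`, plus the MKL and OpenBLAS variants) whose implementations appear in `blas.pyx` further below. A small usage sketch: the import path is assumed from the file location `nvmath/bindings/nvpl/blas`, which calls are actually available depends on the CPU BLAS that gets loaded, and the "returns the previous value" behavior shown is the MKL-style convention, assumed here rather than stated in the diff.

```python
from nvmath.bindings.nvpl import blas

print("BLAS version:", blas.get_version())
print("max threads :", blas.get_max_threads())

blas.set_num_threads(4)                    # global thread count for subsequent calls
previous = blas.set_num_threads_local(1)   # per-thread override (MKL-style: returns prior value)
# ... run single-threaded BLAS calls here ...
blas.set_num_threads_local(previous)       # restore the previous per-thread setting
```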
+ +class ORDER(enum.IntEnum): + """See `CBLAS_ORDER`.""" + __new__: ClassVar[Callable] = ... + ColMajor: ClassVar[ORDER] = ... + RowMajor: ClassVar[ORDER] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class SIDE(enum.IntEnum): + """See `CBLAS_SIDE`.""" + __new__: ClassVar[Callable] = ... + Left: ClassVar[SIDE] = ... + Right: ClassVar[SIDE] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class TRANSPOSE(enum.IntEnum): + """See `CBLAS_TRANSPOSE`.""" + __new__: ClassVar[Callable] = ... + ConjTrans: ClassVar[TRANSPOSE] = ... + NoTrans: ClassVar[TRANSPOSE] = ... + Trans: ClassVar[TRANSPOSE] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class UPLO(enum.IntEnum): + """See `CBLAS_UPLO`.""" + __new__: ClassVar[Callable] = ... + Lower: ClassVar[UPLO] = ... + Upper: ClassVar[UPLO] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... diff --git a/nvmath/bindings/nvpl/blas.pyx b/nvmath/bindings/nvpl/blas.pyx new file mode 100644 index 0000000..b7cecfc --- /dev/null +++ b/nvmath/bindings/nvpl/blas.pyx @@ -0,0 +1,540 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 0.4.1. Do not modify it directly. 
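`blas.pyx` below defines the CBLAS-style enums (`ORDER`, `TRANSPOSE`, `UPLO`, `DIAG`, `SIDE`) and thin `cpdef` wrappers such as `dgemm` that take raw `intptr_t` pointers and forward to `cblas_*`. A hedged example of calling the GEMM wrapper on C-contiguous NumPy buffers; the import path is assumed from the file location, and pointers are passed via `ndarray.ctypes.data`.

```python
import numpy as np
from nvmath.bindings.nvpl import blas

m, k, n = 128, 64, 32
a = np.random.rand(m, k)   # row-major, leading dimension = k
b = np.random.rand(k, n)   # leading dimension = n
c = np.zeros((m, n))       # leading dimension = n

# C = 1.0 * A @ B + 0.0 * C, row-major, no transposes.
blas.dgemm(
    blas.ORDER.RowMajor, blas.TRANSPOSE.NoTrans, blas.TRANSPOSE.NoTrans,
    m, n, k, 1.0,
    a.ctypes.data, k,
    b.ctypes.data, n,
    0.0,
    c.ctypes.data, n,
)
assert np.allclose(c, a @ b)
```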
+ +cimport cython # NOQA + +from .._internal.utils cimport get_resource_ptr, nullable_unique_ptr + +from enum import IntEnum as _IntEnum + + +############################################################################### +# Enum +############################################################################### + +class ORDER(_IntEnum): + """See `CBLAS_ORDER`.""" + RowMajor = CblasRowMajor + ColMajor = CblasColMajor + +class TRANSPOSE(_IntEnum): + """See `CBLAS_TRANSPOSE`.""" + NoTrans = CblasNoTrans + Trans = CblasTrans + ConjTrans = CblasConjTrans + +class UPLO(_IntEnum): + """See `CBLAS_UPLO`.""" + Upper = CblasUpper + Lower = CblasLower + +class DIAG(_IntEnum): + """See `CBLAS_DIAG`.""" + NonUnit = CblasNonUnit + Unit = CblasUnit + +class SIDE(_IntEnum): + """See `CBLAS_SIDE`.""" + Left = CblasLeft + Right = CblasRight + + +############################################################################### +# Types +############################################################################### + + +############################################################################### +# Error handling +############################################################################### + + +############################################################################### +# Convenience wrappers/adapters +############################################################################### + + +cpdef void zhemm_batch_strided( + int order, + int side, + int uplo, + nvpl_int64_t m, + nvpl_int64_t n, + intptr_t alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + intptr_t beta, + intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stride_c, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + zhemm(order, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + +cpdef void chemm_batch_strided( + int order, + int side, + int uplo, + nvpl_int64_t m, + nvpl_int64_t n, + intptr_t alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + intptr_t beta, + intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stride_c, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + chemm(order, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void ssymm_batch_strided( + int order, + int side, + int uplo, + nvpl_int64_t m, + nvpl_int64_t n, + float alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + float beta, + intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stride_c, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + ssymm(order, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void dsymm_batch_strided( + int order, + int side, + int uplo, 
+ nvpl_int64_t m, + nvpl_int64_t n, + double alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + double beta, + intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stride_c, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + dsymm(order, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void csymm_batch_strided( + int order, + int side, + int uplo, + nvpl_int64_t m, + nvpl_int64_t n, + intptr_t alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + intptr_t beta, + intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stride_c, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + csymm(order, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void zsymm_batch_strided( + int order, + int side, + int uplo, + nvpl_int64_t m, + nvpl_int64_t n, + intptr_t alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + intptr_t beta, + intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stride_c, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef intptr_t c_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + c_batch = (c + batch_idx * stride_c) + + zsymm(order, side, uplo, m, n, alpha, a_batch, lda, b_batch, ldb, beta, c_batch, ldc) + + +cpdef void strmm_batch_strided( + int order, + int side, + int uplo, + int trans_a, + int diag, + nvpl_int64_t m, + nvpl_int64_t n, + float alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + + strmm(order, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb) + + +cpdef void dtrmm_batch_strided( + int order, + int side, + int uplo, + int trans_a, + int diag, + nvpl_int64_t m, + nvpl_int64_t n, + double alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + + dtrmm(order, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb) + +cpdef void ctrmm_batch_strided( + int order, + int side, + int uplo, + int trans_a, + int diag, + nvpl_int64_t m, + nvpl_int64_t n, + intptr_t alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef 
nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + + ctrmm(order, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb) + +cpdef void ztrmm_batch_strided( + int order, + int side, + int uplo, + int trans_a, + int diag, + nvpl_int64_t m, + nvpl_int64_t n, + intptr_t alpha, + intptr_t a, nvpl_int64_t lda, nvpl_int64_t stride_a, + intptr_t b, nvpl_int64_t ldb, nvpl_int64_t stride_b, + nvpl_int64_t batch_count) except*: + + cdef intptr_t a_batch + cdef intptr_t b_batch + cdef nvpl_int64_t batch_idx + + for batch_idx in range(batch_count): + a_batch = (a + batch_idx * stride_a) + b_batch = (b + batch_idx * stride_b) + + ztrmm(order, side, uplo, trans_a, diag, m, n, alpha, a_batch, lda, b_batch, ldb) + + +############################################################################### +# Wrapper functions +############################################################################### + +cpdef int mkl_set_num_threads_local(int nth) except? -1: + """See `MKL_mkl_set_num_threads_local`.""" + return MKL_mkl_set_num_threads_local(nth) + + +cpdef void mkl_set_num_threads(int nth) except*: + """See `MKL_mkl_set_num_threads`.""" + MKL_mkl_set_num_threads(nth) + + +cpdef void openblas_set_num_threads(int num_threads) except*: + """See `openblas_openblas_set_num_threads`.""" + openblas_openblas_set_num_threads(num_threads) + + +cpdef int openblas_set_num_threads_local(int num_threads) except? -1: + """See `openblas_openblas_set_num_threads_local`.""" + return openblas_openblas_set_num_threads_local(num_threads) + + +cpdef int get_version() except? -1: + """See `nvpl_blas_get_version`.""" + return nvpl_blas_get_version() + + +cpdef int get_max_threads() except? -1: + """See `nvpl_blas_get_max_threads`.""" + return nvpl_blas_get_max_threads() + + +cpdef void set_num_threads(int nthr) except*: + """See `nvpl_blas_set_num_threads`.""" + nvpl_blas_set_num_threads(nthr) + + +cpdef int set_num_threads_local(int nthr_local) except? 
-1: + """See `nvpl_blas_set_num_threads_local`.""" + return nvpl_blas_set_num_threads_local(nthr_local) + + +cpdef void sgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_sgemm`.""" + cblas_sgemm(order, trans_a, trans_b, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void ssymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_ssymm`.""" + cblas_ssymm(order, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void ssyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, float beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_ssyrk`.""" + cblas_ssyrk(order, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) + + +cpdef void ssyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_ssyr2k`.""" + cblas_ssyr2k(order, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void strmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except*: + """See `cblas_strmm`.""" + cblas_strmm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void strsm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, float alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except*: + """See `cblas_strsm`.""" + cblas_strsm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void dgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_dgemm`.""" + cblas_dgemm(order, trans_a, trans_b, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void dsymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_dsymm`.""" + cblas_dsymm(order, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void dsyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, double beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_dsyrk`.""" + cblas_dsyrk(order, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) + + +cpdef void dsyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_dsyr2k`.""" + cblas_dsyr2k(order, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void dtrmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except*: + """See `cblas_dtrmm`.""" + cblas_dtrmm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void dtrsm(int order, int side, int uplo, int 
trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, double alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except*: + """See `cblas_dtrsm`.""" + cblas_dtrsm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void cgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_cgemm`.""" + cblas_cgemm(order, trans_a, trans_b, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void csymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_csymm`.""" + cblas_csymm(order, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void csyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_csyrk`.""" + cblas_csyrk(order, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) + + +cpdef void csyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_csyr2k`.""" + cblas_csyr2k(order, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void ctrmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except*: + """See `cblas_ctrmm`.""" + cblas_ctrmm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void ctrsm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except*: + """See `cblas_ctrsm`.""" + cblas_ctrsm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void zgemm(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_zgemm`.""" + cblas_zgemm(order, trans_a, trans_b, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void zsymm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_zsymm`.""" + cblas_zsymm(order, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void zsyrk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_zsyrk`.""" + cblas_zsyrk(order, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) + + +cpdef void zsyr2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_zsyr2k`.""" + cblas_zsyr2k(order, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void ztrmm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t 
ldb) except*: + """See `cblas_ztrmm`.""" + cblas_ztrmm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void ztrsm(int order, int side, int uplo, int trans_a, int diag, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb) except*: + """See `cblas_ztrsm`.""" + cblas_ztrsm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb) + + +cpdef void chemm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_chemm`.""" + cblas_chemm(order, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void cherk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, float beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_cherk`.""" + cblas_cherk(order, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) + + +cpdef void cher2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, float beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_cher2k`.""" + cblas_cher2k(order, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void zhemm(int order, int side, int uplo, nvpl_int64_t m, nvpl_int64_t n, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, intptr_t beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_zhemm`.""" + cblas_zhemm(order, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void zherk(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, double beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_zherk`.""" + cblas_zherk(order, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) + + +cpdef void zher2k(int order, int uplo, int trans, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, intptr_t b, nvpl_int64_t ldb, double beta, intptr_t c, nvpl_int64_t ldc) except*: + """See `cblas_zher2k`.""" + cblas_zher2k(order, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + + +cpdef void sgemm_batch(int order, intptr_t trans_a_array, intptr_t trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except*: + """See `cblas_sgemm_batch`.""" + cblas_sgemm_batch(<_ORDER>order, <_TRANSPOSE*>trans_a_array, <_TRANSPOSE*>trans_b_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size) + + +cpdef void dgemm_batch(int order, intptr_t trans_a_array, intptr_t trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except*: + """See `cblas_dgemm_batch`.""" + cblas_dgemm_batch(<_ORDER>order, <_TRANSPOSE*>trans_a_array, <_TRANSPOSE*>trans_b_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size) + + +cpdef void cgemm_batch(int order, intptr_t trans_a_array, intptr_t 
trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except*: + """See `cblas_cgemm_batch`.""" + cblas_cgemm_batch(<_ORDER>order, <_TRANSPOSE*>trans_a_array, <_TRANSPOSE*>trans_b_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size) + + +cpdef void zgemm_batch(int order, intptr_t trans_a_array, intptr_t trans_b_array, intptr_t m_array, intptr_t n_array, intptr_t k_array, intptr_t alpha_array, intptr_t a_array, intptr_t lda_array, intptr_t b_array, intptr_t ldb_array, intptr_t beta_array, intptr_t c_array, intptr_t ldc_array, nvpl_int64_t group_count, intptr_t group_size) except*: + """See `cblas_zgemm_batch`.""" + cblas_zgemm_batch(<_ORDER>order, <_TRANSPOSE*>trans_a_array, <_TRANSPOSE*>trans_b_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size) + + +cpdef void sgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, float alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, float beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except*: + """See `cblas_sgemm_batch_strided`.""" + cblas_sgemm_batch_strided(order, trans_a, trans_b, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, batch_size) + + +cpdef void dgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, double alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, double beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except*: + """See `cblas_dgemm_batch_strided`.""" + cblas_dgemm_batch_strided(order, trans_a, trans_b, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, batch_size) + + +cpdef void cgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, intptr_t beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except*: + """See `cblas_cgemm_batch_strided`.""" + cblas_cgemm_batch_strided(order, trans_a, trans_b, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, batch_size) + + +cpdef void zgemm_batch_strided(int order, int trans_a, int trans_b, nvpl_int64_t m, nvpl_int64_t n, nvpl_int64_t k, intptr_t alpha, intptr_t a, nvpl_int64_t lda, nvpl_int64_t stridea, intptr_t b, nvpl_int64_t ldb, nvpl_int64_t strideb, intptr_t beta, intptr_t c, nvpl_int64_t ldc, nvpl_int64_t stridec, nvpl_int64_t batch_size) except*: + """See `cblas_zgemm_batch_strided`.""" + cblas_zgemm_batch_strided(order, trans_a, trans_b, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, batch_size) diff --git a/nvmath/bindings/nvpl/cyblas.pxd b/nvmath/bindings/nvpl/cyblas.pxd new file mode 100644 index 0000000..af6526c --- /dev/null +++ b/nvmath/bindings/nvpl/cyblas.pxd @@ -0,0 +1,164 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 0.4.1. Do not modify it directly. +# This layer exposes the C header to Cython as-is. + +from libc.stdint cimport int64_t, int32_t + +############################################################################### +# Types (structs, enums, ...) +############################################################################### + +# enums +ctypedef enum CBLAS_ORDER "CBLAS_ORDER": + CblasRowMajor "CblasRowMajor" = 101 + CblasColMajor "CblasColMajor" = 102 + +ctypedef enum CBLAS_TRANSPOSE "CBLAS_TRANSPOSE": + CblasNoTrans "CblasNoTrans" = 111 + CblasTrans "CblasTrans" = 112 + CblasConjTrans "CblasConjTrans" = 113 + +ctypedef enum CBLAS_UPLO "CBLAS_UPLO": + CblasUpper "CblasUpper" = 121 + CblasLower "CblasLower" = 122 + +ctypedef enum CBLAS_DIAG "CBLAS_DIAG": + CblasNonUnit "CblasNonUnit" = 131 + CblasUnit "CblasUnit" = 132 + +ctypedef enum CBLAS_SIDE "CBLAS_SIDE": + CblasLeft "CblasLeft" = 141 + CblasRight "CblasRight" = 142 + + +# types +ctypedef int64_t nvpl_int64_t 'nvpl_int64_t' +ctypedef int32_t nvpl_int32_t 'nvpl_int32_t' +ctypedef struct nvpl_scomplex_t 'nvpl_scomplex_t': + float real + float imag +ctypedef struct nvpl_dcomplex_t 'nvpl_dcomplex_t': + double real + double imag +ctypedef nvpl_int64_t nvpl_int_t 'nvpl_int_t' + + +############################################################################### +# Functions +############################################################################### + +cdef int MKL_mkl_set_num_threads_local(int nth) except?-42 nogil +cdef void MKL_mkl_set_num_threads(int nth) except* nogil +cdef void openblas_openblas_set_num_threads(int num_threads) except* nogil +cdef int openblas_openblas_set_num_threads_local(int num_threads) except?-42 nogil +cdef int nvpl_blas_get_version() except?-42 nogil +cdef int nvpl_blas_get_max_threads() except?-42 nogil +cdef void nvpl_blas_set_num_threads(int nthr) except* nogil +cdef int nvpl_blas_set_num_threads_local(int nthr_local) except?-42 nogil +cdef void cblas_sgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_sgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_strmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil +cdef void cblas_stbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil +cdef void cblas_stpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil +cdef void cblas_strsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) 
except* nogil +cdef void cblas_stbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil +cdef void cblas_stpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil +cdef void cblas_dgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_dgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_dtrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void cblas_dtbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void cblas_dtpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil +cdef void cblas_dtrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void cblas_dtbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil +cdef void cblas_dtpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil +cdef void cblas_cgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_cgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_ctrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ctbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void 
cblas_ctpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ctrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ctbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ctpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_zgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_zgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_ztrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ztbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ztpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ztrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ztbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ztpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil +cdef void cblas_ssymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_ssbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_sspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* Ap, const float* X, const nvpl_int_t incX, const float beta, float* Y, const 
nvpl_int_t incY) except* nogil +cdef void cblas_sger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil +cdef void cblas_ssyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* A, const nvpl_int_t lda) except* nogil +cdef void cblas_sspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* Ap) except* nogil +cdef void cblas_ssyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil +cdef void cblas_sspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A) except* nogil +cdef void cblas_dsymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_dsbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_dspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* Ap, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_dger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil +cdef void cblas_dsyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* A, const nvpl_int_t lda) except* nogil +cdef void cblas_dspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* Ap) except* nogil +cdef void cblas_dsyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil +cdef void cblas_dspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A) except* nogil +cdef void cblas_chemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_chbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_chpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const 
void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_cgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_cgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_cher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_chpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil +cdef void cblas_cher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_chpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil +cdef void cblas_zhemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_zhbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_zhpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil +cdef void cblas_zgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_zgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_zher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_zhpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil +cdef void cblas_zher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil +cdef void cblas_zhpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil +cdef void cblas_sgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const 
float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_ssymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_ssyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_ssyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_strmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_strsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_dgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_dsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_dsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_dsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_dtrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_dtrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_cgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* 
A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_csymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_csyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_csyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_ctrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_ctrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_zgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_zsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_zsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_zsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_ztrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_ztrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil +cdef void cblas_chemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const 
nvpl_int_t ldc) except* nogil +cdef void cblas_cherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const void* A, const nvpl_int_t lda, const float beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_cher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const float beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_zhemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_zherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const void* A, const nvpl_int_t lda, const double beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_zher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const double beta, void* C, const nvpl_int_t ldc) except* nogil +cdef void cblas_sgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const float* alpha_array, const float** A_array, nvpl_int_t* lda_array, const float** B_array, nvpl_int_t* ldb_array, const float* beta_array, float** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void cblas_dgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const double* alpha_array, const double** A_array, nvpl_int_t* lda_array, const double** B_array, nvpl_int_t* ldb_array, const double* beta_array, double** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void cblas_cgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void cblas_zgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil +cdef void cblas_sgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const nvpl_int_t stridea, const float* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const float beta, float* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil 
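The strided-batch GEMM prototypes above are the ones reached by the Python-level `*gemm_batch_strided` wrappers added earlier in this diff. As a rough illustration of how that layer might be driven, here is a sketch using NumPy buffers; the `nvmath.bindings.nvpl.blas` import path and the pointer passing via `ctypes.data` are assumptions for illustration, not part of the generated sources.

```python
# Hedged sketch: assumes the cpdef layer shown earlier in this diff is importable
# as nvmath.bindings.nvpl.blas and that host pointers are passed as plain ints.
import numpy as np

from nvmath.bindings.nvpl import blas  # assumed module path

batch, m, n, k = 8, 4, 5, 6
a = np.random.rand(batch, m, k)          # row-major, C-contiguous float64
b = np.random.rand(batch, k, n)
c = np.zeros((batch, m, n))

# 101 = CblasRowMajor, 111 = CblasNoTrans (enum values declared in this header).
blas.dgemm_batch_strided(
    101, 111, 111, m, n, k,
    1.0, a.ctypes.data, k, m * k,        # A, lda, stride between batch items
    b.ctypes.data, n, k * n,             # B, ldb, stride
    0.0, c.ctypes.data, n, m * n,        # C, ldc, stride
    batch,
)

assert np.allclose(c, a @ b)
```

Passing raw addresses as `intptr_t` keeps the wrappers framework-agnostic: any object exposing a stable host pointer to a contiguous buffer of the right dtype can be used.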
+cdef void cblas_dgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const nvpl_int_t stridea, const double* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const double beta, double* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil +cdef void cblas_cgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil +cdef void cblas_zgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil diff --git a/nvmath/bindings/nvpl/cyblas.pyx b/nvmath/bindings/nvpl/cyblas.pyx new file mode 100644 index 0000000..9b6f147 --- /dev/null +++ b/nvmath/bindings/nvpl/cyblas.pyx @@ -0,0 +1,568 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This code was automatically generated with version 0.4.1. Do not modify it directly. + +cimport cython + +from ._internal cimport blas as _nvpl_blas + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef int MKL_mkl_set_num_threads_local(int nth) except?-42 nogil: + return _nvpl_blas._MKL_mkl_set_num_threads_local(nth) + + +@cython.show_performance_hints(False) +cdef void MKL_mkl_set_num_threads(int nth) except* nogil: + _nvpl_blas._MKL_mkl_set_num_threads(nth) + + +@cython.show_performance_hints(False) +cdef void openblas_openblas_set_num_threads(int num_threads) except* nogil: + _nvpl_blas._openblas_openblas_set_num_threads(num_threads) + + +cdef int openblas_openblas_set_num_threads_local(int num_threads) except?-42 nogil: + return _nvpl_blas._openblas_openblas_set_num_threads_local(num_threads) + + +cdef int nvpl_blas_get_version() except?-42 nogil: + return _nvpl_blas._nvpl_blas_get_version() + + +cdef int nvpl_blas_get_max_threads() except?-42 nogil: + return _nvpl_blas._nvpl_blas_get_max_threads() + + +@cython.show_performance_hints(False) +cdef void nvpl_blas_set_num_threads(int nthr) except* nogil: + _nvpl_blas._nvpl_blas_set_num_threads(nthr) + + +cdef int nvpl_blas_set_num_threads_local(int nthr_local) except?-42 nogil: + return _nvpl_blas._nvpl_blas_set_num_threads_local(nthr_local) + + +@cython.show_performance_hints(False) +cdef void cblas_sgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + 
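For orientation, the thread-control entry points wrapped at the top of this file (and re-exported as `cpdef` functions earlier in the diff) are the pieces a user is most likely to touch directly. A hedged usage sketch follows; the import path is an assumption for illustration only.

```python
# Hedged sketch of the threading helpers; nvmath.bindings.nvpl.blas is an assumed
# module path for the cpdef layer added in this change.
from nvmath.bindings.nvpl import blas

print("NVPL BLAS version:", blas.get_version())
print("max threads:", blas.get_max_threads())

# Process-wide setting, mirrors nvpl_blas_set_num_threads.
blas.set_num_threads(4)

# Thread-local override; the wrapper returns an int per its signature.
previous = blas.set_num_threads_local(1)
```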
+@cython.show_performance_hints(False) +cdef void cblas_sgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_sgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_strmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_strmv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_stbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_stbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_stpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_stpmv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_strsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_strsv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_stbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const float* A, const nvpl_int_t lda, float* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_stbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_stpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const float* Ap, float* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_stpsv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_dgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_dgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_dgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_dgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_dtrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const 
CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_dtrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_dtbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_dtbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_dtpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_dtpmv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_dtrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_dtrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_dtbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const double* A, const nvpl_int_t lda, double* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_dtbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_dtpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const double* Ap, double* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_dtpsv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_cgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_cgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_cgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_cgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_ctrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ctrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ctbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ctbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) 
+cdef void cblas_ctpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ctpmv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ctrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ctrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ctbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ctbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ctpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ctpsv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_zgemv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_zgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_zgbmv(const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t KL, const nvpl_int_t KU, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_zgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_ztrmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ztrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ztbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ztbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ztpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ztpmv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ztrsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ztrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX) + + 
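One convention worth calling out for the complex-typed routines in this file (and the level-3 wrappers earlier in the diff) is that `alpha` and `beta` are passed by address, since the underlying CBLAS prototypes take `const void*` for complex scalars. The sketch below illustrates this through the Python-level `zgemm` wrapper; the import path and the use of zero-dimensional NumPy arrays to obtain stable scalar addresses are illustrative assumptions.

```python
# Hedged sketch: complex alpha/beta are supplied as pointers (intptr_t), not values.
import numpy as np

from nvmath.bindings.nvpl import blas  # assumed module path

m, n, k = 3, 4, 2
a = np.random.rand(m, k) + 1j * np.random.rand(m, k)   # complex128, C-order
b = np.random.rand(k, n) + 1j * np.random.rand(k, n)
c = np.zeros((m, n), dtype=np.complex128)

alpha = np.array(1.0 + 0.0j)   # 0-d arrays give addressable complex scalars
beta = np.array(0.0 + 0.0j)

# 101 = CblasRowMajor, 111 = CblasNoTrans.
blas.zgemm(101, 111, 111, m, n, k,
           alpha.ctypes.data, a.ctypes.data, k,
           b.ctypes.data, n,
           beta.ctypes.data, c.ctypes.data, n)

assert np.allclose(c, a @ b)
```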
+@cython.show_performance_hints(False) +cdef void cblas_ztbsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const nvpl_int_t K, const void* A, const nvpl_int_t lda, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ztbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ztpsv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t N, const void* Ap, void* X, const nvpl_int_t incX) except* nogil: + _nvpl_blas._cblas_ztpsv(order, Uplo, TransA, Diag, N, Ap, X, incX) + + +@cython.show_performance_hints(False) +cdef void cblas_ssymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_ssymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_ssbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_ssbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_sspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* Ap, const float* X, const nvpl_int_t incX, const float beta, float* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_sspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_sger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_sger(order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_ssyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_ssyr(order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_sspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, float* Ap) except* nogil: + _nvpl_blas._cblas_sspr(order, Uplo, N, alpha, X, incX, Ap) + + +@cython.show_performance_hints(False) +cdef void cblas_ssyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_ssyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_sspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const float* X, const nvpl_int_t incX, const float* Y, const nvpl_int_t incY, float* A) except* nogil: + _nvpl_blas._cblas_sspr2(order, Uplo, N, alpha, X, incX, Y, incY, A) + + +@cython.show_performance_hints(False) +cdef void cblas_dsymv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, 
const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_dsymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_dsbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_dsbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_dspmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* Ap, const double* X, const nvpl_int_t incX, const double beta, double* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_dspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_dger(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_dger(order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_dsyr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_dsyr(order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_dspr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, double* Ap) except* nogil: + _nvpl_blas._cblas_dspr(order, Uplo, N, alpha, X, incX, Ap) + + +@cython.show_performance_hints(False) +cdef void cblas_dsyr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_dsyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_dspr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const double* X, const nvpl_int_t incX, const double* Y, const nvpl_int_t incY, double* A) except* nogil: + _nvpl_blas._cblas_dspr2(order, Uplo, N, alpha, X, incX, Y, incY, A) + + +@cython.show_performance_hints(False) +cdef void cblas_chemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_chemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_chbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_chbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_chpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* 
alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_chpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_cgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_cgeru(order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_cgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_cgerc(order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_cher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_cher(order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_chpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const float alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil: + _nvpl_blas._cblas_chpr(order, Uplo, N, alpha, X, incX, A) + + +@cython.show_performance_hints(False) +cdef void cblas_cher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_cher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_chpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil: + _nvpl_blas._cblas_chpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap) + + +@cython.show_performance_hints(False) +cdef void cblas_zhemv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_zhemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_zhbmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_zhbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_zhpmv(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* Ap, const void* X, const nvpl_int_t incX, const void* beta, void* Y, const nvpl_int_t incY) except* nogil: + _nvpl_blas._cblas_zhpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY) + + +@cython.show_performance_hints(False) +cdef void cblas_zgeru(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + 
_nvpl_blas._cblas_zgeru(order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_zgerc(const CBLAS_ORDER order, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_zgerc(order, M, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_zher(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_zher(order, Uplo, N, alpha, X, incX, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_zhpr(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const double alpha, const void* X, const nvpl_int_t incX, void* A) except* nogil: + _nvpl_blas._cblas_zhpr(order, Uplo, N, alpha, X, incX, A) + + +@cython.show_performance_hints(False) +cdef void cblas_zher2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* A, const nvpl_int_t lda) except* nogil: + _nvpl_blas._cblas_zher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda) + + +@cython.show_performance_hints(False) +cdef void cblas_zhpr2(const CBLAS_ORDER order, const CBLAS_UPLO Uplo, const nvpl_int_t N, const void* alpha, const void* X, const nvpl_int_t incX, const void* Y, const nvpl_int_t incY, void* Ap) except* nogil: + _nvpl_blas._cblas_zhpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap) + + +@cython.show_performance_hints(False) +cdef void cblas_sgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_ssymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_ssymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_ssyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_ssyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_ssyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const float* B, const nvpl_int_t ldb, const float beta, float* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_ssyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_strmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const 
CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_strmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_strsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const float alpha, const float* A, const nvpl_int_t lda, float* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_strsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_dgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_dsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_dsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_dsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_dsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_dsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const double* B, const nvpl_int_t ldb, const double beta, double* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_dsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_dtrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_dtrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_dtrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const double alpha, const double* A, const nvpl_int_t lda, double* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_dtrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_cgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, 
const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_cgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_csymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_csymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_csyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_csyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_csyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_csyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_ctrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_ctrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_ctrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_ctrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_zgemm(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_zgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_zsymm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_zsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_zsyrk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_zsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef 
void cblas_zsyr2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_zsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_ztrmm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_ztrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_ztrsm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, void* B, const nvpl_int_t ldb) except* nogil: + _nvpl_blas._cblas_ztrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) + + +@cython.show_performance_hints(False) +cdef void cblas_chemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_chemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_cherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const void* A, const nvpl_int_t lda, const float beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_cherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_cher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const float beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_cher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_zhemm(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const nvpl_int_t M, const nvpl_int_t N, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const void* beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_zhemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_zherk(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const void* A, const nvpl_int_t lda, const double beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_zherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_zher2k(const CBLAS_ORDER Order, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE Trans, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const void* B, const nvpl_int_t ldb, const 
double beta, void* C, const nvpl_int_t ldc) except* nogil: + _nvpl_blas._cblas_zher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc) + + +@cython.show_performance_hints(False) +cdef void cblas_sgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const float* alpha_array, const float** A_array, nvpl_int_t* lda_array, const float** B_array, nvpl_int_t* ldb_array, const float* beta_array, float** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + _nvpl_blas._cblas_sgemm_batch(Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void cblas_dgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const double* alpha_array, const double** A_array, nvpl_int_t* lda_array, const double** B_array, nvpl_int_t* ldb_array, const double* beta_array, double** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + _nvpl_blas._cblas_dgemm_batch(Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void cblas_cgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + _nvpl_blas._cblas_cgemm_batch(Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void cblas_zgemm_batch(CBLAS_ORDER Order, CBLAS_TRANSPOSE* TransA_array, CBLAS_TRANSPOSE* TransB_array, nvpl_int_t* M_array, nvpl_int_t* N_array, nvpl_int_t* K_array, const void* alpha_array, const void** A_array, nvpl_int_t* lda_array, const void** B_array, nvpl_int_t* ldb_array, const void* beta_array, void** C_array, nvpl_int_t* ldc_array, nvpl_int_t group_count, nvpl_int_t* group_size) except* nogil: + _nvpl_blas._cblas_zgemm_batch(Order, TransA_array, TransB_array, M_array, N_array, K_array, alpha_array, A_array, lda_array, B_array, ldb_array, beta_array, C_array, ldc_array, group_count, group_size) + + +@cython.show_performance_hints(False) +cdef void cblas_sgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const float alpha, const float* A, const nvpl_int_t lda, const nvpl_int_t stridea, const float* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const float beta, float* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + _nvpl_blas._cblas_sgemm_batch_strided(Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void cblas_dgemm_batch_strided(const CBLAS_ORDER Order, 
const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const double alpha, const double* A, const nvpl_int_t lda, const nvpl_int_t stridea, const double* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const double beta, double* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + _nvpl_blas._cblas_dgemm_batch_strided(Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void cblas_cgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + _nvpl_blas._cblas_cgemm_batch_strided(Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) + + +@cython.show_performance_hints(False) +cdef void cblas_zgemm_batch_strided(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const nvpl_int_t M, const nvpl_int_t N, const nvpl_int_t K, const void* alpha, const void* A, const nvpl_int_t lda, const nvpl_int_t stridea, const void* B, const nvpl_int_t ldb, const nvpl_int_t strideb, const void* beta, void* C, const nvpl_int_t ldc, const nvpl_int_t stridec, const nvpl_int_t batch_size) except* nogil: + _nvpl_blas._cblas_zgemm_batch_strided(Order, TransA, TransB, M, N, K, alpha, A, lda, stridea, B, ldb, strideb, beta, C, ldc, stridec, batch_size) diff --git a/nvmath/bindings/nvpl/cyfft.pxd b/nvmath/bindings/nvpl/cyfft.pxd index 40e2fa6..d2efe20 100644 --- a/nvmath/bindings/nvpl/cyfft.pxd +++ b/nvmath/bindings/nvpl/cyfft.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.3.0. Do not modify it directly. +# This code was automatically generated with version 0.4.2. Do not modify it directly. # This layer exposes the C header to Cython as-is. ############################################################################### @@ -33,8 +33,13 @@ cdef extern from *: #define FFTW_PRESERVE_INPUT 0x0C #define FFTW_UNALIGNED 0x10 + #ifdef _WIN32 + typedef __declspec(align(16)) double fftw_complex[2]; + typedef __declspec(align(8)) float fftwf_complex[2]; + #else typedef double fftw_complex[2] __attribute__ ((aligned (16))); typedef float fftwf_complex[2] __attribute__ ((aligned (8))); + #endif """ cdef const int FFTW_FORWARD diff --git a/nvmath/bindings/nvpl/cyfft.pyx b/nvmath/bindings/nvpl/cyfft.pyx index 3ad9973..546e7e6 100644 --- a/nvmath/bindings/nvpl/cyfft.pyx +++ b/nvmath/bindings/nvpl/cyfft.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.3.0. Do not modify it directly. +# This code was automatically generated with version 0.4.2. Do not modify it directly. cimport cython diff --git a/nvmath/bindings/nvpl/fft.pxd b/nvmath/bindings/nvpl/fft.pxd index 3380a13..0a0f38a 100644 --- a/nvmath/bindings/nvpl/fft.pxd +++ b/nvmath/bindings/nvpl/fft.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.3.0. Do not modify it directly. 
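The `cblas_{s,d,c,z}gemm_batch` and `cblas_{s,d,c,z}gemm_batch_strided` wrappers in the NVPL BLAS bindings hunk above are thin pass-throughs to NVPL BLAS. As a host-side reference for the strided-batch semantics only (an illustrative NumPy sketch, not part of this patch; a 3-D array stands in for the base-pointer-plus-`stride{a,b,c}` arithmetic of the C API):

```python
import numpy as np

def gemm_batch_strided_reference(alpha, A, B, beta, C):
    # For each batch entry i: C[i] = alpha * A[i] @ B[i] + beta * C[i],
    # where A[i] is the matrix starting at A + i * stridea in the C API.
    for i in range(C.shape[0]):
        C[i] = alpha * (A[i] @ B[i]) + beta * C[i]
    return C

rng = np.random.default_rng(0)
batch, m, n, k = 4, 8, 8, 8
A = rng.standard_normal((batch, m, k))
B = rng.standard_normal((batch, k, n))
C = np.zeros((batch, m, n))
gemm_batch_strided_reference(2.0, A, B, 0.0, C)
assert np.allclose(C, 2.0 * np.einsum("bik,bkj->bij", A, B))
```

The grouped (non-strided) `*_gemm_batch` variants generalize this by taking per-group transpose, size, and scaling arrays plus `group_count`/`group_size` instead of a single uniform stride.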
+# This code was automatically generated with version 0.4.2. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/nvpl/fft.pyi b/nvmath/bindings/nvpl/fft.pyi index 2188d01..39e66dc 100644 --- a/nvmath/bindings/nvpl/fft.pyi +++ b/nvmath/bindings/nvpl/fft.pyi @@ -2,153 +2,99 @@ # # SPDX-License-Identifier: Apache-2.0 -import _cython_3_1_2 +import _cython_3_1_3 import enum from typing import Any, Callable, ClassVar __pyx_capi__: dict __test__: dict -cleanup_threads: _cython_3_1_2.cython_function_or_method -cleanup_threads_double: _cython_3_1_2.cython_function_or_method -cleanup_threads_float: _cython_3_1_2.cython_function_or_method -destroy: _cython_3_1_2.cython_function_or_method -destroy_plan_double: _cython_3_1_2.cython_function_or_method -destroy_plan_float: _cython_3_1_2.cython_function_or_method -execute: _cython_3_1_2.cython_function_or_method -execute_c2c_double: _cython_3_1_2.cython_function_or_method -execute_c2c_float: _cython_3_1_2.cython_function_or_method -execute_c2r_double: _cython_3_1_2.cython_function_or_method -execute_c2r_float: _cython_3_1_2.cython_function_or_method -execute_r2c_double: _cython_3_1_2.cython_function_or_method -execute_r2c_float: _cython_3_1_2.cython_function_or_method -get_version: _cython_3_1_2.cython_function_or_method -init_threads: _cython_3_1_2.cython_function_or_method -init_threads_double: _cython_3_1_2.cython_function_or_method -init_threads_float: _cython_3_1_2.cython_function_or_method -plan_many: _cython_3_1_2.cython_function_or_method -plan_many_c2c_double: _cython_3_1_2.cython_function_or_method -plan_many_c2c_float: _cython_3_1_2.cython_function_or_method -plan_many_c2r_double: _cython_3_1_2.cython_function_or_method -plan_many_c2r_float: _cython_3_1_2.cython_function_or_method -plan_many_r2c_double: _cython_3_1_2.cython_function_or_method -plan_many_r2c_float: _cython_3_1_2.cython_function_or_method -plan_with_nthreads: _cython_3_1_2.cython_function_or_method -plan_with_nthreads_double: _cython_3_1_2.cython_function_or_method -plan_with_nthreads_float: _cython_3_1_2.cython_function_or_method -planner_nthreads: _cython_3_1_2.cython_function_or_method -planner_nthreads_double: _cython_3_1_2.cython_function_or_method -planner_nthreads_float: _cython_3_1_2.cython_function_or_method +cleanup_threads: _cython_3_1_3.cython_function_or_method +cleanup_threads_double: _cython_3_1_3.cython_function_or_method +cleanup_threads_float: _cython_3_1_3.cython_function_or_method +destroy: _cython_3_1_3.cython_function_or_method +destroy_plan_double: _cython_3_1_3.cython_function_or_method +destroy_plan_float: _cython_3_1_3.cython_function_or_method +execute: _cython_3_1_3.cython_function_or_method +execute_c2c_double: _cython_3_1_3.cython_function_or_method +execute_c2c_float: _cython_3_1_3.cython_function_or_method +execute_c2r_double: _cython_3_1_3.cython_function_or_method +execute_c2r_float: _cython_3_1_3.cython_function_or_method +execute_r2c_double: _cython_3_1_3.cython_function_or_method +execute_r2c_float: _cython_3_1_3.cython_function_or_method +get_version: _cython_3_1_3.cython_function_or_method +init_threads: _cython_3_1_3.cython_function_or_method +init_threads_double: _cython_3_1_3.cython_function_or_method +init_threads_float: _cython_3_1_3.cython_function_or_method +plan_many: _cython_3_1_3.cython_function_or_method +plan_many_c2c_double: _cython_3_1_3.cython_function_or_method +plan_many_c2c_float: _cython_3_1_3.cython_function_or_method +plan_many_c2r_double: _cython_3_1_3.cython_function_or_method 
+plan_many_c2r_float: _cython_3_1_3.cython_function_or_method +plan_many_r2c_double: _cython_3_1_3.cython_function_or_method +plan_many_r2c_float: _cython_3_1_3.cython_function_or_method +plan_with_nthreads: _cython_3_1_3.cython_function_or_method +plan_with_nthreads_double: _cython_3_1_3.cython_function_or_method +plan_with_nthreads_float: _cython_3_1_3.cython_function_or_method +planner_nthreads: _cython_3_1_3.cython_function_or_method +planner_nthreads_double: _cython_3_1_3.cython_function_or_method +planner_nthreads_float: _cython_3_1_3.cython_function_or_method class FFTWError(Exception): ... class FFTWUnaligned(FFTWError): ... class Kind(enum.IntFlag): + """An enumeration.""" __new__: ClassVar[Callable] = ... C2C: ClassVar[Kind] = ... C2R: ClassVar[Kind] = ... R2C: ClassVar[Kind] = ... - _all_bits_: ClassVar[int] = ... - _boundary_: ClassVar[enum.FlagBoundary] = ... - _flag_mask_: ClassVar[int] = ... _generate_next_value_: ClassVar[Callable] = ... - _inverted_: ClassVar[None] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _singles_mask_: ClassVar[int] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - __and__: ClassVar[Callable] = ... - __invert__: ClassVar[Callable] = ... - __or__: ClassVar[Callable] = ... - __rand__: ClassVar[Callable] = ... - __ror__: ClassVar[Callable] = ... - __rxor__: ClassVar[Callable] = ... - __xor__: ClassVar[Callable] = ... - def __format__(self, *args, **kwargs) -> str: ... class Plan: @classmethod - def __init__(cls, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... + def __init__(cls, *args, **kwargs) -> None: + """Create and return a new object. See help(type) for accurate signature.""" + def __reduce__(self) -> Any: + """Plan.__reduce__(self)""" class PlannerFlags(enum.IntFlag): + """An enumeration.""" __new__: ClassVar[Callable] = ... ESTIMATE: ClassVar[PlannerFlags] = ... EXHAUSTIVE: ClassVar[PlannerFlags] = ... MEASURE: ClassVar[PlannerFlags] = ... PATIENT: ClassVar[PlannerFlags] = ... WISDOM_ONLY: ClassVar[PlannerFlags] = ... - _all_bits_: ClassVar[int] = ... - _boundary_: ClassVar[enum.FlagBoundary] = ... - _flag_mask_: ClassVar[int] = ... _generate_next_value_: ClassVar[Callable] = ... - _inverted_: ClassVar[None] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _singles_mask_: ClassVar[int] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - __and__: ClassVar[Callable] = ... - __invert__: ClassVar[Callable] = ... - __or__: ClassVar[Callable] = ... - __rand__: ClassVar[Callable] = ... - __ror__: ClassVar[Callable] = ... - __rxor__: ClassVar[Callable] = ... - __xor__: ClassVar[Callable] = ... - def __format__(self, *args, **kwargs) -> str: ... class Precision(enum.IntFlag): + """An enumeration.""" __new__: ClassVar[Callable] = ... DOUBLE: ClassVar[Precision] = ... FLOAT: ClassVar[Precision] = ... - _all_bits_: ClassVar[int] = ... - _boundary_: ClassVar[enum.FlagBoundary] = ... - _flag_mask_: ClassVar[int] = ... _generate_next_value_: ClassVar[Callable] = ... - _inverted_: ClassVar[None] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _singles_mask_: ClassVar[int] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... 
_value2member_map_: ClassVar[dict] = ... - __and__: ClassVar[Callable] = ... - __invert__: ClassVar[Callable] = ... - __or__: ClassVar[Callable] = ... - __rand__: ClassVar[Callable] = ... - __ror__: ClassVar[Callable] = ... - __rxor__: ClassVar[Callable] = ... - __xor__: ClassVar[Callable] = ... - def __format__(self, *args, **kwargs) -> str: ... class Sign(enum.IntFlag): + """An enumeration.""" __new__: ClassVar[Callable] = ... FORWARD: ClassVar[Sign] = ... INVERSE: ClassVar[Sign] = ... UNSPECIFIED: ClassVar[Sign] = ... - _all_bits_: ClassVar[int] = ... - _boundary_: ClassVar[enum.FlagBoundary] = ... - _flag_mask_: ClassVar[int] = ... _generate_next_value_: ClassVar[Callable] = ... - _inverted_: ClassVar[None] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _member_type_: ClassVar[type[int]] = ... - _singles_mask_: ClassVar[int] = ... - _unhashable_values_: ClassVar[list] = ... - _use_args_: ClassVar[bool] = ... _value2member_map_: ClassVar[dict] = ... - __and__: ClassVar[Callable] = ... - __invert__: ClassVar[Callable] = ... - __or__: ClassVar[Callable] = ... - __rand__: ClassVar[Callable] = ... - __ror__: ClassVar[Callable] = ... - __rxor__: ClassVar[Callable] = ... - __xor__: ClassVar[Callable] = ... - def __format__(self, *args, **kwargs) -> str: ... diff --git a/nvmath/bindings/nvpl/fft.pyx b/nvmath/bindings/nvpl/fft.pyx index 74583ec..4614ae3 100644 --- a/nvmath/bindings/nvpl/fft.pyx +++ b/nvmath/bindings/nvpl/fft.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated with version 0.3.0. Do not modify it directly. +# This code was automatically generated with version 0.4.2. Do not modify it directly. cimport cython # NOQA from libc.stdint cimport int64_t diff --git a/nvmath/device/__init__.py b/nvmath/device/__init__.py index 15ab7cb..abaff1e 100644 --- a/nvmath/device/__init__.py +++ b/nvmath/device/__init__.py @@ -6,8 +6,10 @@ from .cufftdx import * # noqa: E402, F403 from .cublasdx import * # noqa: E402, F403 from .cublasdx_backend import * # noqa: E402, F403 +from .types import * # noqa: E402, F403 from .vector_types_numba import * # noqa: E402, F403 -from .common import make_tensor # noqa: E402, F401 +from .common import * # noqa: E402, F403 # register models in numba from . import cublasdx_numba # noqa: E402, F401 +from . 
import cufftdx_numba # noqa: E402, F401 diff --git a/nvmath/device/common.py b/nvmath/device/common.py index 8a73e24..aa9f9e6 100644 --- a/nvmath/device/common.py +++ b/nvmath/device/common.py @@ -5,16 +5,20 @@ from abc import abstractmethod import os import tempfile +from collections.abc import Sequence +from typing import Any import numpy as np -from .common_cuda import CodeType +from .common_cuda import MAX_SUPPORTED_CC, MIN_SUPPORTED_CC, CodeType, ComputeCapability, get_current_device_cc __all__ = [ "make_tensor", "OpaqueTensor", "Layout", + "Partition", + "Partitioner", "axpby", "copy", "copy_fragment", @@ -30,6 +34,7 @@ ``numpy.float64``.""".replace("\n", " "), # "code_type": "The target GPU code and compute-capability.", + "sm": "Target mathdx compute-capability.", # "execution": "A string specifying the execution method, can be ``'Block'`` or ``'Thread'``.", } @@ -72,15 +77,41 @@ def check_contains(set, key): raise ValueError(f"{key} must be in {set}") -def check_code_type(code_type): - if isinstance(code_type, CodeType): - if code_type.cc.major < 7: - raise ValueError(f"code_type.cc.major must be >= 7 ; got code_type.cc.major = {code_type.cc.major}") - if code_type.cc.minor < 0: - raise ValueError(f"code_type.cc.minor must be >= 0 ; got code_type.cc.minor = {code_type.cc.minor}") - check_in("code_type.kind", code_type.kind, ["lto"]) +def parse_sm(sm: Any) -> ComputeCapability: + if sm is None: + sm = get_current_device_cc() else: + if not isinstance(sm, ComputeCapability) and not isinstance(sm, int): + raise ValueError(f"sm should be a ComputeCapability or an int; got sm = {sm}") + if isinstance(sm, int): + sm = ComputeCapability(sm // 10, sm % 10) + + return sm + + +def check_sm(sm, library_name: str, var_name: str = "sm"): + if not isinstance(sm, ComputeCapability): + raise ValueError(f"{var_name} should be an instance of ComputeCapability ; got {var_name} = {sm}") + if sm < MIN_SUPPORTED_CC: + raise RuntimeError(f"Minimal compute capability {MIN_SUPPORTED_CC} is required by {library_name}, got {sm}") + if sm > MAX_SUPPORTED_CC: + raise RuntimeError(f"The maximum compute capability currently supported by device APIs is {MAX_SUPPORTED_CC}, got {sm}") + if sm.minor < 0: + raise ValueError(f"{var_name}.minor must be >= 0 ; got {var_name}.minor = {sm.minor}") + + +def parse_code_type(code_type: Any) -> CodeType: + if not isinstance(code_type, Sequence) or len(code_type) != 2: + raise ValueError(f"code_type should be an instance of CodeType or a 2-tuple ; got code_type = {code_type}") + + return CodeType(code_type[0], ComputeCapability(*code_type[1])) + + +def check_code_type(code_type, library_name): + if not isinstance(code_type, CodeType): raise ValueError(f"code_type should be an instance of CodeType ; got code_type = {code_type}") + check_sm(code_type.cc, library_name, "code_type.cc") + check_in("code_type.kind", code_type.kind, ["lto"]) def pad_or_truncate(list, target_len): @@ -88,7 +119,14 @@ def pad_or_truncate(list, target_len): class Layout: - """Layout for the OpaqueTensor""" + """ + Layout for the :py:class:`nvmath.device.OpaqueTensor`. + + .. note:: Do not create directly, use appropriate method from + :py:func:`nvmath.device.Matmul`. Refer to + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#imported-tensor-utilities + for guidance on which method to use. + """ def __init__(self): raise RuntimeError("Layout should not be called directly.") @@ -99,6 +137,9 @@ def size(self) -> int: """ Number of valid elements in a tensor. 
This is simply a product of all shape dimensions. + + Refer to the cuBLASDx documentation for more details on how to use this attribute: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#imported-tensor-utilities """ pass @@ -108,11 +149,25 @@ def cosize(self) -> int: """ Returns a distance from last element of a tensor to its first element. It describes how many elements does the argument layout span. + + Refer to the cuBLASDx documentation for more details on how to use this attribute: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#imported-tensor-utilities """ pass class OpaqueTensor: + """ + Abstraction over the cuBLASDx tensor type (an alias of the CuTe tensor type). + The CuTe tensor layout is powerful and supports layouts not provided by NumPy, + so this a bridge to add this functionality to Python. + + .. note:: Do not create directly, use :py:func:`nvmath.device.make_tensor`. + + Refer to the cuBLASDx documentation for more details on how to use this class: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#tensors + """ + buffer: np.ndarray layout: Layout leading_dimension: int | None @@ -121,25 +176,160 @@ def __init__(self, *args): raise RuntimeError("OpaqueTensor should not be called directly outside of a numba.cuda.jit(...) kernel.") +class Partition: + """ + Partition of a global memory tensor into a partitioned tensor. This is used + for accessing the C matrix when working with register fragments. + + .. note:: Do not create directly, use + :py:func:`nvmath.device.Partitioner.partition_like_C`. + + Refer to the cuBLASDx documentation for more details on how to use this class: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#partitioner-register-tensor-other-label + """ + + def __init__(self, *args): + raise RuntimeError("Partition should not be called directly") + + +class Partitioner: + """ + Partitioner is an abstraction for partitioning a global memory tensor into a + partitioned tensor. + + .. note:: Do not create directly, use + :py:func:`nvmath.device.Matmul.suggest_partitioner`. + + Refer to the cuBLASDx documentation for more details on how to use this class: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#partitioner-register-tensor-other-label + """ + + def __init__(self, *args): + raise RuntimeError("Partitioner should not be called directly") + + @abstractmethod + def partition_like_C(self, gmem_c: OpaqueTensor) -> Partition: + """ + Partitions the given global memory tensor `gmem_c` into a partitioned tensor. + The partitioned tensor is used for accessing the C matrix when working + with register fragment. + """ + raise NotImplementedError("This method should be implemented in a subclass.") + + @abstractmethod + def map_fragment_index(self, fragment_index: int) -> tuple[int, int]: + """ + Maps the given fragment index to a global memory index. + This is used to access the correct element in the partitioned tensor. + """ + raise NotImplementedError("This method should be implemented in a subclass.") + + @abstractmethod + def is_thread_active(self) -> bool: + """ + Checks if the current thread takes part in GEMM. + """ + raise NotImplementedError("This method should be implemented in a subclass.") + + @abstractmethod + def is_predicated(self) -> bool: + """ + Checks if the current thread is predicated. + This is used to determine if the thread should execute the kernel. 
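The `size`/`cosize` pair documented earlier in this hunk follows the CuTe definitions: `size` counts valid elements, while `cosize` measures the span of memory they occupy. A quick host-side illustration, assuming (for illustration only) a plain column-major M x N layout with a padded leading dimension `ld`; real `Layout` objects come from the `Matmul` suggestion methods:

```python
# Illustrative only -- not part of the patch.
def layout_size(m, n):
    return m * n  # number of valid elements

def layout_cosize(m, n, ld):
    # One past the largest linear index addressed by the layout:
    # element (m-1, n-1) sits at (m-1) + (n-1) * ld.
    return (n - 1) * ld + m

m, n, ld = 4, 3, 8
assert layout_size(m, n) == 12        # 12 valid elements ...
assert layout_cosize(m, n, ld) == 20  # ... spread over 20 slots due to padding
```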
+ """ + raise NotImplementedError("This method should be implemented in a subclass.") + + @abstractmethod + def is_index_in_bounds(self, index: int) -> bool: + """ + Checks if the given index is within the bounds of the partitioned tensor. + This is used to prevent out-of-bounds access in the kernel. + """ + raise NotImplementedError("This method should be implemented in a subclass.") + + def make_tensor(array: np.ndarray, layout: Layout) -> OpaqueTensor: + """ + make_tensor is a helper function for creating + :py:class:`nvmath.device.OpaqueTensor` objects. + + Args: + array (np.ndarray): The input array to be wrapped as an OpaqueTensor. + layout (Layout): The layout of the tensor, which describes how the data is + organized in memory. + + Refer to the cuBLASDx documentation for more details on how to use this function: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#create-tensor-other-label + """ raise RuntimeError("make_tensor should not be called directly outside of a numba.cuda.jit(...) kernel.") def axpby(alpha: float, x_tensor: OpaqueTensor, beta: float, y_tensor: OpaqueTensor) -> None: + """ + AXPBY operation: y = alpha * x + beta * y + + Args: + alpha (float): Scalar multiplier for x_tensor. + x_tensor (OpaqueTensor): Input tensor x. + beta (float): Scalar multiplier for y_tensor. + y_tensor (OpaqueTensor): Input/output tensor y, which will be updated + with the result. + + Refer to the cuBLASDx documentation for more details on how to use this function: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#imported-tensor-utilities + """ raise RuntimeError("axpby should not be called directly outside of a numba.cuda.jit(...) kernel.") -def copy(src: OpaqueTensor, dst: OpaqueTensor): +def copy(src: OpaqueTensor, dst: OpaqueTensor, alignment=None): + """ + Copies data from the source tensor to the destination tensor. + + Args: + src (OpaqueTensor): The source tensor to copy from. + dst (OpaqueTensor): The destination tensor to copy to. + + Refer to the cuBLASDx documentation for more details on how to use this function: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#cooperative-global-shared-copying + """ raise RuntimeError("copy should not be called directly outside of a numba.cuda.jit(...) kernel.") def copy_fragment(src: OpaqueTensor, dst: OpaqueTensor): + """ + A bidirectional copying method to copy data between register fragments and + global memory tensors. + + Args: + src (OpaqueTensor): The source tensor to copy from. + dst (OpaqueTensor): The destination tensor to copy to. + + Refer to the cuBLASDx documentation for more details on how to use this function: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#copying-registers-tensors + """ raise RuntimeError("copy_fragment should not be called directly outside of a numba.cuda.jit(...) kernel.") def clear(arr: OpaqueTensor): - raise RuntimeError("copy_c should not be called directly outside of a numba.cuda.jit(...) kernel.") + """ + Clears the contents of the given tensor by setting all elements to zero. + + Args: + arr (OpaqueTensor): The tensor to be cleared. + + Refer to the cuBLASDx documentation for more details on how to use this function: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#imported-tensor-utilities + """ + raise RuntimeError("clear should not be called directly outside of a numba.cuda.jit(...) kernel.") def copy_wait(): + """ + Creates synchronization point. 
It has to be called after :py:func:`nvmath.device.copy` + to ensure that the copy operation has completed before any subsequent + operations are executed. + + Refer to the cuBLASDx documentation for more details on how to use this function: + https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#cooperative-global-shared-copying + """ raise RuntimeError("copy_wait should not be called directly outside of a numba.cuda.jit(...) kernel.") diff --git a/nvmath/device/common_backend.py b/nvmath/device/common_backend.py index 2d5f7c5..9994662 100644 --- a/nvmath/device/common_backend.py +++ b/nvmath/device/common_backend.py @@ -3,22 +3,23 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Callable +import weakref import numpy as np from nvmath.device.common_cuda import ISAVersion -from .types import np_float16x2, np_float16x4 +from .types import complex32, complex64, complex128, half2, half4 from nvmath.bindings import mathdx MATHDX_TYPES_TO_NP = { mathdx.CommondxValueType.R_16F: np.float16, - mathdx.CommondxValueType.R_16F2: np_float16x2, + mathdx.CommondxValueType.R_16F2: half2, mathdx.CommondxValueType.R_32F: np.float32, mathdx.CommondxValueType.R_64F: np.float64, - mathdx.CommondxValueType.C_16F: np_float16x2, - mathdx.CommondxValueType.C_16F2: np_float16x4, - mathdx.CommondxValueType.C_32F: np.complex64, - mathdx.CommondxValueType.C_64F: np.complex128, + mathdx.CommondxValueType.C_16F: complex32, + mathdx.CommondxValueType.C_16F2: half4, + mathdx.CommondxValueType.C_32F: complex64, + mathdx.CommondxValueType.C_64F: complex128, mathdx.CommondxValueType.R_8I: np.int8, mathdx.CommondxValueType.R_16I: np.int16, mathdx.CommondxValueType.R_32I: np.int32, @@ -66,16 +67,8 @@ class DescriptorWrapper: def __init__(self, descriptor, destructor): self.descriptor = descriptor self._destructor = destructor - - def __del__(self): - if not self.descriptor: - return - - self._destructor(self.descriptor) - - # Safety clean up - self.descriptor = None - self._destructor = None + if destructor is not None: + weakref.finalize(self, self._destructor, self.descriptor) def get_lto(code_descriptor: int) -> bytes: diff --git a/nvmath/device/common_cuda.py b/nvmath/device/common_cuda.py index 833bdcd..2be888f 100644 --- a/nvmath/device/common_cuda.py +++ b/nvmath/device/common_cuda.py @@ -2,7 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 -__all__ = ["current_device_lto", "ComputeCapability", "CodeType", "ISAVersion", "Code", "Dim3", "MAX_SUPPORTED_CC"] +__all__ = [ + "current_device_lto", + "current_device_sm", + "ComputeCapability", + "CodeType", + "ISAVersion", + "Code", + "Dim3", + "MAX_SUPPORTED_CC", +] from typing import NamedTuple from cuda.bindings import runtime as cudart, driver as cudadrv @@ -62,6 +71,7 @@ def __str__(self): pass +MIN_SUPPORTED_CC = ComputeCapability(7, 0) MAX_SUPPORTED_CC = ComputeCapability(12, 1) @@ -154,3 +164,15 @@ def current_device_lto(): device. """ return get_default_code_type() + + +def current_device_sm() -> ComputeCapability: + """ + A helper function to get the default SM for mathdx types on the + current device. + + Returns: + A :class:`ComputeCapability` object representing the default SM for + mathdx types on the current device. 
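The `DescriptorWrapper` change in the common_backend.py hunk above swaps a hand-written `__del__` for `weakref.finalize`, which runs the destructor at most once and is still invoked at interpreter shutdown. A minimal standalone sketch of that pattern (generic Python, using a made-up handle and destructor purely for illustration):

```python
import weakref

class HandleWrapper:
    """Owns an opaque handle and registers its cleanup at construction time."""

    def __init__(self, handle, destructor):
        self.handle = handle
        if destructor is not None:
            # finalize() holds no strong reference to self, so the wrapper can
            # still be garbage-collected; the callback fires at most once.
            self._finalizer = weakref.finalize(self, destructor, handle)

    def close(self):
        # Optional eager cleanup; calling the finalizer again is a no-op.
        self._finalizer()

w = HandleWrapper(42, lambda h: print(f"destroying handle {h}"))
w.close()   # prints once
del w       # nothing further happens
```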
+ """ + return get_current_device_cc() diff --git a/nvmath/device/common_mathdx.py b/nvmath/device/common_mathdx.py index 7db9e24..8875101 100644 --- a/nvmath/device/common_mathdx.py +++ b/nvmath/device/common_mathdx.py @@ -2,99 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from importlib.metadata import files, PackageNotFoundError -import os -import platform -import re import sys -import warnings - -from nvmath._utils import get_nvrtc_version - - -CUDA_HOME = None -CURAND_HOME = None - PLATFORM_LINUX = sys.platform.startswith("linux") PLATFORM_WIN = sys.platform.startswith("win32") - - -def conda_get_target_name(): - if PLATFORM_LINUX: - plat = platform.processor() - if plat == "aarch64": - return "sbsa-linux" - else: - return f"{plat}-linux" - elif PLATFORM_WIN: - return "x64" - else: - raise AssertionError() - - -def check_cuda_home(): - # We need some CUDA headers for compiling mathDx headers. - # We assume users properly managing their local envs (ex: no mix-n-match). - global CUDA_HOME - global CURAND_HOME - - # Try wheel - try: - major, _, _ = get_nvrtc_version() - # We need CUDA 12+ for device API support - cudart = files("nvidia-cuda-runtime-cu12" if major == 12 else "nvidia-cuda-runtime") - cccl = files("nvidia-cuda-cccl-cu12" if major == 12 else "nvidia-cuda-cccl") - curand = files("nvidia-curand-cu12" if major == 12 else "nvidia-curand") - # use cuda_fp16.h (which we need) as a proxy - cudart = [f for f in cudart if "cuda_fp16.h" in str(f)][0] - cudart = os.path.join(os.path.dirname(cudart.locate()), "..") - # use cuda/std/type_traits as a proxy - cccl = min([f for f in cccl if re.match(r".*cuda\/std\/type_traits.*", str(f))], key=lambda x: len(str(x))) - cccl = os.path.join(os.path.dirname(cccl.locate()), "../.." + ("/.." if major == 12 else "")) - curand = [f for f in curand if "curand_kernel.h" in str(f)][0] - curand = os.path.dirname(curand.locate()) - except PackageNotFoundError: - pass - except ValueError: - # cccl wheel is buggy (headers missing), skip using wheels - pass - else: - CUDA_HOME = (cudart, cccl) - CURAND_HOME = curand - return - - # Try conda - if "CONDA_PREFIX" in os.environ: - if PLATFORM_LINUX: - conda_include = os.path.join(os.environ["CONDA_PREFIX"], "targets", f"{conda_get_target_name()}", "include") - elif PLATFORM_WIN: - conda_include = os.path.join(os.environ["CONDA_PREFIX"], "Library", "include") - else: - raise AssertionError() - if os.path.isfile(os.path.join(conda_include, "cuda_fp16.h")) and os.path.isfile( - os.path.join(conda_include, "cuda/std/type_traits") - ): - CUDA_HOME = (os.path.join(conda_include, ".."),) - CURAND_HOME = os.path.join(CUDA_HOME[0], "include") - return - - # Try local - CUDA_PATH = os.environ.get("CUDA_PATH", None) - CUDA_HOME = os.environ.get("CUDA_HOME", None) - if CUDA_PATH is None and CUDA_HOME is None: - raise RuntimeError( - "cudart headers not found. Depending on how you install nvmath-python and other CUDA packages,\n" - "you may need to perform one of the steps below:\n" - " - conda install -c conda-forge cuda-cudart-dev cuda-cccl cuda-version=12\n" - " - export CUDA_HOME=/path/to/CUDA/Toolkit" - ) - elif CUDA_PATH is not None and CUDA_HOME is None: - CUDA_HOME = CUDA_PATH - elif CUDA_PATH is not None and CUDA_HOME is not None and CUDA_HOME != CUDA_PATH: - warnings.warn("Both CUDA_HOME and CUDA_PATH are set but not consistent. 
Ignoring CUDA_PATH...") - CUDA_HOME = (CUDA_HOME,) - CURAND_HOME = os.path.join(CUDA_HOME[0], "include") - - -check_cuda_home() diff --git a/nvmath/device/common_numba.py b/nvmath/device/common_numba.py index f4be9d4..7d8b8b6 100644 --- a/nvmath/device/common_numba.py +++ b/nvmath/device/common_numba.py @@ -2,14 +2,20 @@ # # SPDX-License-Identifier: Apache-2.0 +from typing import Any +from collections.abc import Callable from llvmlite import ir from numba import types -from numba.core import cgutils -from numba.extending import models, overload_attribute +from numba.core import cgutils, typing +from numba.core.base import BaseContext +from numba.extending import models, overload, overload_attribute, typeof_impl, intrinsic +from numba.core.errors import TypingError +from nvmath.bindings import mathdx import numpy as np + from .vector_types_numba import float16x2_type, float16x4_type, float32x2_type, float64x2_type -from .types import np_float16x2, np_float16x4 +from .types import np_float16x2, np_float16x4, complex32, complex64, complex128, half2, half4, Complex, Vector from .common_opaque_tensor import OpaqueTensorType NP_TYPES_TO_NUMBA_FE_TYPES = { @@ -34,6 +40,11 @@ np.float16: types.float16, np.float32: types.float32, np.float64: types.float64, + complex32: float16x2_type, + complex64: float32x2_type, + complex128: float64x2_type, + half2: float16x2_type, + half4: float16x4_type, float16x2_type: float16x2_type, float16x4_type: float16x4_type, float32x2_type: float32x2_type, @@ -49,16 +60,108 @@ } -def make_dx_codegen_one_arg(context, builder: ir.IRBuilder, type_, arg): - # mathdx expects everything to be passed by pointers. - # Arrays are lowered by passing a pointer to the data - # arg can be None, in which case a null pointer is passed - if isinstance(type_, OpaqueTensorType): +class EmptyStructModel(models.StructModel): + """Data model that does not take space. 
Intended to be used with types that + are presented only at typing stage and not represented in memory.""" + + def __init__(self, dmm, fe_type): + members = [] + super().__init__(dmm, fe_type, members) + + +def overload_type_attribute(numba_type, attribute_base, attribute): + """Make type attribute available inside jitted code.""" + assert issubclass(numba_type, types.Type) + + @overload_attribute(numba_type, attribute, jit_options={"forceinline": True}, target="cuda") + def ol_blas_attribute(blas_numba): + tp = blas_numba + if attribute_base != "": + tp = getattr(tp, attribute_base) + val = getattr(tp, attribute) + return lambda blas_numba: val + + +@typeof_impl.register(Complex) +def typeof_complex(val: Complex, c: typing.Context) -> Any: + if val.real_dtype == np.float16: + return types.NumberClass(float16x2_type) + elif val.real_dtype == np.float32: + return types.NumberClass(float32x2_type) + elif val.real_dtype == np.float64: + return types.NumberClass(float64x2_type) + + raise RuntimeError(f"Unsupported complex real dtype {val.real_dtype}") + + +@typeof_impl.register(Vector) +def typeof_vector(val: Vector, c: typing.Context) -> Any: + if val.real_dtype != np.float16 or val.size not in (2, 4): + raise RuntimeError(f"Unsupported vector type {val.real_dtype}x{val.size}") + + return types.NumberClass(float16x2_type if val.size == 2 else float16x4_type) + + +@intrinsic +def get_array_ptr(typingctx: typing.Context, arr): + """Get raw pointer to the data of a Numba array.""" + assert isinstance(arr, types.Array) + + sig = typing.signature(types.CPointer(arr.dtype), arr) + + def codegen(context: BaseContext, builder, sig, args): + arrTy = sig.args[0] + arr = args[0] + + dtype = arrTy.dtype + valueTy = context.get_value_type(dtype) + ptrTy = ir.PointerType(valueTy) + if arr is None: + ptr = ptrTy(None) + else: + ptr = cgutils.create_struct_proxy(arrTy)(context, builder, arr).data + + # Future release of numba-cuda may have support for address spaces. + # It is not supported to pass a non generic pointer to device function + # call. 
+ if ptr.type.addrspace != 0: + ptr = builder.addrspacecast(ptr, ir.PointerType(ptr.type.pointee), "generic") + + return ptr + + return sig, codegen + + +@intrinsic +def get_value_ptr(typingctx: typing.Context, value): + """Get raw pointer to the value.""" + if value not in [float16x2_type, float16x4_type, float32x2_type, float64x2_type] and not isinstance( # noqa: UP038 + value, (types.Float, types.Complex, types.Integer) + ): + raise TypingError(f"get_value_ptr does not support type {value}") + + sig = typing.signature(types.CPointer(value), value) + + def codegen(context: BaseContext, builder, sig, args): + return cgutils.alloca_once_value(builder, args[0]) + + return sig, codegen + + +@intrinsic +def get_opaque_tensor(typingctx: typing.Context, value: OpaqueTensorType): + """Get raw pointer to the value.""" + if not isinstance(value, OpaqueTensorType): + raise TypingError(f"get_opaque_tensor does not support type {value}") + + sig = typing.signature(value._capi_type, value) + + def codegen(context: BaseContext, builder, sig, args): ptrTy = ir.PointerType(ir.IntType(8)) ldTy = ir.IntType(64) - opaque_tensor = cgutils.create_struct_proxy(type_)(context, builder, arg) - ptr = cgutils.create_struct_proxy(type_.buffer_type)(context, builder, opaque_tensor.buffer).data + opaque_tensor = cgutils.create_struct_proxy(value)(context, builder, args[0]) + ptr = cgutils.create_struct_proxy(value.buffer_type)(context, builder, opaque_tensor.buffer).data # Future release of numba-cuda may have support for address spaces. # It is not supported to pass a non generic pointer to device function @@ -68,92 +171,70 @@ def make_dx_codegen_one_arg(context, builder: ir.IRBuilder, type_, arg): ptr = builder.bitcast(ptr, ptrTy) - layout = cgutils.create_struct_proxy(type_.layout)(context, builder, opaque_tensor.layout) + layout = cgutils.create_struct_proxy(value.layout)(context, builder, opaque_tensor.layout) member_values = [ptr] - member_types = [ptrTy] - if type_.layout.dynamic_ld: - member_types += [ldTy] + if value.layout.dynamic_ld: ld = builder.bitcast(layout.leading_dimension, ldTy) member_values += [ld] - structTy = ir.LiteralStructType(member_types) - val = cgutils.pack_struct(builder, member_values) + return cgutils.pack_struct(builder, member_values) - return (structTy, val) - elif isinstance(type_, types.Array): - dtype = type_.dtype - valueTy = context.get_value_type(dtype) - ptrTy = ir.PointerType(valueTy) - if arg is None: - ptr = ptrTy(None) - else: - ptr = cgutils.create_struct_proxy(type_)(context, builder, arg).data + return sig, codegen - # Future release of numba-cuda may have support for address spaces. - # It is not supported to pass a non generic pointer to device function - # call. - if ptr.type.addrspace != 0: - ptr = builder.addrspacecast(ptr, ir.PointerType(ptr.type.pointee), "generic") - return (ptrTy, ptr) - # Integers are passed as-pointers - # arg can be None, in which case a 0 is passed - elif isinstance(type_, types.Integer): - intTy = context.get_value_type(type_) - if arg is None: - val = intTy(0) - else: - val = arg - ptrTy = ir.PointerType(intTy) - ptr = cgutils.alloca_once_value(builder, val) - return (ptrTy, ptr) - # Floats and Complex are passed by reference (pointer) This is because some CUDA C++ - # types, such as __half2 are non-trivial, and those must be passed by reference. For - # consistency we pass everything by reference. 
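The replacement code above (`get_array_ptr`, `get_value_ptr`, `get_opaque_tensor`) moves the argument lowering into Numba `@intrinsic` helpers: a typing step that returns a signature plus a `codegen` callback emitting LLVM IR. For readers unfamiliar with that extension point, here is a small self-contained CPU-side intrinsic in the same style (illustrative only, not part of nvmath-python):

```python
import numpy as np
from numba import njit, types
from numba.core import cgutils
from numba.extending import intrinsic

@intrinsic
def address_of_first_element(typingctx, arr):
    # Typing step: accept any Numba array and return its data pointer as uintp.
    if not isinstance(arr, types.Array):
        raise TypeError("expected an array")
    sig = types.uintp(arr)

    def codegen(context, builder, signature, args):
        # Lowering step: unpack the array struct and cast its data pointer to an int.
        ary = cgutils.create_struct_proxy(signature.args[0])(context, builder, args[0])
        return builder.ptrtoint(ary.data, context.get_value_type(types.uintp))

    return sig, codegen

@njit
def first_addr(a):
    return address_of_first_element(a)

print(hex(first_addr(np.arange(4.0))))
```

The CUDA-side intrinsics in this patch follow the same shape, with the extra address-space cast needed before handing pointers to mathdx device functions.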
- elif type_ in [float16x2_type, float16x4_type, float32x2_type, float64x2_type] or isinstance( # noqa: UP038 - type_, (types.Float, types.Complex) - ): - assert arg is not None - valueTy = context.get_value_type(type_) - ptrTy = ir.PointerType(valueTy) - ptr = cgutils.alloca_once_value(builder, arg) - return (ptrTy, ptr) - else: - raise RuntimeError(f"Unsupported lowering for type {type_} for arg {arg}") +_cabi_device_registry: dict[str | tuple[str, typing.Signature], Callable] = {} -def make_function_call(symbol): - def codegen(context, builder, sig, args): - assert len(sig.args) == len(args) - argsTyAndArgs = [make_dx_codegen_one_arg(context, builder, t, a) for (t, a) in zip(sig.args, args, strict=True)] - argsTy = [t for (t, _) in argsTyAndArgs] - args = [v for (_, v) in argsTyAndArgs] - retTy = context.get_value_type(sig.return_type) - fnTy = ir.FunctionType(retTy, argsTy) - fn = cgutils.get_or_insert_function(builder.module, fnTy, symbol) - builder.call(fn, args) +def declare_cabi_device(symbol: str, sig: typing.Signature, link=None): + """declare_cabi_device is an analog of cuda.declare_device but uses C ABI + calling convention instead of Numba ABI calling convention. It means that + the first argument is not a return value pointer.""" + key: str | tuple[str, typing.Signature] = symbol + if mathdx.get_version_ex() < (0, 3, 0): + key = (symbol, sig) + device_func = _cabi_device_registry.get(key) + if device_func is None: + device_func = _declare_cabi_device(symbol, sig, link) + _cabi_device_registry[key] = device_func - return codegen + return device_func -class EmptyStructModel(models.StructModel): - """Data model that does not take space. Intended to be used with types that - are presented only at typing stage and not represented in memory.""" +def _declare_cabi_device(symbol: str, sig: typing.Signature, link=None): + intrinsic_sig = sig.return_type(types.Tuple(sig.args)) - def __init__(self, dmm, fe_type): - members = [] - super().__init__(dmm, fe_type, members) + @intrinsic + def call_device(typingctx: typing.Context, args): + def codegen(context: BaseContext, builder, sig: typing.Signature, args): + if link is not None: + context.active_code_library.add_linking_file(link) + args = cgutils.unpack_tuple(builder, args[0]) + argTypes = sig.args[0] -def overload_type_attribute(numba_type, attribute_base, attribute): - """Make type attribute available inside jitted code.""" - assert issubclass(numba_type, types.Type) + assert len(args) == len(argTypes) - @overload_attribute(numba_type, attribute, jit_options={"forceinline": True}, target="cuda") - def ol_blas_attribute(blas_numba): - tp = blas_numba - if attribute_base != "": - tp = getattr(tp, attribute_base) - val = getattr(tp, attribute) - return lambda blas_numba: val + retTy = context.get_value_type(sig.return_type) + fnTy = ir.FunctionType(retTy, [context.get_value_type(argTy) for argTy in argTypes]) + fn = cgutils.get_or_insert_function(builder.module, fnTy, symbol) + builder.call(fn, args) + + return intrinsic_sig, codegen + + def device_func(): + pass + + @overload(device_func, jit_options={"forceinline": True}, target="cuda") + def ol_device_func(*args): + if len(args) != len(sig.args): + raise RuntimeError( + f"Invalid number of arguments for device function {symbol}: expected {len(sig.args)}, got {len(args)}" + ) + for expected_t, provided_t in zip(sig.args, args, strict=True): + if expected_t == provided_t: + continue + raise RuntimeError(f"Invalid argument type for device function {symbol}: expected {expected_t}, got 
{provided_t}") + return lambda *args: call_device(tuple(args)) + + return device_func diff --git a/nvmath/device/common_opaque_tensor.py b/nvmath/device/common_opaque_tensor.py index 51b81e2..7342b31 100644 --- a/nvmath/device/common_opaque_tensor.py +++ b/nvmath/device/common_opaque_tensor.py @@ -89,6 +89,10 @@ def ndim(self): def dtype(self): return self.buffer_type.dtype + @cached_property + def _capi_type(self): + return OpaqueTensorCType(self) + @register_model(OpaqueTensorType) class OpaqueTensorModel(models.StructModel): @@ -104,6 +108,23 @@ def __init__(self, dmm, fe_type: OpaqueTensorType): make_attribute_wrapper(OpaqueTensorType, "layout", "layout") +class OpaqueTensorCType(types.Type): + def __init__(self, tensor: OpaqueTensorType): + super().__init__(f"OpaqueTensorC(tensor={tensor})") + self._tensor = tensor + + +@register_model(OpaqueTensorCType) +class OpaqueTensorCModel(models.StructModel): + def __init__(self, dmm, fe_type: OpaqueTensorCType): + members = [ + ("ptr", types.voidptr), + ] + if fe_type._tensor.layout.dynamic_ld: + members += [("ld", types.int64)] + models.StructModel.__init__(self, dmm, fe_type, members) + + @type_callable(OpaqueTensor) def type_callable_OpaqueTensor(context): def typer(buffer_ty, layout_ty): @@ -126,3 +147,47 @@ def impl_interval(context: BaseContext, builder: IRBuilder, sig, args): opaque_tensor.layout = layout return opaque_tensor._getvalue() + + +class PartitionerType(types.Type): + """ + Type class associated with partitioners. + """ + + +class PartitionType(types.Type): + """ + Type class associated with partitions. + """ + + def __init__(self, partitioner: PartitionerType, tensor: OpaqueTensorType): + assert isinstance(partitioner, PartitionerType) + assert isinstance(tensor, OpaqueTensorType) + + super().__init__(f"PartitionType(partitioner={partitioner}, tensor={tensor})") + self._partitioner = partitioner + self._tensor = tensor + + @property + def tensor(self) -> OpaqueTensorType: + return self._tensor + + @property + def partitioner(self) -> PartitionerType: + return self._partitioner + + +@register_model(PartitionerType) +class PartitionerModel(models.StructModel): + def __init__(self, dmm, fe_type: PartitionerType): + models.StructModel.__init__(self, dmm, fe_type, []) + + +@register_model(PartitionType) +class PartitionModel(models.StructModel): + def __init__(self, dmm, fe_type: PartitionType): + members = [ + ("partitioner", fe_type.partitioner), + ("tensor", fe_type.tensor), + ] + models.StructModel.__init__(self, dmm, fe_type, members) diff --git a/nvmath/device/cublasdx.py b/nvmath/device/cublasdx.py index c9b1b5f..4a7ac99 100644 --- a/nvmath/device/cublasdx.py +++ b/nvmath/device/cublasdx.py @@ -2,40 +2,43 @@ # # SPDX-License-Identifier: Apache-2.0 -__all__ = ["matmul", "TransposeMode", "BlasOptions"] +__all__ = ["matmul", "TransposeMode", "Matmul", "SharedStorageCalc"] from functools import cached_property import itertools from collections.abc import Sequence -import math import re -from typing import overload +from typing import Any, overload from warnings import warn -import weakref from .common import ( Layout, - make_binary_tempfile, - delete_binary_tempfiles, + Partitioner, + check_code_type, check_in, SHARED_DEVICE_DOCSTRINGS, pad_or_truncate, + parse_sm, ) from .common_backend import MATHDX_TYPES_TO_NP, get_isa_version, get_lto -from .common_cuda import MAX_SUPPORTED_CC, get_default_code_type, ComputeCapability, Code, CodeType, Dim3 -from .common_numba import NP_TYPES_TO_NUMBA_FE_TYPES +from .common_cuda import ( + 
Code, + CodeType, + Dim3, +) from .cublasdx_backend import ( Alignment, Arrangement, Precision, generate_MM, generate_code, - generate_code_tensors, - generate_copy_wait_lto, + generate_function_code, + generate_tensor, generate_tensors, + get_function_code, get_str_trait, get_int_traits, - get_tensor_int_traits, + get_tensor_traits, validate, LeadingDimension, TransposeMode, @@ -154,12 +157,14 @@ def get(self): @docstring_decorator(CUBLASDX_DOCSTRING, skip_missing=False) -class BlasOptions: +class Matmul: """ - A class that encapsulates a partial BLAS device function. A partial device function can - be queried for available or optimal values for some knobs (such as `leading_dimension` - or `block_dim`). It does not contain a compiled, ready-to-use, device function until - finalized using :meth:`create`. + A class that encapsulates a partial Matmul device function. A partial device function + can be queried for available or optimal values for some knobs (such as + `leading_dimension` or `block_dim`). + + .. versionchanged:: 0.7.0 + `Matmul` has replaced `BlasOptions` and `BlasOptionsComplete`. Args: size: {size} @@ -168,7 +173,7 @@ class BlasOptions: data_type: {data_type} - code_type (CodeType): {code_type} + sm (ComputeCapability): {sm} block_size (int): {block_size} @@ -186,17 +191,19 @@ class BlasOptions: execution (str): {execution} - execute_api: + execute_api (str): {execute_api} + .. versionchanged:: 0.5.0 - execute_api is not part of the Blas type. Pass this argument to - :py:func:`nvmath.device.matmul` instead. + execute_api is not part of the Matmul (ex. Blas) type. Pass this + argument to :py:func:`nvmath.device.matmul` instead. + + tensor_types (Sequence[str]): {tensor_types} - tensor_types: .. versionchanged:: 0.5.0 - tensor_types is not part of the Blas type. Pass this argument to - :py:func:`nvmath.device.matmul` instead. + tensor_types is not part of the Matmul (ex. Blas) type. Pass + this argument to :py:func:`nvmath.device.matmul` instead. - See Also: + .. seealso:: The attributes of this class provide a 1:1 mapping with the CUDA C++ cuBLASDx APIs. For further details, please refer to `cuBLASDx documentation `_. 
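# Illustrative sketch (the argument values below are assumptions chosen for
# illustration; see the docstring above for the full parameter list): a Matmul
# is a partial description that can be queried for derived knobs before any
# device code is compiled.
#
#   MM = Matmul(size=(32, 16, 64), precision=np.float32, data_type="real",
#               sm=ComputeCapability(8, 0),
#               arrangement=("col_major", "col_major", "col_major"),
#               block_dim="suggested", execution="Block")
#   MM.block_dim          # Dim3 suggested for this problem
#   MM.leading_dimension  # LeadingDimension resolved from the descriptor
#   MM.a_value_type       # value type of the A operand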
@@ -208,7 +215,7 @@ def __init__( precision, data_type, *, - code_type=None, + sm=None, block_size=None, block_dim=None, leading_dimension=None, @@ -219,19 +226,7 @@ def __init__( static_block_dim=False, execution="Block", ): - if not isinstance(code_type, Sequence) or len(code_type) != 2: - raise ValueError(f"code_type should be an instance of CodeType or a 2-tuple ; got code_type = {code_type}") - code_type = CodeType(code_type[0], ComputeCapability(*code_type[1])) - if code_type.cc.major < 7: - raise RuntimeError( - f"Minimal compute capability 7.0 is required by cuBLASDx, got {code_type.cc.major}.{code_type.cc.minor}" - ) - if (code_type.cc.major, code_type.cc.minor) > MAX_SUPPORTED_CC: - raise RuntimeError( - "The maximum compute capability currently supported by device " - f"APIs is {MAX_SUPPORTED_CC}, " - f"got {code_type.cc.major}.{code_type.cc.minor}" - ) + sm = parse_sm(sm) if transpose_mode is not None: warn( @@ -302,7 +297,7 @@ def __init__( transpose_mode=transpose_mode, arrangement=arrangement, alignment=alignment, - code_type=code_type, + sm=sm, leading_dimension=leading_dimension, block_dim=block_dim, function=function, @@ -320,7 +315,7 @@ def __init__( self._transpose_mode = transpose_mode self._arrangement = arrangement self._alignment = alignment - self._code_type = code_type + self._sm = sm self._block_dim = block_dim self._function = function self._execution = execution @@ -337,6 +332,10 @@ def __init__( if block_dim == "suggested": self._block_dim = self._suggested_block_dim + @cached_property + def _traits(self): + return _MatmulTraits(self) + @property def precision(self) -> Precision: return self._precision @@ -364,11 +363,13 @@ def arrangement(self) -> Arrangement: @property def alignment(self) -> Alignment: + if self._alignment is None: + return self._traits.alignment return self._alignment @property - def code_type(self): - return self._code_type + def sm(self): + return self._sm @property def function(self) -> str: @@ -376,10 +377,12 @@ def function(self) -> str: @property def block_size(self) -> int: - return self._block_dim[0] * self._block_dim[1] * self._block_dim[2] + return self.block_dim[0] * self.block_dim[1] * self.block_dim[2] @property def block_dim(self) -> Dim3: + if self._block_dim is None: + return self._traits.block_dim return self._block_dim @property @@ -388,6 +391,8 @@ def static_block_dim(self) -> bool: @property def leading_dimension(self) -> LeadingDimension: + if self._leading_dimension is None: + return self._traits.leading_dimension return self._leading_dimension # @@ -397,7 +402,11 @@ def leading_dimension(self) -> LeadingDimension: def valid(self, *knobs): return itertools.product(*[self._valid(knob) for knob in knobs]) + @deprecated("definition is deprecated and may be removed in future versions") def definition(self): + """ + .. deprecated:: 0.7.0 + """ dd = { "size": self.size, "precision": self.precision, @@ -405,7 +414,7 @@ def definition(self): "transpose_mode": self.transpose_mode, "arrangement": self.arrangement, "alignment": self.alignment, - "code_type": self.code_type, + "sm": self.sm, "block_dim": self.block_dim, "static_block_dim": self.static_block_dim, "function": self.function, @@ -414,10 +423,31 @@ def definition(self): } return dd - def create(self, **kwargs): + @deprecated("create is deprecated and may be removed in future versions. 
Use `functools.partial` instead") + def create( + self, code_type=None, compiler=None, execute_api=None, tensor_types=None, global_memory_alignment=None, **kwargs + ): + """ + Creates a copy of the instance with provided arguments updated. + + .. deprecated:: 0.7.0 + Please use :py:func:`functools.partial` instead. + """ + if code_type is not None: + DeprecationWarning("code_type is deprecated and will be removed in future releases. It is no longer needed.") + if compiler is not None: + DeprecationWarning("compiler is deprecated and will be removed in future releases. It is no longer needed.") + if execute_api is not None: + DeprecationWarning("execute_api is deprecated and will be removed in future releases. It is no longer needed.") + if tensor_types is not None: + DeprecationWarning("tensor_types is deprecated and will be removed in future releases. It is no longer needed.") + if global_memory_alignment is not None: + DeprecationWarning( + "global_memory_alignment is deprecated and will be removed in future releases. It is no longer needed." + ) dd = self.definition() dd.update(**kwargs) - return matmul(**dd) + return Matmul(**dd) # # Private implementations @@ -431,17 +461,13 @@ def _valid(self, knob): @cached_property def _suggested_leading_dimension(self): - if self.code_type is None: - raise ValueError("leading_dimension='suggested' require code_type to be set.") - if self.execution != "Block": - raise ValueError("leading_dimension='suggested' require execution to be 'Block'.") # Generate special PTX for suggested_leading_dimension_of descriptor = generate_MM( size=self.size, function=self.function, precision=self.precision, data_type=self.data_type, - code_type=self.code_type, + sm=self.sm, transpose_mode=self._transpose_mode, arrangement=self._arrangement, alignment=self._alignment, @@ -455,17 +481,13 @@ def _suggested_leading_dimension(self): @cached_property def _suggested_block_dim(self): - if self.code_type is None: - raise ValueError("block_dim='suggested' require code_type to be set.") - if self.execution != "Block": - raise ValueError("block_dim='suggested' require execution to be 'Block'.") # Generate full PTX descriptor = generate_MM( size=self.size, function=self.function, precision=self.precision, data_type=self.data_type, - code_type=self.code_type, + sm=self.sm, transpose_mode=self._transpose_mode, arrangement=self._arrangement, alignment=self._alignment, @@ -477,85 +499,22 @@ def _suggested_block_dim(self): return Dim3(*get_int_traits(descriptor.descriptor, mathdx.CublasdxTraitType.SUGGESTED_BLOCK_DIM, 3)) - -# -# A complete set of knobs, ie sufficient to generate a device functions and query all traits -# Not exposed to end users -# -class BlasOptionsComplete(BlasOptions): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - if self.code_type is None: - raise NotImplementedError(f"code_type should be set, but got code_type = {self.code_type}") - if self.execution != "Block": - raise NotImplementedError(f"Only execution=Block is implemented ; got execution = {self.execution}") - - (m, n, k) = self.size - - h = generate_MM( - size=self.size, - function=self.function, - precision=self.precision, - data_type=self.data_type, - code_type=self.code_type, - transpose_mode=self._transpose_mode, - arrangement=self.arrangement, - alignment=self._alignment, - block_dim=self.block_dim, - static_block_dim=self._static_block_dim, - leading_dimension=self._leading_dimension, - execution=self.execution, - ).descriptor - - self._value_types = tuple(MATHDX_TYPES_TO_NP[vt] 
for vt in get_int_traits(h, mathdx.CublasdxTraitType.VALUE_TYPE, 3)) - self._leading_dimension = LeadingDimension(*get_int_traits(h, mathdx.CublasdxTraitType.LEADING_DIMENSION, 3)) - self._block_dim = Dim3(*get_int_traits(h, mathdx.CublasdxTraitType.BLOCK_DIM, 3)) - self._alignment = Alignment(*get_int_traits(h, mathdx.CublasdxTraitType.ALIGNMENT, 3)) - - self._a_dim = (m, k) - self._b_dim = (k, n) - self._c_dim = (m, n) - - if self._transpose_mode is not None: - if self._transpose_mode.a in {"transposed", "conj_transposed"}: - self._a_dim = self._a_dim[::-1] - if self._transpose_mode.b in {"transposed", "conj_transposed"}: - self._b_dim = self._b_dim[::-1] - - [self._a_size, self._b_size, self._c_size] = self._calculate_abc_sizes(self._leading_dimension) - - self._max_threads_per_block = self._block_dim.x * self._block_dim.y * self._block_dim.z - - def _calculate_abc_sizes(self, ld: LeadingDimension) -> tuple[int, int, int]: - assert isinstance(ld, LeadingDimension) - if self._transpose_mode: - non_ld = (self._a_dim[1], self._b_dim[1], self._c_dim[1]) - elif self._arrangement: - non_ld = ( - self._a_dim[1 if self._arrangement.a == "col_major" else 0], - self._b_dim[1 if self._arrangement.b == "col_major" else 0], - self._c_dim[1 if self._arrangement.c == "col_major" else 0], - ) - - return tuple(x * y for x, y in zip(ld, non_ld, strict=True)) - @property def a_value_type(self): - return self._value_types[0] + return self._traits.value_types[0] @property def b_value_type(self): - return self._value_types[1] + return self._traits.value_types[1] @property def c_value_type(self): - return self._value_types[2] + return self._traits.value_types[2] @property @deprecated("value_type trait is deprecated. Please use {a|b|c}_value_type instead") def value_type(self): - if not all(vt == self._value_types[0] for vt in self._value_types): + if not all(vt == self._traits.value_types[0] for vt in self._traits.value_types): raise RuntimeError("value_type may be used only if all {a|b|c}_value_type have the same type") return self.a_value_type @@ -571,33 +530,58 @@ def input_type(self): def output_type(self): return self.c_value_type - @property + @cached_property def a_dim(self): - return self._a_dim + (m, _, k) = self.size - @property + dim = (m, k) + if self._transpose_mode is not None and self._transpose_mode.a in {"transposed", "conj_transposed"}: + dim = dim[::-1] + + return dim + + @cached_property def b_dim(self): - return self._b_dim + (_, n, k) = self.size - @property + dim = (k, n) + if self._transpose_mode is not None and self._transpose_mode.b in {"transposed", "conj_transposed"}: + dim = dim[::-1] + + return dim + + @cached_property def c_dim(self): - return self._c_dim + (m, n, _) = self.size + return (m, n) - @property - def leading_dimension(self): - return self._leading_dimension + def _calculate_abc_sizes(self, ld: LeadingDimension) -> tuple[int, int, int]: + if self._transpose_mode: + non_ld = (self.a_dim[1], self.b_dim[1], self.c_dim[1]) + elif self._arrangement: + non_ld = ( + self.a_dim[1 if self._arrangement.a == "col_major" else 0], + self.b_dim[1 if self._arrangement.b == "col_major" else 0], + self.c_dim[1 if self._arrangement.c == "col_major" else 0], + ) + + return tuple(x * y for x, y in zip(ld, non_ld, strict=True)) + + @cached_property + def _abc_sizes(self): + return self._calculate_abc_sizes(self.leading_dimension) @property def a_size(self): - return self._a_size + return self._abc_sizes[0] @property def b_size(self): - return self._b_size + return self._abc_sizes[1] 
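    # Worked example of the size computation above (assumed values, for
    # illustration): with size = (m, n, k) = (64, 32, 16) and
    # arrangement.a == "col_major", a_dim == (64, 16), the non-leading extent
    # is a_dim[1] == k == 16, so with leading_dimension.a == 64 this yields
    # a_size == 64 * 16 == 1024 elements.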
@property def c_size(self): - return self._c_size + return self._abc_sizes[2] @property @deprecated( @@ -611,18 +595,18 @@ def shared_memory_size(self): @property def max_threads_per_block(self): - return self._max_threads_per_block + return self.block_dim.x * self.block_dim.y * self.block_dim.z def _get_shared_storage_size(self, *args, ab=False) -> int | None: # type: ignore # Complex will be over-aligned (eg: f32x2 complex is aligned on 8B) with # this logic (which is what we want - for performance and vectorization) - item_sizes = tuple(numpy.dtype(vt).itemsize for vt in self._value_types) + item_sizes = tuple(numpy.dtype(vt).itemsize for vt in self._traits.value_types) alignment = self.alignment sizes = None if len(args) == 0: - sizes = (self._a_size, self._b_size, self._c_size) + sizes = (self.a_size, self.b_size, self.c_size) elif all(isinstance(arg, int) for arg in args): sizes = self._calculate_abc_sizes(LeadingDimension(*pad_or_truncate(list(args), 3))) elif all(isinstance(arg, Layout) for arg in args): @@ -710,208 +694,157 @@ def suggest_layout_smem_c(self) -> Layout: def suggest_layout_rmem_c(self) -> Layout: return _BlasLayout(self, "suggest_layout_rmem_c") + def suggest_partitioner(self) -> Partitioner: + raise RuntimeError("suggest_partitioner should not be called directly outside of a numba.cuda.jit(...) kernel.") -# -# A compiled BLAS device function, with knobs and device function -# -class BlasCompiled(BlasOptionsComplete): - def __init__(self, **kwargs): - execute_api = kwargs.pop("execute_api", "static_leading_dimensions") - tensor_types = kwargs.pop("tensor_types", None) - global_memory_alignment = kwargs.pop("global_memory_alignment", None) - - # Build set of knobs - super().__init__(**kwargs) - - if global_memory_alignment is not None: - if not isinstance(global_memory_alignment, Sequence) or len(global_memory_alignment) != 3: - raise ValueError( - "global_memory_alignment should be an instance of Alignment" - "or a 3-tuple ; " - "got global_memory_alignment = {global_memory_alignment}" - ) - global_memory_alignment = Alignment(*global_memory_alignment) - - validate_alignment( - global_memory_alignment, - self.precision, - self.data_type, - gmem=True, - ) - - validate_execute_api(execute_api) - tensors_api = execute_api == "tensors" - if tensors_api: - validate_tensor_types(tensor_types) - - self._execute_api = execute_api - self._tensor_types = tensor_types - self._global_memory_alignment = global_memory_alignment - - handle = generate_MM( - size=self.size, - function=self.function, - precision=self.precision, - data_type=self.data_type, - code_type=self.code_type, - transpose_mode=self._transpose_mode, - arrangement=self.arrangement, - alignment=self._alignment, - block_dim=self.block_dim, - static_block_dim=self._static_block_dim, - # TODO: find better way to exclude ld operator for dynamic_leading_dimensions - leading_dimension=self._leading_dimension if self._execute_api == "static_leading_dimensions" else None, - execution=self._execution, - execute_api=self._execute_api, - tensor_types=self._tensor_types, - ) - - # TODO: remove once MM.files is deprecated - self._handle = handle - - # Now compile the LTO device function - h = handle.descriptor - - if tensors_api: - self._declare_tensors(h) - - code, self._tensor_api_symbols = generate_code_tensors( - h, self.code_type.cc, self._gmem_tensors, self._target_tensors, rmem_c="rmem" in self._tensor_types[2] - ) - else: - code = generate_code(h, self.code_type.cc) - - # Compile - lto_fn = get_lto(code.descriptor) - 
isa_version = get_isa_version(code.descriptor) - - self._ltos = [Code(self.code_type, isa_version, lto_fn)] - self._symbol = get_str_trait(h, mathdx.CublasdxTraitType.SYMBOL_NAME) - - if self._tensor_types: - _, copy_wait_lto = generate_copy_wait_lto(self.code_type.cc) - self._ltos += [Code(self.code_type, isa_version, copy_wait_lto)] - - self._finalizer = weakref.finalize(self, delete_binary_tempfiles, self.files) - - def _declare_tensors(self, h): - # Complex will be over-aligned (eg: f32x2 complex is aligned on 8B) with - # this logic (which is what we want - for performance and vectorization) - item_sizes = tuple(numpy.dtype(vt).itemsize for vt in self._value_types) - - self._gmem_tensors, self._target_tensors = generate_tensors(h, self._tensor_types, self._global_memory_alignment) - self._target_tensor_sizes = get_tensor_int_traits(self._target_tensors, mathdx.CublasdxTensorTrait.STORAGE_BYTES) - for ts, _is in zip(self._target_tensor_sizes, item_sizes, strict=True): - assert ts % _is == 0 - self._target_tensor_sizes = tuple(ts // item_sizes[i] for i, ts in enumerate(self._target_tensor_sizes)) - self._gmem_tensor_uids = get_tensor_int_traits(self._gmem_tensors, mathdx.CublasdxTensorTrait.UID) - self._target_tensor_uids = get_tensor_int_traits(self._target_tensors, mathdx.CublasdxTensorTrait.UID) - - def definition(self): - dd = super().definition() - dd.update(execute_api=self.execute_api) - if self.execute_api == "tensors": - dd.update(tensor_types=self.tensor_types) - return dd + @deprecated("Calling MM(...) directly is deprecated, please use MM.execute(...) method instead.") + def __call__(self, *args): + raise RuntimeError("__call__ should not be called directly outside of a numba.cuda.jit(...) kernel.") - @cached_property - def _tempfiles(self): - """ - Create temporary files for the LTO functions. - """ - return [make_binary_tempfile(lto.data, ".ltoir") for lto in self._ltos] + def execute(self, *args): + raise RuntimeError("execute should not be called directly outside of a numba.cuda.jit(...) kernel.") @property + @deprecated("files is deprecated and is no longer required and will be removed in future releases.") def files(self) -> list[str]: """The list of binary files for the lto functions.""" - return [v.name for v in self._tempfiles] + return [] @property - def codes(self): + @deprecated("codes is deprecated and is no longer required and will be removed in future releases.") + def codes(self) -> list[Code]: """A list of :class:`Code` objects for all lto functions.""" - return self._ltos + return [] - @property - def symbol(self): - """The name of the device function.""" - return self._symbol - @property - def execute_api(self) -> str: - """ - The API used to execute the function. It defines the signature of the - LTO function. - """ - return self._execute_api +class _MatmulTraits: + def __init__(self, mm: Matmul): + h = generate_MM( + size=mm._size, + function=mm._function, + precision=mm._precision, + data_type=mm._data_type, + sm=mm._sm, + transpose_mode=mm._transpose_mode, + arrangement=mm._arrangement, + alignment=mm._alignment, + block_dim=mm._block_dim, + static_block_dim=mm._static_block_dim, + leading_dimension=mm._leading_dimension, + execution=mm._execution, + ).descriptor - @property - def tensor_types(self) -> tuple[str, str, str]: - """ - The tensor types used in the function. Defines types of the tensors for - the tensors API. 
- """ - if self.execute_api != "tensors": - raise RuntimeError("tensor_types is only available when execute_api is 'tensors'") - return self._tensor_types + self.value_types = tuple(MATHDX_TYPES_TO_NP[vt] for vt in get_int_traits(h, mathdx.CublasdxTraitType.VALUE_TYPE, 3)) + self.leading_dimension = LeadingDimension(*get_int_traits(h, mathdx.CublasdxTraitType.LEADING_DIMENSION, 3)) + self.block_dim = Dim3(*get_int_traits(h, mathdx.CublasdxTraitType.BLOCK_DIM, 3)) + self.alignment = Alignment(*get_int_traits(h, mathdx.CublasdxTraitType.ALIGNMENT, 3)) # -# A compiled BLAS device function, with knobs and device function, to be used with Numba +# A compiled BLAS device function, with knobs and device function # -class BlasNumba(BlasCompiled): - """ - A class that encapsulates a compiled BLAS device function compatible with Numba. - """ - - def __init__(self, **kwargs): - if "code_type" not in kwargs: - kwargs["code_type"] = get_default_code_type() - - # Build LTO device functions - super().__init__(**kwargs) - - self._numba_value_types = tuple(NP_TYPES_TO_NUMBA_FE_TYPES[vt] for vt in self._value_types) - - @property - def a_value_type(self): - return self._numba_value_types[0] - - @property - def b_value_type(self): - return self._numba_value_types[1] - - @property - def c_value_type(self): - return self._numba_value_types[2] - - @deprecated("Calling MM(...) directly is deprecated, please use MM.execute(...) method instead.") - def __call__(self, *args): - raise RuntimeError("__call__ should not be called directly outside of a numba.cuda.jit(...) kernel.") +def compile_blas_execute( + blas: Matmul, + code_type: Any, + execute_api: str | None = None, + tensor_types: Sequence[str] | None = None, + global_memory_alignment: Sequence[int] | None = None, +) -> tuple[Code, str]: + if global_memory_alignment is not None: + if not isinstance(global_memory_alignment, Sequence) or len(global_memory_alignment) != 3: + raise ValueError( + "global_memory_alignment should be an instance of Alignment" + "or a 3-tuple ; " + "got global_memory_alignment = {global_memory_alignment}" + ) + global_memory_alignment = Alignment(*global_memory_alignment) + + check_code_type(code_type, "cuBLASDx") + validate_execute_api(execute_api) + tensors_api = execute_api == "tensors" + if tensors_api: + validate_tensor_types(tensor_types) + + if global_memory_alignment is not None: + # Perform validation only after initialization since we need to + # know precision and data_type + validate_alignment( + global_memory_alignment, + blas.precision, + blas.data_type, + gmem=True, + ) - def execute(self, *args): - raise RuntimeError("execute should not be called directly outside of a numba.cuda.jit(...) 
kernel.") + handle = generate_MM( + size=blas.size, + function=blas.function, + precision=blas.precision, + data_type=blas.data_type, + sm=blas.sm, + transpose_mode=blas.transpose_mode, + arrangement=blas.arrangement, + alignment=blas.alignment, + block_dim=blas.block_dim, + static_block_dim=blas._static_block_dim, + # TODO: find better way to exclude ld operator for dynamic_leading_dimensions + leading_dimension=blas._leading_dimension if execute_api == "static_leading_dimensions" else None, + execution=blas._execution, + execute_api=execute_api, + tensor_types=tensor_types, + ) - @cached_property - def _copy_symbols_map(self): - if self.execute_api != "tensors": - return {} - - return { - (self._gmem_tensor_uids[0], self._target_tensor_uids[0]): self._tensor_api_symbols.copy_a, - (self._gmem_tensor_uids[1], self._target_tensor_uids[1]): self._tensor_api_symbols.copy_b, - (self._gmem_tensor_uids[2], self._target_tensor_uids[2]): self._tensor_api_symbols.copy_c, - (self._target_tensor_uids[2], self._gmem_tensor_uids[2]): self._tensor_api_symbols.copy_c_back, - } + # Now compile the LTO device function + h = handle.descriptor + + if tensors_api: + resp = generate_tensors(h, tensor_types, global_memory_alignment) + _, target_tensors = resp.gmem, resp.target + code, symbol = generate_function_code(h, mathdx.CublasdxDeviceFunctionType.EXECUTE, target_tensors, code_type.cc) + else: + code = generate_code(h, code_type.cc) + + # Compile + lto_fn = get_lto(code.descriptor) + isa_version = get_isa_version(code.descriptor) + + ltos = [Code(code_type, isa_version, lto_fn)] + + if tensor_types: + symbol = symbol + else: + symbol = get_str_trait(h, mathdx.CublasdxTraitType.SYMBOL_NAME) + + return ltos[0], symbol + + +def _blas_tensors_handle(MM: Matmul): + handle = generate_MM( + size=MM.size, + function=MM.function, + precision=MM.precision, + data_type=MM.data_type, + sm=MM.sm, + transpose_mode=MM.transpose_mode, + arrangement=MM.arrangement, + alignment=MM.alignment, + block_dim=MM.block_dim, + static_block_dim=MM._static_block_dim, + execution=MM._execution, + execute_api="tensors", + ) + return handle.descriptor @docstring_decorator(CUBLASDX_DOCSTRING, skip_missing=False) -def matmul(*, compiler=None, **kwargs): +def matmul(*, compiler=None, code_type=None, execute_api=None, tensor_types=None, global_memory_alignment=None, **kwargs): """ - Create an :class:`BlasOptions` object that encapsulates a compiled and ready-to-use + Create an :class:`Matmul` object that encapsulates a compiled and ready-to-use device function for matrix multiplication. + .. deprecated:: 0.7.0 + Args: size: {size} @@ -921,8 +854,18 @@ def matmul(*, compiler=None, **kwargs): compiler: {compiler} + .. versionchanged:: 0.7.0 + compiler is no longer needed and does not take effect. Use + :py:func:`nvmath.device.compile_blas_execute` to get device + function code. + code_type (CodeType): {code_type} + .. versionchanged:: 0.7.0 + code_type should be used by + :py:func:`nvmath.device.compile_blas_execute` and no longer + needed for numba-cuda usage. + block_size (int): {block_size} block_dim (Dim3): {block_dim} @@ -941,12 +884,28 @@ def matmul(*, compiler=None, **kwargs): execute_api (str): {execute_api} + .. versionchanged:: 0.7.0 + execute_api should be used by + :py:func:`nvmath.device.compile_blas_execute` and no longer + needed for numba-cuda usage. + tensor_types (str): {tensor_types} + .. 
versionchanged:: 0.7.0 + tensor_types should be used by + :py:func:`nvmath.device.compile_blas_execute` and no longer + needed for numba-cuda usage. + global_memory_alignment (Alignment): {global_memory_alignment} - See Also: - The attributes of :class:`BlasOptions` provide a 1:1 mapping with the CUDA C++ + .. versionchanged:: 0.7.0 + alignment should be set at :py:func:`nvmath.device.copy` + global_memory_alignment should be used by + :py:func:`nvmath.device.compile_blas_execute` for non numba-cuda + usage. Alignment should be set + + .. seealso:: + The attributes of :class:`Matmul` provide a 1:1 mapping with the CUDA C++ cuBLASDx APIs. For further details, please refer to `cuBLASDx documentation `_. @@ -996,11 +955,22 @@ def matmul(*, compiler=None, **kwargs): Further examples can be found in the `nvmath/examples/device `_ directory. """ - check_in("compiler", compiler, [None, "numba"]) - if compiler is None: - return BlasCompiled(**kwargs) - elif compiler == "numba": - return BlasNumba(**kwargs) + DeprecationWarning("matmul is deprecated and will be removed in future releases. Please use Matmul class directly.") + if code_type is not None: + DeprecationWarning("code_type is deprecated and will be removed in future releases. It is no longer needed.") + if compiler is not None: + DeprecationWarning("compiler is deprecated and will be removed in future releases. It is no longer needed.") + if execute_api is not None: + DeprecationWarning("execute_api is deprecated and will be removed in future releases. It is no longer needed.") + if tensor_types is not None: + DeprecationWarning("tensor_types is deprecated and will be removed in future releases. It is no longer needed.") + if global_memory_alignment is not None: + DeprecationWarning( + "global_memory_alignment is deprecated and will be removed in " + "future releases. It is no longer needed. Please set alignment " + "at copy()" + ) + return Matmul(**kwargs) def _parse_layout(layout: str) -> tuple[bool, str, str]: @@ -1034,52 +1004,51 @@ class _BlasLayout(Layout): _leading_dimension: int | None # Internal fields to recreate the numba layout type - _MM: BlasNumba | None + _MM: Matmul | None _layout: str # Cached fields to avoid recomputing - _is_register: bool _tensor_index: int - def __init__(self, MM: BlasOptionsComplete, layout: str, leading_dimension: int | None = None): - if not isinstance(MM, BlasCompiled): - raise ValueError("MM should be an instance of BlasCompiled, support for BlasOptionsComplete is in progress") - if MM.execute_api != "tensors": - raise ValueError(f"{layout} is only available for execute_api='tensors'") - - assert MM._tensor_types is not None + def __init__(self, MM: Matmul, layout: str, leading_dimension: int | None = None): + if not isinstance(MM, Matmul): + raise ValueError("MM should be an instance of Matmul") suggested, memory, tensor = _parse_layout(layout) self._tensor_index = ["a", "b", "c"].index(tensor) - self._size = math.prod((MM.a_dim, MM.b_dim, MM.c_dim)[self._tensor_index]) - - if memory == "g": - self._uid = MM._gmem_tensor_uids[self._tensor_index] - self._cosize = self._size - else: - tensor_type = f"{memory}mem_{tensor}" + tensor_type = f"{memory}mem_{tensor}" - if suggested: - tensor_type = "suggested_" + tensor_type + if suggested: + tensor_type = "suggested_" + tensor_type - if tensor_type not in set(MM._tensor_types): - raise ValueError(f"Invalid layout {layout} for tensor {tensor_type}. 
Available layouts are {MM._tensor_types}") - self._uid = MM._target_tensor_uids[self._tensor_index] - self._cosize = MM._target_tensor_sizes[self._tensor_index] + self._dtype = MM._traits.value_types[self._tensor_index] + itemsize = numpy.dtype(self._dtype).itemsize - if memory == "r": - # for register memory, we are using fragment so it does not have - # any gaps and contain only small chank, so dimension production - # does not apply - self._size = self._cosize + if memory == "g": + self._uid = -1 # gmem tensors at this stage do not have a uid + self._size = (MM.a_size, MM.b_size, MM.c_size)[self._tensor_index] + storage_size = self._size * itemsize + else: + th = generate_tensor(_blas_tensors_handle(MM), tensor_type) + self._uid, self._size, storage_size = get_tensor_traits(th.descriptor) + assert storage_size % itemsize == 0 + self._cosize = storage_size // itemsize + if mathdx.get_version_ex() < (0, 3, 0): + self._size = (MM.a_size, MM.b_size, MM.c_size)[self._tensor_index] + if memory == "r": + self._size = self._cosize - self._is_register = memory == "r" self._dynamic_ld = memory == "g" # dynamic ld only global memory - self._MM = MM if isinstance(MM, BlasNumba) else None + self._MM = MM self._layout = layout + self._tensor_type = tensor_type self._leading_dimension = leading_dimension + @property + def dtype(self): + return self._dtype + @property def size(self) -> int: return self._size @@ -1087,3 +1056,120 @@ def size(self) -> int: @property def cosize(self) -> int: return self._cosize + + +def compile_blas_copy( + src_tensor: _BlasLayout, + dst_tensor: _BlasLayout, + code_type: CodeType, + alignment: int | None = None, +): + check_code_type(code_type, "cuBLASDx") + assert src_tensor._MM == dst_tensor._MM + assert src_tensor._MM is not None + + MM = src_tensor._MM + + handle = _blas_tensors_handle(MM) + src_handler = generate_tensor( + handle, src_tensor._tensor_type, gmem_alignment=alignment if "gmem" in src_tensor._tensor_type else None + ) + dst_handler = generate_tensor( + handle, dst_tensor._tensor_type, gmem_alignment=alignment if "gmem" in dst_tensor._tensor_type else None + ) + + return get_function_code( + handle, mathdx.CublasdxDeviceFunctionType.COPY, [src_handler.descriptor, dst_handler.descriptor], code_type + ) + + +def compile_blas_clear( + tensor: _BlasLayout, + code_type: CodeType, +): + check_code_type(code_type, "cuBLASDx") + assert tensor._MM is not None + + MM = tensor._MM + + handle = _blas_tensors_handle(MM) + tensor_handler = generate_tensor(handle, tensor._tensor_type) + + return get_function_code(handle, mathdx.CublasdxDeviceFunctionType.CLEAR, [tensor_handler.descriptor], code_type) + + +def compile_blas_axpby( + x_tensor: _BlasLayout, + y_tensor: _BlasLayout, + code_type: CodeType, +): + check_code_type(code_type, "cuBLASDx") + assert x_tensor._MM == y_tensor._MM + assert x_tensor._MM is not None + + MM = x_tensor._MM + + handle = _blas_tensors_handle(MM) + x_handler = generate_tensor(handle, x_tensor._tensor_type) + y_handler = generate_tensor(handle, y_tensor._tensor_type) + + return get_function_code( + handle, mathdx.CublasdxDeviceFunctionType.AXPBY, [x_handler.descriptor, y_handler.descriptor], code_type + ) + + +def _compile_blas_partitioner_function( + MM: Matmul, + code_type: CodeType, + function: mathdx.CublasdxDeviceFunctionType, +): + check_code_type(code_type, "cuBLASDx") + + handle = _blas_tensors_handle(MM) + tensor_handle = generate_tensor(handle, "suggested_rmem_c") + + return get_function_code(handle, function, 
[tensor_handle.descriptor], code_type) + + +def compile_blas_map_idx2crd_partitioner( + MM: Matmul, + code_type: CodeType, +): + return _compile_blas_partitioner_function( + MM, + code_type, + mathdx.CublasdxDeviceFunctionType.MAP_IDX2CRD_PARTITIONER, + ) + + +def compile_blas_is_thread_active( + MM: Matmul, + code_type: CodeType, +): + return _compile_blas_partitioner_function( + MM, + code_type, + mathdx.CublasdxDeviceFunctionType.IS_THREAD_ACTIVE, + ) + + +def compile_blas_is_predicated( + MM: Matmul, + code_type: CodeType, +): + return _compile_blas_partitioner_function( + MM, + code_type, + mathdx.CublasdxDeviceFunctionType.IS_PREDICATED, + ) + + +def compile_blas_is_index_in_bounds( + MM: Matmul, + code_type: CodeType, +): + return _compile_blas_partitioner_function( + MM, + code_type, + mathdx.CublasdxDeviceFunctionType.IS_INDEX_IN_BOUNDS, + ) diff --git a/nvmath/device/cublasdx_backend.py b/nvmath/device/cublasdx_backend.py index ccaead8..0477336 100644 --- a/nvmath/device/cublasdx_backend.py +++ b/nvmath/device/cublasdx_backend.py @@ -8,19 +8,22 @@ from functools import lru_cache from typing import NamedTuple, Protocol from collections.abc import Sequence +import weakref import numpy as np -from .common import check_in, check_code_type +from .common import check_in, check_sm from .common_backend import ( EXECUTION_STR_TO_MATHDX, NP_TYPES_TO_MATHDX_PRECISION, NVARG_GEN_OPT_LTO, build_get_int_traits, build_get_str_trait, + get_isa_version, + get_lto, ) -from .common_cuda import CodeType, Dim3, ComputeCapability, ISAVersion +from .common_cuda import Code, CodeType, Dim3, ComputeCapability, ISAVersion from .common_backend import DescriptorWrapper from .types import REAL_NP_TYPES, INT_NP_TYPES @@ -149,14 +152,13 @@ class CublasdxTensors(NamedTuple): c: int -class CublasdxTensorAPISymbols(NamedTuple): - copy_a: str - copy_b: str - copy_c: str - copy_c_back: str - clear_c: str - axpby: str - gemm: str +class CublasdxTensorsResponse: + gmem: CublasdxTensors + target: CublasdxTensors + + def __init__(self, gmem: CublasdxTensors, target: CublasdxTensors): + self.gmem = gmem + self.target = target MAX_ALIGNMENT = Alignment(16, 16, 16) @@ -199,7 +201,7 @@ def validate( arrangement, alignment, block_dim, - code_type, + sm, function, leading_dimension, static_block_dim, @@ -215,6 +217,7 @@ def validate( f"precision should be an instance of {Precision} or a 3-sequence, and individual fields " f"should be one of {_ACCEPTED_PRECISION}. 
Instead got precision = {precision}" ) + check_sm(sm, "sm") check_in("data_type", data_type, ["real", "complex"]) check_in("execution", execution, ["Block", "Thread"]) check_in("function", function, ["MM"]) @@ -256,8 +259,6 @@ def validate( ) else: raise ValueError(f"block_dim should be None, a Dim3 instance or 'suggested'; got block_dim = {block_dim}") - if code_type is not None: - check_code_type(code_type) if leading_dimension in (None, "suggested") or isinstance(leading_dimension, LeadingDimension): pass else: @@ -325,11 +326,11 @@ def generate_MM( transpose_mode: TransposeMode | None, arrangement: Arrangement | None, alignment: Alignment | None, - code_type: CodeType | None, + sm: ComputeCapability, block_dim: Dim3 | None, static_block_dim: bool, execution: str, - leading_dimension: LeadingDimension | None, + leading_dimension: LeadingDimension | None = None, execute_api: str | None = None, tensor_types: tuple[str, str, str] | None = None, ): @@ -342,6 +343,10 @@ def generate_MM( (m, n, k) = size + # TODO: remove once libmathdx supports it + if execute_api is None: + execute_api = "static_leading_dimensions" + if execute_api is not None: mathdx.cublasdx_set_operator_int64(h, mathdx.CublasdxOperatorType.API, _BLAS_API_STR_TO_MATHDX[execute_api]) @@ -354,17 +359,14 @@ def generate_MM( mathdx.cublasdx_set_operator_int64(h, mathdx.CublasdxOperatorType.EXECUTION, EXECUTION_STR_TO_MATHDX[execution]) mathdx.cublasdx_set_operator_int64(h, mathdx.CublasdxOperatorType.TYPE, _BLAS_TYPE_STR_TO_MATHDX[data_type]) + mathdx.cublasdx_set_operator_int64(h, mathdx.CublasdxOperatorType.SM, sm.major * 100 + sm.minor * 10) + if block_dim: mathdx.cublasdx_set_operator_int64s(h, mathdx.CublasdxOperatorType.BLOCK_DIM, 3, block_dim) if static_block_dim: mathdx.cublasdx_set_operator_int64(h, mathdx.CublasdxOperatorType.STATIC_BLOCK_DIM, 1) - if code_type: - mathdx.cublasdx_set_operator_int64( - h, mathdx.CublasdxOperatorType.SM, code_type.cc.major * 100 + code_type.cc.minor * 10 - ) - if leading_dimension: mathdx.cublasdx_set_operator_int64s( h, mathdx.CublasdxOperatorType.LEADING_DIMENSION, 3, [leading_dimension.a, leading_dimension.b, leading_dimension.c] @@ -415,15 +417,73 @@ def generate_code(handle, version: ComputeCapability, device_functions: tuple | return DescriptorWrapper(code, mathdx.commondx_destroy_code) +@lru_cache +def generate_tensor(h: int, tensor_type: str, gmem_alignment: int | None = None) -> DescriptorWrapper: + tensor = mathdx.cublasdx_create_tensor(h, _TENSOR_TYPE_STR_TO_MATHDX[tensor_type]) + + if gmem_alignment is not None: + try: + mathdx.cublasdx_set_tensor_option_int64( + tensor, + mathdx.CublasdxTensorOption.ALIGNMENT_BYTES, + gmem_alignment, + ) + except Exception: + print(f"[WARN] Failed to set {tensor_type} tensor alignment {gmem_alignment}") + + mathdx.cublasdx_finalize_tensors(h, 1, [tensor]) + + return DescriptorWrapper(tensor, mathdx.cublasdx_destroy_tensor) + + +@lru_cache +def get_tensor_traits(tensor: int) -> tuple[int, int, int]: + """Get tensor traits: (uid, logical_size, storage_bytes)""" + return ( + int(mathdx.cublasdx_get_tensor_trait_int64(tensor, mathdx.CublasdxTensorTrait.UID)), + int(mathdx.cublasdx_get_tensor_trait_int64(tensor, mathdx.CublasdxTensorTrait.LOGICAL_SIZE)) + if mathdx.get_version_ex() >= (0, 3, 0) + else 0, + int(mathdx.cublasdx_get_tensor_trait_int64(tensor, mathdx.CublasdxTensorTrait.STORAGE_BYTES)), + ) + + +def generate_function_code( + MM_handler: int, function_type: mathdx.CublasdxDeviceFunctionType, args: Sequence[int], version: ISAVersion 
+): + function_handler = mathdx.cublasdx_create_device_function(MM_handler, function_type, len(args), list(args)) + symbol = get_str_device_trait(function_handler, mathdx.CublasdxDeviceFunctionTrait.SYMBOL) + code = generate_code(MM_handler, version, (function_handler,)) + + return code, symbol + + +def get_function_code( + MM_handler: int, + function_type: mathdx.CublasdxDeviceFunctionType, + args: Sequence[int], + code_type: CodeType, +): + code, symbol = generate_function_code(MM_handler, function_type, args, code_type.cc) + + # Compile + lto_fn = get_lto(code.descriptor) + isa_version = get_isa_version(code.descriptor) + + lto = Code(code_type, isa_version, lto_fn) + + return lto, symbol + + @lru_cache def generate_tensors(h, tensor_types, gmem_alignment: Alignment | None = None): - type_mem_a = mathdx.cublasdx_bind_tensor(h, _TENSOR_TYPE_STR_TO_MATHDX[tensor_types[0]]) - type_mem_b = mathdx.cublasdx_bind_tensor(h, _TENSOR_TYPE_STR_TO_MATHDX[tensor_types[1]]) - type_mem_c = mathdx.cublasdx_bind_tensor(h, _TENSOR_TYPE_STR_TO_MATHDX[tensor_types[2]]) + type_mem_a = mathdx.cublasdx_create_tensor(h, _TENSOR_TYPE_STR_TO_MATHDX[tensor_types[0]]) + type_mem_b = mathdx.cublasdx_create_tensor(h, _TENSOR_TYPE_STR_TO_MATHDX[tensor_types[1]]) + type_mem_c = mathdx.cublasdx_create_tensor(h, _TENSOR_TYPE_STR_TO_MATHDX[tensor_types[2]]) - gmem_a = mathdx.cublasdx_bind_tensor(h, mathdx.CublasdxTensorType.GMEM_A) - gmem_b = mathdx.cublasdx_bind_tensor(h, mathdx.CublasdxTensorType.GMEM_B) - gmem_c = mathdx.cublasdx_bind_tensor(h, mathdx.CublasdxTensorType.GMEM_C) + gmem_a = mathdx.cublasdx_create_tensor(h, mathdx.CublasdxTensorType.GMEM_A) + gmem_b = mathdx.cublasdx_create_tensor(h, mathdx.CublasdxTensorType.GMEM_B) + gmem_c = mathdx.cublasdx_create_tensor(h, mathdx.CublasdxTensorType.GMEM_C) if gmem_alignment: mathdx.cublasdx_set_tensor_option_int64( @@ -456,58 +516,18 @@ def generate_tensors(h, tensor_types, gmem_alignment: Alignment | None = None): target_tensors = CublasdxTensors(type_mem_a, type_mem_b, type_mem_c) gmem_tensors = CublasdxTensors(gmem_a, gmem_b, gmem_c) - return gmem_tensors, target_tensors + resp = CublasdxTensorsResponse(gmem_tensors, target_tensors) + weakref.finalize(resp, destroy_tensors, resp.gmem) + weakref.finalize(resp, destroy_tensors, resp.target) -@lru_cache -def generate_code_tensors( - handle, - version: ISAVersion, - gmem_tensors: CublasdxTensors, - target_tensors: CublasdxTensors, - rmem_c: bool = False, -): - copy_a = mathdx.cublasdx_bind_device_function( - handle, mathdx.CublasdxDeviceFunctionType.COPY, 2, [gmem_tensors.a, target_tensors.a] - ) - copy_b = mathdx.cublasdx_bind_device_function( - handle, mathdx.CublasdxDeviceFunctionType.COPY, 2, [gmem_tensors.b, target_tensors.b] - ) - copy_c = mathdx.cublasdx_bind_device_function( - handle, mathdx.CublasdxDeviceFunctionType.COPY, 2, [gmem_tensors.c, target_tensors.c] - ) - copy_c_back = mathdx.cublasdx_bind_device_function( - handle, mathdx.CublasdxDeviceFunctionType.COPY, 2, [target_tensors.c, gmem_tensors.c] - ) - if rmem_c: - clear_c_fn = mathdx.cublasdx_bind_device_function( - handle, mathdx.CublasdxDeviceFunctionType.CLEAR, 1, [target_tensors.c] - ) - axpby_fn = mathdx.cublasdx_bind_device_function( - handle, mathdx.CublasdxDeviceFunctionType.AXPBY, 2, [target_tensors.c, target_tensors.c] - ) - gemm = mathdx.cublasdx_bind_device_function( - handle, mathdx.CublasdxDeviceFunctionType.EXECUTE, 3, [target_tensors.a, target_tensors.b, target_tensors.c] - ) - - clear_c_sym = get_str_device_trait(clear_c_fn, 
mathdx.CublasdxDeviceFunctionTrait.SYMBOL) if rmem_c else "" - axpby_sm = get_str_device_trait(axpby_fn, mathdx.CublasdxDeviceFunctionTrait.SYMBOL) if rmem_c else "" - - tensor_symbols = CublasdxTensorAPISymbols( - get_str_device_trait(copy_a, mathdx.CublasdxDeviceFunctionTrait.SYMBOL), - get_str_device_trait(copy_b, mathdx.CublasdxDeviceFunctionTrait.SYMBOL), - get_str_device_trait(copy_c, mathdx.CublasdxDeviceFunctionTrait.SYMBOL), - get_str_device_trait(copy_c_back, mathdx.CublasdxDeviceFunctionTrait.SYMBOL), - clear_c_sym, - axpby_sm, - get_str_device_trait(gemm, mathdx.CublasdxDeviceFunctionTrait.SYMBOL), - ) + return resp - function_list = [copy_a, copy_b, copy_c, copy_c_back, gemm] - if rmem_c: - function_list += [clear_c_fn, axpby_fn] - return generate_code(handle, version, tuple(function_list)), tensor_symbols +def destroy_tensors(tensors: CublasdxTensors): + mathdx.cublasdx_destroy_tensor(tensors.a) + mathdx.cublasdx_destroy_tensor(tensors.b) + mathdx.cublasdx_destroy_tensor(tensors.c) def generate_copy_wait_lto(compute_capability: ComputeCapability): @@ -532,7 +552,7 @@ def generate_device_function_lto(compute_capability: ComputeCapability, function mathdx.cublasdx_set_operator_int64s(h, mathdx.CublasdxOperatorType.BLOCK_DIM, 3, [32, 1, 1]) mathdx.cublasdx_set_operator_int64s(h, mathdx.CublasdxOperatorType.SIZE, 3, [1, 1, 1]) - function = mathdx.cublasdx_bind_device_function(h, function_type, len(args), [*args]) + function = mathdx.cublasdx_create_device_function(h, function_type, len(args), [*args]) symbol = get_str_device_trait(function, mathdx.CublasdxDeviceFunctionTrait.SYMBOL) # Compile the device function to lto @@ -547,6 +567,7 @@ def generate_device_function_lto(compute_capability: ComputeCapability, function mathdx.commondx_get_code_ltoir(code, lto_size, lto) mathdx.commondx_destroy_code(code) + mathdx.cublasdx_destroy_device_function(function) mathdx.cublasdx_destroy_descriptor(h) return symbol, bytes(lto) diff --git a/nvmath/device/cublasdx_numba.py b/nvmath/device/cublasdx_numba.py index 8450902..440812e 100644 --- a/nvmath/device/cublasdx_numba.py +++ b/nvmath/device/cublasdx_numba.py @@ -2,18 +2,52 @@ # # SPDX-License-Identifier: Apache-2.0 +from functools import cached_property +import operator +from collections.abc import Callable +from numba import cuda +import numba from numba.core import typing, cgutils from numba.extending import typeof_impl, overload_method, intrinsic, types, utils, overload from numba.cuda.cudaimpl import lower_constant, registry as cuda_registry from numba.cuda.models import register_model +from numba.core.errors import TypingError +from numba.np import numpy_support from nvmath.device.common_cuda import get_default_code_type from nvmath.device.cublasdx_backend import generate_copy_wait_lto from .common import axpby, copy, copy_fragment, clear, copy_wait, make_tensor, OpaqueTensor -from .common_numba import NUMBA_FE_TYPES_TO_NUMBA_IR, make_function_call, overload_type_attribute, EmptyStructModel -from .cublasdx import BlasNumba, _BlasLayout -from .common_opaque_tensor import LayoutModel, LayoutType, OpaqueTensorType +from .common_numba import ( + NUMBA_FE_TYPES_TO_NUMBA_IR, + declare_cabi_device, + get_array_ptr, + get_opaque_tensor, + get_value_ptr, + overload_type_attribute, + EmptyStructModel, +) +from .cublasdx import ( + Matmul, + _BlasLayout, + compile_blas_axpby, + compile_blas_clear, + compile_blas_copy, + compile_blas_execute, + compile_blas_is_index_in_bounds, + compile_blas_is_predicated, + compile_blas_is_thread_active, + 
compile_blas_map_idx2crd_partitioner, +) +from .common_opaque_tensor import ( + LayoutModel, + LayoutType, + OpaqueTensorType, + PartitionModel, + PartitionType, + PartitionerModel, + PartitionerType, +) import llvmlite.ir as llvmir from numba.core.base import BaseContext @@ -22,7 +56,7 @@ "size", "precision", "data_type", - "code_type", + "sm", "block_size", "block_dim", "leading_dimension", @@ -30,7 +64,7 @@ "arrangement", "function", "execution", - "execute_api", + "alignment", ] _BLAS_COMPILED_ARGS = [ @@ -53,21 +87,19 @@ class BlasType(types.Type): """ - Type class associated with the `cublasdx.BlasNumba`. + Type class associated with the `cublasdx.Matmul`. """ - def __init__(self, blas: BlasNumba): - assert isinstance(blas, BlasNumba) + def __init__(self, blas: Matmul): + assert isinstance(blas, Matmul) self._blas = blas attributes = [f"{attr}={getattr(blas, attr)}" for attr in _BLAS_DEFINITION_ARGS if getattr(blas, attr)] - if blas._tensor_types: - attributes += [f"tensor_types={blas._tensor_types}"] attributes.sort() self.name = "BlasNumba(" + ",".join(attributes) + ")" @property - def blas(self) -> BlasNumba: + def blas(self) -> Matmul: return self._blas @@ -80,8 +112,8 @@ def constant_dummy(context, builder, typ, pyval): return struct_ptr._getvalue() -@typeof_impl.register(BlasNumba) -def typeof_blas_numba(val: BlasNumba, c: typing.Context) -> BlasType: +@typeof_impl.register(Matmul) +def typeof_blas_numba(val: Matmul, c: typing.Context) -> BlasType: return BlasType(val) @@ -107,28 +139,13 @@ def ol_blas_numba_call(blas_numba: BlasType, _arg1, _arg2, _arg3, _arg4=None, _a def ol_blas_numba(blas_numba: BlasType, _arg1, _arg2, _arg3, _arg4=None, _arg5=None, _arg6=None, _arg7=None, _arg8=None): - if blas_numba.blas.execute_api == "tensors" and blas_numba.blas._tensor_types[2] == "suggested_rmem_c": - assert _arg4 in {None, types.Omitted(None)} - assert _arg5 in {None, types.Omitted(None)} - assert _arg6 in {None, types.Omitted(None)} - assert _arg7 in {None, types.Omitted(None)} - assert _arg8 in {None, types.Omitted(None)} - + none_set = {None, types.Omitted(None)} + if {_arg4, _arg5, _arg6, _arg7, _arg8} <= none_set: return lambda _, a, b, c, _arg4=None, _arg5=None, _arg6=None, _arg7=None, _arg8=None: _bals_type___call__(_, a, b, c) - elif ( - blas_numba.blas.execute_api == "static_leading_dimensions" - or blas_numba.blas.execute_api == "tensors" - and blas_numba.blas._tensor_types[2] != "suggested_rmem_c" - ): - assert _arg6 in {None, types.Omitted(None)} - assert _arg7 in {None, types.Omitted(None)} - assert _arg8 in {None, types.Omitted(None)} - + elif {_arg6, _arg7, _arg8} <= none_set: return lambda _, alpha, a, b, beta, c, _arg6=None, _arg7=None, _arg8=None: _bals_type___call__(_, alpha, a, b, beta, c) - elif blas_numba.blas.execute_api == "dynamic_leading_dimensions": - return lambda _, alpha, a, lda, b, ldb, beta, c, ldc: _bals_type___call__(_, alpha, a, lda, b, ldb, beta, c, ldc) else: - return # no implementation + return lambda _, alpha, a, lda, b, ldb, beta, c, ldc: _bals_type___call__(_, alpha, a, lda, b, ldb, beta, c, ldc) # TODO: use overload_method when supported @@ -148,16 +165,32 @@ def ol_blas_type___call___tensors_rmem( MM = blas_numba.blas if not all(isinstance(t, OpaqueTensorType) for t in (a, b, c)): return - if (a.uid, b.uid, c.uid) != MM._target_tensor_uids: + if not all(isinstance(t.layout, BlasLayoutType) for t in (a, b, c)): + return + if "rmem" not in c.layout.blas_layout._tensor_type: return - @intrinsic - def sym_call(typingctx, a, b, c): - 
return_type = types.void - sig = typing.signature(return_type, a, b, c) - return sig, make_function_call(MM._tensor_api_symbols.gemm) + return_type = types.void + sig = typing.signature(return_type, a._capi_type, b._capi_type, c._capi_type) + + code, symbol = compile_blas_execute( + MM, + code_type=get_default_code_type(), + execute_api="tensors", + tensor_types=(a.layout.blas_layout._tensor_type, b.layout.blas_layout._tensor_type, c.layout.blas_layout._tensor_type), + ) + + lto = cuda.LTOIR(code.data) + blas_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(_, a, b, c): + a_struct = get_opaque_tensor(a) + b_struct = get_opaque_tensor(b) + c_struct = get_opaque_tensor(c) - return lambda _, a, b, c: sym_call(a, b, c) + blas_device_func(a_struct, b_struct, c_struct) + + return impl @overload(_bals_type___call__, jit_options={"forceinline": True}, strict=False) @@ -176,16 +209,35 @@ def ol_blas_type___call___tensors_smem( return if not all(isinstance(a, OpaqueTensorType) for a in (a, b, c)): return - if (a.uid, b.uid, c.uid) != MM._target_tensor_uids: + if not all(isinstance(t.layout, BlasLayoutType) for t in (a, b, c)): + return + if "smem" not in c.layout.blas_layout._tensor_type: return - @intrinsic - def sym_call(typingctx, alpha, a, b, beta, c): - return_type = types.void - sig = typing.signature(return_type, c.dtype, a, b, c.dtype, c) - return sig, make_function_call(MM._tensor_api_symbols.gemm) + return_type = types.void + c_ptr = types.CPointer(c.dtype) + sig = typing.signature(return_type, c_ptr, a._capi_type, b._capi_type, c_ptr, c._capi_type) - return lambda _, alpha, a, b, beta, c: sym_call(alpha, a, b, beta, c) + code, symbol = compile_blas_execute( + MM, + code_type=get_default_code_type(), + execute_api="tensors", + tensor_types=(a.layout.blas_layout._tensor_type, b.layout.blas_layout._tensor_type, c.layout.blas_layout._tensor_type), + ) + + lto = cuda.LTOIR(code.data) + blas_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(_, alpha, a, b, beta, c): + a_struct = get_opaque_tensor(a) + b_struct = get_opaque_tensor(b) + c_struct = get_opaque_tensor(c) + alpha_ptr = get_value_ptr(c.buffer.dtype.type(alpha)) + beta_ptr = get_value_ptr(c.buffer.dtype.type(beta)) + + blas_device_func(alpha_ptr, a_struct, b_struct, beta_ptr, c_struct) + + return impl @overload(_bals_type___call__, jit_options={"forceinline": True}, strict=False) @@ -204,21 +256,35 @@ def ol_blas_type___call___basic( return if not all(isinstance(a, types.Array) for a in (a, b, c)): return - if (a.dtype, b.dtype, c.dtype) != tuple(NUMBA_FE_TYPES_TO_NUMBA_IR[vt] for vt in MM._numba_value_types): + if (a.dtype, b.dtype, c.dtype) != tuple(NUMBA_FE_TYPES_TO_NUMBA_IR[vt] for vt in MM._traits.value_types): return # setting signature for intrinsic to much calling conventions. Numba will # automatically cast to desired values. 
return_type = types.void - sig = typing.signature(return_type, c.dtype, a, b, c.dtype, c) + c_ptr = types.CPointer(c.dtype) + sig = typing.signature(return_type, c_ptr, types.CPointer(a.dtype), types.CPointer(b.dtype), c_ptr, c_ptr) - symbol = MM.symbol + code, symbol = compile_blas_execute( + MM, + code_type=get_default_code_type(), + execute_api="static_leading_dimensions", + ) - @intrinsic - def sym_call(typingctx, alpha, a, b, beta, c): - return sig, make_function_call(symbol) + lto = cuda.LTOIR(code.data) + + blas_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(_, alpha, a, b, beta, c): + aptr = get_array_ptr(a) + bptr = get_array_ptr(b) + cptr = get_array_ptr(c) + alpha = get_value_ptr(c.dtype.type(alpha)) + beta = get_value_ptr(c.dtype.type(beta)) - return lambda _, alpha, a, b, beta, c: sym_call(alpha, a, b, beta, c) + blas_device_func(alpha, aptr, bptr, beta, cptr) + + return impl @overload(_bals_type___call__, jit_options={"forceinline": True}, strict=False) @@ -240,22 +306,41 @@ def ol_blas_type___call___ldabc( return if not all(isinstance(a, types.Array) for a in (a, b, c)): return - if (a.dtype, b.dtype, c.dtype) != tuple(NUMBA_FE_TYPES_TO_NUMBA_IR[vt] for vt in MM._numba_value_types): + if (a.dtype, b.dtype, c.dtype) != tuple(NUMBA_FE_TYPES_TO_NUMBA_IR[vt] for vt in MM._traits.value_types): return if not all(isinstance(a, types.Integer) for a in (lda, ldb, ldc)): return - # setting signature for intrinsic to much calling conventions. Numba will - # automatically cast to desired values. ld_type = types.uint32 + ld_ptr = types.CPointer(ld_type) return_type = types.void - sig = typing.signature(return_type, c.dtype, a, ld_type, b, ld_type, c.dtype, c, ld_type) + c_ptr = types.CPointer(c.dtype) + sig = typing.signature( + return_type, c_ptr, types.CPointer(a.dtype), ld_ptr, types.CPointer(b.dtype), ld_ptr, c_ptr, c_ptr, ld_ptr + ) + + code, symbol = compile_blas_execute( + MM, + code_type=get_default_code_type(), + execute_api="dynamic_leading_dimensions", + ) + + lto = cuda.LTOIR(code.data) + blas_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(_, alpha, a, lda, b, ldb, beta, c, ldc): + aptr = get_array_ptr(a) + bptr = get_array_ptr(b) + cptr = get_array_ptr(c) + alpha = get_value_ptr(c.dtype.type(alpha)) + beta = get_value_ptr(c.dtype.type(beta)) + lda = get_value_ptr(ld_type(lda)) + ldb = get_value_ptr(ld_type(ldb)) + ldc = get_value_ptr(ld_type(ldc)) + + blas_device_func(alpha, aptr, lda, bptr, ldb, beta, cptr, ldc) - @intrinsic - def sym_call(typingctx, alpha, a, lda, b, ldb, beta, c, ldc): - return sig, make_function_call(MM.symbol) - - return lambda _, alpha, a, lda, b, ldb, beta, c, ldc: sym_call(alpha, a, lda, b, ldb, beta, c, ldc) + return impl # __call__ overload is not supported by numba, however adding this overload @@ -276,42 +361,62 @@ def method_impl(context, builder, sig, args): @overload(copy, target="cuda", jit_options={"forceinline": True}, strict=False) -def ol_blas_copy(src: OpaqueTensorType, dst: OpaqueTensorType): - return ol_blas_copy_generic(src, dst, "copy") +def ol_blas_copy(src: OpaqueTensorType, dst: OpaqueTensorType, alignment=None): + return ol_blas_copy_generic(src, dst, alignment, "copy") @overload(copy_fragment, target="cuda", jit_options={"forceinline": True}, strict=False) -def ol_blas_copy_fragment(src: OpaqueTensorType, dst: OpaqueTensorType): - return ol_blas_copy_generic(src, dst, "copy_fragment") +def ol_blas_copy_fragment(src: OpaqueTensorType, dst: OpaqueTensorType, alignment=None): + return 
ol_blas_copy_generic(src, dst, alignment, "copy_fragment") -def ol_blas_copy_generic(src: OpaqueTensorType, dst: OpaqueTensorType, func: str): +def ol_blas_copy_generic(src: OpaqueTensorType, dst: OpaqueTensorType, alignment_ty: types.Type | None, func: str): assert isinstance(src, OpaqueTensorType) assert isinstance(src.layout, BlasLayoutType) assert isinstance(dst, OpaqueTensorType) assert isinstance(dst.layout, BlasLayoutType) + assert src.dtype == dst.dtype + + alignment: int | None = None + if alignment_ty not in {None, types.Omitted(None)}: + if not isinstance(alignment_ty, types.Literal): + return lambda src, dst, alignment: numba.literally(alignment) + alignment = alignment_ty.literal_value + if alignment not in {1, 2, 4, 8, 16}: + raise TypingError(f"Alignment must be one of (1, 2, 4, 8, 16), got {alignment}") rmem = "rmem" in dst.layout.layout or "rmem" in src.layout.layout if func == "copy_fragment": - assert rmem + if not rmem: + raise TypingError("copy_fragment is only supported for rmem tensors. Please use copy instead.") else: assert func == "copy" - assert not rmem + if rmem: + raise TypingError("copy is not supported for rmem tensors. Please use copy_fragment instead.") - symbol = src.layout.copy_to_symbol(dst.layout) + if alignment is not None: + dtype = numpy_support.as_dtype(src.layout.dtype) + if alignment < dtype.itemsize: + raise TypingError(f"Alignment must be at least the size of the data type {dtype.itemsize}, got {alignment}") - @intrinsic - def _intrinsic(typingctx, src, dst): - assert isinstance(src, OpaqueTensorType) - assert isinstance(dst, OpaqueTensorType) - assert src.dtype == dst.dtype + code, symbol = compile_blas_copy( + src_tensor=src.layout.blas_layout, + dst_tensor=dst.layout.blas_layout, + code_type=get_default_code_type(), + alignment=alignment, + ) - return_type = types.void - return typing.signature(return_type, src, dst), make_function_call(symbol) + return_type = types.void + sig = typing.signature(return_type, src._capi_type, dst._capi_type) - def impl(src, dst): - return _intrinsic(src, dst) + lto = cuda.LTOIR(code.data) + blas_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(src, dst, alignment=None): + src_struct = get_opaque_tensor(src) + dst_struct = get_opaque_tensor(dst) + return blas_device_func(src_struct, dst_struct) return impl @@ -322,17 +427,20 @@ def ol_blas_clear(arr: OpaqueTensorType): assert isinstance(arr.layout, BlasLayoutType) assert arr.buffer_type - symbol = arr.layout.clear_symbol + code, symbol = compile_blas_clear( + tensor=arr.layout.blas_layout, + code_type=get_default_code_type(), + ) - assert symbol + lto = cuda.LTOIR(code.data) - @intrinsic - def _intrinsic(typingctx, arr): - return_type = types.void - return typing.signature(return_type, arr), make_function_call(symbol) + return_type = types.void + sig = typing.signature(return_type, arr._capi_type) + blas_device_func = declare_cabi_device(symbol, sig, link=lto) def impl(arr): - return _intrinsic(arr) + arr_struct = get_opaque_tensor(arr) + return blas_device_func(arr_struct) return impl @@ -342,67 +450,47 @@ class BlasLayoutType(LayoutType): Type class associated with opaque tensor layouts. 
""" - def __init__(self, MM: BlasNumba, layout: str): - assert isinstance(MM, BlasNumba) - assert isinstance(layout, str) - - blas_layout = _BlasLayout(MM, layout) + def __init__(self, blas_layout: _BlasLayout): + assert isinstance(blas_layout, _BlasLayout) - self._uid = blas_layout._uid - self._size = blas_layout._size - self._cosize = blas_layout._cosize - self._tensor_index = blas_layout._tensor_index - self._dynamic_ld = blas_layout._dynamic_ld - self._dtype = NUMBA_FE_TYPES_TO_NUMBA_IR[MM._value_types[self._tensor_index]] - self._layout = layout - - self._copy_symbols_map = MM._copy_symbols_map - self._clear_symbol = MM._tensor_api_symbols.clear_c if blas_layout._is_register else None - self._axpby_symbol = MM._tensor_api_symbols.axpby if blas_layout._is_register else None + self._blas_layout = blas_layout # Using handle descriptor in the type name to avoid symbol copy caching # by numba. - self.name = f"Layout(uid={self._uid},layout={self._layout},handle={MM._handle.descriptor})" + self.name = f"Layout(uid={blas_layout._uid},layout={blas_layout._layout},MM={blas_layout._MM})" + + @property + def blas_layout(self) -> _BlasLayout: + return self._blas_layout @property def layout(self) -> str: - return self._layout + return self._blas_layout._layout @property def uid(self) -> int: - return self._uid + return self._blas_layout._uid @property def tensor_index(self) -> int: """Tensor index is 0 for A, 1 for B and 2 for C.""" - return self._tensor_index + return self._blas_layout._tensor_index - @property - def dtype(self) -> str: - return self._dtype + @cached_property + def dtype(self) -> types.Number: + return NUMBA_FE_TYPES_TO_NUMBA_IR[self._blas_layout.dtype] @property def size(self) -> int: - return self._size + return self._blas_layout._size @property def cosize(self) -> int: - return self._cosize + return self._blas_layout._cosize @property def dynamic_ld(self) -> bool: - return self._dynamic_ld - - def copy_to_symbol(self, dst: "BlasLayoutType") -> str: - return self._copy_symbols_map[(self.uid, dst.uid)] - - @property - def clear_symbol(self) -> str | None: - return self._clear_symbol - - @property - def axpby_symbol(self) -> str | None: - return self._axpby_symbol + return self._blas_layout._dynamic_ld register_model(BlasLayoutType)(LayoutModel) @@ -415,9 +503,8 @@ def constant_blas_layout(context, builder, typ, pyval): @typeof_impl.register(_BlasLayout) -def typeof_blas_layout(val: _BlasLayout, c: typing.Context) -> BlasLayoutType: - assert val._MM is not None - return BlasLayoutType(val._MM, val._layout) +def typeof_blas_layout(blas_layout: _BlasLayout, c: typing.Context) -> BlasLayoutType: + return BlasLayoutType(blas_layout) for attribute in ["size", "cosize"]: @@ -429,10 +516,9 @@ def ol_blas_layout(blas_numba: BlasType, method: str, leading_dimension: types.N if ("gmem" not in method) and (leading_dimension not in {None, types.Omitted(None)}): return MM = blas_numba.blas - if not MM._tensor_types: - return - return_type = BlasLayoutType(MM, method) + blas_layout = getattr(MM, method)() + return_type = BlasLayoutType(blas_layout) @intrinsic def _intrinsic(typingctx, leading_dimension=None): @@ -493,15 +579,16 @@ def ol_make_tensor(array, layout): def ol_copy_wait(): # numba has cache per compute capability, so the function won't end up # cached for the wrong compute capability. 
+ return_type = types.void + sig = typing.signature(return_type) + ct = get_default_code_type() - symbol, _ = generate_copy_wait_lto(ct.cc) + symbol, code = generate_copy_wait_lto(ct.cc) - @intrinsic - def _intrinsic(typingctx): - return_type = types.void - return typing.signature(return_type), make_function_call(symbol) + lto = cuda.LTOIR(code) + blas_device_func = declare_cabi_device(symbol, sig, link=lto) - return lambda: _intrinsic() + return lambda: blas_device_func() @overload(axpby, target="cuda", jit_options={"forceinline": True}, strict=False) @@ -519,13 +606,288 @@ def ol_axpby(a, x, b, y): if "rmem" not in x.layout.layout: raise TypeError("axpby is only supported for rmem tensors") - symbol = x.layout.axpby_symbol + code, symbol = compile_blas_axpby( + x_tensor=x.layout.blas_layout, + y_tensor=y.layout.blas_layout, + code_type=get_default_code_type(), + ) + + lto = cuda.LTOIR(code.data) + + return_type = types.void + sig = typing.signature(return_type, types.CPointer(x.dtype), x._capi_type, types.CPointer(y.dtype), y._capi_type) + blas_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(a, x, b, y): + x_struct = get_opaque_tensor(x) + y_struct = get_opaque_tensor(y) + a_ptr = get_value_ptr(x.buffer.dtype.type(a)) + b_ptr = get_value_ptr(y.buffer.dtype.type(b)) + return blas_device_func(a_ptr, x_struct, b_ptr, y_struct) + + return impl + + +class BlasPartitionerType(PartitionerType): + """ + Type class for Blas partitioner. + """ + + def __init__(self, MM: Matmul): + assert isinstance(MM, Matmul) + self._MM = MM + super().__init__(f"BlasPartitioner(MM={MM})") + + @property + def MM(self) -> Matmul: + return self._MM + + +register_model(BlasPartitionerType)(PartitionerModel) + + +class BlasPartitionType(PartitionType): + """ + Type class for Blas partitioner. 
+ """ + + def __init__(self, partitioner: BlasPartitionerType, tensor: OpaqueTensorType): + assert isinstance(partitioner, BlasPartitionerType) + super().__init__(partitioner, tensor) + self.name = f"BlasPartitionType(partitioner={partitioner}, tensor={tensor})" + + +register_model(BlasPartitionType)(PartitionModel) + + +@overload_method(BlasType, "suggest_partitioner", target="cuda", jit_options={"forceinline": True}, strict=False) +def ol_blas_suggest_partitioner(blas_numba: BlasType): + assert isinstance(blas_numba, BlasType) + + MM = blas_numba.blas + return_type = BlasPartitionerType(MM) + + @intrinsic + def _intrinsic(typingctx): + def codegen(context: BaseContext, builder: llvmir.IRBuilder, signature, args): + # Create empty struct to avoid runtime memory usage + layout = cgutils.create_struct_proxy(return_type)(context, builder) + return layout._getvalue() + + return typing.signature(return_type), codegen + + return lambda blas_numba: _intrinsic() + - assert symbol is not None +@overload_method(BlasPartitionerType, "partition_like_C", target="cuda", jit_options={"forceinline": True}, strict=False) +def ol_blas_partition_like_C(partitioner: BlasPartitionerType, tensor: OpaqueTensorType): + assert isinstance(partitioner, BlasPartitionerType) + assert isinstance(tensor, OpaqueTensorType) + assert tensor.layout.blas_layout._tensor_type == "gmem_c" + + return_type = BlasPartitionType(partitioner, tensor) + + @intrinsic + def _intrinsic(typingctx, partitioner, tensor): + def codegen(context: BaseContext, builder: llvmir.IRBuilder, signature, args): + partition = cgutils.create_struct_proxy(return_type)(context, builder) + partition.partitioner = args[0] + partition.tensor = args[1] + return partition._getvalue() + + return typing.signature(return_type, partitioner, tensor), codegen + + return lambda partitioner, tensor: _intrinsic(partitioner, tensor) + + +def get_map_idx2crd_partitioner(symbol: str, lto: cuda.LTOIR): + assert isinstance(symbol, str) + return_type = types.Tuple((types.int32, types.int32)) @intrinsic - def _intrinsic(typingctx, a, x, b, y): - return_type = types.void - return typing.signature(return_type, x.dtype, x, y.dtype, y), make_function_call(symbol) + def map_idx2crd_partitioner(typingctx, index): + def codegen(context: BaseContext, builder: llvmir.IRBuilder, signature, args): + context.active_code_library.add_linking_file(lto) + idx: llvmir.AllocaInstr = args[0] + i_ptr = cgutils.alloca_once(builder, cgutils.int32_t) + j_ptr = cgutils.alloca_once(builder, cgutils.int32_t) + idx_ptr = cgutils.alloca_once_value(builder, idx) + + int32_ptr = cgutils.int32_t.as_pointer() + + fnTy = llvmir.FunctionType(cgutils.voidptr_t, [int32_ptr, int32_ptr, int32_ptr]) + fn = cgutils.get_or_insert_function(builder.module, fnTy, symbol) + builder.call(fn, [idx_ptr, i_ptr, j_ptr]) + + return context.make_tuple(builder, return_type, (builder.load(i_ptr), builder.load(j_ptr))) + + return typing.signature(return_type, types.int32), codegen + + return map_idx2crd_partitioner + + +@overload_method( + BlasPartitionerType, + "map_fragment_index", + target="cuda", + jit_options={"forceinline": True}, + strict=False, +) +def ol_blas_partitioner_map_fragment_index( + partitioner: BlasPartitionerType, + index: types.Integer, +): + if not isinstance(partitioner, BlasPartitionerType): + return + if not isinstance(index, types.Integer): + return + + code, symbol = compile_blas_map_idx2crd_partitioner(partitioner.MM, code_type=get_default_code_type()) + + lto = cuda.LTOIR(code.data) + + 
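# get_map_idx2crd_partitioner() (defined above) wraps the compiled symbol in an
# @intrinsic that links the LTO-IR into the active code library and calls the
# device function through int32 pointers, returning the (i, j) coordinate pair.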
map_idx2crd_partitioner = get_map_idx2crd_partitioner(symbol, lto=lto) + + def map_fragment_index_impl(obj, idx): + i, j = map_idx2crd_partitioner(idx) + return (i, j) + + return map_fragment_index_impl + + +def get_bool_return_intrinsic(symbol: str, index: bool = False, lto=None): + return_type = types.bool + + def codegen(context: BaseContext, builder: llvmir.IRBuilder, signature, args: list): + if lto is not None: + context.active_code_library.add_linking_file(lto) + active = cgutils.alloca_once(builder, cgutils.int32_t) + int32_ptr = cgutils.int32_t.as_pointer() + + fn_args, fn_args_ty = [], [] + if index: + idx_ptr = cgutils.alloca_once_value(builder, args[0]) + fn_args += [idx_ptr] + fn_args_ty += [int32_ptr] + fn_args += [active] + fn_args_ty += [int32_ptr] + + fnTy = llvmir.FunctionType(cgutils.voidptr_t, fn_args_ty) + fn = cgutils.get_or_insert_function(builder.module, fnTy, symbol) + builder.call(fn, fn_args) + + res = builder.icmp_signed("!=", builder.load(active), llvmir.Constant(cgutils.int32_t, 0)) + return res + + _intrinsic: Callable = lambda typingctx: (typing.signature(return_type), codegen) + if index: + _intrinsic = lambda typingctx, index: (typing.signature(return_type, types.int32), codegen) + + return intrinsic(_intrinsic) + + +@overload_method( + BlasPartitionerType, + "is_thread_active", + target="cuda", + jit_options={"forceinline": True}, + strict=False, +) +def ol_blas_partitioner_is_thread_active( + partitioner: BlasPartitionerType, +): + if not isinstance(partitioner, BlasPartitionerType): + return + + code, symbol = compile_blas_is_thread_active(partitioner.MM, code_type=get_default_code_type()) + lto = cuda.LTOIR(code.data) + is_thread_active = get_bool_return_intrinsic(symbol, lto=lto) + + return lambda partitioner: is_thread_active() + + +@overload_method( + BlasPartitionerType, + "is_predicated", + target="cuda", + jit_options={"forceinline": True}, + strict=False, +) +def ol_blas_partitioner_is_predicated( + partitioner: BlasPartitionerType, +): + if not isinstance(partitioner, BlasPartitionerType): + return + + code, symbol = compile_blas_is_predicated(partitioner.MM, code_type=get_default_code_type()) + lto = cuda.LTOIR(code.data) + is_predicated = get_bool_return_intrinsic(symbol, lto=lto) + + return lambda partitioner: is_predicated() + + +@overload_method( + BlasPartitionerType, + "is_index_in_bounds", + target="cuda", + jit_options={"forceinline": True}, + strict=False, +) +def ol_blas_partition_is_index_in_bounds( + partitioner: BlasPartitionerType, + index: types.Integer, +): + if not isinstance(partitioner, BlasPartitionerType): + return + if not isinstance(index, types.Integer): + return + + code, symbol = compile_blas_is_index_in_bounds(partitioner.MM, code_type=get_default_code_type()) + lto = cuda.LTOIR(code.data) + is_index_in_bounds = get_bool_return_intrinsic(symbol, index=True, lto=lto) + + return lambda partitioner, idx: is_index_in_bounds(idx) + + +@intrinsic +def extract_partition(typingctx, ty_partition: BlasPartitionType): + assert isinstance(ty_partition, BlasPartitionType) + return_type = types.Tuple((ty_partition.partitioner, ty_partition.tensor)) + + def codegen(context: BaseContext, builder: llvmir.IRBuilder, signature, args): + partition = cgutils.create_struct_proxy(ty_partition)(context, builder, value=args[0]) + return context.make_tuple(builder, return_type, (partition.partitioner, partition.tensor)) + + return typing.signature(return_type, ty_partition), codegen + + +@overload(operator.getitem, target="cuda", 
jit_options={"forceinline": True}, strict=False) +def ol_blas_partition_getitem(partition: BlasPartitionType, index: types.Integer): + if not isinstance(partition, BlasPartitionType): + return + if not isinstance(index, types.Integer): + return + + def dummy_getitem_impl(obj, idx): + partitioner, tensor = extract_partition(obj) + i, j = partitioner.map_fragment_index(idx) + return tensor.buffer[i, j] + + return dummy_getitem_impl + + +@overload(operator.setitem, target="cuda", jit_options={"forceinline": True}, strict=False) +def ol_blas_partition_setitem(partition: BlasPartitionType, index: types.Integer, value: types.Number): + if not isinstance(partition, BlasPartitionType): + return + if not isinstance(index, types.Integer): + return + if not isinstance(value, types.Number): + return + + def dummy_setitem_impl(obj, idx, value): + partitioner, tensor = extract_partition(obj) + i, j = partitioner.map_fragment_index(idx) + tensor.buffer[i, j] = value - return lambda a, x, b, y: _intrinsic(a, x, b, y) + return dummy_setitem_impl diff --git a/nvmath/device/cufftdx.py b/nvmath/device/cufftdx.py index 935c918..175667a 100644 --- a/nvmath/device/cufftdx.py +++ b/nvmath/device/cufftdx.py @@ -2,20 +2,22 @@ # # SPDX-License-Identifier: Apache-2.0 -__all__ = ["fft", "FFTOptions"] +__all__ = ["fft", "FFT", "compile_fft_execute"] from functools import cached_property +from typing import Any import warnings -import weakref from .common import ( - make_binary_tempfile, - delete_binary_tempfiles, - check_in, + parse_code_type, + check_code_type, SHARED_DEVICE_DOCSTRINGS, + parse_sm, +) +from .common_cuda import ( + Code, + Dim3, ) -from .common_cuda import MAX_SUPPORTED_CC, get_default_code_type, Code, CodeType, ComputeCapability, Dim3 from .common_backend import MATHDX_TYPES_TO_NP, get_isa_version, get_lto -from .common_numba import NP_TYPES_TO_NUMBA_FE_TYPES from .cufftdx_backend import ( generate_code, get_int_trait, @@ -27,11 +29,12 @@ generate_FFT, validate_execute_api, ) -from .cufftdx_numba import codegen from nvmath.internal.utils import docstring_decorator from nvmath.bindings import mathdx +from ._deprecated import deprecated + CUFFTDX_DATABASE = None FFTDX_DOCSTRING = SHARED_DEVICE_DOCSTRINGS.copy() @@ -69,13 +72,16 @@ @docstring_decorator(FFTDX_DOCSTRING, skip_missing=False) -class FFTOptions: +class FFT: """ A class that encapsulates a partial FFT device function. A partial device function can be queried for available or optimal values for some knobs (such as `ffts_per_block` or `elements_per_thread`). It does not contain a compiled, ready-to-use, device function until finalized using :meth:`create`. + .. versionchanged:: 0.7.0 + `FFT` has replaced `FFTOptions` and `FFTOptionsComplete`. + Args: size (int): {size} @@ -83,7 +89,7 @@ class FFTOptions: fft_type (str): {fft_type} - code_type (CodeType): {code_type} + sm (ComputeCapability): {sm} execution (str): {execution} @@ -100,16 +106,7 @@ class FFTOptions: ``'full'``. - ``'real_mode'``, currently supports ``'normal'`` and ``'folded``. - execute_api: - .. versionchanged:: 0.5.0 - execute_api is not part of the FFT type. Pass this argument to - :py:func:`nvmath.device.fft` instead. - - Note: - The class is not meant to be used directly with its constructor. Users are instead - advised to use :func:`fft` to create the object. - - See Also: + .. seealso:: The attributes of this class provide a 1:1 mapping with the CUDA C++ cuFFTDx APIs. For further details, please refer to `cuFFTDx documentation `_. 
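# A minimal usage sketch of the reworked cuFFTDx host API, assuming the signatures
# introduced in this file; argument values are illustrative only, and
# current_device_lto is assumed to be the existing nvmath.device helper returning a
# CodeType for the current GPU (it is not part of this diff).
import numpy as np
from nvmath.device import FFT, compile_fft_execute, current_device_lto

# `sm` replaces the old `code_type` knob; it is keyword-only and optional.
FFT_fwd = FFT(size=256, precision=np.float32, fft_type="c2c",
              direction="forward", execution="Block")

# Traits (storage_size, block_dim, shared_memory_size, ...) are computed lazily.
print(FFT_fwd.storage_size, FFT_fwd.block_dim, FFT_fwd.shared_memory_size)

# Device code is requested explicitly instead of via compiler="numba".
code, symbol = compile_fft_execute(
    FFT_fwd, code_type=current_device_lto(), execute_api="register_memory"
)
# `code.data` holds the LTO-IR and `symbol` the device-function name to link and call.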
@@ -120,27 +117,15 @@ def __init__( size, precision, fft_type, - code_type, execution, *, + sm=None, direction=None, ffts_per_block=None, elements_per_thread=None, real_fft_options=None, ): - if len(code_type) != 2: - raise ValueError(f"code_type should be an instance of CodeType or a 2-tuple ; got code_type = {code_type}") - code_type = CodeType(code_type[0], ComputeCapability(*code_type[1])) - if code_type.cc.major < 7: - raise RuntimeError( - f"Minimal compute capability 7.0 is required by cuFFTDx, got {code_type.cc.major}.{code_type.cc.minor}" - ) - if (code_type.cc.major, code_type.cc.minor) > MAX_SUPPORTED_CC: - raise RuntimeError( - "The maximum compute capability currently supported by device " - f"APIs is {MAX_SUPPORTED_CC}, " - f"got {code_type.cc.major}.{code_type.cc.minor}" - ) + sm = parse_sm(sm) # # Check that the knobs are, individually, valid @@ -150,7 +135,7 @@ def __init__( size=size, precision=precision, fft_type=fft_type, - code_type=code_type, + sm=sm, execution=execution, direction=direction, ffts_per_block=ffts_per_block, @@ -175,7 +160,7 @@ def __init__( self._ffts_per_block = ffts_per_block self._elements_per_thread = elements_per_thread self._real_fft_options = real_fft_options - self._code_type = code_type + self._sm = sm # # Update suggested traits @@ -187,8 +172,14 @@ def __init__( if ffts_per_block == "suggested": self._ffts_per_block = self._suggested_ffts_per_block + @cached_property + def _traits(self): + return _FFTTraits(self) + @property def elements_per_thread(self): + if self._elements_per_thread is None: + return self._traits.elements_per_thread return self._elements_per_thread @property @@ -197,6 +188,8 @@ def precision(self): @property def ffts_per_block(self): + if self._ffts_per_block is None: + return self._traits.ffts_per_block return self._ffts_per_block @property @@ -220,8 +213,8 @@ def real_fft_options(self): return self._real_fft_options @property - def code_type(self): - return self._code_type + def sm(self): + return self._sm # # Extensions @@ -233,12 +226,16 @@ def valid(self, *knobs): return self._get_knobs(*knobs) + @deprecated("definition is deprecated and may be removed in future versions") def definition(self): + """ + .. deprecated:: 0.7.0 + """ dd = { "size": self.size, "precision": self.precision, "fft_type": self.fft_type, - "code_type": self.code_type, + "sm": self.sm, "execution": self.execution, "direction": self.direction, "ffts_per_block": self.ffts_per_block, @@ -247,132 +244,51 @@ def definition(self): } return dd + @deprecated("create is deprecated and may be removed in future versions. Use `functools.partial` instead") def create(self, **kwargs): + """ + Creates a copy of the instance with provided arguments updated. + + .. deprecated:: 0.7.0 + Please use :py:func:`functools.partial` instead. + """ + code_type = kwargs.pop("code_type", None) + if code_type is not None: + DeprecationWarning("code_type is deprecated and will be removed in future releases. It is no longer needed.") + compiler = kwargs.pop("compiler", None) + if compiler is not None: + DeprecationWarning("compiler is deprecated and will be removed in future releases. 
It is no longer needed.") dd = self.definition() dd.update(**kwargs) - return fft(**dd) - - # - # Private implementations - # - - def _suggested(self, what): - # Generate full PTX - h = generate_FFT( - size=self.size, - precision=self.precision, - fft_type=self.fft_type, - direction=self.direction, - ffts_per_block=(None if self.ffts_per_block == "suggested" else self.ffts_per_block), - elements_per_thread=(None if self.elements_per_thread == "suggested" else self.elements_per_thread), - real_fft_options=frozenset(self.real_fft_options.items()) if self.real_fft_options else None, - code_type=self.code_type, - execution=self.execution, - # TODO: remove after migrating to libmathdx 0.2.2+ - execute_api="register_memory", - ) - - if what == "elements_per_thread": - return get_int_trait(h.descriptor, mathdx.CufftdxTraitType.ELEMENTS_PER_THREAD) - - if what == "suggested_ffts_per_block": - return get_int_trait(h.descriptor, mathdx.CufftdxTraitType.SUGGESTED_FFTS_PER_BLOCK) - - raise Exception(f"Unknown suggested option '{what}'") - - @cached_property - def _suggested_ffts_per_block(self): - return self._suggested("suggested_ffts_per_block") - - @cached_property - def _suggested_elements_per_thread(self): - return self._suggested("elements_per_thread") - - def _get_knobs(self, *knobs): - if not (set(knobs) <= {"ffts_per_block", "elements_per_thread"}): - raise ValueError(f"Unsupported knob. Only valid knobs are ffts_per_block and elements_per_thread but got {knobs}") - - h = generate_FFT( - size=self.size, - precision=self.precision, - fft_type=self.fft_type, - direction=self.direction, - ffts_per_block=self.ffts_per_block, - elements_per_thread=self.elements_per_thread, - real_fft_options=frozenset(self.real_fft_options.items()) if self.real_fft_options else None, - code_type=self.code_type, - execution=self.execution, - # TODO: remove after migrating to libmathdx 0.2.2+ - execute_api="register_memory", - ) - - return get_knobs(h.descriptor, knobs) - - -class FFTOptionsComplete(FFTOptions): - def __init__(self, **kwargs): - FFTOptions.__init__(self, **kwargs) - - h = generate_FFT( - size=self.size, - precision=self.precision, - fft_type=self.fft_type, - direction=self.direction, - code_type=self.code_type, - execution=self.execution, - ffts_per_block=self.ffts_per_block if self.execution == "Block" else None, - elements_per_thread=self.elements_per_thread if self.execution == "Block" else None, - real_fft_options=frozenset(self.real_fft_options.items()) if self.real_fft_options else None, - # TODO: remove after migrating to libmathdx 0.2.2+ - execute_api="register_memory", - ).descriptor - - self._value_type = MATHDX_TYPES_TO_NP[get_data_type_trait(h, mathdx.CufftdxTraitType.VALUE_TYPE)] - self._input_type = MATHDX_TYPES_TO_NP[get_data_type_trait(h, mathdx.CufftdxTraitType.INPUT_TYPE)] - self._output_type = MATHDX_TYPES_TO_NP[get_data_type_trait(h, mathdx.CufftdxTraitType.OUTPUT_TYPE)] - - self._storage_size = get_int_trait(h, mathdx.CufftdxTraitType.STORAGE_SIZE) - self._stride = get_int_trait(h, mathdx.CufftdxTraitType.STRIDE) - self._elements_per_thread = get_int_trait(h, mathdx.CufftdxTraitType.ELEMENTS_PER_THREAD) - self._implicit_type_batching = get_int_trait(h, mathdx.CufftdxTraitType.IMPLICIT_TYPE_BATCHING) - - self._workspace_size = 0 - if self.execution == "Block": - self._block_dim = Dim3(*get_int_traits(h, mathdx.CufftdxTraitType.BLOCK_DIM, 3)) - self._shared_memory_size = get_int_trait(h, mathdx.CufftdxTraitType.SHARED_MEMORY_SIZE) - self._ffts_per_block = get_int_trait(h, 
mathdx.CufftdxTraitType.FFTS_PER_BLOCK) - else: - self._block_dim = None - self._shared_memory_size = None - self._ffts_per_block = None + return FFT(**dd) @property def value_type(self): - return self._value_type + return self._traits.value_type @property def input_type(self): - return self._input_type + return self._traits.input_type @property def output_type(self): - return self._output_type + return self._traits.output_type @property def storage_size(self): - return self._storage_size + return self._traits.storage_size @property def shared_memory_size(self): - return self._shared_memory_size + return self._traits.shared_memory_size @property def stride(self): - return self._stride + return self._traits.stride @property def block_dim(self): - return self._block_dim + return self._traits.block_dim @property def requires_workspace(self): @@ -381,136 +297,156 @@ def requires_workspace(self): @property def workspace_size(self): - return self._workspace_size + return self._traits.workspace_size @property def implicit_type_batching(self): - return self._implicit_type_batching + return self._traits.implicit_type_batching + @property + def extensions(self): + raise NotImplementedError("Extensions not supported yet") -class FFTCompiled(FFTOptionsComplete): - def __init__(self, **kwargs): - execute_api = kwargs.pop("execute_api", None) - super().__init__(**kwargs) - - # Fixup typo introduced in earlier versions. - if execute_api == "registry_memory": - warnings.warn( - "The execute_api 'registry_memory' is deprecated and will be " - "removed in future releases. " - "Please use 'register_memory' instead.", - DeprecationWarning, - ) - execute_api = "register_memory" + def execute(*args): + raise RuntimeError("execute is a device function and can not be called on host.") - validate_execute_api(self.execution, execute_api) + @deprecated("Calling MM(...) directly is deprecated, please use MM.execute(...) 
method instead.") + def __call__(self, *args): + raise RuntimeError("__call__ is a device function and can not be called on host.") - if execute_api is None: - execute_api = "register_memory" + @property + @deprecated("files is deprecated and is no longer required and will be removed in future releases.") + def files(self) -> list: + return [] - self._execute_api = execute_api + # + # Private implementations + # + def _suggested(self, what): + # Generate full PTX h = generate_FFT( - size=self.size, - precision=self.precision, - fft_type=self.fft_type, - direction=self.direction, - code_type=self.code_type, - execution=self.execution, - ffts_per_block=self.ffts_per_block if self.execution == "Block" else None, - elements_per_thread=self.elements_per_thread if self.execution == "Block" else None, - real_fft_options=frozenset(self.real_fft_options.items()) if self.real_fft_options else None, - execute_api=execute_api, - ).descriptor - - code = generate_code(h, self.code_type.cc) + size=self._size, + precision=self._precision, + fft_type=self._fft_type, + direction=self._direction, + ffts_per_block=(None if self._ffts_per_block == "suggested" else self._ffts_per_block), + elements_per_thread=(None if self._elements_per_thread == "suggested" else self._elements_per_thread), + real_fft_options=frozenset(self._real_fft_options.items()) if self._real_fft_options else None, + sm=self._sm, + execution=self._execution, + ) - # Compile - lto_fn = get_lto(code.descriptor) - isa_version = get_isa_version(code.descriptor) + if what == "elements_per_thread": + return get_int_trait(h.descriptor, mathdx.CufftdxTraitType.ELEMENTS_PER_THREAD) - self._ltos = [Code(self.code_type, isa_version, lto_fn)] + if what == "suggested_ffts_per_block": + return get_int_trait(h.descriptor, mathdx.CufftdxTraitType.SUGGESTED_FFTS_PER_BLOCK) - self._symbol = get_str_trait(h, mathdx.CufftdxTraitType.SYMBOL_NAME) + raise Exception(f"Unknown suggested option '{what}'") - self._finalizer = weakref.finalize(self, delete_binary_tempfiles, self.files) + @cached_property + def _suggested_ffts_per_block(self): + return self._suggested("suggested_ffts_per_block") @cached_property - def _tempfiles(self): - """ - Create temporary files for the LTO functions. - """ - return [make_binary_tempfile(lto.data, ".ltoir") for lto in self._ltos] + def _suggested_elements_per_thread(self): + return self._suggested("elements_per_thread") - @property - def files(self) -> list[str]: - return [v.name for v in self._tempfiles] + def _get_knobs(self, *knobs): + if not (set(knobs) <= {"ffts_per_block", "elements_per_thread"}): + raise ValueError(f"Unsupported knob. 
Only valid knobs are ffts_per_block and elements_per_thread but got {knobs}") - @property - def symbol(self): - return self._symbol + h = generate_FFT( + size=self._size, + precision=self._precision, + fft_type=self._fft_type, + direction=self._direction, + ffts_per_block=self._ffts_per_block, + elements_per_thread=self._elements_per_thread, + real_fft_options=frozenset(self._real_fft_options.items()) if self._real_fft_options else None, + sm=self._sm, + execution=self._execution, + ) - @property - def codes(self): - return self._ltos + return get_knobs(h.descriptor, knobs) - def workspace(self): - _workspace_deprecation_warning() - raise NotImplementedError("Workspace not supported yet") - @property - def execute_api(self): - return self._execute_api +class _FFTTraits: + def __init__(self, FFT: FFT): + h = generate_FFT( + size=FFT._size, + precision=FFT._precision, + fft_type=FFT._fft_type, + direction=FFT._direction, + sm=FFT._sm, + execution=FFT._execution, + ffts_per_block=FFT._ffts_per_block if FFT._execution == "Block" else None, + elements_per_thread=FFT._elements_per_thread if FFT._execution == "Block" else None, + real_fft_options=frozenset(FFT._real_fft_options.items()) if FFT._real_fft_options else None, + ).descriptor - def definition(self): - dd = super().definition() - dd.update(execute_api=self.execute_api) - return dd + self.value_type = MATHDX_TYPES_TO_NP[get_data_type_trait(h, mathdx.CufftdxTraitType.VALUE_TYPE)] + self.input_type = MATHDX_TYPES_TO_NP[get_data_type_trait(h, mathdx.CufftdxTraitType.INPUT_TYPE)] + self.output_type = MATHDX_TYPES_TO_NP[get_data_type_trait(h, mathdx.CufftdxTraitType.OUTPUT_TYPE)] + self.storage_size = get_int_trait(h, mathdx.CufftdxTraitType.STORAGE_SIZE) + self.stride = get_int_trait(h, mathdx.CufftdxTraitType.STRIDE) + self.elements_per_thread = get_int_trait(h, mathdx.CufftdxTraitType.ELEMENTS_PER_THREAD) + self.implicit_type_batching = get_int_trait(h, mathdx.CufftdxTraitType.IMPLICIT_TYPE_BATCHING) -class FFTNumba(FFTCompiled): - def __init__(self, **kwargs): - if "code_type" not in kwargs: - kwargs["code_type"] = get_default_code_type() + self.workspace_size = 0 + if FFT.execution == "Block": + self.block_dim: Dim3 | None = Dim3(*get_int_traits(h, mathdx.CufftdxTraitType.BLOCK_DIM, 3)) + self.shared_memory_size: int | None = get_int_trait(h, mathdx.CufftdxTraitType.SHARED_MEMORY_SIZE) + self.ffts_per_block: int | None = get_int_trait(h, mathdx.CufftdxTraitType.FFTS_PER_BLOCK) + else: + self.block_dim = None + self.shared_memory_size = None + self.ffts_per_block = None - FFTCompiled.__init__(self, **kwargs) - codegen( - { - "value_type": self.value_type, - "symbol": self._symbol, - "execute_api": self._execute_api, - "execution": self.execution, - }, - self, - ) +def compile_fft_execute( + fft: FFT, + code_type: Any, + execute_api: str | None = None, +) -> tuple[Code, str]: + code_type = parse_code_type(code_type) + check_code_type(code_type, "cuFFTDx") + validate_execute_api(fft.execution, execute_api) - def __call__(self, *args): - raise Exception("__call__ should not be called directly outside of a numba.cuda.jit(...) 
kernel.") + h = generate_FFT( + size=fft._size, + precision=fft._precision, + fft_type=fft._fft_type, + direction=fft._direction, + sm=fft._sm, + execution=fft._execution, + ffts_per_block=fft._ffts_per_block if fft._execution == "Block" else None, + elements_per_thread=fft._elements_per_thread if fft._execution == "Block" else None, + real_fft_options=frozenset(fft._real_fft_options.items()) if fft._real_fft_options else None, + execute_api=execute_api, + ).descriptor - @property - def value_type(self): - return NP_TYPES_TO_NUMBA_FE_TYPES[super(FFTCompiled, self).value_type] + code = generate_code(h, code_type.cc) - @property - def input_type(self): - return NP_TYPES_TO_NUMBA_FE_TYPES[super(FFTCompiled, self).input_type] + # Compile + lto_fn = get_lto(code.descriptor) + isa_version = get_isa_version(code.descriptor) - @property - def output_type(self): - return NP_TYPES_TO_NUMBA_FE_TYPES[super(FFTCompiled, self).output_type] + symbol = get_str_trait(h, mathdx.CufftdxTraitType.SYMBOL_NAME) - @property - def extensions(self): - raise NotImplementedError("Extensions not supported yet") + return Code(code_type, isa_version, lto_fn), symbol @docstring_decorator(FFTDX_DOCSTRING, skip_missing=False) -def fft(*, compiler=None, **kwargs): +def fft(*, compiler=None, code_type=None, execute_api=None, **kwargs): """ - Create an :class:`FFTOptions` object that encapsulates a compiled and ready-to-use FFT + Create an :class:`FFT` object that encapsulates a compiled and ready-to-use FFT device function. + .. deprecated:: 0.7.0 + Args: size (int): {size} @@ -518,9 +454,7 @@ def fft(*, compiler=None, **kwargs): fft_type (str): {fft_type} - compiler (str): {compiler} - - code_type (CodeType): {code_type}. Optional if compiler is specified as ``'numba'``. + sm (ComputeCapability): {sm} execution (str): {execution} @@ -537,10 +471,29 @@ def fft(*, compiler=None, **kwargs): ``'full'``. - ``'real_mode'``, currently supports ``'normal'`` and ``'folded'``. + compiler: {compiler} + + .. versionchanged:: 0.7.0 + compiler is no longer needed and does not take effect. Use + :py:func:`nvmath.device.compile_fft_execute` to get device + function code. + + code_type (CodeType): {code_type} + + .. versionchanged:: 0.7.0 + code_type should be used by + :py:func:`nvmath.device.compile_fft_execute` and no longer + needed for numba-cuda usage. + execute_api (str): {execute_api} - See Also: - The attributes of :class:`FFTOptions` provide a 1:1 mapping with the CUDA C++ + .. versionchanged:: 0.7.0 + execute_api should be used by + :py:func:`nvmath.device.compile_fft_execute` and no longer + needed for numba-cuda usage. + + .. seealso:: + The attributes of :class:`FFT` provide a 1:1 mapping with the CUDA C++ cuFFTDx APIs. For further details, please refer to `cuFFTDx documentation `_. @@ -548,8 +501,11 @@ def fft(*, compiler=None, **kwargs): Examples can be found in the `nvmath/examples/device `_ directory. """ - check_in("compiler", compiler, [None, "numba"]) - if compiler is None: - return FFTCompiled(**kwargs) - elif compiler == "numba": - return FFTNumba(**kwargs) + DeprecationWarning("fft is deprecated and will be removed in future releases. Please use FFT class directly.") + if code_type is not None: + DeprecationWarning("code_type is deprecated and will be removed in future releases. It is no longer needed.") + if compiler is not None: + DeprecationWarning("compiler is deprecated and will be removed in future releases. 
It is no longer needed.") + if execute_api is not None: + DeprecationWarning("execute_api is deprecated and will be removed in future releases. It is no longer needed.") + return FFT(**kwargs) diff --git a/nvmath/device/cufftdx_backend.py b/nvmath/device/cufftdx_backend.py index 7964746..0dffff8 100644 --- a/nvmath/device/cufftdx_backend.py +++ b/nvmath/device/cufftdx_backend.py @@ -10,7 +10,7 @@ from nvmath.device.common_backend import DescriptorWrapper from nvmath.device.common_cuda import ComputeCapability -from .common import check_contains, check_in, check_not_in, check_code_type +from .common import check_contains, check_in, check_not_in, check_sm from .common_backend import ( NP_TYPES_TO_MATHDX_PRECISION, EXECUTION_STR_TO_MATHDX, @@ -69,7 +69,7 @@ def validate( ffts_per_block, elements_per_thread, real_fft_options, - code_type, + sm, ): if size <= 0: raise ValueError(f"size must be > 0. Got {size}") @@ -100,7 +100,7 @@ def validate( check_contains(real_fft_options, "real_mode") check_in("real_fft_options['complex_layout']", real_fft_options["complex_layout"], ["natural", "packed", "full"]) check_in("real_fft_options['real_mode']", real_fft_options["real_mode"], ["normal", "folded"]) - check_code_type(code_type) + check_sm(sm, "sm") def validate_execute_api(execution: str, execute_api: str | None): @@ -120,7 +120,7 @@ def generate_FFT( precision, fft_type, direction, - code_type, + sm, execution, ffts_per_block, elements_per_thread, @@ -132,9 +132,14 @@ def generate_FFT( h = mathdx.cufftdx_create_descriptor() + # TODO: remove after migrating to libmathdx 0.2.4+ + if execute_api is None: + execute_api = "register_memory" + if execute_api is not None: mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.API, _FFT_API_STR_TO_MATHDX[execute_api]) + mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.SM, sm.major * 100 + sm.minor * 10) mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.SIZE, size) mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.PRECISION, NP_TYPES_TO_MATHDX_PRECISION[precision]) mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.TYPE, _FFT_TYPE_TO_MATHDX[fft_type]) @@ -150,9 +155,6 @@ def generate_FFT( mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.EXECUTION, EXECUTION_STR_TO_MATHDX[execution]) - if code_type: - mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.SM, code_type.cc.major * 100 + code_type.cc.minor * 10) - if ffts_per_block: mathdx.cufftdx_set_operator_int64(h, mathdx.CufftdxOperatorType.FFTS_PER_BLOCK, ffts_per_block) diff --git a/nvmath/device/cufftdx_numba.py b/nvmath/device/cufftdx_numba.py index f6255f2..75b1acc 100644 --- a/nvmath/device/cufftdx_numba.py +++ b/nvmath/device/cufftdx_numba.py @@ -2,122 +2,194 @@ # # SPDX-License-Identifier: Apache-2.0 -from numba import types -from numba.core.typing import signature -from numba.extending import intrinsic, overload -from .common_numba import NUMBA_FE_TYPES_TO_NUMBA_IR, make_function_call -from .common import check_in +from numba import cuda +from numba.core import typing, cgutils +from numba.extending import typeof_impl, overload_method, types, utils, overload +from numba.cuda.cudaimpl import lower_constant, registry as cuda_registry +from numba.cuda.models import register_model -# -# Lowering down to function call to cuFFTDx -# io = 'thread' or 'smem' -# execution = 'Thread' or 'Block' -# value_type = real or complex numpy type of the input/output thread-private and shared -# memory -# symbols = name of the 
function, as a string -# -def make_codegen(io, execution, value_type, symbol): - check_in("io", io, ["thread", "smem"]) # input in thread-private vs shared memory - check_in("execution", execution, ["Thread", "Block"]) # Thread() or Block() APIs - - array_type = types.Array(value_type, 1, "C") - return_type = types.void - - # Thread() APIs only work on a single thread-private array - # no shared memory, no workspace - # (void) ( (value_type*)thread array ) +from nvmath.device.common_cuda import get_default_code_type +from nvmath.device.cufftdx import FFT, compile_fft_execute +from .common_numba import ( + NUMBA_FE_TYPES_TO_NUMBA_IR, + declare_cabi_device, + get_array_ptr, + overload_type_attribute, + EmptyStructModel, +) - if execution == "Thread" and io == "thread": - return signature(return_type, array_type), make_function_call(symbol) - # Block() APIs have two variants - # (void) ( (value_type*)thread array, (value_type*)shared memory array ) # noqa: W505 - # (void) ( (value_type*)shared memory array ) # noqa: W505 +_FFT_DEFINITION_ARGS = [ + "size", + "precision", + "fft_type", + "execution", + "sm", + "direction", + "ffts_per_block", + "elements_per_thread", + "real_fft_options", +] - elif execution == "Block" and io == "thread": - codegen = make_function_call(symbol) +_FFT_COMPILED_ARGS = [ + "value_type", + "input_type", + "output_type", + "storage_size", + "shared_memory_size", + "stride", + "block_dim", + "implicit_type_batching", +] - def wrap_codegen(context, builder, sig, args): - assert len(args) == 2 - assert len(sig.args) == 2 - codegen(context, builder, sig, [args[0], args[1]]) - return signature(return_type, array_type, array_type), wrap_codegen +class FFTType(types.Type): + """ + Type class associated with the `cufftdx.FFT`. + """ - elif execution == "Block" and io == "smem": - codegen = make_function_call(symbol) + def __init__(self, fft: FFT): + assert isinstance(fft, FFT) + self._fft = fft + attributes = [f"{attr}={getattr(fft, attr)}" for attr in _FFT_DEFINITION_ARGS if getattr(fft, attr)] + attributes.sort() - def wrap_codegen(context, builder, sig, args): - assert len(args) == 1 - assert len(sig.args) == 1 - codegen(context, builder, sig, args) + self.name = "FFT(" + ",".join(attributes) + ")" - return signature(return_type, array_type), wrap_codegen + @property + def fft(self) -> FFT: + return self._fft -def codegen(description, func_to_overload): - execution = description["execution"] - execute_api = description["execute_api"] - - check_in("execution", execution, ["Block", "Thread"]) - - if execution == "Thread": - codegen_thread(description, func_to_overload) - else: - assert execution == "Block" +register_model(FFTType)(EmptyStructModel) - if execute_api == "register_memory": - codegen_block_lmem(description, func_to_overload) - else: - assert execute_api == "shared_memory" - codegen_block_smem(description, func_to_overload) +@lower_constant(FFTType) +def constant_dummy(context, builder, typ, pyval): + struct_ptr = cgutils.create_struct_proxy(typ)(context, builder) + return struct_ptr._getvalue() -def codegen_thread(description, func_to_overload): - value_type = NUMBA_FE_TYPES_TO_NUMBA_IR[description["value_type"]] - symbol = description["symbol"] +@typeof_impl.register(FFT) +def typeof_fft_numba(val: FFT, c: typing.Context) -> FFTType: + return FFTType(val) - @intrinsic - def intrinsic_1(typingctx, thread): - return make_codegen("thread", "Thread", value_type, symbol) - @overload(func_to_overload, target="cuda") - def fft(thread): - def impl(thread): - return 
intrinsic_1(thread) +for attribute in _FFT_DEFINITION_ARGS + _FFT_COMPILED_ARGS: + overload_type_attribute(FFTType, "fft", attribute) - return impl +# Numba does not support method overload or variadic arguments, so we using +# default values as a workaround +# https://github.com/numba/numba/issues/9980 +# https://github.com/numba/numba/issues/9979 +# https://github.com/numba/numba/issues/10143 +@overload_method(FFTType, "execute", target="cuda", jit_options={"forceinline": True}, strict=False) +def ol_fft_numba_execute(fft_numba: FFTType, _arg1, _arg2=None): + return ol_fft_numba(fft_numba, _arg1, _arg2) -def codegen_block_lmem(description, func_to_overload): - value_type = NUMBA_FE_TYPES_TO_NUMBA_IR[description["value_type"]] - symbol = description["symbol"] - @intrinsic - def intrinsic_2(typingctx, thread, smem): - return make_codegen("thread", "Block", value_type, symbol) +@overload_method(FFTType, "__call__", target="cuda", strict=False) +def ol_fft_numba_call(fft_numba: FFTType, _arg1, _arg2=None): + return ol_fft_numba(fft_numba, _arg1, _arg2) - @overload(func_to_overload, target="cuda") - def fft(thread, smem): - def impl(thread, smem): - return intrinsic_2(thread, smem) - return impl - - -def codegen_block_smem(description, func_to_overload): - value_type = NUMBA_FE_TYPES_TO_NUMBA_IR[description["value_type"]] - symbol = description["symbol"] - - @intrinsic - def intrinsic_1(typingctx, smem): - return make_codegen("smem", "Block", value_type, symbol) - - @overload(func_to_overload, target="cuda") - def fft(smem): - def impl(smem): - return intrinsic_1(smem) - - return impl +def ol_fft_numba(fft_numba: FFTType, _arg1, _arg2=None): + if _arg2 in {None, types.Omitted(None)}: + return lambda _, smem, _arg2=None: _fft_type___call__(_, smem) + else: + return lambda _, thread_data, smem: _fft_type___call__(_, thread_data, smem) + + +# TODO: use overload_method when supported +def _fft_type___call__(*args): + raise Exception("Stub for overloads") + + +@overload(_fft_type___call__, jit_options={"forceinline": True}, strict=False) +def ol_fft_type___call___rmem( + fft_numba: FFTType, + thread_data: types.Array, +): + if not isinstance(fft_numba, FFTType): + return + if not isinstance(thread_data, types.Array): + return + FFT = fft_numba.fft + value_type = NUMBA_FE_TYPES_TO_NUMBA_IR[FFT.value_type] + if thread_data.dtype != value_type: + return + + code, symbol = compile_fft_execute( + FFT, + code_type=get_default_code_type(), + execute_api="shared_memory" if FFT.execution == "Block" else None, + ) + + lto = cuda.LTOIR(code.data) + + sig = types.void(types.CPointer(value_type)) + fft_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(_, thread_data): + tptr = get_array_ptr(thread_data) + fft_device_func(tptr) + + return impl + + +@overload(_fft_type___call__, jit_options={"forceinline": True}, strict=False) +def ol_fft_type___call___smem( + fft_numba: FFTType, + thread_data: types.Array, + smem: types.Array, +): + if not isinstance(fft_numba, FFTType): + return + if not isinstance(thread_data, types.Array): + return + if not isinstance(smem, types.Array): + return + FFT = fft_numba.fft + value_type = NUMBA_FE_TYPES_TO_NUMBA_IR[FFT.value_type] + if smem.dtype != value_type: + return + if thread_data.dtype != value_type: + return + + code, symbol = compile_fft_execute( + FFT, + code_type=get_default_code_type(), + execute_api="register_memory" if FFT.execution == "Block" else None, + ) + + lto = cuda.LTOIR(code.data) + + value_type = 
NUMBA_FE_TYPES_TO_NUMBA_IR[FFT.value_type] + sig = types.void(types.CPointer(value_type), types.CPointer(value_type)) + fft_device_func = declare_cabi_device(symbol, sig, link=lto) + + def impl(_, thread_data, smem): + tptr = get_array_ptr(thread_data) + sptr = get_array_ptr(smem) + fft_device_func(tptr, sptr) + + return impl + + +# __call__ overload is not supported by numba, however adding this overload +# kind of activates proper behaviour and works like magic. +# Issue reference: https://github.com/numba/numba/issues/5885 +# TODO: remove once supported +@cuda_registry.lower(FFTType, FFTType, types.VarArg(types.Any)) +def method_impl(context, builder, sig, args): + typing_context = context.typing_context + fnty = typing_context.resolve_value_type(ol_fft_numba_call) + sig = fnty.get_call_type(typing_context, sig.args, {}) + sig = sig.replace(pysig=utils.pysignature(ol_fft_numba_call)) + + call = context.get_function(fnty, sig) + # Link dependent library + context.add_linking_libs(getattr(call, "libs", ())) + return call(builder, args) diff --git a/nvmath/device/curand_kernel.py b/nvmath/device/curand_kernel.py index d446ed8..72bf5f4 100644 --- a/nvmath/device/curand_kernel.py +++ b/nvmath/device/curand_kernel.py @@ -7,7 +7,7 @@ # Automatically generated by Numbast Static Binding Generator import os -from nvmath.device.common_mathdx import CURAND_HOME +from cuda import pathfinder # Imports: from numba.types import uint32 @@ -7791,7 +7791,7 @@ class _typing___get_mrg32k3a_matrix(ConcreteTemplate): # Shim functions: -curand_kernel_header = os.path.join(CURAND_HOME, 'curand_kernel.h') +curand_kernel_header = os.path.join(pathfinder.find_nvidia_header_directory("curand"), 'curand_kernel.h') c_ext_shim_source = CUSource("""#include <%s> extern "C" __device__ int diff --git a/nvmath/device/random.py b/nvmath/device/random.py index efa40fe..016054b 100644 --- a/nvmath/device/random.py +++ b/nvmath/device/random.py @@ -17,7 +17,7 @@ from nvmath.device import curand_kernel, random_helpers from nvmath.device import random_states as states from cuda.core.experimental import ObjectCode, Program, ProgramOptions -from .common_mathdx import CUDA_HOME +from cuda import pathfinder # Common APIs (initialization, bit generation). 
_COMMON_APIS = ["init", "rand", "rand4"] @@ -406,7 +406,10 @@ def __init__(self, cc: nvmath.device.ComputeCapability | None = None): link_time_optimization=True, gen_opt_lto=True, relocatable_device_code=True, - include_path=[h + "/include" for h in CUDA_HOME] + list(CUDA_HOME) if CUDA_HOME is not None else [], + include_path=[ + pathfinder.find_nvidia_header_directory("cudart"), + pathfinder.find_nvidia_header_directory("cccl"), + ], ), ) obj = prog.compile("ltoir") diff --git a/nvmath/device/random_states.py b/nvmath/device/random_states.py index edcad17..cf5091b 100644 --- a/nvmath/device/random_states.py +++ b/nvmath/device/random_states.py @@ -5,7 +5,6 @@ import operator from nvmath.device import curand_kernel -from nvmath.device.common_mathdx import CURAND_HOME # noqa: F401 from numba import cuda, types from numba.extending import models, register_model, typeof_impl diff --git a/nvmath/device/types.py b/nvmath/device/types.py index 250951d..ea841b5 100644 --- a/nvmath/device/types.py +++ b/nvmath/device/types.py @@ -2,10 +2,145 @@ # # SPDX-License-Identifier: Apache-2.0 +__all__ = [ + "Complex", + "Vector", + "complex32", + "complex64", + "complex128", + "half2", + "half4", + "np_float16x2", + "np_float16x4", + "REAL_NP_TYPES", + "INT_NP_TYPES", +] + import numpy as np +import warnings + +from ._deprecated import deprecated np_float16x2 = np.dtype([("x", np.float16), ("y", np.float16)], align=True) np_float16x4 = np.dtype([("x", np.float16), ("y", np.float16), ("z", np.float16), ("w", np.float16)], align=True) REAL_NP_TYPES: list = [np.float16, np.float32, np.float64] INT_NP_TYPES: list = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64] + +_alignment_warning_msg = ( + "You are using the host counterpart of a dtype that is under-aligned " + "compared to what the device function expects. In most cases this will " + "work, since device memory is typically allocated with at least 256-byte " + "alignment. For details, please review the alignment guidelines at " + "https://nvidia.com/docs/nvmath/alignment" +) + + +class Complex: + """ + Complex type that can be used to represent complex numbers both + on host and device side. Numpy does not provide a built-in complex type with + 16-bit real and imaginary parts, so we define our own dtype for that case. + For 32-bit and 64-bit complex numbers, we can use the built-in numpy dtypes. + However on device side we expect those types to be aligned to the full size + of the complex type, so the array defined on host and device side will have + different type and alignment. :py:const:`np_float16x2`, + :py:const:`numpy.dtype(numpy.complex64)` and + :py:const:`numpy.dtype(numpy.complex128)` are the host side dtypes and + :py:const:`float16x2_type`, :py:const:`float32x2_type` and + :py:const:`float64x2_type` are the device side types. 
+ """ + + def __init__(self, real_dtype): + self._real_dtype = real_dtype + + @property + def real_dtype(self): + return self._real_dtype + + @property + def dtype(self): + warnings.warn(_alignment_warning_msg, UserWarning, stacklevel=2) + if self._real_dtype == np.float16: + return np_float16x2 + elif self._real_dtype == np.float32: + return np.dtype(np.complex64) + assert self._real_dtype == np.float64 + return np.dtype(np.complex128) + + @property + def _numba_type(self): + from .vector_types_numba import float16x2_type, float32x2_type, float64x2_type + + if self.real_dtype == np.float16: + return float16x2_type + if self.real_dtype == np.float32: + return float32x2_type + assert self.real_dtype == np.float64 + return float64x2_type + + @property + @deprecated("This is a numba fallback behavior and will be removed in future releases, please use numba types directly") + def make(self): + return self._numba_type.make + + +complex32 = Complex(np.float16) +complex64 = Complex(np.float32) +complex128 = Complex(np.float64) + + +class Vector: + """ + Vector type that can be used to represent vector numbers both + on host and device side. Host side representation uses numpy structured + dtypes to represent the vector components, while device side representation + uses custom numba types. This difference is necessary because device + functions expect alignment of the vector types to be the same as the size of + the vector, which is not the case for numpy structured dtypes. + :py:const:`np_float16x2` and :py:const:`np_float16x4` are the host side + dtypes and :py:const:`float16x2_type` and :py:const:`float16x4_type` are the + device side types. + """ + + def __init__(self, real_dtype, size): + if size not in (2, 4): + raise ValueError(f"Unsupported vector size {size}, only 2 and 4 are supported") + if real_dtype != np.float16: + raise ValueError(f"Unsupported vector real dtype {real_dtype}, only float16 is supported") + self._real_dtype = real_dtype + self._size = size + + @property + def real_dtype(self): + return self._real_dtype + + @property + def size(self): + return self._size + + @property + def dtype(self): + warnings.warn(_alignment_warning_msg, UserWarning, stacklevel=2) + if self._size == 2: + return np_float16x2 + assert self._size == 4 + return np_float16x4 + + @property + def _numba_type(self): + from .vector_types_numba import float16x2_type, float16x4_type + + if self._size == 2: + return float16x2_type + assert self._size == 4 + return float16x4_type + + @property + @deprecated("This is a numba fallback behavior and will be removed in future releases, please use numba types directly") + def make(self): + return self._numba_type.make + + +half2 = Vector(np.float16, 2) +half4 = Vector(np.float16, 4) diff --git a/nvmath/distributed/__init__.py b/nvmath/distributed/__init__.py index 6349514..27e2933 100644 --- a/nvmath/distributed/__init__.py +++ b/nvmath/distributed/__init__.py @@ -11,15 +11,22 @@ raise ImportError("nvmath.distributed requires mpi4py for bootstrapping.") from e import atexit +import numpy as np import re +from collections.abc import Sequence from dataclasses import dataclass from threading import Lock +from typing import Literal from ._internal import nvshmem from ._utils import allocate_symmetric_memory, free_symmetric_memory -from nvmath.distributed import fft -from nvmath.distributed import reshape +from . 
import distribution + +from nvmath.bindings import nccl # type: ignore +from nvmath.internal.utils import device_ctx + +from nvmath.distributed import fft, linalg, reshape # noqa: E402 __all__ = [ "initialize", @@ -27,7 +34,9 @@ "get_context", "allocate_symmetric_memory", "free_symmetric_memory", + "distribution", "fft", + "linalg", "reshape", ] @@ -42,23 +51,49 @@ class DistributedContext: Context of initialized nvmath.distributed runtime. Attributes: - device_id: CUDA device ID associated with the distributed runtime. + device_id: CUDA device ID associated with the distributed runtime + on this process. + communicator: MPI communicator of participating processes. + + nvshmem_available: True if NVSHMEM backend was selected at initialization. + + nccl_comm: pointer to NCCL communicator if NCCL backend was selected at + initialization, None otherwise. """ device_id: int communicator: mpi4py.MPI.Comm + nvshmem_available: bool + nccl_comm: int | None + +def initialize( + device_id: int, + communicator: mpi4py.MPI.Comm, + backends: Sequence[Literal["nvshmem", "nccl"]], +) -> None: + """Initialize nvmath.distributed runtime. This is required before any distributed + operations can be performed. **Note that this is a collective operation and must be + called by all processes.** -def initialize(device_id: int, communicator: mpi4py.MPI.Comm | None = None) -> None: - """Initialize nvmath.distributed. This is required before any distributed operations can - be performed. **Note that this is a collective operation and must be called by all - processes.** + If the runtime is already initialized this function will raise an error. If you need + to reinitialize the runtime (for example with different backends) you have to finalize + it first. + + NCCL doesn't allow assigning more than one process to the same GPU. Args: - device_id: CUDA device ID to associate with the nvmath.distributed runtime. + device_id: CUDA device ID to associate with the nvmath.distributed runtime on this + process. + communicator: MPI communicator specifying the participating processes. If None, will - use MPI.COMM_WORLD. + use MPI.COMM_WORLD. MPI is used for setup and not for communication + during compute. + + backends: Communication backends to use in distributed computations. Valid values + are "nvshmem" and "nccl". Note that specific libraries (cuFFTMp, cuBLASMp, ...) + have specific required backends. """ if not isinstance(device_id, int): raise TypeError( @@ -66,6 +101,14 @@ def initialize(device_id: int, communicator: mpi4py.MPI.Comm | None = None) -> N f"must be an integer. The provided device ID is {device_id}." ) + valid_backends = ("nvshmem", "nccl") + for backend in backends: + if backend not in valid_backends: + raise ValueError(f"backend must be one of {valid_backends}, got {backend}") + + if len(backends) == 0: + raise ValueError("Need to specify at least one backend") + with _initialize_mutex: global _atexit_registered, _ctx @@ -95,19 +138,47 @@ def initialize(device_id: int, communicator: mpi4py.MPI.Comm | None = None) -> N f"got object of type {type(communicator)}." ) - nvshmem.initialize(device_id, communicator) - - _ctx = DistributedContext(device_id=device_id, communicator=communicator) + rank = communicator.Get_rank() + nranks = communicator.Get_size() + + nvshmem_available = False + if "nvshmem" in backends: + nvshmem.initialize(device_id, communicator) + nvshmem_available = True + + nccl_comm = None + if "nccl" in backends: + # Create NCCL communicator. 
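+            # Bootstrap pattern: rank 0 obtains a fresh unique ID from NCCL, the ID
+            # is broadcast to every rank over the MPI communicator, and each rank
+            # then joins the same NCCL communicator on its own device.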
+ unique_id = nccl.UniqueId() + if rank == 0: + nccl.get_unique_id(unique_id.ptr) + # PE 0 broadcasts the unique ID. + communicator.Bcast(unique_id._data.view(np.int8), root=0) + with device_ctx(device_id): + nccl_comm = nccl.comm_init_rank(nranks, unique_id.ptr, rank) + + _ctx = DistributedContext( + device_id=device_id, communicator=communicator, nvshmem_available=nvshmem_available, nccl_comm=nccl_comm + ) def finalize() -> None: - """Finalize nvmath.distributed runtime. **Note that this is a collective operation and + """Finalize nvmath.distributed runtime (this is called automatically at exit + if the runtime is initialized). **Note that this is a collective operation and must be called by all processes.**""" global _ctx with _initialize_mutex: if _ctx is None: return - nvshmem.finalize(_ctx.device_id) + + linalg.advanced.matmulmod._grid_cache.clear() + + if _ctx.nccl_comm is not None: + with device_ctx(_ctx.device_id): + nccl.comm_destroy(_ctx.nccl_comm) + + if _ctx.nvshmem_available: + nvshmem.finalize(_ctx.device_id) _ctx = None diff --git a/nvmath/distributed/distribution.py b/nvmath/distributed/distribution.py new file mode 100644 index 0000000..600d505 --- /dev/null +++ b/nvmath/distributed/distribution.py @@ -0,0 +1,768 @@ +from __future__ import annotations # allows typehint of class methods to return the self class + +import copy +import math +from abc import ABC, abstractmethod +from collections.abc import Sequence +from typing import cast, TypeAlias + + +from nvmath.bindings import cufftMp # type: ignore +from nvmath.bindings import cublasMp # type: ignore + +import nvmath.distributed as dist + +__all__ = ["ProcessGrid", "Distribution", "Slab", "Box", "BlockCyclic", "BlockNonCyclic"] + + +class ProcessGrid: + """ + N-dimensional grid of processes used by some distributions like the PBLAS block-cyclic + distribution. + + Example 2D process grid for 4 processes, with processes arranged in column-major order:: + + --------- + | 0 | 2 | + --------- + | 1 | 3 | + --------- + """ + + Layout: TypeAlias = cublasMp.GridLayout + + def __init__( + self, + *, + shape: Sequence[int] | None = None, + layout: ProcessGrid.Layout | None = None, + process_array=None, + ): + """ + Create a new ProcessGrid object. + + Args: + shape: Shape of the process grid. + + layout: Layout of the process grid (column-major or row-major). This is optional + for 1D grid or when a custom grid is provided. + + process_array: optional ndarray specifying custom arrangement of processes. + """ + self._nranks = _get_communicator().Get_size() + + if process_array is None: + if shape is None: + raise ValueError("shape must be provided when process_array=None") + self._shape = tuple(shape) + if layout is None: + if self._is_1d_distribution(): + layout = ProcessGrid.Layout.ROW_MAJOR # layout doesn't matter in this case. + else: + raise ValueError("layout must be provided when process_array=None and partitioning on multiple dimensions") + if not isinstance(layout, ProcessGrid.Layout): + raise TypeError(f"layout must be of type ProcessGrid.Layout, got {layout}") + self._layout = layout + self._process_array = None + else: + self._shape = tuple(process_array.shape) + if shape is not None and self._shape != tuple(shape): + raise ValueError(f"shape {shape} and process_array.shape ({process_array.shape}) don't match") + # TODO: Can set layout to COL_MAJOR or ROW_MAJOR automatically if the + # process_array matches. 
+ if layout is not None: + raise NotImplementedError + self._layout = None + self._process_array = process_array + + if math.prod(self._shape) != self._nranks: + raise ValueError( + f"Number of grid elements ({math.prod(self._shape)}) must equal the number of processes ({self._nranks})" + ) + + @property + def shape(self) -> tuple[int, ...]: + """Shape of process grid.""" + return self._shape + + @property + def layout(self) -> ProcessGrid.Layout | None: + """Layout of process grid if row-major or column-major, otherwise None.""" + return self._layout + + @property + def process_array(self): + return self._process_array + + def __str__(self): + return f"ProcessGrid(shape={self._shape}, layout={self._layout.name}, process_array={self._process_array})" + + def __hash__(self): + # NOTE: The layout isn't considered for the hash, to allow process grids + # partitioned on a single dimension with the same shape but different layout + # to be the same dictionary key. + return hash(self._shape) + + def __eq__(self, other): + if self._process_array is None and other._process_array is None: + if self._shape == other._shape: + return self._is_1d_distribution() or self._layout == other._layout + return False + raise NotImplementedError + + def _is_1d_distribution(self) -> bool: + """True if process grid partitions on a single dimension.""" + return self._nranks in self._shape + + def _is_row_wise(self) -> bool: + """True if 2D process grid partitioned only on rows.""" + return self._shape == (self._nranks, 1) + + def _is_col_wise(self) -> bool: + """True if 2D process grid partitioned only on columns.""" + return self._shape == (1, self._nranks) + + +class BindDistributionError(Exception): + pass + + +class ConvertDistributionError(Exception): + """Errors converting a distribution instance to another distribution type""" + + pass + + +class Distribution(ABC): + """Specifies how a tensor is distributed across processes.""" + + def __init__(self): + self._bound = False + + @property + @abstractmethod + def ndim(self) -> int | None: + """The number of dimensions of a distributed tensor for which this distribution + applies; None if it doesn't apply to any specific number of dimensions.""" + raise NotImplementedError + + @abstractmethod + def shape(self, rank: int, global_shape: Sequence[int] | None = None) -> tuple[int, ...]: + """Get the local shape of data on the given rank according to this distribution. + + Args: + rank: the process rank for which to calculate the local shape. + + global_shape: Global shape of data. Required if the distribution is + not bound to a global shape, otherwise not required. + """ + raise NotImplementedError + + @abstractmethod + def to( + self, + cls: type[Distribution], + /, + *, + ndim: int | None = None, + copy: bool = False, + ) -> Distribution: + """Convert this distribution object to an equivalent distribution of the given type. + + Args: + cls: the target distribution type. + + ndim: dimensionality of the target distribution. Must be compatible with the + dimensionality of the source distribution. This may be required if the + source distribution doesn't have associated dimensionality. + + copy: Returns a copy if the source and target type are the same. + + Raises: + ConvertDistributionError: if the conversion is not possible. 
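+
+        A minimal illustrative sketch (assumes nvmath.distributed has been
+        initialized, since the conversion consults the size of the process grid):
+
+            >>> from nvmath.distributed.distribution import Slab, BlockNonCyclic
+            >>> block = Slab.X.to(BlockNonCyclic, ndim=2)
+            >>> slab = block.to(Slab)  # convert back to a Slab distribution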
+ """ + raise NotImplementedError + + def _to_checks(self, cls: type[Distribution], ndim: int | None) -> None: + if cls is Distribution or not issubclass(cls, Distribution): + raise ValueError(f"{cls} is not a valid distribution") + if ndim is not None and self.ndim is not None and ndim != self.ndim: + raise ValueError(f"ndim argument ({ndim}) doesn't match this distribution's dimensionality ({self.ndim})") + + @abstractmethod + def _bind( + self, + global_shape: Sequence[int], + *, + shape: Sequence[int] | None = None, + ) -> Distribution: + """Binds this distribution object to a global shape, which determines how a + distributed tensor with that shape must be partitioned among processes (the local + shape on each process). You can also provide the local shape on this process to + check if it fits the distribution (the function will raise an exception if not). + **The exception may be raised on some ranks but not others (it's up to the caller + to handle this)**. + + Args: + global_shape: global shape of the data. + + shape: shape of the data on this process. + + Returns: + self + + Raises: + BindDistributionError: if distribution is already bound or local shape doesn't + fit the distribution. + """ + raise NotImplementedError + + def _binding_str(self): + return f"[bound: global_shape={self._data_global_shape}, shape={self._data_shape}]" if self._bound else "" + + def copy(self) -> Distribution: + """This is a common implementation for those distributions that only require + a shallow copy.""" + return copy.copy(self) + + def _local_shape_checks(self, rank: int, nranks: int, global_shape: Sequence[int] | None = None): + if not isinstance(rank, int): + raise ValueError(f"rank must be an integer, got {rank}") + + if rank < 0 or rank > nranks - 1: + raise ValueError(f"This is not a valid process rank: got rank={rank} with nranks={nranks}") + + if global_shape is None and not self._bound: + raise RuntimeError("This distribution is unbound: please specify a global shape") + + if global_shape is not None and self._bound and tuple(global_shape) != self._data_global_shape: # type: ignore + raise ValueError( + "This distribution is already bound to a different global shape: provided " + f"{global_shape}, bound to {self._data_global_shape}" # type: ignore + ) + + +class Slab(Distribution): + """ + Slab distribution + + Data is partitioned across processes on a single axis, such that: + + - The shape of the slab on the first s_p % P processes is + (s_0, ..., s_p // P + 1, ..., s_{n-1}) + - The shape of the slab on the remaining processes is (s_0, ..., s_p // P, ..., s_{n-1}) + - Process 0 owns the first slab according to the global index order, process 1 owns + the second slab and so on. 
+ + where: + + - s_i is the size of dimension i of the global array + - p is the partition dimension + - n is the number of dimensions of the array + - P is the number of processes + """ + + X: Slab + """Slab distribution on axis 0.""" + + Y: Slab + """Slab distribution on axis 1.""" + + Z: Slab + """Slab distribution on axis 2.""" + + def __init__( + self, + partition_dim: int, + ndim: int | None = None, + ): + super().__init__() + if not isinstance(partition_dim, int) or partition_dim < 0: + raise ValueError(f"partition_dim must be integer >= 0, got {partition_dim}") + + if ndim is not None: + if not isinstance(ndim, int) or ndim < 1: + raise ValueError(f"ndim must be integer >= 1, got {ndim}") + if partition_dim >= ndim: + raise ValueError("partition_dim must be < ndim") + + self._partition_dim = partition_dim + self._ndim = ndim + + def __eq__(self, other): + if not isinstance(other, Slab): + return False + if self._partition_dim != other._partition_dim: + return False + if self._ndim is None or other._ndim is None: + return True + return self._ndim == other._ndim + + def __hash__(self): + return self._partition_dim + + def __str__(self): + return f"Slab(partition_dim={self._partition_dim}, ndim={self._ndim})" + self._binding_str() + + @property + def name(self) -> str: + match self._partition_dim: + case 0: + return "Slab.X" + case 1: + return "Slab.Y" + case 2: + return "Slab.Z" + case _: + return "Slab" + + @property + def partition_dim(self) -> int: + """Slab partition dimension""" + return self._partition_dim + + @property + def ndim(self) -> int | None: + return self._ndim + + def shape(self, rank: int, global_shape: Sequence[int] | None = None) -> tuple[int, ...]: + comm = _get_communicator() + n = comm.Get_size() + self._local_shape_checks(rank, n, global_shape) + if global_shape is None: + global_shape = self._data_global_shape + + S = global_shape[self._partition_dim] + partition_dim_local_size = S // n + bool(rank < S % n) + slab_shape = list(global_shape) + slab_shape[self._partition_dim] = partition_dim_local_size + return tuple(slab_shape) + + @property + def _cufftmp_value(self): + if self._partition_dim not in (0, 1): + raise TypeError(f"Unsupported distribution {self} for cuFFTMp: partition dimension must be X or Y") + return cufftMp.XtSubFormat.FORMAT_INPLACE if self._partition_dim == 0 else cufftMp.XtSubFormat.FORMAT_INPLACE_SHUFFLED + + def _bind(self, global_shape, *, shape=None) -> Slab: + if self._bound: + raise BindDistributionError(f"{self} is already bound") + + if self._ndim is not None and len(global_shape) != self._ndim: + raise ValueError(f"The given shape doesn't have the same dimensionality as this {self} distribution") + + if self._partition_dim >= len(global_shape): + raise ValueError("partition_dim must be < ndim") + + comm = _get_communicator() + rank = comm.Get_rank() + slab_shape = self.shape(rank, global_shape) + + if shape is not None and tuple(shape) != slab_shape: + raise BindDistributionError( + f"The given shapes (global_shape={global_shape}, shape={shape}) don't fit distribution {str(self)}" + ) + + self._ndim = len(global_shape) + self._data_global_shape = tuple(global_shape) + self._data_shape = slab_shape + self._bound = True + return self + + def to(self, cls, /, *, ndim=None, copy=False): + super()._to_checks(cls, ndim) + nranks = _get_communicator().Get_size() + + if ndim is None: + ndim = self._ndim + elif self._partition_dim >= ndim: + raise ValueError(f"ndim ({ndim}) must be greater than the partition dimension 
({self._partition_dim})") + + if cls is Slab: + if copy or (ndim is not None and self._ndim is None): + d = cast(Slab, self.copy()) + d._ndim = ndim + return d + return self + elif issubclass(cls, BlockCyclic): + if ndim is None: + raise ConvertDistributionError("Can't convert Slab distribution to BlockCyclic: unknown dimensionality") + + process_grid_shape = tuple(1 if x != self._partition_dim else nranks for x in range(ndim)) + # layout doesn't matter when partitioning on a single axis. + process_grid = ProcessGrid(shape=process_grid_shape, layout=ProcessGrid.Layout.ROW_MAJOR) + if not self._bound: + return BlockNonCyclic(process_grid) + else: + b = BlockNonCyclic(process_grid) + return b._bind(self._data_global_shape, shape=self._data_shape) + elif cls is Box: + raise NotImplementedError + + +# Define alternate forms for the user to specify Slab on X, Y or Z. +# NOTE: dimensionality is left unspecified when using these (but will be set when the +# distribution is bound to data). +Slab.X = Slab(0) +Slab.Y = Slab(1) +Slab.Z = Slab(2) + + +class Box(Distribution): + """Box distribution""" + + def __init__( + self, + lower: Sequence[int], + upper: Sequence[int], + ): + super().__init__() + if len(lower) != len(upper): + raise ValueError("lower and upper coordinates must have the same dimensionality") + for coords in (lower, upper): + if not all(isinstance(x, int) for x in coords): + raise ValueError("lower and upper coordinates must be integer") + if not all(upper[i] > lower[i] for i in range(len(upper))): + raise ValueError( + f"The upper coordinates must be larger than the lower coordinates, but got lower={lower} upper={upper}" + ) + self._lower = tuple(lower) + self._upper = tuple(upper) + + @property + def lower(self) -> tuple[int, ...]: + """Box lower coordinates""" + return self._lower + + @property + def upper(self) -> tuple[int, ...]: + """Box upper coordinates""" + return self._upper + + @property + def ndim(self) -> int: + return len(self._lower) + + def shape(self, rank: int, global_shape: Sequence[int] | None = None) -> tuple[int, ...]: + comm = _get_communicator() + if rank != comm.Get_rank(): + raise RuntimeError("Can't calculate local shape of peer process with Box distribution") + nranks = comm.Get_size() + self._local_shape_checks(rank, nranks, global_shape) + return tuple(self._upper[i] - self._lower[i] for i in range(self.ndim)) + + def __str__(self): + return f"Box(lower={self._lower}, upper={self._upper})" + self._binding_str() + + def __eq__(self, other): + if not isinstance(other, Box): + return False + return self._lower == other._lower and self._upper == other._upper + + def __hash__(self): + return hash((self._lower, self._upper)) + + def __iter__(self): + # To allow unpacking + yield self._lower + yield self._upper + + def __getitem__(self, index): + if index not in (0, 1): + return IndexError(f"Index must be 0 or 1, got {index}") + return self._lower if index == 0 else self._upper + + def _bind(self, global_shape, *, shape=None) -> Distribution: + if self._bound: + raise BindDistributionError(f"{self} is already bound") + my_shape = tuple(self._upper[i] - self._lower[i] for i in range(self.ndim)) + if shape is not None and tuple(shape) != my_shape: + raise BindDistributionError( + f"The given shapes don't fit this Box distribution: {global_shape} " + f"and {shape}, lower={self._lower} upper={self._upper}" + ) + self._data_global_shape = tuple(global_shape) + self._data_shape = my_shape + self._bound = True + return self + + def to(self, cls, /, *, ndim=None, 
copy=False): + super()._to_checks(cls, ndim) + if cls is Box: + return self.copy() if copy else self + raise NotImplementedError + + +class BlockCyclic(Distribution): + """Block-cyclic distribution""" + + def __init__( + self, + process_grid: ProcessGrid, + block_sizes: Sequence[int], + *, + first_process: Sequence[int] | None = None, + ): + super().__init__() + if block_sizes is None or not all(isinstance(x, int) for x in block_sizes): + raise ValueError(f"Must provide a sequence of integer block sizes, got {block_sizes}") + if len(block_sizes) != len(process_grid.shape): + raise ValueError( + f"Number of block sizes ({len(block_sizes)}) doesn't match dimensionality ({len(process_grid.shape)})" + ) + self._process_grid = process_grid + self._block_sizes = tuple(block_sizes) + if first_process is None: + self._first_process = (0,) * self.ndim + else: + if not all(isinstance(x, int) for x in first_process): + raise ValueError(f"first_process must be a sequence of integer coordinates, got {first_process}") + for i, x in enumerate(first_process): + if x < 0 or x >= process_grid.shape[i]: + raise ValueError( + f"first_process {first_process} is not a valid index into the process grid of " + f"shape {process_grid.shape}" + ) + self._first_process = tuple(first_process) + + @property + def process_grid(self) -> ProcessGrid: + """The process grid of this BlockCyclic distribution""" + return self._process_grid + + @property + def ndim(self) -> int: + return len(self._process_grid.shape) + + @property + def block_sizes(self) -> tuple[int, ...]: + """The block sizes of this BlockCyclic distribution""" + return self._block_sizes + + @property + def first_process(self) -> tuple[int, ...]: + """Index in the process grid of the process who owns the first block of the + distributed tensor.""" + return self._first_process + + def __str__(self): + return ( + f"{self.__class__.__name__}(process_grid={self._process_grid}, block_sizes={self._block_sizes})" + + self._binding_str() + ) + + def __eq__(self, other): + if not isinstance(other, BlockCyclic): + return False + return ( + self._process_grid == other._process_grid + and self._block_sizes == other._block_sizes + and self._first_process == other._first_process + ) + + def _is_1d_distribution(self) -> bool: + """True if process grid partitions on a single dimension.""" + return self._process_grid._is_1d_distribution() + + def _is_row_wise(self) -> bool: + """True if 2D process grid partitioned only on rows.""" + return self._process_grid._is_row_wise() + + def _is_col_wise(self) -> bool: + """True if 2D process grid partitioned only on columns.""" + return self._process_grid._is_col_wise() + + def shape(self, rank: int, global_shape: Sequence[int] | None = None) -> tuple[int, ...]: + self._local_shape_checks(rank, self._process_grid._nranks, global_shape) + if global_shape is None: + global_shape = self._data_global_shape + return self._calc_local_shape(rank, self._block_sizes, global_shape) + + def _calc_local_shape(self, rank, block_sizes, global_shape): + nprow, npcol = self._process_grid._shape + layout = self._process_grid._layout + if layout is not None: + myprow = rank % nprow if layout == ProcessGrid.Layout.COL_MAJOR else rank // npcol + mypcol = rank // nprow if layout == ProcessGrid.Layout.COL_MAJOR else rank % npcol + index = (myprow, mypcol) + else: + raise NotImplementedError + nrows = cublasMp.numroc(global_shape[0], block_sizes[0], index[0], self._first_process[0], nprow) + ncols = cublasMp.numroc(global_shape[1], block_sizes[1], index[1], 
self._first_process[1], npcol) + return (nrows, ncols) + + def _bind(self, global_shape, *, shape=None) -> Distribution: + if self._bound: + raise BindDistributionError(f"{self} is already bound") + + if self.ndim != 2: + raise NotImplementedError + + if len(global_shape) != self.ndim: + raise ValueError( + f"Dimensionality of shapes ({len(global_shape)}) doesn't match dimensionality " + f"of this distribution ({self.ndim})" + ) + + rank = _get_communicator().Get_rank() + nrows, ncols = self._calc_local_shape(rank, self._block_sizes, global_shape) + if shape is not None and tuple(shape) != (nrows, ncols): + raise BindDistributionError( + f"The local shape {shape} on process {rank} is not the expected one based " + f"on the global shape {global_shape}, process grid {self._process_grid} and " + f"block sizes {self._block_sizes}: expected shape is {(nrows, ncols)}" + ) + + self._data_global_shape = global_shape + self._data_shape = (nrows, ncols) + self._bound = True + return self + + def to(self, cls, /, *, ndim=None, copy=False): + super()._to_checks(cls, ndim) + nranks = _get_communicator().Get_size() + if issubclass(cls, BlockCyclic): + return self.copy() if copy else self + elif cls is Slab: + if not self._bound: + # Without binding, it's a stretch to assume that this is compatible with + # Slab (the "cyclic" nature means that it's much more likely that it isn't). + raise ConvertDistributionError( + "Unbound BlockCyclic distribution can't be converted to Slab. " + "Consider using BlockNonCyclic if there is no cyclic distribution of blocks." + ) + + if not self._is_1d_distribution(): + raise ConvertDistributionError( + "Can't convert this block distribution to Slab: partitioning must be on a single dimension" + ) + + # Data must be divisible on partition_dim and the block size must correspond + # to the slab size. + partition_dim = self._process_grid.shape.index(nranks) + + if self._data_global_shape[partition_dim] % nranks != 0: + raise ConvertDistributionError( + "Can't convert this distribution to Slab: data doesn't divide evenly on partition dimension." + f" Global shape is {self._data_global_shape}, partition dimension is " + f"{partition_dim} and number of processes is {nranks}." + ) + + if self._block_sizes[partition_dim] != self._data_global_shape[partition_dim] // nranks: + raise ConvertDistributionError( + "This distribution can't be converted to Slab, because the block size in the " + f"partition dimension {partition_dim} is not a factor of the global extent in " + f"that dimension {self._data_global_shape[partition_dim]}" + ) + d = Slab(partition_dim, ndim=self.ndim) + return d._bind(self._data_global_shape, self._data_shape) + elif cls is Box: + raise NotImplementedError + + +class BlockNonCyclic(BlockCyclic): + """Block distribution without cycles""" + + def __init__( + self, + process_grid: ProcessGrid, + *, + first_process: Sequence[int] | None = None, + ): + super().__init__(process_grid, (-1,) * len(process_grid.shape), first_process=first_process) + + def __eq__(self, other): + if isinstance(other, BlockNonCyclic) and (not other._bound or not self._bound): + # Don't compare block sizes since one of them is not bound. 
+ return self._process_grid == other._process_grid and self._first_process == other._first_process + return super().__eq__(other) + + def shape(self, rank: int, global_shape: Sequence[int] | None = None) -> tuple[int, ...]: + self._local_shape_checks(rank, self._process_grid._nranks, global_shape) + if global_shape is None: + global_shape = self._data_global_shape + if self._bound: + block_sizes = self._block_sizes + else: + block_sizes = self._infer_block_sizes(global_shape) + return self._calc_local_shape(rank, block_sizes, global_shape) + + def _infer_block_sizes(self, global_shape): + if all(x % self._process_grid._shape[i] == 0 for i, x in enumerate(global_shape)): + block_sizes = tuple(x // self._process_grid._shape[i] for i, x in enumerate(global_shape)) + else: + # The logic to bind this global shape to this distribution isn't implemented yet + # (which doesn't necessarily mean that it isn't possible to fit the data to this + # distribution). + raise NotImplementedError( + "BlockNonCyclic is currently only implemented for uniform partition sizes. " + "Use BlockCyclic with explicit block sizes instead." + ) + return block_sizes + + def _bind(self, global_shape, *, shape=None) -> Distribution: + if self._bound: + raise BindDistributionError(f"{self} is already bound") + + assert all(x == -1 for x in self._block_sizes) + + if len(global_shape) != self.ndim: + raise ValueError( + f"Dimensionality of shapes ({len(global_shape)}) doesn't match dimensionality " + f"of this distribution ({self.ndim})" + ) + + # Now we assign block sizes to this distribution based on the shape of the data. + # NOTE: For dimensions that aren't partitioned, there are multiple blocks sizes that + # are valid and still fit the Block[Non]Cyclic model. This is because, in addition + # to block size being the full length L of the dimension, any L//N is also a valid + # block size (so a single block in that dimension is equivalent to N contiguous + # blocks in that dimension). The importance of this is that cuBLASMp actually + # requires configurations with these block sizes, since block sizes have to match + # across matrices A, B and C but they might be distributed differently. + # In other words, for distributed matmul with BlockNonCyclic, block sizes have to be + # inferred jointly with matrices A, B and C, but that is outside the scope of this + # method. + + block_sizes = self._infer_block_sizes(global_shape) + if shape is not None and block_sizes != tuple(shape): + raise BindDistributionError("Data doesn't fit BlockNonCyclic distribution") + + self._block_sizes = block_sizes + return super()._bind(global_shape, shape=shape) + + def to(self, cls, /, *, ndim=None, copy=False): + super()._to_checks(cls, ndim) + nranks = _get_communicator().Get_size() + if issubclass(cls, BlockCyclic): + return self.copy() if copy else self + elif cls is Slab: + if not self._is_1d_distribution(): + raise ConvertDistributionError( + "Can't convert this block distribution to Slab: partitioning must be on a single dimension" + ) + + partition_dim = self._process_grid.shape.index(nranks) + d = Slab(partition_dim, ndim=self.ndim) + # For bound=False, we can allow the conversion and let Slab._bind() catch + # any potential errors later. 
+ if self._bound: + if self._data_global_shape[partition_dim] % nranks != 0: + raise ConvertDistributionError( + "Can't convert this distribution to Slab: data doesn't divide evenly on partition dimension" + ) + d._bind(self._data_global_shape, shape=self._data_shape) + return d + elif cls is Box: + raise NotImplementedError + + +def _get_communicator(): + distributed_ctx = dist.get_context() + if distributed_ctx is None: + raise RuntimeError( + "nvmath.distributed has not been initialized. Refer to " + "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" + " for more information." + ) + return distributed_ctx.communicator diff --git a/nvmath/distributed/fft/__init__.py b/nvmath/distributed/fft/__init__.py index 0a44b71..4a2f7a5 100644 --- a/nvmath/distributed/fft/__init__.py +++ b/nvmath/distributed/fft/__init__.py @@ -4,3 +4,4 @@ from ._configuration import * # noqa: F403 from .fft import * # noqa: F403 +from ..distribution import Slab as Slab diff --git a/nvmath/distributed/fft/_configuration.py b/nvmath/distributed/fft/_configuration.py index a329573..bdfc0be 100644 --- a/nvmath/distributed/fft/_configuration.py +++ b/nvmath/distributed/fft/_configuration.py @@ -2,13 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 -__all__ = ["FFTDirection", "FFTOptions", "Slab"] +__all__ = ["FFTDirection", "FFTOptions"] from dataclasses import dataclass from enum import IntEnum from logging import Logger from typing import Literal -from nvmath.bindings import cufftMp # type: ignore @dataclass @@ -43,7 +42,7 @@ class FFTOptions: the CPU, to ensure that the user doesn't inadvertently use the result before it becomes available. The default is ``"auto"``. - See Also: + .. seealso:: :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. """ @@ -72,29 +71,9 @@ def __post_init__(self): class FFTDirection(IntEnum): """An IntEnum class specifying the direction of the transform. - See Also: + .. seealso:: :meth:`FFT.execute`, :func:`fft` """ FORWARD = -1 INVERSE = 1 - - -class Slab(IntEnum): - """An IntEnum class to specify a cuFFTMp Slab distribution. - - Given an array of size X * Y * Z distributed over n GPUs, there are two possible slab - distributions depending on whether the data is partitioned on the X or Y axis: - - * X axis partitioning: the first X % n GPUs each own (X/n+1) * Y * Z elements and - the remaining GPUs each own (X/n) * Y * Z elements. - - * Y axis partitioning: the first Y % n GPUs each own X * (Y/n+1) * Z elements and - the remaining GPUs each own X * (Y/n) * Z elements. - - See Also: - :class:`FFT`, :func:`fft` - """ - - X = cufftMp.XtSubFormat.FORMAT_INPLACE - Y = cufftMp.XtSubFormat.FORMAT_INPLACE_SHUFFLED diff --git a/nvmath/distributed/fft/fft.py b/nvmath/distributed/fft/fft.py index 44bddb3..9f734ea 100644 --- a/nvmath/distributed/fft/fft.py +++ b/nvmath/distributed/fft/fft.py @@ -13,9 +13,10 @@ import math import numpy as np -from ._configuration import FFTOptions, Slab, FFTDirection +from ._configuration import FFTOptions, FFTDirection import nvmath.distributed +from nvmath.distributed.distribution import Distribution, Slab, Box from nvmath.bindings import cufftMp as cufft # type: ignore from nvmath.bindings import nvshmem # type: ignore from nvmath import memory @@ -40,10 +41,6 @@ class TensorLayout: strides: Sequence[int] -# A box contains lower and upper coordinates, so it must be of length 2 in practice. 
-Box = Sequence[Sequence[int]] - - @dataclass class _ProblemSpec: """This is used in a custom MPI reduction to check that the FFT problem @@ -108,9 +105,10 @@ def __init__(self, options: FFTOptions): # "distribution": """\ Specifies the distribution of input and output operands across processes, which can be: (i) according to -a Slab distribution (see :class:`Slab`), or (ii) a custom box distribution. With Slab distribution, -this indicates the distribution of the input operand (the output operand will use the complementary -Slab distribution). With box distribution, this indicates the input and output boxes.""".replace("\n", " "), +a Slab distribution (see :class:`nvmath.distributed.distribution.Slab`), or (ii) a custom box distribution +(see :class:`nvmath.distributed.distribution.Box`). With Slab distribution, this indicates the distribution +of the input operand (the output operand will use the complementary Slab distribution). +With box distribution, this indicates the input and output boxes.""".replace("\n", " "), # "direction": """\ Specify whether forward or inverse FFT is performed (:class:`FFTDirection` object, or as a string from ['forward', @@ -126,7 +124,7 @@ def __init__(self, options: FFTOptions): # "function_signature": """\ operand, -distribution: Slab | Sequence[Box], +distribution: Distribution | Sequence[Box], sync_symmetric_memory: bool = True, options: FFTOptions | None = None, stream: AnyStream | None = None @@ -187,7 +185,7 @@ def _get_fft_concrete_type(dtype, fft_abstract_type): if dtype == "complex64": return FFTType["C2R"] elif dtype == "complex128": - return FFTType["Z2R"] + return FFTType["Z2D"] else: raise ValueError(f"Incompatible dtype '{dtype}' for complex-to-real transform.") else: @@ -347,7 +345,7 @@ def _calculate_capacity( global_shape: Sequence[int], fft_type: Literal["C2C", "C2R", "R2C"], nranks: int, -): +) -> int: """Calculate the max number of elements that the input buffer on every rank must be able to hold in order to perform the specified distributed FFT. Since the memory allocation is on the symmetric heap, we need to use the same (max) capacity on every rank. Also @@ -408,7 +406,7 @@ def _allocate_for_fft( capacity must be provided on every rank, and must be large enough for the specified transform.""" if fft_type == "R2C" and isinstance(distribution, Slab): - partition_dim = 0 if distribution == Slab.X else 1 + partition_dim = distribution.partition_dim # For input, the strides depend on the padding. global_output_shape = list(global_input_shape) @@ -441,7 +439,7 @@ def allocate_operand( package: ModuleType, *, input_dtype=None, - distribution: Slab | Sequence[Box], + distribution: Distribution | Sequence[Box], memory_space: Literal["cpu", "cuda"] | None = None, fft_type: Literal["C2C", "C2R", "R2C"] | None = None, logger: logging.Logger | None = None, @@ -497,7 +495,13 @@ def allocate_operand( distributed_ctx = nvmath.distributed.get_context() if distributed_ctx is None: - raise RuntimeError("nvmath.distributed has not been initialized") + raise RuntimeError( + "nvmath.distributed has not been initialized. Refer to " + "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" + " for more information." 
+ ) + if not distributed_ctx.nvshmem_available: + raise RuntimeError("nvmath.distributed wasn't initialized with NVSHMEM backend") comm = distributed_ctx.communicator rank = comm.Get_rank() nranks = comm.Get_size() @@ -518,6 +522,13 @@ def allocate_operand( package_name = cast(Literal["numpy", "cupy", "torch"], package_name) + if isinstance(distribution, Distribution): + distribution = distribution.to(Slab, ndim=len(shape), copy=True) + distribution = cast(Slab, distribution) + else: + # Must be a Box pair (this is checked in the ProblemSpec reducer). + distribution = tuple(cast(Box, box.copy()) for box in distribution) + options = FFTOptions(fft_type=fft_type) problem_spec = _ProblemSpec( distribution=distribution, @@ -546,12 +557,11 @@ def allocate_operand( ): raise ValueError(f"input dtype {input_dtype_name} is not compatible with FFT type {fft_type}") - distribution_name = f"Slab.{distribution.name}" if isinstance(distribution, Slab) else str(distribution) logger = logger if logger is not None else logging.getLogger() logger.info( f"Allocating {package.__name__} operand with shape {shape} and dtype " f"{input_dtype_name} for FFT type {fft_type} on {memory_space}, with " - f"distribution {distribution_name}." + f"distribution {distribution}." ) # Infer global shape. @@ -767,8 +777,19 @@ def _problem_spec_reducer(p1: _ProblemSpec, p2: _ProblemSpec): if p1.distribution != p2.distribution: raise ValueError("The slab distribution is inconsistent across processes") + slab = cast(Slab, p1.distribution) + + if slab.ndim != len(p1.shape): + raise ValueError( + f"The dimensionality of {p1.distribution} doesn't match the dimensionality " + "of the FFT operand ({len(p1.shape)})" + ) + # Using cuFFTMp slab distribution. - partitioned_dim = 0 if p1.distribution == Slab.X else 1 + partitioned_dim = slab.partition_dim + + if partitioned_dim not in (0, 1): + raise ValueError("The Slab partition dimension must be X or Y") if any(p1.shape[i] != p2.shape[i] for i in range(len(p1.shape)) if i != partitioned_dim): return ValueError("The problem size is inconsistent across processes") @@ -779,25 +800,17 @@ def _problem_spec_reducer(p1: _ProblemSpec, p2: _ProblemSpec): else: # Custom distribution given by input and output boxes on each process. for distribution in (p1.distribution, p2.distribution): - if not isinstance(distribution, Sequence): - return ValueError("distribution must be a Slab or boxes") + if not isinstance(distribution, Sequence) or not all(isinstance(d, Box) for d in distribution): + return ValueError("distribution must be a Slab or a Box pair") if len(p1.distribution) != 2 or len(p2.distribution) != 2: # type: ignore - return ValueError("Must provide input and output boxes on all processes") - input_box1, output_box1 = p1.distribution # type: ignore - input_box2, output_box2 = p2.distribution # type: ignore + return ValueError("Must provide a Box pair on every process") + input_box1, output_box1 = cast(Sequence[Box], p1.distribution) + input_box2, output_box2 = cast(Sequence[Box], p2.distribution) for box in (input_box1, output_box1, input_box2, output_box2): - if len(box) != 2: - return ValueError(f"Box {box} must have lower and upper coordinates") - lower, upper = box - if len(lower) != len(p1.shape) or len(upper) != len(p1.shape): + if box.ndim != len(p1.shape): return ValueError( - f"The number of coordinates in each coordinate pair of box {box} must " - f"match the number of operand dimensions {len(p1.shape)}." 
- ) - if not all(upper[i] > lower[i] for i in range(len(p1.shape))): - return ValueError( - f"The upper coordinates must be larger than the lower coordinates, but got lower={lower} upper={upper}" + f"The dimensionality of {box} doesn't match the dimensionality of the FFT operand ({len(p1.shape)})" ) for p_spec in (p1, p2): @@ -825,11 +838,9 @@ def _problem_spec_reducer(p1: _ProblemSpec, p2: _ProblemSpec): def reduce_boxes(box1, box2): """This function returns the smallest box that encompasses `box1` and `box2`""" - lower1, upper1 = box1 - lower2, upper2 = box2 - lower = np.minimum(np.array(lower1), np.array(lower2)).tolist() - upper = np.maximum(np.array(upper1), np.array(upper2)).tolist() - return lower, upper + lower = np.minimum(np.array(box1.lower), np.array(box2.lower)).tolist() + upper = np.maximum(np.array(box1.upper), np.array(box2.upper)).tolist() + return Box(lower, upper) # Merge the boxes to get the global operand shape. Note that this is applied # progressively throughout the MPI reduction, starting with the local boxes. @@ -896,7 +907,7 @@ class FFT: stream: {stream} - See Also: + .. seealso:: :meth:`plan`, :meth:`reset_operand`, :meth:`execute` Examples: @@ -918,6 +929,7 @@ class FFT: Create a 3-D complex128 ndarray on GPU symmetric memory, distributed according to the Slab distribution on the X axis (the global shape is (128, 128, 128)): + >>> from nvmath.distributed.distribution import Slab >>> shape = 128 // nranks, 128, 128 cuFFTMp uses the NVSHMEM PGAS model for distributed computation, which requires GPU @@ -935,7 +947,7 @@ class FFT: ndarray for local operations) and specifies how the operand is distributed across processes: - >>> f = nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X) + >>> f = nvmath.distributed.fft.FFT(a, distribution=Slab.X) More information on distribution of operands can be found in the documentation: https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/fft/index.html @@ -1015,7 +1027,7 @@ def __init__( self, operand, *, - distribution: Slab | Sequence[Box], + distribution: Distribution | Sequence[Box], options: FFTOptions | None = None, stream: AnyStream | None = None, ): @@ -1026,6 +1038,8 @@ def __init__( "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" " for more information." ) + if not distributed_ctx.nvshmem_available: + raise RuntimeError("nvmath.distributed wasn't initialized with NVSHMEM backend") self.communicator = communicator = distributed_ctx.communicator self.rank = rank = communicator.Get_rank() self.nranks = nranks = communicator.Get_size() @@ -1034,6 +1048,13 @@ def __init__( self.options = options = cast(FFTOptions, utils.check_or_create_options(FFTOptions, options, "Distributed FFT options")) self.package = operand.name + if isinstance(distribution, Distribution): + distribution = distribution.to(Slab, ndim=len(operand.shape), copy=True) + distribution = cast(Slab, distribution) + else: + # Must be a Box pair (this is checked in the ProblemSpec reducer). + distribution = tuple(cast(Box, box.copy()) for box in distribution) + is_C = sorted(operand.strides, reverse=True) == list(operand.strides) # Merge the problem specification across processes to make sure that there are no @@ -1106,14 +1127,11 @@ def __init__( if isinstance(distribution, Slab): self.global_extents = tuple(problem_spec.shape) # Check that this process has the correct slab shape. 
- partitioned_dim = 0 if distribution == Slab.X else 1 - shape, _ = _calculate_slab_shape_strides(self.global_extents, partitioned_dim, rank, nranks) error = None - if self.operand.shape != shape: - error = ValueError( - f"[{rank}] The operand shape is {self.operand.shape}, but the expected slab " - f"shape is {shape} ({distribution})" - ) + try: + distribution._bind(self.global_extents, shape=self.operand.shape) + except Exception as e: + error = e error = communicator.allreduce(error, _reduce_exception) if error: raise error @@ -1123,6 +1141,10 @@ def __init__( lower, upper = problem_spec.distribution[0] # type: ignore self.global_extents = tuple(int(upper[i] - lower[i]) for i in range(self.operand_dim)) + # This can't throw error since the local operand shape was already checked + # against the box shape in the ProblemSpec reducer. + distribution[0]._bind(self.global_extents, shape=self.operand.shape) + # The global number of elements must be compatible with the global shape. if problem_spec.global_size != math.prod(self.global_extents): raise ValueError( @@ -1139,7 +1161,7 @@ def __init__( self.logger.info(f"The global FFT extents are {self.global_extents}.") # Calculate the required buffer capacity (in number of elements) for this transform. - self.capacity = _calculate_capacity(problem_spec, self.global_extents, self.fft_abstract_type, nranks) + self.capacity: int = _calculate_capacity(problem_spec, self.global_extents, self.fft_abstract_type, nranks) # Copy the operand to execution_space's device if needed. self.operand, self.operand_backup = _copy_operand_perhaps( @@ -1207,10 +1229,7 @@ def __init__( ) else: input_box, output_box = distribution - input_box = (tuple(input_box[0]), tuple(input_box[1])) - output_box = (tuple(output_box[0]), tuple(output_box[1])) self.distribution_layout[input_box] = self.operand_layout - self.distribution = distribution = (input_box, output_box) self.logger.info(f"The operand distribution is based on custom input box {input_box} and output box {output_box}.") @@ -1238,7 +1257,7 @@ def __init__( ) if self.options.reshape: - partition_dim = 0 if distribution == Slab.X else 1 + partition_dim = distribution.partition_dim # type: ignore if self.fft_abstract_type == "C2R": self.result_shape_padded, _ = _calculate_slab_shape_strides( global_result_extents_padded, partition_dim, rank, nranks @@ -1254,12 +1273,13 @@ def __init__( self.global_result_extents, 1 - partition_dim, rank, nranks, global_result_extents_padded ) elif not isinstance(self.distribution, Slab): - output_lower, output_upper = distribution[1] # type: ignore + output_lower, output_upper = output_box self.result_shape = tuple(output_upper[i] - output_lower[i] for i in range(self.operand_dim)) self.result_strides = calculate_strides(self.result_shape, reversed(range(self.operand_dim))) self.distribution_layout[output_box] = TensorLayout(shape=self.result_shape, strides=self.result_strides) + output_box._bind(self.global_result_extents, shape=self.result_shape) else: - result_partition_dim = 1 if distribution == Slab.X else 0 + result_partition_dim = 1 - distribution.partition_dim # type: ignore if self.fft_abstract_type == "C2R": self.result_shape_padded, _ = _calculate_slab_shape_strides( global_result_extents_padded, result_partition_dim, rank, nranks @@ -1353,41 +1373,29 @@ def _allocate_reshape_operand(self, exec_stream_holder: StreamHolder | None, log f"{self.result_strides} and data type '{self.result_data_type}'." 
) - result_shape = self.result_shape - if self.fft_abstract_type == "C2R": - # For C2R we need to preserve the last axis strides of the real output - # when we reshape. - result_shape = self.result_shape_padded - - result = utils.create_empty_tensor( - self.result_class, # type: ignore - result_shape, - self.result_data_type, - self.device_id, - exec_stream_holder, - verify_strides=False, # the strides are computed so that they are contiguous - strides=self.result_strides, - symmetric_memory=True, - make_symmetric=True, - logger=self.logger, + capacity_out_dtype = ( + self.capacity * 2 + if self.fft_abstract_type == "C2R" + else self.capacity // 2 + if self.fft_abstract_type == "R2C" + else self.capacity + ) + # For C2R we preserve the last axis strides of the real output + # when we reshape. + result = _allocate_for_fft( + self.global_result_extents, + self.result_shape, + self.distribution, + self.result_operand.name_to_dtype[self.result_data_type], + "cuda", + self.result_operand.module, + self.fft_abstract_type[::-1], # type: ignore + capacity_out_dtype, + self.rank, + self.nranks, ) if log_debug: self.logger.debug("The reshape output (empty) tensor has been created.") - - if self.fft_abstract_type == "C2R": - if result.name == "cuda": - view = ndbuffer.wrap_external( - result.tensor, - result.data_ptr, - self.result_data_type, - self.result_shape, - self.result_strides, - self.device_id, - result.itemsize, - ) - return CudaDistributedTensor(view) - else: - return tensor_wrapper.wrap_operand(result.tensor[..., : self.result_shape[-1]]) return result def _get_result_operand(self, collective_error_checking): @@ -1481,7 +1489,7 @@ def plan(self, *, stream: AnyStream | None = None): elapsed, ): if isinstance(self.distribution, Slab): - self.subformat = self.distribution + self.subformat = self.distribution._cufftmp_value else: if self.fft_abstract_type == "C2R": # C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT, @@ -1509,8 +1517,8 @@ def plan(self, *, stream: AnyStream | None = None): strides_output, ) self.box_to_subformat = {} - self.box_to_subformat[(tuple(lower_input), tuple(upper_input))] = cufft.XtSubFormat.FORMAT_DISTRIBUTED_INPUT - self.box_to_subformat[(tuple(lower_output), tuple(upper_output))] = cufft.XtSubFormat.FORMAT_DISTRIBUTED_OUTPUT + self.box_to_subformat[Box(lower_input, upper_input)] = cufft.XtSubFormat.FORMAT_DISTRIBUTED_INPUT + self.box_to_subformat[Box(lower_output, upper_output)] = cufft.XtSubFormat.FORMAT_DISTRIBUTED_OUTPUT self.subformat = ( cufft.XtSubFormat.FORMAT_DISTRIBUTED_INPUT if self.fft_abstract_type != "C2R" @@ -1562,7 +1570,9 @@ def plan(self, *, stream: AnyStream | None = None): self.logger.info(f"The FFT planning phase took {elapsed.data:.3f} ms to complete.") @utils.precondition(_check_valid_fft) - def reset_operand(self, operand=None, *, distribution: Slab | Sequence[Box] | None = None, stream: AnyStream | None = None): + def reset_operand( + self, operand=None, *, distribution: Distribution | Sequence[Box] | None = None, stream: AnyStream | None = None + ): """ Reset the operand held by this :class:`FFT` instance. 
This method has two use cases: @@ -1610,6 +1620,7 @@ def reset_operand(self, operand=None, *, distribution: Slab | Sequence[Box] | No Create a 3-D complex128 ndarray on GPU symmetric memory, distributed according to the Slab distribution on the X axis (the global shape is (128, 128, 128)): + >>> from nvmath.distributed.distribution import Slab >>> shape = 128 // nranks, 128, 128 >>> dtype = cp.complex128 >>> a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=dtype) @@ -1617,7 +1628,7 @@ def reset_operand(self, operand=None, *, distribution: Slab | Sequence[Box] | No Create an FFT object as a context manager - >>> with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.X) as f: + >>> with nvmath.distributed.fft.FFT(a, distribution=Slab.X) as f: ... # Plan the FFT ... f.plan() ... @@ -1706,6 +1717,12 @@ def device_str(device_id: int | Literal["cpu"]) -> str: if distribution is None: raise ValueError("Please specify the distribution of the operand for reset_operand") + if isinstance(distribution, Distribution): + distribution = distribution.to(Slab, ndim=self.operand_dim, copy=True) + else: + # Must be a Box pair. + distribution = tuple(cast(Box, box.copy()) for box in distribution) + distribution_type_old = "slab" if isinstance(self.distribution, Slab) else "box" distribution_type_new = "slab" if isinstance(distribution, Slab) else "box" if distribution_type_old != distribution_type_new: @@ -1714,13 +1731,6 @@ def device_str(device_id: int | Literal["cpu"]) -> str: f"{distribution_type_new} distribution in reset_operand." ) - if distribution_type_old == "box": - distribution = cast(Sequence[Box], distribution) # for type checker - input_box, output_box = distribution - input_box = (tuple(input_box[0]), tuple(input_box[1])) - output_box = (tuple(output_box[0]), tuple(output_box[1])) - distribution = (input_box, output_box) - if self.fft_abstract_type in ("R2C", "C2R") and self.distribution != distribution: raise ValueError(f"Can't change distribution with FFT type {self.fft_abstract_type}") @@ -1729,14 +1739,9 @@ def device_str(device_id: int | Literal["cpu"]) -> str: raise ValueError("Can't change distribution when using reshape=True") distribution = cast(Slab, distribution) # for type checker - operand_layout = self.distribution_layout[distribution] - if operand.shape != operand_layout.shape: - raise ValueError( - f"The operand shape is {operand.shape}, but the expected slab shape " - f"is {operand_layout.shape} ({distribution})" - ) - - self.subformat = self.distribution = distribution + distribution._bind(self.global_extents, shape=operand.shape) + self.distribution = distribution + self.subformat = distribution._cufftmp_value # Log distribution. 
if log_info: @@ -1751,17 +1756,9 @@ def device_str(device_id: int | Literal["cpu"]) -> str: else: distribution = cast(Sequence[Box], distribution) # for type checker input_box, output_box = distribution - input_box = (tuple(input_box[0]), tuple(input_box[1])) - output_box = (tuple(output_box[0]), tuple(output_box[1])) if input_box not in self.box_to_subformat or output_box not in self.box_to_subformat: raise ValueError("The reset operand distribution must use the original boxes (in any order)") - - operand_layout = self.distribution_layout[input_box] - if operand.shape != operand_layout.shape: - return ValueError( - f"The operand shape {operand.shape} does not match the input box shape {operand_layout.shape}" - ) - + distribution[0]._bind(self.global_extents, shape=operand.shape) self.subformat = self.box_to_subformat[input_box] self.distribution = distribution @@ -1802,6 +1799,7 @@ def device_str(device_id: int | Literal["cpu"]) -> str: if distribution_type_old == "box": result_layout = self.distribution_layout[output_box] + output_box._bind(self.global_result_extents, shape=result_layout.shape) elif not self.options.reshape: result_layout = self.distribution_layout[Slab.X if distribution == Slab.Y else Slab.Y] else: @@ -2127,7 +2125,7 @@ def _fft( x, /, *, - distribution: Slab | Sequence[Box], + distribution: Distribution | Sequence[Box], direction: FFTDirection | None = None, sync_symmetric_memory: bool = True, options: FFTOptions | None = None, @@ -2157,7 +2155,7 @@ def _fft( shape will depend on the choice of distribution and reshape option. The operand remains on the same device and uses the same package as the input operand. - See Also: + .. seealso:: :func:`ifft`, :func:`irfft`, :func:`rfft`, :class:`FFT` Examples: @@ -2176,6 +2174,7 @@ def _fft( Create a 3-D complex128 ndarray on GPU symmetric memory, distributed according to the Slab distribution on the Y axis (the global shape is (256, 256, 256)): + >>> from nvmath.distributed.distribution import Slab >>> shape = 256, 256 // nranks, 256 >>> dtype = cp.complex128 >>> a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=dtype) @@ -2186,7 +2185,7 @@ def _fft( Perform a 3-D C2C FFT using :func:`fft`. The result `r` is also a CuPy complex128 ndarray: - >>> r = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.Y) + >>> r = nvmath.distributed.fft.fft(a, distribution=Slab.Y) See :class:`FFTOptions` for the complete list of available options. @@ -2211,7 +2210,7 @@ def _fft( Provide the NumPy ndarray to :func:`fft`, with the result also being a NumPy ndarray: - >>> r = nvmath.distributed.fft.fft(b, nvmath.distributed.fft.Slab.Y) + >>> r = nvmath.distributed.fft.fft(b, distribution=Slab.Y) Notes: - This function only takes complex operand for C2C transformation. If the user @@ -2253,7 +2252,7 @@ def rfft( operand, /, *, - distribution: Slab | Sequence[Box], + distribution: Distribution | Sequence[Box], sync_symmetric_memory: bool = True, options: FFTOptions | None = None, stream: AnyStream | None = None, @@ -2281,7 +2280,7 @@ def rfft( the input operand. The global extent of the last transformed axis in the result will be ``global_extent[-1] // 2 + 1``. - See Also: + .. seealso:: :func:`fft`, :func:`irfft`, :class:`FFT`. """ wrapped_operand = tensor_wrapper.wrap_operand(operand) @@ -2324,7 +2323,7 @@ def rfft( shape will depend on the choice of distribution and reshape option. The operand remains on the same device and uses the same package as the input operand. - See Also: + .. 
seealso:: :func:`fft`, :func:`irfft`, :class:`FFT`. Notes: @@ -2345,7 +2344,7 @@ def irfft( operand, /, *, - distribution: Slab | Sequence[Box], + distribution: Distribution | Sequence[Box], sync_symmetric_memory: bool = True, options: FFTOptions | None = None, stream: AnyStream | None = None, @@ -2376,7 +2375,7 @@ def irfft( ``even``, or ``global_extent[-1] * 2 - 1`` if :attr:`FFTOptions.last_axis_parity` is ``odd``. - See Also: + .. seealso:: :func:`fft`, :func:`ifft`, :class:`FFT`. Example: diff --git a/nvmath/distributed/linalg/__init__.py b/nvmath/distributed/linalg/__init__.py new file mode 100644 index 0000000..fcf0397 --- /dev/null +++ b/nvmath/distributed/linalg/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from . import advanced +from nvmath.bindings.cublas import ComputeType # type: ignore + +__all__ = [ + "advanced", + "ComputeType", +] diff --git a/nvmath/distributed/linalg/_internal/__init__.py b/nvmath/distributed/linalg/_internal/__init__.py new file mode 100644 index 0000000..831c565 --- /dev/null +++ b/nvmath/distributed/linalg/_internal/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/distributed/linalg/_internal/matmul_desc_ifc.py b/nvmath/distributed/linalg/_internal/matmul_desc_ifc.py new file mode 100644 index 0000000..e1e7bd8 --- /dev/null +++ b/nvmath/distributed/linalg/_internal/matmul_desc_ifc.py @@ -0,0 +1,74 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Interface class to encapsulate low-level calls to get and set matmul descriptor attributes. 
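+
+Illustrative sketch of the intended access pattern (``epilogue`` is a stand-in
+for whichever member name ``cublasMp.MatmulDescriptorAttribute`` actually exposes):
+
+    desc_ifc = MatmulDescInterface(matmul_desc)
+    desc_ifc.epilogue = value      # dispatches to matmul_descriptor_attribute_set
+    current = desc_ifc.epilogue    # dispatches to matmul_descriptor_attribute_get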
+""" + +__all__ = ["MatmulDescInterface"] + +import ctypes +import logging + +import numpy as np + +from nvmath.bindings import cublasMp # type: ignore + +logger = logging.getLogger() + +DescEnum = cublasMp.MatmulDescriptorAttribute + + +def scalar_attributes(): + return [e.name for e in DescEnum] + + +DESC_ENUM_SCALAR_ATTR = scalar_attributes() + + +def _get_attribute_ctype(name): + return np.ctypeslib.as_ctypes_type(cublasMp.get_matmul_descriptor_attribute_dtype(DescEnum[name])) + + +DESC_ENUM_SCALAR_ATTR_INFO = {name: (DescEnum[name].value, _get_attribute_ctype(name)) for name in DESC_ENUM_SCALAR_ATTR} # type: ignore[valid-type] + + +class MatmulDescInterface: + def __init__(self, matmul_desc): + self.matmul_desc = matmul_desc + + def __getattr__(self, name): + _name = name.upper() + logging.debug("Getting Matmul Description attribute %s.", _name) + info = DESC_ENUM_SCALAR_ATTR_INFO.get(_name) + if info is None: + raise AttributeError(f"No attribute named {name} in matmul descriptor") + enum_value, ctype = info + name = _name + attribute_buffer = ctype() + size_written = ctypes.c_uint64() + cublasMp.matmul_descriptor_attribute_get( + self.matmul_desc, + enum_value, + ctypes.addressof(attribute_buffer), + ctypes.sizeof(attribute_buffer), + ctypes.addressof(size_written), + ) + return attribute_buffer.value + + def __setattr__(self, name, value): + if name in ("matmul_desc"): + # For attributes of this Python class, redirect to the original __setattr__ + return super().__setattr__(name, value) + _name = name.upper() + logging.debug("Setting Matmul Description attribute %s to %s.", _name, value) + info = DESC_ENUM_SCALAR_ATTR_INFO.get(_name) + if info is None: + raise AttributeError(f"No attribute named {name} in matmul descriptor") + enum_value, ctype = info + name = _name + ctypes_value = ctype(value) + cublasMp.matmul_descriptor_attribute_set( + self.matmul_desc, enum_value, ctypes.addressof(ctypes_value), ctypes.sizeof(ctypes_value) + ) diff --git a/nvmath/distributed/linalg/advanced/__init__.py b/nvmath/distributed/linalg/advanced/__init__.py new file mode 100644 index 0000000..d31ede4 --- /dev/null +++ b/nvmath/distributed/linalg/advanced/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from ._configuration import * # noqa: F403 +from .matmulmod import * # noqa: F403 diff --git a/nvmath/distributed/linalg/advanced/_configuration.py b/nvmath/distributed/linalg/advanced/_configuration.py new file mode 100644 index 0000000..981e734 --- /dev/null +++ b/nvmath/distributed/linalg/advanced/_configuration.py @@ -0,0 +1,134 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +__all__ = [ + "MatmulEpilog", + "MatmulAlgoType", + "MatmulOptions", + "MatmulEpilogPreferences", + "MatmulPlanPreferences", + "matrix_qualifiers_dtype", +] + +import dataclasses +from logging import Logger +from typing import Literal + +import numpy as _np + +from nvmath.bindings import cublas # type: ignore +from nvmath.bindings import cublasMp # type: ignore +from nvmath.internal.utils import check_or_create_options +from nvmath._utils import CudaDataType + +MatmulEpilog = cublasMp.MatmulEpilogue +MatmulAlgoType = cublasMp.MatmulAlgoType + + +@dataclasses.dataclass +class MatmulOptions: + """A data class for providing options to the :class:`Matmul` object and the wrapper + function :func:`matmul`. 
+ + Attributes: + compute_type (nvmath.distributed.linalg.ComputeType): CUDA compute type. A suitable + compute type will be selected if not specified. + + scale_type (nvmath.CudaDataType): CUDA data type. A suitable data type consistent + with the compute type will be selected if not specified. + + result_type (nvmath.CudaDataType): CUDA data type. A requested datatype of the + result. If not specified, this type will be determined based on the input types. + + algo_type (nvmath.distributed.linalg.advanced.MatmulAlgoType): Hints the algorithm + type to be used. If not supported, cuBLASMp will fallback to the default + algorithm. + + sm_count_communication (int) : The number of SMs to use for communication. This is + only relevant for some algorithms (please consult cuBLASMp documentation). + + logger (logging.Logger): Python Logger object. The root logger will be used if a + logger object is not provided. + + blocking: A flag specifying the behavior of the execution functions and methods, + such as :func:`matmul` and :meth:`Matmul.execute`. When ``blocking`` is `True`, + the execution methods do not return until the operation is complete. When + ``blocking`` is ``"auto"``, the methods return immediately when the inputs are + on the GPU. The execution methods always block when the operands are on the CPU + to ensure that the user doesn't inadvertently use the result before it becomes + available. The default is ``"auto"``. + + .. seealso:: + :class:`Matmul`, :func:`matmul` + """ + + compute_type: int | None = None + scale_type: int | None = None + result_type: int | None = None + algo_type: int | None = None + sm_count_communication: int | None = None + logger: Logger | None = None + blocking: Literal[True, "auto"] = "auto" + + def __post_init__(self): + if self.compute_type is not None: + self.compute_type = cublas.ComputeType(self.compute_type) + + if self.scale_type is not None: + self.scale_type = CudaDataType(self.scale_type) + + if self.algo_type is not None: + self.algo_type = MatmulAlgoType(self.algo_type) + + if self.sm_count_communication is not None and not ( + isinstance(self.sm_count_communication, int) and self.sm_count_communication > 0 + ): + raise ValueError("sm_count_communication must be a positive integer") + + if self.blocking not in (True, "auto"): + raise ValueError("The value specified for blocking must be either True or 'auto'.") + + +matrix_qualifiers_dtype = _np.dtype([("structure", object), ("is_transpose", "= 600): + return ValueError("cublasMp >= 0.6.0 required") + + num_operands = len(p1.operand_dtypes) + if num_operands != len(p2.operand_dtypes): + return ValueError("The number of operands doesn't match across processes") + + if num_operands not in (2, 3): + return ValueError("The number of operands must be 2 or 3") + + def check_dtype(dtype, operand_name: str): + if dtype not in SUPPORTED_TYPES: + raise ValueError(f"The dtype of operand {operand_name} ({dtype}) is not supported.") + + operand_name = "ABC" + + for i in range(num_operands): + if p1.operand_dtypes[i] != p2.operand_dtypes[i]: + return ValueError( + f"Operand {operand_name[i]} dtype does not match across processes: " + f"{p1.operand_dtypes[i]} != {p2.operand_dtypes[i]}" + ) + check_dtype(p1.operand_dtypes[i], operand_name[i]) + + def _check_extents(shape: list[int], name: str): + if len(shape) > 2: + raise ValueError("Batched matmul is not supported") + if name == "C" and len(shape) != 2: + raise ValueError( + "In order to avoid broadcasting behavior ambiguity, `c` must be 2-D. 
" + "Use a singleton dimension to convert your input array to 2-D." + ) + # TODO: allow broadcasting A and B if 1D. + if len(shape) != 2: + raise ValueError("Operands must be two-dimensional") + if any(e <= 0 for e in shape): + message = ( + f"The specified extents {shape} for operand {name} are not valid. The extents must be strictly positive. " + ) + raise ValueError(message) + + for p in (p1, p2): + if p.is_leaf: + for i in range(num_operands): + _check_extents(p.shapes[i], operand_name[i]) + + if len(set(p.packages)) != 1: + return ValueError( + f"The operands on process {p.rank} don't belong to the same package: got operand packages {p.packages}" + ) + + if len(set(p.memory_spaces)) != 1: + return ValueError( + f"The operands on process {p.rank} are not in the same memory space: got " + f"operand memory spaces {p.memory_spaces}" + ) + + if len(set(p.device_ids)) != 1: + return ValueError( + f"The operands on process {p.rank} are not on the same device: got operand device IDs {p.device_ids}" + ) + + input_type_width = typemaps.NAME_TO_DATA_WIDTH[p.operand_dtypes[0]] + if input_type_width <= 8: + return TypeError("Narrow-precision data types (FP8 and lower) are not currently supported.") + + p.qualifiers = p.qualifiers if p.qualifiers is not None else np.zeros((3,), dtype=matrix_qualifiers_dtype) + if p.qualifiers.dtype != matrix_qualifiers_dtype: + return ValueError( + "The qualifiers must be specified as a NumPy array of length 3 " + "corresponding to the operands A, B, and C of type " + "'matrix_qualifiers_dtype'." + ) + + for i in range(num_operands): + if len(p1.shapes[i]) != len(p2.shapes[i]): + return ValueError(f"The number of dimensions of the operand {operand_name[i]} is inconsistent across processes") + + if p1.packages[0] != p2.packages[0]: + return ValueError("operands don't belong to the same package on all processes") + + if p1.memory_spaces[0] != p2.memory_spaces[0]: + return ValueError('operands are not in the same memory space ("cpu", "cuda") on all processes') + + if p1.options != p2.options: + return ValueError(f"options are inconsistent across processes: {p1.options} != {p2.options}") + + if p1.alpha != p2.alpha: + return ValueError(f"alpha does not match across processes: {p1.alpha} != {p2.alpha}") + + if p1.beta != p2.beta: + return ValueError(f"beta does not match across processes: {p1.beta} != {p2.beta}") + + if not np.array_equal(p1.qualifiers, p2.qualifiers): + return ValueError("The qualifiers don't match across processes") + + if len(p1.distributions) != 3 or len(p2.distributions) != 3: + return ValueError("Must provide distributions for A, B and C/D") + + # Check that distribution of operands is the same on every process. + for i, (d1, d2) in enumerate(zip(p1.distributions, p2.distributions, strict=False)): + if d1 != d2: + return ValueError(f"Distribution for {operand_name[i]} doesn't match across processes: {d1} != {d2}") + + for p in (p1, p2): + if p.is_leaf: + p.distributions = [d.to(BlockCyclic, ndim=2, copy=True) for d in p.distributions] + for i, d in enumerate(p.distributions): + assert isinstance(d, BlockCyclic) # only for type checker + if i == num_operands: + break + # To calculate the global shape when using 2D block distribution, we + # ignore the rows of processes that aren't in column 0 of the process + # grid, and the columns of processes that aren't in row 0 of the process + # grid (by setting the rows/columns to 0). We could do the same for 1D, + # but by preserving the shape info for 1D we can do some extra checks + # below. 
+ if not d.process_grid._is_1d_distribution(): + nprow, npcol = d.process_grid.shape + myprow = p.rank % nprow if d.process_grid.layout == ProcessGrid.Layout.COL_MAJOR else p.rank // npcol + mypcol = p.rank // nprow if d.process_grid.layout == ProcessGrid.Layout.COL_MAJOR else p.rank % npcol + if myprow != 0: + p.shapes[i][1] = 0 + if mypcol != 0: + p.shapes[i][0] = 0 + + # Determine the memory layout shared by all processes. + for i in range(num_operands): + p1.is_F[i] &= p2.is_F[i] + if not p1.is_F[i]: + return ValueError(f"Operand {operand_name[i]} doesn't have column-major (Fortran) memory layout") + + # Calculate global shape based on process grid. + for i in range(num_operands): + p_grid = cast(BlockCyclic, p1.distributions[i]).process_grid + partitioned_dims = (0,) if p_grid._is_row_wise() else (1,) if p_grid._is_col_wise() else (0, 1) + + if len(partitioned_dims) == 1 and any( + p1.shapes[i][j] != p2.shapes[i][j] for j in (0, 1) if j != partitioned_dims[0] + ): + return ValueError( + "The problem size is inconsistent across processes:" + str(p1.shapes) + " vs " + str(p2.shapes) + ) + + if p1 is not p2: # with nranks==1 p1 is p2 + # Reduce the partitioned dimensions to get the global size. + for dim in partitioned_dims: + p1.shapes[i][dim] += p2.shapes[i][dim] + + except Exception as e: + return e + p1.is_leaf = False + return p1 + + +SHARED_MM_DOCUMENTATION = utils.COMMON_SHARED_DOC_MAP.copy() +SHARED_MM_DOCUMENTATION.update( + { + "a": """\ +A distributed tensor representing the first operand to the matrix multiplication (see `Semantics`_). +The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and +:class:`torch.Tensor`.""".replace("\n", " "), + # + "b": """\ +A distributed tensor representing the second operand to the matrix multiplication (see `Semantics`_). +The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and +:class:`torch.Tensor`.""".replace("\n", " "), + # + "c": """\ +(Optional) A distributed tensor representing the operand to add to the matrix multiplication +result (see `Semantics`_). The currently supported types are :class:`numpy.ndarray`, +:class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "distributions": """\ +Sequence specifying the distribution across processes of matrices A, B and C/D. The distribution needs to +be BlockCyclic or compatible.""".replace("\n", " "), + # + "alpha": """\ +The scale factor for the matrix multiplication term as a real or complex number. The default is +:math:`1.0`.""".replace("\n", " "), + # + "beta": """\ +The scale factor for the matrix addition term as a real or complex number. A value for `beta` must be provided if +operand `c` is specified.""".replace("\n", " "), + # + "epilog": """\ +Specify an epilog :math:`F` as an object of type :class:`MatmulEpilog` to apply to the result of the matrix +multiplication: :math:`F(\\alpha A @ B + \\beta C`). The default is no epilog. See `cuBLASMp documentation +`_ for the list of +available epilogs.""".replace("\n", " "), + # + "epilog_inputs": """\ +Specify the additional inputs needed for the selected epilog as a dictionary, where the key is the epilog input name and +the value is the epilog input. The epilog input must be a tensor with the same package and in the same memory space as +the operands (see the constructor for more information on the operands). If the required epilog inputs are not provided, +an exception is raised that lists the required epilog inputs. 
Some epilog inputs are generated by other epilogs. For +example, the epilog input for :class:`MatmulEpilog.DRELU` is generated by matrix multiplication with the same operands +using :class:`MatmulEpilog.RELU_AUX`. """.replace("\n", " "), + # + "qualifiers": """\ +Specify the matrix qualifiers as a :class:`numpy.ndarray` of +:class:`~nvmath.distributed.linalg.advanced.matrix_qualifiers_dtype` objects of length 3 +corresponding to the operands `a`, `b`, and `c`. See +:ref:`matrix-tensor-qualifiers` for the motivation behind qualifiers.""".replace("\n", " "), + # + "options": """\ +Specify options for the matrix multiplication as a +:class:`~nvmath.distributed.linalg.advanced.MatmulOptions` object. Alternatively, a `dict` containing +the parameters for the ``MatmulOptions`` constructor can also be provided. If not specified, the +value will be set to the default-constructed ``MatmulOptions`` object.""".replace("\n", " "), + # + "preferences": """\ +This parameter specifies the preferences for planning as a :class:`MatmulPlanPreferences` object. Alternatively, a +dictionary containing the parameters for the :class:`MatmulPlanPreferences` constructor can also be provided. If not +specified, the value will be set to the default-constructed :class:`MatmulPlanPreferences` object. +""".replace("\n", " "), + # + "result": """\ +The result of the specified matrix multiplication (epilog applied), which remains on the same device and belongs to the +same package as the input operands.""".replace("\n", " "), + # + "semantics": """\ + .. _semantics: + + The semantics of the matrix multiplication follows :func:`numpy.matmul` semantics, with some restrictions on + broadcasting. +""".strip(), + } +) + + +class InvalidMatmulState(Exception): + pass + + +@utils.docstring_decorator(SHARED_MM_DOCUMENTATION, skip_missing=False) +class Matmul: + """ + Create a stateful object encapsulating the specified distributed matrix multiplication + computation :math:`\\alpha a @ b + \\beta c` and the required resources to perform the + operation. A stateful object can be used to amortize the cost of preparation (planning + in the case of matrix multiplication) across multiple executions (also see the + :ref:`Stateful APIs ` section). + + The function-form API :func:`matmul` is a convenient alternative to using stateful + objects for *single* use (the user needs to perform just one matrix multiplication, for + example), in which case there is no possibility of amortizing preparatory costs. The + function-form APIs are just convenience wrappers around the stateful object APIs. + + Using the stateful object typically involves the following steps: + + 1. **Problem Specification**: Initialize the object with a defined operation and + options. + 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation + for this specific matrix multiplication operation. + 3. **Execution**: Perform the matrix multiplication computation with :meth:`execute`. + 4. **Resource Management**: Ensure all resources are released either by explicitly + calling :meth:`free` or by managing the stateful object within a context manager. + + Detailed information on what's happening in the various phases described above can be + obtained by passing in a :class:`logging.Logger` object to :class:`MatmulOptions` or by + setting the appropriate options in the root logger object, which is used by default: + + >>> import logging + >>> logging.basicConfig( + ... level=logging.INFO, + ... 
format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) + + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. + + Args: + a: {a} + + b: {b} + + c: {c} + + distributions: {distributions} + + alpha: {alpha} + + beta: {beta} + + qualifiers: {qualifiers} + + options: {options} + + stream: {stream} + + Semantics: + {semantics} + + .. seealso:: + :meth:`plan`, :meth:`reset_operands`, :meth:`execute` + + Examples: + + >>> import numpy as np + >>> import nvmath.distributed + >>> from nvmath.distributed.distribution import Slab + >>> from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype + + Get MPI communicator used to initialize nvmath.distributed (for information on + initializing ``nvmath.distributed``, you can refer to the documentation or to the + Matmul examples in `nvmath/examples/distributed/linalg/advanced + `_): + + >>> comm = nvmath.distributed.get_context().communicator + + Get my process rank: + + >>> rank = comm.Get_rank() + + Create two 2-D float64 ndarrays on the CPU (using Slab distributions to distribute + the matrices across processes): + + >>> M, N, K = 1024, 1024, 1024 + >>> a_shape = Slab.X.shape(rank, (K, M)) + >>> b_shape = Slab.X.shape(rank, (K, N)) + >>> a = np.asfortranarray(np.random.rand(*a_shape)) + >>> b = np.asfortranarray(np.random.rand(*b_shape)) + + We will define a matrix multiplication operation followed by an AllReduce epilog + using the specialized matrix multiplication interface. + + Create a Matmul object encapsulating the problem specification above: + + >>> qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + >>> qualifiers[0]["is_transpose"] = True # a is transposed + >>> distributions = [Slab.X, Slab.X, Slab.Y] + >>> mm = nvmath.distributed.linalg.advanced.Matmul( + ... a, b, distributions=distributions, qualifiers=qualifiers + ... ) + + Options can be provided above to control the behavior of the operation using the + `options` argument (see :class:`MatmulOptions`). + + Next, plan the operation. The epilog is specified, and optionally, preferences can + be specified for planning: + + >>> epilog = nvmath.distributed.linalg.advanced.MatmulEpilog.ALLREDUCE + >>> mm.plan(epilog=epilog) + + Now execute the matrix multiplication, and obtain the result `r1` as a NumPy + ndarray. + + >>> r1 = mm.execute() + + Finally, free the object's resources. To avoid having to explicitly making this + call, it's recommended to use the Matmul object as a context manager as shown below, + if possible. + + >>> mm.free() + + Note that all :class:`Matmul` methods execute on the current stream by default. + Alternatively, the `stream` argument can be used to run a method on a specified + stream. + + Let's now look at the same problem with CuPy ndarrays on the GPU. + + >>> device_id = nvmath.distributed.get_context().device_id + >>> import cupy as cp + >>> with cp.cuda.Device(device_id): + ... a = cp.asfortranarray(cp.random.rand(*a_shape)) + ... b = cp.asfortranarray(cp.random.rand(*b_shape)) + + Create a Matmul object encapsulating the problem specification described earlier + and use it as a context manager. + + >>> with nvmath.distributed.linalg.advanced.Matmul( + ... a, b, distributions=distributions, qualifiers=qualifiers + ... ) as mm: + ... mm.plan(epilog=epilog) + ... + ... # Execute the operation to get the first result. + ... r1 = mm.execute() + ... + ... 
# Update operands A and B in-place (see reset_operands() for an + ... # alternative). + ... with cp.cuda.Device(device_id): + ... a[:] = cp.random.rand(*a_shape) + ... b[:] = cp.random.rand(*b_shape) + ... + ... # Execute the operation to get the new result. + ... r2 = mm.execute() + + + All the resources used by the object are released at the end of the block. + + Further examples can be found in the + `nvmath/examples/distributed/linalg/advanced/matmul + `_ + directory. + """ + + def __init__( + self, + a, + b, + /, + c=None, + *, + distributions: Sequence[Distribution], + alpha=None, + beta=None, + qualifiers=None, + options=None, + stream: utils.AnyStream | int | None = None, + ): + distributed_ctx = nvmath.distributed.get_context() + if distributed_ctx is None: + raise RuntimeError( + "nvmath.distributed has not been initialized. Refer to " + "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" + " for more information." + ) + if not distributed_ctx.nvshmem_available: + raise RuntimeError("nvmath.distributed wasn't initialized with NVSHMEM backend") + if distributed_ctx.nccl_comm is None: + raise RuntimeError("nvmath.distributed wasn't initialized with NCCL backend") + self.communicator = communicator = distributed_ctx.communicator + self.rank = rank = communicator.Get_rank() + self.nranks = nranks = communicator.Get_size() + + self.options = options = cast( + MatmulOptions, utils.check_or_create_options(MatmulOptions, options, "Distributed matrix multiplication options") + ) + self.logger = options.logger if options.logger is not None else logging.getLogger() + + # The matrix multiplication has two required operands 'a' and 'b', and one optional + # operand 'c'. + a = tensor_wrapper.wrap_operand(a) + b = tensor_wrapper.wrap_operand(b) + if c is not None: + c = tensor_wrapper.wrap_operand(c) + + operands = [a, b, c] if c is not None else [a, b] + + problem_spec = _ProblemSpec( + distributions=list(distributions), + shapes=[list(o.shape) for o in operands], # local shapes + operand_dtypes=[o.dtype for o in operands], + options=_ProblemSpec.Options(options), + packages=[o.name for o in operands], + memory_spaces=[o.device for o in operands], + device_ids=[o.device_id for o in operands], + compute_capability=tuple(ccx.Device(distributed_ctx.device_id).compute_capability), + alpha=alpha, + beta=beta, + qualifiers=qualifiers, + is_F=[sorted(o.strides) == list(o.strides) and is_contiguous_and_dense(o.shape, o.strides) for o in operands], + lib_version=cublasMp.get_version(), + nranks=nranks, + rank=rank, + ) + + if nranks > 1: + problem_spec = communicator.allreduce(problem_spec, op=_problem_spec_reducer) + else: + # Ensure we error-check with one rank. + problem_spec = _problem_spec_reducer(problem_spec, problem_spec) + if isinstance(problem_spec, Exception): + # There is an error or inconsistency in the problem spec across processes. + # Note that since this comes from an allreduce, all processes will have + # received the same exception. 
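+            # The reducer returns (rather than raises) exceptions so that the allreduce
+            # itself always completes; re-raise the shared error here on every rank.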
+ raise problem_spec + + self.distributions = distributions = cast(Sequence[BlockCyclic], problem_spec.distributions) + + self.logger.info("= SPECIFICATION PHASE =") + self.logger.info("For performance and debugging hints, use CUBLASMP_LOG_LEVEL=5 and CUBLASLT_LOG_LEVEL=5") + self.logger.info(f"The data type of operand A is '{a.dtype}', and that of operand B is '{b.dtype}'.") + + self.num_operands = len(operands) + if c is not None: + self.logger.info(f"The data type of operand C is {c.dtype}.") + if beta is None: + raise ValueError("A value for beta must be provided if operand C is provided.") + + if (a.dtype, b.dtype) not in NAMES_TO_DEFAULT_SCALE_TYPE: + raise ValueError(f"Unsupported combination of dtypes for operands A {a.dtype} and B {b.dtype}.") + + operand_name = "ABC" + for i in range(self.num_operands): + global_shape = tuple(problem_spec.shapes[i]) + self.logger.info(f"The global shape of operand {operand_name[i]} is {global_shape}.") + + self.logger.info(f"The distribution of operand A is {self.distributions[0]}") + self.logger.info(f"The distribution of operand B is {self.distributions[1]}") + self.logger.info(f"The distribution of operand C/D is {self.distributions[2]}") + + # Currently, a.dtype != b.dtype is only supported for FP8 (different FP8 kinds are + # allowed), so we assume that A and B have equal width. + self.input_type_width = typemaps.NAME_TO_DATA_WIDTH[a.dtype] + + assert self.num_operands == 2 or self.num_operands == 3, "Internal Error." + + # Infer the library package & device ID the operands belong to. + self.operands: None | list[DistributedTensor] = operands + + self.package = utils.get_operands_package(operands) + self.memory_space = "cuda" + self.device_id = utils.get_operands_device_id(operands) + if self.device_id == "cpu": + if self.package == "numpy": + self.package = "cuda" + self.memory_space = "cpu" + self.device_id = distributed_ctx.device_id + elif self.device_id != distributed_ctx.device_id: + raise RuntimeError( + "The operands are not on the same device as the one assigned to the distributed " + f"runtime on this process: operands' device ID is {self.device_id} and the runtime " + f"device ID is {distributed_ctx.device_id}" + ) + self.logger.info( + f"The input operands' memory space is {self.memory_space}, and the execution space is on device {self.device_id}." + ) + + self.nccl_comm = distributed_ctx.nccl_comm + + # Allocate device memory (in stream context) if needed. + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + self.logger.info(f"The specified stream for the Matmul ctor is {stream_holder.obj}.") + + # Copy operands to device if needed. + if self.memory_space == "cpu": + # Some of the comm overlap algorithms in cuBLASMp will perform better + # when some of the operands are already on symmetric memory (e.g. AG+GEMM + # when B is on symmetric memory). + self.operands = [o.to(self.device_id, stream_holder, symmetric_memory=True) for o in self.operands] + + self._set_result_sheap_flag() + + # Set qualifiers. + self.qualifiers = problem_spec.qualifiers + if self.qualifiers[2]["is_transpose"]: + raise ValueError("The transpose flag is currently not supported for operand C.") + if self.qualifiers[2]["is_conjugate"]: + raise ValueError("The conjugate flag is currently not supported for operand C.") + # Set qualifiers based on torch lazy conjugation flag if not provided. 
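+        # Note that torch's conj() is lazy: is_conj() reports the conjugation flag
+        # without materializing the conjugated values.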
+ if self.package == "torch" and qualifiers is None: + self.qualifiers[0]["is_conjugate"] = self.operands[0].tensor.is_conj() + self.qualifiers[1]["is_conjugate"] = self.operands[1].tensor.is_conj() + if len(self.operands) > 2 and self.operands[2].tensor.is_conj(): + raise ValueError("The conjugate flag is currently not supported for operand C.") + self.lazy_conjugation = True + else: + self.lazy_conjugation = False + for i in range(2): + if self.qualifiers[i]["is_conjugate"] and not self.qualifiers[i]["is_transpose"]: + raise ValueError("Conjugate is not supported without transpose") + + # Set blocking or non-blocking behavior. + self.blocking = self.options.blocking is True or self.memory_space == "cpu" + if self.blocking: + self.call_prologue = "This call is blocking and will return only after the operation is complete." + else: + self.call_prologue = ( + "This call is non-blocking and will return immediately after the operation is launched on the device." + ) + + # The result class is that of the first wrapped device operand. + self.result_class = self.operands[0].__class__ + + # Set memory allocator. + # Workspace in symmetric heap gives better performance for GEMM+comm overlap + # algorithms that rely on NVSHMEM. + # NOTE: AG+GEMM and GEMM+RS currently *require* workspace on symmetric heap, else + # the library fails with internal error. + self.allocator = NvshmemMemoryManager(self.device_id, self.logger) + + # Create cuBLASMp handle. + with utils.device_ctx(self.device_id): + self.handle: int = cublasMp.create(stream_holder.ptr) + + # Determine the data types for a and b. + self.a_dtype = typemaps.NAME_TO_DATA_TYPE[a.dtype] + self.b_dtype = typemaps.NAME_TO_DATA_TYPE[b.dtype] + self.a_dtype_name = a.dtype + self.b_dtype_name = b.dtype + + self.is_complex = "complex" in self.a_dtype_name or "complex" in self.b_dtype_name + + for i, dtype_name in enumerate((a.dtype, b.dtype)): + if self.qualifiers[i]["is_conjugate"] and "complex" not in dtype_name: + raise ValueError("The conjugate flag only applies to complex operands") + + # Determine the data types for c and d. + self.d_dtype = options.result_type + if self.num_operands == 3: + self.c_dtype = typemaps.NAME_TO_DATA_TYPE[c.dtype] + if self.d_dtype is None: + self.d_dtype = self.c_dtype + elif self.num_operands == 2: + if self.d_dtype is None: + self.d_dtype = self.a_dtype + if self.d_dtype in (CudaDataType.CUDA_R_8F_E5M2, CudaDataType.CUDA_R_8F_E4M3): + self.c_dtype = CudaDataType.CUDA_R_16F + else: + self.c_dtype = self.d_dtype + self.c_dtype_name = typemaps.DATA_TYPE_TO_NAME[self.c_dtype] + self.d_dtype_name = typemaps.DATA_TYPE_TO_NAME[self.d_dtype] + self.c_dtype_width = typemaps.NAME_TO_DATA_WIDTH[self.c_dtype_name] + self.d_dtype_width = typemaps.NAME_TO_DATA_WIDTH[self.d_dtype_name] + + self.logger.info(f"The data type for the result D is '{self.d_dtype_name}'.") + + def assert_valid_compute_type(compute_type): + if compute_type not in COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["real"]: + message = f"Unsupported compute type. The compute type '{repr(compute_type)}' is currently not supported." + raise ValueError(message) + + # Determine the scale type. 
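+        # With no explicit scale type, derive it from the compute type (real or complex
+        # variant) or, if no compute type was given either, from the A/B dtypes.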
+ if options.scale_type is None: + if options.compute_type is not None: + assert_valid_compute_type(options.compute_type) + if self.is_complex: + scale_type_map = COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["complex"] + else: + scale_type_map = COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["real"] + self.scale_type = scale_type_map[options.compute_type] + else: + self.scale_type = NAMES_TO_DEFAULT_SCALE_TYPE[(self.a_dtype_name, self.b_dtype_name)] + self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] + else: + self.scale_type = options.scale_type + if self.scale_type not in SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE: + message = f"Unsupported scale type. The data type '{repr(self.scale_type)}' is currently not supported." + raise ValueError(message) + self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] + self.logger.info(f"The scale type is '{self.scale_type_name}'.") + + # Determine the compute type. + if options.compute_type is None: + if options.scale_type is not None: + self.compute_type = SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE[options.scale_type] + else: + self.compute_type = NAMES_TO_DEFAULT_COMPUTE_TYPE[(self.a_dtype_name, self.b_dtype_name)] + else: + self.compute_type = options.compute_type + assert_valid_compute_type(self.compute_type) + self.logger.info(f"The compute type is {self.compute_type.name}.") + + def is_supported(atype, btype, compute_type, scale_type): + ct = cublas.ComputeType + st = CudaDataType + abtype = atype if atype == btype else (atype, btype) + if compute_type in (ct.COMPUTE_16F, ct.COMPUTE_16F_PEDANTIC): + return scale_type == st.CUDA_R_16F and abtype == "float16" + elif compute_type == ct.COMPUTE_32F_PEDANTIC: + if scale_type == st.CUDA_R_32F: + return abtype in ("float32", "bfloat16", "float16", "float8_e4m3fn", "float8_e5m2") + elif scale_type == st.CUDA_C_32F: + return abtype == "complex64" + elif compute_type == ct.COMPUTE_32F: + if scale_type == st.CUDA_R_32F: + return abtype in ( + "float32", + "bfloat16", + "float16", + "float8_e4m3fn", + "float8_e5m2", + ("float8_e4m3fn", "float8_e5m2"), + ("float8_e5m2", "float8_e4m3fn"), + ) + elif scale_type == st.CUDA_C_32F: + return abtype == "complex64" + elif compute_type in (ct.COMPUTE_32F_FAST_16F, ct.COMPUTE_32F_FAST_16BF, ct.COMPUTE_32F_FAST_TF32): + if scale_type == st.CUDA_R_32F: + return abtype == "float32" + if scale_type == st.CUDA_C_32F: + return abtype == "complex64" + elif compute_type in (ct.COMPUTE_64F, ct.COMPUTE_64F_PEDANTIC): + if scale_type == st.CUDA_R_64F: + return abtype == "float64" + if scale_type == st.CUDA_C_64F: + return abtype == "complex128" + return False + + if not is_supported(self.a_dtype_name, self.b_dtype_name, self.compute_type, self.scale_type): + raise ValueError( + f"Selected scale_type={repr(self.scale_type)} compute_type={repr(self.compute_type)} " + + f"are not supported for data types {self.a_dtype_name} (A) and {self.b_dtype_name} (B)." + ) + + # Set alpha and beta. 
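+        # alpha and beta are stored as one-element NumPy arrays of the scale type so
+        # that their host pointers can be passed directly to the cuBLASMp calls.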
+ self.alpha = np.zeros((1,), dtype=self.scale_type_name) + try: + self.alpha[0] = alpha if alpha is not None else 1 + except (ValueError, TypeError) as e: + raise ValueError(f"The value provided for alpha {alpha} is not convertible to dtype '{self.alpha.dtype}'.") from e + + self.beta = np.zeros((1,), dtype=self.scale_type_name) + if beta is not None and self.num_operands == 2: + self.logger.warning(f"Matmul: The provided beta value {beta} is ignored since operand C is not specified.") + try: + self.beta[0] = beta if beta is not None and self.num_operands == 3 else 0 + except (ValueError, TypeError) as e: + raise ValueError(f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'.") from e + + # Check operands alignment if needed + if self.input_type_width <= 8: + for operand, operand_name in zip(self.operands, "ABC", strict=False): + if operand.data_ptr % 16 != 0: + raise ValueError( + f"For narrow-precision (FP8 and lower) multiplication, operand {operand_name} should be aligned to 16 " + "bytes." + ) + + # Capture operand extents and strides for consistency check when resetting operands. + self.operand_extents = tuple(o.shape for o in self.operands) + self.operand_strides = tuple(o.strides for o in self.operands) + + # Create operand layouts. + a_layout = MatrixLayout( + shape=self.operands[0].shape, + strides=self.operands[0].strides, + is_transpose=bool(self.qualifiers[0]["is_transpose"]), + is_conjugate=bool(self.qualifiers[0]["is_conjugate"]), + ) + b_layout = MatrixLayout( + shape=self.operands[1].shape, + strides=self.operands[1].strides, + is_transpose=bool(self.qualifiers[1]["is_transpose"]), + is_conjugate=bool(self.qualifiers[1]["is_conjugate"]), + ) + c_layout = ( + MatrixLayout(shape=self.operands[2].shape, strides=self.operands[2].strides) if self.num_operands == 3 else None + ) + + input_layout = ("T" if a_layout.is_transpose else "N") + ("T" if b_layout.is_transpose else "N") + if self.input_type_width <= 8 and input_layout != "TN": + raise ValueError(f"FP8 matrix multiplications support only TN input layout. Got {input_layout}") + + # Get the operation traits. + A_shape = problem_spec.shapes[0] # this is global + B_shape = problem_spec.shapes[1] # this is global + M0, K0 = (A_shape[0], A_shape[1]) if not a_layout.is_transpose else (A_shape[1], A_shape[0]) + K1, N0 = (B_shape[0], B_shape[1]) if not b_layout.is_transpose else (B_shape[1], B_shape[0]) + if K0 != K1: + raise ValueError( + f"The 'K' extent must match for the operands: K={K0} in operand A is not equal to K={K1} in operand B." + ) + + self.mm_traits = MMTraits( + M=M0, + N=N0, + K=K0, + a_layout=a_layout, + b_layout=b_layout, + c_layout=c_layout, + ) + self.result_layout: None | MatrixLayout = None # Wait till planning to determine this based on the epilog. + self.logger.info( + f"The matrix multiplication attributes are M={self.mm_traits.M}, N={self.mm_traits.N}, " + f"K={self.mm_traits.K}, transA={a_layout.is_transpose} and transB={b_layout.is_transpose}." + ) + + def use_alt_cache(): + if distributions[0].process_grid == distributions[1].process_grid == distributions[2].process_grid: + # cuBLASMp uses SUMMA if all the process grids are equal, but there are + # cases that are problematic with SUMMA, so avoid SUMMA for now for those + # cases by using alternate cache for one of the process grids. 
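+                # The configurations currently routed away from SUMMA: any non-cyclic or
+                # 1-D distribution, or C/D block sizes equal to an even split of M or N
+                # over the process grid.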
+ for d in distributions: + if isinstance(d, BlockNonCyclic) or d._is_1d_distribution(): + return True + nprow, npcol = distributions[0].process_grid.shape + if (distributions[2].block_sizes[0] == M0 // nprow) or (distributions[2].block_sizes[1] == N0 // npcol): + return True + return False + + # Create process grids. + alt_cache = use_alt_cache() + self.lib_process_grids = [] + with utils.device_ctx(self.device_id): + for i, d in enumerate(distributions): + grid = d.process_grid + assert grid.layout is not None + from_alt_cache = alt_cache and i == 2 + lib_grid = _grid_cache.get_library_process_grid(grid, self.device_id, self.nccl_comm, from_alt=from_alt_cache) + self.lib_process_grids.append(lib_grid) + self.logger.info("Created cuBLASMp process grids") + + # Create and set the operation descriptor. + self.mm_desc = cublasMp.matmul_descriptor_create(self.compute_type) + self.mm_desc_ifc = matmul_desc_ifc.MatmulDescInterface(self.mm_desc) + self.mm_desc_ifc.transA = ( + cublas.Operation.C + if (a_layout.is_conjugate and a_layout.is_transpose) + else cublas.Operation.T + if a_layout.is_transpose + else cublas.Operation.N + ) + self.mm_desc_ifc.transB = ( + cublas.Operation.C + if (b_layout.is_conjugate and b_layout.is_transpose) + else cublas.Operation.T + if b_layout.is_transpose + else cublas.Operation.N + ) + if self.options.sm_count_communication: + self.mm_desc_ifc.communication_sm_count = self.options.sm_count_communication + if self.options.algo_type: + self.mm_desc_ifc.algo_type = self.options.algo_type + + self.problem_spec = problem_spec + + # Planning preferences + self.preferences = None + + # Epilog attributes. + self.epilog = None + + # Epilog attributes: name-to-operand. + self.epilog_operands: dict[str, typing.Any] = {} + + # Epilog attributes: epilog input name-to-handler. + self.epilog_input_name_to_handler: dict[str, typing.Any] = {} + + # Epilog attributes: name-to-output tensor. + self.epilog_outputs: dict[str, typing.Any] = {} + + # Keep track of epilog input traits for resetting operands. + self.epilog_inputs_traits: dict[str, typing.Any] = {} + + # Keep track of epilog output handlers to allocate output in execute(). + self.epilog_output_handlers: list[typing.Any] = [] + + # Non-epilog aux outputs. Currently, only used for quantization outputs (amax etc.) + self.aux_outputs: dict[str, typing.Any] = {} + + # Plan attributes. + self.matrix_descriptors: list[int] = [] + self.mm_planned = False + + # Workspace attributes. + self.workspace_device: None | memory.MemoryPointer = None + self.workspace_size_device = 0 + self.workspace_host: None | np.ndarray = None + self.workspace_size_host = 0 + self.workspace_allocated_size = 0 + self.workspace_allocated_here = False + + # Attributes to establish stream ordering. + self.workspace_stream = None + self.last_compute_event = None + + self.valid_state = True + self.logger.info("The distributed Matmul operation has been created.") + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.free() + + def _check_valid_matmul(self, *args, **kwargs): + """ + Check if the Matmul object is alive and well. + """ + if not self.valid_state: + raise InvalidMatmulState("The Matmul object cannot be used after resources are free'd") + + def _check_valid_operands(self, *args, **kwargs): + """ + Check if the operands are available for the operation. + """ + what = kwargs["what"] + if self.operands is None: + raise RuntimeError( + f"{what} cannot be performed if the operands have been set to None. 
Use reset_operands() to set the " + f"desired input before using performing the {what.lower()}." + ) + + def _free_plan_resources(self, exception: Exception | None = None) -> bool: + """ + Free resources allocated in planning. + """ + + # Destroy matrix descriptors. + for descriptor in self.matrix_descriptors: + if descriptor is not None: + cublasMp.matrix_descriptor_destroy(descriptor) + self.matrix_descriptors = [] + + self.mm_planned = False + return True + + def _check_planned(self, *args, **kwargs): + what = kwargs["what"] + if not self.mm_planned: + raise RuntimeError(f"{what} cannot be performed before plan() has been called.") + + def _free_workspace_memory(self, exception: Exception | None = None) -> bool: + """ + Free workspace by releasing the MemoryPointer object. + """ + if self.workspace_device is None: + assert self.workspace_host is None, "Internal error." + return True + + with utils.device_ctx(self.device_id): + workspace_memory_ptr = utils.get_ptr_from_memory_pointer(self.workspace_device) + is_symmetric_memory = nvshmem.ptr(workspace_memory_ptr, nvshmem.my_pe()) != 0 + if is_symmetric_memory: + # Calling nvshmem_free on memory that's still in use is not safe + # (nvshmem_free is not stream-ordered), so we need to wait for the + # computation to finish. + if self.workspace_stream is not None: + self.workspace_stream.sync() + self.workspace_device.free() + self.workspace_device = self.workspace_host = None + self.workspace_allocated_size = 0 + self.logger.debug("[_free_workspace_memory] The workspace has been released.") + + return True + + def _reset_workspace_allocation_tracking(self): + """ + Reset workspace allocation tracking attributes to False at the end of the methods + where workspace memory is potentially allocated. This is necessary to prevent any + exceptions raised before method entry from using stale tracking values. + """ + self.workspace_allocated_here = False + + @utils.precondition(_check_valid_matmul) + def _release_workspace_memory_perhaps(self, release_workspace): + """ + Free workspace memory if it's larger than the specified limit. + """ + if not release_workspace: + return True + + # Establish ordering wrt the computation and free workspace if requested. + if self.last_compute_event is not None: + self.workspace_stream.wait(self.last_compute_event) + self.logger.debug("Established ordering with respect to the computation before releasing the workspace.") + self.last_compute_event = None + + self.logger.debug("[_release_workspace_memory_perhaps] The workspace memory will be released.") + return self._free_workspace_memory() + + def _release_workspace_memory_perhaps_wrapper(self, exception: Exception | None = None) -> bool: + """ + This is used in @atomic. 
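+        On cuBLASMp NOT_SUPPORTED or INVALID_VALUE errors, a pointer to the dtype
+        support table is appended to the exception message before the workspace is
+        released.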
+ """ + if isinstance(exception, cublasMp.cuBLASMpError) and ( + "NOT_SUPPORTED" in str(exception) or "INVALID_VALUE" in str(exception) + ): + addendum = ( + " It is also recommended to check the dtype support table at " + "https://docs.nvidia.com/cuda/cublasmp/usage/functions.html#cublasmpmatmul" + ) + # For cuBLASMpError we know that args attribute is (str,) + exception.args = (exception.args[0] + addendum,) + self._release_workspace_memory_perhaps(release_workspace=self.workspace_allocated_here) + self._reset_workspace_allocation_tracking() + return True + + @utils.precondition(_check_valid_matmul) + @utils.precondition(_check_planned, "Workspace memory allocation") + @utils.atomic(_free_workspace_memory, method=True) + def _allocate_workspace_memory(self, stream_holder: utils.StreamHolder): + """ + Allocate workspace memory using the specified allocator. + """ + + assert self.workspace_size_device is not None, "Internal Error." + assert self.workspace_allocated_here is False, "Internal Error." + + if self.workspace_size_device == 0: # For performance, bypass allocator for workspace size == 0. + self.workspace_device = memory.MemoryPointer(0, 0, finalizer=None) + else: + self.logger.debug("Allocating device workspace for performing the matrix multiplication...") + with utils.device_ctx(self.device_id), stream_holder.ctx: + try: + if isinstance(self.allocator, memory.BaseCUDAMemoryManagerAsync): + self.workspace_device = self.allocator.memalloc_async(self.workspace_size_device, stream_holder.obj) + else: + self.workspace_device = self.allocator.memalloc(self.workspace_size_device) + self.workspace_allocated_here = True + except TypeError as e: + message = ( + "The method 'memalloc' in the allocator object must conform to the interface in the " + "'BaseCUDAMemoryManager' protocol." + ) + raise TypeError(message) from e + + if self.workspace_size_host > 0: + self.logger.debug("Allocating host workspace for performing the matrix multiplication...") + self.workspace_host = np.array(self.workspace_size_host, dtype=np.int8) + + self.workspace_allocated_size = self.workspace_size_device + self.workspace_stream = stream_holder.obj + self.logger.debug( + f"Finished allocating device workspace of size {formatters.MemoryStr(self.workspace_size_device)} in the context " + f"of stream {self.workspace_stream}." + ) + self.logger.debug(f"Finished allocating host workspace of size {formatters.MemoryStr(self.workspace_size_host)}.") + + def _allocate_workspace_memory_perhaps(self, stream_holder: utils.StreamHolder): + """ + Allocate workspace memory using the specified allocator, if it hasn't already been + done. + """ + + if self.workspace_device is not None and self.workspace_allocated_size >= self.workspace_size_device: + return + + return self._allocate_workspace_memory(stream_holder) + + @utils.precondition(_check_valid_matmul) + def _infer_blocking_sizes(self, problem_spec, m, k, n, epilog_AR): + # Infer block sizes for the case of BlockNonCyclic 1D distributions with uniform + # partition sizes. Even though the block sizes were set individually for each + # distribution when BlockNonCyclic._bind() was called, the block sizes might need + # to be tweaked because they have to match across matrices A, B and C/D for m, n, k + # and so must be inferred jointly. 
+ if not all(isinstance(d, BlockNonCyclic) and d._is_1d_distribution() for d in self.distributions): + return + + assert all(d._bound for d in self.distributions), "Internal error" + + nranks = self.nranks + + transA = self.mm_traits.a_layout.is_transpose + transB = self.mm_traits.b_layout.is_transpose + + # This function only infers for uniform partition sizes. + if self.distributions[0]._is_row_wise(): + if transA and k % nranks != 0: + return + if not transA and m % nranks != 0: + return + else: + if transA and m % nranks != 0: + return + if not transA and k % nranks != 0: + return + + if self.distributions[1]._is_row_wise(): + if transB and n % nranks != 0: + return + if not transB and k % nranks != 0: + return + else: + if transB and k % nranks != 0: + return + if not transB and n % nranks != 0: + return + + A = self.operands[0] + B = self.operands[1] + mbA, nbA = A.shape # local + mbB, nbB = B.shape # local + + if epilog_AR: + mbD, nbD = m, n + else: + mbD, nbD = (m // nranks, n) if self.distributions[2]._is_row_wise() else (m, n // nranks) + + # Note that for a dimension of length L that isn't partitioned, L//N is also + # a valid block size (a single block in that dimension is equivalent to N + # contiguous blocks in that dimension). + if not transA: + # A is (m, k) + mbA = mbD = min(mbA, mbD) + if not transB: + # B is (k, n) + nbA = mbB = min(nbA, mbB) + else: + # B is (n, k) + nbA = nbB = min(nbA, nbB) + else: + # A is (k, m) + nbA = mbD = min(nbA, mbD) + if not transB: + # B is (k, n) + mbA = mbB = min(mbA, mbB) + else: + # B is (n, k) + mbA = nbB = min(mbA, nbB) + + if not transB: + # B is (k, n) + nbB = nbD = min(nbB, nbD) + else: + # B is (n, k) + mbB = nbD = min(mbB, nbD) + + self.distributions[0]._block_sizes = (mbA, nbA) + self.distributions[1]._block_sizes = (mbB, nbB) + self.distributions[2]._block_sizes = (mbD, nbD) + + @utils.precondition(_check_valid_matmul) + def _infer_algo(self, m, k, n, epilog_AR: bool) -> int: + """Return distributed matrix multiplication algorithm that is expected to run. + Currently only tries to infer the tensor parallelism comm-overlap algorithms. 
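+        Possible return values: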
+ + 0 if naive + 2 if GEMM+RS + 3 if AG+GEMM + 4 if GEMM+AR + 5 if local GEMM + -1 otherwise (unknown, probably naive or SUMMA) + """ + nranks = self.nranks + if nranks == 1: + return 5 + + if any(not d._is_1d_distribution() for d in self.distributions): + return -1 # unknown + + is_transpose = [ + self.mm_traits.a_layout.is_transpose, + self.mm_traits.b_layout.is_transpose, + False, + ] + + global_shape = [ + (m, k) if not is_transpose[0] else (k, m), + (k, n) if not is_transpose[1] else (n, k), + (m, n) if not epilog_AR else (m * nranks, n) if self.distributions[2]._is_row_wise() else (m, n * nranks), + ] + + # Check that data divides evenly and distribution is non-cyclic + for i in range(3): + partitioned_dim = 0 if self.distributions[i]._is_row_wise() else 1 + + if global_shape[i][partitioned_dim] % nranks != 0: + # Data has to divide evenly + return -1 + + block_sizes = self.distributions[i].block_sizes + if global_shape[i][partitioned_dim] // nranks != block_sizes[partitioned_dim]: + # has to be non-cyclic + return -1 + + transA = is_transpose[0] + A_distribution = "R" if self.distributions[0]._is_row_wise() else "C" + + transB = is_transpose[1] + B_distribution = "R" if self.distributions[1]._is_row_wise() else "C" + + C_distribution = "R" if self.distributions[2]._is_row_wise() else "C" + + expected_algo = 0 # naive + if epilog_AR: + expected_algo = 4 # GEMM+AR + elif ( + C_distribution == "R" and A_distribution == ("C" if transA else "R") and B_distribution == ("R" if transB else "C") + ): + expected_algo = 3 # AG+GEMM + elif ( + C_distribution == "C" and A_distribution == ("R" if transA else "C") and B_distribution == ("C" if transB else "R") + ): + expected_algo = 2 # GEMM+RS + + return expected_algo + + def _validate_epilog_aux_scale(self, aux_quantization_scale, *, required): + is_fp8_aux = ( + self.preferences.epilog.aux_type is not None + and typemaps.NAME_TO_DATA_WIDTH[typemaps.DATA_TYPE_TO_NAME[self.preferences.epilog.aux_type]] <= 8 + ) + if aux_quantization_scale is not None and not is_fp8_aux: + raise ValueError( + "Scales for epilog auxiliary output are not supported when `preferences.epilog.aux_type` is not set to a " + "narrow-precision type." + ) + elif aux_quantization_scale is None and is_fp8_aux and required: + raise ValueError( + '"aux_quantization_scale" epilog input is required when `preferences.epilog.aux_type` is not set to a ' + "narrow-precision type." + ) + + @utils.precondition(_check_valid_matmul) + @utils.atomic(_free_plan_resources, method=True) + def plan( + self, *, preferences=None, epilog=None, epilog_inputs=None, stream: utils.AnyStream | int | None = None + ): # Epilog inputs require as many inputs (with specific shapes etc) as required by the epilogue. It's a dict. + """ + Plan the matrix multiplication operation, considering the epilog (if provided). + + Args: + preferences: {preferences} + + epilog: {epilog} + + epilog_inputs: {epilog_inputs} + + stream: {stream} + + See :class:`Matmul` for an example, and further examples can be found in the + `nvmath/examples/distributed/linalg/advanced/matmul + `_ + directory. + """ + self.logger.info("= PLANNING PHASE =") + + # Clear epilog operands, since different epilogs can be provided in different calls. + # We don't need to worry about ordering, since it's the user's responsibility to + # order calls that accept a stream argument. This applies to CPU operands as well, + # even though we move them to the GPU, since the execution is blocking. 
+ self.epilog_operands = {} # Clear operands in case of repeated planning. + self.epilog_input_name_to_handler = {} # Clear input name to handler map as well, + self.epilog_inputs_traits = {} # ... and the input traits as well. + + preferences = utils.check_or_create_options( + _configuration.MatmulPlanPreferences, preferences, "Distributed matrix multiplication plan preferences" + ) + self.preferences = preferences + + if self.operands is None: + raise RuntimeError("The Matmul has no operands. Please call reset_operands") + + mm_traits = self.mm_traits + + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + self.logger.info(f"The specified stream for the matrix multiplication plan is {stream_holder.obj}.") + + if epilog is None and epilog_inputs is not None: + self.logger.warning( + f"Matmul: The provided epilog inputs {epilog_inputs.keys()} are ignored since an epilog is not specified." + ) + + self.epilog = epilog + if epilog is not None: + if epilog != MatmulEpilog.ALLREDUCE: + raise ValueError(f"{epilog.name} epilogue is not supported") + + self.mm_desc_ifc.epilogue = epilog + + m, n, k = mm_traits.M, mm_traits.N, mm_traits.K + + if self.num_operands == 3: + if epilog == MatmulEpilog.ALLREDUCE: + expected_global_shape = tuple( + x * y for x, y in zip(self.distributions[2].process_grid.shape, (m, n), strict=False) + ) + if tuple(self.problem_spec.shapes[2]) != expected_global_shape: + raise ValueError( + f"The global shape of C according to its distribution ({self.problem_spec.shapes[2]}) is " + f"not the expected one when using AllReduce epilogue ({expected_global_shape})" + ) + if self.operands[2].shape != (m, n): + raise ValueError(f"The shape of C on every process when using AllReduce epilogue must be (m, n)={(m, n)}") + elif tuple(self.problem_spec.shapes[2]) != (m, n): + raise ValueError( + f"The global shape of C according to its distribution ({self.problem_spec.shapes[2]}) is " + f"not the expected shape ({(m, n)})" + ) + + if not self.distributions[0]._bound: + for i, d in enumerate(self.distributions): + assert not d._bound, "Internal error" + if i < self.num_operands: + global_shape = tuple(self.problem_spec.shapes[i]) + shape = self.operands[i].shape + else: + if epilog == MatmulEpilog.ALLREDUCE: + global_shape = tuple( + x * y for x, y in zip(self.distributions[2].process_grid.shape, (m, n), strict=False) + ) + else: + global_shape = (m, n) + shape = None + d._bind(global_shape, shape=shape) + self._infer_blocking_sizes(self.problem_spec, m, k, n, epilog == MatmulEpilog.ALLREDUCE) + + transA = self.mm_traits.a_layout.is_transpose + transB = self.mm_traits.b_layout.is_transpose + + # Check block size on m dimension. + m_block_size_A = self.distributions[0].block_sizes[1] if transA else self.distributions[0].block_sizes[0] + m_block_size_D = self.distributions[2].block_sizes[0] + if m_block_size_A != m_block_size_D: + raise ValueError("Block size of m dimension must be the same for A and C/D") + + # Check block size on n dimension. + n_block_size_B = self.distributions[1].block_sizes[0] if transB else self.distributions[1].block_sizes[1] + n_block_size_D = self.distributions[2].block_sizes[1] + if n_block_size_B != n_block_size_D: + raise ValueError("Block size of n dimension must be the same for B and C/D") + + # Check block size on k dimension. 
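+        # k is the contraction dimension shared by A and B, so its block size must
+        # agree between the two operands.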
+ k_block_size_A = self.distributions[0].block_sizes[0] if transA else self.distributions[0].block_sizes[1] + k_block_size_B = self.distributions[1].block_sizes[1] if transB else self.distributions[1].block_sizes[0] + if k_block_size_A != k_block_size_B: + raise ValueError("Block size of k dimension must be the same for A and B") + + self._expected_algo = self._infer_algo(m, k, n, epilog == MatmulEpilog.ALLREDUCE) + + if any(d.first_process != (0, 0) for d in self.distributions) and self._expected_algo not in (0, 1, 5): + raise NotImplementedError( + "Use of first_process != (0, 0) is not supported for this distributed matmul configuration" + ) + + if epilog == MatmulEpilog.RELU and self._expected_algo == 0: + raise NotImplementedError("RELU epilogue is currently not supported for this distributed matmul configuration") + + # Fill the result traits, now that we know the epilog. + result_shape = self.distributions[2]._data_shape + self.result_layout = MatrixLayout( + shape=result_shape, + strides=calculate_strides(result_shape, (0, 1)), + ) + + # Create descriptors for matrices A, B, C and D. + matrix_dtypes = (self.a_dtype, self.b_dtype, self.c_dtype, self.d_dtype) + for i in range(4): + distribution = self.distributions[min(i, 2)] # distribution for C/D is the same + lld = self.operands[i].strides[1] if i < self.num_operands else distribution._data_shape[0] + descriptor = cublasMp.matrix_descriptor_create( + distribution._data_global_shape[0], + distribution._data_global_shape[1], + distribution.block_sizes[0], + distribution.block_sizes[1], + distribution.first_process[0], + distribution.first_process[1], + lld, + matrix_dtypes[i], + self.lib_process_grids[min(i, 2)], + ) + self.matrix_descriptors.append(descriptor) + + if self.input_type_width == 8 and (mm_traits.M % 16 != 0 or mm_traits.N % 16 != 0 or mm_traits.K % 16 != 0): + raise ValueError(f"M={mm_traits.M} N={mm_traits.N} K={mm_traits.K} must be divisible by 16 for FP8 operations") + + alpha_ptr, beta_ptr = self.alpha.ctypes.data, self.beta.ctypes.data + self.workspace_size_device, self.workspace_size_host = cublasMp.matmul_buffer_size( + self.handle, + self.mm_desc, + mm_traits.M, + mm_traits.N, + mm_traits.K, + alpha_ptr, + self.operands[0].data_ptr, + 1, + 1, + self.matrix_descriptors[0], + self.operands[1].data_ptr, + 1, + 1, + self.matrix_descriptors[1], + beta_ptr, + 0 if self.num_operands == 2 else self.operands[2].data_ptr, + 1, + 1, + self.matrix_descriptors[2], + 0, # d pointer + 1, + 1, + self.matrix_descriptors[3], + ) + + self.mm_planned = True + + def _set_result_sheap_flag(self): + self.result_on_symmetric_memory = False + on_symmetric_memory = {o.is_symmetric_memory for o in self.operands} + if len(on_symmetric_memory) == 2: + self.logger.warning( + "Some operands are on symmetric memory and others are not. Result won't be allocated on symmetric memory" + ) + elif on_symmetric_memory == {True}: + if self.memory_space == "cuda": + self.logger.info("Input operands are on symmetric memory. Result will be allocated on symmetric memory.") + self.result_on_symmetric_memory = True + + def _check_and_set_operand( + self, + operand, + operand_name, + mm_desc_ifc, + stream_holder, + *, + operand_index=None, + epilog_name=None, + package=None, + dtype=None, + extents=None, + strides=None, + ): + """ + Check to make sure that the provided operand is consistent with the one it's + updating, and update it. + """ + assert (operand_index is None) ^ (epilog_name is None), "Internal Error." 
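+        # Exactly one of operand_index (operands a/b/c) or epilog_name (epilog inputs)
+        # identifies which operand is being replaced.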
+ assert self.operands is not None, "Internal Error." + + # Make sure that the data type and extents match. + utils.check_attribute_match(dtype, operand.dtype, "data type") + utils.check_attribute_match(extents, operand.shape, "extents") + + package = utils.infer_object_package(operand.tensor) + + # Conjugate flag of the provided operands must match the original qualifiers + if ( + operand_index is not None + and package == "torch" + and self.lazy_conjugation + and self.qualifiers[operand_index]["is_conjugate"] != operand.tensor.is_conj() + ): + raise ValueError(f"The provided operand {operand_name} has different conjugate flag than the original operand") + + device_id = operand.device_id + if device_id == "cpu": + package = "cuda" if package == "numpy" else package # Handle the NumPy <=> CuPy asymmetry. + if self.package != package: + message = f"Library package mismatch: '{self.package}' => '{package}'" + raise TypeError(message) + + # Check if we have a GPU buffer to update into. + if operand_index is not None: + o = self.operands[operand_index] + else: + o = self.epilog_operands[epilog_name] + if o is None: # No buffer, create one. + # Copy operand across memory spaces (CPU to GPU). + # Some of the comm overlap algorithms in cuBLASMp will perform better when + # some of the operands are already on symmetric memory (e.g. AG+GEMM when + # B is on symmetric memory). + o = operand.to(self.device_id, stream_holder, symmetric_memory=True) + if operand_index is not None: + self.operands[operand_index] = o + else: + self.epilog_operands[epilog_name] = o + # Update the epilog pointer, since we're starting afresh. + self.epilog_input_name_to_handler[epilog_name].update(mm_desc_ifc, o) + else: + # In-place copy to existing device pointer because the new operand is on the + # CPU. + tensor_wrapper.copy_([operand], [o], stream_holder) + else: + if self.package != package: + message = f"Library package mismatch: '{self.package}' => '{package}'" + raise TypeError(message) + + utils.check_attribute_match(strides, operand.strides, "strides") + + if self.device_id != device_id: + raise ValueError( + f"The operand {operand_name} must be on the same device ({device_id}) as the original operand " + f"({self.device_id})." + ) + + # Finally, replace the original operand by the new one. + if operand_index is not None: + self.operands[operand_index] = operand + else: + self.epilog_operands[epilog_name] = operand + # Update the epilog pointer, since we're starting afresh. + self.epilog_input_name_to_handler[epilog_name].update(mm_desc_ifc, operand) + + self.logger.info(f"Operand '{operand_name}' has been reset to the new value.") + + return + + @utils.precondition(_check_valid_matmul) + def reset_operands( + self, + a=None, + b=None, + c=None, + *, + alpha=None, + beta=None, + epilog_inputs=None, + stream: utils.AnyStream | int | None = None, + ): + """ + Reset the operands held by this :class:`Matmul` instance. + + This method has two use cases: + (1) it can be used to provide new operands for execution when the original + operands are on the CPU + (2) it can be used to release the internal reference to the previous operands + and make their memory available for other use by passing ``None`` for *all* + arguments. In this case, this method must be called again to provide the + desired operands before another call to :meth:`execute`. + + This method is not needed when the operands reside on the GPU and in-place + operations are used to update the operand values. 
+ + This method will perform various checks on the new operands to make sure: + + - The distributions, shapes, strides, datatypes match those of the old ones. + - The packages that the operands belong to match those of the old ones. + - If input tensors are on GPU, the device must match. + + Args: + a: {a} + + b: {b} + + c: {c} + + alpha: {alpha} + + beta: {beta} + + epilog_inputs: {epilog_inputs} + + stream: {stream} + + Examples: + + >>> import cupy as cp + >>> import nvmath.distributed + >>> from nvmath.distributed.distribution import Slab + + Get MPI communicator used to initialize nvmath.distributed (for information on + initializing ``nvmath.distributed``, you can refer to the documentation or to + the Matmul examples in `nvmath/examples/distributed/linalg/advanced + `_): + + >>> comm = nvmath.distributed.get_context().communicator + + Get my process rank: + + >>> rank = comm.Get_rank() + + Create two 3-D float64 ndarrays on the GPU (using Slab distributions to + distribute the matrices across processes): + + >>> M, N, K = 128, 128, 256 + >>> a_shape = Slab.X.shape(rank, (M, K)) + >>> b_shape = Slab.Y.shape(rank, (K, N)) + >>> device_id = nvmath.distributed.get_context().device_id + >>> with cp.cuda.Device(device_id): + ... a = cp.asfortranarray(cp.random.rand(*a_shape)) + ... b = cp.asfortranarray(cp.random.rand(*b_shape)) + + Create an matrix multiplication object as a context manager + + >>> d = [Slab.X, Slab.Y, Slab.X] + >>> with nvmath.distributed.linalg.advanced.Matmul(a, b, distributions=d) as mm: + ... # Plan the operation. + ... mm.plan() + ... + ... # Execute the MM to get the first result. + ... r1 = mm.execute() + ... + ... # Reset the operands to new CuPy ndarrays. + ... with cp.cuda.Device(device_id): + ... c = cp.asfortranarray(cp.random.rand(*a_shape)) + ... d = cp.asfortranarray(cp.random.rand(*b_shape)) + ... mm.reset_operands(c, d) + ... + ... # Execute to get the new result corresponding to the updated operands. + ... r2 = mm.execute() + + Note that if only a subset of operands are reset, the operands that are not + reset hold their original values. + + With :meth:`reset_operands`, minimal overhead is achieved as problem + specification and planning are only performed once. + + For the particular example above, explicitly calling :meth:`reset_operands` is + equivalent to updating the operands in-place, i.e, replacing + ``mm.reset_operand(c, d)`` with ``a[:]=c`` and ``b[:]=d``. Note that updating + the operand in-place should be adopted with caution as it can only yield the + expected result under the additional constraint below: + + - The operand is on the GPU (more precisely, the operand memory space should + be accessible from the execution space). + + For more details, please refer to `inplace update example + `_. + """ + + if c is not None and self.num_operands == 2: + raise ValueError( + "The matrix multiplication problem specification does not include operand C, so it cannot be reset." + ) + + if a is None and b is None and c is None and epilog_inputs is None and alpha is None and beta is None: + if self.memory_space == "cpu" and self.operands is not None: + with utils.device_ctx(self.device_id): + for o in self.operands: + if o.is_symmetric_memory: + o.free_symmetric() + self.operands = None + self.epilog_operands = {} + self.logger.info("The operands have been reset to None.") + return + + # If the operands have been reset to None, then all required operands (a, b, c, and + # epilog_inputs need to be provided). 
+ if self.operands is None: + if a is None or b is None or (c is None and self.num_operands == 3): + op_names = "A, B" + if c is None and self.num_operands == 3: + op_names += ", C" + raise ValueError(f"Operands {op_names} must be provided.") + epilog_names = self.epilog_inputs_traits.keys() + if epilog_inputs is None: + if epilog_names: + raise ValueError(f"The epilog inputs {epilog_names} must be provided.") + else: + # Check that all required epilog inputs names are provided. + if epilog_names != epilog_inputs.keys(): + raise ValueError( + f"The epilog inputs {epilog_names} are required. The provided epilog input names are " + f"{epilog_inputs.keys()}." + ) + self.operands = [None] * self.num_operands # type: ignore + self.epilog_operands = dict.fromkeys(epilog_names) + + # Future operations on the workspace stream should be ordered after the computation. + if self.last_compute_event is not None: + self.workspace_stream.wait(self.last_compute_event) + self.last_compute_event = None + + # Update alpha. + if alpha is not None: + try: + self.alpha[0] = alpha + except (ValueError, TypeError) as e: + raise ValueError( + f"The value provided for alpha {alpha} is not convertible to dtype '{self.alpha.dtype}'." + ) from e + + # Update beta. + if beta is not None: + if self.num_operands == 2: + self.logger.warning(f"Matmul: The provided beta value {beta} is ignored since operand C is not specified.") + else: + try: + self.beta[0] = beta + except (ValueError, TypeError) as e: + raise ValueError( + f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'." + ) from e + + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + + # Reset the provided operands. + if a is not None: + a = tensor_wrapper.wrap_operand(a) + index = 0 + self._check_and_set_operand( + a, + "A", + self.mm_desc_ifc, + stream_holder, + operand_index=index, + dtype=self.a_dtype_name, + extents=self.operand_extents[index], + strides=self.operand_strides[index], + ) + + if b is not None: + b = tensor_wrapper.wrap_operand(b) + index = 1 + self._check_and_set_operand( + b, + "B", + self.mm_desc_ifc, + stream_holder, + operand_index=index, + dtype=self.b_dtype_name, + extents=self.operand_extents[index], + strides=self.operand_strides[index], + ) + + if c is not None: # If we get here, we know that C is one of the operands in the problem specification. + c = tensor_wrapper.wrap_operand(c) + index = 2 + self._check_and_set_operand( + c, + "C", + self.mm_desc_ifc, + stream_holder, + operand_index=index, + dtype=self.c_dtype_name, + extents=self.operand_extents[index], + strides=self.operand_strides[index], + ) + + # Reset the provided epilog inputs. + if epilog_inputs is not None: + for name in epilog_inputs: + epilog_input = tensor_wrapper.wrap_operand(epilog_inputs[name]) + self._check_and_set_operand( + epilog_input, + name, + self.mm_desc_ifc, + stream_holder, + epilog_name=name, + dtype=self.epilog_inputs_traits[name].dtype, + extents=self.epilog_inputs_traits[name].extents, + strides=self.epilog_inputs_traits[name].strides, + ) + + self._set_result_sheap_flag() + + @utils.precondition(_check_valid_matmul) + @utils.precondition(_check_planned, "Execution") + @utils.precondition(_check_valid_operands, "Execution") + @utils.atomic(_release_workspace_memory_perhaps_wrapper, method=True) + def execute(self, *, release_workspace=False, stream: utils.AnyStream | int | None = None): + """ + Execute a planned distributed matrix multiplication. 
+ + Args: + release_workspace: {release_workspace} + + stream: {stream} + + Returns: + {result} + """ + log_info = self.logger.isEnabledFor(logging.INFO) + log_debug = self.logger.isEnabledFor(logging.DEBUG) + + assert self.operands is not None, "Internal error." + + if log_info: + self.logger.info("= EXECUTION PHASE =") + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) + if log_info: + self.logger.info(f"The specified stream for execute() is {stream_holder.obj}.") + + # Allocate workspace if needed. + self._allocate_workspace_memory_perhaps(stream_holder) + + # Create empty tensors for auxiliary output. + for handler in self.epilog_output_handlers: + name = handler.name + shape, strides, dtype_name = handler.attributes() + if log_debug: + self.logger.debug(f"Beginning auxiliary output tensor '{name}' creation...") + self.logger.debug(f"The '{name}' tensor shape = {shape} with strides = {strides} and data type '{dtype_name}'.") + self.epilog_outputs[name] = aux_tensor = utils.create_empty_tensor( + self.result_class, + shape, + dtype_name, + self.device_id, + stream_holder, + verify_strides=False, + strides=strides, + ) + if log_debug: + self.logger.debug(f"The auxiliary output tensor '{name}' has been created.") + + # Update the data pointer in the MM descriptor. + handler.update_ptr(self.mm_desc_ifc, aux_tensor.data_ptr) + + # Create empty tensor for the result. + # result_layout is based on local properties. + assert self.result_layout is not None, "Internal Error. self.result_layout should have been set by self.plan()" + if log_debug: + self.logger.debug("Beginning output (empty) tensor creation...") + self.logger.debug( + f"The local output tensor shape = {self.result_layout.shape} with strides = " + f"{self.result_layout.strides} and data type '{self.d_dtype_name}'." 
+ ) + result = cast( + DistributedTensor, + utils.create_empty_tensor( + self.result_class, + self.result_layout.shape, + self.d_dtype_name, + self.device_id, + stream_holder, + verify_strides=False, + strides=self.result_layout.strides, + symmetric_memory=self.result_on_symmetric_memory, + make_symmetric=self.result_on_symmetric_memory, + ), + ) + if log_debug: + self.logger.debug("The output (empty) tensor has been created.") + + self.aux_outputs = {} + + a, b = self.operands[0], self.operands[1] + raw_workspace_ptr_device = utils.get_ptr_from_memory_pointer(self.workspace_device) + if log_info: + self.logger.info("Starting distributed matrix multiplication...") + self.logger.info(f"{self.call_prologue}") + with utils.cuda_call_ctx(stream_holder, self.blocking, timing=log_info) as ( + self.last_compute_event, + elapsed, + ): + cublasMp.stream_set(self.handle, stream_holder.ptr) + + nullptr = 0 + cublasMp.matmul( + self.handle, + self.mm_desc, + self.mm_traits.M, + self.mm_traits.N, + self.mm_traits.K, + self.alpha.ctypes.data, + a.data_ptr, + 1, + 1, + self.matrix_descriptors[0], + b.data_ptr, + 1, + 1, + self.matrix_descriptors[1], + self.beta.ctypes.data, + nullptr if self.num_operands == 2 else self.operands[2].data_ptr, + 1, + 1, + self.matrix_descriptors[2], + result.data_ptr, + 1, + 1, + self.matrix_descriptors[3], + raw_workspace_ptr_device, + self.workspace_size_device, + self.workspace_host.ctypes.data if self.workspace_size_host > 0 else nullptr, # type: ignore + self.workspace_size_host, + ) + + if log_info and elapsed.data is not None: + self.logger.info(f"The distributed matrix multiplication calculation took {elapsed.data:.3f} ms to complete.") + + # Establish ordering wrt the computation and free workspace if requested. + if release_workspace: + self._release_workspace_memory_perhaps(True) + + # Return the result and auxiliary outputs, if present. + all_outputs = self.epilog_outputs | self.aux_outputs + if self.memory_space == "cpu": + out = result.to("cpu", stream_holder=stream_holder).tensor + # Copy auxiliary output to CPU. + aux = {name: all_outputs[name].to("cpu", stream_holder=stream_holder).tensor for name in all_outputs} + else: + out = result.tensor + # Return the unwrapped epilog output tensor(s). + aux = {name: all_outputs[name].tensor for name in all_outputs} + + # Release internal reference to the result to permit recycling of memory. + if self.memory_space == "cpu" and result.is_symmetric_memory: + with utils.device_ctx(self.device_id): + result.free_symmetric() + self.aux_outputs = {} + self.epilog_outputs = {} + self._reset_workspace_allocation_tracking() + + if aux: + return out, aux + + return out + + def free(self): + """Free Matmul resources. + + It is recommended that the :class:`Matmul` object be used within a context, but if + it is not possible then this method must be called explicitly to ensure that the + matrix multiplication resources (especially internal library objects) are properly + cleaned up. + """ + + if not self.valid_state: + return + + try: + # Future operations on the workspace stream should be ordered after the + # computation. + if self.last_compute_event is not None: + self.workspace_stream.wait(self.last_compute_event) + self.last_compute_event = None + + self._free_workspace_memory() + + self._free_plan_resources() + + with utils.device_ctx(self.device_id): + # Destroy matmul descriptor. 
+ if self.mm_desc is not None: + cublasMp.matmul_descriptor_destroy(self.mm_desc) + self.mm_desc = None + + # NOTE: cuBLASMp grids are stored in the global cache and destroyed + # when the cache is cleared (this just clears the references from + # this object). + self.lib_process_grids = [] + + # Destroy cuBLASMp library handle. + if self.handle is not None: + cublasMp.destroy(self.handle) + self.handle = None + + if self.memory_space == "cpu": + # In this case, the operands are internal GPU operands owned by Matmul + for operand in self.operands: + if operand.is_symmetric_memory: + operand.free_symmetric() + self.operands = None + + except Exception as e: + self.logger.critical("Internal error: only part of the Matmul object's resources have been released.") + self.logger.critical(str(e)) + raise e + finally: + self.valid_state = False + + self.logger.info("The Matmul object's resources have been released.") + + +@utils.docstring_decorator(SHARED_MM_DOCUMENTATION, skip_missing=False) +def matmul( + a, + b, + /, + c=None, + *, + distributions: Sequence[Distribution], + alpha=None, + beta=None, + epilog=None, + epilog_inputs=None, + qualifiers=None, + options=None, + preferences=None, + stream: utils.AnyStream | int | None = None, +): + """ + Perform the specified distributed matrix multiplication computation + :math:`F(\\alpha a @ b + \\beta c)`, where :math:`F` is the epilog. This function-form + is a wrapper around the stateful :class:`Matmul` object APIs and is meant for *single* + use (the user needs to perform just one matrix multiplication, for example), in which + case there is no possibility of amortizing preparatory costs. + + Detailed information on what's happening within this function can be obtained by passing + in a :class:`logging.Logger` object to :class:`MatmulOptions` or by setting the + appropriate options in the root logger object, which is used by default: + + >>> import logging + >>> logging.basicConfig( + ... level=logging.INFO, + ... format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) + + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. + + Args: + a: {a} + + b: {b} + + c: {c} + + distributions: {distributions} + + alpha: {alpha} + + beta: {beta} + + epilog: {epilog} + + epilog_inputs: {epilog_inputs} + + qualifiers: {qualifiers} + + options: {options} + + preferences: {preferences} + + stream: {stream} + + Returns: + {result} + + Semantics: + {semantics} + + .. seealso:: + :class:`Matmul`, :class:`MatmulOptions`, :class:`MatmulEpilog`, + :class:`MatmulPlanPreferences` + + Examples: + + >>> import cupy as cp + >>> import nvmath.distributed + >>> from nvmath.distributed.distribution import Slab + + Get MPI communicator used to initialize nvmath.distributed (for information on + initializing ``nvmath.distributed``, you can refer to the documentation or to the + Matmul examples in `nvmath/examples/distributed/linalg/advanced + `_): + + >>> comm = nvmath.distributed.get_context().communicator + + Get my process rank: + + >>> rank = comm.Get_rank() + + Create three float32 ndarrays on the GPU: + + >>> M, N, K = 128, 64, 256 + >>> a_shape = Slab.X.shape(rank, (M, K)) + >>> b_shape = Slab.Y.shape(rank, (K, N)) + >>> c_shape = Slab.X.shape(rank, (M, N)) + >>> device_id = nvmath.distributed.get_context().device_id + >>> with cp.cuda.Device(device_id): + ... a = cp.asfortranarray(cp.random.rand(*a_shape, dtype=cp.float32)) + ... 
b = cp.asfortranarray(cp.random.rand(*b_shape, dtype=cp.float32)) + ... c = cp.asfortranarray(cp.random.rand(*c_shape, dtype=cp.float32)) + + Perform the operation :math:`\\alpha A @ B + \\beta C` using :func:`matmul`. The + result `r` is also a CuPy float32 ndarray: + + >>> distributions = [Slab.X, Slab.Y, Slab.X] + >>> r = nvmath.distributed.linalg.advanced.matmul( + ... a, b, c, alpha=1.23, beta=0.74, distributions=distributions + ... ) + + Options can be provided to customize the operation: + + >>> compute_type = ( + ... nvmath.distributed.linalg.advanced.MatmulComputeType.COMPUTE_32F_FAST_TF32 + ... ) + >>> o = nvmath.distributed.linalg.advanced.MatmulOptions(compute_type=compute_type) + >>> r = nvmath.distributed.linalg.advanced.matmul( + ... a, b, distributions=distributions, options=o + ... ) + + See `MatmulOptions` for the complete list of available options. + + The package current stream is used by default, but a stream can be explicitly + provided to the Matmul operation. This can be done if the operands are computed on a + different stream, for example: + + >>> with cp.cuda.Device(device_id): + ... s = cp.cuda.Stream() + ... with s: + ... a = cp.asfortranarray(cp.random.rand(*a_shape)) + ... b = cp.asfortranarray(cp.random.rand(*b_shape)) + >>> r = nvmath.distributed.linalg.advanced.matmul( + ... a, b, distributions=distributions, stream=s + ... ) + + The operation above runs on stream `s` and is ordered with respect to the input + computation. + + Create NumPy ndarrays on the CPU. + + >>> import numpy as np + >>> a = np.asfortranarray(np.random.rand(*a_shape)) + >>> b = np.asfortranarray(np.random.rand(*b_shape)) + + Provide the NumPy ndarrays to :func:`matmul`, with the result also being a NumPy + ndarray: + + >>> r = nvmath.distributed.linalg.advanced.matmul(a, b, distributions=distributions) + + Notes: + - This function is a convenience wrapper around :class:`Matmul` and is + specifically meant for *single* use. + + Further examples can be found in the `nvmath/distributed/examples/linalg/advanced/matmul + `_ + directory. + """ + preferences = utils.check_or_create_options( + _configuration.MatmulPlanPreferences, preferences, "Matrix multiplication plan preferences" + ) + + with Matmul( + a, + b, + c=c, + distributions=distributions, + alpha=alpha, + beta=beta, + qualifiers=qualifiers, + options=options, + stream=stream, + ) as mm: + mm.plan(preferences=preferences, epilog=epilog, epilog_inputs=epilog_inputs, stream=stream) + + r = mm.execute(stream=stream) + + return r + + +class cuBLASMpProcessGridCache: + def __init__(self): + self.cache = {} + self.cache_alt = {} + self.device_id = None + import threading + + self.lock = threading.Lock() + + def get_library_process_grid(self, process_grid, device_id, nccl_comm, from_alt=False): + """**This is a collective call**. 
Caller must make sure to set device context.""" + with self.lock: + if self.device_id is None: + self.device_id = device_id + else: + assert self.device_id == device_id + cache = self.cache if not from_alt else self.cache_alt + if process_grid not in cache: + process_grid_cpp = cublasMp.grid_create( + process_grid.shape[0], + process_grid.shape[1], + process_grid.layout, + nccl_comm, + ) + cache[process_grid] = process_grid_cpp + return process_grid_cpp + return cache[process_grid] + + def clear(self): + """This is a collective call.""" + with self.lock: + if len(self.cache) == 0 and len(self.cache_alt) == 0: + return + with utils.device_ctx(self.device_id): + for cache in (self.cache, self.cache_alt): + for grid in cache.values(): + cublasMp.grid_destroy(grid) + cache.clear() + + +_grid_cache = cuBLASMpProcessGridCache() diff --git a/nvmath/distributed/reshape/_configuration.py b/nvmath/distributed/reshape/_configuration.py index c2302c0..c7ed33a 100644 --- a/nvmath/distributed/reshape/_configuration.py +++ b/nvmath/distributed/reshape/_configuration.py @@ -27,7 +27,7 @@ class ReshapeOptions: is on the CPU to ensure that the user doesn't inadvertently use the result before it becomes available. The default is ``"auto"``. - See Also: + .. seealso:: :class:`Reshape` and :func:`reshape`. """ diff --git a/nvmath/distributed/reshape/reshape.py b/nvmath/distributed/reshape/reshape.py index 4058f0c..d15cd32 100644 --- a/nvmath/distributed/reshape/reshape.py +++ b/nvmath/distributed/reshape/reshape.py @@ -12,6 +12,7 @@ import numpy as np import nvmath.distributed +from nvmath.distributed.distribution import Box from nvmath.internal import formatters, utils from nvmath.internal.package_wrapper import StreamHolder, AnyStream from nvmath.bindings import cufftMp # type: ignore @@ -31,10 +32,6 @@ class TensorLayout: strides: Sequence[int] -# Box contains lower and upper coordinates, so it must be of length 2 in practice. -Box = Sequence[Sequence[int]] - - @dataclass class _ProblemSpec: """This is used in a custom MPI reduction to check that the Reshape problem @@ -55,13 +52,13 @@ def __init__(self, options: ReshapeOptions): blocking: Literal[True, "auto"] - shape: tuple[int] # operand shape + shape: tuple[int, ...] # operand shape is_F: bool # Is Fortran memory layout is_C: bool # Is C memory layout operand_dtype: str # str because TensorHolder.dtype returns str package: Literal["numpy", "cupy", "torch"] # operand package memory_space: Literal["cuda", "cpu"] # operand memory space - boxes: Sequence[Box] # Reshape input and output box + boxes: list[Box] # Reshape input and output box options: Options # Reshape options # Global number of elements in the operand (calculated as part of the reduction). @@ -185,47 +182,29 @@ def _problem_spec_reducer(p1: _ProblemSpec, p2: _ProblemSpec): if not p1.is_F and not p1.is_C: return ValueError("The input memory layout is not C or Fortran, or is inconsistent across processes") - if len(p1.boxes) != 2 or len(p2.boxes) != 2: - return ValueError("Must provide input and output boxes on all processes") - input_box1, output_box1 = p1.boxes - input_box2, output_box2 = p2.boxes - for box in (input_box1, output_box1, input_box2, output_box2): - if len(box) != 2: - return ValueError(f"Box {box} must have lower and upper coordinates") - lower, upper = box - if len(lower) != len(p1.shape) or len(upper) != len(p1.shape): - return ValueError( - f"The number of coordinates in each coordinate pair of box {box} must " - f"match the number of operand dimensions {len(p1.shape)}." 
- ) - if not all(upper[i] > lower[i] for i in range(len(p1.shape))): - return ValueError( - f"The upper coordinates must be larger than the lower coordinates, but got lower={lower} upper={upper}" - ) - for p_spec in (p1, p2): if p_spec.is_leaf: - # Check that the input box shape of this process matches the shape of the - # input operand. - input_lower, input_upper = p_spec.boxes[0] - input_box_shape = tuple(input_upper[i] - input_lower[i] for i in range(len(p_spec.shape))) - if input_box_shape != tuple(p_spec.shape): - return ValueError(f"The operand shape {p_spec.shape} does not match the input box shape {input_box_shape}") + for box in p_spec.boxes: + if not isinstance(box, Box): + return ValueError(f"{box} is not a Box distribution") + if box.ndim != len(p_spec.shape): + return ValueError( + f"The dimensionality of {box} doesn't match the dimensionality of " + f"the reshape operand ({len(p_spec.shape)})" + ) if p1 is not p2: # with nranks=1 p1 is p2 p1.global_size += p2.global_size def reduce_boxes(box1, box2): """This function returns the smallest box that encompasses `box1` and `box2`""" - lower1, upper1 = box1 - lower2, upper2 = box2 - lower = np.minimum(np.array(lower1), np.array(lower2)).tolist() - upper = np.maximum(np.array(upper1), np.array(upper2)).tolist() - return lower, upper + lower = np.minimum(np.array(box1.lower), np.array(box2.lower)).tolist() + upper = np.maximum(np.array(box1.upper), np.array(box2.upper)).tolist() + return Box(lower, upper) # Merge the boxes to get the global operand shape. Note that this is applied # progressively throughout the MPI reduction, starting with the local boxes. - p1.boxes = (reduce_boxes(input_box1, input_box2), reduce_boxes(output_box1, output_box2)) + p1.boxes = [reduce_boxes(p1.boxes[0], p2.boxes[0]), reduce_boxes(p1.boxes[1], p2.boxes[1])] except Exception as e: return e @@ -283,7 +262,7 @@ class Reshape: stream: {stream} - See Also: + .. seealso:: :meth:`plan`, :meth:`reset_operand`, :meth:`execute` Examples: @@ -321,10 +300,11 @@ class Reshape: NOTE: each process has its own input and output boxes which are different to those of other processes, as each holds a different section of the global array. + >>> from nvmath.distributed.distribution import Box >>> if comm.Get_rank() == 0: ... input_lower = (0, 0, 0) ... input_upper = (4, 4, 4) - ... input_box = [input_lower, input_upper] + ... input_box = Box(input_lower, input_upper) ... output_box = ... ... else: ... input_box = ... # the input box depends on the process. @@ -420,6 +400,8 @@ def __init__( "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" " for more information." ) + if not distributed_ctx.nvshmem_available: + raise RuntimeError("nvmath.distributed wasn't initialized with NVSHMEM backend") self.communicator = distributed_ctx.communicator nranks = self.communicator.Get_size() @@ -432,6 +414,9 @@ def __init__( is_C = sorted(operand.strides, reverse=True) == list(operand.strides) is_F = sorted(operand.strides) == list(operand.strides) + input_box = cast(Box, input_box.copy()) + output_box = cast(Box, output_box.copy()) + # Merge the problem specification across processes to make sure that there are no # inconsistencies and to calculate the global shape. 
Importantly, this also does # collective error checking of the Reshape input parameters, to ensure that all @@ -481,13 +466,17 @@ def __init__( global_shape = tuple(int(upper[i] - lower[i]) for i in range(self.operand_dim)) self.logger.info(f"The global shape of the operand is {global_shape}.") + # This can't throw error since the local operand shape was already checked + # against the box shape in the ProblemSpec reducer. + input_box._bind(global_shape, shape=operand.shape) + # The global number of elements must be compatible with the global shape. if problem_spec.global_size != math.prod(global_shape): raise ValueError(f"The global number of elements is incompatible with the inferred global shape {global_shape}") # Store the local input and output box. - self.input_box = input_box - self.output_box = output_box + self.input_box: Box = input_box + self.output_box: Box = output_box self.operand_data_type = operand.dtype # TODO: change to `operand.dtype.itemsize` once operand is StridedMemoryView. @@ -1163,7 +1152,7 @@ def reshape( A tensor that remains on the same device and belongs to the same package as the input operand, with shape according to output_box. - See Also: + .. seealso:: :class:`Reshape`. Examples: @@ -1201,10 +1190,11 @@ def reshape( NOTE: each process has its own input and output boxes which are different to those of other processes, as each holds a different section of the global array. + >>> from nvmath.distributed.distribution import Box >>> if comm.Get_rank() == 0: ... input_lower = (0, 0, 0) ... input_upper = (4, 4, 4) - ... input_box = [input_lower, input_upper] + ... input_box = Box(input_lower, input_upper) ... output_box = ... ... else: ... input_box = ... # the input box depends on the process. diff --git a/nvmath/fft/_configuration.py b/nvmath/fft/_configuration.py index 3d3a6bc..5c1660c 100644 --- a/nvmath/fft/_configuration.py +++ b/nvmath/fft/_configuration.py @@ -23,7 +23,7 @@ class ExecutionCUDA: device_id: CUDA device ordinal (only used if the operand resides on the CPU). The default value is 0. - See Also: + .. seealso:: :class:`ExecutionCPU`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. """ @@ -45,7 +45,7 @@ class ExecutionCPU: If not specified, defaults to the number of CPU cores available to the process. - See Also: + .. seealso:: :class:`ExecutionCUDA`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. """ @@ -110,7 +110,7 @@ class FFTOptions: PyTorch operands, :func:`cupy.cuda.alloc` otherwise). Currently, CPU execution does not allow specifying a memory pool. - See Also: + .. seealso:: :class:`ExecutionCUDA`, :class:`ExecutionCPU`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. """ @@ -164,7 +164,7 @@ class DeviceCallable: data: A device pointer to user data used in the callback. The default is None, which means a null pointer will be used in the callback. - See Also: + .. seealso:: :meth:`FFT.plan`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. """ @@ -200,7 +200,7 @@ def __post_init__(self): class FFTDirection(IntEnum): """An IntEnum class specifying the direction of the transform. - See Also: + .. seealso:: :meth:`FFT.execute`, :func:`fft` """ diff --git a/nvmath/fft/_helpers.py b/nvmath/fft/_helpers.py index 134d9c4..c7f2501 100644 --- a/nvmath/fft/_helpers.py +++ b/nvmath/fft/_helpers.py @@ -175,7 +175,7 @@ def _compile( Returns: The function compiled to LTO-IR as `bytes` object. - See Also: + .. 
seealso:: :func:`~nvmath.fft.fft`, :meth:`~nvmath.fft.FFT.plan`, :meth:`~nvmath.fft.compile_epilog`. @@ -205,7 +205,7 @@ def _compile( Returns: The function compiled to LTO-IR as `bytes` object. - See Also: + .. seealso:: :func:`~nvmath.fft.fft`, :meth:`~nvmath.fft.FFT.plan`, :meth:`~nvmath.fft.compile_prolog`. diff --git a/nvmath/fft/fft.py b/nvmath/fft/fft.py index 359e98a..af3a983 100644 --- a/nvmath/fft/fft.py +++ b/nvmath/fft/fft.py @@ -10,7 +10,6 @@ import enum import functools import logging -import math import operator from ._configuration import ExecutionCPU, ExecutionCUDA, FFTOptions, FFTDirection, DeviceCallable @@ -869,57 +868,6 @@ def get_data(device_callable): return plan_args, callable_data, data_cls_astuple(execution) # type: ignore[arg-type] -def _has_only_small_factors_extent(extent): - # fast track for powers of 2 (and zero) - if extent & (extent - 1) == 0: - return True - # Divide the `extent` by the product of all the prime factors up to - # 127 present in the `extent` until there are none left. - # Considering all the prime factors at once is faster, even though the - # first call (and only the first call) to gcd operates on ints with precision - # exceeding 64 bits. For common 2, 3, 5, 7 factors, the higher powers are included, - # to reduce number of iterations. - # math.prod(p for p in range(2, 128) if is_prime(p)) * 2**10 * 3**7 * 5**5 * 7**4 - magic_prod = 67455891904760197438286248026720562156610454525830430432000000 - d = math.gcd(extent, magic_prod) - while d > 1: - if extent == d: - return True - extent //= d - d = math.gcd(extent, d) - return False - - -def _has_only_small_factors_shape(shape): - return all(extent <= 2048 or _has_only_small_factors_extent(extent) for extent in shape) - - -def check_is_shape_supported_lto_ea(operand, plan_traits, fft_abstract_type): - if fft_abstract_type != "C2R": - shape = plan_traits.ordered_fft_in_shape - else: - shape = plan_traits.ordered_fft_out_shape - if not _has_only_small_factors_shape(shape): - raise ValueError( - f"cuFFT LTO EA does not support callbacks with inputs of certain shapes. " - f"Tensor with extents comprasing prime factors larger than 127 are not supported. " - f"Got a tensor of shape {operand.shape}." - ) - if len(shape) == 3 and sum(e == 1 for e in shape) == 1 and shape[-1] == 1: - raise ValueError( - "cuFFT LTO EA does not support callbacks with inputs of certain shapes. " - "3D FFT with the last extent equal 1 are not supported" - ) - - -def _check_prolog_epilog_traits(prolog, epilog, plan_traits, operand, fft_abstract_type): - # Since the version 11300, cufft does the validation itself. - # In earlier versions, it could ignore the callback silently - # for unsupported shapes - if (prolog or epilog) and cufft.get_version() < 11300: - check_is_shape_supported_lto_ea(operand, plan_traits, fft_abstract_type) - - def set_prolog_and_epilog(handle, prolog, epilog, operand_dtype, result_dtype, logger): def set_callback(cbkind, cbobj, dtype): if cbobj is None: @@ -929,18 +877,13 @@ def set_callback(cbkind, cbobj, dtype): CBType = CBLoadType if cbkind == "prolog" else CBStoreType try: - cufft.xt_set_jit_callback(handle, cbobj.ltoir, cbobj.size, CBType[dtype.upper()], [cbobj.data]) + cufft.xt_set_jit_callback(handle, 0, cbobj.ltoir, cbobj.size, CBType[dtype.upper()], [cbobj.data]) except _bindings_utils.FunctionNotFoundError as e: version = cufft.get_version() raise RuntimeError( f"The currently running cuFFT version {version} does not support LTO callbacks. 
\n" - f"The following cuFFT releases support LTO callbacks: \n" - f"1. cuFFT shipped with CUDA 12.6U2 (11.3.0) or newer \n" - f"2. the older, experimental cuFFT LTO EA (early access) preview build " - f"(https://developer.nvidia.com/cufftea).\n" - f"To use version different from the one shipped with CUDA Toolkit, please make " - f"sure the right 'libcufft.so' takes precedence for nvmath. " - f"For example, by adjusting the 'LD_LIBRARY_PATH' or 'LD_PRELOAD'." + f"cuFFT LTO callbacks are supported starting with cuFFT 11.3, " + f"shipped with CUDA Toolkit 12.6U2 (11.3.0) or newer. \n" ) from e logger.info(f"The specified LTO-IR {cbkind} has been set.") @@ -1004,7 +947,7 @@ class FFT: stream: {stream} - See Also: + .. seealso:: :meth:`plan`, :meth:`reset_operand`, :meth:`execute`, :meth:`create_key` Examples: @@ -1309,7 +1252,7 @@ def get_key(self, *, prolog: DeviceCallable | None = None, epilog: DeviceCallabl Returns: {fft_key} - See Also: + .. seealso:: :meth:`create_key` """ return create_fft_key( @@ -1490,7 +1433,6 @@ def plan( # Set LTO-IR callbacks, if present. prolog = utils.check_or_create_options(DeviceCallable, prolog, "prolog", keep_none=True) epilog = utils.check_or_create_options(DeviceCallable, epilog, "epilog", keep_none=True) - _check_prolog_epilog_traits(prolog, epilog, self.plan_traits, self.operand, self.fft_abstract_type) set_prolog_and_epilog(self.handle, prolog, epilog, self.operand_data_type, self.result_data_type, self.logger) # Get all the arguments to xt_make_plan_many except for the first (the handle). @@ -2053,7 +1995,7 @@ def _fft( A transformed operand that retains the same data type and shape as the input. It remains on the same device and uses the same package as the input operand. - See Also: + .. seealso:: :func:`ifft`, :func:`irfft`, :func:`rfft`, :class:`FFT` Examples: @@ -2182,7 +2124,7 @@ def rfft( ``operand.shape[axes[-1]] // 2 + 1``. - See Also: + .. seealso:: :func:`fft`, :func:`irfft`, :class:`FFT`. """ wrapped_operand = tensor_wrapper.wrap_operand(operand) @@ -2229,7 +2171,7 @@ def rfft( A transformed operand that retains the same data type and shape as the input. It remains on the same device and uses the same package as the input operand. - See Also: + .. seealso:: :func:`fft`, :func:`irfft`, :class:`FFT`. Notes: @@ -2284,7 +2226,7 @@ def irfft( ``even``, or ``operand.shape[axes[-1]] * 2 - 1`` if :attr:`FFTOptions.last_axis_parity` is ``odd``. - See Also: + .. seealso:: :func:`fft`, :func:`ifft`, :class:`FFT`. Examples: diff --git a/nvmath/internal/tensor_ifc.py b/nvmath/internal/tensor_ifc.py index 676b229..7d5563f 100644 --- a/nvmath/internal/tensor_ifc.py +++ b/nvmath/internal/tensor_ifc.py @@ -91,16 +91,19 @@ def empty(cls, shape: Sequence[int], device_id: int | Literal["cpu"], **context: @property @abstractmethod def shape(self) -> Sequence[int]: + """The extent of each dimension in number of elements.""" raise NotImplementedError @property @abstractmethod def size(self) -> int: + """The total number of elements in the tensor.""" raise NotImplementedError @property @abstractmethod def strides(self) -> Sequence[int]: + """The stride of each dimension in number of elements.""" raise NotImplementedError @abstractmethod @@ -153,3 +156,11 @@ def reshape(self, shape: Sequence[int], *, copy: bool | None = None) -> TensorHo the function must avoid copying, if possible, and may copy otherwise. """ raise NotImplementedError + + @property + def is_conjugate(self) -> bool: + """Return True when self.tensor has a no-op conjugation flag enabled. 
+ + The default implementation always returns False. + """ + return False diff --git a/nvmath/internal/tensor_ifc_torch.py b/nvmath/internal/tensor_ifc_torch.py index dacea4b..9ff3ffb 100644 --- a/nvmath/internal/tensor_ifc_torch.py +++ b/nvmath/internal/tensor_ifc_torch.py @@ -120,3 +120,7 @@ def reshape(self, shape: Sequence[int], *, copy: bool | None = None): if copy is False: return self.__class__(self.tensor.view(shape)) return self.__class__(self.tensor.reshape(shape)) + + @property + def is_conjugate(self) -> bool: + return self.tensor.is_conj() diff --git a/nvmath/internal/utils.py b/nvmath/internal/utils.py index 3625549..109f67f 100644 --- a/nvmath/internal/utils.py +++ b/nvmath/internal/utils.py @@ -638,7 +638,7 @@ def decorator(func_or_class): # update the docstring of all public methods with docstrings static_methods = [] # staticmethods appear to require special handling for name, method in vars(func_or_class).items(): - if isinstance(method, staticmethod): + if isinstance(method, (staticmethod, classmethod)): static_methods.append(name) continue if callable(method) and (not name.startswith("_")) and method.__doc__: @@ -646,8 +646,8 @@ def decorator(func_or_class): # update the docstring of the constructor func_or_class.__doc__ = _format_doc(func_or_class.__doc__) for name in static_methods: - method = getattr(func_or_class, name) - method.__doc__ = _format_doc(method.__doc__) + method = func_or_class.__dict__[name] + method.__func__.__doc__ = _format_doc(method.__func__.__doc__) return func_or_class else: # function decorator func_or_class.__doc__ = _format_doc(func_or_class.__doc__) diff --git a/nvmath/linalg/__init__.py b/nvmath/linalg/__init__.py index ebe6de8..36f2d1a 100644 --- a/nvmath/linalg/__init__.py +++ b/nvmath/linalg/__init__.py @@ -4,8 +4,42 @@ from . import advanced from nvmath.bindings.cublas import ComputeType # type: ignore +from .generic import ( + DiagonalMatrixQualifier, + ExecutionCPU, + ExecutionCUDA, + GeneralMatrixQualifier, + HermitianMatrixQualifier, + InvalidMatmulState, + matmul, + Matmul, + MatmulOptions, + MatrixQualifier, + matrix_qualifiers_dtype, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, + SideMode, + FillMode, + DiagType, +) __all__ = [ "advanced", "ComputeType", + "DiagonalMatrixQualifier", + "ExecutionCPU", + "ExecutionCUDA", + "GeneralMatrixQualifier", + "HermitianMatrixQualifier", + "InvalidMatmulState", + "matmul", + "Matmul", + "MatmulOptions", + "MatrixQualifier", + "matrix_qualifiers_dtype", + "SymmetricMatrixQualifier", + "TriangularMatrixQualifier", + "SideMode", + "FillMode", + "DiagType", ] diff --git a/nvmath/linalg/_internal/batch.py b/nvmath/linalg/_internal/batch.py new file mode 100644 index 0000000..84ea468 --- /dev/null +++ b/nvmath/linalg/_internal/batch.py @@ -0,0 +1,120 @@ +from collections.abc import Sequence +import math + +from dataclasses import dataclass + +from nvmath.linalg._internal.utils import check_batch_tileable +from nvmath._internal.layout import is_overlapping_layout + + +@dataclass(slots=True, frozen=True) +class BatchTraits: + """Represents the traits of a batched data. + + A BatchTraits is valid if the non-batch dimensions are contiguous and dense (the data + span the entire extent of the tensor and there are no gaps between adjacent elements). + The batch dimensions may be optionally overlapping. + + Attributes: + shape: The dimensions of the batch. An empty tuple `()` indicates no batching. + strides: The memory strides for each dimension of the batch. 
+ overlap_allowed: Whether the batch dimensions are allowed to overlap in memory. + + """ + + shape: Sequence[int] + strides: Sequence[int] + overlap_allowed: bool = False + + @classmethod + def from_full_shape_and_strides( + cls, + shape: Sequence[int], + strides: Sequence[int], + num_trailing_dims: int, + overlap_allowed: bool = False, + ): + leading_shape, _ = shape[:-num_trailing_dims], shape[-num_trailing_dims:] + leading_strides, _ = strides[:-num_trailing_dims], strides[-num_trailing_dims:] + # Check for valid batching + # The samples in the batch must be tileable. + if leading_shape and not check_batch_tileable(leading_shape, leading_strides): + message = ( + f"The batch layout corresponding to shape = {leading_shape} and strides = " + f"{leading_strides} is currently not supported because it is not tileable." + ) + raise ValueError(message) + + if not overlap_allowed and is_overlapping_layout(leading_shape, leading_strides): + raise ValueError( + "Only non-overlapping tensors are valid for batching. " + f"Shape {leading_shape} with strides {leading_strides} is not valid for batching." + ) + # It's OK if we prune off the information about the trailing dimensions because the + # strides should already contain information about them. + return BatchTraits( + shape=leading_shape, + strides=leading_strides, + overlap_allowed=overlap_allowed, + ) + + @classmethod + def from_full_shape_only( + cls, + shape: Sequence[int], + num_trailing_dims: int, + ): + """Create a BatchTraits from a full shape assuming that the strides are dense + contiguous and row-ordered.""" + leading_shape, _ = shape[:-num_trailing_dims], shape[-num_trailing_dims:] + strides = () if not shape else tuple(math.prod(shape[i + 1 :]) for i in range(len(shape))) + leading_strides, _ = strides[:-num_trailing_dims], strides[-num_trailing_dims:] + return BatchTraits( + shape=leading_shape, + strides=leading_strides, + overlap_allowed=False, + ) + + @property + def count(self): + """The total number of elements in the batch. Returns 0 if non-batched.""" + return 0 if not self.shape else math.prod(self.shape) + + @property + def stride(self): + """The stride between elements the batch. Returns 0 if non-batched.""" + return 0 if not self.shape else min(self.strides) + + def __eq__(self, other) -> bool: + if not isinstance(other, BatchTraits): + raise TypeError("Unsupported operand type(s) for ==.") + return self.shape == other.shape and self.strides == other.strides + + def __mul__(self, other) -> Sequence[int]: + """Return the combined shape of two BatchTraits. + + Currently two BatchTraits are only combinable if they are equal or if one is an + empty batch. + + Raises + ------ + ValueError: if the dimensions and strides are incompatible. + """ + if not isinstance(other, BatchTraits): + raise TypeError("Unsupported operand type(s) for *.") + + if self.shape == other.shape: + return self.shape + if not self.shape: + return other.shape + if not other.shape: + return self.shape + + msg = ( + f"Batch dimensions {self.shape} and {other.shape} are not compatible." + "Dimensions must be the same OR one input must be non-batched." 
+ ) + raise ValueError(msg) + + def __str__(self) -> str: + return f"batch dimensions with shape {self.shape} and strides {self.strides}" diff --git a/nvmath/linalg/_internal/layout.py b/nvmath/linalg/_internal/layout.py new file mode 100644 index 0000000..0d99892 --- /dev/null +++ b/nvmath/linalg/_internal/layout.py @@ -0,0 +1,481 @@ +""" +Defines dataclasses for tracking matrix layout and traits specifically for wranging inputs +into a form accepted by matrix multiplication operations in BLAS APIs. + +The module defines two classes: BLASMatrixTraits which tracks a single matrix, and +BLASMMTraits which tracks a collection of BLASMatrixTraits to be used together in a matrix +multiplication. +""" + +import logging +from collections.abc import Sequence +from dataclasses import dataclass + +import nvmath.bindings.cublasLt as cublaslt +import nvmath.bindings.cublas as cublas +from nvmath.internal import typemaps + + +def check_extents(shape: Sequence[int], name: str): + """Raises an error if any element in shape is non-positive. + + 0D tensors are expected to have shape () and stride (). + """ + if any(e <= 0 for e in shape): + message = f"The specified extents {shape} for operand {name} are not valid. The extents must be strictly positive." + raise ValueError(message) + + +def check_strides(strides: Sequence[int], name: str): + """Raises an error if any element in strides is negative. + + 0D tensors are expected to have shape () and stride(). + Strides may be 0 or positive. + BLAS disallows negative strides, but the cuBLAS dgmm extension allows negative strides. + """ + if any(s < 0 for s in strides): + message = f"The specified strides {strides} for operand {name} are not valid. The strides must be non-negative." + raise ValueError(message) + + +@dataclass(slots=True) +class BLASMatrixTraits: + """Manages a tensor's layout and its compatibility with BLAS API mamtuls. + + :class:`BLASMatrixTraits` encapsulates attributes and methods to handle single matrix + layouts, including memory order, operations (e.g., transpose, conjugation), and + dimensions. It provides functionality to transform matrices into compatible formats for + BLAS matrix multiplication. + + Diagonal matrices should be described as the equivalent square matrix, with equal + strides in both dimensions. + + Attributes: + dtype : The CUDA data type representing the matrix element. + + shape: The number of elements along each dimension. + + strides: The number of elements in memory to move between elements along the + corresponding dimension. We expect a stride of 0 for singleton dimensions. + + is_conjugate : A flag indicating if the matrix elements should be conjugated. + + is_transpose : A flag indicating if the matrix should be transposed. + + is_lower : A flag indicating if the matrix is a lower (true) or upper (false) + triangular matrix. We track this by default even if the matrix is full because + we can always ignore this parameter. + """ + + dtype: typemaps.cudaDataType + shape: Sequence[int] + strides: Sequence[int] + is_conjugate: bool + is_transpose: bool + is_lower: bool + + def __post_init__(self): + assert len(self.shape) < 3, "Internal Error: BLASMatrixTraits supports only 0..2D matrices." + assert len(self.strides) == len(self.shape), "Internal Error: BLASMatrixTraits strides and shape must have same length." + assert all(extent > 0 for extent in self.shape), "Internal Error: BLASMatrixTraits supports only positive shapes." 
+ assert all(stride >= 0 for stride in self.strides), ( + "Internal Error: BLASMatrixTraits supports only non-negative strides." + ) + # Set strides for singleton dimensions to zero. The stride length of singleton + # dimensions is unused because we don't travel along it, but non-zero strides will + # confuse us when we try to guess the ordering of the matrix. + self.strides = tuple(0 if extent == 1 else stride for extent, stride in zip(self.shape, self.strides, strict=True)) + + @property + def order(self) -> cublaslt.Order: + """The indexing order of the matrix in memory.""" + if len(self.shape) < 2: + # For vectors and scalars we return COL-order because COL is the default. + return cublaslt.Order.COL + msg = f"Unsupported layout for shape: {self.shape} strides: {self.strides}. At least one dimension must be contiguous." + if 1 not in self.strides and 0 not in self.strides: + raise ValueError(msg) + if self.shape[0] * self.strides[0] <= self.strides[1]: + return cublaslt.Order.COL + if self.shape[1] * self.strides[1] <= self.strides[0]: + return cublaslt.Order.ROW + raise ValueError(msg) + + @property + def ld(self) -> int: + """The leading dimension of the matrix without operations applied.""" + assert self.order == cublaslt.Order.COL, ( + f"Internal Error: {self.__class__.__name__}.ld should only be accessed if the matrix is COL-order." + ) + assert len(self.shape) == 2, ( + f"Internal Error: {self.__class__.__name__}.ld should only be accessed if the matrix is 2D." + ) + if any(stride == 0 and extent != 1 for extent, stride in zip(self.shape, self.strides, strict=True)): + raise ValueError( + f"Unsupported layout for shape: {self.shape} strides: {self.strides}. " + "Only singleton dimensions may have zero stride." + ) + if self.shape[1] == 1: + return self.shape[0] # extent = 1 strides cannot be trusted + return self.strides[1] + + @property + def operation(self) -> cublas.Operation: + """The operation to be applied to the matrix before multiplication.""" + match (self.is_conjugate, self.is_transpose): + case (False, False): + return cublas.Operation.N + case (False, True): + return cublas.Operation.T + case (True, True): + return cublas.Operation.C + case (True, False): + raise NotImplementedError("Conjugate non-transpose operation is not supported.") + case _: + raise NotImplementedError("Conjugate and transpose flags must be python booleans.") + + def mm_shape(self) -> Sequence[int]: + """The shape of the matrix after applying operations.""" + if len(self.shape) == 2: + return self.shape[::-1] if self.is_transpose else self.shape + raise NotImplementedError("mm_shape only implemented for 2D matrices.") + + def transpose_and_reorder(self, logger: logging.Logger): + """Return a new :class:`BLASMatrixTraits` that has been transposed and reordered. + + Simultaneous transpose and reorder is a non-operation because the data does not need + to move. 
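+        For example, a column-major matrix of shape (m, n) is re-described as a
+        row-major matrix of shape (n, m) over the same memory, with the transpose
+        flag flipped to compensate.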
+ """ + new = BLASMatrixTraits( + dtype=self.dtype, + shape=self.shape[::-1], + strides=self.strides[::-1], + is_conjugate=self.is_conjugate, + is_transpose=not self.is_transpose, + is_lower=not self.is_lower, + ) + logger.debug("The matrix was transposed and reordered from %s to %s.", self.order.name, new.order.name) + return new + + def promote_left(self, logger: logging.Logger, ndim: int = 2): + """Return a new :class:`BLASMatrixTraits` with new singleton dimensions added to + the left side of `shape` until the matrix has at least `ndim` dimensions.""" + add = max(ndim - len(self.shape), 0) + promoted = BLASMatrixTraits( + dtype=self.dtype, + shape=(*[1] * add, *self.shape), + strides=(*[0] * add, *self.strides), + is_conjugate=self.is_conjugate, + is_transpose=self.is_transpose, + is_lower=self.is_lower, + ) + if add > 0: + logger.debug("The matrix was promoted from shape %s to shape %s", self.shape, promoted.shape) + return promoted + + def blas_A_compatible(self, logger: logging.Logger): + """Return ``self`` or a new :class:`BLASMatrixTraits` that is compatible with the + BLAS API's A matrices.""" + if len(self.shape) < 2: + return self.promote_left(logger).blas_A_compatible(logger) + if self.order == cublaslt.Order.ROW: + return self.transpose_and_reorder(logger).blas_A_compatible(logger) + # else self.order is COL + if self.is_conjugate and not self.is_transpose: + # We can only perform conjugate transpose; conjugate non-tranpose is not an + # option. + msg = f"BLAS APIs only accept COL-order matrix for A. {self} was not convertible to a valid A." + raise ValueError(msg) + return self + + def promote_right(self, logger: logging.Logger, ndim: int = 2): + """Return a new :class:`BLASMatrixTraits` with new singleton dimensions added to + the right side of `shape` until the matrix has at least `ndim` dimensions.""" + add = max(ndim - len(self.shape), 0) + promoted = BLASMatrixTraits( + dtype=self.dtype, + shape=(*self.shape, *[1] * add), + strides=(*self.strides, *[0] * add), + is_conjugate=self.is_conjugate, + is_transpose=self.is_transpose, + is_lower=self.is_lower, + ) + if add > 0: + logger.debug("The matrix was promoted from shape %s to shape %s", self.shape, promoted.shape) + return promoted + + def blas_B_compatible(self, logger: logging.Logger): + """Return ``self`` or a new :class:`BLASMatrixTraits` that is compatible with the + BLAS API's B matrices.""" + if len(self.shape) < 2: + return self.promote_right(logger).blas_B_compatible(logger) + if self.order == cublaslt.Order.ROW: + return self.transpose_and_reorder(logger).blas_B_compatible(logger) + # else self.order is COL + if self.is_conjugate and not self.is_transpose: + # We can only perform conjugate transpose; conjugate non-tranpose is not an + # option. + msg = f"BLAS APIs only accept COL-order matrix for B. {self} was not convertible to a valid B." + raise ValueError(msg) + return self + + def blas_C_compatible(self, logger: logging.Logger): + """Return ``self`` or a new :class:`BLASMatrixTraits` that is compatible with the + BLAS API's C matrix.""" + if len(self.shape) < 2: + return self.promote_right(logger).blas_C_compatible(logger) + match (self.order, self.is_conjugate, self.is_transpose): + case (cublaslt.Order.COL, False, False): + return self + case (cublaslt.Order.ROW, False, True): + return self.transpose_and_reorder(logger) + case _: + msg = ( + "BLAS APIs only accept COL-order, non-tranpose, non-conjugate matrices for C. " + f"{self} was not convertible to a valid C." 
+ ) + raise ValueError(msg) + + def trim_strides(self): + """Return ``self`` or a new :class:`BLASMatrixTraits` with strides adjusted so + the matrix is contiguous and dense along all dimensions.""" + match len(self.shape): + case 0: + new_strides = () + case 1: + new_strides = (1,) + case 2: + if self.order == cublaslt.Order.COL: + new_strides = (1, self.shape[0]) + else: # self.order == cublaslt.Order.ROW + new_strides = (self.shape[1], 1) + if self.strides == new_strides: + return self + return BLASMatrixTraits( + dtype=self.dtype, + shape=self.shape, + strides=new_strides, + is_conjugate=self.is_conjugate, + is_transpose=self.is_transpose, + is_lower=self.is_lower, + ) + + +@dataclass(slots=True) +class BLASMMTraits: + """ + BLASMMTraits represents the traits required for describing BLAS-compatible matrix + multiplications, including operand A, operand B, and an optional operand C. It ensures + that the operands comply with specific BLAS API compatibility rules, such as + broadcasting, promotion of dimensions, and swapping of A and B when required. + + Attributes: + M : The number of rows in the resulting matrix multiplication. + + N : The number of columns in the resulting matrix multiplication. + + K : The shared dimension between operands A and B for matrix multiplication. + + a_layout_traits : Layout traits for operand A. + + b_layout_traits : Layout traits for operand B. + + c_layout_traits : Layout traits for operand C. + + is_swapped_AB : Indicates if operands A and B were swapped in order to ensure + compatibility. + """ + + M: int | None + N: int | None + K: int | None + a_layout_traits: BLASMatrixTraits + b_layout_traits: BLASMatrixTraits + c_layout_traits: BLASMatrixTraits + is_swapped_AB: bool + + @staticmethod + def from_layouts( + a_layout: BLASMatrixTraits, + b_layout: BLASMatrixTraits, + c_layout: BLASMatrixTraits | None, + logger: logging.Logger, + ): + """Create a `BLASMMTraits` from 2 or 3 `BLASMatrixTraits`. + + See nvmath.linalg.advanced.matmulmod semantics docstring for matrix promotion and + broadcasting rules. 
+ """ + logger.debug("Constructing a BLASMMTraits.") + logger.debug(f"Operand A is shape {a_layout.shape} with strides {a_layout.strides} and order {a_layout.order.name}.") + logger.debug(f"Operand B is shape {b_layout.shape} with strides {b_layout.strides} and order {b_layout.order.name}.") + match len(a_layout.shape): + case 0: + M0, K0 = None, None + case 1: + M0, K0 = None, a_layout.shape[0] + case _: + M0, K0 = a_layout.mm_shape() + match len(b_layout.shape): + case 0: + K1, N0 = None, None + case 1: + K1, N0 = b_layout.shape[0], None + case _: + K1, N0 = b_layout.mm_shape() + if c_layout is None: + logger.debug("Operand C was not provided.") + shape: Sequence[int] + strides: Sequence[int] + if M0 is None and N0 is not None: + shape = (N0,) + strides = (1,) + elif M0 is not None and N0 is None: + shape = (M0,) + strides = (1,) + elif M0 is not None and N0 is not None: + shape = (M0, N0) + strides = (N0, 1) if a_layout.order == cublaslt.Order.ROW and b_layout.order == cublaslt.Order.ROW else (1, M0) + else: # both are None + shape = () + strides = () + c_layout_ = BLASMatrixTraits( + dtype=a_layout.dtype, + shape=shape, + # Create COL-order tensor by default, but match orders if all ROW + strides=strides, + is_transpose=False, + is_conjugate=False, + is_lower=True, + ) + else: + c_layout_ = c_layout + logger.debug(f"Operand C is shape {c_layout_.shape} with strides {c_layout_.strides} and order {c_layout_.order.name}.") + + match len(c_layout_.shape): + case 0: + M1, N1 = None, None + case 1: + if len(a_layout.shape) <= 1: + M1, N1 = c_layout_.shape[0], None + else: + M1, N1 = None, c_layout_.shape[0] + case _: + M1, N1 = c_layout_.mm_shape() + if K0 != K1: + raise ValueError( + f"The 'K' extent must match for the operands: K={K0} in operand A is not equal to K={K1} in operand B." + ) + if M0 is None and N0 is None: + # Both a,b are vectors; c must be a scalar + # NOTE: Since BLAS does not support broadcasting c, the shape of c should match + # a@b exactly not shape (1, 1) or (1,) + if (M1, N1) != (None, None): + raise ValueError(f"When both operands A,B are vectors, operand C must be scalar-like, not shape {(M1, N1)}.") + elif (M0 is not None and N0 is None) or (M0 is None and N0 is not None): + # One of a,b is a vector; c must be a vector. + if (M1 or N1) not in [M0, N0]: + raise ValueError( + f"When one of operands A,B is a vector, operand C must be a vector with shape ({M0 or N0},), " + f"not shape ({M1}, {N1})." + ) + else: + # Both a,b are matrices; c must have shape (M0, N0) + if M0 != M1: + raise ValueError( + f"The 'M' extent must match for the operands: M={M0} in operand A is not equal to M={M1} in operand C." + ) + if N0 != N1: + raise ValueError( + f"The 'N' extent must match for the operands: N={N0} in operand B is not equal to N={N1} in operand C." + ) + + return BLASMMTraits( + M=M0, + N=N0, + K=K0, + a_layout_traits=a_layout, + b_layout_traits=b_layout, + c_layout_traits=c_layout_, + is_swapped_AB=False, + ) + + def blas_compatible(self, logger: logging.Logger, inplace: bool): + """Return ``self`` or a new :class:`BLASMMTraits` that is compatible with the + BLAS API. + + Args + ---- + inplace: Whether C will be inplace or copied into a memory-compact array of the same + shape, but potentially different strides. 
+ """ + logger.debug("Making a BLAS compatible BLASMMTraits.") + a_layout = self.a_layout_traits + b_layout = self.b_layout_traits + c_layout = self.c_layout_traits + is_swapped_AB = False + + logger.debug("Making a BLAS compatible view of operand C.") + if len(a_layout.shape) < 2 and len(c_layout.shape) < 2: + c_layout = c_layout.promote_left(logger) + if c_layout.order == cublaslt.Order.ROW: + c_layout = c_layout.transpose_and_reorder(logger) + if not inplace: + c_layout = c_layout.trim_strides() + if c_layout.is_transpose: + # We can use property of transpose that (A @ B).T = B.T @ A.T to remove + # transpose operation from C. + a_layout, b_layout = b_layout, a_layout + is_swapped_AB = True + a_layout.is_transpose = not a_layout.is_transpose + b_layout.is_transpose = not b_layout.is_transpose + c_layout.is_transpose = not c_layout.is_transpose + logger.debug("Operands A, B will be swapped and transposed in order to transpose C.") + c_layout = c_layout.blas_C_compatible(logger) + logger.debug( + f"The BLAS operand C is shape {c_layout.shape} with strides {c_layout.strides} and order {c_layout.order.name}." + ) + logger.debug("The matrix multiplication will be performed with %s for operand C.", c_layout.operation.name) + + logger.debug("Making a BLAS compatible view of operand A.") + a_layout = a_layout.blas_A_compatible(logger) + logger.debug( + f"The BLAS operand A is shape {a_layout.shape} with strides {a_layout.strides} and order {a_layout.order.name}." + ) + logger.debug("The matrix multiplication will be performed with %s for operand A.", a_layout.operation.name) + + logger.debug("Making a BLAS compatible view of operand B.") + b_layout = b_layout.blas_B_compatible(logger) + logger.debug( + f"The BLAS operand B is shape {b_layout.shape} with strides {b_layout.strides} and order {b_layout.order.name}." + ) + logger.debug("The matrix multiplication will be performed with %s for operand B.", b_layout.operation.name) + + M0, K0 = a_layout.mm_shape() + K1, N0 = b_layout.mm_shape() + M1, N1 = c_layout.mm_shape() + assert M0 is not None + assert K0 is not None + assert N0 is not None + if K0 != K1: + raise ValueError( + f"The 'K' extent must match for the operands: K={K0} in operand A is not equal to K={K1} in operand B." + ) + if M0 != M1: + raise ValueError( + f"The 'M' extent must match for the operands: M={M0} in operand A is not equal to M={M1} in operand C." + ) + if N0 != N1: + raise ValueError( + f"The 'N' extent must match for the operands: N={N0} in operand B is not equal to N={N1} in operand C." + ) + + return BLASMMTraits( + M=M0, + N=N0, + K=K0, + a_layout_traits=a_layout, + b_layout_traits=b_layout, + c_layout_traits=c_layout, + is_swapped_AB=is_swapped_AB, + ) diff --git a/nvmath/linalg/_internal/utils.py b/nvmath/linalg/_internal/utils.py index 306dede..744ea8a 100644 --- a/nvmath/linalg/_internal/utils.py +++ b/nvmath/linalg/_internal/utils.py @@ -18,35 +18,57 @@ import typing +from nvmath.bindings import cublas from nvmath.bindings import cublasLt as cublaslt from nvmath.internal import utils -HANDLES: dict[int, int] = {} +HANDLES: dict[str, dict[int, int]] = { + "cublas": {}, + "cublaslt": {}, +} -def create_handle(device_id: int) -> int: +def create_handle(device_id: int, binding="cublaslt") -> int: """ Currently for internal use only. 
""" with utils.device_ctx(device_id): - handle = cublaslt.create() - + match binding: + case "cublas": + handle = cublas.create() + case "cublaslt" | _: + handle = cublaslt.create() return handle -def destroy_handle(handle: int): +def destroy_handle(handle: int, binding="cublaslt"): """ Currently for internal use only. """ - cublaslt.destroy(handle) + match binding: + case "cublas": + cublas.destroy(handle) + case "cublaslt" | _: + cublaslt.destroy(handle) -def get_handle(device_id: int) -> int: +def get_handle(device_id: int, binding="cublaslt") -> int: """ - Retrieve the BLAS library handle for the specified device. If one doesn't exist, create, - cache, and return the handle. + Retrieve the cuBLAS[lt] library handle for the specified device. If one doesn't exist, + create, cache, and return the handle. + + According to the docs for cublasLtHandle_t, any valid cublasHandle_t can be used in + place of cublasLtHandle_t with a simple cast, so we use the same handle for both APIs. + + We never cleanup these handles (allow them to leak) since we expect to have exactly one + handle per device / thread. """ - return HANDLES.setdefault(device_id, create_handle(device_id)) + if device_id in HANDLES[binding]: + handle = HANDLES[binding][device_id] + else: + handle = create_handle(device_id, binding=binding) + HANDLES[binding][device_id] = handle + return handle def pointer_aligned_to(address): diff --git a/nvmath/linalg/advanced/_configuration.py b/nvmath/linalg/advanced/_configuration.py index a0966d3..4bf1106 100644 --- a/nvmath/linalg/advanced/_configuration.py +++ b/nvmath/linalg/advanced/_configuration.py @@ -95,7 +95,7 @@ class MatmulOptions: the library package will be used (:func:`torch.cuda.caching_allocator_alloc` for PyTorch operands, :func:`cupy.cuda.alloc` otherwise). - See Also: + .. seealso:: :class:`Matmul`, :func:`matmul` """ @@ -140,7 +140,7 @@ def __post_init__(self): raise ValueError("The value specified for blocking must be either True or 'auto'.") if self.allocator is not None and not isinstance(self.allocator, BaseCUDAMemoryManager | BaseCUDAMemoryManagerAsync): - raise TypeError("The allocator must be an object of type that fulfils the BaseCUDAMemoryManager protocol.") + raise TypeError("The allocator must be an object of type that fulfills the BaseCUDAMemoryManager protocol.") matrix_qualifiers_dtype = _np.dtype([("structure", object), ("is_conjugate", ">> r1 = mm.execute() - Finally, free the object's resources. To avoid having to explicitly making this + Finally, free the object's resources. To avoid having to explicitly make this call, it's recommended to use the Matmul object as a context manager as shown below, if possible. @@ -712,8 +713,6 @@ class Matmul: Let's now look at the same problem with CuPy ndarrays on the GPU. - Create a 3-D complex128 CuPy ndarray on the GPU: - >>> import cupy as cp >>> a = cp.random.rand(M, K) >>> b = cp.random.rand(K, N) @@ -838,17 +837,14 @@ def check_dtype(dtype, operand_name): "The qualifiers must be specified as a NumPy array of length 3 corresponding to the operands A, B, and " "C of type 'matrix_qualifiers_dtype'." ) + # Set qualifiers based on torch lazy conjugation flag if not provided. 
+ self.qualifiers[0]["is_conjugate"] = self.qualifiers[0]["is_conjugate"] ^ self.operands[0].is_conjugate + self.qualifiers[1]["is_conjugate"] = self.qualifiers[1]["is_conjugate"] ^ self.operands[1].is_conjugate + self.lazy_conjugation = (self.operands[0].is_conjugate, self.operands[1].is_conjugate, False) + if self.num_operands == 3: + self.qualifiers[2]["is_conjugate"] = self.qualifiers[2]["is_conjugate"] ^ self.operands[2].is_conjugate if self.qualifiers[2]["is_conjugate"]: raise ValueError("The conjugate flag is currently not supported for operand C.") - # Set qualifiers based on torch lazy conjugation flag if not provided. - if self.package == "torch" and qualifiers is None: - self.qualifiers[0]["is_conjugate"] = self.operands[0].tensor.is_conj() - self.qualifiers[1]["is_conjugate"] = self.operands[1].tensor.is_conj() - if len(self.operands) > 2 and self.operands[2].tensor.is_conj(): - raise ValueError("The conjugate flag is currently not supported for operand C.") - self.lazy_conjugation = True - else: - self.lazy_conjugation = False # Set blocking or non-blocking behavior. self.blocking = self.options.blocking is True or self.memory_space == "cpu" @@ -1968,6 +1964,7 @@ def _check_and_set_operand( updating, and update it. """ assert (operand_index is None) ^ (epilog_name is None), "Internal Error." + assert self.operands is not None, "Internal Error." # Make sure that the data type and extents match. utils.check_attribute_match(dtype, operand.dtype, "data type") @@ -1976,12 +1973,7 @@ def _check_and_set_operand( package = utils.infer_object_package(operand.tensor) # Conjugate flag of the provided operands must match the original qualifiers - if ( - operand_index is not None - and package == "torch" - and self.lazy_conjugation - and self.qualifiers[operand_index]["is_conjugate"] != operand.tensor.is_conj() - ): + if operand_index is not None and self.lazy_conjugation[operand_index] != operand.is_conjugate: raise ValueError(f"The provided operand {operand_name} has different conjugate flag than the original operand") device_id = operand.device_id @@ -2764,7 +2756,7 @@ def matmul( Narrow-precision support: {narrow_precision} - See Also: + .. seealso:: :class:`Matmul`, :class:`MatmulOptions`, :class:`MatmulEpilog`, :class:`MatmulPlanPreferences` @@ -2781,7 +2773,7 @@ def matmul( >>> c = cp.random.rand(M, N, dtype=cp.float32) Perform the operation :math:`\\alpha A @ B + \\beta C` using :func:`matmul`. The - result `r` is also a CuPy float64 ndarray: + result `r` is also a CuPy float32 ndarray: >>> r = nvmath.linalg.advanced.matmul(a, b, c, alpha=1.23, beta=0.74) diff --git a/nvmath/linalg/generic/__init__.py b/nvmath/linalg/generic/__init__.py new file mode 100644 index 0000000..28a0e86 --- /dev/null +++ b/nvmath/linalg/generic/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from ._configuration import ( + DiagonalMatrixQualifier, + GeneralMatrixQualifier, + HermitianMatrixQualifier, + MatmulOptions, + MatrixQualifier, + matrix_qualifiers_dtype, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, +) +from .matmulmod import ( + ExecutionCPU, + ExecutionCUDA, + InvalidMatmulState, + matmul, + Matmul, + SideMode, + FillMode, + DiagType, +) + +__all__ = ( + "DiagonalMatrixQualifier", + "ExecutionCPU", + "ExecutionCUDA", + "GeneralMatrixQualifier", + "HermitianMatrixQualifier", + "InvalidMatmulState", + "matmul", + "Matmul", + "MatmulOptions", + "MatrixQualifier", + "matrix_qualifiers_dtype", + "SymmetricMatrixQualifier", + "TriangularMatrixQualifier", + "SideMode", + "FillMode", + "DiagType", +) diff --git a/nvmath/linalg/generic/_configuration/__init__.py b/nvmath/linalg/generic/_configuration/__init__.py new file mode 100644 index 0000000..3740b78 --- /dev/null +++ b/nvmath/linalg/generic/_configuration/__init__.py @@ -0,0 +1,27 @@ +from .qualifiers import ( + DiagonalMatrixQualifier, + GeneralMatrixQualifier, + HermitianMatrixQualifier, + MatmulOptions, + MatrixQualifier, + matrix_qualifiers_dtype, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, + vector_to_square, +) +from .match import ( + select_blas_mm_function, +) + +__all__ = [ + "DiagonalMatrixQualifier", + "GeneralMatrixQualifier", + "HermitianMatrixQualifier", + "MatmulOptions", + "MatrixQualifier", + "matrix_qualifiers_dtype", + "SymmetricMatrixQualifier", + "TriangularMatrixQualifier", + "vector_to_square", + "select_blas_mm_function", +] diff --git a/nvmath/linalg/generic/_configuration/match.py b/nvmath/linalg/generic/_configuration/match.py new file mode 100644 index 0000000..e2cf181 --- /dev/null +++ b/nvmath/linalg/generic/_configuration/match.py @@ -0,0 +1,605 @@ +""" +Matrix multiplication function selection and matching for BLAS operations. + +This module provides functionality to select and configure the appropriate BLAS matrix +multiplication functions based on matrix qualifiers, batch traits, and execution context. +It supports both CPU (NVPL) and GPU (cuBLAS) backends and handles various matrix types +including general, symmetric, hermitian, triangular, and diagonal matrices. 
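+
+The selected routine is returned as a closure (``WrappedMMFunction``) that takes the wrapped
+operand tensors, alpha, beta, and an optional stream holder; all other BLAS parameters are
+bound at selection time.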
+""" + +import logging +import typing + +import numpy as np + +from nvmath._internal.templates import ExecutionCPU, ExecutionCUDA +from nvmath.internal import tensor_wrapper, typemaps, utils +from nvmath.linalg._internal.batch import BatchTraits +from nvmath.linalg._internal.layout import BLASMMTraits +import nvmath.bindings.cublas as cublas + +from .qualifiers import ( + DiagonalMatrixQualifier, + GeneralMatrixQualifier, + HermitianMatrixQualifier, + MatrixQualifier, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, +) +from .wrap import ( + cublas_enum_mapper, + cublas_mm_function, + get_address_zeroth_element, + get_value_zeroth_element, + nvpl_enum_mapper, + nvpl_mm_function, +) + +"""A matmul function returned by this module.""" +WrappedMMFunction: typing.TypeAlias = typing.Callable[ + [ + tensor_wrapper.TensorHolder, + tensor_wrapper.TensorHolder, + tensor_wrapper.TensorHolder, + np.ndarray, + np.ndarray, + utils.StreamHolder | None, + ], + None, +] + +"""A function which returns a BLAS implementation's matching matmul function.""" +MMFunctionGetter: typing.TypeAlias = typing.Callable[ + [ + ExecutionCUDA | ExecutionCPU, + typemaps.cudaDataType, + str, + logging.Logger, + typing.Literal["", "stride", "group"], + ], + typing.Callable, +] + + +def select_blas_mm_function( + batch_traits: tuple[BatchTraits, BatchTraits, BatchTraits], + mm_traits: BLASMMTraits, + qualifiers: MatrixQualifier, + logger: logging.Logger, + execution: ExecutionCUDA | ExecutionCPU, +) -> WrappedMMFunction: + """Return a matrix multiplication function which matches the provided arguments.""" + + # At this level, we only select the appropriate library + match execution: + case ExecutionCPU(): + mm_function_getter = nvpl_mm_function + mm_enum_mapper = nvpl_enum_mapper + # NOTE: the BLAS APIs take values for real alpha/beta, pointers for complex + if mm_traits.a_layout_traits.dtype.name.startswith("CUDA_C"): + mm_alpha_beta_picker = get_address_zeroth_element + else: + mm_alpha_beta_picker = get_value_zeroth_element + case ExecutionCUDA(): + mm_function_getter = cublas_mm_function + mm_enum_mapper = cublas_enum_mapper + mm_alpha_beta_picker = get_address_zeroth_element + case _: + raise ValueError("Only ExectionCUDA and ExecutionCPU are supported.") + + return _select_blas_mm_function_from_qualifiers( + batch_traits, + mm_traits, + qualifiers, + logger, + execution, + mm_function_getter, + mm_enum_mapper, + mm_alpha_beta_picker, + ) + + +def _select_blas_mm_function_from_qualifiers( + batch_traits: tuple[BatchTraits, BatchTraits, BatchTraits], + mm_traits: BLASMMTraits, + qualifiers: MatrixQualifier, + logger: logging.Logger, + execution: ExecutionCUDA | ExecutionCPU, + mm_function_getter: MMFunctionGetter, + mm_enum_mapper: typing.Callable, + mm_alpha_beta_picker: typing.Callable, +) -> WrappedMMFunction: + """Match and wrap a matrix multiplication function based on the provided arguments.""" + # NOTE: The parameters of this function are only the operands that will be resettable by + # reset_operands(); the rest of the parameters should be unchanged by reset_operands. + # Therefore we can amortize the cost of those bits. 
+ batchCount = batch_traits[2].count + if batchCount >= 0 and GeneralMatrixQualifier.is_valid(qualifiers): + operationA = mm_traits.a_layout_traits.operation + operationB = mm_traits.b_layout_traits.operation + m, n, k = mm_traits.M, mm_traits.N, mm_traits.K + lda, ldb, ldc = mm_traits.a_layout_traits.ld, mm_traits.b_layout_traits.ld, mm_traits.c_layout_traits.ld + assert mm_traits.c_layout_traits.operation == cublas.Operation.N + strideA, strideB, strideC = (t.stride for t in batch_traits) + + if mm_traits.is_swapped_AB: + strideA, strideB = strideB, strideA + + operationA = mm_enum_mapper(operationA) + operationB = mm_enum_mapper(operationB) + + func = mm_function_getter( + execution, + mm_traits.a_layout_traits.dtype, + GeneralMatrixQualifier.abbreviation, + logger, + "stride" if batchCount > 1 else "", + ) + + def wrapped( + A: tensor_wrapper.TensorHolder, + B: tensor_wrapper.TensorHolder, + C: tensor_wrapper.TensorHolder, + alpha: np.ndarray, + beta: np.ndarray, + stream_holder: utils.StreamHolder | None, + ) -> None: + if mm_traits.is_swapped_AB: + A, B = B, A + logger.debug( + "Calling %s(operationA=%s, operationB=%s, m=%d, n=%d, k=%d, alpha=%s, lda=%d, strideA=%d, " + "ldb=%d, strideB=%d, beta=%s, ldc=%d, strideC=%d, batchCount=%d)", + func.__name__, + operationA, + operationB, + m, + n, + k, + alpha[0], + lda, + strideA, + ldb, + strideB, + beta[0], + ldc, + strideC, + batchCount, + ) + if batchCount > 1: + func( + operationA, + operationB, + m, + n, + k, + mm_alpha_beta_picker(alpha), + A.data_ptr, + lda, + strideA, + B.data_ptr, + ldb, + strideB, + mm_alpha_beta_picker(beta), + C.data_ptr, + ldc, + strideC, + batchCount, + stream_holder=stream_holder, + ) + else: + func( + operationA, + operationB, + m, + n, + k, + mm_alpha_beta_picker(alpha), + A.data_ptr, + lda, + B.data_ptr, + ldb, + mm_alpha_beta_picker(beta), + C.data_ptr, + ldc, + stream_holder=stream_holder, + ) + elif ( + batchCount >= 0 + and GeneralMatrixQualifier.is_valid(qualifiers[2]) + and ( + ( + (HermitianMatrixQualifier.is_valid(qualifiers[0]) or SymmetricMatrixQualifier.is_valid(qualifiers[0])) + and GeneralMatrixQualifier.is_valid(qualifiers[1]) + ) + or ( + (HermitianMatrixQualifier.is_valid(qualifiers[1]) or SymmetricMatrixQualifier.is_valid(qualifiers[1])) + and GeneralMatrixQualifier.is_valid(qualifiers[0]) + ) + ) + ): + if HermitianMatrixQualifier.is_valid(qualifiers[0]) or SymmetricMatrixQualifier.is_valid(qualifiers[0]): + qualifierS = qualifiers[0] + # qualifierG = qualifiers[1] + # qualifierC = qualifiers[2] + traitsS = mm_traits.a_layout_traits + traitsG = mm_traits.b_layout_traits + traitsC = mm_traits.c_layout_traits + strideS = batch_traits[0].stride + strideG = batch_traits[1].stride + strideC = batch_traits[2].stride + is_left_side_symmetric = True + # Parameter A must always be the symmetric matrix, so if the user provided a + # SymmetricMatrixQualifier as qualifiers[1], we must move the corresponding + # matrix to the A input in the BLAS API. 
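+            # qualifiers[0] is already the symmetric/hermitian operand, so the inputs are
+            # not swapped.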
+ swapABinputs = False + elif HermitianMatrixQualifier.is_valid(qualifiers[1]) or SymmetricMatrixQualifier.is_valid(qualifiers[1]): + # qualifierG = qualifiers[0] + qualifierS = qualifiers[1] + # qualifierC = qualifiers[2] + traitsS = mm_traits.b_layout_traits + traitsG = mm_traits.a_layout_traits + traitsC = mm_traits.c_layout_traits + strideS = batch_traits[1].stride + strideG = batch_traits[0].stride + strideC = batch_traits[2].stride + is_left_side_symmetric = False + swapABinputs = True + else: + raise ValueError("Internal Error: At least one must be a HermitianMatrixQualifier | SymmetricMatrixQualifier!") + + if mm_traits.is_swapped_AB: + traitsS, traitsG = traitsG, traitsS + is_left_side_symmetric = not is_left_side_symmetric + + func = mm_function_getter( + execution, + traitsS.dtype, + qualifierS["abbreviation"], + logger, + "stride" if batchCount > 1 else "", + ) + + if traitsG.operation != cublas.Operation.N or traitsC.operation != cublas.Operation.N: + raise ValueError( + f"Operations on the non-hermitian/non-symmetric operands B,C are not supported for {func.__name__}" + ) + + if traitsS.operation == cublas.Operation.C and SymmetricMatrixQualifier.is_valid(qualifierS): + raise ValueError(f"Conjugate-Transpose on operand A is not supported for {func.__name__}") + + if traitsS.operation == cublas.Operation.T and HermitianMatrixQualifier.is_valid(qualifierS): + raise ValueError(f"Transpose on operand A is not supported for {func.__name__}") + + side = cublas.SideMode.LEFT if is_left_side_symmetric else cublas.SideMode.RIGHT + uplo = cublas.FillMode.LOWER if traitsS.is_lower else cublas.FillMode.UPPER + m, n = traitsG.shape + lda, ldb, ldc = traitsS.ld, traitsG.ld, traitsC.ld + + side = mm_enum_mapper(side) + uplo = mm_enum_mapper(uplo) + + def wrapped( + A: tensor_wrapper.TensorHolder, + B: tensor_wrapper.TensorHolder, + C: tensor_wrapper.TensorHolder, + alpha: np.ndarray, + beta: np.ndarray, + stream_holder: utils.StreamHolder | None, + ) -> None: + if swapABinputs: + A, B = B, A + logger.debug( + "Calling %s(side=%s, uplo=%s, m=%d, n=%d, alpha=%s, lda=%d, strideA=%d, " + "ldb=%d, strideB=%d, beta=%s, ldc=%d, strideC=%d, batchCount=%d)", + func.__name__, + side, + uplo, + m, + n, + alpha[0], + lda, + strideS, + ldb, + strideG, + beta[0], + ldc, + strideC, + batchCount, + ) + if batchCount > 1: + func( + side, + uplo, + m, + n, + mm_alpha_beta_picker(alpha), + A.data_ptr, + lda, + strideS, + B.data_ptr, + ldb, + strideG, + mm_alpha_beta_picker(beta), + C.data_ptr, + ldc, + strideC, + batchCount, + stream_holder=stream_holder, + ) + else: + func( + side, + uplo, + m, + n, + mm_alpha_beta_picker(alpha), + A.data_ptr, + lda, + B.data_ptr, + ldb, + mm_alpha_beta_picker(beta), + C.data_ptr, + ldc, + stream_holder=stream_holder, + ) + elif ( + batchCount >= 0 + and GeneralMatrixQualifier.is_valid(qualifiers[2]) + and ( + (TriangularMatrixQualifier.is_valid(qualifiers[0]) and GeneralMatrixQualifier.is_valid(qualifiers[1])) + or (TriangularMatrixQualifier.is_valid(qualifiers[1]) and GeneralMatrixQualifier.is_valid(qualifiers[0])) + ) + ): + func = mm_function_getter( + execution, + mm_traits.a_layout_traits.dtype, + TriangularMatrixQualifier.abbreviation, + logger, + "stride" if batchCount > 1 else "", + ) + if TriangularMatrixQualifier.is_valid(qualifiers[0]): + qualifierT = qualifiers[0] + # qualifierG = qualifiers[1] + # qualifierC = qualifiers[2] + traitsT = mm_traits.a_layout_traits + traitsG = mm_traits.b_layout_traits + traitsC = mm_traits.c_layout_traits + strideT = 
batch_traits[0].stride + strideG = batch_traits[1].stride + strideC = batch_traits[2].stride + is_left_side_triangle = True + # Parameter A must always be the triangular matrix, so if the user provided a + # TriangularMatrixQualifier as qualifiers[1], we must move the corresponding + # matrix to the A input in the BLAS API. + swapABinputs = False + elif TriangularMatrixQualifier.is_valid(qualifiers[1]): + # qualifierG = qualifiers[0] + qualifierT = qualifiers[1] + # qualifierC = qualifiers[2] + traitsT = mm_traits.b_layout_traits + traitsG = mm_traits.a_layout_traits + traitsC = mm_traits.c_layout_traits + strideT = batch_traits[1].stride + strideG = batch_traits[0].stride + strideC = batch_traits[2].stride + is_left_side_triangle = False + swapABinputs = True + else: + raise ValueError("Internal Error: At least one must be a TriangularMatrixQualifier!") + + if mm_traits.is_swapped_AB: + traitsT, traitsG = traitsG, traitsT + is_left_side_triangle = not is_left_side_triangle + + if traitsG.operation != cublas.Operation.N or traitsC.operation != cublas.Operation.N: + raise ValueError(f"Operations on the non-triangular operands B,C are not supported for {func.__name__}") + + side = cublas.SideMode.LEFT if is_left_side_triangle else cublas.SideMode.RIGHT + uplo = cublas.FillMode.LOWER if traitsT.is_lower else cublas.FillMode.UPPER + operation = traitsT.operation + diag = qualifierT["diag"] + m, n = traitsG.shape + lda, ldb, ldc = traitsT.ld, traitsG.ld, traitsC.ld + + if ( + mm_traits.a_layout_traits.dtype == typemaps.cudaDataType.CUDA_C_64F + and n >= 256 + and m == 1 + and side == cublas.SideMode.RIGHT + ): + raise ValueError("This configuration is unsupported for CTK <13.") + + side = mm_enum_mapper(side) + uplo = mm_enum_mapper(uplo) + operation = mm_enum_mapper(operation) + diag = mm_enum_mapper(diag) + + def wrapped( + A: tensor_wrapper.TensorHolder, + B: tensor_wrapper.TensorHolder, + C: tensor_wrapper.TensorHolder, + alpha: np.ndarray, + beta: np.ndarray, + stream_holder: utils.StreamHolder | None, + ) -> None: + if swapABinputs: + A, B = B, A + logger.debug( + "Calling %s(side=%s, uplo=%s, operation=%s, diag=%s, m=%d, n=%d, alpha=%s, lda=%d, strideA=%d, " + "ldb=%d, strideB=%d, ldc=%d, strideC=%d, batchCount=%d)", + func.__name__, + side, + uplo, + operation, + diag, + m, + n, + alpha[0], + lda, + strideT, + ldb, + strideG, + ldc, + strideC, + batchCount, + ) + if batchCount > 1: + func( + side, + uplo, + operation, + diag, + m, + n, + mm_alpha_beta_picker(alpha), + A.data_ptr, + lda, + strideT, + B.data_ptr, + ldb, + strideG, + C.data_ptr, + ldc, + strideC, + batchCount, + stream_holder=stream_holder, + ) + else: + func( + side, + uplo, + operation, + diag, + m, + n, + mm_alpha_beta_picker(alpha), + A.data_ptr, + lda, + B.data_ptr, + ldb, + C.data_ptr, + ldc, + stream_holder=stream_holder, + ) + elif ( + batchCount >= 0 + and GeneralMatrixQualifier.is_valid(qualifiers[2]) + and ( + (DiagonalMatrixQualifier.is_valid(qualifiers[0]) and GeneralMatrixQualifier.is_valid(qualifiers[1])) + or (DiagonalMatrixQualifier.is_valid(qualifiers[1]) and GeneralMatrixQualifier.is_valid(qualifiers[0])) + ) + ): + func = mm_function_getter( + execution, + mm_traits.a_layout_traits.dtype, + DiagonalMatrixQualifier.abbreviation, + logger, + "stride" if batchCount > 1 else "", + ) + if DiagonalMatrixQualifier.is_valid(qualifiers[0]): + qualifierX = qualifiers[0] + # qualifierG = qualifiers[1] + # qualifierC = qualifiers[2] + traitsX = mm_traits.a_layout_traits + traitsG = mm_traits.b_layout_traits + 
traitsC = mm_traits.c_layout_traits + strideX = batch_traits[0].stride + strideG = batch_traits[1].stride + strideC = batch_traits[2].stride + is_left_side_diagonal = True + # Parameter X must always be the diagonal matrix, so if the user provided a + # DiagonalMatrixQualifier as qualifiers[0], we must move the corresponding + # matrix to the X input in the BLAS API. + swapABinputs = True + elif DiagonalMatrixQualifier.is_valid(qualifiers[1]): + # qualifierG = qualifiers[0] + qualifierX = qualifiers[1] + # qualifierC = qualifiers[2] + traitsX = mm_traits.b_layout_traits + traitsG = mm_traits.a_layout_traits + traitsC = mm_traits.c_layout_traits + strideX = batch_traits[1].stride + strideG = batch_traits[0].stride + strideC = batch_traits[2].stride + is_left_side_diagonal = False + swapABinputs = False + else: + raise ValueError("Internal Error: At least one must be a DiagonalMatrixQualifier!") + + if mm_traits.is_swapped_AB: + traitsX, traitsG = traitsG, traitsX + is_left_side_diagonal = not is_left_side_diagonal + + if traitsG.operation != cublas.Operation.N or traitsC.operation != cublas.Operation.N: + raise ValueError(f"Operations on the non-diagonal operands A,C are not supported for {func.__name__}") + + # Operation.T is allowed for diagonal matrix because the transpose is a no-op + if traitsX.operation == cublas.Operation.C: + raise ValueError(f"Conjugate-Transpose on operand X is not supported for {func.__name__}") + + side = cublas.SideMode.LEFT if is_left_side_diagonal else cublas.SideMode.RIGHT + m, n = traitsG.shape + lda, ldc = traitsG.ld, traitsC.ld + incx = qualifierX["incx"] * max(traitsX.strides) + + side = mm_enum_mapper(side) + + def wrapped( + A: tensor_wrapper.TensorHolder, + B: tensor_wrapper.TensorHolder, + C: tensor_wrapper.TensorHolder, + alpha: np.ndarray, + beta: np.ndarray, + stream_holder: utils.StreamHolder | None, + ) -> None: + if swapABinputs: + A, B = B, A + logger.debug( + "Calling %s(side=%s, m=%d, n=%d, lda=%d, strideA=%d, incx=%d, strideX=%d, ldc=%d, strideC=%d, batchCount=%d)", + func.__name__, + side, + m, + n, + lda, + strideG, + incx, + strideX, + ldc, + strideC, + batchCount, + ) + if batchCount > 1: + func( + side, + m, + n, + A.data_ptr, + lda, + strideG, + B.data_ptr, + incx, + strideX, + C.data_ptr, + ldc, + strideC, + batchCount, + stream_holder=stream_holder, + ) + else: + func( + side, + m, + n, + A.data_ptr, + lda, + B.data_ptr, + incx, + C.data_ptr, + ldc, + stream_holder=stream_holder, + ) + else: + msg = f"No available generic matrix multiplication matches the provided matrices: {qualifiers}." + raise ValueError(msg) + + return wrapped diff --git a/nvmath/linalg/generic/_configuration/qualifiers.py b/nvmath/linalg/generic/_configuration/qualifiers.py new file mode 100644 index 0000000..5a30ca9 --- /dev/null +++ b/nvmath/linalg/generic/_configuration/qualifiers.py @@ -0,0 +1,512 @@ +""" +Matrix qualifier dataclasses for describing structured matrix types and their properties in +linear algebra operations. Provides qualifiers for general, symmetric, hermitian, +triangular, and diagonal matrices with associated metadata like fill modes, transpose flags, +and BLAS function abbreviations. 
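+
+Qualifiers are NumPy structured scalars of ``matrix_qualifiers_dtype``; construct them with
+the ``create()`` classmethod of the matching qualifier class, for example
+``TriangularMatrixQualifier.create(uplo=FillMode.LOWER, diag=DiagType.NON_UNIT)``, rather
+than by filling in the fields by hand.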
+""" + +import abc +import dataclasses +import typing + +import numpy as np +import numpy.typing as npt + +import nvmath.bindings.cublas as cublas + +from nvmath._internal.templates import StatefulAPIOptions +from nvmath.internal import utils + + +FillMode: typing.TypeAlias = cublas.FillMode +DiagType: typing.TypeAlias = cublas.DiagType + + +@dataclasses.dataclass(frozen=True, slots=True, kw_only=True) +class MatmulOptions(StatefulAPIOptions): + """A dataclass for providing options to a :class:`Matmul` object. + + Attributes: + allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used + to draw device memory. If an allocator is not provided, a memory allocator from + the library package will be used (:func:`torch.cuda.caching_allocator_alloc` for + PyTorch operands, :func:`cupy.cuda.alloc` otherwise). + + blocking: A flag specifying the behavior of the stream-ordered functions and + methods. When ``blocking`` is `True`, the stream-ordered methods do not return + until the operation is complete. When ``blocking`` is ``"auto"``, the methods + return immediately when the inputs are on the GPU. The stream-ordered methods + always block when the operands are on the CPU to ensure that the user doesn't + inadvertently use the result before it becomes available. The default is + ``"auto"``. + + inplace: Whether the matrix multiplication is performed in-place (operand C is + overwritten). + + logger: Python Logger object. The root logger will be used if a + logger object is not provided. + + .. seealso:: + :class:`StatefulAPI` + """ + + inplace: bool = False + + +MM_QUALIFIERS_DOCUMENTATION = { + # + "abbreviation": """\ +The two character abbreviation of the matrix qualifier.""".replace("\n", " "), + # + "conjugate": """\ +Whether the matrix is conjugate.""".replace("\n", " "), + # + "transpose": """\ +Whether the matrix is transpose.""".replace("\n", " "), + # + "uplo": """\ +The :py:class:`~nvmath.bindings.cublas.FillMode` of the matrix. e.g. upper, lower...""".replace("\n", " "), + # + "diag": """\ +The :py:class:`~nvmath.bindings.cublas.DiagType` of the matrix. e.g. unit, non-unit...""".replace("\n", " "), + # + "incx": """\ +The direction to read the diagonal. +1 for forward; -1 for reverse.""".replace("\n", " "), + # +} + + +# We name this variable as foo_bar_dtype, so that the docstrings format correctly +# Docstrings for custom dtypes are defined in docs/sphinx/conf.py +matrix_qualifiers_dtype = np.dtype( + [ + ("abbreviation", "U2"), + ("conjugate", np.bool_), + ("transpose", np.bool_), + ("uplo", np.int_), + ("diag", np.int_), + ("incx", np.int_), + ] +) + +"""A NumPy array of type :class:`matrix_qualifiers_dtype`. + + .. seealso:: + :class:`GeneralMatrixQualifier`, :class:`HermitianMatrixQualifier`, + :class:`SymmetricMatrixQualifier`, :class:`TriangularMatrixQualifier` + :class:`DiagonalMatrixQualifier`, :class:`matrix_qualifiers_dtype` +""" +MatrixQualifier: typing.TypeAlias = npt.NDArray + + +class MatrixQualifierConstructor(abc.ABC): + abbreviation: typing.ClassVar[str] + + def __init__(self): + msg = f"The {self.__class__.__name__} constructor should not be called. Use {self.__class__.__name__}.create() instead." 
+ raise RuntimeError(msg) + + @classmethod + def create( + cls, + conjugate: bool = False, + transpose: bool = False, + uplo: FillMode = FillMode.FULL, + diag: DiagType = DiagType.NON_UNIT, + incx: typing.Literal[-1, 1, 0] = 0, + ): + return np.array((cls.abbreviation, conjugate, transpose, uplo, diag, incx), dtype=matrix_qualifiers_dtype) + + @classmethod + def is_valid(cls, other: MatrixQualifier) -> np.bool_: + """Return ``True`` if all elements of `other` are valid examples of the + :class:`matrix_qualifiers_dtype` constructed by this class.""" + return np.all(other["abbreviation"] == cls.abbreviation) and np.bool_( + all( + n in other.dtype.names # type: ignore[operator] + for n in ("conjugate", "transpose") + ) + ) + + @classmethod + def to_string(cls, other: MatrixQualifier) -> str: + """Return a pretty string representation of `other`.""" + return ( + f"({other['abbreviation']}, conjugate={other['conjugate']}, " + f"transpose={other['transpose']}, uplo={FillMode(other['uplo']).name}, " + f"diag={DiagType(other['diag']).name}, incx={other['incx']})" + ) + + +@utils.docstring_decorator(MM_QUALIFIERS_DOCUMENTATION, skip_missing=False) +class GeneralMatrixQualifier(MatrixQualifierConstructor): + """A class which constructs and validates :class:`matrix_qualifiers_dtype` for a general + rectangular matrix. + + Examples: + + >>> import numpy as np + >>> from nvmath.linalg import GeneralMatrixQualifier, matrix_qualifiers_dtype + + Create a general matrix qualifier: + + >>> GeneralMatrixQualifier.create() # doctest: +ELLIPSIS + array(('ge', False, False, 2, 0, 0), + dtype=[('abbreviation', '>> GeneralMatrixQualifier.create(conjugate=True) # doctest: +ELLIPSIS + array(('ge', True, False, 2, 0, 0), + dtype=[('abbreviation', '>> np.full( + ... 2, GeneralMatrixQualifier.create(), dtype=matrix_qualifiers_dtype + ... ) # doctest: +ELLIPSIS + array([('ge', False, False, 2, 0, 0), ('ge', False, False, 2, 0, 0)], + dtype=[('abbreviation', '>> import numpy as np + >>> from nvmath.linalg import ( + ... DiagonalMatrixQualifier, + ... GeneralMatrixQualifier, + ... matrix_qualifiers_dtype, + ... ) + + Create a diagonal matrix qualifier: + + >>> DiagonalMatrixQualifier.create() # doctest: +ELLIPSIS + array(('dg', False, False, 2, 0, 1), + dtype=[('abbreviation', '>> DiagonalMatrixQualifier.create(conjugate=True) # doctest: +ELLIPSIS + array(('dg', True, False, 2, 0, 1), + dtype=[('abbreviation', '>> qualifiers = np.full( + ... 2, + ... GeneralMatrixQualifier.create(), + ... dtype=matrix_qualifiers_dtype, + ... ) + >>> qualifiers[1] = DiagonalMatrixQualifier.create() + >>> qualifiers # doctest: +ELLIPSIS + array([('ge', False, False, 2, 0, 0), ('dg', False, False, 2, 0, 1)], + dtype=[('abbreviation', '>> import numpy as np + >>> from nvmath.linalg import ( + ... HermitianMatrixQualifier, + ... GeneralMatrixQualifier, + ... matrix_qualifiers_dtype, + ... ) + + Create a hermitian matrix qualifier: + + >>> HermitianMatrixQualifier.create() # doctest: +ELLIPSIS + array(('he', False, False, 0, 0, 0), + dtype=[('abbreviation', '>> HermitianMatrixQualifier.create(conjugate=True) # doctest: +ELLIPSIS + array(('he', True, False, 0, 0, 0), + dtype=[('abbreviation', '>> qualifiers = np.full( + ... 2, + ... GeneralMatrixQualifier.create(), + ... dtype=matrix_qualifiers_dtype, + ... 
) + >>> qualifiers[1] = HermitianMatrixQualifier.create() + >>> qualifiers # doctest: +ELLIPSIS + array([('ge', False, False, 2, 0, 0), ('he', False, False, 0, 0, 0)], + dtype=[('abbreviation', '>> import numpy as np + >>> from nvmath.linalg import ( + ... SymmetricMatrixQualifier, + ... GeneralMatrixQualifier, + ... matrix_qualifiers_dtype, + ... ) + + Create a symmetric matrix qualifier: + + >>> SymmetricMatrixQualifier.create() # doctest: +ELLIPSIS + array(('sy', False, False, 0, 0, 0), + dtype=[('abbreviation', '>> SymmetricMatrixQualifier.create(conjugate=True) # doctest: +ELLIPSIS + array(('sy', True, False, 0, 0, 0), + dtype=[('abbreviation', '>> qualifiers = np.full( + ... 2, + ... GeneralMatrixQualifier.create(), + ... dtype=matrix_qualifiers_dtype, + ... ) + >>> qualifiers[1] = SymmetricMatrixQualifier.create() + >>> qualifiers # doctest: +ELLIPSIS + array([('ge', False, False, 2, 0, 0), ('sy', False, False, 0, 0, 0)], + dtype=[('abbreviation', '>> import numpy as np + >>> from nvmath.linalg import ( + ... TriangularMatrixQualifier, + ... GeneralMatrixQualifier, + ... matrix_qualifiers_dtype, + ... ) + + Create a triangular matrix qualifier: + + >>> TriangularMatrixQualifier.create() # doctest: +ELLIPSIS + array(('tr', False, False, 0, 0, 0), + dtype=[('abbreviation', '>> TriangularMatrixQualifier.create(conjugate=True) # doctest: +ELLIPSIS + array(('tr', True, False, 0, 0, 0), + dtype=[('abbreviation', '>> qualifiers = np.full( + ... 2, + ... GeneralMatrixQualifier.create(), + ... dtype=matrix_qualifiers_dtype, + ... ) + >>> qualifiers[1] = TriangularMatrixQualifier.create() + >>> qualifiers # doctest: +ELLIPSIS + array([('ge', False, False, 2, 0, 0), ('tr', False, False, 0, 0, 0)], + dtype=[('abbreviation', ' tuple[typing.Sequence[int], typing.Sequence[int]]: + """If `qualifier` is a DiagonalMatrixQualifier, convert `shape` and `stride` from a + vector to the equivalent square matrix.""" + if DiagonalMatrixQualifier.is_valid(qualifier): + if len(shape) != 1: + msg = f"The shape of a diagonal matrix must be 1D; not {shape}." + raise ValueError(msg) + shape = tuple(shape) * 2 + if len(strides) != 1: + msg = f"The strides of a diagonal matrix must be 1D; not {strides}." + raise ValueError(msg) + strides = (0, *strides) + return shape, strides diff --git a/nvmath/linalg/generic/_configuration/wrap.py b/nvmath/linalg/generic/_configuration/wrap.py new file mode 100644 index 0000000..e665fe2 --- /dev/null +++ b/nvmath/linalg/generic/_configuration/wrap.py @@ -0,0 +1,213 @@ +""" +BLAS function wrapper utilities for dynamically loading and wrapping Level-3 matrix +multiplication functions from cuBLAS and NVPL BLAS backends. Handles data type to BLAS +abbreviation mapping, function name generation, and provides unified access with proper +handle and stream management. 
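+
+Function names follow the netlib pattern ``<dtype><descr>mm`` (for example ``dsymm``);
+cuBLAS names additionally end in ``_64`` and gain ``_strided_batched`` for batched calls,
+while NVPL names gain ``_batch_strided``.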
+""" + +import logging +import typing + +from nvmath._internal.templates import ExecutionCPU, ExecutionCUDA +from nvmath.bindings._internal.utils import FunctionNotFoundError +from nvmath.internal import typemaps, utils +from nvmath.linalg._internal.utils import get_handle +import nvmath.bindings.cublas as cublas +import nvmath.bindings.nvpl.blas as blas + + +def get_value_zeroth_element(array): + """Returns the value of zeroth element.""" + return array[0] + + +def get_address_zeroth_element(array): + """Returns the memory address of the zeroth element.""" + return array.ctypes.data + + +def _blas_dtype_abbreviation(dtype: typemaps.cudaDataType) -> str: + """Return the BLAS Level-3 API abbreviation of a dtype.""" + match dtype: + case typemaps.cudaDataType.CUDA_R_32F: + return "s" + case typemaps.cudaDataType.CUDA_R_64F: + return "d" + case typemaps.cudaDataType.CUDA_C_32F: + return "c" + case typemaps.cudaDataType.CUDA_C_64F: + return "z" + case typemaps.cudaDataType.CUDA_R_16F: + return "h" + case _: + msg = f"'{dtype.name}' has no known BLAS abbreviation." + raise ValueError(msg) + + +def _netlib_mm_function_name(dtype: typemaps.cudaDataType, matrix_descr_abbreviation: str) -> str: + """Return a netlib API Level-3 function name based on the parameters.""" + return f"{_blas_dtype_abbreviation(dtype)}{matrix_descr_abbreviation}mm" + + +def _cublas_mm_function_name( + dtype: typemaps.cudaDataType, + matrix_descr_abbreviation: str, + batch_type: typing.Literal["", "stride", "group"] = "", +) -> str: + """Return a cuBLAS API Level-3 function name based on the parameters.""" + match batch_type: + case "stride": + suffix = "_strided_batched" + case "group": + suffix = "_grouped_batched" + case "": + suffix = "" + case _: + raise ValueError("batch_type is invalid.") + return _netlib_mm_function_name(dtype, matrix_descr_abbreviation) + suffix + "_64" + + +def cublas_mm_function( + execution: ExecutionCUDA | ExecutionCPU, + dtype: typemaps.cudaDataType, + matrix_descr_abbreviation: str, + logger: logging.Logger, + batch_type: typing.Literal["", "stride", "group"] = "", +) -> typing.Callable: + """Return a cublas API Level-3 function from nvmath.bindings.cublas.""" + # We get the cublas handle and set the stream here because other BLAS implementations do + # not have these constructs + assert isinstance(execution, ExecutionCUDA) + handle = get_handle(device_id=execution.device_id, binding="cublas") + function_name = _cublas_mm_function_name(dtype, matrix_descr_abbreviation, batch_type) + try: + function = getattr(cublas, function_name) + try: + function() + except TypeError: + pass + + def wrapped_with_handle_and_stream(*args, stream_holder: utils.StreamHolder): + cublas.set_stream(handle, stream_holder.ptr) + function(handle, *args) + + wrapped_with_handle_and_stream.__name__ = function.__name__ + + logger.info("Loaded a cuBLAS API function named %s", function.__name__) + return wrapped_with_handle_and_stream + except (AttributeError, FunctionNotFoundError) as e: + # The user may try to call a newer function with older cuBLAS. + cublas_version = cublas.get_version(handle) + msg = ( + f"{function_name}() is an unknown cuBLAS function " + f"for cuBLAS version {cublas_version}. Please check the cuBLAS Level-3 Function Reference " + f"to see whether this function should exists for cuBLAS version {cublas_version}." 
+ ) + raise NotImplementedError(msg) from e + + +def cublas_enum_mapper(enum): + """Maps cuBLAS enums to cuBLAS enum.""" + return enum + + +def _nvpl_mm_function_name( + dtype: typemaps.cudaDataType, + matrix_descr_abbreviation: str, + batch_type: typing.Literal["", "stride", "group"] = "", +) -> str: + """Return an NVPL API Level-3 function name based on the parameters.""" + match batch_type: + case "stride": + suffix = "_batch_strided" + case "group": + suffix = "_batch_grouped" + case "": + suffix = "" + case _: + raise ValueError("batch_type is invalid.") + return _netlib_mm_function_name(dtype, matrix_descr_abbreviation) + suffix + + +def nvpl_mm_function( + execution: ExecutionCUDA | ExecutionCPU, + dtype: typemaps.cudaDataType, + matrix_descr_abbreviation: str, + logger: logging.Logger, + batch_type: typing.Literal["", "stride", "group"] = "", +) -> typing.Callable: + """Return an NVPL API Level-3 function from nvmath.bindings.nvpl.blas.""" + if matrix_descr_abbreviation == "tr": + # FIXME: Reconcile API differences + raise NotImplementedError("trmm on CPU is unsupported at this time because the cuBLAS API differs.") + assert isinstance(execution, ExecutionCPU) + function_name = _nvpl_mm_function_name(dtype, matrix_descr_abbreviation, batch_type) + try: + function = getattr(blas, function_name) + try: + function() + except TypeError: + pass + + new_num_threads = 0 if execution.num_threads is None else execution.num_threads + + for set_num_threads_local_name in [ + "set_num_threads_local", + "mkl_set_num_threads_local", + "openblas_set_num_threads_local", + ]: + try: + set_num_threads_local = getattr(blas, set_num_threads_local_name) + old_num_threads = set_num_threads_local(new_num_threads) + set_num_threads_local(old_num_threads) + except FunctionNotFoundError as e: + logger.debug(e) + pass + else: + logger.debug(f"function {set_num_threads_local_name} is valid.") + break + else: + # If none of the local setting functions are valid, implement a dummy function + def set_num_threads_local(x): + pass + + def wrapped_with_threads_and_stream(*args, stream_holder: None): + old_num_threads = set_num_threads_local(new_num_threads) + function(blas.ORDER.ColMajor, *args) + set_num_threads_local(old_num_threads) + + wrapped_with_threads_and_stream.__name__ = function.__name__ + + logger.info("Loaded a NVPL BLAS API function named %s", function.__name__) + return wrapped_with_threads_and_stream + except (AttributeError, FunctionNotFoundError) as e: + # The user may try to call a newer function with older cuBLAS. + try: + blas_version = blas.get_version() + except FunctionNotFoundError: + blas_version = "unknown" + msg = ( + f"{function_name}() is an unknown NVPL BLAS function " + f"for NVPL BLAS version {blas_version}. Please check the BLAS Level-3 Function Reference " + f"to see whether this function should exists for NVPL BLAS version {blas_version}." 
+ ) + raise NotImplementedError(msg) from e + + +# NOTE: We have to map from name (str) to enum because of value collisions in the enums +_CUBLAS_ENUM_TO_NVPL_ENUM: dict[str, int] = { + cublas.DiagType.NON_UNIT.name: blas.DIAG.NonUnit, + cublas.DiagType.UNIT.name: blas.DIAG.Unit, + cublas.FillMode.LOWER.name: blas.UPLO.Lower, + cublas.FillMode.UPPER.name: blas.UPLO.Upper, + cublas.Operation.C.name: blas.TRANSPOSE.ConjTrans, + cublas.Operation.N.name: blas.TRANSPOSE.NoTrans, + cublas.Operation.T.name: blas.TRANSPOSE.Trans, + cublas.SideMode.LEFT.name: blas.SIDE.Left, + cublas.SideMode.RIGHT.name: blas.SIDE.Right, +} + + +def nvpl_enum_mapper(enum): + """Maps cuBLAS enums to BLAS enums.""" + return _CUBLAS_ENUM_TO_NVPL_ENUM[enum.name] diff --git a/nvmath/linalg/generic/_dtype.py b/nvmath/linalg/generic/_dtype.py new file mode 100644 index 0000000..8b9eecc --- /dev/null +++ b/nvmath/linalg/generic/_dtype.py @@ -0,0 +1,11 @@ +SUPPORTED_TYPES = [ + "float32", + "float64", + "complex64", + "complex128", +] + + +def check_dtype(dtype, operand_name): + if dtype not in SUPPORTED_TYPES: + raise ValueError(f"The dtype of operand {operand_name} ({dtype}) is not supported.") diff --git a/nvmath/linalg/generic/matmulmod.py b/nvmath/linalg/generic/matmulmod.py new file mode 100644 index 0000000..c5b7c94 --- /dev/null +++ b/nvmath/linalg/generic/matmulmod.py @@ -0,0 +1,1013 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +__all__ = [ + "Matmul", + "matmul", +] + +import dataclasses +import logging +from typing import TypeAlias +from collections.abc import Sequence + +import numpy as np +import cuda.core.experimental as ccx + +from nvmath.bindings import cublas +from nvmath._internal import templates +from nvmath.internal import utils, tensor_wrapper, typemaps, formatters +from nvmath.linalg._internal.batch import BatchTraits +from nvmath.linalg._internal.layout import BLASMMTraits, BLASMatrixTraits, check_extents, check_strides +from nvmath.linalg.generic._configuration import ( + GeneralMatrixQualifier, + MatrixQualifier, + matrix_qualifiers_dtype, + MatmulOptions, + select_blas_mm_function, + vector_to_square, +) +from nvmath.linalg.advanced.matmulmod import SHARED_MM_DOCUMENTATION +from nvmath.linalg.generic._dtype import check_dtype + +AnyTensor: TypeAlias = tensor_wrapper.AnyTensor +SideMode: TypeAlias = cublas.SideMode +FillMode: TypeAlias = cublas.FillMode +DiagType: TypeAlias = cublas.DiagType + + +@dataclasses.dataclass(frozen=True, slots=True, kw_only=True) +class ExecutionCUDA(templates.ExecutionCUDA): + """ + A data class for providing GPU execution options. + + Attributes: + device_id: CUDA device ordinal (only used if the operand resides on the CPU). The + default value is 0. + + .. seealso:: + :class:`ExecutionCPU`, :class:`Matmul`, :func:`matmul` + """ + + pass + + +@dataclasses.dataclass(frozen=True, slots=True, kw_only=True) +class ExecutionCPU(templates.ExecutionCPU): + """ + A data class for providing CPU execution options. + + Attributes: + num_threads: The number of CPU threads used to execute the operation. + If not specified, defaults to the number of CPU cores available to the + process. + + .. 
seealso:: + :class:`ExecutionCUDA`, :class:`Matmul`, :func:`matmul`, + :func:`~nvmath.bindings.nvpl.blas.set_num_threads_local` + """ + + pass + + +class InvalidMatmulState(Exception): + pass + + +GENERIC_MM_DOCUMENTATION = SHARED_MM_DOCUMENTATION.copy() +GENERIC_MM_DOCUMENTATION.update( + { + "qualifiers": """\ +If desired, specify the matrix qualifiers as a :class:`numpy.ndarray` of +:class:`~nvmath.linalg.generic.matrix_qualifiers_dtype` objects of length <= 3 corresponding to the operands `a`, `b`, and +`c`. By default, :class:`GeneralMatrixQualifier` is assumed for each tensor. See +:ref:`matrix-tensor-qualifiers` for the motivation behind qualifiers.""".replace("\n", " "), + # + "execution": """\ +Specify execution space options for the Matmul as a :class:`ExecutionCUDA` or :class:`ExecutionCPU` object. If not specified, +the execution space will be selected to match operand's storage (in GPU or host memory), and the corresponding +:class:`ExecutionCUDA` or :class:`ExecutionCPU` object will be default-constructed.""".replace("\n", " "), + # + "options": """\ +Specify options for the matrix multiplication as a :class:`MatmulOptions` object. If not specified, the +value will be set to the default-constructed ``MatmulOptions`` object.""".replace("\n", " "), + # + "result": """\ +The result of the specified matrix multiplication, which remains on the same device and belong to the +same package as the input operands.""".replace("\n", " "), + # + "semantics": """\ + .. _semantics: + + The semantics of the matrix multiplication follows :func:`numpy.matmul` semantics, with some restrictions. + + * Batching is not supported in this API, but is planned for a future release. See the advanced API + (:func:`nvmath.linalg.advanced.matmul`) for an API that supports batching. + * Broadcasting `c` is not supported in this API, but may be supported in the future. See the advanced API + (:func:`nvmath.linalg.advanced.matmul`) for an API that supports broadcasting `c`. + + In addition, the semantics for the fused matrix addition are described below: + + * If arguments `a` and `b` are matrices, they are multiplied according to the rules of matrix multiplication. + * If argument `a` is 1-D, it is promoted to a matrix by prefixing ``1`` to its dimensions. After matrix + multiplication, the prefixed ``1`` is removed from the result's dimensions. + * If argument `b` is 1-D, it is promoted to a matrix by appending ``1`` to its dimensions. After matrix + multiplication, the appended ``1`` is removed from the result's dimensions. + * The operand for the matrix addition `c` must be the expected shape of the result of the matrix multiplication. + +""".strip(), + } +) + + +@utils.docstring_decorator(GENERIC_MM_DOCUMENTATION, skip_missing=False) +class Matmul(templates.StatefulAPI[MatmulOptions]): + """ + Create a stateful object encapsulating the specified matrix multiplication computation + :math:`\\alpha a @ b + \\beta c` and the required resources to perform the operation. A + stateful object can be used to amortize the cost of preparation (planning in the case of + matrix multiplication) across multiple executions (also see the :ref:`Stateful APIs + ` section). + + The function-form API :func:`matmul` is a convenient alternative to using stateful + objects for *single* use (the user needs to perform just one matrix multiplication, for + example), in which case there is no possibility of amortizing preparatory costs. The + function-form APIs are just convenience wrappers around the stateful object APIs. 
+ + Using the stateful object typically involves the following steps: + + 1. **Problem Specification**: Initialize the object with a defined operation and + options. + 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation + for this specific matrix multiplication operation. + 3. **Execution**: Perform the matrix multiplication computation with :meth:`execute`. + + Detailed information on what's happening in the various phases described above can be + obtained by passing in a :class:`logging.Logger` object to :class:`MatmulOptions` or by + setting the appropriate options in the root logger object, which is used by default: + + >>> import logging + >>> logging.basicConfig( + ... level=logging.INFO, + ... format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) + + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. + + Args: + a: {a} + + b: {b} + + c: {c} + + alpha: {alpha} + + beta: {beta} + + qualifiers: {qualifiers} + + options: {options} + + execution: {execution} + + stream: {stream} + + Semantics: + {semantics} + + .. seealso:: + :meth:`reset_operands`, :meth:`execute` + + Examples: + + >>> import numpy as np + >>> import nvmath + + Create two 2-D float64 ndarrays on the CPU: + + >>> M, N, K = 1024, 1024, 1024 + >>> a = np.random.rand(M, K) + >>> b = np.random.rand(K, N) + + We will define a matrix multiplication operation using the generic matrix + multiplication interface. + + Create a Matmul object encapsulating the problem specification above: + + >>> mm = nvmath.linalg.Matmul(a, b) + + Options can be provided above to control the behavior of the operation using the + `options` argument (see :class:`MatmulOptions`). + + Next, plan the operation. The operands' layouts, qualifiers, and dtypes will be + considered to select an appropriate matrix multiplication: + + >>> mm.plan() + + Now execute the matrix multiplication, and obtain the result `r1` as a NumPy + ndarray. + + >>> r1 = mm.execute() + + Note that all :class:`Matmul` methods execute on the current stream by default. + Alternatively, the `stream` argument can be used to run a method on a specified + stream. + + Let's now look at the same problem with CuPy ndarrays on the GPU. + + Create a 3-D complex128 CuPy ndarray on the GPU: + + >>> import cupy as cp + >>> a = cp.random.rand(M, K) + >>> b = cp.random.rand(K, N) + + Create an Matmul object encapsulating the problem specification described earlier + and use it as a context manager. + + >>> with nvmath.linalg.Matmul(a, b) as mm: + ... # Plan the operation. + ... mm.plan() + ... + ... # Execute the operation to get the first result. + ... r1 = mm.execute() + ... + ... # Update operands A and B in-place (see reset_operands() for an + ... # alternative). + ... a[:] = cp.random.rand(M, K) + ... b[:] = cp.random.rand(K, N) + ... + ... # Execute the operation to get the new result. + ... r2 = mm.execute() + + + All the resources used by the object are released at the end of the block. + + Further examples can be found in the `nvmath/examples/linalg/generic/matmul + `_ + directory. 
+ """ + + _input_traits: BLASMMTraits + _batch_traits: tuple[BatchTraits, BatchTraits, BatchTraits] + _qualifiers: MatrixQualifier + + def __init__( + self, + a: AnyTensor, + b: AnyTensor, + /, + c: AnyTensor | None = None, + *, + alpha: float | complex | None = None, + beta: float | complex | None = None, + qualifiers: MatrixQualifier | None = None, + options: MatmulOptions | None = None, + execution: ExecutionCPU | ExecutionCUDA | None = None, + stream: utils.AnyStream | int | None = None, + ): + options = utils.check_or_create_options(MatmulOptions, options, "Matrix multiplication options") + assert options is not None + + if c is None and options.inplace: + raise ValueError("Operation cannot be inplace if operand C is not provided.") + + a = tensor_wrapper.wrap_operand(a) + check_dtype(a.dtype, "A") + check_extents(a.shape, "A") + check_strides(a.strides, "A") + + b = tensor_wrapper.wrap_operand(b) + check_dtype(b.dtype, "B") + check_extents(b.shape, "B") + check_strides(b.strides, "B") + operands = [a, b] + + self.num_operands = 2 + if c is not None: + self.num_operands = 3 + c = tensor_wrapper.wrap_operand(c) + check_dtype(c.dtype, "C") + check_extents(c.shape, "C") + check_strides(c.strides, "C") + operands.append(c) + + super().__init__(operands, options=options, execution=execution, stream=stream) + + self._logger.info(f"The data type of operand A is '{a.dtype}', and that of operand B is '{b.dtype}'.") + if c is not None: + self._logger.info(f"The data type of operand C is '{c.dtype}'.") + + if self.options.inplace: + self._logger.info("The operation will be performed inplace with operand C.") + + if c is not None and beta is None: + raise ValueError("A value for beta must be provided if operand C is provided.") + + assert self.num_operands == 2 or self.num_operands == 3, "Internal Error." + + if a.dtype != b.dtype or (c is not None and a.dtype != c.dtype): + raise ValueError( + "Unsupported combination of dtypes. " + f"A ({a.dtype}), B ({b.dtype}), and C ({getattr(c, 'dtype', None)}) must all have the same dtype." + ) + # Determine the data types for a and b. + self.a_dtype = typemaps.NAME_TO_DATA_TYPE[a.dtype] + self.b_dtype = typemaps.NAME_TO_DATA_TYPE[b.dtype] + self.a_dtype_name = a.dtype + self.b_dtype_name = b.dtype + + self.is_complex = "complex" in self.a_dtype_name or "complex" in self.b_dtype_name + + # Determine the data types for c. + if c is None: + self.c_dtype = self.a_dtype + else: + self.c_dtype = typemaps.NAME_TO_DATA_TYPE[c.dtype] + self.c_dtype_name = typemaps.DATA_TYPE_TO_NAME[self.c_dtype] + + self._logger.info(f"The data type for the result C is '{self.c_dtype_name}'.") + + self.scale_type_name = self.a_dtype_name + + # Set alpha and beta. 
+ self.alpha = np.zeros((1,), dtype=self.scale_type_name) + try: + self.alpha[0] = alpha if alpha is not None else 1 + except (ValueError, TypeError) as e: + raise ValueError(f"The value provided for alpha {alpha} is not convertible to dtype '{self.alpha.dtype}'.") from e + + self.beta = np.zeros((1,), dtype=self.scale_type_name) + if beta is not None and c is None: + self._logger.warning(f"Matmul: The provided beta value {beta} is ignored since operand C is not specified.") + try: + self.beta[0] = beta if beta is not None and c is not None else 0 + except (ValueError, TypeError) as e: + raise ValueError(f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'.") from e + + if qualifiers is None: + self._qualifiers = np.empty(3, dtype=matrix_qualifiers_dtype) + self._qualifiers[:] = GeneralMatrixQualifier.create() + else: + if not ((len(qualifiers) == 3) or (len(qualifiers) == 2 and c is None)): + raise ValueError("The number of MatrixQualifiers must match the number of operands.") + new_qualifiers = np.empty(3, dtype=matrix_qualifiers_dtype) + new_qualifiers[:2] = qualifiers[:2] + new_qualifiers[2] = GeneralMatrixQualifier.create() if len(qualifiers) < 3 else qualifiers[2] + self._qualifiers = new_qualifiers + self._logger.info( + f"The matrix multiplication qualifiers are " + f"A = {GeneralMatrixQualifier.to_string(self._qualifiers[0])}, " + f"B = {GeneralMatrixQualifier.to_string(self._qualifiers[1])}, and " + f"C = {GeneralMatrixQualifier.to_string(self._qualifiers[2])}." + ) + + # Set qualifiers based on torch lazy conjugation flag if not provided. + self._qualifiers[0]["conjugate"] = self._qualifiers[0]["conjugate"] ^ self._operands[0].is_conjugate + self._qualifiers[1]["conjugate"] = self._qualifiers[1]["conjugate"] ^ self._operands[1].is_conjugate + self.lazy_conjugation = (self._operands[0].is_conjugate, self._operands[1].is_conjugate, False) + if c is not None: + self._qualifiers[2]["conjugate"] = self._qualifiers[2]["conjugate"] ^ self._operands[2].is_conjugate + if self._qualifiers[2]["conjugate"]: + raise ValueError("The conjugate flag is currently not supported for operand C.") + + # Capture operand extents and strides for consistency check when resetting operands. + self.operand_extents = tuple(o.shape for o in self._operands) + self.operand_strides = tuple(o.strides for o in self._operands) + + # Create operand layouts. + a_layout = BLASMatrixTraits( + self.a_dtype, + *vector_to_square( + self._operands[0].shape[-2:], + self._operands[0].strides[-2:], + self._qualifiers[0], + ), + is_conjugate=bool(self._qualifiers[0]["conjugate"]), + is_transpose=bool(self._qualifiers[0]["transpose"]), + is_lower=self._qualifiers[0]["uplo"] == FillMode.LOWER, + ) + b_layout = BLASMatrixTraits( + self.b_dtype, + *vector_to_square( + self._operands[1].shape[-2:], + self._operands[1].strides[-2:], + self._qualifiers[1], + ), + is_conjugate=bool(self._qualifiers[1]["conjugate"]), + is_transpose=bool(self._qualifiers[1]["transpose"]), + is_lower=self._qualifiers[1]["uplo"] == FillMode.LOWER, + ) + c_layout = ( + None + if c is None + else BLASMatrixTraits( + self.c_dtype, + *vector_to_square( + self._operands[2].shape[-2:], + self._operands[2].strides[-2:], + self._qualifiers[2], + ), + is_conjugate=bool(self._qualifiers[2]["conjugate"]), + is_transpose=bool(self._qualifiers[2]["transpose"]), + is_lower=self._qualifiers[2]["uplo"] == FillMode.LOWER, + ) + ) + + # Get the operation traits. 
+ self._input_traits = BLASMMTraits.from_layouts(a_layout, b_layout, c_layout, self._logger) + self._logger.info( + f"The matrix multiplication attributes are M = {self._input_traits.M}, N = {self._input_traits.N}, and " + f"K = {self._input_traits.K}." + ) + + a_batch = BatchTraits.from_full_shape_and_strides( + self._operands[0].shape, + self._operands[0].strides, + num_trailing_dims=2, + overlap_allowed=True, + ) + b_batch = BatchTraits.from_full_shape_and_strides( + self._operands[1].shape, + self._operands[1].strides, + num_trailing_dims=2, + overlap_allowed=True, + ) + c_batch = ( + BatchTraits.from_full_shape_only( + (*(a_batch * b_batch), *self._input_traits.c_layout_traits.shape), + num_trailing_dims=2, + ) + if c is None + else BatchTraits.from_full_shape_and_strides( + self._operands[2].shape, + self._operands[2].strides, + num_trailing_dims=2, + overlap_allowed=False, + ) + ) + self._logger.debug("Operand A has %s.", a_batch) + self._logger.debug("Operand B has %s.", b_batch) + self._logger.debug("Operand C has %s.", c_batch) + if a_batch.shape != () or b_batch.shape != () or c_batch.shape != (): + raise ValueError("Batched inputs are unsupported by the generic matmul API at this time.") + if (a_batch * b_batch) != c_batch.shape: + raise ValueError( + f"The batch dimensions of operand C are invalid. {a_batch * b_batch} does not match {c_batch.shape}." + ) + self._batch_traits = (a_batch, b_batch, c_batch) + + self._logger.info( + f"The batch count is {self._batch_traits[2].count}, and the batch shape is {self._batch_traits[2].shape}." + ) + + # Attributes to establish stream ordering. + self.workspace_stream: ccx.Stream | None = None + self.last_compute_event: ccx.Event | None = None + + self.valid_state = True + self._logger.info("The Matmul operation has been created.") + + def _check_valid_matmul(self, *args, **kwargs): + """ + Check if the Matmul object is alive and well. + """ + if not self.valid_state: + raise InvalidMatmulState("The Matmul object cannot be used after resources are free'd") + + @utils.precondition(_check_valid_matmul) + def plan(self) -> None: + """ + Plan the matrix multiplication operation. + + Unlike :py:meth:`nvmath.linalg.advanced.Matmul.plan`, this method takes no + tuning parameters. Its primary function is to find the correct matrix multiplication + implementation based on the operands and options provided to the constructor. + + Args: + stream: {stream} + + Returns: + Nothing. + """ + self._logger.info("= PLANNING PHASE =") + + mm_traits = self._input_traits.blas_compatible(self._logger, self.options.inplace) + if self.options.inplace: + self.result_layout_traits = self._input_traits.c_layout_traits + self.result_batch_traits = self._batch_traits[2] + else: + self.result_layout_traits = self._input_traits.c_layout_traits.trim_strides() + self.result_batch_traits = BatchTraits.from_full_shape_only( + shape=(*self._batch_traits[2].shape, *self.result_layout_traits.shape), + num_trailing_dims=len(self.result_layout_traits.shape), + ) + + # Base FLOP count. 
+ self.flop_count = 2 * mm_traits.M * mm_traits.N * mm_traits.K + self._logger.info(f"The base matrix multiplication FLOP count is {formatters.FLOPSStr(self.flop_count, 'FLOP')}.") + + self._function = select_blas_mm_function( + (*self._batch_traits[:2], self.result_batch_traits), + mm_traits, + self._qualifiers, + self._logger, + self.execution, + ) + + self._has_plan = True + + return + + def _check_and_set_operand( + self, + new_operand: utils.TensorHolder, + operand_name: str, + stream_holder: utils.StreamHolder | None, + *, + operand_index: int, + dtype: str | None = None, + extents: Sequence[int] | None = None, + strides: Sequence[int] | None = None, + ): + """ + Check to make sure that the provided operand is consistent with the one it's + updating, and update it. + """ + # Make sure that the data type and extents match. + utils.check_attribute_match(dtype, new_operand.dtype, "data type") + utils.check_attribute_match(extents, new_operand.shape, "extents") + + # Package must be the same to preserve stream ordering + if self._operands_package != new_operand.name: + raise TypeError( + f"Library package mismatch: The operand {operand_name} must from the same package ({new_operand.name}) " + f"as the original operand ({self._operands_package})." + ) + + if self._operands_device_id != new_operand.device_id: + raise ValueError( + f"The operand {operand_name} must be on the same device ({new_operand.device_id}) as the original operand " + f"({self._operands_device_id})." + ) + + # Conjugate flag of the provided operands must match the original qualifiers + if self.lazy_conjugation[operand_index] != new_operand.is_conjugate: + raise ValueError(f"The provided operand {operand_name} has different conjugate flag than the original operand") + + self._operands[operand_index], self._operands_backup[operand_index] = templates.copy_operand_perhaps( + internal_operand=self._operands[operand_index], + operand=new_operand, + stream_holder=stream_holder, + execution_device_id=getattr(self.execution, "device_id", "cpu"), + operands_device_id=new_operand.device_id, + ) + + # Check strides after copying because copy could affect data layout? + # FIXME: Could end up with a broken operand state if user catches error raised here? + # But if we don't use copy_operand_perhaps, we can't do inplace operand reset. + utils.check_attribute_match(strides, self._operands[operand_index].strides, "strides") + + self._logger.info(f"Operand '{operand_name}' has been reset to the new value.") + + return + + @utils.precondition(_check_valid_matmul) + def reset_operands( + self, + a=None, + b=None, + c=None, + *, + alpha=None, + beta=None, + stream: utils.AnyStream | int | None = None, + ): + """ + Reset the operands held by this :class:`Matmul` instance. + + This method has two use cases: + (1) it can be used to provide new operands for execution when the original + operands are on the CPU + (2) it can be used to release the internal reference to the previous operands + and make their memory available for other use by passing ``None`` for *all* + arguments. In this case, this method must be called again to provide the + desired operands before another call to execution APIs like :meth:`autotune` + or :meth:`execute`. + + This method is not needed when the operands reside on the GPU and in-place + operations are used to update the operand values. + + This method will perform various checks on the new operands to make sure: + + - The shapes, strides, datatypes match those of the old ones. 
+ - The packages that the operands belong to match those of the old ones. + - If input tensors are on GPU, the device must match. + + Args: + a: {a} + + b: {b} + + c: {c} + + alpha: {alpha} + + beta: {beta} + + stream: {stream} + + Examples: + + >>> import cupy as cp + >>> import nvmath + + Create two 3-D float64 ndarrays on the GPU: + + >>> M, N, K = 128, 128, 256 + >>> a = cp.random.rand(M, K) + >>> b = cp.random.rand(K, N) + + Create an matrix multiplication object as a context manager + + >>> with nvmath.linalg.Matmul(a, b) as mm: + ... # Plan the operation. + ... mm.plan() + ... + ... # Execute the MM to get the first result. + ... r1 = mm.execute() + ... + ... # Reset the operands to new CuPy ndarrays. + ... c = cp.random.rand(M, K) + ... d = cp.random.rand(K, N) + ... mm.reset_operands(c, d) + ... + ... # Execute to get the new result corresponding to the updated operands. + ... r2 = mm.execute() + + Note that if only a subset of operands are reset, the operands that are not + reset hold their original values. + + With :meth:`reset_operands`, minimal overhead is achieved as problem + specification and planning are only performed once. + + For the particular example above, explicitly calling :meth:`reset_operands` is + equivalent to updating the operands in-place, i.e, replacing + ``mm.reset_operand(c, d)`` with ``a[:]=c`` and ``b[:]=d``. Note that updating + the operand in-place should be adopted with caution as it can only yield the + expected result under the additional constraint below: + + - The operand is on the GPU (more precisely, the operand memory space should + be accessible from the execution space). + + For more details, please refer to `inplace update example + `_. + """ + + if c is not None and self.num_operands == 2: + raise ValueError( + "The matrix multiplication problem specification does not include operand C, so it cannot be reset." + ) + + if a is None and b is None and c is None and alpha is None and beta is None: + self._operands = None # type: ignore[assignment] + self._logger.info("The operands have been reset to None.") + return + + # If the operands have been reset to None, then all required operands (a, b, c, and + # epilog_inputs need to be provided). + if not self._operands: + if a is None or b is None or (c is None and self.num_operands == 3): + op_names = "A, B" + if c is None and self.num_operands == 3: + op_names += ", C" + raise ValueError(f"Operands {op_names} must be provided.") + self._operands = [None] * self.num_operands # type: ignore[list-item] + + # Future operations on the workspace stream should be ordered after the computation. + if self.last_compute_event is not None: + # FIMXE: What if result is in-place? Then don't we need to wait for copy + # from result to out? + assert self.workspace_stream is not None + self.workspace_stream.wait(self.last_compute_event) + self.last_compute_event = None + + # Update alpha. + if alpha is not None: + try: + self.alpha[0] = alpha + except (ValueError, TypeError) as e: + raise ValueError( + f"The value provided for alpha {alpha} is not convertible to dtype '{self.alpha.dtype}'." + ) from e + + # Update beta. + if beta is not None: + if self.num_operands == 2: + self._logger.warning(f"Matmul: The provided beta value {beta} is ignored since operand C is not specified.") + else: + try: + self.beta[0] = beta + except (ValueError, TypeError) as e: + raise ValueError( + f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'." 
+ ) from e + + exec_stream_holder, operand_stream_holder = self._get_or_create_stream_maybe(stream) + + # Reset the provided operands. + if a is not None: + a = tensor_wrapper.wrap_operand(a) + index = 0 + self._check_and_set_operand( + a, + "A", + operand_stream_holder, + operand_index=index, + dtype=self.a_dtype_name, + extents=self.operand_extents[index], + strides=self.operand_strides[index], + ) + + if b is not None: + b = tensor_wrapper.wrap_operand(b) + index = 1 + self._check_and_set_operand( + b, + "B", + operand_stream_holder, + operand_index=index, + dtype=self.b_dtype_name, + extents=self.operand_extents[index], + strides=self.operand_strides[index], + ) + + if c is not None: # If we get here, we know that C is one of the operands in the problem specification. + c = tensor_wrapper.wrap_operand(c) + index = 2 + self._check_and_set_operand( + c, + "C", + operand_stream_holder, + operand_index=index, + dtype=self.c_dtype_name, + extents=self.operand_extents[index], + strides=self.operand_strides[index], + ) + + @utils.precondition(_check_valid_matmul) + @utils.precondition(templates.StatefulAPI._check_planned, "Execution") + @utils.precondition(templates.StatefulAPI._check_valid_operands, "Execution") + def execute(self, *, stream: utils.AnyStream | int | None = None) -> utils.AnyTensor: + """ + Execute a prepared (planned) matrix multiplication. + + This method is a wrapper around :py:meth:`_execute`, which takes the same arguments, + but skips as many correctness and safety checks as possible. + + Args: + stream: {stream} + + Returns: + {result} + """ + return self._execute(stream=stream) + + def _execute(self, *, stream: utils.AnyStream | int | None = None) -> utils.AnyTensor: + log_info = self._logger.isEnabledFor(logging.INFO) + log_debug = self._logger.isEnabledFor(logging.DEBUG) + if log_info: + self._logger.info("= EXECUTION PHASE =") + exec_stream_holder, operand_stream_holder = self._get_or_create_stream_maybe(stream) + if log_info: + self._logger.info( + "The specified stream for execute() is " + f"{getattr(exec_stream_holder or operand_stream_holder, 'obj', 'no stream')}." + ) + + # We must handle all valid combinations of: + # - c-provided and c-not-provided + # - results in-place and out-of-place + + # Create empty tensor for the result. + if self.num_operands == 2 or not self.options.inplace: + if log_debug: + self._logger.debug("Beginning output (empty) tensor creation...") + self._logger.debug( + f"The output tensor shape = {self.result_layout_traits.shape} with strides = " + f"{self.result_layout_traits.strides} and data type '{self.c_dtype_name}'." + ) + self._logger.debug( + f"The output tensor has batch dimensions with shape {self.result_batch_traits.shape} " + f"and strides {self.result_batch_traits.strides}." + ) + result = utils.create_empty_tensor( + self._result_class, + (*self.result_batch_traits.shape, *self.result_layout_traits.shape), + self.c_dtype_name, + getattr(self.execution, "device_id", "cpu"), + exec_stream_holder, + # verify_strides=False because we need strides to be exactly what we + # request; not arbitrary if the strides aren't contiguous and dense. + # Otherwise, the layout parameters will mismatch what we pass to the matmul + # implementation. 
+ verify_strides=False, + strides=(*self.result_batch_traits.strides, *self.result_layout_traits.strides), + ) + if log_debug: + self._logger.debug("The output (empty) tensor has been created.") + else: # num_operands == 3 and self.options.inplace + result = self._operands[2] + self._logger.debug("The output tensor is C (in-place execution).") + + if self.num_operands == 3 and not self.options.inplace: + result.copy_(self._operands[2], exec_stream_holder) + self._logger.debug("Operand C copied to result tensor (out-of-place execution).") + + a, b = self._operands[0], self._operands[1] + if log_info: + self._logger.info("Starting matrix multiplication...") + + if self.execution.name == "cuda": + assert exec_stream_holder is not None + self.workspace_stream = exec_stream_holder.obj + with utils.cuda_call_ctx(exec_stream_holder, self._blocking, timing=log_info) as ( + self.last_compute_event, + elapsed, + ): + self._function( + a, + b, + result, + self.alpha, + self.beta, + exec_stream_holder, + ) + else: + with utils.host_call_ctx(timing=log_info) as elapsed: + self._function( + a, + b, + result, + self.alpha, + self.beta, + exec_stream_holder, + ) + + if log_info and elapsed.data is not None: + self._logger.info(f"The matrix multiplication calculation took {elapsed.data:.3f} ms to complete.") + + # Return the result and auxiliary outputs, if present. + if self._operands_device_id != getattr(self.execution, "device_id", "cpu"): + if self.options.inplace: + c = self._operands_backup[2] + assert c is not None, ( + "Internal Error. " + "Inplace operation was requested, but the execution space was different from the input space, " + "and we didn't keep a reference to the input tensor." + ) + c.copy_(result, stream_holder=operand_stream_holder) + out = c.tensor + else: + out = result.to(self._operands_device_id, stream_holder=operand_stream_holder).tensor + else: + out = result.tensor + + return out + + def __exit__(self, *args, **kwargs) -> bool | None: + pass + + +@utils.docstring_decorator(GENERIC_MM_DOCUMENTATION, skip_missing=False) +def matmul( + a: AnyTensor, + b: AnyTensor, + /, + c: AnyTensor | None = None, + *, + alpha: float | complex | None = None, + beta: float | complex | None = None, + qualifiers: MatrixQualifier | None = None, + options: MatmulOptions | None = None, + execution: ExecutionCPU | ExecutionCUDA | None = None, + stream: utils.AnyStream | int | None = None, +): + """ + Perform the specified matrix multiplication computation :math:`\\alpha a @ b + \\beta + c`. This function-form is a wrapper around the stateful :class:`Matmul` object APIs and + is meant for *single* use (the user needs to perform just one matrix multiplication, for + example), in which case there is no possibility of amortizing preparatory costs. + + Detailed information on what's happening within this function can be obtained by passing + in a :class:`logging.Logger` object to :class:`MatmulOptions` or by setting the + appropriate options in the root logger object, which is used by default: + + >>> import logging + >>> logging.basicConfig( + ... level=logging.INFO, + ... format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) + + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. + + Args: + a: {a} + + b: {b} + + c: {c} + + alpha: {alpha} + + beta: {beta} from a previously planned and autotuned matrix multiplication. 
+ + qualifiers: {qualifiers} + + options: {options} + + execution: {execution} + + stream: {stream} + + Returns: + {result} + + Semantics: + {semantics} + + .. seealso:: + :class:`Matmul`, :class:`MatmulOptions`, :class:`matrix_qualifiers_dtype`, + :class:`MatrixQualifier` + + Examples: + + >>> import cupy as cp + >>> import nvmath + + Create three float32 ndarrays on the GPU: + + >>> M, N, K = 128, 64, 256 + >>> a = cp.random.rand(M, K, dtype=cp.float32) + >>> b = cp.random.rand(K, N, dtype=cp.float32) + >>> c = cp.random.rand(M, N, dtype=cp.float32) + + Perform the operation :math:`\\alpha A @ B + \\beta C` using :func:`matmul`. The + result `r` is also a CuPy float64 ndarray: + + >>> r = nvmath.linalg.matmul(a, b, c, alpha=1.23, beta=0.74) + + The package current stream is used by default, but a stream can be explicitly + provided to the Matmul operation. This can be done if the operands are computed on a + different stream, for example: + + >>> s = cp.cuda.Stream() + >>> with s: + ... a = cp.random.rand(M, K) + ... b = cp.random.rand(K, N) + >>> r = nvmath.linalg.matmul(a, b, stream=s) + + The operation above runs on stream `s` and is ordered with respect to the input + computation. + + Create NumPy ndarrays on the CPU. + + >>> import numpy as np + >>> a = np.random.rand(M, K) + >>> b = np.random.rand(K, N) + + Provide the NumPy ndarrays to :func:`matmul`, with the result also being a NumPy + ndarray: + + >>> r = nvmath.linalg.matmul(a, b) + + Notes: + - This function is a convenience wrapper around :class:`Matmul` and and is + specifically meant for *single* use. + + Further examples can be found in the `nvmath/examples/linalg/generic/matmul + `_ + directory. + """ + + with Matmul( + a, + b, + c=c, + alpha=alpha, + beta=beta, + qualifiers=qualifiers, + options=options, + execution=execution, + stream=stream, + ) as mm: + mm.plan() + + r = mm.execute(stream=stream) + + return r diff --git a/nvmath/sparse/_internal/cudss_config_ifc.py b/nvmath/sparse/_internal/cudss_config_ifc.py index 4fa6211..4c40a24 100644 --- a/nvmath/sparse/_internal/cudss_config_ifc.py +++ b/nvmath/sparse/_internal/cudss_config_ifc.py @@ -80,6 +80,10 @@ def __init__(self, solver): self._pivot_type = np.zeros((1,), dtype=get_dtype(ConfigParamEnum.PIVOT_TYPE)) self._pivot_threshold = np.zeros((1,), dtype=get_dtype(ConfigParamEnum.PIVOT_THRESHOLD)) self._max_lu_nnz = np.zeros((1,), dtype=get_dtype(ConfigParamEnum.MAX_LU_NNZ)) + self._use_matching = np.zeros((1,), dtype=get_dtype(ConfigParamEnum.USE_MATCHING)) + self._matching_alg = np.zeros((1,), dtype=get_dtype(ConfigParamEnum.MATCHING_ALG)) + self._nd_nlevels = np.zeros((1,), dtype=get_dtype(ConfigParamEnum.ND_NLEVELS)) + self._use_superpanels = np.zeros((1,), dtype=get_dtype(ConfigParamEnum.USE_SUPERPANELS)) def _check_valid_solver_wrapper(self, *args, **kwargs): _check_valid_solver(self) @@ -147,6 +151,36 @@ def reordering_algorithm(self, algorithm): algorithm = cudss.AlgType(algorithm) _set_scalar_attribute(self._config_ptr, ConfigParamEnum.REORDERING_ALG, self._reordering_alg, algorithm) + @property + @utils.precondition(_check_valid_solver_wrapper) + def matching_algorithm(self): + """ + Query or set the matching algorithm used. See + :class:`nvmath.bindings.cudss.AlgType` and the `cuDSS documentation + `_ for more + information. 
+ """ + _get_scalar_attribute(self._config_ptr, ConfigParamEnum.MATCHING_ALG, self._matching_alg) + + return cudss.AlgType(self._matching_alg.item()) + + @matching_algorithm.setter + @utils.precondition(_check_valid_solver_wrapper) + def matching_algorithm(self, algorithm): + """ + Set the matching algorithm to use. See :class:`nvmath.bindings.cudss.AlgType` and + the `cuDSS documentation + `_ for more + information. + + Args: + algorithm: The matching algorithm of type + :class:`nvmath.bindings.cudss.AlgType` or Python `int`. + + """ + algorithm = cudss.AlgType(algorithm) + _set_scalar_attribute(self._config_ptr, ConfigParamEnum.MATCHING_ALG, self._matching_alg, algorithm) + @property @utils.precondition(_check_valid_solver_wrapper) def pivot_type(self): @@ -234,6 +268,93 @@ def max_nnz(self, max_nnz): """ _set_scalar_attribute(self._config_ptr, ConfigParamEnum.MAX_LU_NNZ, self._max_lu_nnz, max_nnz) + @property + @utils.precondition(_check_valid_solver_wrapper) + def use_matching(self): + """ + Query or set the option to enable or disable matching. See the + `cuDSS documentation + `_ + for more information. + """ + _get_scalar_attribute(self._config_ptr, ConfigParamEnum.USE_MATCHING, self._use_matching) + + return self._use_matching.item() + + @use_matching.setter + @utils.precondition(_check_valid_solver_wrapper) + def use_matching(self, matching_flag): + """ + Set the option to enable or disable matching. See the + `cuDSS documentation + `_ + for more information. + + Args: + matching_flag: The flag to enable or disable matching (Python `int` + or `bool`, 0 to disable). + + """ + _set_scalar_attribute(self._config_ptr, ConfigParamEnum.USE_MATCHING, self._use_matching, matching_flag) + + @property + @utils.precondition(_check_valid_solver_wrapper) + def nd_min_levels(self): + """ + Query or set the minimum number of levels for the nested dissection reordering. See + the `cuDSS documentation + `_ + for more information. + """ + _get_scalar_attribute(self._config_ptr, ConfigParamEnum.ND_NLEVELS, self._nd_nlevels) + + return self._nd_nlevels.item() + + @nd_min_levels.setter + @utils.precondition(_check_valid_solver_wrapper) + def nd_min_levels(self, min_levels): + """ + Set the minimum number of levels for the nested dissection reordering. See the + `cuDSS documentation + `_ + for more information. + + Args: + min_levels: The minimum number of levels for the nested dissection reordering + (Python `int`). + + """ + _set_scalar_attribute(self._config_ptr, ConfigParamEnum.ND_NLEVELS, self._nd_nlevels, min_levels) + + @property + @utils.precondition(_check_valid_solver_wrapper) + def use_superpanels(self): + """ + Query or set the option to enable or disable superpanel optimization. See the + `cuDSS documentation + `_ + for more information. + """ + _get_scalar_attribute(self._config_ptr, ConfigParamEnum.USE_SUPERPANELS, self._use_superpanels) + + return self._use_superpanels.item() + + @use_superpanels.setter + @utils.precondition(_check_valid_solver_wrapper) + def use_superpanels(self, superpanels_flag): + """ + Set the option to enable or disable superpanel optimization. See the + `cuDSS documentation + `_ + for more information. + + Args: + superpanels_flag: The flag to enable or disable superpanel optimization + (Python `int` or `bool`, 0 to disable). 
+ + """ + _set_scalar_attribute(self._config_ptr, ConfigParamEnum.USE_SUPERPANELS, self._use_superpanels, superpanels_flag) + class FactorizationConfig: """ diff --git a/nvmath/sparse/_internal/cudss_data_ifc.py b/nvmath/sparse/_internal/cudss_data_ifc.py index 8141d6a..ff880c3 100644 --- a/nvmath/sparse/_internal/cudss_data_ifc.py +++ b/nvmath/sparse/_internal/cudss_data_ifc.py @@ -26,6 +26,18 @@ _tls.size_written = np.empty((1,), dtype=np.uint64) +# TODO: factor out common utilities (taken from fft.py). +def complex_to_real_equivalent(name): + assert "complex" in name, f"Internal Error ({name=})" + m = name.split("complex") + assert len(m) in (1, 2) + size = int(m[-1]) // 2 + if len(m) == 1: + return f"float{size}" + else: + return f"{m[0]}float{size}" + + def _get_attribute(handle, data_ptr, name, attribute, length=1): """ name = cudss enumerator for the attribute. @@ -80,10 +92,14 @@ def __init__(self, solver): self._N = self._solver._N self._batched = self._solver.batched + self._index_type = self._solver.index_type + self._value_type = self._solver.value_type + self._nsuperpanels = np.zeros((1,), dtype=self._index_type) # Allocate permutation arrays lazily, and only if not batched. self._perm_reorder_col = None self._perm_reorder_row = None + self._perm_matching = None self._memory_estimates = np.zeros((), dtype=memory_estimates_dtype).view(np.recarray) @@ -116,8 +132,7 @@ def col_permutation(self): raise RuntimeError("Column permutation is not available for batched systems.") if self._perm_reorder_col is None: - get_dtype = cudss.get_data_param_dtype - self._perm_reorder_col = np.empty((self._N,), dtype=get_dtype(DataParamEnum.PERM_REORDER_COL)) + self._perm_reorder_col = np.empty((self._N,), dtype=self._index_type) _get_attribute(self._handle, self._data_ptr, DataParamEnum.PERM_REORDER_COL, self._perm_reorder_col, length=self._N) @@ -136,13 +151,44 @@ def row_permutation(self): raise RuntimeError("Row permutation is not available for batched systems.") if self._perm_reorder_row is None: - get_dtype = cudss.get_data_param_dtype - self._perm_reorder_row = np.empty((self._N,), dtype=get_dtype(DataParamEnum.PERM_REORDER_ROW)) + self._perm_reorder_row = np.empty((self._N,), dtype=self._index_type) _get_attribute(self._handle, self._data_ptr, DataParamEnum.PERM_REORDER_ROW, self._perm_reorder_row, length=self._N) return self._perm_reorder_row + @property + @utils.precondition(_check_valid_solver_wrapper) + def matching_col_permutation(self): + """ + Query the matching (column) permutation after planning (reordering). See the + `cuDSS documentation + `_ + for more information. + """ + if self._batched: + raise RuntimeError("Matching (column) permutation is not available for batched systems.") + + if self._perm_matching is None: + self._perm_matching = np.empty((self._N,), dtype=self._index_type) + + _get_attribute(self._handle, self._data_ptr, DataParamEnum.PERM_MATCHING, self._perm_matching, length=self._N) + + return self._perm_matching + + @property + @utils.precondition(_check_valid_solver_wrapper) + def num_superpanels(self): + """ + Query the number of number of superpanels after planning (symbolic factorization). + See the `cuDSS documentation + `_ + for more information. 
+ """ + _get_attribute(self._handle, self._data_ptr, DataParamEnum.NSUPERPANELS, self._nsuperpanels) + + return self._nsuperpanels.item() + class FactorizationInfo: """ @@ -161,20 +207,27 @@ def __init__(self, solver): self._N = self._solver._N self._batched = self._solver.batched + self._index_type = self._solver.index_type self._value_type = self._solver.value_type get_dtype = cudss.get_data_param_dtype self._info = np.zeros((1,), dtype=get_dtype(DataParamEnum.INFO)) self._lu_nnz = np.zeros((1,), dtype=get_dtype(DataParamEnum.LU_NNZ)) - self._npivots = np.zeros((1,), dtype=get_dtype(DataParamEnum.NPIVOTS)) - self._inertia = np.zeros((2,), dtype=get_dtype(DataParamEnum.INERTIA)) + + # Allocate arrays that depend on the index or value type lazily. + self._npivots = None + self._inertia = None # Allocate permutation and diagonal arrays lazily, and only if not batched. self._perm_col = None self._perm_row = None self._diag = None + # Allocate matching row and col scale arrays lazily, and only if not batched. + self._scale_row = None + self._scale_col = None + def _check_valid_solver_wrapper(self, *args, **kwargs): _check_valid_solver(self) @@ -213,6 +266,9 @@ def npivots(self): `_ for more information. """ + if self._npivots is None: + self._npivots = np.empty((1,), dtype=self._index_type) + _get_attribute(self._handle, self._data_ptr, DataParamEnum.NPIVOTS, self._npivots) return self._npivots.item() @@ -227,6 +283,9 @@ def inertia(self): `_ for more information. """ + if self._inertia is None: + self._inertia = np.empty((2,), dtype=self._index_type) + _get_attribute(self._handle, self._data_ptr, DataParamEnum.INERTIA, self._inertia, length=2) return self._inertia @@ -244,8 +303,7 @@ def col_permutation(self): raise RuntimeError("Column permutation is not available for batched systems.") if self._perm_col is None: - get_dtype = cudss.get_data_param_dtype - self._perm_col = np.empty((self._N,), dtype=get_dtype(DataParamEnum.PERM_COL)) + self._perm_col = np.empty((self._N,), dtype=self._index_type) _get_attribute(self._handle, self._data_ptr, DataParamEnum.PERM_COL, self._perm_col, length=self._N) @@ -264,8 +322,7 @@ def row_permutation(self): raise RuntimeError("Row permutation is not available for batched systems.") if self._perm_row is None: - get_dtype = cudss.get_data_param_dtype - self._perm_row = np.empty((self._N,), dtype=get_dtype(DataParamEnum.PERM_ROW)) + self._perm_row = np.empty((self._N,), dtype=self._index_type) _get_attribute(self._handle, self._data_ptr, DataParamEnum.PERM_ROW, self._perm_row, length=self._N) @@ -289,3 +346,45 @@ def diag(self): _get_attribute(self._handle, self._data_ptr, DataParamEnum.DIAG, self._diag, length=self._N) return self._diag + + @property + @utils.precondition(_check_valid_solver_wrapper) + def row_scale_factors(self): + """ + Query the scale factors for the rows of the factorized system, if matching was + used. See the `cuDSS documentation + `_ + for more information. 
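# Descriptive note (not from the patch itself): the scale factors exposed here
# are real-valued even for complex systems; the body that follows maps the
# value type through the complex_to_real_equivalent() helper defined near the
# top of this module, for example:
#
#   complex_to_real_equivalent("complex64")   # -> "float32"
#   complex_to_real_equivalent("complex128")  # -> "float64"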
+ """ + if self._batched: + raise RuntimeError("The factorized system's row scale factors is not available for batched systems.") + + if self._scale_row is None: + vtype = self._value_type + dtype = complex_to_real_equivalent(vtype) if "complex" in vtype else vtype + self._scale_row = np.empty((self._N,), dtype=dtype) + + _get_attribute(self._handle, self._data_ptr, DataParamEnum.SCALE_ROW, self._scale_row, length=self._N) + + return self._scale_row + + @property + @utils.precondition(_check_valid_solver_wrapper) + def col_scale_factors(self): + """ + Query the scale factors for the columns of the factorized system, if matching was + used. See the `cuDSS documentation + `_ + for more information. + """ + if self._batched: + raise RuntimeError("The factorized system's column scale factors is not available for batched systems.") + + if self._scale_col is None: + vtype = self._value_type + dtype = complex_to_real_equivalent(vtype) if "complex" in vtype else vtype + self._scale_col = np.empty((self._N,), dtype=dtype) + + _get_attribute(self._handle, self._data_ptr, DataParamEnum.SCALE_COL, self._scale_col, length=self._N) + + return self._scale_col diff --git a/nvmath/sparse/advanced/_configuration.py b/nvmath/sparse/advanced/_configuration.py index e2da9ff..ee85158 100644 --- a/nvmath/sparse/advanced/_configuration.py +++ b/nvmath/sparse/advanced/_configuration.py @@ -41,7 +41,7 @@ class HybridMemoryModeOptions: ``True``. See :attr:`nvmath.bindings.cudss.ConfigParam.USE_CUDA_REGISTER_MEMORY`. - See Also: + .. seealso:: :class:`ExecutionHybrid`, :class:`DirectSolver`, :func:`direct_solver`. """ @@ -66,7 +66,7 @@ class ExecutionCUDA: hybrid_memory_mode_options: Options controlling the use of hybrid (CPU-GPU) memory as an object of type :class:`HybridMemoryModeOptions` or a `dict`. - See Also: + .. seealso:: :class:`ExecutionHybrid`, :class:`HybridMemoryModeOptions`, :class:`DirectSolver`, :func:`direct_solver`. """ @@ -88,7 +88,7 @@ class ExecutionHybrid: num_threads: The number of CPU threads used to execute the plan. If not specified, defaults to the number of CPU cores available to the process. - See Also: + .. seealso:: :class:`ExecutionCUDA`, :class:`DirectSolver`, :func:`direct_solver`. """ @@ -131,7 +131,7 @@ class DirectSolverOptions: handle: cuDSS library handle. A handle will be created if one is not provided. - See Also: + .. seealso:: :class:`ExecutionCUDA`, :class:`ExecutionHybrid`, :class:`DirectSolver`, and :func:`direct_solver`. """ diff --git a/nvmath/sparse/advanced/direct_solver.py b/nvmath/sparse/advanced/direct_solver.py index edd956d..601a641 100644 --- a/nvmath/sparse/advanced/direct_solver.py +++ b/nvmath/sparse/advanced/direct_solver.py @@ -35,7 +35,7 @@ from nvmath.internal.typemaps import NAME_TO_DATA_TYPE -VALID_INDEX_TYPES = ("int32",) +VALID_INDEX_TYPES = ("int32", "int64") VALID_DTYPES = ("float32", "float64", "complex64", "complex128") @@ -195,6 +195,10 @@ def calculate_strides(shape, axis_order): * The solution ``x`` always has the same form as the RHS ``b``. It is a sequence of matrices or vectors if ``b`` is explicitly-batched, or a higher-dimensional ndarray/tensor if ``b`` is implicitly-batched. + + .. tip:: For a description of the CSR sparse format, see + `here `_ + for example. 
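# Illustrative sketch (not from the patch itself): a concrete look at the CSR
# layout referenced in the tip above, using SciPy; any CSR-producing package
# exposes the same three arrays.
#
#   import numpy as np
#   import scipy.sparse as sp
#
#   dense = np.array([[4.0, 0.0], [1.0, 3.0]])
#   a = sp.csr_matrix(dense)
#   # a.data    -> [4., 1., 3.]   (nonzero values, row by row)
#   # a.indices -> [0, 0, 1]      (column index of each stored value)
#   # a.indptr  -> [0, 1, 3]      (row i spans data[indptr[i]:indptr[i+1]])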
""".strip(), } ) @@ -204,6 +208,16 @@ class InvalidDirectSolverState(Exception): pass +def _check_cudss_version(): + required = (0, 7) + available = cudss.get_property(0), cudss.get_property(1) + if available != required: + raise RuntimeError( + f"nvmath-python requires cuDSS version {'.'.join(str(i) for i in required)}, while you " + f"have cuDSS version {'.'.join(str(i) for i in available)}." + ) + + @utils.docstring_decorator(SHARED_DSS_DOCUMENTATION, skip_missing=False) class DirectSolver: """ @@ -249,7 +263,10 @@ class DirectSolver: stream: {stream} - See Also: + Semantics: + {semantics} + + .. seealso:: :class:`DirectSolverPlanConfig`, :class:`DirectSolverFactorizationConfig`, :class:`DirectSolverSolutionConfig`, :class:`DirectSolverPlanInfo`, :class:`DirectSolverFactorizationInfo`, :class:`DirectSolverOptions`, @@ -325,7 +342,7 @@ class DirectSolver: >>> x = solver.solve() - Finally, free the object's resources. To avoid having to explicitly making this + Finally, free the object's resources. To avoid having to explicitly make this call, it's recommended to use the DirectSolver object as a context manager as shown below, if possible. @@ -404,8 +421,13 @@ def __init__( execution: ExecutionCUDA | ExecutionHybrid | None = None, stream: utils.AnyStream | int | None = None, ): + # Check if the required cuDSS version is available. + _check_cudss_version() + # Process options. - self.options: Any = utils.check_or_create_options(DirectSolverOptions, options, "sparse direct solver options") + self.options: DirectSolverOptions = utils.check_or_create_options( + DirectSolverOptions, options, "sparse direct solver options" + ) # type: ignore[assignment] # Process execution options. The default execution space is CUDA. self.execution_options = utils.check_or_create_one_of_options( @@ -452,7 +474,7 @@ def __init__( self.batched = self.explicitly_batched_lhs or self.implicitly_batched_lhs # The LHS batch shape should be empty for explicit batching. - self.lhs_batch_shape: Any = None if self.explicitly_batched_lhs else tuple(self.a.shape[:-2]) + self.lhs_batch_shape = None if self.explicitly_batched_lhs else tuple(self.a.shape[:-2]) # Set the LHS package. if self.explicitly_batched_lhs: @@ -468,7 +490,7 @@ def __init__( if self.explicitly_batched_lhs: self.batch_count = len(self.a) elif self.implicitly_batched_lhs: - self.batch_count = math.prod(self.lhs_batch_shape) + self.batch_count = math.prod(self.lhs_batch_shape) # type: ignore[arg-type] # Create the sequence of batch coordinates to use for creating batched CSR # matrix type. self.batch_indices = tuple(itertools.product(*list(map(range, self.lhs_batch_shape)))) # type: ignore @@ -566,13 +588,19 @@ def __init__( self._N = n self.lhs_nnz = self.a.values.size - # Note that torch by default uses int64 which is not supported. SciPy and CuPy adapt - # the index type based on the dimension. + # Note that torch by default uses int64 which doesn't seem to give correct results + # for batched solves. SciPy and CuPy adapt the index type based on the dimension. if self.index_type not in VALID_INDEX_TYPES: raise TypeError( f"The index type {self.index_type} is not supported. The supported index types are {VALID_INDEX_TYPES}." ) + if self.index_type == "int64" and self.batch_count > 1: + raise RuntimeError( + "The index type 'int64' is not supported for batched solve. 
The supported index types are: " + f"{', '.join(set(VALID_INDEX_TYPES) - {'int64'})}" + ) + if self.value_type not in VALID_DTYPES: raise TypeError( f"The dtype (value type) {self.value_type} is not supported. The supported dtypes are {VALID_DTYPES}." @@ -926,7 +954,7 @@ def plan_config(self): A :class:`DirectSolverPlanConfig` object, whose attributes can be set (or queried) to configure the planning phase. - See Also: + .. seealso:: :class:`DirectSolverPlanConfig`, :meth:`plan`. """ return self._plan_config @@ -940,7 +968,7 @@ def factorization_config(self): A :class:`DirectSolverFactorizationConfig` object, whose attributes can be set (or queried) to configure the factorization phase. - See Also: + .. seealso:: :class:`DirectSolverFactorizationConfig`, :meth:`factorize`. """ return self._factorization_config @@ -954,7 +982,7 @@ def solution_config(self): A :class:`DirectSolverSolutionConfig` object, whose attributes can be set (or queried) to configure the factorization phase. - See Also: + .. seealso:: :class:`DirectSolverSolutionConfig`, :meth:`solve`. """ return self._solution_config @@ -968,7 +996,7 @@ def plan_info(self): A :class:`DirectSolverPlanInfo` object, whose attributes can be queried for information regarding the planning phase. - See Also: + .. seealso:: :class:`DirectSolverPlanInfo`, :meth:`plan`. """ return self._plan_info @@ -984,7 +1012,7 @@ def factorization_info(self): A :class:`DirectSolverFactorizationInfo` object, whose attributes can be queried for information regarding the factorization phase. - See Also: + .. seealso:: :class:`DirectSolverFactorizationInfo`, :meth:`factorize`. """ return self._factorization_info @@ -1295,7 +1323,7 @@ def plan(self, *, stream: utils.AnyStream | None = None): A :class:`DirectSolverPlanInfo` object, whose attributes can be queried for information regarding the plan. - See Also: + .. seealso:: :attr:`plan_config`, :class:`DirectSolverPlanConfig`, :class:`DirectSolverPlanInfo`. @@ -1326,8 +1354,7 @@ def plan(self, *, stream: utils.AnyStream | None = None): ... # returns a DirectSolverPlanInfo object. ... plan_info = solver.plan() ... # Query the column permutation, memory estimates, ... - ... plan_info.col_permutation - array([6, 1, 4, 0, 3, 2, 5, 7], dtype=int32) + ... col_perm = plan_info.col_permutation Further examples can be found in the `nvmath/examples/sparse/advanced/direct_solver `_ @@ -1359,7 +1386,7 @@ def factorize(self, *, stream: utils.AnyStream | None = None): A :class:`DirectSolverFactorizationInfo` object, whose attributes can be queried for information regarding the numerical factorization. - See Also: + .. seealso:: :attr:`factorization_config`, :class:`DirectSolverFactorizationConfig`, :class:`DirectSolverFactorizationInfo`. @@ -1421,7 +1448,7 @@ def solve(self, *, stream: utils.AnyStream | None = None): Returns: {result} - See Also: + .. seealso:: :attr:`solution_config`, :class:`DirectSolverSolutionConfig`. Examples: @@ -1547,6 +1574,7 @@ def free(self): # Release internal resource references. self.resources_a = self.resources_b = self.resources_x = None self.resources_ra = self.resources_rb = self.resources_rx = None + self.a = self.b = None # Free matrix pointers. cudss.matrix_destroy(self.x_ptr) @@ -1623,7 +1651,7 @@ def direct_solver( Semantics: {semantics} - See Also: + .. seealso:: :class:`DirectSolver`, :class:`DirectSolverOptions`, :class:`ExecutionCUDA`, :class:`ExecutionHybrid`. 
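# Illustrative sketch (not from the patch itself): it strings together the new
# cuDSS options introduced above (use_matching, nd_min_levels, use_superpanels,
# num_superpanels) with the existing DirectSolver workflow. Exposure of the
# config attributes through `plan_config`, and of `num_superpanels` through the
# object returned by `plan()`, is assumed from their placement in the
# config/data interfaces and is not verified against the public API.
import numpy as np
import scipy.sparse as sp
import nvmath

n = 8
a = sp.random(n, n, density=0.5, format="csr") + sp.identity(n, format="csr")
b = np.ones(n)

with nvmath.sparse.advanced.DirectSolver(a, b) as solver:
    cfg = solver.plan_config
    cfg.use_matching = True     # enable matching (new option in this patch)
    cfg.nd_min_levels = 2       # minimum nested-dissection levels (new option)
    cfg.use_superpanels = True  # superpanel optimization (new option)
    plan_info = solver.plan()
    nsp = plan_info.num_superpanels  # new query (assumed on the plan-info object)
    solver.factorize()
    x = solver.solve()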
diff --git a/nvmath/tensor/__init__.py b/nvmath/tensor/__init__.py new file mode 100644 index 0000000..90cf70a --- /dev/null +++ b/nvmath/tensor/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from ._configuration import * # noqa: F403 +from .contract import * # noqa: F403 diff --git a/nvmath/tensor/_configuration.py b/nvmath/tensor/_configuration.py new file mode 100644 index 0000000..62774a3 --- /dev/null +++ b/nvmath/tensor/_configuration.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +__all__ = [ + "ContractionAlgo", + "ContractionAutotuneMode", + "ContractionJitMode", + "ContractionCacheMode", + "ContractionOptions", + "ExecutionCUDA", +] + + +from dataclasses import dataclass +from logging import Logger +from typing import ClassVar, Literal + +from nvmath.bindings import cutensor +from nvmath.memory import BaseCUDAMemoryManager + + +ContractionAlgo = cutensor.Algo +ContractionAutotuneMode = cutensor.AutotuneMode +ContractionJitMode = cutensor.JitMode +ContractionCacheMode = cutensor.CacheMode + + +@dataclass +class ContractionOptions: + """ + A data class for providing options to the :class:`BinaryContraction` and + :class:`TernaryContraction` objects, or the wrapper functions + :func:`binary_contraction`and :func:`ternary_contraction`. + + Attributes: + compute_type: The compute type to use for the contraction. + See :class:`~nvmath.tensor.ComputeDesc` for available compute types. + logger (logging.Logger): Python Logger object. The root logger will be used if a + logger object is not provided. + + blocking: A flag specifying the behavior of the execution functions and methods, + such as :func:`binary_contraction` and :meth:`TernaryContraction.execute`. + When ``blocking`` is `True`, the execution methods do not return until the + operation is complete. When + ``blocking`` is ``"auto"``, the methods return immediately when the input tensor + is on the GPU. The execution methods always block when the input tensor is + on the CPU to ensure that the user doesn't inadvertently use the result + before it becomes available. The default is ``"auto"``. + + handle: cuTensor library handle. A handle will be created if one is not provided. + + allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used + to draw device memory. If an allocator is not provided, a memory allocator from + the library package will be used (:func:`torch.cuda.caching_allocator_alloc` for + PyTorch operands, :func:`cupy.cuda.alloc` otherwise). + + memory_limit: Maximum memory available to the contraction operation. + It can be specified as a value (with optional suffix like K[iB], M[iB], + G[iB]) or as a percentage. The default is 80% of the device memory. + + """ + + compute_type: int | None = None + logger: Logger | None = None + blocking: Literal[True, "auto"] = "auto" + handle: int | None = None + allocator: BaseCUDAMemoryManager | None = None + memory_limit: int | str | None = r"80%" + + def __post_init__(self): + if self.blocking not in (True, "auto"): + raise ValueError("The value specified for 'blocking' must be either True or 'auto'.") + + +@dataclass +class ExecutionCUDA: + """ + A data class for providing GPU execution options to the :class:`BinaryContraction` and + :class:`TernaryContraction` objects, or the wrapper functions + :func:`binary_contraction`and :func:`ternary_contraction`. 
+ + Attributes: + device_id: CUDA device ordinal (only used if the operand resides on the CPU). The + default value is 0. + + """ + + name: ClassVar[Literal["cuda"]] = "cuda" + device_id: int = 0 diff --git a/nvmath/tensor/_internal/__init__.py b/nvmath/tensor/_internal/__init__.py new file mode 100644 index 0000000..831c565 --- /dev/null +++ b/nvmath/tensor/_internal/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/tensor/_internal/cutensor_config_ifc.py b/nvmath/tensor/_internal/cutensor_config_ifc.py new file mode 100644 index 0000000..02b777f --- /dev/null +++ b/nvmath/tensor/_internal/cutensor_config_ifc.py @@ -0,0 +1,244 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Interface classes to encapsulate low-level calls to get or set configuration information. +""" + +__all__ = ["ContractionPlanPreference"] + + +import numpy as np + +from nvmath.bindings import cutensor +from nvmath.internal import utils + + +class ContractionPlanPreference: + """ + An interface to configure :meth:`nvmath.tensor.BinaryContraction.plan` and + :meth:`nvmath.tensor.TernaryContraction.plan`. The + current configuration can also be queried. + """ + + def __init__(self, contraction): + """ + ctor for internal use only. + """ + self._contraction = contraction + self._handle = self._contraction.handle + + get_dtype = cutensor.get_plan_preference_attribute_dtype + + self._autotune_mode = np.zeros((1,), dtype=get_dtype(cutensor.PlanPreferenceAttribute.AUTOTUNE_MODE)) + self._cache_mode = np.zeros((1,), dtype=get_dtype(cutensor.PlanPreferenceAttribute.CACHE_MODE)) + self._incremental_count = np.zeros((1,), dtype=get_dtype(cutensor.PlanPreferenceAttribute.INCREMENTAL_COUNT)) + self._algo = np.zeros((1,), dtype=get_dtype(cutensor.PlanPreferenceAttribute.ALGO)) + self._kernel_rank = np.zeros((1,), dtype=get_dtype(cutensor.PlanPreferenceAttribute.KERNEL_RANK)) + self._jit = np.zeros((1,), dtype=get_dtype(cutensor.PlanPreferenceAttribute.JIT)) + + def _check_valid_contraction_wrapper(self, *args, **kwargs): + if not self._contraction.valid_state: + raise RuntimeError("The ContractionPlanPreference object cannot be used after its contraction object is free'd.") + + @staticmethod + def _get_scalar_attribute(contraction, name, attribute): + """ + name = cutensor PlanPreference enum for the attribute + attribute = numpy ndarray object into which the value is stored by cutensornet + """ + raise AttributeError("cuTensor does not support a getter for plan preference attributes.") + + @staticmethod + def _set_scalar_attribute(contraction, name, attribute, value): + """ + name = cutensor PlanPreference enum for the attribute + attribute = numpy ndarray object into which the value is stored + value = the value to set the the attribute to + """ + assert contraction.plan_preference_ptr is not None, "Internal error" + attribute[0] = value + cutensor.plan_preference_set_attribute( + contraction.handle, contraction.plan_preference_ptr, name, attribute.ctypes.data, attribute.dtype.itemsize + ) + + @property + @utils.precondition(_check_valid_contraction_wrapper) + def autotune_mode(self): + """ + Query the autotune mode. See the + `cuTensor documentation + `__ + for more information. 
+ """ + ContractionPlanPreference._get_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.AUTOTUNE_MODE, self._autotune_mode + ) + return self._autotune_mode.item() + + @autotune_mode.setter + @utils.precondition(_check_valid_contraction_wrapper) + def autotune_mode(self, autotune_mode): + """ + Set the autotune mode. See the + `cuTensor documentation + `__ + for more information. + + Args: + autotune_mode: (nvmath.tensor.ContractionAutotuneMode) The autotune mode. + + """ + ContractionPlanPreference._set_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.AUTOTUNE_MODE, self._autotune_mode, autotune_mode + ) + + @property + @utils.precondition(_check_valid_contraction_wrapper) + def cache_mode(self): + """ + Query the cache mode. See the + `cuTensor documentation + `__ + for more information. + """ + ContractionPlanPreference._get_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.CACHE_MODE, self._cache_mode + ) + return self._cache_mode.item() + + @cache_mode.setter + @utils.precondition(_check_valid_contraction_wrapper) + def cache_mode(self, cache_mode): + """ + Set the cache mode. See the + `cuTensor documentation + `__ + for more information. + + Args: + cache_mode: (nvmath.tensor.ContractionCacheMode) The cache mode. + + """ + ContractionPlanPreference._set_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.CACHE_MODE, self._cache_mode, cache_mode + ) + + @property + @utils.precondition(_check_valid_contraction_wrapper) + def incremental_count(self): + """ + Query the incremental count. See the + `cuTensor documentation + `__ + for more information. + """ + ContractionPlanPreference._get_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.INCREMENTAL_COUNT, self._incremental_count + ) + return self._incremental_count.item() + + @incremental_count.setter + @utils.precondition(_check_valid_contraction_wrapper) + def incremental_count(self, incremental_count): + """ + Set the incremental count. See the + `cuTensor documentation + `__ + for more information. + + Args: + incremental_count: The incremental count. + + """ + ContractionPlanPreference._set_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.INCREMENTAL_COUNT, self._incremental_count, incremental_count + ) + + @property + @utils.precondition(_check_valid_contraction_wrapper) + def algo(self): + """ + Query the algo. See the + `cuTensor documentation + `__ + for more information. + """ + ContractionPlanPreference._get_scalar_attribute(self._contraction, cutensor.PlanPreferenceAttribute.ALGO, self._algo) + return self._algo.item() + + @algo.setter + @utils.precondition(_check_valid_contraction_wrapper) + def algo(self, algo): + """ + Set the algo. See the + `cuTensor documentation + `__ + for more information. + + Args: + algo: (nvmath.tensor.ContractionAlgo) The contraction algorithm. + + """ + ContractionPlanPreference._set_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.ALGO, self._algo, algo + ) + + @property + @utils.precondition(_check_valid_contraction_wrapper) + def kernel_rank(self): + """ + Query the kernel rank. See the + `cuTensor documentation + `__ + for more information. 
+ """ + ContractionPlanPreference._get_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.KERNEL_RANK, self._kernel_rank + ) + return self._kernel_rank.item() + + @kernel_rank.setter + @utils.precondition(_check_valid_contraction_wrapper) + def kernel_rank(self, kernel_rank): + """ + Set the kernel rank. See the + `cuTensor documentation + `__ + for more information. + + Args: + kernel_rank: The kernel rank. + + """ + ContractionPlanPreference._set_scalar_attribute( + self._contraction, cutensor.PlanPreferenceAttribute.KERNEL_RANK, self._kernel_rank, kernel_rank + ) + + @property + @utils.precondition(_check_valid_contraction_wrapper) + def jit(self): + """ + Query the jit compilation mode. See the + `cuTensor documentation + `__ + for more information. + """ + ContractionPlanPreference._get_scalar_attribute(self._contraction, cutensor.PlanPreferenceAttribute.JIT, self._jit) + return self._jit.item() + + @jit.setter + @utils.precondition(_check_valid_contraction_wrapper) + def jit(self, jit): + """ + Set the jit compilation mode. See the + `cuTensor documentation + `__ + for more information. + + Args: + jit: (nvmath.tensor.ContractionJitMode) The JIT compilation mode. + + """ + ContractionPlanPreference._set_scalar_attribute(self._contraction, cutensor.PlanPreferenceAttribute.JIT, self._jit, jit) diff --git a/nvmath/tensor/_internal/data.py b/nvmath/tensor/_internal/data.py new file mode 100644 index 0000000..f54eba7 --- /dev/null +++ b/nvmath/tensor/_internal/data.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +# Defined in CPython: +# https://github.com/python/cpython/blob/26bc2cc06128890ac89492eca20e83abe0789c1c/Objects/unicodetype_db.h#L6311-L6349 + +__all__ = ["WHITESPACE_UNICODE"] + +_WHITESPACE_UNICODE_INTS = [ + 0x0009, + 0x000A, + 0x000B, + 0x000C, + 0x000D, + 0x001C, + 0x001D, + 0x001E, + 0x001F, + 0x0020, + 0x0085, + 0x00A0, + 0x1680, + 0x2000, + 0x2001, + 0x2002, + 0x2003, + 0x2004, + 0x2005, + 0x2006, + 0x2007, + 0x2008, + 0x2009, + 0x200A, + 0x2028, + 0x2029, + 0x202F, + 0x205F, + 0x3000, +] + + +WHITESPACE_UNICODE = "".join(chr(s) for s in _WHITESPACE_UNICODE_INTS) diff --git a/nvmath/tensor/_internal/einsum_parser.py b/nvmath/tensor/_internal/einsum_parser.py new file mode 100644 index 0000000..7b9c542 --- /dev/null +++ b/nvmath/tensor/_internal/einsum_parser.py @@ -0,0 +1,445 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +A collection of functions for parsing Einsum expressions. +""" + +from collections import Counter +from itertools import chain +from functools import lru_cache + +from ...internal import formatters, tensor_wrapper +from .data import WHITESPACE_UNICODE + + +DISALLOWED_LABELS = {".", "-", ">"} +native_to_str = lambda native: "'" + "".join(s if s is not Ellipsis else "..." for s in native) + "'" + + +def select_morpher(mapper=None): + """ + Select appropriate function for mode label representation based on string format. + """ + if mapper is None: + return native_to_str + + return lambda s: native_to_str(mapper(s)) + + +class ModeLabelMapper: + """ + Map mode labels, with special treatment for Ellipsis characters. + """ + + def __init__(self, _map): + """ + Args: + _map = dict-like object to map mode labels. 
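# Descriptive note (not from the patch itself): __call__ below translates every
# label through the map while a literal Ellipsis passes through untouched, e.g.
#
#   mapper = ModeLabelMapper({"i": 0, "j": 1})
#   mapper(("i", Ellipsis, "j"))  # -> (0, Ellipsis, 1)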
+ """ + self._map = _map + + def __call__(self, sequence): + return tuple(s if s is Ellipsis else self._map[s] for s in sequence) + + +def parse_single(single): + """ + Parse single operand mode labels considering ellipsis. + Leading or trailing whitespace, if present, is removed. + """ + whitespace = WHITESPACE_UNICODE + subexpr = single.strip(whitespace).split("...") + n = len(subexpr) + expr = [[Ellipsis]] * (2 * n - 1) + expr[::2] = subexpr + + return tuple(chain(*expr)) + + +def check_single(single): + """ + Check for disallowed characters used as mode labels for a single operand. + """ + whitespace = WHITESPACE_UNICODE + for s in single: + if s is Ellipsis: + continue + if s in whitespace or s in DISALLOWED_LABELS: + return False + + return True + + +@lru_cache(maxsize=128) +def parse_einsum_str(expr): + """ + Parse einsum expression in string format, retaining ellipses if present. + + Return operand as well as output mode labels if explicit form or None for + implicit form. + """ + inputs, output, *rest = expr.split("->") if "->" in expr else (expr, None) + if rest: + raise ValueError("""Invalid expression. +It is not permitted to specify more than one '->' in the Einstein summation +expression.""") + + inputs = list(parse_single(_input) for _input in inputs.split(",")) # noqa: C400 + if output is not None: + output = parse_single(output) + + checks = [check_single(_input) for _input in inputs] + if not all(checks): + incorrect = [ + f"{location}: {native_to_str(inputs[location])}" for location, predicate in enumerate(checks) if predicate is False + ] + incorrect = formatters.array2string(incorrect) + message = f"""Incorrect term. +Whitespace characters and characters from the set {DISALLOWED_LABELS} cannot be +used as mode labels in a summation expression. +The incorrectly specified terms as a sequence of "position: term" are: +\n{incorrect}""" + raise ValueError(message) + + return inputs, output + + +def check_ellipses(user_inputs, morpher): + """ + Check ellipsis specification for validity. + + Args: + user_inputs: Einsum expression in "neutral format" + (sequence of sequences) before mapping. + morpher: A callable that transforms a term in neutral format + (sequence) to string format. + """ + + checks = [user_input.count(Ellipsis) <= 1 for user_input in user_inputs] + if not all(checks): + incorrect = [ + f"{location}: {morpher(user_inputs[location])}" for location, predicate in enumerate(checks) if predicate is False + ] + incorrect = formatters.array2string(incorrect) + message = f"""Incorrect ellipsis use. +There must not be more than one ellipsis present in each term. +The incorrectly specified terms as a sequence of "position: term" are: +\n{incorrect}""" + raise ValueError(message) + + +def check_einsum_with_operands(user_inputs, operands, morpher): + """ + Check that the number of modes in each Einsum term is consistent + with the shape of the corresponding operand. + + Args: + operands: Wrapped operands. + user_inputs: Einsum expression in "neutral format" + (sequence of sequences) before mapping. + morpher: A callable that transforms a term in neutral format + (sequence) to string format. 
+ """ + + checks = [ + len(i) - 1 <= len(o.shape) if Ellipsis in i else len(i) == len(o.shape) + for i, o in zip(user_inputs, operands, strict=False) + ] + if not all(checks): + mismatch = [ + f"{location}: {morpher(user_inputs[location])} <=> {operands[location].shape}" + for location, predicate in enumerate(checks) + if predicate is False + ] + mismatch = formatters.array2string(mismatch) + message = f"""Term-operand shape mismatch. +The number of mode labels in each term of the expression must match the shape +of the corresponding operand. +The mismatch as a sequence of "position: mode labels in term <=> operand shape" +is: \n{mismatch}""" + raise ValueError(message) + + +def map_modes(user_inputs, user_output, num_extra_labels, morpher): + """ + Map modes in user-defined inputs and output to ordinals, leaving ellipsis + for later processing. Create extra mode labels in anticipation of ellipsis + replacement. Create the forward as well as inverse maps. + + Args: + user_inputs: Einsum expression in "neutral format" + (sequence of sequences) before mapping. + user_output: The output mode labels before mapping as a sequence or None. + num_extra_labels: The number of extra mode labels to generate + to use in ellipsis expansion later. + morpher: A callable that transforms a term in neutral format + (sequence) to string format. + + Returns: + tuple: A 5-tuple containing (mapped input, mapped output, forward map, + reverse map, largest label). + """ + + ordinal = 0 + mode_map_user_to_ord = {} + for modes in user_inputs: + for mode in modes: + if mode not in mode_map_user_to_ord: + mode_map_user_to_ord[mode] = ordinal + ordinal += 1 + + mode_map_user_to_ord.update((f"__{i}__", i) for i in range(ordinal, ordinal + num_extra_labels)) + label_end = ordinal + num_extra_labels + + mode_map_ord_to_user = {v: k for k, v in mode_map_user_to_ord.items()} + + inputs = list(tuple(m if m is Ellipsis else mode_map_user_to_ord[m] for m in modes) for modes in user_inputs) # noqa: C400 + + output = None + if user_output is not None: + extra = set(user_output) - set(mode_map_user_to_ord.keys()) - {Ellipsis} + if extra: + output_modes = morpher(user_output) + message = f"""Extra modes in output. +The specified output modes {output_modes} contain the extra modes: {extra}""" + raise ValueError(message) + output = tuple(m if m is Ellipsis else mode_map_user_to_ord[m] for m in user_output) + + return inputs, output, mode_map_user_to_ord, mode_map_ord_to_user, label_end + + +def create_size_dict(inputs, operands): + """ + Create size dictionary (mode label to extent map) capturing the extent of + each mode. + + Args: + inputs: Einsum expression in "neutral format" (sequence of sequences) + after relabelling modes. + operands: Wrapped operands. + + Returns: + size_dict: size dictionary. + """ + + size_dict = {} + for i, _input in enumerate(inputs): + for m, mode in enumerate(_input): + shape = operands[i].shape + if mode in size_dict: + if size_dict[mode] == 1: # Handle broadcasting + size_dict[mode] = shape[m] + elif size_dict[mode] != shape[m] and shape[m] != 1: + message = f"""Extent mismatch. +The extent ({shape[m]}) of mode {m} for operand {i} does not match the extent +({size_dict[mode]}) of the same mode found in previous operand(s).""" + raise ValueError(message) + else: + size_dict[mode] = shape[m] + + return size_dict + + +def infer_output_mode_labels(inputs, mode_map_ord_to_user=None): + """ + Infer output mode labels (those that appear exactly once). 
+ + Args: + inputs: Einsum expression in "neutral format" (sequence of sequences). + If `mode_map_ord_to_user` is provided, the mode labels correspond to + ordinals, otherwise they correspond to user labels. + mode_map_ord_to_user: the map from ordinals to user labels. + """ + mode_label_freq = Counter(chain(*inputs)) + del mode_label_freq[Ellipsis] + + key = None if mode_map_ord_to_user is None else lambda m: mode_map_ord_to_user[m] + return tuple(sorted((m for m, c in mode_label_freq.items() if c == 1), key=key)) + + +def process_ellipses(inputs, output, operands, label_end, mode_map_ord_to_user, mapping_morpher): + """ + Replace ellipses by generated mode labels, using 'label_end' and aligning + shapes from the right. Infer or update output mode labels. + + Args: + inputs: Einsum expression in "neutral format" (sequence of sequences) + after relabelling modes. + output: The output mode labels after relabelling as a sequence or None. + operands: Wrapped operands. + label_end: One past the largest mode label (int), including modes + resulting from Ellipsis expansion. + mode_map_ord_to_user: the map from ordinals to user labels. + mapping_morpher: A callable that transforms a term in neutral format + (sequence) to string format, while converting internal labels to + user labels. + + Returns: + tuple: a 2-tuple (inputs, output) after ellipsis expansion and inferring + output mode labels if needed. + """ + + inferred = False + if output is None: + output = infer_output_mode_labels(inputs, mode_map_ord_to_user) + inferred = True + + shortest, longest = label_end, 0 + for i, _input in enumerate(inputs): + if Ellipsis not in _input: + continue + + n = len(operands[i].shape) - (len(_input) - 1) + assert n >= 0, "Internal error" + + s = _input.index(Ellipsis) + shortest, longest = min(shortest, n), max(longest, n) + inputs[i] = _input[:s] + tuple(range(label_end - n, label_end)) + _input[s + 1 :] + + if not inferred: + count = output.count(Ellipsis) + if count > 1: + message = f"""Incorrect ellipsis use. +The output term cannot have more than one ellipsis. +Specified term = {mapping_morpher(output)}""" + raise ValueError(message) + if count == 1: # Replace ellipsis by the longest sequence of labels. + s = output.index(Ellipsis) + output = output[:s] + tuple(range(label_end - longest, label_end)) + output[s + 1 :] + else: # If all ellipses expand to the same number of mode labels, the latter are reduced. + if shortest != longest: + message = f"""Ellipsis length mismatch for reduction. +The ellipses specified in the expression do not expand to the same number of +mode labels and thus cannot be reduced. The expanded number of dimensions +ranges from {shortest} to {longest}.""" + raise ValueError(message) + else: # The mode labels corresponding to ellipsis expansion followed by the inferred mode labels. + output = tuple(range(label_end - longest, label_end)) + output + + return inputs, output + + +def parse_einsum(expr, *operands): + """ + Parse the generalized Einstein summation expression in string format. Any + hashable and comparable object is accepted for mode label specification, and + unicode strings are accepted. If the output is not provided (implicit form + or missing output sublist), it will be inferred from the expression. + + Returns wrapped operands, mapped inputs and output, size dictionary based on + internal mode numbers, and the forward as well as the reverse mode maps. + """ + + # Parse einsum keeping ellipses. 
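+ # For example, an implicit expression like "ij,jk" with two 2-D operands
+ # parses to inputs [('i', 'j'), ('j', 'k')] and output None; the output
+ # modes are then inferred below as those appearing exactly once ('i' and
+ # 'k' here).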
+ + inputs, output = parse_einsum_str(expr) + + num_operand, num_input = len(operands), len(inputs) + if num_operand != num_input: + message = f"""Operand-term mismatch. +The number of operands ({num_operand}) must match the number of inputs +({num_input}) specified in the Einsum expression.""" + raise ValueError(message) + + morpher = select_morpher() + + if num_operand < 1: + message = f"""The number of operands must be either 2 (binary contraction) or 3 (ternary contraction), +found {num_operand}.""" + raise ValueError(message) + + # First wrap operands. + operands = tensor_wrapper.wrap_operands(operands) + + # Preliminary checks, before mode label remapping. + + ellipses = any(Ellipsis in _input for _input in inputs) + + # Ensure at most one ellipsis per operand. + if ellipses: + check_ellipses(inputs, morpher) + + # Ensure that ellipsis is not present only in the output. + if not ellipses and output is not None and Ellipsis in output: + message = f"""Invalid ellipsis specification. +The output term {morpher(output)} contains ellipsis while none of the input +terms do.""" + raise ValueError(message) + + # Ensure that the number of modes is consistent with the operand shape. + check_einsum_with_operands(inputs, operands, morpher) + + # Calculate the maximum number of extra mode labels that will be needed. + num_extra_labels = max(len(o.shape) for o in operands) if ellipses else 0 + + # Map data to ordinals for cutensornet. + inputs, output, mode_map_user_to_ord, mode_map_ord_to_user, label_end = map_modes(inputs, output, num_extra_labels, morpher) + + has_user_output = output is not None + + mapper = ModeLabelMapper(mode_map_ord_to_user) + mapping_morpher = select_morpher(mapper) + + # Ellipsis expansion. + if ellipses: + inputs, output = process_ellipses(inputs, output, operands, label_end, mode_map_ord_to_user, mapping_morpher) + elif output is None: + output = infer_output_mode_labels(inputs, mode_map_ord_to_user) + + # Create mode-extent map based on internal mode numbers. + size_dict = create_size_dict(inputs, operands) + + return operands, inputs, output, has_user_output, size_dict, mode_map_user_to_ord, mode_map_ord_to_user, ellipses + + +def parse_elementary_einsum(inputs, output, a, b, c): + morpher = select_morpher() + + # Preliminary checks, before mode label remapping. + ellipses = any(Ellipsis in _input for _input in inputs) + + # Ensure at most one ellipsis per operand. + if ellipses: + check_ellipses(inputs, morpher) + + # Ensure that ellipsis is not present only in the output. + if not ellipses and output is not None and Ellipsis in output: + message = f"""Invalid ellipsis specification. +The output term {morpher(output)} contains ellipsis while none of the input +terms do.""" + raise ValueError(message) + + # Ensure that the number of modes is consistent with the operand shape. + num_inputs = len(inputs) + if num_inputs == 2: + operands = [a, b] + elif num_inputs == 3: + operands = [a, b, c] + else: + raise ValueError(f"Invalid number of inputs: {num_inputs}") + + check_einsum_with_operands(inputs, operands, morpher) + + # Calculate the maximum number of extra mode labels that will be needed. + num_extra_labels = max(len(o.shape) for o in operands) if ellipses else 0 + + # Map data to ordinals for cutensornet. 
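+ # map_modes() returns the relabelled inputs and output together with the
+ # forward (user label -> ordinal) and reverse (ordinal -> user label) maps,
+ # plus label_end, one past the largest mode label.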
+ inputs, output, mode_map_user_to_ord, mode_map_ord_to_user, label_end = map_modes(inputs, output, num_extra_labels, morpher) + + has_user_output = output is not None + + mapper = ModeLabelMapper(mode_map_ord_to_user) + mapping_morpher = select_morpher(mapper) + + # Ellipsis expansion. + if ellipses: + inputs, output = process_ellipses(inputs, output, operands, label_end, mode_map_ord_to_user, mapping_morpher) + elif output is None: + output = infer_output_mode_labels(inputs, mode_map_ord_to_user) + + # Create mode-extent map based on internal mode numbers. + size_dict = create_size_dict(inputs, operands) + + return inputs, output, has_user_output, size_dict, mode_map_user_to_ord, mode_map_ord_to_user, ellipses diff --git a/nvmath/tensor/_internal/typemaps.py b/nvmath/tensor/_internal/typemaps.py new file mode 100644 index 0000000..70771ed --- /dev/null +++ b/nvmath/tensor/_internal/typemaps.py @@ -0,0 +1,59 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +__all__ = ["get_default_compute_type_from_dtype_name", "get_supported_compute_types"] + +import threading + +from nvmath.bindings.cutensor import ComputeDesc # type: ignore + +DEFAULT_COMPUTE_TYPE: dict[str, int] = {} # dtype_name -> default compute type +SUPPORTED_COMPUTE_TYPES: dict[str, set[int]] = {} # dtype_name -> set of compute types supported +_typemap_lock = threading.Lock() + + +def _maybe_initialize_compute_type_map(): + # https://docs.nvidia.com/cuda/cutensor/latest/api/cutensor.html#cutensorcreatecontraction + global DEFAULT_COMPUTE_TYPE, SUPPORTED_COMPUTE_TYPES + if not DEFAULT_COMPUTE_TYPE: # fast path if the map is already initialized + with _typemap_lock: + if not DEFAULT_COMPUTE_TYPE: # double-check to avoid race condition + DEFAULT_COMPUTE_TYPE = { + "float16": ComputeDesc.COMPUTE_32F(), + "bfloat16": ComputeDesc.COMPUTE_32F(), + "float32": ComputeDesc.COMPUTE_32F(), + "complex64": ComputeDesc.COMPUTE_32F(), + "complex128": ComputeDesc.COMPUTE_64F(), + "float64": ComputeDesc.COMPUTE_64F(), + } + SUPPORTED_COMPUTE_TYPES = { + "float16": {ComputeDesc.COMPUTE_32F()}, + "bfloat16": {ComputeDesc.COMPUTE_32F()}, + "float32": { + ComputeDesc.COMPUTE_32F(), + ComputeDesc.COMPUTE_TF32(), + ComputeDesc.COMPUTE_3XTF32(), + ComputeDesc.COMPUTE_16F(), + ComputeDesc.COMPUTE_16BF(), + }, + "complex64": {ComputeDesc.COMPUTE_32F(), ComputeDesc.COMPUTE_TF32(), ComputeDesc.COMPUTE_3XTF32()}, + "complex128": {ComputeDesc.COMPUTE_32F(), ComputeDesc.COMPUTE_64F()}, + "float64": {ComputeDesc.COMPUTE_64F(), ComputeDesc.COMPUTE_32F()}, + } + return + + +def get_default_compute_type_from_dtype_name(dtype_name: str) -> int: + _maybe_initialize_compute_type_map() + if dtype_name not in DEFAULT_COMPUTE_TYPE: + raise ValueError(f"Invalid data type: {dtype_name}") + return DEFAULT_COMPUTE_TYPE[dtype_name] + + +def get_supported_compute_types(dtype_name: str) -> set[int]: + # https://docs.nvidia.com/cuda/cutensor/latest/api/cutensor.html#cutensorcreatecontraction + _maybe_initialize_compute_type_map() + if dtype_name not in SUPPORTED_COMPUTE_TYPES: + raise ValueError(f"Invalid data type: {dtype_name}") + return SUPPORTED_COMPUTE_TYPES[dtype_name] diff --git a/nvmath/tensor/contract.py b/nvmath/tensor/contract.py new file mode 100644 index 0000000..e88544d --- /dev/null +++ b/nvmath/tensor/contract.py @@ -0,0 +1,1562 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from typing import Any +import numpy as np +from .. import memory +from ..bindings import cutensor +from ..internal import formatters +from ..internal import utils +from ..internal import tensor_wrapper +from .._utils import CudaDataType +from ._internal import einsum_parser +from ..internal.typemaps import NAME_TO_DATA_TYPE, DATA_TYPE_TO_NAME +from ._internal.typemaps import get_default_compute_type_from_dtype_name, get_supported_compute_types +from ._internal.cutensor_config_ifc import ContractionPlanPreference +from ._configuration import ContractionOptions, ExecutionCUDA + +__all__ = [ + "BinaryContraction", + "TernaryContraction", + "ContractionPlanPreference", + "ComputeDesc", + "binary_contraction", + "ternary_contraction", + "Operator", + "tensor_qualifiers_dtype", +] + + +Operator = cutensor.Operator + +ComputeDesc = cutensor.ComputeDesc + +tensor_qualifiers_dtype = np.int32 + +# As of cuTensor 2.3.1, only the following operators are supported in the contraction APIs +OPERATORS_SUPPORTED = {Operator.OP_IDENTITY, Operator.OP_CONJ} + + +def _compute_pointer_alignment(ptr: int) -> int: + """ + Compute the pointer alignment for the given pointer. + + Args: + ptr: Pointer address as integer + + Returns: + The alignment value (256, 128, 64, 32, 16, 8, 4, 2, or 1) + """ + return 256 if ptr == 0 else min(ptr & -ptr, 256) + + +SHARED_CONTRACTION_DOCUMENTATION = utils.COMMON_SHARED_DOC_MAP.copy() +SHARED_CONTRACTION_DOCUMENTATION.update( + { + "expr": """\ +The einsum expression to perform the contraction. +""".replace("\n", " "), + # + "a": """\ +A tensor representing the first operand to the tensor contraction. The currently supported types +are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "b": """\ +A tensor representing the second operand to the tensor contraction. The currently supported types +are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "c": """\ +A tensor representing the third operand to the tensor contraction. The currently supported types +are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "addend": """\ +(Optional) A tensor representing the operand to add to the tensor contraction result (fused operation in cuTensor). +The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, +and :class:`torch.Tensor`.""".replace("\n", " "), + # + "alpha": """\ +The scale factor for the tensor contraction term as a real or complex number. The default is +:math:`1.0`.""".replace("\n", " "), + # + "beta": """\ +The scale factor for the tensor addition term as a real or complex number. A value for `beta` must be provided if +the operand to be added is specified.""".replace("\n", " "), + # + "qualifiers": """\ +If desired, specify the operators as a :class:`numpy.ndarray` of dtype :class:`~nvmath.tensor.tensor_qualifiers_dtype` +with the same length as the number of operands in the contraction expression plus one (for the operand to be added). +All elements must be valid :class:`~nvmath.tensor.Operator` objects. See +:ref:`matrix-tensor-qualifiers` for the motivation behind qualifiers.""".replace("\n", " "), + # + "options": """\ +Specify options for the tensor contraction as a :class:`~nvmath.tensor.ContractionOptions` object. Alternatively, +a `dict` containing the parameters for the ``ContractionOptions`` constructor can also be provided. 
If not specified, the +value will be set to the default-constructed ``ContractionOptions`` object.""".replace("\n", " "), + # + "execution": """\ +Specify execution space options for the tensor contraction as a :class:`ExecutionCUDA` object or a string 'cuda'. +Alternatively, a `dict` containing 'name' key set to 'cuda' and the additional parameters for the ``ExecutionCUDA`` +constructor can also be provided. If not provided, the execution space will be selected to match operand's storage if +the operands are on the GPU. If the operands are on the CPU and execution space is not provided, the execution space +will be a default-constructed :class:`ExecutionCUDA` object with device_id = 0.""".replace("\n", " "), + # + "out": """\ +(Optional) The output tensor to store the result of the contraction. Must be a :class:`numpy.ndarray`, \ +:class:`cupy.ndarray`, or :class:`torch.Tensor` object and must be on the same device as the input operands. \ +If not specified, the result will be returned on the same device as the input operands. + + .. note:: + + The support of output tensor in the API is experimental and subject to change in future versions + without prior notice. + +""".strip(), + # + "result": """\ +The result of the specified contraction, which remains on the same device and belong to the +same package as the input operands. """.replace("\n", " "), + } +) + + +class InvalidContractionState(Exception): + pass + + +@utils.docstring_decorator(SHARED_CONTRACTION_DOCUMENTATION, skip_missing=False) +class _ElementaryContraction: + """ + Pairwise contraction: + O = A * B + C + Ternary contraction: + O = A * B * C + D + """ + + def __init__(self, expr, a, b, *, c=None, d=None, out=None, qualifiers=None, options=None, execution=None, stream=None): + """Binary & Ternary Contraction""" + + version = cutensor.get_version() + if version < 20301: + raise RuntimeError( + f"cuTensor version {version} is detected, which is lower than the minimum required " + f"version 2.3.1 for nvmath.tensor module. please upgrade cuTensor to a compatible version." + ) + + self.expr = expr + + # Process options. 
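+ # `options` may be a ContractionOptions instance, a dict of ContractionOptions
+ # constructor arguments, or None, in which case defaults are used.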
+ self.options: Any = utils.check_or_create_options(ContractionOptions, options, "elementary contraction options") + self.blocking = self.options.blocking + self.logger = self.options.logger if self.options.logger is not None else logging.getLogger() + + # Process operands & einsum expression + self.a, self.b = tensor_wrapper.wrap_operands([a, b]) + input_operand_class = self.a.__class__ + self.input_package = utils.get_operands_package([self.a, self.b]) + inputs, output = einsum_parser.parse_einsum_str(expr) + self.num_inputs = len(inputs) + + if self.num_inputs == 2: + assert d is None, f"Internal error: Binary contraction {expr} cannot have a fourth operand" + elif self.num_inputs == 3: + assert c is not None, f"Internal error: Ternary contraction {expr} must have a third operand" + else: + raise NotImplementedError("Only binary and ternary contractions are supported") + + wrapped_operands = [self.a, self.b] + for op_name, op in zip(["c", "d"], [c, d], strict=False): + if op is not None: + op = tensor_wrapper.wrap_operand(op) + if op.name != self.input_package: + raise ValueError(f"The operand {op_name} must be a {self.input_package} tensor") + wrapped_operands.append(op) + setattr(self, op_name, op) + + if self.input_package == "numpy": + self.internal_package = "cuda" + else: + self.internal_package = self.input_package + tensor_wrapper.maybe_register_package(self.internal_package) + + self.input_device_id = utils.get_operands_device_id(wrapped_operands) + + if execution is None: + self.execution = ExecutionCUDA() + else: + self.execution = utils.check_or_create_one_of_options( + (ExecutionCUDA,), + execution, + "execution options", + ) + # TODO: cutensor supports R_64F (A) C_64F (B) C_64F (C) combination (and inverse) + # https://docs.nvidia.com/cuda/cutensor/latest/api/cutensor.html#cutensorcreatecontraction + self.data_type = utils.get_operands_dtype(wrapped_operands) + self.cuda_data_type = NAME_TO_DATA_TYPE[self.data_type] + + # Parse compute descriptor + if self.options.compute_type is None: + self.compute_type = get_default_compute_type_from_dtype_name(self.data_type) + elif isinstance(self.options.compute_type, int): + # make sure compute type is valid + if self.options.compute_type not in get_supported_compute_types(self.data_type): + raise ValueError(f"Invalid compute type: {self.options.compute_type} for data type: {self.data_type}") + self.compute_type = self.options.compute_type + else: + raise ValueError(f"Invalid compute type: {self.options.compute_type}") + + if self.input_device_id == "cpu": + self.execution_device_id = self.execution.device_id + stream_holder = utils.get_or_create_stream(self.execution_device_id, stream, self.internal_package) + self.a = self.a.to(self.execution_device_id, stream_holder) + self.b = self.b.to(self.execution_device_id, stream_holder) + if self.c is not None: + self.c = self.c.to(self.execution_device_id, stream_holder) + if self.d is not None: + self.d = self.d.to(self.execution_device_id, stream_holder) + else: + self.execution_device_id = self.input_device_id + stream_holder = utils.get_or_create_stream(self.execution_device_id, stream, self.internal_package) + + if qualifiers is None: + self.qualifiers = np.full(self.num_inputs + 1, cutensor.Operator.OP_IDENTITY, dtype=np.int32) # size of enum value + else: + self.qualifiers = np.asarray(qualifiers, dtype=np.int32) + if self.qualifiers.size != self.num_inputs + 1: + if self.num_inputs == 2: + message = f"The qualifiers must be a numpy array of length {self.num_inputs + 1}\ + 
corresponding to the operands a, b and c" + else: + message = f"The qualifiers must be a numpy array of length {self.num_inputs + 1}\ + corresponding to the operands a, b, c and d" + raise ValueError(message) + if self.qualifiers[self.num_inputs] != cutensor.Operator.OP_IDENTITY: + raise ValueError( + f"The operand for the offset must be the identity operator, found {self.qualifiers[self.num_inputs]}" + ) + if self.num_inputs == 2: + iterator = zip(["a", "b"], self.qualifiers[:-1], strict=False) + else: + iterator = zip(["a", "b", "c"], self.qualifiers[:-1], strict=False) + for op_name, qualifier in iterator: + if qualifier not in OPERATORS_SUPPORTED: + raise ValueError( + f"Each operator must be a valid cutensor operator, " + f"currently only support {OPERATORS_SUPPORTED}, " + f"got {qualifier}." + ) + if qualifier == cutensor.Operator.OP_CONJ: + operand = getattr(self, op_name) + if "complex" not in operand.dtype: + raise ValueError(f"The operand {op_name} must be a complex tensor to use the conjugate operator.") + + # Set memory allocator. + self.allocator = ( + self.options.allocator + if self.options.allocator is not None + else memory._MEMORY_MANAGER[self.internal_package](self.execution_device_id, self.logger) + ) + + self.memory_limit = utils.get_memory_limit_from_device_id(self.options.memory_limit, self.execution_device_id) + + self.tensor_descs = {} + + self.input_modes, self.output_modes, _, size_dict = einsum_parser.parse_elementary_einsum( + inputs, output, self.a, self.b, self.c + )[:4] + + output_shape = [size_dict[mode] for mode in self.output_modes] + + # self.out is the output tensor that will be used for the execution + # self.out_return is the output tensor that will be returned by the execute method + self.output_provided = out is not None + if self.output_provided: + out = tensor_wrapper.wrap_operand(out) + if out.name != self.input_package: + raise ValueError(f"The output operand out must be a {self.input_package} tensor") + if out.device_id != self.input_device_id: + raise ValueError("The output operand out must be on the same device as the input operands.") + self.out_return = out + if out.device_id == self.execution_device_id: + self.out = out + else: + self.out = out.to(self.execution_device_id, stream_holder) + else: + self.out = utils.create_empty_tensor( + self.a.__class__, output_shape, self.data_type, self.execution_device_id, stream_holder, False + ) + if self.input_device_id == self.execution_device_id: + self.out_return = self.out + else: + tmp_stream_holder = None if self.input_device_id == "cpu" else stream_holder + self.out_return = utils.create_empty_tensor( + input_operand_class, output_shape, self.data_type, self.input_device_id, tmp_stream_holder, False + ) + + with utils.device_ctx(self.execution_device_id): + if self.options.handle is not None: + self.own_handle = False + self.handle = self.options.handle + self.logger.info(f"The library handle has been set to the specified value: {self.handle}.") + else: + self.own_handle = True + self.handle = cutensor.create() + self.logger.info(f"The library handle has been created: {self.handle}.") + + self.valid_state = True + + # Parse tensor descriptors + self.operands_info = {} + self.pointer_alignment = {} + for op_name in ["a", "b", "c", "d", "out"]: + op = getattr(self, op_name) + if op is not None: + self.pointer_alignment[op_name] = _compute_pointer_alignment(op.data_ptr) + self.tensor_descs[op_name] = cutensor.create_tensor_descriptor( + self.handle, len(op.shape), op.shape, op.strides, 
self.cuda_data_type, self.pointer_alignment[op_name] + ) + self.operands_info[op_name] = { + "dtype": op.dtype, + "shape": op.shape, + "strides": op.strides, + } + + # Create contraction descriptor + if self.num_inputs == 2: + self.contraction_desc = cutensor.create_contraction( + self.handle, + self.tensor_descs["a"], + self.input_modes[0], + self.qualifiers[0], + self.tensor_descs["b"], + self.input_modes[1], + self.qualifiers[1], + self.tensor_descs["out"] + if c is None + else self.tensor_descs["c"], # if c is set to None, then C descriptor is the same as the out descriptor + self.output_modes, # NOTE: currently assuming c has the same output modes as the out + self.qualifiers[2], # only identity operator is supported for c + self.tensor_descs["out"], + self.output_modes, + self.compute_type, + ) + else: + self.contraction_desc = cutensor.create_contraction_trinary( + self.handle, + self.tensor_descs["a"], + self.input_modes[0], + self.qualifiers[0], + self.tensor_descs["b"], + self.input_modes[1], + self.qualifiers[1], + self.tensor_descs["c"], + self.input_modes[2], + self.qualifiers[2], + self.tensor_descs["out"] + if d is None + else self.tensor_descs["d"], # if d is set to None, then D descriptor is the same as the out descriptor + self.output_modes, + self.qualifiers[3], # only identity operator is supported for d + self.tensor_descs["out"], + self.output_modes, + self.compute_type, + ) + + scalar_dtype = cutensor.get_operation_descriptor_attribute_dtype(cutensor.OperationDescriptorAttribute.SCALAR_TYPE) + scalar_dtype_buffer = np.empty(1, dtype=scalar_dtype) + cutensor.operation_descriptor_get_attribute( + self.handle, + self.contraction_desc, + cutensor.OperationDescriptorAttribute.SCALAR_TYPE, + scalar_dtype_buffer.ctypes.data, + scalar_dtype_buffer.itemsize, + ) + self.scalar_type = CudaDataType(scalar_dtype_buffer.item()) + + self.alpha = np.empty(1, dtype=DATA_TYPE_TO_NAME[self.scalar_type]) + self.beta = np.empty(1, dtype=DATA_TYPE_TO_NAME[self.scalar_type]) + + self.contraction_planned = False + self.plan_preference_ptr = cutensor.create_plan_preference(self.handle, cutensor.Algo.DEFAULT, cutensor.JitMode.NONE) + self._plan_preference = ContractionPlanPreference(self) + self.plan_ptr = None + + self.workspace_ptr = None + self.workspace_allocated_size = 0 + self.workspace_size = None + self.workspace_stream = None + self.workspace_allocated_here = False + + def _check_valid_contraction(self, *args, **kwargs): + """ + Check if the ElementaryContraction object is alive and well. + """ + if not self.valid_state: + raise InvalidContractionState("The ElementaryContraction object cannot be used after resources are free'd") + + def _check_valid_operands(self, *args, **kwargs): + """ + Check if the operands are available for the operation. + """ + what = kwargs["what"] + if self.num_inputs == 2: + if self.a is None or self.b is None: + raise RuntimeError( + f"{what} cannot be performed if a or b have been set to None " + f"for pairwise contraction. Use reset_operands() to set the " + f"desired input before using performing the {what.lower()}." + ) + else: + if self.a is None or self.b is None or self.c is None: + raise RuntimeError( + f"{what} cannot be performed if a, b, or c have been set to None " + f"for ternary contraction. Use reset_operands() to set the " + f"desired input before using performing the {what.lower()}." + ) + + if self.output_provided and self.out is None: + raise RuntimeError( + f"{what} cannot be performed if out has been set to None. 
Use reset_operands() to set the " + f"desired input before using performing the {what.lower()}." + ) + + def _free_plan_resources(self, exception: Exception | None = None) -> bool: + """ + Free resources allocated in planning. + """ + + if self.plan_ptr is not None: + cutensor.destroy_plan(self.plan_ptr) + self.plan_ptr = None + + if self.plan_preference_ptr is not None: + cutensor.destroy_plan_preference(self.plan_preference_ptr) + self.plan_preference_ptr = None + + self.contraction_planned = False + return True + + def _check_planned(self, *args, **kwargs): + what = kwargs["what"] + if not self.contraction_planned: + raise RuntimeError(f"{what} cannot be performed before plan() has been called.") + + def _free_workspace_memory(self, exception: Exception | None = None) -> bool: + """ + Free workspace by releasing the MemoryPointer object. + """ + if self.workspace_ptr is None: + return True + + self.workspace_ptr = None + self.workspace_allocated_size = 0 + self.logger.debug("[_free_workspace_memory] The workspace has been released.") + + return True + + def _reset_workspace_allocation_tracking(self): + """ + Reset workspace allocation tracking attributes to False at the end of the + methods where workspace memory is potentially allocated. This is necessary + to prevent any exceptions raised before method entry from using stale + tracking values. + """ + self.workspace_allocated_here = False + + @utils.precondition(_check_valid_contraction) + def _release_workspace_memory_perhaps(self, release_workspace): + """ + Free workspace memory if it's larger than the specified limit. + """ + if not release_workspace: + return True + + # Establish ordering wrt the computation and free workspace if requested. + if self.last_compute_event is not None: + self.workspace_stream.wait(self.last_compute_event) + self.logger.debug("Established ordering with respect to the computation before releasing the workspace.") + self.last_compute_event = None + + self.logger.debug("[_release_workspace_memory_perhaps] The workspace memory will be released.") + return self._free_workspace_memory() + + def _release_workspace_memory_perhaps_wrapper(self, exception: Exception | None = None) -> bool: + """ + This is used in @atomic. + """ + self._release_workspace_memory_perhaps(release_workspace=self.workspace_allocated_here) + self._reset_workspace_allocation_tracking() + return True + + @utils.precondition(_check_valid_contraction) + @utils.precondition(_check_planned, "Workspace memory allocation") + @utils.atomic(_free_workspace_memory, method=True) + def _allocate_workspace_memory(self, stream_holder: utils.StreamHolder): + """ + Allocate workspace memory using the specified allocator. + """ + + assert self.workspace_size is not None, "Internal Error." + assert self.workspace_allocated_here is False, "Internal Error." + + if self.workspace_size == 0: # For performance, bypass allocator for workspace size == 0. 
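+ # A zero-sized MemoryPointer stands in for "no workspace" so the execution
+ # path can treat the empty and non-empty cases uniformly.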
+ self.workspace_ptr = memory.MemoryPointer(0, 0, finalizer=None) + else: + self.logger.debug("Allocating workspace for performing the tensor contraction...") + with utils.device_ctx(self.execution_device_id), stream_holder.ctx: + try: + if isinstance(self.allocator, memory.BaseCUDAMemoryManagerAsync): + self.workspace_ptr = self.allocator.memalloc_async(self.workspace_size, stream_holder.obj) + else: + self.workspace_ptr = self.allocator.memalloc(self.workspace_size) + self.workspace_allocated_here = True + except TypeError as e: + message = ( + "The method 'memalloc' in the allocator object must conform to the interface in the " + "'BaseCUDAMemoryManager' protocol." + ) + raise TypeError(message) from e + + self.workspace_allocated_size = self.workspace_size + self.workspace_stream = stream_holder.obj + self.logger.debug( + f"Finished allocating device workspace of size {formatters.MemoryStr(self.workspace_size)} in the context " + f"of stream {self.workspace_stream}." + ) + + def _allocate_workspace_memory_perhaps(self, stream_holder: utils.StreamHolder): + """ + Allocate workspace memory using the specified allocator, if it hasn't + already been done. + """ + + if self.workspace_ptr is not None and self.workspace_allocated_size >= self.workspace_size: + return + + return self._allocate_workspace_memory(stream_holder) + + @property + def plan_preference(self): + """ + An accessor to configure or query the contraction planning phase + attributes. + + Returns: + A :class:`ContractionPlanPreference` object, whose attributes can be set (or + queried) to configure the planning phase. + + .. seealso:: + :class:`ContractionPlanPreference`, :meth:`plan`. + """ + return self._plan_preference + + @utils.precondition(_check_valid_contraction) + def reset_operands(self, a=None, b=None, *, c=None, d=None, out=None, stream=None): + if self.num_inputs == 2 and d is not None: + raise RuntimeError("Internal Error: For pairwise contractions, d can not be set.") + + stream_holder = None # lazy initialization + for op_name, op in zip(["a", "b", "c", "d", "out"], [a, b, c, d, out], strict=False): + if op is None: + if op_name == "out": + if self.output_provided: + self.out_return = None + self.out = None + else: + # if out is not provided during initialization, + # we don't do anything with it + continue + else: + setattr(self, op_name, None) + self.logger.info(f"operand {op_name} has been reset to None.") + continue + tensor_info = self.operands_info.get(op_name) + if tensor_info is None: + raise ValueError( + f"operand {op_name} was not specified during the initialization " + f"of the ElementaryContraction object and therefore can not be reset " + f"to a concrete tensor." + ) + op = tensor_wrapper.wrap_operand(op) + if op.name != self.input_package: + raise ValueError(f"The operand {op_name} must be a {self.input_package} tensor") + if op.device_id != self.input_device_id: + raise ValueError( + f"The operand {op_name} must be on the same device " + f"as the operands provided during the initialization of the " + f"ElementaryContraction object." + ) + + for attr, value in tensor_info.items(): + if getattr(op, attr) != value: + raise ValueError( + f"The operand {op_name} must have the same {attr} " + f"as the one specified during the initialization of the " + f"ElementaryContraction object." 
+ ) + + if op_name == "out": + self.out_return = op + + if op.device_id != self.execution_device_id: + if stream_holder is None: + stream_holder = utils.get_or_create_stream(self.execution_device_id, stream, self.internal_package) + if op_name == "out" and self.out is not None: + # if out_name is "out" and we own a valid self.out, + # we can directly reuse it here + op = self.out + else: + op = op.to(self.execution_device_id, stream_holder) + + if _compute_pointer_alignment(op.data_ptr) != self.pointer_alignment[op_name]: + raise ValueError( + f"The operand {op_name} must have the same pointer alignment " + f"as the one specified during the initialization of the " + f"ElementaryContraction object." + ) + + setattr(self, op_name, op) + self.logger.info(f"operand {op_name} has been reset to the new operand provided.") + return + + @utils.precondition(_check_valid_contraction) + @utils.atomic(_free_plan_resources, method=True) + def plan(self, *, stream=None): + """ + Plan the tensor contraction. The planning phase can be optionally configured through + the property :attr:`plan_preference` (an object of type + :class:`ContractionPlanPreference`). + + Args: + stream: {stream} + + .. seealso:: + :attr:`plan_preference`, :class:`ContractionPlanPreference`. + + Note: + If the :attr:`plan_preference` has been updated, a :meth:`plan` call is + required to apply the changes. + """ + log_info = self.logger.isEnabledFor(logging.INFO) + + # A new plan needs to be created at each plan() call + if self.plan_ptr is not None: + cutensor.destroy_plan(self.plan_ptr) + self.plan_ptr = None + + if log_info: + self.logger.info("= PLANNING PHASE =") + stream_holder = utils.get_or_create_stream(self.execution_device_id, stream, self.internal_package) + required_workspace_size_buffer = np.empty( + 1, dtype=cutensor.get_plan_attribute_dtype(cutensor.PlanAttribute.REQUIRED_WORKSPACE) + ) + + with utils.cuda_call_ctx(stream_holder, self.blocking, timing=log_info) as ( + self.last_compute_event, + elapsed, + ): + self.plan_ptr = cutensor.create_plan( + self.handle, self.contraction_desc, self.plan_preference_ptr, self.memory_limit + ) + cutensor.plan_get_attribute( + self.handle, + self.plan_ptr, + cutensor.PlanAttribute.REQUIRED_WORKSPACE, + required_workspace_size_buffer.ctypes.data, + required_workspace_size_buffer.itemsize, + ) + + if log_info and elapsed.data is not None: + self.logger.info(f"The planning phase took {elapsed.data:.3f} ms to complete.") + + self.workspace_size = required_workspace_size_buffer.item() + self.contraction_planned = True + + @utils.precondition(_check_valid_contraction) + @utils.precondition(_check_planned, "Execution") + @utils.precondition(_check_valid_operands, "Execution") + @utils.atomic(_release_workspace_memory_perhaps_wrapper, method=True) + def execute(self, *, alpha=1.0, beta=None, release_workspace=False, stream=None): + """ + Execute a prepared tensor contraction. 
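+ 
+ For example (a minimal sketch, assuming a planned :class:`BinaryContraction`
+ instance ``contraction`` created with an addend ``c``), the scale factors
+ correspond to :math:`\\alpha a @ b + \\beta c`:
+ 
+ >>> r = contraction.execute(alpha=1.23, beta=0.74)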
+ + Args: + alpha: {alpha} + + beta: {beta} + + release_workspace: {release_workspace} + + stream: {stream} + + Returns: + {result} + """ + if beta is None: + if self.num_inputs == 2 and self.c is not None: + raise ValueError("beta must be set when c is specified in a binary contraction") + elif self.num_inputs == 3 and self.d is not None: + raise ValueError("beta must be set when d is specified in a ternary contraction") + beta = 0.0 + else: + if self.num_inputs == 2 and self.c is None: + raise ValueError("For binary contraction, beta can only be set if c is specified") + elif self.num_inputs == 3 and self.d is None: + raise ValueError("For ternary contraction, beta can only be set if d is specified") + + log_info = self.logger.isEnabledFor(logging.INFO) + + self.alpha[0] = alpha + self.beta[0] = beta + + if log_info: + self.logger.info("= EXECUTION PHASE =") + stream_holder = utils.get_or_create_stream(self.execution_device_id, stream, self.internal_package) + if log_info: + self.logger.info(f"The specified stream for execute() is {stream_holder.obj}.") + + # Allocate workspace if needed. + self._allocate_workspace_memory_perhaps(stream_holder) + + raw_workspace_ptr = utils.get_ptr_from_memory_pointer(self.workspace_ptr) + + with utils.cuda_call_ctx(stream_holder, self.blocking, timing=log_info) as ( + self.last_compute_event, + elapsed, + ): + if self.num_inputs == 2: + cutensor.contract( + self.handle, + self.plan_ptr, + self.alpha.ctypes.data, + self.a.data_ptr, + self.b.data_ptr, + self.beta.ctypes.data, + self.c.data_ptr if self.c is not None else self.out.data_ptr, + self.out.data_ptr, + raw_workspace_ptr, + self.workspace_size, + stream_holder.ptr, + ) + else: + cutensor.contract_trinary( + self.handle, + self.plan_ptr, + self.alpha.ctypes.data, + self.a.data_ptr, + self.b.data_ptr, + self.c.data_ptr, + self.beta.ctypes.data, + self.d.data_ptr if self.d is not None else self.out.data_ptr, + self.out.data_ptr, + raw_workspace_ptr, + self.workspace_size, + stream_holder.ptr, + ) + + if log_info and elapsed.data is not None: + self.logger.info(f"The tensor contraction calculation took {elapsed.data:.3f} ms to complete.") + + # Establish ordering wrt the computation and free workspace if requested. + if release_workspace: + self._release_workspace_memory_perhaps(True) + + self._reset_workspace_allocation_tracking() + + if self.out.device_id != self.out_return.device_id: + self.out_return.copy_(self.out, stream_holder) + return self.out_return.tensor + + @utils.precondition(_check_valid_contraction) + def free(self): + """Free tensor contraction resources. + + It is recommended that the contraction object be used + within a context, but if it is not possible then this method must be + called explicitly to ensure that the tensor contraction resources + (especially internal library objects) are properly cleaned up. + """ + + if not self.valid_state: + return + + try: + if self.last_compute_event is not None and self.workspace_stream is not None: + self.workspace_stream.wait(self.last_compute_event) + self.last_compute_event = None + + self._free_workspace_memory() + + self._free_plan_resources() + + class_name = self.__class__.__name__ + # Free handle if we own it. 
+ if self.handle is not None and self.own_handle: + cutensor.destroy(self.handle) + self.handle, self.own_handle = None, False + + if self.contraction_desc is not None: + cutensor.destroy_operation_descriptor(self.contraction_desc) + self.contraction_desc = None + + while self.tensor_descs: + tensor_desc = self.tensor_descs.popitem()[1] + cutensor.destroy_tensor_descriptor(tensor_desc) + + except Exception as e: + self.logger.critical(f"Internal error: only part of the {class_name} object's resources have been released.") + self.logger.critical(str(e)) + raise e + finally: + self.valid_state = False + + self.logger.info(f"The {class_name} object's resources have been released.") + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.free() + + +@utils.docstring_decorator(SHARED_CONTRACTION_DOCUMENTATION, skip_missing=False) +class BinaryContraction(_ElementaryContraction): + """ + Create a stateful object encapsulating the specified binary tensor contraction + :math:`\\alpha a @ b + \\beta c` and the required resources to perform the operation. + A stateful object can be used to amortize the cost of preparation (planning in the + case of binary tensor contraction) across multiple executions (also see the + :ref:`Stateful APIs` section). + + The function-form API :func:`binary_contraction` is a convenient alternative to using + stateful objects for *single* use (the user needs to perform just one tensor + contraction, for example), in which case there is no possibility of amortizing + preparatory costs. The function-form APIs are just convenience wrappers around + the stateful object APIs. + + Using the stateful object typically involves the following steps: + + 1. **Problem Specification**: Initialize the object with a defined operation and + options. + 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation + for this specific binary tensor contraction operation. + 3. **Execution**: Perform the tensor contraction computation with :meth:`execute`. + 4. **Resource Management**: Ensure all resources are released either by explicitly + calling :meth:`free` or by managing the stateful object within a context manager. + + Detailed information on what's happening in the various phases described above can be + obtained by passing in a :class:`logging.Logger` object to :class:`ContractionOptions` + or by setting the appropriate options in the root logger object, + which is used by default: + + >>> import logging + >>> logging.basicConfig( + ... level=logging.INFO, + ... format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) + + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. + + Args: + a: {a} + + b: {b} + + c: {addend} + + out: {out} + + qualifiers: {qualifiers} + + stream: {stream} + + options: {options} + + execution: {execution} + + .. seealso:: + :attr:`plan_preference`, :meth:`plan`, :meth:`reset_operands`, :meth:`execute` + + Examples: + + >>> import numpy as np + >>> import nvmath + + Create two 3-D float64 ndarrays on the CPU: + + >>> M, N, K = 32, 32, 32 + >>> a = np.random.rand(M, N, K) + >>> b = np.random.rand(N, K, M) + + We will define a binary tensor contraction operation. 
+ + Create a BinaryContraction object encapsulating the problem specification above: + + >>> contraction = nvmath.tensor.BinaryContraction("ijk,jkl->il", a, b) + + Options can be provided above to control the behavior of the operation using the + `options` argument (see :class:`ContractionOptions`). + + Next, plan the operation. Optionally, preferences can + be specified for planning: + + >>> contraction.plan() + + Now execute the binary tensor contraction, and obtain the result `r1` as a NumPy + ndarray. + + >>> r1 = contraction.execute() + + Finally, free the object's resources. To avoid having to explicitly making this + call, it's recommended to use the BinaryContraction object as a context manager + as shown below, if possible. + + >>> contraction.free() + + Note that all :class:`BinaryContraction` methods execute on the current + stream by default. Alternatively, the `stream` argument can be used to run a + method on a specified stream. + + Let's now look at the same problem with CuPy ndarrays on the GPU. + + Create a 3-D float64 CuPy ndarray on the GPU: + + >>> import cupy as cp + >>> a = cp.random.rand(M, N, K) + >>> b = cp.random.rand(N, K, M) + + Create an BinaryContraction object encapsulating the problem specification + described earlier and use it as a context manager. + + >>> with nvmath.tensor.BinaryContraction("ijk,jkl->il", a, b) as contraction: + ... contraction.plan() + ... + ... # Execute the operation to get the first result. + ... r1 = contraction.execute() + ... + ... # Update operands A and B in-place (see reset_operands() for an + ... # alternative). + ... a[:] = cp.random.rand(M, K) + ... b[:] = cp.random.rand(K, N) + ... + ... # Execute the operation to get the new result. + ... r2 = contraction.execute() + + + All the resources used by the object are released at the end of the block. + + Further examples can be found in the `nvmath/examples/tensor/contraction + `_ + directory. + """ + + def __init__(self, expr, a, b, *, c=None, out=None, qualifiers=None, stream=None, options=None, execution=None): + super().__init__(expr, a, b, c=c, out=out, qualifiers=qualifiers, stream=stream, options=options, execution=execution) + + def reset_operands(self, a=None, b=None, *, c=None, out=None, stream=None): + """ + Reset the operands held by this :class:`BinaryContraction` instance. + + This method has two use cases: + (1) it can be used to provide new operands for execution when the original + operands are on the CPU + (2) it can be used to release the internal reference to the previous operands + and make their memory available for other use by passing ``None`` for *all* + arguments. In this case, this method must be called again to provide the + desired operands before another call to execution APIs like :meth:`execute`. + + This method is not needed when the operands reside on the GPU and in-place + operations are used to update the operand values. + + This method will perform various checks on the new operands to make sure: + + - The shapes, strides, datatypes match those of the old ones. + - The packages that the operands belong to match those of the old ones. + - If input tensors are on GPU, the device must match. 
+ + Args: + a: {a} + + b: {b} + + c: {addend} + + out: {out} + + stream: {stream} + + Examples: + + >>> import cupy as cp + >>> import nvmath + + Create two 3-D float64 ndarrays on the GPU: + + >>> M, N, K = 128, 128, 256 + >>> a = cp.random.rand(M, K) + >>> b = cp.random.rand(K, N) + + Create an binary contraction object as a context manager + + >>> with nvmath.tensor.BinaryContraction("ij,jk->ik", a, b) as contraction: + ... # Plan the operation. + ... algorithms = contraction.plan() + ... + ... # Execute the contraction to get the first result. + ... r1 = contraction.execute() + ... + ... # Reset the operands to new CuPy ndarrays. + ... a1 = cp.random.rand(M, K) + ... b1 = cp.random.rand(K, N) + ... contraction.reset_operands(a=a1, b=b1) + ... + ... # Execute to get the new result corresponding to the updated operands. + ... r2 = contraction.execute() + + Note that if only a subset of operands are reset, the operands that are not + reset hold their original values. + + With :meth:`reset_operands`, minimal overhead is achieved as problem + specification and planning are only performed once. + + For the particular example above, explicitly calling :meth:`reset_operands` is + equivalent to updating the operands in-place, i.e, replacing + ``contraction.reset_operand(a=a1, b=b1)`` with ``a[:]=a1`` and ``b[:]=b1``. + Note that updating the operand in-place should be adopted with caution as it can + only yield the expected result under the additional constraint below: + + - The operand is on the GPU (more precisely, the operand memory space should + be accessible from the execution space). + + For more details, please refer to `inplace update example + `_. + """ + super().reset_operands(a=a, b=b, c=c, out=out, stream=stream) + + +@utils.docstring_decorator(SHARED_CONTRACTION_DOCUMENTATION, skip_missing=False) +class TernaryContraction(_ElementaryContraction): + """ + Create a stateful object encapsulating the specified ternary tensor contraction + :math:`\\alpha a @ b + \\beta c` and the required resources to perform the operation. + A stateful object can be used to amortize the cost of preparation (planning in the + case of ternary tensor contraction) across multiple executions (also see the + :ref:`Stateful APIs` section). + + The function-form API :func:`ternary_contraction` is a convenient alternative to using + stateful objects for *single* use (the user needs to perform just one tensor + contraction, for example), in which case there is no possibility of amortizing + preparatory costs. The function-form APIs are just convenience wrappers around + the stateful object APIs. + + Using the stateful object typically involves the following steps: + + 1. **Problem Specification**: Initialize the object with a defined operation and + options. + 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation + for this specific ternary tensor contraction operation. + 3. **Execution**: Perform the tensor contraction computation with :meth:`execute`. + 4. **Resource Management**: Ensure all resources are released either by explicitly + calling :meth:`free` or by managing the stateful object within a context manager. + + Detailed information on what's happening in the various phases described above can be + obtained by passing in a :class:`logging.Logger` object to :class:`ContractionOptions` + or by setting the appropriate options in the root logger object, + which is used by default: + + >>> import logging + >>> logging.basicConfig( + ... level=logging.INFO, + ... 
format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) + + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. + + Args: + a: {a} + + b: {b} + + c: {c} + + d: {addend} + + out: {out} + + qualifiers: {qualifiers} + + stream: {stream} + + options: {options} + + execution: {execution} + + .. seealso:: + :attr:`plan_preference`, :meth:`plan`, :meth:`reset_operands`, :meth:`execute` + + Examples: + + >>> import numpy as np + >>> import nvmath + + Create three 3-D float64 ndarrays on the CPU: + + >>> M, N, K = 32, 32, 32 + >>> a = np.random.rand(M, N, K) + >>> b = np.random.rand(N, K, M) + >>> c = np.random.rand(M, N) + + We will define a ternary tensor contraction operation. + + Create a TernaryContraction object encapsulating the problem specification above: + + >>> expr = "ijk,jkl,ln->in" + >>> contraction = nvmath.tensor.TernaryContraction(expr, a, b, c) + + Options can be provided above to control the behavior of the operation using the + `options` argument (see :class:`ContractionOptions`). + + Next, plan the operation. Optionally, preferences can + be specified for planning: + + >>> contraction.plan() + + Now execute the ternary tensor contraction, and obtain the result `r1` as + a NumPy ndarray. + + >>> r1 = contraction.execute() + + Finally, free the object's resources. To avoid having to explicitly making this + call, it's recommended to use the TernaryContraction object as a context manager + as shown below, if possible. + + >>> contraction.free() + + Note that all :class:`TernaryContraction` methods execute on the current + stream by default. Alternatively, the `stream` argument can be used to run a + method on a specified stream. + + Let's now look at the same problem with CuPy ndarrays on the GPU. + + Create a 3-D float64 CuPy ndarray on the GPU: + + >>> import cupy as cp + >>> a = cp.random.rand(M, N, K) + >>> b = cp.random.rand(N, K, M) + >>> c = cp.random.rand(M, N) + + Create an TernaryContraction object encapsulating the problem specification + described earlier and use it as a context manager. + + >>> expr = "ijk,jkl,ln->in" + >>> with nvmath.tensor.TernaryContraction(expr, a, b, c) as contraction: + ... contraction.plan() + ... + ... # Execute the operation to get the first result. + ... r1 = contraction.execute() + ... + ... # Update operands A, B and C in-place (see reset_operands() for an + ... # alternative). + ... a[:] = cp.random.rand(M, N, K) + ... b[:] = cp.random.rand(N, K, M) + ... c[:] = cp.random.rand(M, N) + ... + ... # Execute the operation to get the new result. + ... r2 = contraction.execute() + + + All the resources used by the object are released at the end of the block. + + Further examples can be found in the `nvmath/examples/tensor/contraction + `_ + directory. + """ + + def __init__(self, expr, a, b, c, *, d=None, out=None, qualifiers=None, stream=None, options=None, execution=None): + super().__init__( + expr, a, b, c=c, d=d, out=out, qualifiers=qualifiers, stream=stream, options=options, execution=execution + ) + + def reset_operands(self, a=None, b=None, c=None, *, d=None, out=None, stream=None): + """ + Reset the operands held by this :class:`TernaryContraction` instance. 
+
+        This method has two use cases:
+            (1) it can be used to provide new operands for execution when the original
+            operands are on the CPU, and
+            (2) it can be used to release the internal reference to the previous operands
+            and make their memory available for other use by passing ``None`` for *all*
+            arguments. In this case, this method must be called again to provide the
+            desired operands before another call to execution APIs like :meth:`execute`.
+
+        This method is not needed when the operands reside on the GPU and in-place
+        operations are used to update the operand values.
+
+        This method will perform various checks on the new operands to make sure:
+
+        - The shapes, strides, and datatypes match those of the old ones.
+        - The packages that the operands belong to match those of the old ones.
+        - If the input tensors are on the GPU, the device must match.
+
+        Args:
+            a: {a}
+
+            b: {b}
+
+            c: {c}
+
+            d: {addend}
+
+            out: {out}
+
+            stream: {stream}
+
+        Examples:
+
+            >>> import cupy as cp
+            >>> import nvmath
+
+            Create three float64 ndarrays on the GPU:
+
+            >>> M, N, K = 12, 16, 32
+            >>> a = cp.random.rand(M, M, N)
+            >>> b = cp.random.rand(N, K)
+            >>> c = cp.random.rand(K, K)
+
+            Create a ternary contraction object as a context manager:
+
+            >>> expr = "ijk,kl,lm->ijm"
+            >>> with nvmath.tensor.TernaryContraction(expr, a, b, c) as contraction:
+            ...     # Plan the operation.
+            ...     algorithms = contraction.plan()
+            ...
+            ...     # Execute the contraction to get the first result.
+            ...     r1 = contraction.execute()
+            ...
+            ...     # Reset the operands to new CuPy ndarrays.
+            ...     a1 = cp.random.rand(M, M, N)
+            ...     b1 = cp.random.rand(N, K)
+            ...     c1 = cp.random.rand(K, K)
+            ...     contraction.reset_operands(a=a1, b=b1, c=c1)
+            ...
+            ...     # Execute to get the new result corresponding to the updated operands.
+            ...     r2 = contraction.execute()
+
+            Note that if only a subset of operands is reset, the operands that are not
+            reset hold their original values.
+
+            With :meth:`reset_operands`, minimal overhead is achieved as problem
+            specification and planning are only performed once.
+
+            For the particular example above, explicitly calling :meth:`reset_operands`
+            is equivalent to updating the operands in-place, i.e., replacing
+            ``contraction.reset_operands(a=a1, b=b1, c=c1)`` with ``a[:]=a1``
+            and ``b[:]=b1`` and ``c[:]=c1``. Note that updating the operands in-place
+            should be adopted with caution as it can only yield the expected result
+            under the additional constraint below:
+
+            - The operand is on the GPU (more precisely, the operand memory space should
+              be accessible from the execution space).
+
+            For more details, please refer to `inplace update example
+            `_.
+        """
+        super().reset_operands(a=a, b=b, c=c, d=d, out=out, stream=stream)
+
+
+@utils.docstring_decorator(SHARED_CONTRACTION_DOCUMENTATION, skip_missing=False)
+def binary_contraction(
+    expr, a, b, *, c=None, alpha=1.0, beta=None, out=None, qualifiers=None, stream=None, options=None, execution=None
+):
+    """
+    Evaluate the Einstein summation convention for binary contraction on the operands.
+
+    Explicit as well as implicit forms are supported for the Einstein summation
+    expression.
+
+    The binary contraction can additionally be performed with an addend operand,
+    which is added to the result with a scale factor.
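+
+    Conceptually, when the addend ``c`` and the scale factors ``alpha`` and ``beta`` are
+    given, the result corresponds to ``alpha * einsum(expr, a, b) + beta * c`` (using
+    NumPy-style einsum notation); without ``c``, it is simply
+    ``alpha * einsum(expr, a, b)``, as illustrated in the examples below.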
+
+    This function-form API is a wrapper around the stateful
+    :class:`BinaryContraction` object APIs and is meant for *single* use (the user needs
+    to perform just one binary contraction, for example), in which case there is
+    no possibility of amortizing preparatory costs.
+
+    Detailed information on what's happening within this function can be obtained by passing
+    in a :class:`logging.Logger` object to :class:`ContractionOptions` or by setting the
+    appropriate options in the root logger object, which is used by default:
+
+        >>> import logging
+        >>> logging.basicConfig(
+        ...     level=logging.INFO,
+        ...     format="%(asctime)s %(levelname)-8s %(message)s",
+        ...     datefmt="%m-%d %H:%M:%S",
+        ... )
+
+    A user can select the desired logging level and, in general, take advantage of all of
+    the functionality offered by the Python `logging` module.
+
+    Args:
+        expr: {expr}
+
+        a: {a}
+
+        b: {b}
+
+        c: {addend}
+
+        alpha: {alpha}
+
+        beta: {beta}
+
+        out: {out}
+
+        qualifiers: {qualifiers}
+
+        stream: {stream}
+
+        options: {options}
+
+        execution: {execution}
+
+    Returns:
+        {result}
+
+    .. seealso::
+        :class:`BinaryContraction`, :func:`ternary_contraction`,
+        :class:`TernaryContraction`, :class:`ContractionOptions`,
+        :class:`ContractionPlanPreferences`
+
+        For tensor network contractions with an arbitrary number of operands, including
+        contraction path finding, see cuQuantum:
+
+        - :external+cuquantum:py:func:`cuquantum.tensornet.contract`
+        - :external+cuquantum:py:class:`cuquantum.tensornet.Network`
+
+    Examples:
+
+        >>> import cupy as cp
+        >>> import nvmath
+
+        Create three float32 ndarrays on the GPU:
+
+        >>> M, N = 32, 64
+        >>> a = cp.random.rand(M, M, N, N, dtype=cp.float32)
+        >>> b = cp.random.rand(N, N, N, N, dtype=cp.float32)
+        >>> c = cp.random.rand(M, M, N, N, dtype=cp.float32)
+
+        Perform the operation :math:`\\alpha \\sum_{a,b} A[i,j,a,b] * B[a,b,c,d] +
+        \\beta C[i,j,c,d]` using :func:`binary_contraction`.
+        The result `r` is also a CuPy float32 ndarray:
+
+        >>> r = nvmath.tensor.binary_contraction(
+        ...     "ijab,abcd->ijcd", a, b, c=c, alpha=1.23, beta=0.74
+        ... )
+
+        The result is equivalent to:
+
+        >>> r = 1.23 * cp.einsum("ijab,abcd->ijcd", a, b) + 0.74 * c
+
+        Options can be provided to customize the operation:
+
+        >>> compute_type = nvmath.bindings.cutensor.ComputeDesc.COMPUTE_3XTF32()
+        >>> o = nvmath.tensor.ContractionOptions(compute_type=compute_type)
+        >>> r = nvmath.tensor.binary_contraction("ijab,abcd->ijcd", a, b, options=o)
+
+        See `ContractionOptions` for the complete list of available options.
+
+        The package current stream is used by default, but a stream can be explicitly
+        provided to the binary contraction operation. This can be done if the operands
+        are computed on a different stream, for example:
+
+        >>> s = cp.cuda.Stream()
+        >>> with s:
+        ...     a = cp.random.rand(M, M, N, N)
+        ...     b = cp.random.rand(N, N, N, N)
+        >>> r = nvmath.tensor.binary_contraction("ijab,abcd->ijcd", a, b, stream=s)
+
+        The operation above runs on stream `s` and is ordered with respect to the input
+        computation.
+
+        Create NumPy ndarrays on the CPU.
+
+        >>> import numpy as np
+        >>> a = np.random.rand(M, M, N, N)
+        >>> b = np.random.rand(N, N, N, N)
+
+        Provide the NumPy ndarrays to :func:`binary_contraction`, with the result
+        also being a NumPy ndarray:
+
+        >>> r = nvmath.tensor.binary_contraction("ijab,abcd->ijcd", a, b)
+
+    Notes:
+        - This function is a convenience wrapper around :class:`BinaryContraction` and is
+          specifically meant for *single* use.
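+        - ``beta`` must be provided if and only if the addend ``c`` is provided;
+          specifying one without the other raises a :class:`ValueError`.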
+
+    Further examples can be found in the `nvmath/examples/tensor/contraction
+    `_
+    directory.
+    """
+    if c is None and beta is not None:
+        raise ValueError("beta can only be set if c is specified in a binary contraction")
+    elif c is not None and beta is None:
+        raise ValueError("beta must be set when c is specified in a binary contraction")
+    with BinaryContraction(
+        expr, a, b, c=c, out=out, qualifiers=qualifiers, stream=stream, options=options, execution=execution
+    ) as contraction:
+        contraction.plan()
+        out = contraction.execute(alpha=alpha, beta=beta, stream=stream)
+    return out
+
+
+@utils.docstring_decorator(SHARED_CONTRACTION_DOCUMENTATION, skip_missing=False)
+def ternary_contraction(
+    expr, a, b, c, *, d=None, alpha=1.0, beta=None, out=None, qualifiers=None, stream=None, options=None, execution=None
+):
+    """
+    Evaluate the Einstein summation convention for ternary contraction on the operands.
+
+    Explicit as well as implicit forms are supported for the Einstein summation
+    expression.
+
+    The ternary contraction can additionally be performed with an addend operand,
+    which is added to the result with a scale factor.
+
+    This function-form API is a wrapper around the stateful
+    :class:`TernaryContraction` object APIs and is meant for *single* use (the user needs
+    to perform just one ternary contraction, for example), in which case there is
+    no possibility of amortizing preparatory costs.
+
+    Detailed information on what's happening within this function can be obtained by passing
+    in a :class:`logging.Logger` object to :class:`ContractionOptions` or by setting the
+    appropriate options in the root logger object, which is used by default:
+
+        >>> import logging
+        >>> logging.basicConfig(
+        ...     level=logging.INFO,
+        ...     format="%(asctime)s %(levelname)-8s %(message)s",
+        ...     datefmt="%m-%d %H:%M:%S",
+        ... )
+
+    A user can select the desired logging level and, in general, take advantage of all of
+    the functionality offered by the Python `logging` module.
+
+    Args:
+        expr: {expr}
+
+        a: {a}
+
+        b: {b}
+
+        c: {c}
+
+        d: {addend}
+
+        alpha: {alpha}
+
+        beta: {beta}
+
+        out: {out}
+
+        qualifiers: {qualifiers}
+
+        stream: {stream}
+
+        options: {options}
+
+        execution: {execution}
+
+    Returns:
+        {result}
+
+    .. seealso::
+        :class:`TernaryContraction`, :func:`binary_contraction`,
+        :class:`BinaryContraction`, :class:`ContractionOptions`,
+        :class:`ContractionPlanPreferences`
+
+        For tensor network contractions with an arbitrary number of operands, including
+        contraction path finding, see cuQuantum:
+
+        - :external+cuquantum:py:func:`cuquantum.tensornet.contract`
+        - :external+cuquantum:py:class:`cuquantum.tensornet.Network`
+
+    Examples:
+
+        >>> import cupy as cp
+        >>> import nvmath
+
+        Create four float32 ndarrays on the GPU:
+
+        >>> M, N, K = 16, 24, 32
+        >>> a = cp.random.rand(M, M, dtype=cp.float32)
+        >>> b = cp.random.rand(M, N, K, dtype=cp.float32)
+        >>> c = cp.random.rand(N, K, M, dtype=cp.float32)
+        >>> d = cp.random.rand(M, M, dtype=cp.float32)
+
+        Perform the operation :math:`\\alpha \\sum_{j,k,l} A[i,j] * B[j,k,l] * C[k,l,m] +
+        \\beta D[i,m]` using :func:`ternary_contraction`.
+        The result `r` is also a CuPy float32 ndarray:
+
+        >>> r = nvmath.tensor.ternary_contraction(
+        ...     "ij,jkl,klm->im", a, b, c, d=d, alpha=0.63, beta=0.22
+        ...
) + + The result is equivalent to: + + >>> r = 0.63 * cp.einsum("ij,jkl,klm->im", a, b, c) + 0.22 * d + + Options can be provided to customize the operation: + + >>> compute_type = nvmath.bindings.cutensor.ComputeDesc.COMPUTE_3XTF32() + >>> o = nvmath.tensor.ContractionOptions(compute_type=compute_type) + >>> r = nvmath.tensor.ternary_contraction("ij,jkl,klm->im", a, b, c, options=o) + + See `ContractionOptions` for the complete list of available options. + + The package current stream is used by default, but a stream can be explicitly + provided to the ternary contraction operation. This can be done if the operands + are computed on a different stream, for example: + + >>> s = cp.cuda.Stream() + >>> with s: + ... a = cp.random.rand(M, M, dtype=cp.float32) + ... b = cp.random.rand(M, N, K, dtype=cp.float32) + ... c = cp.random.rand(N, K, M, dtype=cp.float32) + >>> r = nvmath.tensor.ternary_contraction("ij,jkl,klm->im", a, b, c, stream=s) + + The operation above runs on stream `s` and is ordered with respect to the input + computation. + + Create NumPy ndarrays on the CPU. + + >>> import numpy as np + >>> a = np.random.rand(M, M) + >>> b = np.random.rand(M, N, K) + >>> c = np.random.rand(N, K, M) + + Provide the NumPy ndarrays to :func:`ternary_contraction`, with the result + also being a NumPy ndarray: + + >>> r = nvmath.tensor.ternary_contraction("ij,jkl,klm->im", a, b, c) + + Notes: + - This function is a convenience wrapper around :class:`TernaryContraction` and is + specifically meant for *single* use. + + Further examples can be found in the `nvmath/examples/tensor/contraction + `_ + directory. + """ + if d is None and beta is not None: + raise ValueError("beta can only be set if d is specified in a ternary contraction") + elif d is not None and beta is None: + raise ValueError("beta must be set when d is specified in a ternary contraction") + with TernaryContraction( + expr, a, b, c, d=d, out=out, qualifiers=qualifiers, stream=stream, options=options, execution=execution + ) as contraction: + contraction.plan() + out = contraction.execute(alpha=alpha, beta=beta, stream=stream) + return out diff --git a/pyproject.toml b/pyproject.toml index f6e1165..07e1103 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,23 +9,21 @@ requires = [ "setuptools>=77.0.3", "tomli>=2.0.1; python_version < '3.11'", # whatever version works here, see builder/utils.py for detail - "nvidia-cuda-runtime-cu11", - "nvidia-cuda-nvcc-cu11", + "nvidia-cuda-runtime-cu12", + "nvidia-cuda-nvcc-cu12", # needed for internal/bindings - "cuda-bindings==11.8.7", + "cuda-bindings==12.9.*", # needed for internal/bindings (cuda.bindings.cydriver) - "nvidia-cuda-profiler-api-cu11", + "nvidia-cuda-profiler-api-cu12", ] build-backend = "setuptools.build_meta" [project] name = "nvmath-python" -version = "0.6.0" +version = "0.7.0" dynamic = [ "readme", - "dependencies", - "optional-dependencies" ] requires-python = '>=3.10,<3.14' description = "NVIDIA Math Python libraries" @@ -34,6 +32,13 @@ authors = [ ] license = "Apache-2.0" license-files = ["LICENSE"] +dependencies = [ + "cuda-bindings", + "cuda-core >=0.3.2,<0.4", + "cuda-pathfinder>=1.3.2,<2.0", + "numpy >=1.25,<3", + "pywin32; platform_system=='Windows'" +] classifiers = [ "Development Status :: 4 - Beta", @@ -73,17 +78,256 @@ include = ["nvmath", "nvmath.*"] [tool.setuptools.dynamic] readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } -dependencies = {file = ["requirements/pip/nvmath-python.txt"] } - -[tool.setuptools.dynamic.optional-dependencies] 
-cpu = { file = ["requirements/pip/nvmath-python-cpu.txt"] } -cu11 = { file = ["requirements/pip/nvmath-python-cu11.txt"] } -cu12 = { file = ["requirements/pip/nvmath-python-cu12.txt"] } -cu12-distributed = { file = ["requirements/pip/nvmath-python-cu12.txt", "requirements/pip/nvmath-python-cu12-distributed.txt"] } -dx = { file = ["requirements/pip/nvmath-python-dx.txt"] } -sysctk11 = { file = ["requirements/pip/nvmath-python-sysctk11.txt"] } -sysctk12 = { file = ["requirements/pip/nvmath-python-sysctk12.txt"] } -sysctk12-dx = { file = ["requirements/pip/nvmath-python-sysctk12.txt", "requirements/pip/nvmath-python-sysctk12-dx.txt"] } + +[project.optional-dependencies] +cpu = [ + "mkl; platform_machine=='x86_64'", + "nvpl-fft >=0.3,<1; platform_system=='Linux' and platform_machine=='aarch64'", + "nvpl-blas >=0.3,<1; platform_system=='Linux' and platform_machine=='aarch64'", +] +sysctk12 = [ + "cuda-bindings >=12.9.2,<13", + "cutensor-cu12 >=2.3.1", +] +sysctk13 = [ + "cuda-bindings >=13.0.1,<14", + "cutensor-cu13 >=2.3.1", +] +cu12 = [ + "nvmath-python[sysctk12]", + "cuda-core[cu12]", + "nvidia-cublas-cu12", + "nvidia-cuda-nvrtc-cu12", + "nvidia-cuda-runtime-cu12", + "nvidia-cudss-cu12 == 0.7.*", + "nvidia-cufft-cu12", + "nvidia-curand-cu12", + "nvidia-cusolver-cu12", + "nvidia-cusparse-cu12", + "cutensor-cu12 >=2.3.1", +] +cu13 = [ + "nvmath-python[sysctk13]", + "cuda-core[cu13]", + "cuda-toolkit[cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse]==13.*", + "nvidia-cudss-cu13 == 0.7.*", + "cutensor-cu13 >=2.3.1", +] +dx = [ + # numba-cuda defines version restriction + "numba", + "numba-cuda >= 0.18.1", +] +sysctk12-dx = [ + "nvmath-python[sysctk12]", + "nvmath-python[dx]", +] +cu12-dx = [ + "nvmath-python[cu12]", + "nvmath-python[dx]", + "nvidia-libmathdx-cu12 >=0.2.3,<0.3", + # Extra restrictions: + # Earlier versions have missing header files + "nvidia-cuda-cccl-cu12 > 12.4.127", + # For nvmath.device use of NVRTC. 
[Known bugs exist for 12.4.0, 12.4.1, + # 12.5.0] + "nvidia-cuda-nvrtc-cu12 !=12.4.*, !=12.5.0", +] +sysctk12-distributed = [ + "nvmath-python[sysctk12]", + "mpi4py", + "nvidia-nccl-cu12 >= 2.24.3", +] +cu12-distributed = [ + "nvmath-python[cu12]", + "nvmath-python[sysctk12-distributed]", + "nvidia-cublasmp-cu12 >= 0.6.0", + "nvidia-cufftmp-cu12", + "nvidia-nvshmem-cu12 >= 3.2.5", +] +sysctk13-dx = [ + "nvmath-python[sysctk13]", + "nvmath-python[dx]", +] +cu13-dx = [ + "nvmath-python[cu13]", + "nvmath-python[dx]", + "nvidia-libmathdx-cu13 >=0.2.3,<0.3", + "cuda-toolkit[cccl,nvrtc]==13.*", +] +sysctk13-distributed = [ + "nvmath-python[sysctk13]", + "mpi4py", + "nvidia-nccl-cu13 >= 2.24.3", +] +cu13-distributed = [ + "nvmath-python[cu13]", + "nvmath-python[sysctk13-distributed]", + "nvidia-cublasmp-cu13 >= 0.6.0", + # TODO: add cufftmp ctk13 when available + "nvidia-cufftmp-cu13 >= 12.1.3.2", + "nvidia-nvshmem-cu13 >= 3.2.5", +] + + +[dependency-groups] +cupy-oldest-cu12 = [ + "cupy-cuda12x ==12.1.*", + "numpy ==1.25.*", +] +dev = [ + { include-group = 'tests-cu12' }, + { include-group = 'lint' }, +] +lint = [ + "ruff", + "pre-commit", +] +docs = [ + "breathe", + "enum-tools", + "grip", + "jupyter", + "mpi4py", + "myst-parser", + "nbconvert <7.5", # to enable pandoc CLI < 2.14.2 + "nbsphinx", + "nbsphinx-link", + "nvidia-sphinx-theme", + "pandoc", + "sphinx", + "sphinx-favicon", + "sphinx-toolbox", + "sphinxcontrib-programoutput", +] +build-wheel = [ + "auditwheel", + "build", + "pyproject-validate", + "twine", + "wheel", +] +mpich = [ + "mpich", +] +notebooks-cu126 = [ + "jupyter", + "matplotlib", + "nbconvert", + "torchvision", +] +openmpi = [ + "openmpi", +] +cupy-cu12 = [ + "cupy-cuda12x >= 12.1", +] +cupy-cu13 = [ + "cupy-cuda13x", +] +tests-no-cupy = [ + "cffi", + "hypothesis", + "opt_einsum", + "packaging", + "psutil", + "pytest", + "pytest-repeat", + "pytest-xdist", + "scipy", +] +tests-cu12 = [ + { include-group = 'tests-no-cupy' }, + { include-group = 'cupy-cu12' }, +] +tests-cu13 = [ + { include-group = 'tests-no-cupy' }, + { include-group = 'cupy-cu13' }, +] +tests-dx = [ + "nvidia-mathdx ~= 25.6.0", +] +tests-dx-sysctk12 = [ + { include-group = 'tests-cu12' }, + { include-group = 'tests-dx' }, +] +tests-dx-cu12 = [ + { include-group = 'tests-dx-sysctk12' }, + "nvidia-libmathdx-cu12 < 0.3.0", +] +tests-dx-sysctk13 = [ + { include-group = 'tests-cu13' }, + { include-group = 'tests-dx' }, +] +tests-dx-cu13 = [ + { include-group = 'tests-dx-sysctk13' }, + "nvidia-libmathdx-cu13 < 0.3.0", +] +tests-dx-dev-cu12 = [ + { include-group = 'tests-cu12' }, + { include-group = 'tests-dx' }, + "nvidia-libmathdx-cu12 >= 0.3.0.dev0", +] +tests-dx-dev-cu13 = [ + { include-group = 'tests-cu13' }, + { include-group = 'tests-dx' }, + "nvidia-libmathdx-cu13 >= 0.3.0.dev0" +] +torch-cu121 = [ + "torch ==2.1.*" +] +torch-cu126 = [ + # torch 2.6 is missing a dependency on cudnn? + "nvidia-cudnn-cu12", + "nvidia-cusparselt-cu12", + # torch wheels pin nvjitlink but not related compiler packages. + # However, if packages do not match then lto_callback tests will fail. + # torch wheels depend on nvidia wheels; do not add if testing system ctk + "nvidia-cuda-cccl-cu12 ==12.6.*", + "nvidia-cuda-nvcc-cu12 ==12.6.*", + "nvidia-cuda-nvrtc-cu12 ==12.6.*", + "nvidia-cuda-runtime-cu12 ==12.6.*", + "nvidia-nvjitlink-cu12 ==12.6.*", + "torch >=2.9; platform_system!='Windows'" +] +torch-cu128 = [ + # torch wheels pin nvjitlink but not related compiler packages. 
+ # However, if packages do not match then lto_callback tests will fail. + # torch wheels depend on nvidia wheels; do not add if testing system ctk + "nvidia-cuda-cccl-cu12 ==12.8.*", + "nvidia-cuda-nvcc-cu12 ==12.8.*", + "nvidia-cuda-nvrtc-cu12 ==12.8.*", + "nvidia-cuda-runtime-cu12 ==12.8.*", + "nvidia-nvjitlink-cu12 ==12.8.*", + "torch >=2.9; platform_system!='Windows'" +] +torch-cu129 = [ + # torch wheels pin nvjitlink but not related compiler packages. + # However, if packages do not match then lto_callback tests will fail. + # torch wheels depend on nvidia wheels; do not add if testing system ctk + "nvidia-cuda-cccl-cu12 ==12.9.*", + "nvidia-cuda-nvcc-cu12 ==12.9.*", + "nvidia-cuda-nvrtc-cu12 ==12.9.*", + "nvidia-cuda-runtime-cu12 ==12.9.*", + "nvidia-nvjitlink-cu12 ==12.9.*", + "torch >=2.9; platform_system!='Windows'", + "pytorch_triton >=3.4.0; platform_system!='Windows'", +] +torch-cu130 = [ + # torch wheels pin nvjitlink but not related compiler packages. + # However, if packages do not match then lto_callback tests will fail. + # torch wheels depend on nvidia wheels; do not add if testing system ctk + "cuda-toolkit[cccl,nvcc,nvrtc,cudart,nvjitlink]==13.0", + "torch >=2.9; platform_system!='Windows'", +] +torch-cu130-nightly = [ + # torch wheels pin nvjitlink but not related compiler packages. + # However, if packages do not match then lto_callback tests will fail. + # torch wheels depend on nvidia wheels; do not add if testing system ctk + "cuda-toolkit[cccl,nvcc,nvrtc,cudart,nvjitlink]==13.0", + "torch >=2.10.0.dev20251010; platform_system!='Windows'", + "pytorch_triton; platform_system!='Windows'", +] [[tool.setuptools.ext-modules]] name="nvmath.bindings._internal.utils" @@ -101,10 +345,14 @@ modules = [ "nvmath.bindings.cusparse", "nvmath.bindings.curand", "nvmath.bindings.mathdx", + "nvmath.bindings.nvpl.blas", + "nvmath.bindings.nvpl.fft", ] linux_modules = [ - "nvmath.bindings.nvpl.fft", + "nvmath.bindings.cublasMp", "nvmath.bindings.cufftMp", + "nvmath.bindings.cutensor", + "nvmath.bindings.nccl", "nvmath.bindings.nvshmem", ] internal_modules = [ @@ -117,6 +365,53 @@ internal_modules = [ "nvmath.internal.ndbuffer.package_utils", ] +[tool.uv.sources] +torch = [ + { index = "pytorch-cu121", group='torch-cu121' }, + { index = "pytorch-cu126", group='torch-cu126' }, + { index = "pytorch-cu128", group='torch-cu128' }, + { index = "pytorch-cu129", group='torch-cu129' }, + { index = "pytorch-cu130", group='torch-cu130' }, + { index = "pytorch-cu130-nightly", group='torch-cu130-nightly' }, +] +pytorch_triton = [ + { index = "pytorch-cu129", group='torch-cu129' }, + { index = "pytorch-cu130-nightly", group='torch-cu130-nightly' }, +] +torchvision = [ + { index = "pytorch-cu126", group='notebooks-cu126' }, +] + +[[tool.uv.index]] +name = "pytorch-cu121" +url = "https://download.pytorch.org/whl/cu121" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu126" +url = "https://download.pytorch.org/whl/cu126" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu129" +url = "https://download.pytorch.org/whl/cu129" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu130-nightly" +url = "https://download.pytorch.org/whl/nightly/cu130" +explicit = true + [tool.ruff] line-length = 128 # Don't format autogenerated files diff --git a/requirements/README.md 
b/requirements/README.md deleted file mode 100644 index 2c9e861..0000000 --- a/requirements/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# nvmath-python requirements - -Dependencies are organized with requirements.txt files which can be use to set up -virtualenvs with all required development tools to build docs, run tests, and build -redistributable wheels. Different requirements are necessary for installation with -[pip](https://pip.pypa.io/en/stable/) vs [conda](https://docs.conda.io/en/latest/). - -## Pip: Top-level package requirements files - -Prefer using these `requirements/pip-.txt` files for development in pip managed -virtualenvs. These include all relevant requirements sets and package extras. - -### Pip: Supported configurations for wheels - -| requirements.txt | Extras | Python Support | Platform Support | CUDA | Purpose | -| ---------------- | ------ | ------- | ------- | ----- | ---- | -| `requirements/pip-dev-cu11.txt` | `cu11`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `11.x` | Development environment: ctk-11.x wheels | -| `requirements/pip-dev-cu118-torch.txt` | `cu11`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `11.8` | Development environment: ctk-11.x wheels + torch | -| `requirements/pip-dev-cu12-dx.txt` | `cu12`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `12.x` (latest) | Development environment: ctk-12.x wheels + DX APIs | -| `requirements/pip-dev-cu12[6,8]-dx-torch.txt` | `cu12`, `cpu`, `dx` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `12.[6,8]` | Development environment: ctk-12.x wheels + DX APIs + torch | -| `requirements/pip-dev-sysctk11.txt` | `sysctk11`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `11.x` | Development environment: System CTK-11.x | -| `requirements/pip-dev-sysctk12-dx.txt` |`sysctk12`, `sysctk12-dx`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `12.x` | Development environment: System CTK-12.x + DX APIs | - -### Pip: Development usage - -The requirements files provide dependencies only. The nvmath-python package itself must -also be installed, typically in editable mode for development. Extras are not required to -be specified on the editable install assuming the right requirements.txt has been installed -in virtualenv. - -*Note*: For testing wheel/RPATH support locally, currently it requires to build in the -non-editable mode (no `-e` flag). - -#### Install with pip - -Typically this is done inside a [virtualenv](https://docs.python.org/3/library/venv.html). - -```bash -pip install -r requirements/pip-dev-.txt -pip install -e . -``` - -#### Install with pipenv - -See [pipenv docs](https://pipenv.pypa.io/en/latest/) for reference - -```bash -$ pipenv install -r requirements/pip-dev-.txt -$ pipenv shell -(nvmath-python) $ pip install -e . -``` - -### Pip: Fine-grained requirements - -Requirements for specific functionality are broken out into subsets. These fine-grained -requirements are included by the top-level requirements sets. - -| requirements.txt | Functionality | -| ---------------- | ------- | -| requirements/pip/build-wheel.txt | Utilities to build and validate wheels | -| requirements/pip/docs.txt | Build documentation | -| requirements/pip/mpich.txt | MPICH wheel test dependency. | -| requirements/pip/nvmath-python.txt | nvmath-python core requirements | -| requirements/pip/nvmath-python-cpu.txt | nvmath-python `[cpu]` extra requirements. Enable CPU execution space. | -| requirements/pip/nvmath-python-cu11.txt | nvmath-python `[cu11]` extra requirements. 
Support CUDA-11.x via wheels. | -| requirements/pip/nvmath-python-cu12.txt | nvmath-python `[cu12]` extra requirements. Support CUDA-12.x via wheels. | -| requirements/pip/nvmath-python-cu12-distributed.txt | nvmath-python `[cu12-distributed]` extra requirements. Used for MGMN libraries + MPI. | -| requirements/pip/nvmath-python-dx.txt | nvmath-python `[dx]` extra requirements. Enable device APIs. | -| requirements/pip/nvmath-python-sysctk11.txt | nvmath-python `[systemctk11]` extra requirements. Used for system installed CTK-11.x | -| requirements/pip/nvmath-python-sysctk12.txt | nvmath-python `[systemctk12]` extra requirements. Used for system installed CTK-12.x | -| requirements/pip/nvmath-python-sysctk12-dx.txt | nvmath-python `[systemctk12-dx]` extra requirements. Used for `nvmath.device` with system installed CTK-12.x | -| requirements/pip/openmpi.txt | OpenMPI wheel test dependency. | -| requirements/pip/tests.txt | Test dependencies | -| requirements/pip/torch-cu118.txt | Enable torch use in tests and examples via wheels for CUDA-11.8 | -| requirements/pip/torch-cu126.txt | Enable torch use in tests and examples via wheels for CUDA-12.6 | -| requirements/pip/torch-cu128.txt | Enable torch use in tests and examples via wheels for CUDA-12.8 | -| requirements/pip/torch-cu129-nightly.txt | Enable torch nightly + CTK-12.9 wheels | diff --git a/requirements/pip-dev-cu11-torch.txt b/requirements/pip-dev-cu11-torch.txt deleted file mode 100644 index a4a041b..0000000 --- a/requirements/pip-dev-cu11-torch.txt +++ /dev/null @@ -1,6 +0,0 @@ --r pip/docs.txt --r pip/tests.txt --r pip/nvmath-python.txt --r pip/nvmath-python-cpu.txt --r pip/nvmath-python-cu11.txt --r pip/torch-cu118.txt diff --git a/requirements/pip-dev-cu11.txt b/requirements/pip-dev-cu11.txt deleted file mode 100644 index bbbff1c..0000000 --- a/requirements/pip-dev-cu11.txt +++ /dev/null @@ -1,5 +0,0 @@ --r pip/docs.txt --r pip/tests.txt --r pip/nvmath-python.txt --r pip/nvmath-python-cpu.txt --r pip/nvmath-python-cu11.txt diff --git a/requirements/pip-dev-cu12-dx-torch.txt b/requirements/pip-dev-cu12-dx-torch.txt deleted file mode 100644 index 913909b..0000000 --- a/requirements/pip-dev-cu12-dx-torch.txt +++ /dev/null @@ -1,7 +0,0 @@ --r pip/docs.txt --r pip/tests.txt --r pip/nvmath-python.txt --r pip/nvmath-python-cpu.txt --r pip/nvmath-python-cu12.txt --r pip/nvmath-python-dx.txt --r pip/torch-cu128.txt diff --git a/requirements/pip-dev-cu12-dx.txt b/requirements/pip-dev-cu12-dx.txt deleted file mode 100644 index 5090140..0000000 --- a/requirements/pip-dev-cu12-dx.txt +++ /dev/null @@ -1,6 +0,0 @@ --r pip/docs.txt --r pip/tests.txt --r pip/nvmath-python.txt --r pip/nvmath-python-cpu.txt --r pip/nvmath-python-cu12.txt --r pip/nvmath-python-dx.txt diff --git a/requirements/pip-dev-sysctk11.txt b/requirements/pip-dev-sysctk11.txt deleted file mode 100644 index dda2cdc..0000000 --- a/requirements/pip-dev-sysctk11.txt +++ /dev/null @@ -1,4 +0,0 @@ --r pip/docs.txt --r pip/tests.txt --r pip/nvmath-python.txt --r pip/nvmath-python-sysctk11.txt diff --git a/requirements/pip-dev-sysctk12-dx.txt b/requirements/pip-dev-sysctk12-dx.txt deleted file mode 100644 index ff3d4e6..0000000 --- a/requirements/pip-dev-sysctk12-dx.txt +++ /dev/null @@ -1,5 +0,0 @@ --r pip/docs.txt --r pip/tests.txt --r pip/nvmath-python.txt --r pip/nvmath-python-sysctk12.txt --r pip/nvmath-python-sysctk12-dx.txt diff --git a/requirements/pip/build-wheel.txt b/requirements/pip/build-wheel.txt deleted file mode 100644 index 301545e..0000000 --- 
a/requirements/pip/build-wheel.txt +++ /dev/null @@ -1,5 +0,0 @@ -auditwheel -build -pyproject-validate -twine -wheel diff --git a/requirements/pip/docs.txt b/requirements/pip/docs.txt deleted file mode 100644 index dd39e8a..0000000 --- a/requirements/pip/docs.txt +++ /dev/null @@ -1,14 +0,0 @@ -breathe -enum-tools -grip -jupyter -mpi4py -myst-parser -nbsphinx -nbsphinx-link -nvidia-sphinx-theme -pandoc -sphinx -sphinx-favicon -sphinx-toolbox -sphinxcontrib-programoutput diff --git a/requirements/pip/mpich.txt b/requirements/pip/mpich.txt deleted file mode 100644 index 3bfcf2e..0000000 --- a/requirements/pip/mpich.txt +++ /dev/null @@ -1 +0,0 @@ -mpich diff --git a/requirements/pip/notebooks.txt b/requirements/pip/notebooks.txt deleted file mode 100644 index 723c4dc..0000000 --- a/requirements/pip/notebooks.txt +++ /dev/null @@ -1,4 +0,0 @@ -jupyter -matplotlib -nbconvert -torchvision diff --git a/requirements/pip/nvmath-python-cpu.txt b/requirements/pip/nvmath-python-cpu.txt deleted file mode 100644 index 62e8218..0000000 --- a/requirements/pip/nvmath-python-cpu.txt +++ /dev/null @@ -1,3 +0,0 @@ -cuda-core >=0.3.2,<0.4 -mkl; platform_machine=="x86_64" -nvpl-fft ~= 0.3; platform_system=="Linux" and platform_machine=="aarch64" diff --git a/requirements/pip/nvmath-python-cu11.txt b/requirements/pip/nvmath-python-cu11.txt deleted file mode 100644 index e3ed0eb..0000000 --- a/requirements/pip/nvmath-python-cu11.txt +++ /dev/null @@ -1,9 +0,0 @@ -cuda-bindings>=11.8.7,<12 -cuda-core[cu11]==0.3.2 # last supported version for ctk11 -cupy-cuda11x -nvidia-cublas-cu11 -nvidia-cuda-nvrtc-cu11 -nvidia-cufft-cu11 -nvidia-curand-cu11 -nvidia-cusolver-cu11 -nvidia-cusparse-cu11 diff --git a/requirements/pip/nvmath-python-cu12-distributed.txt b/requirements/pip/nvmath-python-cu12-distributed.txt deleted file mode 100644 index 3267d3c..0000000 --- a/requirements/pip/nvmath-python-cu12-distributed.txt +++ /dev/null @@ -1,4 +0,0 @@ -mpi4py -nvidia-cublasmp-cu12 >= 0.4.0 -nvidia-cufftmp-cu12 -nvidia-nvshmem-cu12 >= 3.2.5 diff --git a/requirements/pip/nvmath-python-cu12-no-cupy.txt b/requirements/pip/nvmath-python-cu12-no-cupy.txt deleted file mode 100644 index 36ae240..0000000 --- a/requirements/pip/nvmath-python-cu12-no-cupy.txt +++ /dev/null @@ -1,10 +0,0 @@ -cuda-bindings>=12.9.1,<13 -cuda-core[cu12] >=0.3.2,<0.4 -nvidia-cublas-cu12 -nvidia-cuda-nvrtc-cu12 -nvidia-cuda-runtime-cu12 -nvidia-cudss-cu12 == 0.5.0.16 -nvidia-cufft-cu12 -nvidia-curand-cu12 -nvidia-cusolver-cu12 -nvidia-cusparse-cu12 diff --git a/requirements/pip/nvmath-python-cu12.txt b/requirements/pip/nvmath-python-cu12.txt deleted file mode 100644 index d8111a7..0000000 --- a/requirements/pip/nvmath-python-cu12.txt +++ /dev/null @@ -1,11 +0,0 @@ -cuda-bindings>=12.9.1,<13 -cuda-core[cu12] >=0.3.2,<0.4 -cupy-cuda12x -nvidia-cublas-cu12 -nvidia-cuda-nvrtc-cu12 -nvidia-cuda-runtime-cu12 -nvidia-cudss-cu12 == 0.5.0.16 -nvidia-cufft-cu12 -nvidia-curand-cu12 -nvidia-cusolver-cu12 -nvidia-cusparse-cu12 diff --git a/requirements/pip/nvmath-python-dx.txt b/requirements/pip/nvmath-python-dx.txt deleted file mode 100644 index 51d342e..0000000 --- a/requirements/pip/nvmath-python-dx.txt +++ /dev/null @@ -1,10 +0,0 @@ -cuda-bindings>=12.9.1,<13 -cuda-core[cu12] >=0.3.2,<0.4 -cupy-cuda12x -numba # numba-cuda defines version restriction -numba-cuda >= 0.18.1 -nvidia-cuda-cccl-cu12 > 12.4.127 # Earlier versions have missing header files -nvidia-cuda-nvcc-cu12 # For numba use of libnvvm.so -nvidia-cuda-nvrtc-cu12 !=12.4.*, !=12.5.0 # For nvmath.device use of 
NVRTC. [Known bugs exist for 12.4.0, 12.4.1, 12.5.0] -# getting cuda headers from nvidia-cuda-runtime-cu12 at nvamth-python-cu12.txt -nvidia-libmathdx-cu12 >=0.2.3,<0.3 diff --git a/requirements/pip/nvmath-python-sysctk-distributed.txt b/requirements/pip/nvmath-python-sysctk-distributed.txt deleted file mode 100644 index 66c5ba0..0000000 --- a/requirements/pip/nvmath-python-sysctk-distributed.txt +++ /dev/null @@ -1 +0,0 @@ -mpi4py diff --git a/requirements/pip/nvmath-python-sysctk11.txt b/requirements/pip/nvmath-python-sysctk11.txt deleted file mode 100644 index a37e643..0000000 --- a/requirements/pip/nvmath-python-sysctk11.txt +++ /dev/null @@ -1,3 +0,0 @@ -cuda-bindings>=11.8.7,<12 -cuda-core==0.3.2 # last supported version for ctk11 -cupy-cuda11x diff --git a/requirements/pip/nvmath-python-sysctk12-dx.txt b/requirements/pip/nvmath-python-sysctk12-dx.txt deleted file mode 100644 index d9ab9b3..0000000 --- a/requirements/pip/nvmath-python-sysctk12-dx.txt +++ /dev/null @@ -1,2 +0,0 @@ -numba # numba-cuda defines version restriction -numba-cuda >= 0.18.1 diff --git a/requirements/pip/nvmath-python-sysctk12.txt b/requirements/pip/nvmath-python-sysctk12.txt deleted file mode 100644 index 433c163..0000000 --- a/requirements/pip/nvmath-python-sysctk12.txt +++ /dev/null @@ -1,3 +0,0 @@ -cuda-bindings >=12.9.1,<13 -cuda-core >=0.3.2,<0.4 -cupy-cuda12x diff --git a/requirements/pip/nvmath-python.txt b/requirements/pip/nvmath-python.txt deleted file mode 100644 index f2395f3..0000000 --- a/requirements/pip/nvmath-python.txt +++ /dev/null @@ -1,5 +0,0 @@ -cuda-bindings -cuda-core >=0.3.2,<0.4 -cuda-pathfinder>=1.2.1,<2.0 -numpy >=1.25,<3 -pywin32; platform_system=="Windows" diff --git a/requirements/pip/openmpi.txt b/requirements/pip/openmpi.txt deleted file mode 100644 index 6bd6ad4..0000000 --- a/requirements/pip/openmpi.txt +++ /dev/null @@ -1 +0,0 @@ -openmpi diff --git a/requirements/pip/tests-dx-dev.txt b/requirements/pip/tests-dx-dev.txt deleted file mode 100644 index d38c0f6..0000000 --- a/requirements/pip/tests-dx-dev.txt +++ /dev/null @@ -1,2 +0,0 @@ -# These are dependencies to test against dev version of libmathdx -nvidia-libmathdx-cu12 >= 0.2.4.dev0 diff --git a/requirements/pip/tests-dx.txt b/requirements/pip/tests-dx.txt deleted file mode 100644 index 7264f16..0000000 --- a/requirements/pip/tests-dx.txt +++ /dev/null @@ -1,2 +0,0 @@ -cuda-cccl >= 0.1.3.1.0.dev1486; python_version >="3.10" and python_version <= "3.13" and sys_platform == "linux" and platform_machine == "x86_64" # for examples -nvidia-mathdx ~= 25.6.0 # for device performance testing diff --git a/requirements/pip/tests.txt b/requirements/pip/tests.txt deleted file mode 100644 index 23f902d..0000000 --- a/requirements/pip/tests.txt +++ /dev/null @@ -1,6 +0,0 @@ -cffi -hypothesis -psutil -pytest -pytest-repeat -scipy diff --git a/requirements/pip/torch-cu118.txt b/requirements/pip/torch-cu118.txt deleted file mode 100644 index c8b81d2..0000000 --- a/requirements/pip/torch-cu118.txt +++ /dev/null @@ -1,4 +0,0 @@ -# pytorch >=2.3 to ensure numpy 1/2 compatibility -# torch wheels depend on nvidia wheels; do not add if testing system ctk -torch >=2.3; platform_system!="Windows" -#pipenv install "torch>=2.3" --index=https://download.pytorch.org/whl/cu118/ diff --git a/requirements/pip/torch-cu126.txt b/requirements/pip/torch-cu126.txt deleted file mode 100644 index b7e76d9..0000000 --- a/requirements/pip/torch-cu126.txt +++ /dev/null @@ -1,11 +0,0 @@ -# torch wheels pin nvjitlink but not related compiler packages. 
-# However, if packages do not match then lto_callback tests will fail -nvidia-cuda-cccl-cu12 ==12.6.* -nvidia-cuda-nvcc-cu12 ==12.6.* -nvidia-cuda-nvrtc-cu12 ==12.6.* -nvidia-cuda-runtime-cu12 ==12.6.* -nvidia-nvjitlink-cu12 ==12.6.* -# pytorch >=2.3 to ensure numpy 1/2 compatibility -# torch wheels depend on nvidia wheels; do not add if testing system ctk -torch >=2.3; platform_system!="Windows" -#pipenv install "torch>=2.3" --index=https://download.pytorch.org/whl/cu126/ diff --git a/requirements/pip/torch-cu128.txt b/requirements/pip/torch-cu128.txt deleted file mode 100644 index 2fdf15d..0000000 --- a/requirements/pip/torch-cu128.txt +++ /dev/null @@ -1,11 +0,0 @@ -# torch wheels pin nvjitlink but not related compiler packages. -# However, if packages do not match then lto_callback tests will fail -nvidia-cuda-cccl-cu12 ==12.8.* -nvidia-cuda-nvcc-cu12 ==12.8.* -nvidia-cuda-nvrtc-cu12 ==12.8.* -nvidia-cuda-runtime-cu12 ==12.8.* -nvidia-nvjitlink-cu12 ==12.8.* -# pytorch >=2.3 to ensure numpy 1/2 compatibility -# torch wheels depend on nvidia wheels; do not add if testing system ctk -torch >=2.3; platform_system!="Windows" -#pipenv install "torch>=2.3" --index=https://download.pytorch.org/whl/cu128/ diff --git a/requirements/pip/torch-cu129-nightly.txt b/requirements/pip/torch-cu129-nightly.txt deleted file mode 100644 index e8def9a..0000000 --- a/requirements/pip/torch-cu129-nightly.txt +++ /dev/null @@ -1,12 +0,0 @@ -# torch wheels pin nvjitlink but not related compiler packages. -# However, if packages do not match then lto_callback tests will fail -nvidia-cuda-cccl-cu12 ==12.9.* -nvidia-cuda-nvcc-cu12 ==12.9.* -nvidia-cuda-nvrtc-cu12 ==12.9.* -nvidia-cuda-runtime-cu12 ==12.9.* -nvidia-nvjitlink-cu12 ==12.9.* -# pytorch >=2.3 to ensure numpy 1/2 compatibility -# torch wheels depend on nvidia wheels; do not add if testing system ctk -# In order to install torch nightly, we need to specify the index url for both torch and triton -# Using a specific torch version makes solving faster -#pipenv install "torch==2.9.0.dev20250813+cu129" "pytorch_triton==3.4.0+gitf7888497" --index=https://download.pytorch.org/whl/nightly/cu129 diff --git a/setup.py b/setup.py index 68cc787..31f75fb 100644 --- a/setup.py +++ b/setup.py @@ -83,6 +83,12 @@ def get_ext_modules() -> list[Extension]: return ext_modules + ext_nvmath_internal_modules +# WAR: cython compilation +# https://github.com/cython/cython/issues/7122#issuecomment-3240416121 +# TODO: remove with next cython release (3.1.4+) +sys.setrecursionlimit(50000) + + nthreads = os.cpu_count() setup( ext_modules=cythonize( diff --git a/tests/diagnostics/comm_diagnostic.py b/tests/diagnostics/comm_diagnostic.py new file mode 100644 index 0000000..4bff6d9 --- /dev/null +++ b/tests/diagnostics/comm_diagnostic.py @@ -0,0 +1,109 @@ +import argparse +import cuda.core.experimental as ccx +import socket +import os +import warnings + +from dataclasses import dataclass +from mpi4py import MPI + +import numpy as np +from nvmath.bindings import nccl + + +def initialize_nccl(comm, rank, nranks): + # Create NCCL communicator. + unique_id = nccl.UniqueId() + if rank == 0: + nccl.get_unique_id(unique_id.ptr) + # PE 0 broadcasts the unique ID. + comm.Bcast(unique_id._data.view(np.int8), root=0) + nccl_comm = nccl.comm_init_rank(nranks, unique_id.ptr, rank) + return nccl_comm + + +def set_device(rank): + device_id = rank % num_devices + device = ccx.Device(device_id) + device.set_current() + + +@dataclass +class HostInfo: + # Number of devices on this host. 
+ num_devices: int = 0 + # Number of processes on this host. + num_procs: int = 0 + + +parser = argparse.ArgumentParser(description="MPI and NCCL diagnostic tool") +parser.add_argument("--nccl", action="store_true", help="Diagnose NCCL") +args = parser.parse_args() +use_nccl = args.nccl + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +num_devices = ccx.system.num_devices + +if nranks < 2: + raise RuntimeError( + "You need to run with multiple processes to take advantage of MGMN. Try running " + f"this script with `mpiexec -n $num_procs python {os.path.basename(__file__)}`" + ) + +cluster_info = comm.allgather((socket.gethostname(), num_devices)) + +# This check is probably unnecessary because if MPI isn't working, the allgather above +# should fail. +if cluster_info is None or len(cluster_info) != nranks: + raise RuntimeError("MPI is not working (did not get information from every process)") + +# Construct a map of number of processes and devices per host. +host_info = {} +for hostname, num_devices in cluster_info: + if hostname not in host_info: + host_info[hostname] = HostInfo(num_devices=num_devices, num_procs=1) + else: + if host_info[hostname].num_devices != num_devices: + raise RuntimeError(f"Processes on host {hostname} are not reporting the same device count") + host_info[hostname].num_procs += 1 + +suboptimal_hosts = [] +if rank == 0: + print("\n========== Host info ==========") +for hostname, info in host_info.items(): + if rank == 0: + print(f"* Host {hostname}: {info}") + if info.num_devices < info.num_procs: + suboptimal_hosts.append(hostname) +if rank == 0: + print("") + +comm.Barrier() + +if suboptimal_hosts: + if use_nccl: + raise RuntimeError( + "NCCL doesn't allow multiple processes per GPU: run the same number of processes " + "on each host as number of local GPUs." + ) + elif rank == 0: + warnings.warn( + f"The setup is suboptimal in the following hosts: {suboptimal_hosts}. An optimal " + "setup requires a CUDA device uniquely assigned to a single process." + ) + +if use_nccl: + set_device(rank) + nccl_comm = initialize_nccl(comm, rank, nranks) + if rank == 0: + print("NCCL initialized") + nccl.comm_destroy(nccl_comm) + if rank == 0: + print("NCCL communicator destroyed") + +if rank == 0: + print("\nMPI test passed") + if use_nccl: + print("NCCL test passed") diff --git a/tests/diagnostics/mpi_diag.py b/tests/diagnostics/mpi_diag.py deleted file mode 100644 index a89438f..0000000 --- a/tests/diagnostics/mpi_diag.py +++ /dev/null @@ -1,64 +0,0 @@ -import socket -import cuda.core.experimental -import os -import warnings - -from dataclasses import dataclass -from mpi4py import MPI - - -@dataclass -class HostInfo: - num_devices: int = 0 - num_procs: int = 0 - - -comm = MPI.COMM_WORLD -rank = comm.Get_rank() -nranks = comm.Get_size() -num_devices = cuda.core.experimental.system.num_devices - -if nranks < 2: - raise RuntimeError( - "You need to run with multiple processes to take advantage of MGMN. Try running " - f"this script with `mpiexec -n $num_procs python {os.path.basename(__file__)}`" - ) - -cluster_info = comm.gather((socket.gethostname(), num_devices)) - -# NOTE: do no more communication after this point due to rank 0 being the only -# one raising exceptions. - -if rank == 0: - if len(cluster_info) != nranks: - # Note: if MPI gather failed, it is more likely that the comm.gather - # raised an exception. 
- raise RuntimeError("MPI is not working (did not get information from every process)") - - # Construct a map of number of processes and devices per host. - host_info = {} - for hostname, num_devices in cluster_info: - if hostname not in host_info: - host_info[hostname] = HostInfo(num_devices=num_devices, num_procs=1) - else: - if host_info[hostname].num_devices != num_devices: - raise RuntimeError(f"Processes on host {hostname} are not reporting the same device count") - host_info[hostname].num_procs += 1 - - suboptimal_hosts = [] - print("\n========== Host info ==========") - for hostname, info in host_info.items(): - print(f"- Host {hostname}: num_procs={info.num_procs} num_devices={info.num_devices}") - if info.num_devices < info.num_procs: - suboptimal_hosts.append(hostname) - print("") - - if len(suboptimal_hosts) > 0: - warnings.warn( - f"The setup is suboptimal in the following hosts: {suboptimal_hosts}. An optimal " - "setup requires a CUDA device uniquely assigned to a single process." - ) - else: - print("No issues found with the current MPI setup for running MGMN operations with nvmath.distributed") - - print("\nMPI test passed") diff --git a/tests/example_tests/device_tests/test_device_samples.py b/tests/example_tests/device_tests/test_device_samples.py index a0fca86..c89102d 100644 --- a/tests/example_tests/device_tests/test_device_samples.py +++ b/tests/example_tests/device_tests/test_device_samples.py @@ -22,6 +22,11 @@ def test_sample(self, sample): # spec = importlib.util.find_spec("cuda.cccl") # if spec is None: pytest.skip("Skipping test for cublasdx_fp64_emulation.py, requires cuda.cccl module") + if os.path.basename(sample) == "cublasdx_simple_partition.py": + from nvmath.bindings import mathdx + + if mathdx.get_version_ex() < (0, 3, 0): + pytest.skip("Partition is supported on libmathdx 0.3.0+") if os.path.basename(sample) == "cublasdx_gemm_fft_fp16.py": pytest.skip("NVBug 5218000") run_sample(samples_path, sample, {"__name__": "__main__"}) diff --git a/tests/example_tests/matmul_tests/test_generic_matmul_samples.py b/tests/example_tests/matmul_tests/test_generic_matmul_samples.py new file mode 100644 index 0000000..d694202 --- /dev/null +++ b/tests/example_tests/matmul_tests/test_generic_matmul_samples.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import glob +import os +import re + +import pytest + +import cuda.core.experimental as ccx + +from nvmath import bindings +from ..test_utils import run_sample + + +samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples", "linalg", "generic", "matmul") +sample_files = glob.glob(samples_path + "**/*.py", recursive=True) + +# Handle MPI tests separately. 
+mpi_re = r".*_mpi[_]?.*\.py" +sample_files = list(filter(lambda f: not re.search(mpi_re, f), sample_files)) + +min_cublas_version = {} + +min_cc = {} + +test_requires_nvpl = { + "example04_stateful_torch_cpu.py": True, +} + +cublas_version = bindings.cublasLt.get_version() +device_properties = ccx.Device().properties +cc = (device_properties.compute_capability_major, device_properties.compute_capability_minor) + +try: + from nvmath.bindings.nvpl.blas import get_version + from nvmath.bindings._internal.utils import FunctionNotFoundError + + get_version() + del get_version + NVPL_AVAILABLE = True +except FunctionNotFoundError as e: + if "function nvpl_blas_get_version is not found" not in str(e): + raise e + # An NVPL alternative was loaded which doesn't implement nvpl_blas_get_version + NVPL_AVAILABLE = True +except RuntimeError as e: + if "Failed to dlopen all of the following libraries" not in str(e): + raise e + # Neither NVPL or an alternative was loaded + NVPL_AVAILABLE = False + + +@pytest.mark.parametrize("sample", sample_files) +class TestMatmulSamples: + def test_sample(self, sample): + filename = os.path.basename(sample) + required_cublas_version = min_cublas_version.get(filename, 0) + if cublas_version < required_cublas_version: + pytest.skip(f"cublas version {cublas_version} lower than required ({required_cublas_version})") + required_cc = min_cc.get(filename, (0, 0)) + if cc < required_cc: + pytest.skip(f"compute capability {cc} lower than required {required_cc}") + nvpl_required = test_requires_nvpl.get(filename, False) + if nvpl_required and not NVPL_AVAILABLE: + pytest.skip("NVPL is required, but not available.") + run_sample(samples_path, sample) diff --git a/tests/example_tests/tensor_tests/__init__.py b/tests/example_tests/tensor_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/example_tests/tensor_tests/test_contraction_samples.py b/tests/example_tests/tensor_tests/test_contraction_samples.py new file mode 100644 index 0000000..cf2b91c --- /dev/null +++ b/tests/example_tests/tensor_tests/test_contraction_samples.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import glob +import os + +import pytest + +import cuda.core.experimental as ccx + +from nvmath.bindings import cutensor +from nvmath.bindings._internal.utils import NotSupportedError, FunctionNotFoundError + +from ..test_utils import run_sample + + +try: + cutensor.get_version() + HAS_CUTENSOR = True +except (NotSupportedError, FunctionNotFoundError, RuntimeError): + HAS_CUTENSOR = False + +samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples", "tensor", "contraction") +sample_files = glob.glob(samples_path + "**/*.py", recursive=True) + + +@pytest.mark.skipif(not HAS_CUTENSOR, reason="cuTensor is not available") +@pytest.mark.skipif(ccx.Device().compute_capability <= (7, 0), reason="cuTensor 2.3.1+ requires compute capability > 7.0") +@pytest.mark.parametrize("sample", sample_files) +class TestContractionSamples: + def test_sample(self, sample): + run_sample(samples_path, sample) diff --git a/tests/example_tests/test_utils.py b/tests/example_tests/test_utils.py index f1027a8..101a1e2 100644 --- a/tests/example_tests/test_utils.py +++ b/tests/example_tests/test_utils.py @@ -58,8 +58,17 @@ def run_sample(samples_path, filename, env=None, use_subprocess=False, use_mpi=F # Check if the filename indicates with how many processes to run, for example: # `example_something_4p.py` is to be run with 4 processes. m = re.search(r".*_(\d+)p.py$", filename) + uses_nccl = "distributed/linalg/advanced/matmul" in fullpath if m: num_procs = m.group(1) + if uses_nccl and int(num_procs) > DEVICE_COUNT: + pytest.skip( + f"This test requires {num_procs} processes but NCCL only allows one " + f"process per GPU and there are {DEVICE_COUNT} GPUs" + ) + elif uses_nccl: + # NCCL only allows one process per GPU. + num_procs = str(DEVICE_COUNT) else: # Run with 2 processes by default. 
num_procs = "2" diff --git a/tests/nvmath_tests/device/helpers.py b/tests/nvmath_tests/device/helpers.py index 5a2b418..154b63f 100644 --- a/tests/nvmath_tests/device/helpers.py +++ b/tests/nvmath_tests/device/helpers.py @@ -13,7 +13,7 @@ import pytest from nvmath.device import CodeType, ComputeCapability -from nvmath.device.common_cuda import MAX_SUPPORTED_CC, get_default_code_type +from nvmath.device.common_cuda import MAX_SUPPORTED_CC, current_device_sm, get_default_code_type from nvmath._utils import get_nvrtc_version @@ -37,14 +37,13 @@ def CHECK_NVRTC(err, prog): raise RuntimeError(f"NVRTC error: {log.decode('ascii')}") -def set_device(): +def set_device() -> ComputeCapability: (err,) = cudart.cudaSetDevice(0) CHECK_CUDART(err) - err, prop = cudart.cudaGetDeviceProperties(0) - CHECK_CUDART(err) - if (prop.major, prop.minor) > MAX_SUPPORTED_CC: - return MAX_SUPPORTED_CC - return (prop.major, prop.minor) + cc = current_device_sm() + if cc > MAX_SUPPORTED_CC: + cc = MAX_SUPPORTED_CC + return cc def random_complex(shape, real_dtype, order="C", module=np) -> np.ndarray: diff --git a/tests/nvmath_tests/device/helpers_cpp.py b/tests/nvmath_tests/device/helpers_cpp.py index f0a26bf..2b4b9ad 100644 --- a/tests/nvmath_tests/device/helpers_cpp.py +++ b/tests/nvmath_tests/device/helpers_cpp.py @@ -6,7 +6,7 @@ from cuda.bindings import runtime as cudart, nvrtc, driver as cudadrv from nvmath._utils import PLATFORM_LINUX, PLATFORM_WIN -from nvmath.device.common_mathdx import CUDA_HOME as _CUDA_HOME +from cuda import pathfinder from importlib.metadata import files, PackageNotFoundError from .helpers import CHECK_CUDA, CHECK_CUDART, CHECK_NVRTC, make_args, get_unsigned @@ -64,7 +64,7 @@ def run_and_time(kernel, grid_dim, block_dim, shared_memory_size, ncycles, *args def compile_cpp_kernel(cpp, mangled): - print(f"compile_cpp_kernel CUDA_HOME = {_CUDA_HOME}, MATHDX_HOME = {_MATHDX_HOME}") + print(f"compile_cpp_kernel MATHDX_HOME = {_MATHDX_HOME}") err, prop = cudart.cudaGetDeviceProperties(0) CHECK_CUDART(err) @@ -72,8 +72,8 @@ def compile_cpp_kernel(cpp, mangled): opts = ( [b"--std=c++17", b"--device-as-default-execution-space", b"-DCUFFTDX_DETAIL_USE_CUDA_STL=1"] - + [bytes(f"--include-path={h}/include", encoding="ascii") for h in _CUDA_HOME] - + [bytes(f"--include-path={h}/include/cccl", encoding="ascii") for h in _CUDA_HOME] + + [bytes(f"--include-path={pathfinder.find_nvidia_header_directory('cudart')}", encoding="ascii")] + + [bytes(f"--include-path={pathfinder.find_nvidia_header_directory('cccl')}", encoding="ascii")] + [ bytes(f"--include-path={_MATHDX_HOME}/include", encoding="ascii"), bytes(f"--include-path={_MATHDX_HOME}/include/cufftdx", encoding="ascii"), diff --git a/tests/nvmath_tests/device/numba_conv.py b/tests/nvmath_tests/device/numba_conv.py index 22a9b0e..dea1a88 100644 --- a/tests/nvmath_tests/device/numba_conv.py +++ b/tests/nvmath_tests/device/numba_conv.py @@ -5,7 +5,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft, float32x2_type, float64x2_type +from nvmath.device import fft from .helpers import _TOLERANCE, l2error import cupy import time @@ -60,9 +60,9 @@ def __init__(self, size, precision, fft_type, ffts_per_block, elements_per_threa assert FWD.ffts_per_block == ffts_per_block assert FWD.elements_per_thread == elements_per_thread if precision == np.float32: - assert complex_type == float32x2_type + assert complex_type == np.dtype(np.complex64) else: - assert complex_type == float64x2_type + assert complex_type == np.dtype(np.complex128) assert 
all(code.endswith(".ltoir") for code in FWD.files + INV.files) @cuda.jit(link=FWD.files + INV.files) diff --git a/tests/nvmath_tests/device/test_cublasdx_generic.py b/tests/nvmath_tests/device/test_cublasdx_generic.py index ea6a1ca..3f5b4e7 100644 --- a/tests/nvmath_tests/device/test_cublasdx_generic.py +++ b/tests/nvmath_tests/device/test_cublasdx_generic.py @@ -12,10 +12,11 @@ matmul, TransposeMode, LeadingDimension, - BlasOptions, + Matmul, ) -from nvmath.device.common_cuda import MAX_SUPPORTED_CC, get_default_code_type -from nvmath.device.cublasdx import BlasCompiled, BlasOptionsComplete, SharedStorageCalc +from nvmath.device.types import complex32, complex64, complex128 +from nvmath.device.common_cuda import MAX_SUPPORTED_CC +from nvmath.device.cublasdx import SharedStorageCalc, compile_blas_execute import functools import pytest import itertools @@ -59,12 +60,13 @@ def test_third_party_symbol(execute_api): data_type="real", precision=np.float64, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM75, + sm=SM75.cc, execution="Block", - execute_api=execute_api, ) - assert len(MM.symbol) > 0 + _, symbol = compile_blas_execute(MM, code_type=SM75, execute_api=execute_api) + + assert len(symbol) > 0 def test_third_party_code(): @@ -73,23 +75,21 @@ def test_third_party_code(): data_type="real", precision=np.float32, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM75, + sm=SM75.cc, execution="Block", ) - assert len(MM.codes) > 0 - for code in MM.codes: - print(code.code_type, code.isa_version) - assert isinstance(MM, BlasCompiled) + code, _ = compile_blas_execute(MM, code_type=SM75, execute_api="static_leading_dimensions") + + assert isinstance(MM, Matmul) assert MM.size == (16, 8, 16) - assert all(f.endswith(".ltoir") for f in MM.files) - assert all(code.isa_version.major >= 12 for code in MM.codes) - assert all(code.isa_version.minor >= 0 for code in MM.codes) - assert all(code.code_type.cc.major == 7 for code in MM.codes) - assert all(code.code_type.cc.minor == 5 for code in MM.codes) - assert all(code.code_type.kind == "lto" for code in MM.codes) - assert all(isinstance(code.data, bytes) for code in MM.codes) - assert all(len(code.data) > 0 for code in MM.codes) + assert code.isa_version.major >= 12 + assert code.isa_version.minor >= 0 + assert code.code_type.cc.major == 7 + assert code.code_type.cc.minor == 5 + assert code.code_type.kind == "lto" + assert isinstance(code.data, bytes) + assert len(code.data) > 0 assert MM.max_threads_per_block <= 1024 @@ -125,47 +125,36 @@ def test_transpose_mode(ta, tb): def test_suggested_block_dim(): - BO = BlasOptions( + MM = Matmul( size=(16, 8, 16), data_type="real", precision=np.float32, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM75, + sm=SM75.cc, execution="Block", block_dim="suggested", ) # leading_dimension = None implicit - # block_dim = suggested --> Dim3 - # leading_dimension = None --> None - assert isinstance(BO, BlasOptions) - assert isinstance(BO.block_dim, Dim3) - assert BO.leading_dimension is None - assert BO.size == (16, 8, 16) - assert BO.block_dim[0] * BO.block_dim[1] * BO.block_dim[2] >= 1 - - MM = BO.create() - - # block_dim --> Dim3 (same as above) - # leading_dimension --> LeadingDimension (takes a default value) - assert isinstance(MM, BlasCompiled) + assert isinstance(MM, Matmul) assert isinstance(MM.block_dim, Dim3) - assert MM.block_dim == BO.block_dim + assert MM.size == (16, 8, 16) + assert MM.block_dim[0] * MM.block_dim[1] * 
MM.block_dim[2] >= 1 assert isinstance(MM.leading_dimension, LeadingDimension) def test_suggested_leading_dimension(): - BO = BlasOptions( + BO = Matmul( size=(16, 8, 16), data_type="real", precision=np.float32, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM89, + sm=SM89.cc, block_size=64, execution="Block", leading_dimension="suggested", ) - assert isinstance(BO, BlasOptions) + assert isinstance(BO, Matmul) assert isinstance(BO.leading_dimension, LeadingDimension) assert isinstance(BO.block_dim, Dim3) @@ -175,7 +164,7 @@ def test_suggested_leading_dimension(): assert BO.leading_dimension.c >= 1 MM = BO.create() - assert isinstance(MM, BlasCompiled) + assert isinstance(MM, Matmul) assert isinstance(MM.leading_dimension, LeadingDimension) assert isinstance(MM.block_dim, Dim3) @@ -185,26 +174,26 @@ def test_suggested_leading_dimension(): def test_valid_finalize(): - BO = BlasOptions( + BO = Matmul( size=(16, 8, 16), data_type="real", precision=np.float32, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM75, + sm=SM75.cc, execution="Block", ) - assert isinstance(BO, BlasOptions) + assert isinstance(BO, Matmul) valids = BO.valid("block_dim") count = 0 for (block_dim,) in valids: count += 1 - MM = BO.create(block_dim=block_dim, code_type=SM80) - assert isinstance(MM, BlasCompiled) + MM = BO.create(block_dim=block_dim) + assert isinstance(MM, Matmul) assert MM.block_dim == block_dim assert MM.size == (16, 8, 16) - assert MM.code_type == SM80 + assert MM.sm == SM75.cc assert count > 0 @@ -302,7 +291,7 @@ def test_unsupported_sm(): data_type="real", arrangement=("col_major", "col_major", "col_major"), precision=np.float32, - code_type=code_type, + sm=code_type.cc, execution="Block", ) @@ -332,11 +321,11 @@ def test_sm_type(code_type): ("real", np.float64, np.float64), ("real", (np.float16, np.float16, np.float16), np.float16), ("real", (np.float32, np.float32, np.float32), np.float32), - ("complex", np.float16, np.dtype([("x", np.float16), ("y", np.float16)], align=True)), - ("complex", np.float32, np.complex64), - ("complex", np.float64, np.complex128), - ("complex", (np.float32, np.float32, np.float32), np.complex64), - ("complex", (np.float64, np.float64, np.float64), np.complex128), + ("complex", np.float16, complex32), + ("complex", np.float32, complex64), + ("complex", np.float64, complex128), + ("complex", (np.float32, np.float32, np.float32), complex64), + ("complex", (np.float64, np.float64, np.float64), complex128), ], ) def test_value_type(data_type, precision, value_type): @@ -359,11 +348,11 @@ def test_value_type(data_type, precision, value_type): [ ("real", (np.float16, np.float16, np.float32), (np.float16, np.float16, np.float32)), ("real", (np.float32, np.float32, np.float64), (np.float32, np.float32, np.float64)), - ("complex", (np.float32, np.float32, np.float64), (np.complex64, np.complex64, np.complex128)), + ("complex", (np.float32, np.float32, np.float64), (complex64, complex64, complex128)), ( "complex", (np.float64, np.float64, np.float16), - (np.complex128, np.complex128, np.dtype([("x", np.float16), ("y", np.float16)], align=True)), + (complex128, complex128, complex32), ), ], ) @@ -449,17 +438,15 @@ class TestGetSharedStorageSize(NamedTuple): @pytest.mark.parametrize( - "MM_kwargs, compiled, t, t_ab", + "MM_kwargs, t, t_ab", [ ( {"size": (1, 1, 1), "precision": np.float16}, - False, TestGetSharedStorageSize(expected_size=6), TestGetSharedStorageSize(expected_size=4), ), ( {"size": (1, 1, 1), "precision": 
np.float16, "alignment": (8, 8, 8)}, - False, TestGetSharedStorageSize(expected_size=18), TestGetSharedStorageSize(expected_size=10), ), @@ -468,7 +455,6 @@ class TestGetSharedStorageSize(NamedTuple): "size": (1, 1, 1), "precision": (np.float16, np.float64, np.float16), }, - False, TestGetSharedStorageSize(expected_size=18), TestGetSharedStorageSize(expected_size=16), ), @@ -478,26 +464,22 @@ class TestGetSharedStorageSize(NamedTuple): "precision": (np.float16, np.float64, np.float16), "alignment": (8, 8, 8), }, - False, TestGetSharedStorageSize(expected_size=18), TestGetSharedStorageSize(expected_size=16), ), ( {"size": (4, 4, 4), "precision": np.float16}, - False, TestGetSharedStorageSize(expected_size=96), TestGetSharedStorageSize(expected_size=64), ), ( {"size": (4, 4, 4), "precision": np.float16, "alignment": (8, 8, 8)}, - False, TestGetSharedStorageSize(expected_size=96), TestGetSharedStorageSize(expected_size=64), ), # Test wrong number of arguments ( {"size": (1, 2, 3), "precision": np.float16, "alignment": (2, 4, 8)}, - False, TestGetSharedStorageSize( args=(1, 2), expected_error=r"get_shared_storage_size\(\) takes either 0 or " @@ -511,7 +493,6 @@ class TestGetSharedStorageSize(NamedTuple): ), ( {"size": (1, 2, 3), "precision": np.float16, "alignment": (2, 4, 8)}, - False, TestGetSharedStorageSize( args=(1, 2, 3, 4), expected_error=r"get_shared_storage_size\(\) takes either 0 or " @@ -526,7 +507,6 @@ class TestGetSharedStorageSize(NamedTuple): # Test wrong types of arguments ( {"size": (1, 2, 3), "precision": np.float16, "alignment": (2, 4, 8)}, - False, TestGetSharedStorageSize( args=(1, 2, "3"), expected_error=r"get_shared_storage_size\(\) takes either 0 or " @@ -543,10 +523,7 @@ class TestGetSharedStorageSize(NamedTuple): "size": (1, 2, 3), "precision": np.float16, "alignment": (2, 4, 8), - "execute_api": "tensors", - "tensor_types": ("smem_a", "smem_b", "smem_c"), }, - True, TestGetSharedStorageSize( args=(1, 2, lambda MM: MM.get_layout_smem_c()), # wrong type expected_error=r"get_shared_storage_size\(\) takes either 0 or " @@ -563,10 +540,7 @@ class TestGetSharedStorageSize(NamedTuple): "size": (1, 2, 3), "precision": np.float16, "alignment": (2, 4, 8), - "execute_api": "tensors", - "tensor_types": ("smem_a", "smem_b", "smem_c"), }, - True, TestGetSharedStorageSize( args=( lambda MM: MM.get_layout_smem_a(), @@ -592,7 +566,6 @@ class TestGetSharedStorageSize(NamedTuple): "precision": np.float16, "alignment": (2, 4, 8), }, - True, TestGetSharedStorageSize( args=(5, 5, 5), expected_size=76, @@ -607,10 +580,7 @@ class TestGetSharedStorageSize(NamedTuple): "size": (1, 2, 3), # matrix sizes 3, 6, 2 "precision": np.float16, "alignment": (2, 4, 8), - "execute_api": "tensors", - "tensor_types": ("smem_a", "smem_b", "smem_c"), }, - True, TestGetSharedStorageSize( args=( lambda MM: MM.get_layout_smem_a(), @@ -631,20 +601,18 @@ class TestGetSharedStorageSize(NamedTuple): ) def test_cublasdx_get_shared_storage_size_args( MM_kwargs: dict, - compiled: bool, t: TestGetSharedStorageSize, t_ab: TestGetSharedStorageSize, ): - if compiled: - skip_nvbug_5218000(MM_kwargs["precision"], size=MM_kwargs["size"]) + ct = SM80 # CodeType object MM_kwargs |= { "data_type": "real", "arrangement": ("col_major", "col_major", "col_major"), "block_size": 128, - "code_type": get_default_code_type(), + "sm": ct.cc, } - MM = BlasCompiled(**MM_kwargs) if compiled else BlasOptionsComplete(**MM_kwargs) + MM = Matmul(**MM_kwargs) # remove callables from args args = tuple(a(MM) if callable(a) else a for a in t.args) 
args_ab = tuple(a(MM) if callable(a) else a for a in t_ab.args) @@ -681,41 +649,39 @@ def test_static_block_dim(): MM2 = matmul_base() MM3 = matmul_base(static_block_dim=True) - # if input is the same handle will be cached and the same - assert MM1._handle == MM2._handle - # here since input is different, handle is different - assert MM1._handle != MM3._handle + assert MM1.static_block_dim == MM2.static_block_dim + assert MM1.static_block_dim != MM3.static_block_dim @pytest.mark.parametrize( - "dtype, alignment, expected, complete, expected_error", + "dtype, alignment, expected, expected_error", [ - ("real", (8, 8, 8), Alignment(8, 8, 8), False, None), - ("real", [8, 8, 8], Alignment(8, 8, 8), False, None), - ("real", Alignment(8, 8, 8), Alignment(8, 8, 8), False, None), - ("real", (4, 8, 16), Alignment(4, 8, 16), False, None), - ("real", (4, 8, 16), Alignment(4, 8, 16), True, None), - ("real", MAX_ALIGNMENT, Alignment(16, 16, 16), False, None), - ("real", (8, 2, 8), None, False, "alignment.b must be a multiple of input value type 4. Got 2"), - ("real", (8, 8, 4), None, False, "alignment.c must be a multiple of input value type 8. Got 4"), - ("real", (32, 8, 8), None, False, "alignment.a must be less than maximum alignment 16. Got 32"), - ("real", (-1, 8, 8), None, False, "alignment.a must be > 0. Got -1"), - ("real", (8, 0, 8), None, False, "alignment.b must be > 0. Got 0"), - ("real", None, None, False, None), - ("real", None, Alignment(2, 4, 8), True, None), - ("real", (4, 8, 16), Alignment(4, 8, 16), True, None), - ("complex", (4, 8, 16), Alignment(4, 8, 16), False, None), - ("complex", (4, 8, 16), Alignment(4, 8, 16), True, None), - ("complex", (8, 8, 8), None, False, "alignment.c must be a multiple of input value type 16. Got 8"), + ("real", (8, 8, 8), Alignment(8, 8, 8), None), + ("real", [8, 8, 8], Alignment(8, 8, 8), None), + ("real", Alignment(8, 8, 8), Alignment(8, 8, 8), None), + ("real", (4, 8, 16), Alignment(4, 8, 16), None), + ("real", (4, 8, 16), Alignment(4, 8, 16), None), + ("real", MAX_ALIGNMENT, Alignment(16, 16, 16), None), + ("real", (8, 2, 8), None, "alignment.b must be a multiple of input value type 4. Got 2"), + ("real", (8, 8, 4), None, "alignment.c must be a multiple of input value type 8. Got 4"), + ("real", (32, 8, 8), None, "alignment.a must be less than maximum alignment 16. Got 32"), + ("real", (-1, 8, 8), None, "alignment.a must be > 0. Got -1"), + ("real", (8, 0, 8), None, "alignment.b must be > 0. Got 0"), + ("real", None, Alignment(a=2, b=4, c=8), None), + ("real", None, Alignment(2, 4, 8), None), + ("real", (4, 8, 16), Alignment(4, 8, 16), None), + ("complex", (4, 8, 16), Alignment(4, 8, 16), None), + ("complex", (4, 8, 16), Alignment(4, 8, 16), None), + ("complex", (8, 8, 8), None, "alignment.c must be a multiple of input value type 16. 
Got 8"), ], ) -def test_alignment(dtype, alignment, expected, complete, expected_error): +def test_alignment(dtype, alignment, expected, expected_error): matmul = functools.partial( - BlasOptionsComplete if complete else BlasOptions, + Matmul, size=(64, 64, 64), precision=(np.float16, np.float32, np.float64), data_type=dtype, - code_type=get_default_code_type(), + sm=SM80.cc, arrangement=("col_major", "col_major", "col_major"), alignment=alignment, ) @@ -740,14 +706,14 @@ def test_alignment(dtype, alignment, expected, complete, expected_error): ("block_size", "suggested", "block_size_suggested"), ], ) -def test_blas_options_parameter_validation(param_name, param_value, special_case): - """Test BlasOptions parameter validation""" +def test_matmul_parameter_validation(param_name, param_value, special_case): + """Test Matmul parameter validation""" base_kwargs = { "size": (16, 8, 16), "data_type": "real", "precision": np.float32, "transpose_mode": TransposeMode("non_transposed", "transposed"), - "code_type": SM75, + "sm": SM75.cc, "execution": "Block", } @@ -755,13 +721,13 @@ def test_blas_options_parameter_validation(param_name, param_value, special_case base_kwargs["block_size"] = param_value base_kwargs["block_dim"] = (64, 1, 1) with pytest.raises(ValueError): - BlasOptions(**base_kwargs) + Matmul(**base_kwargs) elif special_case == "block_size_suggested": base_kwargs["block_size"] = param_value - BO = BlasOptions(**base_kwargs) + BO = Matmul(**base_kwargs) assert isinstance(BO.block_dim, Dim3) assert BO.block_dim[0] * BO.block_dim[1] * BO.block_dim[2] >= 1 else: base_kwargs[param_name] = param_value with pytest.raises(ValueError): - BlasOptions(**base_kwargs) + Matmul(**base_kwargs) diff --git a/tests/nvmath_tests/device/test_cublasdx_numba.py b/tests/nvmath_tests/device/test_cublasdx_numba.py index 5029f2b..b1058d4 100644 --- a/tests/nvmath_tests/device/test_cublasdx_numba.py +++ b/tests/nvmath_tests/device/test_cublasdx_numba.py @@ -10,6 +10,7 @@ from nvmath.device.cublasdx_backend import Arrangement, Precision from .helpers import ( _TOLERANCE, + SM80, random_real, random_complex, random_int, @@ -19,9 +20,8 @@ time_this, ) import time -from nvmath.device import current_device_lto, matmul, float16x2_type, float32x2_type, float64x2_type, Dim3 -from nvmath.device import TransposeMode, BlasOptions -from nvmath.device.cublasdx import BlasCompiled, BlasNumba +from nvmath.device import matmul, float16x2_type, float32x2_type, float64x2_type, Dim3 +from nvmath.device import TransposeMode, Matmul import pytest @@ -322,10 +322,7 @@ def test_matmul(shape, block_size, block_dim, data_type, trans, arrangement, pre elif block_dim is not None: assert MM.block_dim == Dim3(*block_dim) assert MM.max_threads_per_block <= 1024 - assert MM.code_type.kind == "lto" - - assert MM.code_type.cc.major == SM[0] - assert MM.code_type.cc.minor == SM[1] + assert MM.sm == SM a_size = MM.a_size b_size = MM.b_size @@ -474,21 +471,21 @@ def f(a_global, b_global, c_global): def test_valid(): - base_MM = BlasOptions( + base_MM = Matmul( size=(8, 4, 16), data_type="real", precision=np.float32, transpose_mode=TransposeMode("transposed", "non_transposed"), execution="Block", - code_type=current_device_lto(), + sm=SM80.cc, ) count = 0 for (bd,) in base_MM.valid("block_dim"): MM0 = base_MM.create(block_dim=bd, compiler="numba") - assert isinstance(MM0, BlasNumba) + assert isinstance(MM0, Matmul) MM1 = base_MM.create(block_dim=bd, compiler="numba") - assert isinstance(MM1, BlasCompiled) + assert isinstance(MM1, Matmul) count += 
1
 
     assert count > 0
@@ -504,7 +501,6 @@ def test_valid():
     ],
 )
 def test_opaque_tensor(tensor_types):
-    print(tensor_types)
     m, n, k = 4, 2, 8
     block_size = 64
     precision = Precision(np.float32, np.float32, np.float64)
@@ -522,9 +518,6 @@ def test_opaque_tensor(tensor_types):
         execute_api="tensors",
     )
 
-    print("uids: ", MM._gmem_tensor_uids)
-    print("tids:", MM._target_tensor_uids)
-
     is_suggested_a = "suggested" in tensor_types[0]
     is_suggested_b = "suggested" in tensor_types[1]
     is_suggested_c = "suggested" in tensor_types[2]
@@ -606,3 +599,81 @@ def f(alpha, a, b, beta, c, output):
 
     error = np.linalg.norm(data_test - data_ref) / np.linalg.norm(data_ref)
     assert error < 1e-2
+
+
+def test_make_fragment_like_C():
+    from nvmath.bindings import mathdx
+
+    if mathdx.get_version_ex() < (0, 3, 0):
+        pytest.skip("Partition is supported on libmathdx 0.3.0+")
+    MM = matmul(
+        size=(2, 2, 2),
+        data_type="real",
+        precision=np.float32,
+        arrangement=("col_major", "col_major", "col_major"),
+        execution="Block",
+        execute_api="tensors",
+        compiler="numba",
+        tensor_types=("suggested_smem_a", "suggested_smem_b", "suggested_rmem_c"),
+    )
+
+    c_size = MM.suggest_layout_rmem_c().size
+    assert c_size == 1
+
+    @cuda.jit(link=MM.files)
+    def kernel(c):
+        gmem_c = make_tensor(c, MM.get_layout_gmem_c())
+        partitioner = MM.suggest_partitioner()
+        c_frag = partitioner.partition_like_C(gmem_c)
+
+        if partitioner.is_thread_active():
+            for i in range(c_size):
+                if (not partitioner.is_predicated()) or partitioner.is_index_in_bounds(i):
+                    c_frag[i] = c_frag[i] * 2
+
+    a = np.arange(4, dtype=np.float32).reshape((2, 2))
+    kernel[1, MM.block_dim](a)
+    expected = np.arange(4, dtype=np.float32).reshape((2, 2)) * 2
+    assert np.allclose(a, expected)
+
+
+def test_lto_symbol_duplicate():
+    """
+    Test that two different MM(...) function overloads point to the same LTO
+    symbol without causing a duplicate symbol error at link time.
+
+    Two local arrays have different types (ndim is different), which triggers
+    overload resolution twice in Numba.
+ """ + alpha, beta = 1.1, 1.2 + m, n, k = 4, 2, 8 + block_size = 64 + precision = np.float32 + + MM = Matmul( + size=(m, n, k), + precision=precision, + data_type="real", + arrangement=("col_major", "row_major", "row_major"), + execution="Block", + block_size=block_size, + ) + + @cuda.jit + def f(a, b, c): + shared_a1 = cuda.shared.array(shape=(MM.a_size,), dtype=MM.a_value_type) + shared_a1[0] = a[0, 0] + shared_a2 = cuda.shared.array(shape=MM.a_dim, dtype=MM.a_value_type) + shared_a2[0, 0] = a[0, 0] + cuda.syncthreads() + MM.execute(alpha, shared_a1, b, beta, c) + MM.execute(alpha, shared_a2, b, beta, c) + + a = np.ones(shape=MM.a_dim, dtype=MM.a_value_type) + b = np.ones(shape=MM.b_dim, dtype=MM.b_value_type) + c = np.ones(shape=MM.c_dim, dtype=MM.c_value_type) + a_d = cuda.to_device(a) + b_d = cuda.to_device(b) + c_d = cuda.to_device(c) + + f[1, MM.block_size](a_d, b_d, c_d) diff --git a/tests/nvmath_tests/device/test_cufftdx_generic.py b/tests/nvmath_tests/device/test_cufftdx_generic.py index 1d0e0ea..00b5f2d 100644 --- a/tests/nvmath_tests/device/test_cufftdx_generic.py +++ b/tests/nvmath_tests/device/test_cufftdx_generic.py @@ -3,8 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import functools -from nvmath.device import fft, CodeType, ComputeCapability, FFTOptions -from nvmath.device.cufftdx import FFTCompiled +from nvmath.device import fft, Code, CodeType, ComputeCapability, FFT +from nvmath.device.cufftdx import compile_fft_execute +from nvmath.device.types import half4, complex64, complex128 import pytest import numpy as np from .helpers import ( @@ -20,29 +21,24 @@ SM86, SM89, SM90, - AssertFilesClosed, skip_unsupported_sm, ) -def test_files_closed(): - with AssertFilesClosed(): - _ = fft(fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block") - - @pytest.mark.parametrize("execute_api", ["shared_memory", "register_memory"]) def test_third_party_block_symbol(execute_api): - FFT = fft( + fft = FFT( fft_type="c2c", size=256, precision=np.float32, direction="forward", - code_type=SM90, + sm=SM90.cc, execution="Block", - execute_api=execute_api, ) - assert len(FFT.symbol) > 0 + _, symbol = compile_fft_execute(fft, code_type=SM90, execute_api=execute_api) + + assert len(symbol) > 0 @pytest.mark.parametrize( @@ -50,68 +46,69 @@ def test_third_party_block_symbol(execute_api): ["Block", "Thread"], ) def test_third_party_symbol(execution): - FFT = fft(fft_type="c2c", size=16, precision=np.float32, direction="forward", code_type=SM90, execution=execution) + fft = FFT(fft_type="c2c", size=16, precision=np.float32, direction="forward", sm=SM90.cc, execution=execution) + _, symbol = compile_fft_execute(fft, code_type=SM90) - assert len(FFT.symbol) > 0 + assert len(symbol) > 0 def test_third_party_code(): - FFT = fft(fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block") - - assert isinstance(FFT, FFTCompiled) - assert all(code.endswith(".ltoir") for code in FFT.files) - assert len(FFT.codes) > 0 - for code in FFT.codes: - print(code.code_type, code.isa_version) - assert all(code.code_type.kind == "lto" for code in FFT.codes) - assert all(code.isa_version.major >= 12 for code in FFT.codes) - assert all(code.isa_version.minor >= 0 for code in FFT.codes) - assert all(code.code_type.cc.major == 8 for code in FFT.codes) - assert all(code.code_type.cc.minor == 0 for code in FFT.codes) - assert all(isinstance(code.data, bytes) for code in FFT.codes) - assert all(len(code.data) > 0 for code in FFT.codes) + 
fft = FFT(fft_type="c2c", size=32, precision=np.float32, direction="forward", sm=SM80.cc, execution="Block") + code, _ = compile_fft_execute(fft, code_type=SM80) + + assert isinstance(code, Code) + assert code.code_type.kind == "lto" + assert code.isa_version.major >= 12 + assert code.isa_version.minor >= 0 + assert code.code_type.cc.major == 8 + assert code.code_type.cc.minor == 0 + assert isinstance(code.data, bytes) + assert len(code.data) > 0 # 2 | 2, 2^2, ... | 2, 2^2, ... | 2, 2^2, ... # noqa: W505 @pytest.mark.parametrize("size, mincount", [(2, 1), (16, 4), (128, 4), (2048, 4)]) def test_knobs_c2c_ept_fpb(size, mincount): - FO = FFTOptions(fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block") - valid = FO.valid("elements_per_thread", "ffts_per_block") + FO = functools.partial( + FFT, fft_type="c2c", size=size, precision=np.float32, direction="forward", sm=SM80.cc, execution="Block" + ) + valid = FO().valid("elements_per_thread", "ffts_per_block") assert len(list(valid)) >= mincount for ept, fpb in valid: print("ept, fpb = ", ept, fpb) - FFT = FO.create(elements_per_thread=ept, ffts_per_block=fpb) - assert isinstance(FFT, FFTCompiled) - assert FFT.elements_per_thread == ept - assert FFT.ffts_per_block == fpb - assert len(FFT.files) > 0 + fft = FO(elements_per_thread=ept, ffts_per_block=fpb) + assert isinstance(fft, FFT) + assert fft.elements_per_thread == ept + assert fft.ffts_per_block == fpb # 3, 3^2 | 11 |2, 2^2, 2^3, ... @pytest.mark.parametrize("size, mincount", [(9, 2), (121, 1), (2048, 4)]) def test_knobs_c2c_ept_only(size, mincount): - FO = FFTOptions(fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block") - valid = FO.valid("elements_per_thread") + FO = functools.partial( + FFT, fft_type="c2c", size=size, precision=np.float32, direction="forward", sm=SM80.cc, execution="Block" + ) + valid = FO().valid("elements_per_thread") assert len(list(valid)) >= mincount for (ept,) in valid: print("ept = ", ept) - FFT = FO.create(elements_per_thread=ept) - assert isinstance(FFT, FFTCompiled) - assert FFT.elements_per_thread == ept - assert len(FFT.files) > 0 + fft = FO(elements_per_thread=ept) + assert isinstance(fft, FFT) + assert fft.elements_per_thread == ept @pytest.mark.parametrize("size, mincount", [(7, 1), (36, 1), (2048, 1)]) def test_knobs_c2c_fpb_only(size, mincount): - FO = FFTOptions(fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block") - valid = FO.valid("ffts_per_block") + FO = functools.partial( + FFT, fft_type="c2c", size=size, precision=np.float32, direction="forward", sm=SM80.cc, execution="Block" + ) + valid = FO().valid("ffts_per_block") assert len(list(valid)) >= mincount for (fpb,) in valid: print("fpb = ", fpb) - FFT = FO.create(ffts_per_block=fpb) - assert isinstance(FFT, FFTCompiled) - assert FFT.ffts_per_block == fpb - assert len(FFT.files) > 0 + fft = FO(ffts_per_block=fpb) + assert isinstance(fft, FFT) + assert fft.ffts_per_block == fpb @pytest.mark.parametrize( @@ -132,99 +129,99 @@ def test_knobs_c2c_fpb_only(size, mincount): ], ) def test_knobs_r2c_c2r(fft_type, complex_layout, real_mode): - FO = FFTOptions( + FO = functools.partial( + FFT, fft_type=fft_type, size=512, precision=np.float32, - code_type=SM80, + sm=SM80.cc, execution="Block", real_fft_options={"complex_layout": complex_layout, "real_mode": real_mode}, ) - valid = FO.valid("elements_per_thread", "ffts_per_block") + valid = 
FO().valid("elements_per_thread", "ffts_per_block") assert len(valid) > 2 for ept, fpb in valid: print("ept, fpb = ", ept, fpb) - FFT = FO.create(elements_per_thread=ept, ffts_per_block=fpb) - assert isinstance(FFT, FFTCompiled) - assert FFT.elements_per_thread == ept - assert FFT.ffts_per_block == fpb - assert len(FFT.files) > 0 + fft = FO(elements_per_thread=ept, ffts_per_block=fpb) + assert isinstance(fft, FFT) + assert fft.elements_per_thread == ept + assert fft.ffts_per_block == fpb # Max EPT is usually 32, but folded allows for EPT=64 if real_mode == "folded": assert 64 in [e for (e, _) in valid] def test_knobs_0(): - FO = FFTOptions(fft_type="c2c", size=4, precision=np.float32, direction="forward", code_type=SM80, execution="Block") - val = FO.valid("elements_per_thread", "ffts_per_block") + fft = FFT(fft_type="c2c", size=4, precision=np.float32, direction="forward", sm=SM80.cc, execution="Block") + val = fft.valid("elements_per_thread", "ffts_per_block") print(val) def test_knobs_1(): - FO = FFTOptions( + fft = FFT( fft_type="c2c", size=4, precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", ffts_per_block="suggested", elements_per_thread="suggested", ) - assert FO.ffts_per_block is not None - assert FO.elements_per_thread is not None - assert FO.ffts_per_block > 1 - assert isinstance(FO, FFTOptions) - ffts_per_block = FO.ffts_per_block + assert fft.ffts_per_block is not None + assert fft.elements_per_thread is not None + assert fft.ffts_per_block > 1 + assert isinstance(fft, FFT) + ffts_per_block = fft.ffts_per_block - FFT = fft( + fft = FFT( fft_type="c2c", size=4, precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", ffts_per_block=ffts_per_block, ) - assert isinstance(FFT, FFTCompiled) - assert FFT.ffts_per_block == ffts_per_block + assert isinstance(fft, FFT) + assert fft.ffts_per_block == ffts_per_block def test_knobs_2(): - FO = FFTOptions( + FO = FFT( fft_type="c2c", size=4, precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", ffts_per_block="suggested", ) - assert FO.ffts_per_block is not None - assert FO.elements_per_thread is None - assert FO.ffts_per_block > 1 + assert FO._ffts_per_block is not None + assert FO._elements_per_thread is None + assert FO._ffts_per_block > 1 def test_knobs_3(): - FO = FFTOptions( + FO = FFT( fft_type="c2c", size=4, precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", elements_per_thread="suggested", ) - assert FO.ffts_per_block is None - assert FO.elements_per_thread is not None + assert FO._ffts_per_block is None + assert FO._elements_per_thread is not None def test_functools_partial(): - base = functools.partial(fft, size=32, precision=np.float32, code_type=SM80, execution="Block") + base = functools.partial(FFT, size=32, precision=np.float32, sm=SM80.cc, execution="Block") R2C = base(fft_type="r2c") C2R = base(fft_type="c2r") - assert isinstance(R2C, FFTCompiled) - assert isinstance(C2R, FFTCompiled) + assert isinstance(R2C, FFT) + assert isinstance(C2R, FFT) assert R2C.fft_type == "r2c" assert C2R.fft_type == "c2r" @@ -232,87 +229,88 @@ def test_functools_partial(): def test_partial_fft(): - FO = FFTOptions( + FO = FFT( fft_type="c2c", size=32, precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", ffts_per_block="suggested", ) suggested_ffts_per_block = FO.ffts_per_block - FFT = fft( + fft = FFT( fft_type="c2c", size=32, 
precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", ffts_per_block=suggested_ffts_per_block, ) - assert isinstance(FFT, FFTCompiled) - assert FFT.ffts_per_block == suggested_ffts_per_block + assert isinstance(fft, FFT) + assert fft.ffts_per_block == suggested_ffts_per_block def test_valid_knobs_0(): - FO = FFTOptions(fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block") + FO = FFT(fft_type="c2c", size=32, precision=np.float32, direction="forward", sm=SM80.cc, execution="Block") valids = FO.valid("elements_per_thread", "ffts_per_block") count = 0 for ept, bpb in valids: count += 1 - FFT = fft( + fft = FFT( fft_type="c2c", size=32, precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", elements_per_thread=ept, ffts_per_block=bpb, ) - assert isinstance(FFT, FFTCompiled) - assert FFT.elements_per_thread == ept - assert FFT.ffts_per_block == bpb + assert isinstance(fft, FFT) + assert fft.elements_per_thread == ept + assert fft.ffts_per_block == bpb assert count > 0 def test_valid_knobs_1(): - FO = FFTOptions(fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block") - valids = FO.valid("elements_per_thread", "ffts_per_block") + FO = functools.partial( + FFT, fft_type="c2c", size=32, precision=np.float32, direction="forward", sm=SM80.cc, execution="Block" + ) + valids = FO().valid("elements_per_thread", "ffts_per_block") count = 0 for ept, bpb in valids: count += 1 - FFT = FO.create(elements_per_thread=ept, ffts_per_block=bpb) - assert isinstance(FFT, FFTCompiled) - assert FFT.elements_per_thread == ept - assert FFT.ffts_per_block == bpb - assert len(FFT.files) > 0 + fft = FO(elements_per_thread=ept, ffts_per_block=bpb) + assert isinstance(fft, FFT) + assert fft.elements_per_thread == ept + assert fft.ffts_per_block == bpb assert count > 0 @pytest.mark.parametrize( - "code_type, ept, bpb", + "sm, ept, bpb", [ - (SM80, 2, 128), - (SM86, 2, 64), - (SM89, 2, 32), + (SM80.cc, 2, 128), + (SM86.cc, 2, 64), + (SM89.cc, 2, 32), ], ) -def test_valid_knob_values(code_type, ept, bpb): - FO = FFTOptions( +def test_valid_knob_values(sm, ept, bpb): + fft = FFT( fft_type="c2c", size=2, precision=np.float32, direction="forward", - code_type=code_type, + sm=sm, execution="Block", ) - valids = FO.valid("elements_per_thread", "ffts_per_block") + valids = fft.valid("elements_per_thread", "ffts_per_block") assert len(valids) == 1 assert valids[0] == (ept, bpb) @@ -328,16 +326,16 @@ def test_valid_knob_values(code_type, ept, bpb): ], ) def test_invalid_knob_values(knobs): - FO = FFTOptions( + fft = FFT( fft_type="c2c", size=2, precision=np.float32, direction="forward", - code_type=SM80, + sm=SM80.cc, execution="Block", ) with pytest.raises(ValueError, match="Unsupported knob"): - FO.valid(*knobs) + fft.valid(*knobs) @pytest.mark.parametrize( @@ -353,13 +351,6 @@ def test_invalid_knob_values(knobs): ("direction", None), ("direction", "both"), ("direction", "INVERSE"), - ("code_type", None), - ("code_type", CodeType("lto", ComputeCapability(-1, 0))), - ("code_type", CodeType("lto", ComputeCapability(5, 0))), - ("code_type", CodeType("sass", ComputeCapability(7, 0))), - ("code_type", CodeType("ptx", ComputeCapability(7, 0))), - ("code_type", CodeType("lto", ComputeCapability(1000, 0))), # invalid cc > supported Max cc - ("code_type", ("lto", "lto", ComputeCapability(10, 0))), # len(code_type) != 2 ("execution", None), ("execution", "CGA"), 
("ffts_per_block", -1), @@ -384,46 +375,69 @@ def test_negative(opt, value): else: opts[opt] = value with pytest.raises(Exception): - FFT = fft(**opts) # noqa: F841 + FFT = fft(**opts) + # trigger compilation + value_type = FFT.value_type # noqa: F841 + + +@pytest.mark.parametrize( + "opt, value", + [ + ("code_type", None), + ("code_type", CodeType("lto", ComputeCapability(-1, 0))), + ("code_type", CodeType("lto", ComputeCapability(5, 0))), + ("code_type", CodeType("sass", ComputeCapability(7, 0))), + ("code_type", CodeType("ptx", ComputeCapability(7, 0))), + ("code_type", CodeType("lto", ComputeCapability(1000, 0))), # invalid cc > supported Max cc + ("code_type", ("lto", "lto", ComputeCapability(10, 0))), # len(code_type) != 2 + ], +) +def test_negative_compile(opt, value): + fft = FFT(fft_type="c2c", size=256, precision=np.float32, direction="forward", execution="Block") + + with pytest.raises(Exception): + compile_fft_execute(fft, code_type=value) @pytest.mark.parametrize("code_type", [SM70, SM72, SM75, SM80, SM86, SM89, SM90, SM100, SM101, SM103, SM120, SM121]) def test_sm(code_type): skip_unsupported_sm(code_type) - FFT = fft(fft_type="c2c", size=256, precision=np.float32, direction="forward", code_type=code_type, execution="Block") - assert all(isinstance(code.data, bytes) for code in FFT.codes) - assert all(len(code.data) > 0 for code in FFT.codes) + fft = FFT(fft_type="c2c", size=256, precision=np.float32, direction="forward", execution="Block") + code, symbol = compile_fft_execute(fft, code_type=code_type) + + assert isinstance(code.data, bytes) + assert len(code.data) > 0 + assert len(symbol) > 0 @pytest.mark.parametrize( "precision,value_type", [ - ( - np.float16, - np.dtype([("x", np.float16), ("y", np.float16), ("z", np.float16), ("w", np.float16)], align=True), - ), # ~ complex<__half2> - (np.float32, np.complex64), # complex - (np.float64, np.complex128), # complex + (np.float16, half4), # ~ complex<__half2> + (np.float32, complex64), # complex + (np.float64, complex128), # complex ], ) def test_value_type(precision, value_type): for fft_type in ["c2r", "r2c", "c2c"]: - FFT = fft( + fft = FFT( fft_type=fft_type, size=256, precision=precision, direction="forward" if fft_type == "c2c" else None, - code_type=SM90, + sm=SM90.cc, execution="Block", ) - assert FFT.value_type == value_type + assert fft.value_type == value_type -@pytest.mark.parametrize("code_type", [("lto", (7, 0)), ("lto", (8, 0))]) +@pytest.mark.parametrize("code_type", [("lto", (7, 5)), ("lto", (8, 0))]) def test_sm_tuple(code_type): - FFT = fft(fft_type="c2c", size=256, precision=np.float32, direction="forward", code_type=code_type, execution="Block") - assert all(isinstance(code.data, bytes) for code in FFT.codes) - assert all(len(code.data) > 0 for code in FFT.codes) - assert all(code.code_type.kind == code_type[0] for code in FFT.codes) - assert all(code.code_type.cc.major == code_type[1][0] for code in FFT.codes) - assert all(code.code_type.cc.minor == code_type[1][1] for code in FFT.codes) + fft = FFT(fft_type="c2c", size=256, precision=np.float32, direction="forward", execution="Block") + code, symbol = compile_fft_execute(fft, code_type=code_type) + assert isinstance(code.data, bytes) + assert len(code.data) > 0 + assert len(symbol) > 0 + assert code.code_type.kind == code_type[0] + assert code.code_type.cc.major == code_type[1][0] + assert code.code_type.cc.minor == code_type[1][1] diff --git a/tests/nvmath_tests/device/test_cufftdx_numba.py b/tests/nvmath_tests/device/test_cufftdx_numba.py index 
a5a2feb..808577e 100644
--- a/tests/nvmath_tests/device/test_cufftdx_numba.py
+++ b/tests/nvmath_tests/device/test_cufftdx_numba.py
@@ -6,9 +6,10 @@
 from numba import cuda
 import pytest
 
-from nvmath.device import FFTOptions
-from nvmath.device import current_device_lto, fft, float16x4, float16x2, float64x2_type, float32x2_type, float16x4_type
-from nvmath.device.cufftdx import FFTCompiled, FFTNumba
+from nvmath.device import FFT
+from nvmath.device import fft, float16x4, float16x2
+from nvmath.device.types import half4, complex64, complex128
+from nvmath.device.common_cuda import current_device_sm
 from .helpers import _TOLERANCE, random_complex, random_real, show_FFT_traits, complex64_to_fp16x2, fp16x2_to_complex64
 
 np.random.seed(314 + 271)
@@ -90,7 +91,7 @@ def convert_output(fft_type, precision, output_d):
     return output_test
 
 
-COMPLEX_TYPE_MAP = {np.float16: float16x4_type, np.float32: float32x2_type, np.float64: float64x2_type}
+COMPLEX_TYPE_MAP = {np.float16: half4, np.float32: complex64, np.float64: complex128}
 
 IMPLICIT_BATCHING_MAP = {
     np.float16: 2,
@@ -516,21 +517,53 @@ def f(input, output):
 
 
 def test_valid():
-    base_FFT = FFTOptions(
+    base_FFT = FFT(
         fft_type="c2c",
         size=2,
         precision=np.float32,
         direction="forward",
         execution="Block",
-        code_type=current_device_lto(),
+        sm=current_device_sm(),
     )
 
     count = 0
     for ept, fpb in base_FFT.valid("elements_per_thread", "ffts_per_block"):
-        FFT0 = base_FFT.create(elements_per_thread=ept, ffts_per_block=fpb, compiler="numba")
-        assert isinstance(FFT0, FFTNumba)
-        FFT1 = base_FFT.create(elements_per_thread=ept, ffts_per_block=fpb)
-        assert isinstance(FFT1, FFTCompiled)
+        FFT0 = base_FFT.create(elements_per_thread=ept, ffts_per_block=fpb)
+        assert isinstance(FFT0, FFT)
         count += 1
 
     assert count > 0
+
+
+def test_lto_symbol_duplicate():
+    """
+    Test that two different FFT(...) function overloads point to the same LTO
+    symbol without causing a duplicate symbol error at link time.
+
+    Two local arrays have different types (ndim is different), which triggers
+    overload resolution twice in Numba.
+ """ + threads_count = 4 + FFT = fft( + fft_type="c2c", + size=8, + precision=np.float32, + direction="forward", + execution="Thread", + ) + + @cuda.jit + def f(data): + thread_data = cuda.local.array(shape=(FFT.storage_size,), dtype=FFT.value_type) + thread_data[0] = data[0, 0] + FFT(thread_data) + data[0, 0] = thread_data[0] + + thread_data2 = cuda.local.array(shape=(FFT.storage_size, 1), dtype=FFT.value_type) + thread_data2[0, 0] = data[0, 0] + FFT(thread_data2) + data[0, 0] = thread_data2[0, 0] + + data = np.ones((threads_count, FFT.size), dtype=FFT.value_type) + data_d = cuda.to_device(data) + f[1, threads_count](data_d) diff --git a/tests/nvmath_tests/device/test_cufftdx_numba_perf.py b/tests/nvmath_tests/device/test_cufftdx_numba_perf.py index a34e63d..63408bb 100644 --- a/tests/nvmath_tests/device/test_cufftdx_numba_perf.py +++ b/tests/nvmath_tests/device/test_cufftdx_numba_perf.py @@ -4,7 +4,7 @@ import numpy as np -from nvmath.device import CodeType, FFTOptions +from nvmath.device import FFT from .helpers import smallest_multiple, time_check_cupy, set_device, random_complex from ..helpers import fft_conv_perf_GFlops, print_aligned_table import cupy @@ -59,7 +59,7 @@ def run_conv_perf(test_cases): for size, precision in test_cases: # Figure out EPT/BPB - BASE = FFTOptions( + BASE = FFT( fft_type="c2c", size=size, precision=precision, @@ -67,7 +67,7 @@ def run_conv_perf(test_cases): elements_per_thread="suggested", ffts_per_block="suggested", execution="Block", - code_type=CodeType("lto", (SM[0], SM[1])), + sm=SM, ) ffts_per_block = BASE.ffts_per_block diff --git a/tests/nvmath_tests/device/test_vector_types_numba.py b/tests/nvmath_tests/device/test_vector_types_numba.py index 2365c16..6cee19b 100644 --- a/tests/nvmath_tests/device/test_vector_types_numba.py +++ b/tests/nvmath_tests/device/test_vector_types_numba.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import inspect import numpy as np from numba import cuda, types @@ -14,6 +15,13 @@ float32x2_type, float16x2_type, float16x4_type, + complex32, + complex64, + complex128, + half2, + half4, + np_float16x2, + np_float16x4, ) import pytest @@ -189,4 +197,82 @@ def f_non_vectorized(input, output): assert "st.global.u64" not in ptx assert "ld.global.u16" in ptx - assert "ld.global.u16" in ptx + assert "st.global.u16" in ptx + + +@pytest.mark.parametrize( + "dtype, expected_host_dtype, expected_alignment", + [ + (complex32, np_float16x2, 4), + (complex64, np.complex64, 8), + (complex128, np.complex128, 16), + (half2, np_float16x2, 4), + (half4, np_float16x4, 8), + ], +) +def test_numba_type(dtype, expected_host_dtype, expected_alignment): + make_dtype = dtype.make + + HOST_COMPLEX = inspect.isclass(expected_host_dtype) and issubclass(expected_host_dtype, np.complexfloating) + FOUR_ARGS = expected_host_dtype == np_float16x4 + + @cuda.jit + def kernel(a): + l = cuda.local.array(shape=(1,), dtype=dtype) + if FOUR_ARGS: + l[0] = make_dtype(3.14, 2.71, -1.0, 1.0) + else: + l[0] = make_dtype(3.14, 2.71) + if HOST_COMPLEX: + a[0] = l[0] + else: + a.view(dtype)[0] = l[0] + + a = np.zeros(1, dtype=dtype) + assert a.dtype == expected_host_dtype + + kernel[1, 1](a) + + if HOST_COMPLEX: + assert a[0] == 3.14 + 2.71j + elif FOUR_ARGS: + assert np.allclose(a.view(np.float16), np.array([3.14, 2.71, -1.0, 1.0], dtype=np.float16)) + else: + assert np.allclose(a.view(np.float16), np.array([3.14, 2.71], dtype=np.float16)) + + +@pytest.mark.parametrize( + "dtype, expected_alignment", + [ + (complex32, 4), + (complex64, 8), + (complex128, 
16), + (half2, 4), + (half4, 8), + ], +) +def test_numba_type_alignment(dtype, expected_alignment): + @cuda.jit + def copy(a, b): + av = a.view(dtype) + bv = b.view(dtype) + bv[0] = av[0] + + a = np.zeros(1, dtype=dtype) + b = np.zeros(1, dtype=dtype) + + copy[1, 1](a, b) + + ptx = [v for k, v in copy.inspect_asm().items()] + assert len(ptx) == 1 + ptx = ptx[0] + + print(ptx) + + if expected_alignment < 16: + expected_ld_st_inst = f"global.u{expected_alignment * 8}" + else: + expected_ld_st_inst = f"global.v2.u{expected_alignment * 4}" + + assert "ld." + expected_ld_st_inst in ptx + assert "st." + expected_ld_st_inst in ptx diff --git a/tests/nvmath_tests/distributed/conftest.py b/tests/nvmath_tests/distributed/conftest.py index a496556..3872018 100644 --- a/tests/nvmath_tests/distributed/conftest.py +++ b/tests/nvmath_tests/distributed/conftest.py @@ -9,6 +9,23 @@ def pytest_configure(config): config.addinivalue_line("markers", "need_4_procs: The test requires 4 processes") + config.addinivalue_line("markers", "uncollect_if(*, func): function to unselect tests from parametrization") + + +def pytest_collection_modifyitems(config, items): + removed = [] + kept = [] + for item in items: + m = item.get_closest_marker("uncollect_if") + if m: + func = m.kwargs["func"] + if func(**item.callspec.params): + removed.append(item) + continue + kept.append(item) + if removed: + config.hook.pytest_deselected(items=removed) + items[:] = kept SYMMETRIC_MEMORY_LEAK_MESSAGE = "Symmetric heap memory needs to be deallocated explicitly" @@ -37,7 +54,7 @@ def check_symmetric_memory_leaks(caplog): from mpi4py import MPI comm = MPI.COMM_WORLD - error = np.array([error], dtype=np.bool) + error = np.array([error], dtype=np.bool_) comm.Allreduce(MPI.IN_PLACE, error, MPI.LOR) if error: raise MemoryError(SYMMETRIC_MEMORY_LEAK_MESSAGE) diff --git a/tests/nvmath_tests/distributed/helpers.py b/tests/nvmath_tests/distributed/helpers.py index 42586c7..7fd3c08 100644 --- a/tests/nvmath_tests/distributed/helpers.py +++ b/tests/nvmath_tests/distributed/helpers.py @@ -11,23 +11,24 @@ from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand, _TENSOR_TYPES as _DIST_TENSOR_TYPES from nvmath.distributed._internal.tensor_ifc import DistributedTensor from nvmath.internal.tensor_ifc_ndbuffer import NDBufferTensor +from nvmath.internal.typemaps import NAME_TO_DATA_WIDTH -def to_gpu(data_cpu, device_id, stream): +def to_gpu(data_cpu, device_id, stream, symmetric_memory): """ Move host tensor to GPU. For numpy tensor, we explicitly use cupy as a counterpart. """ match data_cpu.name: case "numpy": - return numpy2cupy(data_cpu, device_id, stream) + return numpy2cupy(data_cpu, device_id, stream, symmetric_memory) case "torch": - return data_cpu.to(device_id, stream, symmetric_memory=True) + return data_cpu.to(device_id, stream, symmetric_memory=symmetric_memory) case _: raise AssertionError(f"Unsupported tensor type: {data_cpu.name}") -def numpy2cupy(data_cpu, device_id, stream): +def numpy2cupy(data_cpu, device_id, stream, symmetric_memory): """ Convert numpy tensor to cupy tensor. 
While we use cupy wrapper to allocate the nvshmem-based tensor, we use cupy to copy the @@ -44,8 +45,8 @@ def numpy2cupy(data_cpu, device_id, stream): dtype=data_cpu.dtype, device_id=device_id, strides=data_cpu.strides, - make_symmetric=True, - symmetric_memory=True, + make_symmetric=symmetric_memory, + symmetric_memory=symmetric_memory, stream_holder=stream, ) with stream.ctx: @@ -125,10 +126,17 @@ def calculate_strides(shape, axis_order): return strides -def generate_random_data(package, memory_space, shape, dtype, stream, memory_layout="C"): +def generate_random_data(package, memory_space, shape, dtype, stream, memory_layout="C", symmetric_memory=True): """Generate random data of the given shape and dtype. Returns instance of data on CPU, and a copy on the specified memory_space ("cpu", "gpu") wrapped around distributed TensorHolder. + + Args: + package: numpy or torch. For numpy package with memory_space="gpu", uses cupy. + + memory_space: "cpu" or "gpu" + + dtype: numpy dtype """ if np.issubdtype(dtype, np.complexfloating): data_cpu = (np.random.rand(*shape) + 1j * np.random.rand(*shape)).astype(dtype) @@ -147,8 +155,9 @@ def generate_random_data(package, memory_space, shape, dtype, stream, memory_lay assert isinstance(data_cpu, DistributedTensor) if memory_space == "gpu": device_id = nvmath.distributed.get_context().device_id - data_gpu = to_gpu(data_cpu, device_id, stream) + data_gpu = to_gpu(data_cpu, device_id, stream, symmetric_memory) assert isinstance(data_gpu, DistributedTensor) + assert data_gpu.is_symmetric_memory == symmetric_memory return data_cpu, data_gpu else: data_cpu_copy = data_cpu.__class__.empty(shape, dtype=data_cpu.dtype, strides=data_cpu.strides) @@ -179,6 +188,10 @@ def is_close(a, b, rtol=1e-07, atol=0, allow_ndbuffer=False): import cupy as cp module = cp + if NAME_TO_DATA_WIDTH[a.dtype] == 8 and "float" in a.dtype: + a_tensor = a_tensor.to(module.float32) + if NAME_TO_DATA_WIDTH[b.dtype] == 8 and "float" in a.dtype: + b_tensor = b_tensor.to(module.float32) if device_id != "cpu": with device_ctx(device_id): return module.allclose(a_tensor, b_tensor, rtol=rtol, atol=atol) @@ -192,6 +205,7 @@ def gather_array(arr, partition_dim, comm, rank): assert isinstance(arr, DistributedTensor) assert arr.device == "cpu" + dtype_name = arr.dtype package = arr.module assert package.__name__ in ("numpy", "torch"), f"package: {package}" @@ -235,11 +249,20 @@ def transpose(a, dim0, dim1, make_contiguous=False): recv_counts = comm.gather(math.prod(arr.shape)) if rank == 0: global_arr = package.empty(global_shape, dtype=arr.dtype) - comm.Gatherv(sendbuf=arr, recvbuf=(global_arr, recv_counts), root=0) + + sendbuf = arr + recvbuf = (global_arr, recv_counts) + if NAME_TO_DATA_WIDTH[dtype_name] <= 16: + # WAR for MPI not having narrow-precision types. + sendbuf = arr.view(dtype=package.int8) + recv_counts = [x * (NAME_TO_DATA_WIDTH[dtype_name] // 8) for x in recv_counts] + recvbuf = (global_arr.view(dtype=package.int8), recv_counts) + + comm.Gatherv(sendbuf=sendbuf, recvbuf=recvbuf, root=0) if transposed: # Undo the transpose. global_arr = transpose(global_arr, 1, 0, make_contiguous=True) # Note that this is not a distributed tensor any longer. 
return wrap_operand(global_arr) else: - comm.Gatherv(arr, None) + comm.Gatherv(arr if NAME_TO_DATA_WIDTH[dtype_name] > 16 else arr.view(dtype=package.int8), None) diff --git a/tests/nvmath_tests/distributed/test_fft.py b/tests/nvmath_tests/distributed/test_fft.py index 6f86e34..64135ec 100644 --- a/tests/nvmath_tests/distributed/test_fft.py +++ b/tests/nvmath_tests/distributed/test_fft.py @@ -9,7 +9,7 @@ from nvmath.internal.utils import device_ctx, get_or_create_stream from nvmath.distributed import free_symmetric_memory from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand, maybe_register_package -from nvmath.distributed.fft._configuration import Slab +from nvmath.distributed.distribution import Slab, Box from .helpers import gather_array, generate_random_data, is_close, to_host from .helpers_fft import calc_slab_shape @@ -34,7 +34,7 @@ def nvmath_distributed(): pass device_id = MPI.COMM_WORLD.Get_rank() % cuda.core.experimental.system.num_devices - nvmath.distributed.initialize(device_id, MPI.COMM_WORLD) + nvmath.distributed.initialize(device_id, MPI.COMM_WORLD, backends=["nvshmem"]) yield @@ -95,7 +95,10 @@ def test_wrong_slab_shape(distribution, nvmath_distributed, check_symmetric_memo shape = (25, 64) if distribution == Slab.X else (64, 25) data = np.ones(shape, dtype=np.complex64) - with pytest.raises(ValueError, match=(r"The operand shape is \(\d+, \d+\), but the expected slab shape is \(\d+, \d+\)")): + with pytest.raises( + nvmath.distributed.distribution.BindDistributionError, + match=(r"The given shapes \(global_shape=\(\d+, \d+\), shape=\(\d+, \d+\)\) don't fit distribution Slab"), + ): nvmath.distributed.fft.fft(data, distribution=distribution) @@ -240,6 +243,7 @@ def generate_data_with_padding( partition_dim = 0 if distribution == Slab.X else 1 shape = calc_slab_shape(global_shape, partition_dim, rank, nranks) else: + assert isinstance(distribution[0], Box) lower, upper = distribution[0] shape = tuple(upper[i] - lower[i] for i in range(len(global_shape))) @@ -365,12 +369,12 @@ def test_distributed_fft( global_output_shape = list(global_shape) if fft_type == "C2C": - in_dtype = np.complex64 + in_dtype = np.complex64 if blocking is True else np.complex128 elif fft_type == "R2C": - in_dtype = np.float32 + in_dtype = np.float32 if blocking is True else np.float64 global_output_shape[-1] = global_output_shape[-1] // 2 + 1 elif fft_type == "C2R": - in_dtype = np.complex64 + in_dtype = np.complex64 if blocking is True else np.complex128 global_output_shape[-1] = (global_output_shape[-1] - 1) * 2 if last_axis_parity == "odd": global_output_shape[-1] += 1 @@ -412,10 +416,12 @@ def test_distributed_fft( fft_count += 1 assert data_in.module is result.module - if fft_type in ("C2C", "R2C"): - assert result.dtype == "complex64" + if fft_type == "C2C": + assert result.dtype == np.dtype(in_dtype).name + elif fft_type == "R2C": + assert result.dtype == "complex64" if np.dtype(in_dtype).name == "float32" else "complex128" else: - assert result.dtype == "float32" + assert result.dtype == "float32" if np.dtype(in_dtype).name == "complex64" else "float64" if data_in.shape == result.shape: assert data_in.tensor is result.tensor @@ -538,7 +544,7 @@ def calculate_box(dim0, dim1, shapes, global_shape, rank): upper = list(lower) for i in range(len(upper)): upper[i] += shapes[rank][i] - return (lower, upper) + return Box(lower, upper) def gather_pencils(x, dim0, dim1, shape, global_shape, comm, rank, nranks): diff --git 
a/tests/nvmath_tests/distributed/test_matmul.py b/tests/nvmath_tests/distributed/test_matmul.py new file mode 100644 index 0000000..cb51358 --- /dev/null +++ b/tests/nvmath_tests/distributed/test_matmul.py @@ -0,0 +1,1024 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import math +import numpy as np +import pytest +import random +import re +import cuda.core.experimental as ccx +from collections.abc import Sequence + +from pathlib import Path +import tempfile +import os + +import nvmath.distributed +from nvmath.internal.utils import device_ctx, get_or_create_stream +from nvmath.distributed import free_symmetric_memory +from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand, maybe_register_package +from nvmath.internal.tensor_wrapper import wrap_operand + +from .helpers import gather_array, generate_random_data, is_close, to_host + +from nvmath.internal.typemaps import NAME_TO_DATA_TYPE, NAME_TO_DATA_WIDTH + +from nvmath.distributed.linalg.advanced import matrix_qualifiers_dtype, MatmulEpilog, MatmulComputeType + +from nvmath.distributed.distribution import ProcessGrid, BlockNonCyclic, BlockCyclic, Slab, Box + +from nvmath.bindings import cublasMp + +import cuda.core.experimental + +package_name_to_package = {"numpy": np} + + +@pytest.fixture(scope="module") +def nvmath_distributed(): + """Pytest fixture that initializes nvmath.distributed and finalizes it on exit""" + from mpi4py import MPI + + try: + import cupy + + maybe_register_package("cupy") + package_name_to_package["cupy"] = cupy + except ImportError: + pass + + try: + import torch + + maybe_register_package("torch") + package_name_to_package["torch"] = torch + except ImportError: + pass + + device_id = MPI.COMM_WORLD.Get_rank() % cuda.core.experimental.system.num_devices + nvmath.distributed.initialize(device_id, MPI.COMM_WORLD, backends=["nvshmem", "nccl"]) + + yield + + nvmath.distributed.finalize() + + +@pytest.fixture(scope="module", autouse=True) +def cublasmp_logfile(): + # We're not using the cuBLASMp logging runtime APIs for now, which are considered + # experimental. When setting the log file through env vars, the log file gets fixed + # when the library is initialized and there is no way to change it per matmul + # operation. So we need to select the file at the module scope. + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = Path(temp_dir) / f"cublasmp_{rank}.log" + prev_log_level = os.environ.get("CUBLASMP_LOG_LEVEL", "") + prev_log_file = os.environ.get("CUBLASMP_LOG_FILE", "") + os.environ["CUBLASMP_LOG_LEVEL"] = "5" + os.environ["CUBLASMP_LOG_FILE"] = str(temp_file_path) + + yield temp_file_path + + os.environ["CUBLASMP_LOG_LEVEL"] = prev_log_level + os.environ["CUBLASMP_LOG_FILE"] = prev_log_file + + +@pytest.fixture(scope="function") +def cublasmp_logfile_with_cleanup(cublasmp_logfile): + """Clear the log file after every test.""" + + def truncate_log(): + try: + # Can't delete the file because libcublasmp won't reopen without restarting + # the application, so we truncate it instead to clear its contents and reuse + # it in the same session. 
+ os.truncate(cublasmp_logfile, 0) + except FileNotFoundError: + pass + + truncate_log() # in case the previous test hasn't used this fixture + + yield cublasmp_logfile + + truncate_log() + + +def test_wrong_distribution(nvmath_distributed): + distributed_ctx = nvmath.distributed.get_context() + comm = distributed_ctx.communicator + rank = comm.Get_rank() + nranks = comm.Get_size() + + valid_nranks = (2, 4, 8) + if nranks not in valid_nranks: + pytest.skip(f"This test needs nranks in {valid_nranks}") + + process_grid = ProcessGrid(shape=(1, nranks), layout=ProcessGrid.Layout.ROW_MAJOR) + + global_shape = (64, 64) + assert global_shape[1] % nranks == 0 + + # ERROR: all ranks must have the same dim 0 length + nrows = 60 if rank == 0 else global_shape[0] + ncols = global_shape[1] // nranks # partition on dim 1 + a = np.zeros((nrows, ncols), dtype=np.float32) + a = np.asfortranarray(a) + + distributions = [BlockNonCyclic(process_grid)] * 3 + with pytest.raises(ValueError, match="The problem size is inconsistent across processes"): + _ = nvmath.distributed.linalg.advanced.Matmul(a, a, distributions=distributions) + + +@pytest.mark.parametrize("symmetric_memory", [False, True]) +def test_symmetric_memory(symmetric_memory, nvmath_distributed, check_symmetric_memory_leaks): + distributed_ctx = nvmath.distributed.get_context() + comm = distributed_ctx.communicator + nranks = comm.Get_size() + device_id = distributed_ctx.device_id + + m, n, k = 64, 32, 48 + a_shape = (k // nranks, m) + b_shape = (k // nranks, n) + + import cupy as cp + + if symmetric_memory: + # allocate a and b on symmetric memory + a = nvmath.distributed.allocate_symmetric_memory(a_shape, cp, dtype=cp.float32, axis_order="F") + b = nvmath.distributed.allocate_symmetric_memory(b_shape, cp, dtype=cp.float32, axis_order="F") + else: + with device_ctx(device_id): + a = cp.asfortranarray(cp.zeros(a_shape)) + b = cp.asfortranarray(cp.zeros(b_shape)) + + with device_ctx(device_id): + a[:] = cp.random.rand(*a_shape) + b[:] = cp.random.rand(*b_shape) + stream = cp.cuda.Stream() + + distributions = [Slab.X, Slab.X, Slab.Y] + qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + qualifiers[0]["is_transpose"] = True + d = nvmath.distributed.linalg.advanced.matmul(a, b, distributions=distributions, qualifiers=qualifiers, stream=stream) + + stream.synchronize() + + d = dist_wrap_operand(d) + assert d.device_id == device_id + assert d.is_symmetric_memory == symmetric_memory + + if symmetric_memory: + nvmath.distributed.free_symmetric_memory(a, b, d.tensor) + + +@pytest.mark.parametrize("global_size", [32, 64, 48]) +def test_matmul_execute_sequence(global_size, nvmath_distributed, check_symmetric_memory_leaks): + """Calculate A^4 where A is a square matrix, by creating and planning three separate + matmuls, which then execute in sequence.""" + + distributed_ctx = nvmath.distributed.get_context() + comm = distributed_ctx.communicator + rank = comm.Get_rank() + nranks = comm.Get_size() + device_id = distributed_ctx.device_id + + valid_nranks = (1, 2, 4, 8) + if nranks not in valid_nranks: + pytest.skip(f"This test needs nranks in {valid_nranks}") + + assert global_size % nranks == 0 + matrix_shape = (global_size // nranks, global_size) + + import cupy as cp + + stream = get_or_create_stream(device_id, stream=None, op_package="cupy") + with device_ctx(device_id): + a = cp.random.rand(*matrix_shape).astype(cp.float32) + a = cp.asfortranarray(a) + a_ = a.copy(order="F") + + distributions = [Slab.X] * 3 + mm1 = 
nvmath.distributed.linalg.advanced.Matmul(a, a_, distributions=distributions) + mm2 = nvmath.distributed.linalg.advanced.Matmul(a, a_, distributions=distributions) + mm3 = nvmath.distributed.linalg.advanced.Matmul(a, a_, distributions=distributions) + + mm1.plan() + mm2.plan() + mm3.plan() + + with device_ctx(device_id): + a[:] = mm1.execute() + a[:] = mm2.execute() + d = mm3.execute() + + for mm in (mm1, mm2, mm3): + mm.free() + + a_global = gather_array(to_host(dist_wrap_operand(a_), device_id, stream), 0, comm, rank) + result_global = gather_array(to_host(dist_wrap_operand(d), device_id, stream), 0, comm, rank) + if rank == 0: + a = a_global.tensor + expected = a @ a @ a @ a + assert is_close(result_global, wrap_operand(expected), rtol=1e-5, atol=1e-5), ( + "Gathered result doesn't match single-GPU matmul" + ) + + +def generate_process_grids(only_2d=False): + """Generate all possible process grids for the current number of MPI processes.""" + from mpi4py import MPI + + comm = MPI.COMM_WORLD + nranks = comm.Get_size() + # Return process grids as tuples of process grid shape and layout. We can't create + # ProcessGrid objects here because nvmath.distributed has not been initialized yet. + process_grids = [] + for i in range(1, nranks + 1): + for j in range(1, nranks + 1): + if only_2d and (i == 1 or j == 1): + continue + if i * j == nranks: + process_grids.append(((i, j), ProcessGrid.Layout.COL_MAJOR)) + process_grids.append(((i, j), ProcessGrid.Layout.ROW_MAJOR)) + return process_grids + + +def read_algo_from_log(logfile_path) -> int | None: + with open(logfile_path) as logfile: + # NOTE: need to call mm.execute() to see this printed to the logfile. + regexAlgo = re.compile(r"\[cublasMpMatmul\] using matmul algo (\d+)$") + for line in logfile: + m = regexAlgo.search(line) + if m: + return int(m.group(1)) + return None + + +def skip_test_uniform_1d_distributions( + package, + input_memory_space, + M_N_K, + transA, + transB, + A_distribution, + B_distribution, + C_distribution, + input_C, + epilog_AR, +): + if epilog_AR: + if not (transA and not transB): + # GEMM+AR algo only supported for TN + return True + if not (A_distribution == "R" and B_distribution == "R" and C_distribution == "C"): + # GEMM+AR algo requires A and B row-wise and C col-wise + return True + + if package == "numpy" and input_memory_space != "cpu": + return True # numpy only supports CPU memory space + if package == "cupy" and input_memory_space != "gpu": + return True # cupy only supports GPU memory space + + +@pytest.mark.uncollect_if(func=skip_test_uniform_1d_distributions) +@pytest.mark.parametrize("package", ["numpy", "cupy", "torch"]) +@pytest.mark.parametrize("input_memory_space", ["cpu", "gpu"]) +@pytest.mark.parametrize("M_N_K", [(64, 64, 64), (128, 96, 64), (64, 128, 64)]) +@pytest.mark.parametrize("transA", [True, False]) +@pytest.mark.parametrize("transB", [True, False]) +@pytest.mark.parametrize("A_distribution", ["R", "C"]) +@pytest.mark.parametrize("B_distribution", ["R", "C"]) +@pytest.mark.parametrize("C_distribution", ["R", "C"]) # same distribution applies to D +@pytest.mark.parametrize("input_C", [False, True]) +@pytest.mark.parametrize("epilog_AR", [False, True]) +def test_uniform_1d_distributions( + package, + input_memory_space, + M_N_K, + transA, + transB, + A_distribution, + B_distribution, + C_distribution, + input_C, + epilog_AR, + nvmath_distributed, + cublasmp_logfile_with_cleanup, + check_symmetric_memory_leaks, +): + distributed_ctx = nvmath.distributed.get_context() + comm = 
distributed_ctx.communicator + rank = comm.Get_rank() + nranks = comm.Get_size() + device_id = distributed_ctx.device_id + + valid_nranks = (1, 2, 4, 8) + if nranks not in valid_nranks: + pytest.skip(f"This test needs nranks in {valid_nranks}") + + try: + pkg = package_name_to_package[package] + except KeyError: + pytest.skip(f"{package} is not available") + + M, N, K = M_N_K + assert M % nranks == 0 + assert N % nranks == 0 + assert K % nranks == 0 + + assert all(d in ("C", "R") for d in (A_distribution, B_distribution, C_distribution)) + + if nranks == 1: + # With nranks=1 cuBLASMp always does a local GEMM. + expected_algo = 5 # local GEMM + else: + expected_algo = 0 # naive + if epilog_AR: + expected_algo = 4 # GEMM+AR + elif ( + C_distribution == "R" and A_distribution == ("C" if transA else "R") and B_distribution == ("R" if transB else "C") + ): + expected_algo = 3 # AG+GEMM + elif ( + C_distribution == "C" and A_distribution == ("R" if transA else "C") and B_distribution == ("C" if transB else "R") + ): + expected_algo = 2 # GEMM+RS + + # Generate some random numbers and broadcast them because every process + # must use the same. + r = np.random.rand(3) + r[2] = random.randint(0, nranks - 1) + comm.Bcast(r) + if r[0] < 0.5: + RowWiseDist = BlockNonCyclic(ProcessGrid(shape=(nranks, 1))) + assert RowWiseDist._is_row_wise() + else: + RowWiseDist = Slab.X + + if r[1] < 0.5: + ColWiseDist = BlockNonCyclic(ProcessGrid(shape=(1, nranks))) + assert ColWiseDist._is_col_wise() + else: + ColWiseDist = Slab.Y + + if A_distribution == "R": + if expected_algo == 0: + # Currently (rsrc, csrc) != (0, 0) only works for algo 0 (naive algorithm). + first_process = (int(r[2]), 0) + distribution_A = BlockNonCyclic(ProcessGrid(shape=(nranks, 1)), first_process=first_process) + else: + distribution_A = RowWiseDist + A_shape = (K // nranks, M) if transA else (M // nranks, K) + else: + distribution_A = ColWiseDist + A_shape = (K, M // nranks) if transA else (M, K // nranks) + + if B_distribution == "R": + distribution_B = RowWiseDist + B_shape = (N // nranks, K) if transB else (K // nranks, N) + else: + distribution_B = ColWiseDist + B_shape = (N, K // nranks) if transB else (K, N // nranks) + + if C_distribution == "R": + distribution_C = RowWiseDist + C_shape = (M // nranks, N) if not epilog_AR else (M, N) + else: + distribution_C = ColWiseDist + C_shape = (M, N // nranks) if not epilog_AR else (M, N) + + stream = None + if input_memory_space == "gpu": + stream = get_or_create_stream(device_id, stream=None, op_package=package) + + dtype = np.float32 + + def generate_random_matrix(shape, dtype, symmetric_memory): + return generate_random_data( + np if package != "torch" else pkg, + input_memory_space, + shape, + dtype, + stream, + memory_layout="F", + symmetric_memory=symmetric_memory, + ) + + a_cpu, a = generate_random_matrix(A_shape, dtype, False) + b_cpu, b = generate_random_matrix(B_shape, dtype, True) + if input_C: + beta = 0.8 + c_cpu, c = generate_random_matrix(C_shape, dtype, False) + if epilog_AR: + # For epilog_AR cuBLASMp has each process contribute its C to the result. + # To get the same result as single-GPU MM, we have to set the values + # to zero on every rank except one. 
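+            # Rank 0 keeps an arbitrary non-zero C while all other ranks hold zeros, so the
+            # summed per-process contributions equal the C seen by the single-GPU reference.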
+ c_cpu.tensor[:] = 7.0 if rank == 0 else 0.0 + with device_ctx(device_id): + c.tensor[:] = 7.0 if rank == 0 else 0.0 + else: + beta = c_cpu = c = None + + qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + qualifiers[0]["is_transpose"] = transA + qualifiers[1]["is_transpose"] = transB + distributions = [distribution_A, distribution_B, distribution_C] + # For 1D uniform distribution we don't have to pass blocking sizes and can + # let Matmul infer them. + with nvmath.distributed.linalg.advanced.Matmul( + a.tensor, + b.tensor, + c=c.tensor if input_C else None, + beta=beta, + distributions=distributions, + qualifiers=qualifiers, + ) as mm: + assert M == mm.mm_traits.M + assert N == mm.mm_traits.N + assert K == mm.mm_traits.K + mm.plan(epilog=MatmulEpilog.ALLREDUCE if epilog_AR else None) + assert expected_algo == mm._expected_algo + + mm_count = 0 + MM_LIMIT = 2 + while True: + d = mm.execute() + mm_count += 1 + d = dist_wrap_operand(d) + + assert d.module is a.module + assert d.dtype == "float32" + if input_memory_space == "gpu": + assert d.device == "cuda" + d_cpu = to_host(d, device_id, stream) + # matmul was called with some operands on symmetric memory and others not, + # so the result won't be on symmetric memory. + assert not d.is_symmetric_memory + else: + assert d.device == "cpu" + d_cpu = d + + if b.is_symmetric_memory: + free_symmetric_memory(b.tensor) + + if isinstance(distributions[0], BlockCyclic) and distributions[0].first_process != (0, 0): + # Reshape A to a BlockNonCyclic distribution with first_process=(0,0) before + # gathering. + assert distributions[0].process_grid.shape == (nranks, 1) + assert distributions[0].first_process[0] > 0 + rank_adjusted = (rank - distributions[0].first_process[0]) % nranks + mb, nb = A_shape + + lower = (mb * rank_adjusted, 0) + upper = (mb * rank_adjusted + mb, nb) + input_box = Box(lower, upper) + + lower = (mb * rank, 0) + upper = (mb * rank + mb, nb) + output_box = Box(lower, upper) + + a_cpu = nvmath.distributed.reshape.reshape(a_cpu.tensor, input_box, output_box) + a_cpu = dist_wrap_operand(a_cpu) + + a_global = gather_array(a_cpu, 0 if A_distribution == "R" else 1, comm, rank) + b_global = gather_array(b_cpu, 0 if B_distribution == "R" else 1, comm, rank) + if epilog_AR: + # C/D is not actually distributed (it's replicated on all processes). + if input_C: + c_global = c_cpu + d_global = d_cpu + else: + if input_C: + c_global = gather_array(c_cpu, 0 if C_distribution == "R" else 1, comm, rank) + d_global = gather_array(d_cpu, 0 if C_distribution == "R" else 1, comm, rank) + if rank == 0: + if input_C: + assert c_global.shape == (M, N) + assert d_global.shape == (M, N) + single_gpu_result = nvmath.linalg.advanced.matmul( + a_global.tensor.T if transA else a_global.tensor, + b_global.tensor.T if transB else b_global.tensor, + c=c_global.tensor if input_C else None, + beta=beta, + ) + single_gpu_result = wrap_operand(single_gpu_result) + try: + assert is_close(d_global, single_gpu_result, rtol=1e-5, atol=1e-5), ( + "Gathered result doesn't match single-GPU matmul" + ) + + algo = read_algo_from_log(cublasmp_logfile_with_cleanup) + assert algo is not None, "Couldn't determine the distributed matmul algorithm used" + assert algo == expected_algo, ( + f"cuBLASMp didn't run the expected distributed algorithm: algo is {algo}, " + f"expected algo is {expected_algo}" + ) + + comm.bcast(None) + + except Exception as e: + # Broadcast the exception to avoid deadlock. 
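+                    # The matching bcast in the else branch below delivers this exception
+                    # to the other ranks, which then re-raise it.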
+ comm.bcast(e) + raise + else: + # If rank 0 raises an exception, every process has to do the same to avoid + # deadlock. + e = comm.bcast(None) + if e is not None: + raise e + + if mm_count == MM_LIMIT: + break + + # Reset operands. + a_cpu, a = generate_random_matrix(A_shape, dtype, False) + b_cpu, b = generate_random_matrix(B_shape, dtype, True) + if input_C: + beta = 0.5 + c_cpu, c = generate_random_matrix(C_shape, dtype, False) + if epilog_AR: + # For epilog_AR cuBLASMp has each process contribute its C to + # the result. To get the same result as single-GPU MM, we have + # to set the values to zero on every rank except one. + c_cpu.tensor[:] = 10.0 if rank == 0 else 0.0 + with device_ctx(device_id): + c.tensor[:] = 10.0 if rank == 0 else 0.0 + else: + beta = c_cpu = c = None + mm.reset_operands(a.tensor, b.tensor, c.tensor if c is not None else None, beta=beta) + + +@pytest.mark.parametrize("M_N_K", [(64, 64, 64), (128, 96, 64), (64, 128, 64)]) +@pytest.mark.parametrize("transA", [True, False]) +@pytest.mark.parametrize("transB", [True, False]) +@pytest.mark.parametrize("input_C", [False, True]) +@pytest.mark.parametrize("cyclic", [False, True]) +@pytest.mark.parametrize("process_grid", generate_process_grids(only_2d=True)) +def test_2d_block( + M_N_K, + transA, + transB, + input_C, + cyclic, + process_grid, + nvmath_distributed, + cublasmp_logfile_with_cleanup, + check_symmetric_memory_leaks, +): + distributed_ctx = nvmath.distributed.get_context() + comm = distributed_ctx.communicator + rank = comm.Get_rank() + nranks = comm.Get_size() + + valid_nranks = (1, 2, 4) + if nranks not in valid_nranks: + pytest.skip(f"This test needs nranks in {valid_nranks}") + + M, N, K = M_N_K + assert M % nranks == 0 + assert N % nranks == 0 + assert K % nranks == 0 + + # Use the same process grid for A, B and C/D. + process_grid = ProcessGrid(shape=process_grid[0], layout=process_grid[1]) + if cyclic: + distribution = BlockCyclic(process_grid, (4, 4)) + else: + distribution = BlockNonCyclic(process_grid) + + A_shape = distribution.shape(rank, (K, M) if transA else (M, K)) + B_shape = distribution.shape(rank, (N, K) if transB else (K, N)) + C_shape = distribution.shape(rank, (M, N)) + + a = np.asfortranarray(np.random.rand(*A_shape).astype(np.float32)) + b = np.asfortranarray(np.random.rand(*B_shape).astype(np.float32)) + if input_C: + beta = 0.8 + c = np.asfortranarray(np.random.rand(*C_shape).astype(np.float32)) + else: + beta = c = None + + qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + qualifiers[0]["is_transpose"] = transA + qualifiers[1]["is_transpose"] = transB + with nvmath.distributed.linalg.advanced.Matmul( + a, + b, + c=c, + beta=beta, + distributions=[distribution] * 3, + qualifiers=qualifiers, + ) as mm: + # Check that the global matrix sizes were inferred correctly. + assert M == mm.mm_traits.M + assert N == mm.mm_traits.N + assert K == mm.mm_traits.K + mm.plan() + assert mm._expected_algo in (-1, 0, 5) # unknown or naive or local + d = mm.execute() + + nprow, npcol = process_grid.shape + myprow = rank % nprow if process_grid.layout == ProcessGrid.Layout.COL_MAJOR else rank // npcol + mypcol = rank // nprow if process_grid.layout == ProcessGrid.Layout.COL_MAJOR else rank % npcol + + def gather_matrix(matrix, mb, nb, global_shape): + # Reshape matrix to 1D column-wise (partitioning on Y) to be able to gather it. 
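+            # This process owns the local tile whose global origin is
+            # (myprow * mb, mypcol * nb) and whose extent is mb x nb.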
+ lower = (myprow * mb, mypcol * nb) + upper = (lower[0] + mb, lower[1] + nb) + input_box = Box(lower, upper) + output_box = Box((0, global_shape[1] // nranks * rank), (global_shape[0], global_shape[1] // nranks * (rank + 1))) + matrix = nvmath.distributed.reshape.reshape(matrix, input_box, output_box) + # Gather matrix + return gather_array(dist_wrap_operand(matrix), 1, comm, rank) + + # For gather we ignore the cyclic property, it doesn't affect correctness testing + # since cyclic determines a global permutation of values but the values themselves + # don't change. + a_global = gather_matrix(a, A_shape[0], A_shape[1], (K, M) if transA else (M, K)) + b_global = gather_matrix(b, B_shape[0], B_shape[1], (N, K) if transB else (K, N)) + if input_C: + c_global = gather_matrix(c, C_shape[0], C_shape[1], (M, N)) + d_global = gather_matrix(d, C_shape[0], C_shape[1], (M, N)) + + if rank == 0: + single_gpu_result = nvmath.linalg.advanced.matmul( + a_global.tensor.T if transA else a_global.tensor, + b_global.tensor.T if transB else b_global.tensor, + c=c_global.tensor if input_C else None, + beta=beta, + ) + single_gpu_result = wrap_operand(single_gpu_result) + try: + assert is_close(d_global, single_gpu_result, rtol=1e-5, atol=1e-5), ( + "Gathered result doesn't match single-GPU matmul" + ) + + algo = read_algo_from_log(cublasmp_logfile_with_cleanup) + assert algo is not None, "Couldn't determine the distributed matmul algorithm used" + expected_algo = 1 if cyclic else 0 # 0 is naive, 1 is SUMMA. + assert algo == expected_algo, ( + f"cuBLASMp didn't run the expected distributed algorithm: algo is {algo}, expected algo is {expected_algo}" + ) + + comm.bcast(None) + + except Exception as e: + # Broadcast the exception to avoid deadlock. + comm.bcast(e) + raise + else: + # If rank 0 raises an exception, every process has to do the same to avoid + # deadlock. + e = comm.bcast(None) + if e is not None: + raise e + + +def skip_test_global_shape_inference(global_shape, process_grid, blocking_sizes): + nprow, npcol = process_grid[0] + if blocking_sizes != "non-cyclic": + mb, nb = blocking_sizes + if nprow != 1 and npcol != 1 and (mb == "all" or nb == "all"): + return True # 'all' block size not used for 2D block distributions + if mb == "all" and nprow != 1: + return True # "mb='all' row block size requires (1, N) process grid + if nb == "all" and npcol != 1: + return True # nb='all' col block size requires (N, 1) process grid + return False + + +@pytest.mark.uncollect_if(func=skip_test_global_shape_inference) +# This test only uses square matrices. +@pytest.mark.parametrize("global_shape", [64]) +@pytest.mark.parametrize("process_grid", generate_process_grids()) +@pytest.mark.parametrize( + "blocking_sizes", + [ + (1, "all"), + (2, "all"), + (3, "all"), + ("all", 1), + ("all", 4), + ("all", 7), + (1, 1), + (1, 2), + (2, 1), + (2, 2), + (4, 3), + (4, 4), + (5, 3), + "non-cyclic", + ], +) +def test_global_shape_inference(global_shape, process_grid, blocking_sizes, nvmath_distributed): + """This tests that global shape inference works under a wide range of BlockCyclic + distributions, generated from all possible 1D and 2D process grids (given the + number of processes running the test) and various block sizes (cyclic and non-cyclic). + It doesn't run matmul end-to-end because the block sizes don't match across + A, B, C/D for matching dimensions.""" + + # test parameter with blocking size "all" means that the block size in that dimension + # is the full length of the global matrix in that dimension. 
Only used with 1D + # distributions for the dimension that is not partitioned. + + # Note that for 1D distributions, for the dimension that is not partitioned, a + # blocking size < 'all' is also valid (it simply means that the process has a number of + # contiguous blocks), and is in fact used to specify some algorithms in cuBLASMp. + + distributed_ctx = nvmath.distributed.get_context() + comm = distributed_ctx.communicator + rank = comm.Get_rank() + + process_grid = ProcessGrid(shape=process_grid[0], layout=process_grid[1]) + nprow, npcol = process_grid.shape + + if blocking_sizes == "non-cyclic": + mb = math.ceil(global_shape / nprow) + nb = math.ceil(global_shape / npcol) + else: + mb, nb = blocking_sizes + if mb == "all": + mb = global_shape + if nb == "all": + nb = global_shape + + myprow = rank % nprow if process_grid.layout == ProcessGrid.Layout.COL_MAJOR else rank // npcol + mypcol = rank // nprow if process_grid.layout == ProcessGrid.Layout.COL_MAJOR else rank % npcol + + local_nrows = cublasMp.numroc(global_shape, mb, myprow, 0, nprow) + local_ncols = cublasMp.numroc(global_shape, nb, mypcol, 0, npcol) + + from mpi4py import MPI + + total_elements = np.array([local_nrows * local_ncols], dtype=np.int64) + comm.Allreduce(MPI.IN_PLACE, total_elements, op=MPI.SUM) + assert total_elements == global_shape * global_shape + + a = np.zeros((local_nrows, local_ncols)) + a = np.asfortranarray(a) + + distributions = [BlockCyclic(process_grid, (mb, nb))] * 3 + mm = nvmath.distributed.linalg.advanced.Matmul(a, a, distributions=distributions) + # Check that the global matrix sizes were inferred correctly. + assert mm.mm_traits.M == mm.mm_traits.N == mm.mm_traits.K == global_shape + mm.free() + + +def valid_matrix_dtypes(): + SUPPORTED_TYPES = nvmath.linalg._internal.typemaps.SUPPORTED_TYPES + # cupy doesn't support complex32 + return [dt for dt in SUPPORTED_TYPES if dt != "complex32"] + + +def is_invalid_compute_and_dtype_combination(compute_type, a_dtype, b_dtype, c_dtype, d_dtype, M_N_K): + assert all(dtype is not None for dtype in (a_dtype, b_dtype, d_dtype)) + + # TODO: c_type != None + + if "complex" in a_dtype or "complex" in d_dtype: + if not (a_dtype == b_dtype == d_dtype): + return True + if a_dtype == "complex64" and not compute_type.startswith("COMPUTE_32F"): + return True + if a_dtype == "complex128" and not compute_type.startswith("COMPUTE_64F"): + return True + + if compute_type in ("COMPUTE_32F", "COMPUTE_32F_PEDANTIC") and NAME_TO_DATA_WIDTH[d_dtype] == 16 and a_dtype == "float32": + return True + + if compute_type in ("COMPUTE_32F", "COMPUTE_32F_PEDANTIC") and NAME_TO_DATA_WIDTH[d_dtype] == 64 and a_dtype == "float32": + return True + + if compute_type in ("COMPUTE_32I", "COMPUTE_32I_PEDANTIC"): + return True + + if compute_type in ("COMPUTE_16F", "COMPUTE_16F_PEDANTIC") and a_dtype != "float16": + return True + + if (compute_type in ("COMPUTE_64F", "COMPUTE_64F_PEDANTIC")) ^ (a_dtype in ("float64", "complex128")): + return True + + if compute_type in ("COMPUTE_32F_FAST_16F", "COMPUTE_32F_FAST_16BF", "COMPUTE_32F_FAST_TF32"): + if a_dtype not in ("float32", "complex64"): + return True + if not (a_dtype == b_dtype == d_dtype): + # NOTE: cuBLASLt and cuBLASMp don't throw an error for this case + # (e.g. a_dtype=float32, b_dtype=float32, d_dtype=float64) + # but according to docs cuBLASMp doesn't support this, and the result doesn't + # match cuBLASLt. 
+ return True + + if NAME_TO_DATA_WIDTH[a_dtype] != 8 and NAME_TO_DATA_WIDTH[b_dtype] != 8: + if a_dtype != b_dtype: + return True + else: + # FP8 + if d_dtype == "float64" or d_dtype.startswith("complex"): + return True + if d_dtype == "float8_e5m2" and a_dtype != "float8_e5m2" and b_dtype != "float8_e5m2": + return True + if compute_type != "COMPUTE_32F": + return True + if NAME_TO_DATA_WIDTH[a_dtype] != NAME_TO_DATA_WIDTH[b_dtype]: + return True + if a_dtype == "float8_e5m2" and b_dtype == "float8_e5m2": + return True + + if NAME_TO_DATA_WIDTH[d_dtype] == 8 and NAME_TO_DATA_WIDTH[a_dtype] != 8: + return True + + if NAME_TO_DATA_WIDTH[a_dtype] == 64 and (NAME_TO_DATA_WIDTH[b_dtype] != 64 or NAME_TO_DATA_WIDTH[d_dtype] != 64): + return True + + if a_dtype == b_dtype and NAME_TO_DATA_WIDTH[a_dtype] == 16 and d_dtype != a_dtype: + if compute_type in ("COMPUTE_16F", "COMPUTE_16F_PEDANTIC"): + return True + if d_dtype != "float32": + return True + + if (a_dtype == b_dtype == "bfloat16") and (d_dtype not in ("bfloat16", "float32")): + return True + + +# Skip invalid compute_type and matrix dtype combinations. It might be better to check +# that Matmul correctly throws an error for invalid combinations, but the number of +# invalid combinations is very large, and destroying distributed Matmul objects currently +# takes too long. +@pytest.mark.uncollect_if(func=is_invalid_compute_and_dtype_combination) +# Use the compute_type name instead of the enum value in order to see the name +# in the pytest output instead of an integer code. +@pytest.mark.parametrize("compute_type", [compute_type.name for compute_type in MatmulComputeType]) +@pytest.mark.parametrize("a_dtype", valid_matrix_dtypes()) +@pytest.mark.parametrize("b_dtype", valid_matrix_dtypes()) +@pytest.mark.parametrize("c_dtype", [None]) +@pytest.mark.parametrize("d_dtype", valid_matrix_dtypes()) +@pytest.mark.parametrize("M_N_K", [(64, 64, 64), (128, 96, 64), (64, 128, 64)]) +def test_dtypes( + compute_type, + a_dtype, + b_dtype, + c_dtype, + d_dtype, + M_N_K, + nvmath_distributed, + check_symmetric_memory_leaks, +): + """Test various combinations of compute_type and matrix dtypes (including mixed and + narrow-precision) and check that the result matches single-GPU matmul.""" + + distributed_ctx = nvmath.distributed.get_context() + comm = distributed_ctx.communicator + rank = comm.Get_rank() + nranks = comm.Get_size() + device_id = distributed_ctx.device_id + + # TODO: c_dtype != None + + compute_type = MatmulComputeType[compute_type] + dtypes = (a_dtype, b_dtype, c_dtype, d_dtype) + + torch_required = set(dtypes) & {"float8_e4m3fn", "float8_e5m2", "bfloat16"} + if torch_required: + pytest.skip("FP8 not supported") + if torch_required and "torch" not in package_name_to_package: + pytest.skip(f"torch is required for one of {torch_required} but is not installed") + + m, n, k = M_N_K + assert k % nranks == 0 + + # Use TN: (k, m) * (k, n) = (m, n) + # (note that we use x.T on the created matrices so that cuBLASMp sees + # Fortran memory order) + a_shape = (m, k // nranks) + b_shape = (n, k // nranks) + + beta = None if c_dtype is None else 1.0 + + scales = None + if torch_required: + # Allocate all operands with PyTorch. 
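+        # The float8 and bfloat16 dtypes are not available in CuPy/NumPy, hence the
+        # dedicated torch allocation path.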
+ import torch + + stream = get_or_create_stream(device_id, stream=None, op_package="torch") + name_to_dtype = nvmath.internal.tensor_ifc_torch.TorchTensor.name_to_dtype + # transpose to get Fortran order + a = (torch.rand(*a_shape, device=f"cuda:{device_id}") * 10).type(name_to_dtype[a_dtype]).T + b = (torch.rand(*b_shape, device=f"cuda:{device_id}") * 10).type(name_to_dtype[b_dtype]).T + c = None + if c_dtype is not None: + raise NotImplementedError + if NAME_TO_DATA_WIDTH[a_dtype] == 8: + scales = {"a": 0.8, "b": 0.9} + if NAME_TO_DATA_WIDTH[d_dtype] == 8: + scales["d"] = 0.1 + else: + # Allocate all operands with CuPy. + import cupy as cp + + stream = get_or_create_stream(device_id, stream=None, op_package="cupy") + name_to_dtype = nvmath.internal.tensor_ifc_numpy.NumpyTensor.name_to_dtype + with device_ctx(device_id): + # transpose to get Fortran order + if "complex" in a_dtype: + assert a_dtype != "complex32" + float_dtype = cp.float32 if a_dtype == "complex64" else cp.float64 + a = (cp.random.rand(*a_shape, dtype=float_dtype) + 1j * cp.random.rand(*a_shape, dtype=float_dtype)).T + b = (cp.random.rand(*b_shape, dtype=float_dtype) + 1j * cp.random.rand(*b_shape, dtype=float_dtype)).T + else: + a = (cp.random.rand(*a_shape) * 10).astype(name_to_dtype[a_dtype]).T + b = (cp.random.rand(*b_shape) * 10).astype(name_to_dtype[b_dtype]).T + c = None + if c_dtype is not None: + raise NotImplementedError + + cc = ccx.Device(device_id).compute_capability + if any(NAME_TO_DATA_WIDTH[dt] <= 8 for dt in dtypes if dt is not None) and cc < (8, 9): + pytest.skip("FP8 requires compute capability >= 8.9") + + options = {"compute_type": compute_type, "result_type": NAME_TO_DATA_TYPE[d_dtype]} + if NAME_TO_DATA_WIDTH[d_dtype] <= 8: + options["result_amax"] = True + qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + if "complex" in a_dtype: + qualifiers[0]["is_conjugate"] = qualifiers[0]["is_transpose"] = True + else: + qualifiers[0]["is_transpose"] = True + d = nvmath.distributed.linalg.advanced.matmul( + a, + b, + c=c, + distributions=[Slab.X] * 3, + beta=beta, + qualifiers=qualifiers, + # quantization_scales=scales, + options=options, + ) + if isinstance(d, Sequence) and len(d) == 2: + d, aux = d + if "result_amax" in aux: + from mpi4py import MPI + + aux_global = comm.allreduce(aux["result_amax"].item(), op=MPI.MAX) + + a = dist_wrap_operand(a) + b = dist_wrap_operand(b) + d = dist_wrap_operand(d) + assert d.shape == (m // nranks, n) + assert d.dtype == d_dtype + assert d.module is a.module + assert d.device == "cuda" and d.device_id == device_id + + if "complex" in a_dtype: + a_global = gather_array(to_host(dist_wrap_operand(a.tensor), device_id, stream), 0, comm, rank) + else: + a_global = gather_array(to_host(dist_wrap_operand(a.tensor.T), device_id, stream), 1, comm, rank) + b_global = gather_array(to_host(dist_wrap_operand(b.tensor.T), device_id, stream), 1, comm, rank) + d_global = gather_array(to_host(d, device_id, stream), 0, comm, rank) + if rank == 0: + qualifiers = None + c = None + if "complex" in a_dtype: + qualifiers = np.zeros((3,), dtype=nvmath.linalg.advanced.matrix_qualifiers_dtype) + qualifiers[0]["is_conjugate"] = True + # cuBLASLt fails to query heuristics for conjugate transpose unless + # C is provided. 
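+            # Work around it by passing an all-zeros C with beta=1.0, which leaves the
+            # reference result unchanged.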
+ beta = 1.0 + c = np.zeros((m, n), dtype=name_to_dtype[d_dtype]) + single_gpu_result = nvmath.linalg.advanced.matmul( + a_global.tensor if "complex" not in a_dtype else a_global.tensor.T, + b_global.tensor.T, + c=c, + beta=beta, + qualifiers=qualifiers, + quantization_scales=scales, + options=options, + ) + single_gpu_aux = {} + if isinstance(single_gpu_result, Sequence) and len(single_gpu_result) == 2: + single_gpu_result, single_gpu_aux = single_gpu_result + + single_gpu_result = wrap_operand(single_gpu_result) + try: + if "result_amax" in single_gpu_aux: + assert math.isclose(aux_global, single_gpu_aux["result_amax"].item(), rel_tol=1e-3, abs_tol=1e-3) + if NAME_TO_DATA_WIDTH[a_global.dtype] <= 16: + rtol, atol = 1e-1, 1 + elif compute_type in (MatmulComputeType.COMPUTE_32F_FAST_TF32, MatmulComputeType.COMPUTE_32F_FAST_16F): + rtol, atol = 1e-2, 1e-1 + else: + rtol, atol = 1e-5, 1e-5 + assert is_close(d_global, single_gpu_result, rtol, atol), "Gathered result doesn't match single-GPU matmul" + comm.bcast(None) + except Exception as e: + # Broadcast the exception to avoid deadlock. + comm.bcast(e) + raise + else: + # If rank 0 raises an exception, every process has to do the same to avoid + # deadlock. + e = comm.bcast(None) + if e is not None: + raise e diff --git a/tests/nvmath_tests/distributed/test_nvshmem.py b/tests/nvmath_tests/distributed/test_nvshmem.py index 0bb6002..3e7de67 100644 --- a/tests/nvmath_tests/distributed/test_nvshmem.py +++ b/tests/nvmath_tests/distributed/test_nvshmem.py @@ -45,7 +45,7 @@ def nvmath_distributed(): comm = MPI.COMM_WORLD device_id = comm.Get_rank() % cuda.core.experimental.system.num_devices - nvmath.distributed.initialize(device_id) + nvmath.distributed.initialize(device_id, comm, backends=["nvshmem"]) yield diff --git a/tests/nvmath_tests/distributed/test_reshape.py b/tests/nvmath_tests/distributed/test_reshape.py index 3fee97d..c93dc0a 100644 --- a/tests/nvmath_tests/distributed/test_reshape.py +++ b/tests/nvmath_tests/distributed/test_reshape.py @@ -7,6 +7,7 @@ import re import nvmath.distributed +from nvmath.distributed.distribution import Box from nvmath.internal.utils import device_ctx, get_or_create_stream from nvmath.distributed import free_symmetric_memory from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand, maybe_register_package @@ -34,14 +35,14 @@ def nvmath_distributed(): pass device_id = MPI.COMM_WORLD.Get_rank() % cuda.core.experimental.system.num_devices - nvmath.distributed.initialize(device_id, MPI.COMM_WORLD) + nvmath.distributed.initialize(device_id, MPI.COMM_WORLD, backends=["nvshmem"]) yield nvmath.distributed.finalize() -def _calculate_local_box(global_shape, partition_dim, rank, nranks): +def _calculate_local_box(global_shape, partition_dim, rank, nranks) -> Box: """Given a global shape of data that is partitioned across ranks along the `partition_dim` dimension, return the local box of this rank (as a lower and upper coordinate in the global shape). 
@@ -53,7 +54,7 @@ def _calculate_local_box(global_shape, partition_dim, rank, nranks): shape = calc_slab_shape(global_shape, partition_dim, rank, nranks) upper = list(shape) upper[partition_dim] += lower[partition_dim] - return lower, upper + return Box(lower, upper) @pytest.mark.parametrize("dtype", [np.int8, np.int16]) @@ -90,7 +91,7 @@ def test_wrong_boxes1(nvmath_distributed, check_symmetric_memory_leaks): ValueError, match=re.escape("The global number of elements is incompatible with the inferred global shape (2, 2)") ): data = np.array([0, 1, 2, 3], dtype=np.int32).reshape((2, 2)) - nvmath.distributed.reshape.reshape(data, input_box=[(0, 0), (2, 2)], output_box=[(0, 0), (2, 2)]) + nvmath.distributed.reshape.reshape(data, input_box=Box((0, 0), (2, 2)), output_box=Box((0, 0), (2, 2))) def test_wrong_boxes2(nvmath_distributed, check_symmetric_memory_leaks): @@ -112,10 +113,10 @@ def test_wrong_boxes2(nvmath_distributed, check_symmetric_memory_leaks): ): if rank % 2 == 0: data = np.array([0, 1, 2, 3], dtype=dtype).reshape((2, 2)) - nvmath.distributed.reshape.reshape(data, input_box=[(0, 0), (2, 2)], output_box=[(0, 0), (2, 2)]) + nvmath.distributed.reshape.reshape(data, input_box=Box((0, 0), (2, 2)), output_box=Box((0, 0), (2, 2))) else: data = np.array([4, 5, 6, 7], dtype=dtype).reshape((2, 2)) - nvmath.distributed.reshape.reshape(data, input_box=[(4, 4), (6, 6)], output_box=[(4, 4), (6, 6)]) + nvmath.distributed.reshape.reshape(data, input_box=Box((4, 4), (6, 6)), output_box=Box((4, 4), (6, 6))) def test_wrong_boxes3(nvmath_distributed, check_symmetric_memory_leaks): @@ -131,7 +132,7 @@ def test_wrong_boxes3(nvmath_distributed, check_symmetric_memory_leaks): match=re.escape("The upper coordinates must be larger than the lower coordinates, but got lower=(2, 2) upper=(0, 0)"), ): data = np.array([0, 1, 2, 3], dtype=np.int32).reshape((2, 2)) - nvmath.distributed.reshape.reshape(data, input_box=[(2, 2), (0, 0)], output_box=[(2, 2), (0, 0)]) + nvmath.distributed.reshape.reshape(data, input_box=Box((2, 2), (0, 0)), output_box=Box((2, 2), (0, 0))) def test_inconsistent_layout(nvmath_distributed, check_symmetric_memory_leaks): @@ -184,11 +185,11 @@ def F(a): if rank == 0: data = F(np.array([0, 1, 3, 4], dtype=dtype).reshape((2, 2))) - result = nvmath.distributed.reshape.reshape(data, input_box=[(0, 0), (2, 2)], output_box=[(0, 0), (2, 1)]) + result = nvmath.distributed.reshape.reshape(data, input_box=Box((0, 0), (2, 2)), output_box=Box((0, 0), (2, 1))) expected = np.array([0, 3], dtype=dtype).reshape((2, 1)) else: data = F(np.array([2, 5], dtype=dtype).reshape((2, 1))) - result = nvmath.distributed.reshape.reshape(data, input_box=[(0, 2), (2, 3)], output_box=[(0, 1), (2, 3)]) + result = nvmath.distributed.reshape.reshape(data, input_box=Box((0, 2), (2, 3)), output_box=Box((0, 1), (2, 3))) expected = np.array([1, 2, 4, 5], dtype=dtype).reshape((2, 2)) np.testing.assert_equal(result, expected) @@ -223,19 +224,19 @@ def F(a): if rank == 0: data = F(np.array([(0, 1), (4, 5)], dtype=dtype)) - result = nvmath.distributed.reshape.reshape(data, input_box=[(0, 0), (2, 2)], output_box=[(0, 0), (4, 1)]) + result = nvmath.distributed.reshape.reshape(data, input_box=Box((0, 0), (2, 2)), output_box=Box((0, 0), (4, 1))) expected = np.array([0, 4, 8, 12], dtype=dtype).reshape((4, 1)) elif rank == 1: data = F(np.array([(2, 3), (6, 7)], dtype=dtype)) - result = nvmath.distributed.reshape.reshape(data, input_box=[(0, 2), (2, 4)], output_box=[(0, 1), (4, 2)]) + result = nvmath.distributed.reshape.reshape(data, 
input_box=Box((0, 2), (2, 4)), output_box=Box((0, 1), (4, 2))) expected = np.array([1, 5, 9, 13], dtype=dtype).reshape((4, 1)) elif rank == 2: data = F(np.array([(8, 9), (12, 13)], dtype=dtype)) - result = nvmath.distributed.reshape.reshape(data, input_box=[(2, 0), (4, 2)], output_box=[(0, 2), (4, 3)]) + result = nvmath.distributed.reshape.reshape(data, input_box=Box((2, 0), (4, 2)), output_box=Box((0, 2), (4, 3))) expected = np.array([2, 6, 10, 14], dtype=dtype).reshape((4, 1)) else: data = F(np.array([(10, 11), (14, 15)], dtype=dtype)) - result = nvmath.distributed.reshape.reshape(data, input_box=[(2, 2), (4, 4)], output_box=[(0, 3), (4, 4)]) + result = nvmath.distributed.reshape.reshape(data, input_box=Box((2, 2), (4, 4)), output_box=Box((0, 3), (4, 4))) expected = np.array([3, 7, 11, 15], dtype=dtype).reshape((4, 1)) np.testing.assert_equal(result, expected) @@ -508,12 +509,12 @@ def test_distributed_reshape_1D(package, nvmath_distributed, check_symmetric_mem # Calculate output box. nelems_per_other_rank = (global_shape[0] - 80) // (nranks - 1) if rank == 0: - output_box = ([0], [80]) + output_box = Box([0], [80]) else: lower = 80 for i in range(1, rank): lower += nelems_per_other_rank - output_box = [lower], [lower + nelems_per_other_rank] + output_box = Box([lower], [lower + nelems_per_other_rank]) # Run distributed reshape. result = nvmath.distributed.reshape.reshape(data_in.tensor, input_box, output_box) diff --git a/tests/nvmath_tests/fft/test_lto_callbacks.py b/tests/nvmath_tests/fft/test_lto_callbacks.py index 03d0485..db7bae4 100644 --- a/tests/nvmath_tests/fft/test_lto_callbacks.py +++ b/tests/nvmath_tests/fft/test_lto_callbacks.py @@ -92,11 +92,8 @@ def get_tolerance(a, shape_kind=None): framework = get_framework_from_array(a) mem_backend = get_array_backend(a) rtol, atol = get_default_tolerance(dtype, shape_kind, exec_backend=ExecBackend.cufft) - # LTO EA was observed to have slightly off outputs for float32 # The torch CPU fft has bigger difference as well - if (nvmath.bindings.cufft.get_version() < 11300 and dtype in [DType.float32, DType.complex64]) or ( - framework == Framework.torch and mem_backend == MemBackend.cpu - ): + if framework == Framework.torch and mem_backend == MemBackend.cpu: rtol *= 1.2 return {"rtol": rtol, "atol": atol} @@ -127,25 +124,13 @@ def assert_norm_close_check_constant(a, a_ref, rtol=None, atol=None, axes=None, raise AssertionError(f"The outputs differ by a constant factor of {factor}") from e -def allow_to_fail_lto_ea_3d(e, shape, axes): - if isinstance(e, ValueError) and "3D FFT with the last extent equal 1" in str(e): - assert nvmath.bindings.cufft.get_version() < 11300 - fft_dim = len(axes) if axes is not None else len(shape) - assert fft_dim == 3 - axes = axes or list(range(fft_dim)) - assert shape[axes[-1]] == 1 - assert sum(shape[a] == 1 for a in axes) == 1 - raise pytest.skip("cuFFT LTO EA 3D last extent 1 is not supported") - - def allow_to_fail_compund_shape(e, shape, axes): - if not has_only_small_factors(shape, axes): - if nvmath.bindings.cufft.get_version() < 11300: - if isinstance(e, ValueError) and "cuFFT LTO EA does not" in str(e): - raise pytest.skip(f"NVMATH CHECK: Unsupported {shape} comprising primes larger than 127") - else: - if isinstance(e, nvmath.bindings.cufft.cuFFTError) and "CUFFT_NOT_SUPPORTED" in str(e): - raise pytest.skip(f"CUFFT_UNSUPPORTED: Unsupported {shape} comprising primes larger than 127") + if ( + isinstance(e, nvmath.bindings.cufft.cuFFTError) + and "CUFFT_NOT_SUPPORTED" in str(e) + and not 
has_only_small_factors(shape, axes) + ): + raise pytest.skip(f"CUFFT_NOT_SUPPORTED: Unsupported {shape} comprising primes larger than 127") raise @@ -1232,7 +1217,6 @@ def ref(data, flt): }, ) except (nvmath.bindings.cufft.cuFFTError, ValueError) as e: - allow_to_fail_lto_ea_3d(e, shape, axes) if not allow_to_fail: raise allow_to_fail_compund_shape(e, shape, axes=axes) @@ -2315,3 +2299,42 @@ def epilog_fn(data, offset, element, flt, unused): match="The 'prolog' and 'epilog' are not supported with CPU 'execution'", ): fn(signal, execution=exec_backend.nvname, **cb_kwargs) + + +def test_device_callable_validation(): + """Test DeviceCallable class validation logic.""" + # Test case 1: ltoir type validation - must be bytes or int + with pytest.raises(ValueError, match="The LTO-IR code must be provided as a bytes object or as a Python int"): + nvmath.fft.DeviceCallable(ltoir="test") # string is invalid + + # Test case 2: size is required when ltoir is int pointer + with pytest.raises(ValueError, match="The size of the LTO-IR code specified as a pointer must be explicitly provided"): + nvmath.fft.DeviceCallable(ltoir=1000, size=None) # int pointer without size + + # Test case 3: size validation - must be integer type + with pytest.raises(ValueError, match="Invalid size value"): + nvmath.fft.DeviceCallable(ltoir=1000, size=3.14) # float is invalid + + # Test case 4: data default value when None + dc = nvmath.fft.DeviceCallable(ltoir=b"test", data=None) + assert dc.data == 0 # should default to 0 + + # Test case 5: data type validation - must be integer type + with pytest.raises(ValueError, match="The 'data' attribute must be a Python int"): + nvmath.fft.DeviceCallable(ltoir=b"test", data="invalid_data") # string is invalid + + # Test case 6: valid cases with bytes ltoir + dc = nvmath.fft.DeviceCallable(ltoir=b"test", data=42) + assert dc.size == 4 # len(b"test") = 4 + assert dc.data == 42 + + # Test case 7: valid cases with int pointer ltoir + dc = nvmath.fft.DeviceCallable(ltoir=12345, size=100, data=42) + assert dc.size == 100 # explicitly specified + assert dc.data == 42 + + # Test case 8: default values when ltoir is None + dc = nvmath.fft.DeviceCallable() + assert dc.ltoir is None + assert dc.size is None + assert dc.data is None diff --git a/tests/nvmath_tests/linalg/__init__.py b/tests/nvmath_tests/linalg/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/nvmath_tests/linalg/advanced/__init__.py b/tests/nvmath_tests/linalg/advanced/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py b/tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py index a6620e0..84dacb0 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py @@ -8,7 +8,7 @@ torch = None import pytest import numpy as np -from .utils import sample_matrix, assert_tensors_equal, to_numpy +from ...utils import sample_matrix, assert_tensors_equal, to_numpy from nvmath.internal.utils import check_or_create_options from nvmath.linalg.advanced import matmul from nvmath.linalg.advanced import _configuration diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py b/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py index 8ea28f5..bed79bc 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py @@ -12,7 +12,7 @@ from nvmath.linalg.advanced import matmul, Matmul, MatmulEpilog as 
Epilog from nvmath.bindings import cublasLt as cublaslt -from .utils import ( +from ...utils import ( compare_tensors, get_absolute_tolerance, get_framework, diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py index d919965..d4b4b34 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py @@ -8,7 +8,7 @@ except ImportError: torch = None import pytest -from .utils import sample_matrix, assert_tensors_equal, matmul_with_random_autotune +from ...utils import sample_matrix, assert_tensors_equal, matmul_with_random_autotune from .fp8_utils import choose_scales, generate_inputs, assert_fp8_equal, fp8_matmul_reference from nvmath.linalg.advanced import Matmul, matmul, MatmulQuantizationScales from nvmath.internal.typemaps import NAME_TO_DATA_TYPE diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py index 8847ebd..2be5093 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py @@ -8,7 +8,7 @@ except ImportError: torch = None import pytest -from .utils import sample_matrix, allow_cublas_unsupported, matmul_with_random_autotune +from ...utils import sample_matrix, allow_cublas_unsupported, matmul_with_random_autotune from .fp8_utils import assert_fp8_equal, fp8_matmul_reference, simple_scales, generate_inputs, choose_scales from nvmath.linalg.advanced import Matmul, MatmulEpilog as Epilog from nvmath.internal.typemaps import NAME_TO_DATA_TYPE diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_input.py b/tests/nvmath_tests/linalg/advanced/matmul/test_input.py index da745c1..fac9eb2 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_input.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_input.py @@ -16,7 +16,7 @@ except ModuleNotFoundError: cp = None -from .utils import compare_tensors, random_torch_complex, sample_matrix, assert_tensors_equal, to_numpy, get_framework +from ...utils import compare_tensors, random_torch_complex, sample_matrix, assert_tensors_equal, to_numpy, get_framework @pytest.mark.parametrize("framework", ("torch", "numpy/cupy")) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py b/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py index e874d6f..fda7634 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py @@ -53,7 +53,7 @@ from nvmath.memory import BaseCUDAMemoryManager from nvmath_tests.helpers import nvmath_seed -from .utils import get_absolute_tolerance +from ...utils import get_absolute_tolerance MatmulEpilog_BIAS_list = [ MatmulEpilog.BIAS, @@ -569,7 +569,7 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs "or as a dict with valid Matrix multiplication plan preferences." 
in str(e) ): assert not isinstance(preferences, MatmulPlanPreferences) - elif "The allocator must be an object of type that fulfils the BaseCUDAMemoryManager protocol" in str(e): + elif "The allocator must be an object of type that fulfills the BaseCUDAMemoryManager protocol" in str(e): assert not isinstance(options["allocator"], BaseCUDAMemoryManager) elif "int() argument must be a string, a bytes-like object or a number, not 'NoneType'" in str(e): assert preferences["reduction_scheme_mask"] is None or preferences["numerical_impl_mask"] is None diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py b/tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py index 0f03dc0..79851cd 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py @@ -8,7 +8,7 @@ except ImportError: torch = None import pytest -from .utils import sample_matrix +from ...utils import sample_matrix from .fp8_utils import assert_fp8_equal from nvmath.linalg.advanced import Matmul, matmul, MatmulEpilog as Epilog from nvmath.linalg.advanced.helpers import matmul as matmul_helpers @@ -17,7 +17,7 @@ from nvmath.internal.utils import check_or_create_options from nvmath.linalg.advanced import _configuration from contextlib import nullcontext -from .utils import allow_cublas_unsupported +from ...utils import allow_cublas_unsupported if torch is None: pytest.skip("Torch is required for MXFP8 tests", allow_module_level=True) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py index bafa6f2..bef2d2b 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py @@ -9,7 +9,7 @@ import nvmath import pytest -from .utils import assert_tensors_equal, sample_matrix, is_torch_available +from ...utils import assert_tensors_equal, sample_matrix, is_torch_available try: import cupy_backends.cuda diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py b/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py index c678017..f6cf335 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from .utils import sample_matrix, allow_cublas_unsupported, assert_tensors_equal +from ...utils import sample_matrix, allow_cublas_unsupported, assert_tensors_equal try: import cupy diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py b/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py index c1645a7..44bf149 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py @@ -9,7 +9,7 @@ import nvmath import pytest -from .utils import assert_tensors_equal, random_torch_complex, sample_matrix, skip_if_cublas_before +from ...utils import assert_tensors_equal, random_torch_complex, sample_matrix, skip_if_cublas_before @pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) diff --git a/tests/nvmath_tests/linalg/generic/__init__.py b/tests/nvmath_tests/linalg/generic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/nvmath_tests/linalg/generic/matmul/__init__.py b/tests/nvmath_tests/linalg/generic/matmul/__init__.py new file mode 100644 index 0000000..eaf6019 --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/__init__.py @@ -0,0 +1,26 @@ +try: + from 
nvmath.linalg._internal.utils import get_handle
+
+    get_handle(0, binding="cublas")
+    del get_handle
+    CUBLAS_AVAILABLE = True
+except Exception:
+    CUBLAS_AVAILABLE = False
+
+try:
+    from nvmath.bindings.nvpl.blas import get_version
+    from nvmath.bindings._internal.utils import FunctionNotFoundError
+
+    get_version()
+    del get_version
+    NVPL_AVAILABLE = True
+except FunctionNotFoundError as e:
+    if "function nvpl_blas_get_version is not found" not in str(e):
+        raise e
+    # An NVPL alternative was loaded which doesn't implement nvpl_blas_get_version
+    NVPL_AVAILABLE = True
+except RuntimeError as e:
+    if "Failed to dlopen all of the following libraries" not in str(e):
+        raise e
+    # Neither NVPL nor an alternative was loaded
+    NVPL_AVAILABLE = False
diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_input.py b/tests/nvmath_tests/linalg/generic/matmul/test_input.py
new file mode 100644
index 0000000..1e3346d
--- /dev/null
+++ b/tests/nvmath_tests/linalg/generic/matmul/test_input.py
@@ -0,0 +1,572 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This set of tests checks matmul's behavior for different kinds of inputs.
+"""
+
+from nvmath.linalg.generic import matmul, matrix_qualifiers_dtype, GeneralMatrixQualifier
+import numpy as np
+import pytest
+from nvmath.bindings import cublasLt as cublaslt
+
+try:
+    import cupy as cp
+except ModuleNotFoundError:
+    cp = None
+
+from ...utils import compare_tensors, random_torch_complex, sample_matrix, assert_tensors_equal, to_numpy, get_framework
+
+from . import CUBLAS_AVAILABLE, NVPL_AVAILABLE
+
+use_cuda_options = (
+    *((True,) if CUBLAS_AVAILABLE else ()),
+    *((False,) if NVPL_AVAILABLE else ()),
+)
+
+
+@pytest.mark.parametrize("framework", ("torch", "numpy/cupy"))
+@pytest.mark.parametrize("dtype", ("float32", "float64", "complex64", "complex128"))
+@pytest.mark.parametrize("with_c", (True, False))
+@pytest.mark.parametrize(
+    "n,m,k",
+    (
+        (1, 1, 1),
+        (2, 3, 4),
+        (3, 2, 1),
+        (4, 3, 2),
+    ),
+)
+@pytest.mark.parametrize("use_cuda", use_cuda_options)
+def test_types(framework, dtype, with_c, n, m, k, use_cuda):
+    """
+    Tests support for different input data types and frameworks.
+ """ + try: + a = sample_matrix(framework, dtype, (n, k), use_cuda) + b = sample_matrix(framework, dtype, (k, m), use_cuda) + c = sample_matrix(framework, dtype, (n, m), use_cuda) + except NotImplementedError as e: + pytest.skip(f"Unable to generate sample matrix: {str(e)}") + + if with_c: + result = matmul(a, b, c, alpha=0.5, beta=0.3) + reference = 0.5 * to_numpy(a) @ to_numpy(b) + 0.3 * to_numpy(c) + else: + result = matmul(a, b, alpha=0.6) + reference = 0.6 * to_numpy(a) @ to_numpy(b) + + assert_tensors_equal(result, reference) + + +@pytest.mark.parametrize("with_c", (True, False)) +@pytest.mark.parametrize("n", (1, 8, 64, 100, 200, 300)) +@pytest.mark.parametrize("m", (1, 8, 64, 100, 200, 300)) +@pytest.mark.parametrize("k", (1, 8, 64, 100, 200, 300)) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_shapes(with_c, n, m, k, use_cuda): + """ + Tests support for different input data shapes + """ + try: + a = sample_matrix("numpy/cupy", "float64", (n, k), use_cuda=use_cuda) + b = sample_matrix("numpy/cupy", "float64", (k, m), use_cuda=use_cuda) + c = sample_matrix("numpy/cupy", "float64", (n, m), use_cuda=use_cuda) + except NotImplementedError as e: + pytest.skip(f"Unable to generate sample matrix: {str(e)}") + + if with_c: + result = matmul(a, b, c, alpha=0.5, beta=0.3) + reference = 0.5 * to_numpy(a) @ to_numpy(b) + 0.3 * to_numpy(c) + else: + result = matmul(a, b, alpha=0.6) + reference = 0.6 * to_numpy(a) @ to_numpy(b) + + assert_tensors_equal(result, reference) + + +@pytest.mark.skipif(not CUBLAS_AVAILABLE, reason="This test requires cuBLAS") +def test_framework_mixing(): + """ + Tests error on inputs from different frameworks. + """ + a = sample_matrix("torch", "float32", (7, 7), True) + b = sample_matrix("cupy", "float32", (7, 7), True) + with pytest.raises(TypeError, match="All tensors in the network must be from the same library"): + matmul(a, b) + + +@pytest.mark.skipif(not NVPL_AVAILABLE, reason="This test requires NVPL") +def test_default_alpha(): + """ + Tests if the value of alpha is correct. + """ + a = sample_matrix("numpy", "float32", (3, 3), False) + b = sample_matrix("numpy", "float32", (3, 3), False) + assert compare_tensors(matmul(a, b, alpha=1.0), matmul(a, b)) + + +@pytest.mark.skipif(not NVPL_AVAILABLE, reason="This test requires NVPL") +@pytest.mark.parametrize("a_layout", ("F", "C")) +@pytest.mark.parametrize("b_layout", ("F", "C")) +@pytest.mark.parametrize("c_layout", ("F", "C")) +def test_layouts(a_layout, b_layout, c_layout): + """ + Tests if matmul works with different layouts. + """ + a = sample_matrix("numpy", "float32", (3, 4), False) + b = sample_matrix("numpy", "float32", (4, 5), False) + c = sample_matrix("numpy", "float32", (3, 5), False) + if a_layout == "F": + a = np.asfortranarray(a) + if b_layout == "F": + b = np.asfortranarray(b) + if c_layout == "F": + c = np.asfortranarray(c) + + assert compare_tensors(matmul(a, b, c, beta=0.2), np.matmul(a, b) + c * 0.2) + + +@pytest.mark.parametrize( + "a_batch,b_batch,c_batch,out_batch", + ( + ((), (), (), ()), + # TODO: Uncomment when batched inputs are supported. + # ((8,), (8,), (8,), (8,)), + # ((3,), (), (), (3,)), # TODO: For generic matmul broadcasting is unsupported + # ((), (4,), (), (4,)), # TODO: For generic matmul broadcasting is unsupported + # ((), (), (5,), (5,)), # TODO: Should we support batched C? 
+ # ((6,), (), (6,), (6,)), # TODO: For generic matmul broadcasting is unsupported + # ((), (7,), (7,), (7,)), # TODO: For generic matmul broadcasting is unsupported + # ((10, 20), (10, 20), (10, 20), (10, 20)), + ), +) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_batching(a_batch, b_batch, c_batch, out_batch, use_cuda): + """ + Tests if matmul works with different batch sizes. + """ + matrix_shape = (7, 7) + + def sample_batch(batch_shape): + return sample_matrix("numpy/cupy", "float32", (*batch_shape, *matrix_shape), use_cuda=use_cuda) + + a = sample_batch(a_batch) + b = sample_batch(b_batch) + c = sample_batch(c_batch) + + result = matmul(a, b, c, beta=1) + assert result.shape == (*out_batch, *matrix_shape) + assert_tensors_equal(result, a @ b + c) + + +@pytest.mark.skip( + reason="TODO: Generic and advanced shape promotion rules are different because advanced supports broadcasting C." +) +@pytest.mark.parametrize("c_desc", (None, "M1", "MN")) +@pytest.mark.parametrize("b_desc", ("K", "KN")) +@pytest.mark.parametrize("a_desc", ("K", "MK")) +@pytest.mark.parametrize("a_t", (True, False)) +@pytest.mark.parametrize("b_t", (True, False)) +@pytest.mark.parametrize("c_t", (True, False)) +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize("M,N,K", ((2, 3, 5),)) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_shape_promotion(a_desc, b_desc, c_desc, a_t, b_t, c_t, M, N, K, framework, use_cuda): + """ + Test shape promotion rules for 1D inputs + """ + + if "M" not in a_desc: + M = 1 + if "N" not in b_desc: + N = 1 + + def unpack_shape(shape_desc): + if shape_desc is None: + return None + shape_map = { + "N": N, + "M": M, + "K": K, + "1": 1, + } + return tuple(shape_map[c] for c in shape_desc) + + a_shape, b_shape, c_shape = unpack_shape(a_desc), unpack_shape(b_desc), unpack_shape(c_desc) + + def make_matrix(shape, transposed): + if transposed: + return sample_matrix(framework, "float32", tuple(reversed(shape)), use_cuda=use_cuda).T + else: + return sample_matrix(framework, "float32", shape, use_cuda=use_cuda) + + a = make_matrix(a_shape, a_t) + b = make_matrix(b_shape, b_t) + if c_desc: + c = make_matrix(c_shape, c_t) + with_c = True + else: + c = None + with_c = False + + a_promoted, b_promoted, c_promoted = a, b, c + + if len(a_shape) == 1: + # If argument a is 1-D, it is promoted to a matrix by prefixing 1 to its dimensions. + a_promoted = a_promoted.reshape(1, a_shape[0]) + + if len(b_shape) == 1: + # If argument b is 1-D, it is promoted to a matrix by appending 1 to its dimensions. + b_promoted = b_promoted.reshape(b_shape[0], 1) + + if with_c and len(c_shape) == 1: + c_promoted = c_promoted.reshape(c_shape[0], 1) + + if with_c and c_promoted.shape[-1] == 1: + # If a vector is provided or N = 1, the columns of c are broadcast for the addition. 
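+        # Replicate the single column N times along the last axis to form the full
+        # (..., M, N) addend.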
+ c_promoted = get_framework(c_promoted).stack([c_promoted[..., 0]] * N, -1) + + alpha = 0.12 + beta = 0.34 if with_c else None + result = matmul(a, b, c=c, alpha=alpha, beta=beta) + reference = matmul(a_promoted, b_promoted, c=c_promoted, alpha=alpha, beta=beta) + + if len(a_shape) == 1: + assert reference.shape[-2] == 1 + reference = reference.reshape((*reference.shape[:-2], reference.shape[-1])) + + if len(b_shape) == 1: + assert reference.shape[-1] == 1 + reference = reference.reshape(reference.shape[:-1]) + + assert_tensors_equal(result, reference) + + +@pytest.mark.skipif(not CUBLAS_AVAILABLE, reason="This test requires cuBLAS") +@pytest.mark.parametrize( + "slices", + ( + ((1, 2), (1, 1), (1, 1)), + ((1, 1), (1, 3), (1, 1)), + ((1, 1), (1, 1), (1, 4)), + ((1, 2), (1, 2), (1, 2)), + ((2, 3), (4, 5), (6, 7)), + ), +) +def test_sliced_unsupported(slices): + """ + Tests if unsupported strided matrices are rejected with appropriate error message. + (Unsupported strides are the ones with no stride equal to 1) + """ + (a_step_x, a_step_y), (b_step_x, b_step_y), (c_step_x, c_step_y) = slices + + a = sample_matrix("numpy/cupy", "float32", (a_step_x * 3, a_step_y * 4), True)[::a_step_x, ::a_step_y] + b = sample_matrix("numpy/cupy", "float32", (b_step_x * 4, b_step_y * 5), True)[::b_step_x, ::b_step_y] + c = sample_matrix("numpy/cupy", "float32", (c_step_x * 3, c_step_y * 5), True)[::c_step_x, ::c_step_y] + + with pytest.raises(ValueError, match="Unsupported layout."): + matmul(a, b, c, beta=0.2) + + +@pytest.mark.parametrize( + "slices", + ( + ((2, 1), (1, 1), (1, 1)), + ((1, 1), (3, 1), (1, 1)), + ((1, 1), (1, 1), (4, 1)), + ((2, 1), (2, 1), (2, 1)), + ), +) +@pytest.mark.parametrize("framework", ("torch", "numpy/cupy")) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_sliced(slices, framework, use_cuda): + """ + Tests if strided tensors work correctly + """ + (a_step_x, a_step_y), (b_step_x, b_step_y), (c_step_x, c_step_y) = slices + + a = sample_matrix(framework, "float32", (a_step_x * 3, a_step_y * 4), use_cuda)[::a_step_x, ::a_step_y] + b = sample_matrix(framework, "float32", (b_step_x * 4, b_step_y * 5), use_cuda)[::b_step_x, ::b_step_y] + c = sample_matrix(framework, "float32", (c_step_x * 3, c_step_y * 5), use_cuda)[::c_step_x, ::c_step_y] + + assert_tensors_equal(matmul(a, b, c, beta=0.2), a @ b + 0.2 * c) + + +@pytest.mark.skip(reason="Batched inputs are not supported for generic matmul yet.") +@pytest.mark.parametrize( + "slices", + ( + ((2, 1), (1, 1), (1, 1)), + ((1, 1), (3, 1), (1, 1)), + ((1, 1), (1, 1), (4, 1)), + ((2, 1), (2, 1), (2, 1)), + ), +) +@pytest.mark.parametrize("framework", ("torch", "numpy/cupy")) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_sliced_batched(slices, framework, use_cuda): + """ + Tests if strided tensors work correctly + """ + (a_step_x, a_step_y), (b_step_x, b_step_y), (c_step_x, c_step_y) = slices + batch = 8 + + a = sample_matrix(framework, "float32", (batch, a_step_x * 3, a_step_y * 4), use_cuda)[1::2, ::a_step_x, ::a_step_y] + b = sample_matrix(framework, "float32", (batch, b_step_x * 4, b_step_y * 5), use_cuda)[1::2, ::b_step_x, ::b_step_y] + c = sample_matrix(framework, "float32", (batch, c_step_x * 3, c_step_y * 5), use_cuda)[1::2, ::c_step_x, ::c_step_y] + + assert_tensors_equal(matmul(a, b, c, beta=0.2), a @ b + 0.2 * c) + + +@pytest.mark.skipif(not CUBLAS_AVAILABLE, reason="This test requires cuBLAS") +def test_sliced_m1_n1(): + """ + Tests M=1 and N=1, strides are not 1 + """ + a_m1 = 
sample_matrix("cupy", "float32", (10, 20), True)[2:3, ::2] # A is [1, 10] with strides [20, 2]. + b_n1 = sample_matrix("cupy", "float32", (15, 20), True).T[::2, 2:3] # B is [10, 1] with strides [2, 20] + + result = matmul(a_m1, b_n1) + assert_tensors_equal(result, a_m1 @ b_n1) + + +@pytest.mark.parametrize("a_conj", (True, False)) +@pytest.mark.parametrize("b_conj", (True, False)) +@pytest.mark.parametrize("framework", ("torch", "numpy/cupy")) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_conjugate_qualifiers(a_conj, b_conj, framework, use_cuda): + """ + Test if is_conjugate qualifiers work correctly + """ + a = random_torch_complex((8, 7), use_cuda, a_conj) + b = random_torch_complex((7, 11), use_cuda, b_conj) + c = random_torch_complex((8, 11), use_cuda) + + qualifiers = np.zeros((3,), dtype=matrix_qualifiers_dtype) + qualifiers[:] = GeneralMatrixQualifier.create() + qualifiers[0]["conjugate"] = a_conj + qualifiers[1]["conjugate"] = b_conj + + r = matmul(a, b, c=c, beta=1.0, qualifiers=qualifiers) + + if a_conj: + a = a.conj() + if b_conj: + b = b.conj() + assert_tensors_equal(r, a @ b + c) + + +@pytest.mark.parametrize("a_conj", (True, False)) +@pytest.mark.parametrize("b_conj", (True, False)) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_conjugate_torch_auto(a_conj, b_conj, use_cuda): + """ + Test if conjugate flag of torch tensors is interpreted correctly + """ + + a = random_torch_complex((8, 7), use_cuda, a_conj) + b = random_torch_complex((7, 11), use_cuda, b_conj) + c = random_torch_complex((8, 11), use_cuda) + + if a_conj: + a = a.conj() + if b_conj: + b = b.conj() + + r = matmul(a, b, c=c, beta=1.0) + assert_tensors_equal(r, a @ b + c) + + +@pytest.mark.parametrize( + "a_cuda,b_cuda,c_cuda", + ( + (True, True, False), + (True, False, True), + (False, True, True), + (False, False, True), + (False, True, False), + (True, False, False), + ), +) +def test_device_mismatch(a_cuda, b_cuda, c_cuda): + """ + Tests if a proper error is reported when the devices differ. + """ + assert not (a_cuda == b_cuda == c_cuda) + a = sample_matrix("torch", "float32", (2, 2), a_cuda) + b = sample_matrix("torch", "float32", (2, 2), b_cuda) + c = sample_matrix("torch", "float32", (2, 2), c_cuda) + with pytest.raises(ValueError, match=r"not on the same device"): + matmul(a, b, c, beta=0.42) + + +@pytest.mark.parametrize("framework", ("torch", "numpy/cupy")) +@pytest.mark.parametrize( + "a_dtype,b_dtype,c_dtype", + ( + ("float64", "float32", "float64"), + ("complex64", "complex128", "complex128"), + ), +) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_dtype_mismatch(framework, a_dtype, b_dtype, c_dtype, use_cuda): + """ + Tests if a proper error is reported when the data types differ. 
+    """
+    assert not (a_dtype == b_dtype == c_dtype)
+    try:
+        a = sample_matrix(framework, a_dtype, (2, 2), use_cuda=use_cuda)
+        b = sample_matrix(framework, b_dtype, (2, 2), use_cuda=use_cuda)
+        c = sample_matrix(framework, c_dtype, (2, 2), use_cuda=use_cuda)
+    except NotImplementedError:
+        pytest.skip("Unable to generate matrix of this dtype")
+    with pytest.raises(ValueError, match=r"Unsupported combination of dtypes"):
+        matmul(a, b, c, beta=1)
+
+
+def test_shape_mismatch_ab():
+    """
+    Tests if a proper error is reported when the shapes of A and B do not match
+    """
+    a = np.zeros((3, 2))
+    b = c = np.ones((2, 2))
+    # TODO: Align error messages across advanced/generic
+    with pytest.raises(ValueError, match=r"dimension|extent"):
+        matmul(a, b, c, beta=1)
+
+
+def test_shape_mismatch_abc():
+    """
+    Tests if a proper error is reported when the shapes of AB and C do not match
+    """
+    a = b = np.zeros((3, 3))
+    c = np.zeros((4, 4))
+    # TODO: Align error messages across advanced/generic
+    with pytest.raises(ValueError, match=r"dimension|extent"):
+        matmul(a, b, c, beta=1)
+
+
+def test_missing_beta():
+    """
+    Tests if a proper error is reported when C is provided, but beta is not.
+    """
+    a = b = c = np.ones((3, 3))
+    with pytest.raises(ValueError, match=r"A value for beta must be provided if operand C is provided"):
+        matmul(a, b, c)
+
+
+def test_unsupported_type():
+    """
+    Tests if a proper error is reported for an unsupported data type.
+    """
+    a = b = c = np.zeros((2, 2), dtype=np.int64)
+    with pytest.raises(ValueError, match=r"^The dtype of operand.*not supported"):
+        matmul(a, b, c, beta=1)
+
+
+@pytest.mark.skip(reason="The generic matmul API does not have a quantization argument.")
+def test_unsupported_float8():
+    """
+    Tests if a proper error is reported when FP8 is not supported.
+    """
+    try:
+        import torch
+    except ImportError:
+        pytest.skip("Torch is required for FP8 support test.")
+
+    if not hasattr(torch, "float8_e4m3fn"):
+        # Old torch versions don't support float8_e4m3fn at all.
+        pytest.skip("torch.float8_e4m3fn is required for FP8 support test.")
+
+    a = torch.zeros((16, 16)).type(torch.float8_e4m3fn).cuda()
+    b = torch.zeros((16, 16)).type(torch.float8_e4m3fn).cuda()
+
+    if cublaslt.get_version() < 120800:
+        with pytest.raises(ValueError, match=r"FP8 is not supported.*cuBLASLt version 12\.8 or higher is required"):
+            matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": 1})
+    elif (torch.cuda.get_device_properties(0).major, torch.cuda.get_device_properties(0).minor) < (8, 9):
+        with pytest.raises(cublaslt.cuBLASLtError):
+            matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": 1})
+
+
+@pytest.mark.skip(reason="FIXME: Error messages for batch dimensions should be aligned.")
+@pytest.mark.skipif(not CUBLAS_AVAILABLE, reason="This test requires cuBLAS")
+@pytest.mark.parametrize(
+    "test_case,expected_error",
+    [
+        ("not_tileable_a", "batch layout for A .* is not tileable"),
+        ("not_tileable_b", "batch layout for B .* is not tileable"),
+        ("batch_shape_mismatch", "batch dimensions of operands A .* and B .* must match"),
+        ("batch_order_mismatch", "batch order of operands A .* and B .* must match"),
+        ("c_m_dimension_mismatch", "The M dimension of the C matrix .* must match the M dimension of A"),
+        ("c_n_dimension_mismatch", "The N dimension of the C matrix .* must match the N dimension of B"),
+        ("c_batch_shape_mismatch", "The batch dimension of operand C .* must match with that of the other operands"),
+        ("c_batch_order_mismatch", "The batch axis order of operand C .* must match with that of the other"),
+        (
+            "c_not_tileable",
+            "The batch layout for C corresponding to shape .* is currently not supported because it is not tileable",
+        ),
+    ],
+)
+def test_batch_matrix_negative(test_case, expected_error):
+    if cp is None:
+        pytest.skip("Cupy is required for this test.")
+    M, K, N = 3, 4, 5
+
+    matrices = {
+        "not_tileable_a": (
+            sample_matrix("cupy", "float32", (2, 3, M, K), True)[:, :2, :, :],
+            sample_matrix("cupy", "float32", (2, 2, K, N), True),
+            None,
+        ),
+        "not_tileable_b": (
+            sample_matrix("cupy", "float32", (2, 2, M, K), True),
+            sample_matrix("cupy", "float32", (2, 3, K, N), True)[:, :2, :, :],
+            None,
+        ),
+        "batch_shape_mismatch": (
+            sample_matrix("cupy", "float32", (2, M, K), True),
+            sample_matrix("cupy", "float32", (3, K, N), True),
+            None,
+        ),
+        "batch_order_mismatch": (
+            sample_matrix("cupy", "float32", (2, 3, M, K), True),
+            cp.transpose(sample_matrix("cupy", "float32", (3, 2, K, N), True), (1, 0, 2, 3)),
+            None,
+        ),
+        "c_m_dimension_mismatch": (
+            sample_matrix("cupy", "float32", (M, K), True),
+            sample_matrix("cupy", "float32", (K, N), True),
+            sample_matrix("cupy", "float32", (M + 1, N), True),
+        ),
+        "c_n_dimension_mismatch": (
+            sample_matrix("cupy", "float32", (M, K), True),
+            sample_matrix("cupy", "float32", (K, N), True),
+            sample_matrix("cupy", "float32", (M, N + 1), True),
+        ),
+        "c_batch_shape_mismatch": (
+            sample_matrix("cupy", "float32", (2, 2, M, K), True),
+            sample_matrix("cupy", "float32", (2, 2, K, N), True),
+            sample_matrix("cupy", "float32", (3, 2, M, N), True),
+        ),
+        "c_batch_order_mismatch": (
+            sample_matrix("cupy", "float32", (2, 3, M, K), True),
+            sample_matrix("cupy", "float32", (2, 3, K, N), True),
+            cp.transpose(sample_matrix("cupy", "float32", (3, 2, M, N), True), (1, 0, 2, 3)),
+        ),
+        "c_not_tileable": (
+            sample_matrix("cupy", "float32", (2, 2, M, K), True),
+            sample_matrix("cupy", "float32", (2, 2, K, N), True),
+            sample_matrix("cupy", "float32", (2, 3, M, N), True)[:, :2, :, :],
+        ),
+    }
+
+    a, b, c 
= matrices[test_case] + + with pytest.raises(ValueError, match=expected_error): + if c is None: + matmul(a, b) + else: + matmul(a, b, c, beta=1.0) diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_matmul_with_hypothesis.py b/tests/nvmath_tests/linalg/generic/matmul/test_matmul_with_hypothesis.py new file mode 100644 index 0000000..4857bf2 --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_matmul_with_hypothesis.py @@ -0,0 +1,517 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import collections +import logging +import typing + +from hypothesis import given, assume, reproduce_failure # noqa: F401 +from hypothesis.extra.numpy import arrays, from_dtype +from hypothesis.strategies import ( + booleans, + composite, + integers, + none, + one_of, + sampled_from, + tuples, + lists, +) +import numpy as np + +from nvmath._internal.templates import ExecutionCPU, ExecutionCUDA +from nvmath.internal.tensor_wrapper import maybe_register_package +from nvmath.memory import _MEMORY_MANAGER +from nvmath.bindings import cublas +from nvmath.linalg._internal.typemaps import ( + NAMES_TO_DEFAULT_COMPUTE_TYPE, + CUBLAS_COMPUTE_TYPE_TO_NAME, +) +from nvmath.linalg.generic import ( + DiagonalMatrixQualifier, + GeneralMatrixQualifier, + HermitianMatrixQualifier, + matmul, + MatmulOptions, + MatrixQualifier, + matrix_qualifiers_dtype, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, +) + +from nvmath_tests.helpers import nvmath_seed +from ...utils import get_absolute_tolerance +from . import CUBLAS_AVAILABLE, NVPL_AVAILABLE + + +AVAILABLE_TENSOR_LIBRARIES: list[str] = ["numpy"] + + +try: + import cupy as cp + + maybe_register_package("cupy") + + AVAILABLE_TENSOR_LIBRARIES.append("cupy") +except ModuleNotFoundError: + pass + +try: + import torch + + maybe_register_package("torch") + + AVAILABLE_TENSOR_LIBRARIES.append("torch-cpu") + AVAILABLE_TENSOR_LIBRARIES.append("torch-gpu") +except ImportError: + pass + + +def compare_result(*, res, ref): + np.testing.assert_allclose( + actual=res, + desired=ref, + equal_nan=True, + rtol=(1e-02 if res.dtype == np.float16 else 2e-05), + atol=2 * get_absolute_tolerance(ref), + ) # type: ignore + + +def verify_result(a, b, c, result_c, alpha, beta, qualifiers: typing.Sequence[MatrixQualifier]): + if (len(qualifiers) > 0 and TriangularMatrixQualifier.is_valid(qualifiers[0])) or ( + len(qualifiers) > 1 and TriangularMatrixQualifier.is_valid(qualifiers[1]) + ): + # C is not added to A@B for trmm, so we just set beta to 0.0 + beta = 0.0 + + if (len(qualifiers) > 0 and DiagonalMatrixQualifier.is_valid(qualifiers[0])) or ( + len(qualifiers) > 1 and DiagonalMatrixQualifier.is_valid(qualifiers[1]) + ): + # alpha/beta are not used by dgmm, so we just set alpha to None and beta to 0.0 + alpha = None + beta = 0.0 + + if len(qualifiers) > 0 and DiagonalMatrixQualifier.is_valid(qualifiers[0]): + a = np.diagflat(a[:: qualifiers[0]["incx"]]) + + if len(qualifiers) > 1 and DiagonalMatrixQualifier.is_valid(qualifiers[1]): + b = np.diagflat(b[:: qualifiers[1]["incx"]]) + + if len(qualifiers) > 2 and DiagonalMatrixQualifier.is_valid(qualifiers[2]): + c = np.diagflat(c[:: qualifiers[2]["incx"]]) + + if len(qualifiers) > 0 and qualifiers[0]["conjugate"]: + a = np.conjugate(a) + + if len(qualifiers) > 1 and qualifiers[1]["conjugate"]: + b = np.conjugate(b) + + if len(qualifiers) > 2 and qualifiers[2]["conjugate"]: + c = np.conjugate(c) + + logging.debug("Reference matrix A is \n%s", a) + 
logging.debug("Reference matrix B is \n%s", b) + logging.debug("Reference matrix C is \n%s", c) + + possible_dtype = CUBLAS_COMPUTE_TYPE_TO_NAME[NAMES_TO_DEFAULT_COMPUTE_TYPE[(str(a.dtype), str(b.dtype))]] + compute_dtype = possible_dtype[1] if np.iscomplexobj(a) else possible_dtype[0] + ab = ( + np.matmul(a, b, dtype=compute_dtype) + if alpha is None + else np.matmul(np.multiply(alpha, a, dtype=compute_dtype), b, dtype=compute_dtype) + ) + ref_c = ab if c is None else np.add(ab, np.multiply(c, beta, dtype=compute_dtype), dtype=compute_dtype) + + result_c_ = result_c[0] if isinstance(result_c, tuple) else result_c + logging.debug("Reference result is \n%s", ref_c) + logging.debug("Actual result is \n%s", result_c_) + compare_result(res=result_c_, ref=ref_c.astype(a.dtype)) + + +problem_size_mnk = integers(min_value=1, max_value=256) + +options_blocking_values = [True, "auto"] +options_allocator_values = [ + None, + _MEMORY_MANAGER["_raw"](0, logging.getLogger()), + _MEMORY_MANAGER["cupy"](0, logging.getLogger()), + _MEMORY_MANAGER["torch"](0, logging.getLogger()) if "torch" in _MEMORY_MANAGER else None, +] + +# FIXME: Add integer types to tests +ab_type_values = [ + np.float32, + np.float64, + np.complex64, + np.complex128, +] + +MatmulInputs = collections.namedtuple( + "MatmulInputs", + [ + "a", + "b", + "c", + "m", + "n", + "k", + "ab_type", + "beta", + "alpha", + "qualifiers", + "batches", + ], +) + + +def notNone(x): + return x is not None + + +@composite +def matrix_qualifiers(draw): + qual = draw( + sampled_from( + [ + GeneralMatrixQualifier, + HermitianMatrixQualifier, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, + DiagonalMatrixQualifier, + ] + ) + ) + kwargs = {"conjugate": draw(booleans())} + if qual not in (GeneralMatrixQualifier, DiagonalMatrixQualifier): + kwargs["uplo"] = draw(sampled_from([cublas.FillMode.LOWER, cublas.FillMode.UPPER])) + if qual is TriangularMatrixQualifier: + kwargs["diag"] = draw(sampled_from(cublas.DiagType)) + if qual is DiagonalMatrixQualifier: + kwargs["incx"] = draw(sampled_from([+1, -1])) + return qual.create(**kwargs) + + +def enforce_matrix_qualifiers(A: np.ndarray, qualifier: MatrixQualifier | None) -> np.ndarray: + """For random valued A, force the values of A to be an example of the qualifier.""" + if GeneralMatrixQualifier.is_valid(qualifier): + pass + elif HermitianMatrixQualifier.is_valid(qualifier): + assert A.shape[-2] == A.shape[-1], "Hermitian matrices are square." + A = 0.5 * (A + np.conj(np.swapaxes(A, -2, -1))) + assert np.array_equal(A, np.conj(np.swapaxes(A, -2, -1))), "A is not Hermitian." + elif SymmetricMatrixQualifier.is_valid(qualifier): + assert A.shape[-2] == A.shape[-1], "Symmetric matrices are square." + A = np.tril(A) + np.triu(np.swapaxes(A, -2, -1), 1) + assert np.array_equal(A, np.swapaxes(A, -2, -1)), "A is not symmetric." + elif TriangularMatrixQualifier.is_valid(qualifier): + assert A.shape[-2] == A.shape[-1], "Triangular matrices are square." 
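+        # For DiagType.UNIT the diagonal entries of the operand are not referenced by the
+        # routine (destroy_unreferenced_matrix below overwrites them with NaN), so the
+        # reference array is forced to exact 1.0 on the diagonal to match that convention.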
+ if qualifier["diag"] == cublas.DiagType.UNIT: + # A = A - A * np.identity(A.shape[-1]) + np.identity(A.shape[-1]) + A[..., np.identity(A.shape[-1], dtype=bool)] = 1.0 + assert np.all(np.diagonal(A, offset=0, axis1=-2, axis2=-1) == 1.0), np.diagonal(A, offset=0, axis1=-2, axis2=-1) + match qualifier["uplo"]: + case cublas.FillMode.UPPER: + A = np.triu(A) + assert np.all(A[np.tril(np.ones_like(A, dtype=np.bool_), k=-1)] == 0.0) + case cublas.FillMode.LOWER: + A = np.tril(A) + assert np.all(A[np.triu(np.ones_like(A, dtype=np.bool_), k=+1)] == 0.0) + case _: + raise ValueError(f"{qualifier['uplo']} is not UPPER or LOWER.") + elif DiagonalMatrixQualifier.is_valid(qualifier): + assert len(A.shape) == 1, "Diagonal matrix should be vector." + else: + raise ValueError(f"{qualifier} describes an unknown matrix type.") + return A + + +def destroy_unreferenced_matrix(A: np.ndarray, qualifier: MatrixQualifier | None) -> np.ndarray: + """Destroy information in the unreferenced portion of the matrix.""" + nan_array = np.empty((1,), A.dtype) + nan_array[0] = (np.nan + 1j * np.nan) if np.iscomplexobj(A) else np.nan + nan_value = nan_array[0] + if GeneralMatrixQualifier.is_valid(qualifier) or DiagonalMatrixQualifier.is_valid(qualifier) or A.size <= 1: + return A + if ( + HermitianMatrixQualifier.is_valid(qualifier) + or TriangularMatrixQualifier.is_valid(qualifier) + or SymmetricMatrixQualifier.is_valid(qualifier) + ): + match qualifier["uplo"]: + case cublas.FillMode.LOWER: + A = np.where(np.tril(np.ones_like(A, dtype=np.bool_)), A, nan_value) + case cublas.FillMode.UPPER: + A = np.where(np.triu(np.ones_like(A, dtype=np.bool_)), A, nan_value) + case _: + raise ValueError(f"{qualifier['uplo']} is not UPPER or LOWER.") + if TriangularMatrixQualifier.is_valid(qualifier): # noqa: SIM102 + if qualifier["diag"] == cublas.DiagType.UNIT: + A = np.where(np.identity(A.shape[-1], dtype=np.bool_), nan_value, A) + np.testing.assert_equal(actual=np.diagonal(A, offset=0, axis1=-2, axis2=-1), desired=nan_value) + return A + + +@composite +def batch_strategy(draw): + """Generate three tuples of ints which represent valid batching dimensions for a,b,c.""" + batch_shape: tuple[int] = tuple(draw(lists(integers(min_value=1, max_value=4), min_size=1, max_size=4))) + # () is first because booleans() shrinks to True + a_batch = () if draw(booleans()) else batch_shape + b_batch = () if draw(booleans()) else batch_shape + # Use truthy check here. 
c_batch must be the larger of a,b batch + c_batch = batch_shape if a_batch or b_batch else () + return a_batch, b_batch, c_batch + + +@composite +def matrix_multiply_arrays(draw): + k = draw(problem_size_mnk) + # Let k be random and then let m,n depend on whether A,B are square matrices + qualifiers = np.empty(3, dtype=matrix_qualifiers_dtype) + qualifiers[0] = draw(matrix_qualifiers()) + qualifiers[1] = draw(matrix_qualifiers()) + if GeneralMatrixQualifier.is_valid(qualifiers[0]): + m = draw(one_of(none(), problem_size_mnk)) + else: + m = k + if GeneralMatrixQualifier.is_valid(qualifiers[1]): + n = draw(one_of(none(), problem_size_mnk)) + else: + n = k + ab_type = draw(sampled_from(ab_type_values)) + if HermitianMatrixQualifier.is_valid(qualifiers[0]) or HermitianMatrixQualifier.is_valid(qualifiers[1]): + assume(np.iscomplexobj(ab_type())) + + a_shape = (k,) if (m is None or DiagonalMatrixQualifier.is_valid(qualifiers[0])) else (m, k) + b_shape = (k,) if (n is None or DiagonalMatrixQualifier.is_valid(qualifiers[1])) else (k, n) + c_shape = (m, n) + if (m is None and not DiagonalMatrixQualifier.is_valid(qualifiers[0])) and ( + n is None and not DiagonalMatrixQualifier.is_valid(qualifiers[1]) + ): + c_shape = () + elif m is None and not DiagonalMatrixQualifier.is_valid(qualifiers[0]): + c_shape = (n,) + elif n is None and not DiagonalMatrixQualifier.is_valid(qualifiers[1]): + c_shape = (m,) + + # TODO: Uncomment when batched inputs are supported + # if len(a_shape) == 2 and len(b_shape) == 2 and len(c_shape) == 2: + # a_batch, b_batch, c_batch = draw(batch_strategy()) + # a_shape = a_batch + a_shape + # b_shape = b_batch + b_shape + # c_shape = c_batch + c_shape + # else: + a_batch, b_batch, c_batch = (), (), () + + # Generate data in range [0, 5] to match sample_matrix() from utils + # Only non-negative reals to avoid catastrophic cancellation + element_properties: dict[str, typing.Any] = { + "allow_infinity": False, + "allow_nan": False, + "allow_subnormal": False, + "max_magnitude": np.sqrt(50), + "min_magnitude": 0, + "max_value": 5, + "min_value": 0, + } + # NOTE: It is unfeasible for hypothesis to explore a parameter space where + # all elements of the input arrays are unique, so most of the time, arrays + # contain just a few unique values + a = draw( + arrays( + dtype=ab_type, + shape=a_shape, + elements=element_properties, + ) + ) + b = draw( + arrays( + dtype=ab_type, + shape=b_shape, + elements=element_properties, + ) + ) + + # Type promotion can happen unintentionally when enforcing matrix structure. + a = enforce_matrix_qualifiers(a, qualifier=qualifiers[0]).astype(ab_type) + b = enforce_matrix_qualifiers(b, qualifier=qualifiers[1]).astype(ab_type) + + # The generic API does not support broadcasting of C, so the shape of must match the + # output of the matmul exactly. 
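+    # C is optional below: when None is drawn, the qualifiers array is trimmed to two
+    # entries and beta stays None; otherwise C is treated as a general matrix with the
+    # exact c_shape computed above.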
+ c = draw( + one_of( + none(), + arrays(dtype=ab_type, shape=c_shape, elements=element_properties), + ) + ) + if c is None: + qualifiers = qualifiers[:2] + else: + qualifiers[2] = GeneralMatrixQualifier.create() + + beta = None if c is None else draw(from_dtype(dtype=np.dtype(ab_type), **element_properties)) + alpha = draw(one_of(none(), from_dtype(dtype=np.dtype(ab_type), **element_properties))) + + assume(np.all(np.isfinite(a))) + assume(np.all(np.isfinite(b))) + assume(c is None or np.all(np.isfinite(c))) + assert c is None or c.shape in [c_batch + (m, n), (m, n), (m,), (n,), ()] + assert a.shape in [a_batch + (m, k), (m, k), (k,)] + assert b.shape in [b_batch + (k, n), (k, n), (k,)] + return MatmulInputs( + a=a, + b=b, + c=c, + m=m, + n=n, + k=k, + ab_type=ab_type, + beta=beta, + alpha=alpha, + qualifiers=qualifiers, + batches=(a_batch, b_batch, c_batch), + ) + + +@composite +def options_strategy(draw): + return MatmulOptions( + blocking=draw(sampled_from(options_blocking_values)), + allocator=draw(sampled_from(options_allocator_values)), + inplace=draw(booleans()), + ) + + +@nvmath_seed() +@given( + input_arrays=matrix_multiply_arrays(), + order=tuples( + sampled_from(["F", "C"]), + sampled_from(["F", "C"]), + sampled_from(["F", "C"]), + ), + options=one_of( + none(), + options_strategy(), + ), + execution=sampled_from( + [ + # None, # Cannot test None because not all test envs have CPU deps + *((ExecutionCUDA(),) if CUBLAS_AVAILABLE else ()), + *((ExecutionCPU(),) if NVPL_AVAILABLE else ()), + ] + ), + preferences=one_of( + none(), + ), + tensor_library=sampled_from(AVAILABLE_TENSOR_LIBRARIES), +) +def test_matmul(input_arrays, order, options, execution, preferences, tensor_library): + """Call nvmath.linalg.generic.matmul() with valid inputs.""" + a, b, c, m, n, k, ab_type, beta, alpha, qualifiers, batches = input_arrays + + if c is None and options is not None and options.inplace: + # Cannot have inplace operation when c is None + return + + ax = destroy_unreferenced_matrix(a, qualifiers[0]) + bx = destroy_unreferenced_matrix(b, qualifiers[1]) + + ax = np.array(ax, order=order[0]) + bx = np.array(bx, order=order[1]) + c = None if c is None else np.array(c, order=order[2]) + + match tensor_library: + case "cupy": + d_a = cp.asarray(ax) + d_b = cp.asarray(bx) + d_c = None if c is None else cp.asarray(c) + case "torch-cpu": + d_a = torch.tensor(ax, device="cpu") + d_b = torch.tensor(bx, device="cpu") + d_c = None if c is None else torch.tensor(c, device="cpu") + case "torch-gpu": + d_a = torch.tensor(ax, device="cuda") + d_b = torch.tensor(bx, device="cuda") + d_c = None if c is None else torch.tensor(c, device="cuda") + case _: + d_a = np.copy(ax) + d_b = np.copy(bx) + d_c = None if c is None else np.copy(c) + + try: + result_c = matmul( + d_a, + d_b, + c=d_c, + alpha=alpha, + beta=beta, + execution=execution, + options=options, + qualifiers=qualifiers, + ) + except ValueError as error: + message = str(error) + if "No available generic matrix multiplication matches the provided matrices" in message and ( + not GeneralMatrixQualifier.is_valid(qualifiers[0]) and not GeneralMatrixQualifier.is_valid(qualifiers[1]) + ): + logging.warning("Hypothesis ignored the following error: %s", message) + return + if ( + "was not convertible to a valid" in message + or "Operations on the non-triangular operand" in message + or "Operations on the non-hermitian/non-symmetric operands" in message + or "Transpose on operand A is not supported" in message + ): + logging.warning("Hypothesis ignored the 
following error: %s", message) + return + if "Operations on the non-diagonal operands A,C are not supported" in message and ( + order[2] == "C" + or ( + (DiagonalMatrixQualifier.is_valid(qualifiers[0]) and order[1] == "C") + or (DiagonalMatrixQualifier.is_valid(qualifiers[1]) and order[0] == "C") + ) + ): + return + if "Conjugate-Transpose on operand X is not supported" in message and ( + (DiagonalMatrixQualifier.is_valid(qualifiers[0]) and qualifiers[0]["conjugate"]) + or (DiagonalMatrixQualifier.is_valid(qualifiers[1]) and qualifiers[1]["conjugate"]) + ): + return + if "is not valid for batching" in message and (order[0] == "F" or order[1] == "F" or order[2] == "F"): + return + if "is unsupported" in message: + logging.warning("Hypothesis ignored the following error: %s", message) + return + if "dgmm() is an unknown NVPL BLAS function" in message and ( + DiagonalMatrixQualifier.is_valid(qualifiers[0]) or DiagonalMatrixQualifier.is_valid(qualifiers[1]) + ): + return + if "Unsupported layout" in message and ( + ax.dtype.itemsize not in ax.strides[-2:] + or bx.dtype.itemsize not in bx.strides[-2:] + or (c is not None and c.dtype.itemsize not in c.strides[-2:]) + ): + # The provided matrices are probably batched, + # and the layout is incompatible with batching + return + raise error + except NotImplementedError as error: + message = str(error) + logging.warning("Hypothesis ignored the following error: %s", message) + return + + assert result_c.dtype is d_a.dtype, f"Result ({result_c}) and input ({a.dtype}) types should match!" + if options is not None and options.inplace: + assert result_c is d_c, "For inplace operations, the result should be the same object as operand c." + + match tensor_library: + case "cupy": + result_c = cp.asnumpy(result_c) + case "torch-cpu" | "torch-gpu": + result_c = result_c.cpu().detach().numpy() + case _: + pass + + verify_result(a, b, c, result_c, alpha, beta, qualifiers) diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_options.py b/tests/nvmath_tests/linalg/generic/matmul/test_options.py new file mode 100644 index 0000000..b0a9a4a --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_options.py @@ -0,0 +1,333 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from nvmath.bindings import cublas +from nvmath.linalg.generic import matmul, Matmul, MatmulOptions +from nvmath.internal.tensor_wrapper import maybe_register_package +import logging +import nvmath +import pytest + +from ...utils import assert_tensors_equal, sample_matrix, is_torch_available + +from . 
import NVPL_AVAILABLE + +try: + import cupy_backends.cuda + + maybe_register_package("cupy") + HAS_CUPY = True +except ModuleNotFoundError: + HAS_CUPY = False + + +if is_torch_available(): + maybe_register_package("torch") + +""" +This set of tests checks Matmul's options +""" + + +def check_matmul_with_options(size, options, use_cuda=(not NVPL_AVAILABLE), dtype="float32", atol=None, rtol=None): + a = b = sample_matrix("numpy/cupy" if dtype != "bfloat16" else "torch", dtype, (size, size), use_cuda) + is_complex = "complex" in dtype + alpha = 0.42 + 0.24j if is_complex else 0.42 + result = matmul(a, b, alpha=alpha, options=options) + assert_tensors_equal(result, alpha * (a @ b), atol=atol, rtol=rtol) + return result + + +ct = cublas.ComputeType +st = nvmath.CudaDataType + + +@pytest.mark.skip(reason="Generic matmul APIs do not support compute preferences.") +@pytest.mark.parametrize( + "dtype,compute_type,scale_type", + ( + # None specified + ("bfloat16", None, None), + ("float16", None, None), + ("float32", None, None), + ("float64", None, None), + ("complex64", None, None), + ("complex128", None, None), + # Only compute type specified + ("float16", ct.COMPUTE_16F, None), + ("float16", ct.COMPUTE_16F_PEDANTIC, None), + ("float16", ct.COMPUTE_32F, None), + ("float32", ct.COMPUTE_32F, None), + ("bfloat16", ct.COMPUTE_32F_PEDANTIC, None), + ("complex64", ct.COMPUTE_32F, None), + ("float16", ct.COMPUTE_32F_PEDANTIC, None), + ("float32", ct.COMPUTE_32F_PEDANTIC, None), + ("bfloat16", ct.COMPUTE_32F_PEDANTIC, None), + ("complex64", ct.COMPUTE_32F_PEDANTIC, None), + ("float32", ct.COMPUTE_32F_FAST_16F, None), + ("float32", ct.COMPUTE_32F_FAST_16BF, None), + ("float32", ct.COMPUTE_32F_FAST_TF32, None), + ("float64", ct.COMPUTE_64F, None), + ("float64", ct.COMPUTE_64F_PEDANTIC, None), + ("complex128", ct.COMPUTE_64F, None), + ("complex128", ct.COMPUTE_64F_PEDANTIC, None), + # Only scale type specified + ("float16", None, st.CUDA_R_16F), + ("float16", None, st.CUDA_R_32F), + ("bfloat16", None, st.CUDA_R_32F), + ("float32", None, st.CUDA_R_32F), + ("complex64", None, st.CUDA_C_32F), + ("float32", None, st.CUDA_R_32F), + ("float64", None, st.CUDA_R_64F), + ("complex128", None, st.CUDA_C_64F), + # Both compute and scale type specified + ("float16", ct.COMPUTE_16F, st.CUDA_R_16F), + ("float16", ct.COMPUTE_16F_PEDANTIC, st.CUDA_R_16F), + ("float16", ct.COMPUTE_32F, st.CUDA_R_32F), + ("bfloat16", ct.COMPUTE_32F, st.CUDA_R_32F), + ("float32", ct.COMPUTE_32F, st.CUDA_R_32F), + ("complex64", ct.COMPUTE_32F, st.CUDA_C_32F), + ("float16", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F), + ("bfloat16", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F), + ("float32", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F), + ("complex64", ct.COMPUTE_32F_PEDANTIC, st.CUDA_C_32F), + ("float32", ct.COMPUTE_32F_FAST_16F, st.CUDA_R_32F), + ("float32", ct.COMPUTE_32F_FAST_16BF, st.CUDA_R_32F), + ("float32", ct.COMPUTE_32F_FAST_TF32, st.CUDA_R_32F), + ("float64", ct.COMPUTE_64F, st.CUDA_R_64F), + ("float64", ct.COMPUTE_64F_PEDANTIC, st.CUDA_R_64F), + ("complex128", ct.COMPUTE_64F, st.CUDA_C_64F), + ("complex128", ct.COMPUTE_64F_PEDANTIC, st.CUDA_C_64F), + ), +) +def test_compute_scale_type(dtype, compute_type, scale_type): + check_matmul_with_options( + 2, + MatmulOptions(compute_type=compute_type, scale_type=scale_type), + dtype=dtype, + use_cuda=True, + atol=0.1, + rtol=None, + ) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support compute preferences.") +@pytest.mark.parametrize( + "dtype,compute_type,scale_type", + ( + ("float16", 
ct.COMPUTE_32F, st.CUDA_R_16F), + ("float32", ct.COMPUTE_16F, st.CUDA_R_32F), + ("float64", ct.COMPUTE_64F, st.CUDA_R_32F), + ("complex64", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F), + ("float64", ct.COMPUTE_32F_FAST_16F, st.CUDA_R_32F), + ("float16", ct.COMPUTE_32F_FAST_16BF, st.CUDA_R_32F), + ), +) +def test_unsupported_compute_scale_type(dtype, compute_type, scale_type): + with pytest.raises(Exception, match="not supported|INVALID_VALUE|NOT_SUPPORTED"): + check_matmul_with_options( + 2, + MatmulOptions(compute_type=compute_type, scale_type=scale_type), + dtype=dtype, + use_cuda=True, + ) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support memory_limit options.") +@pytest.mark.parametrize( + "memory_limit, expected_result", + ( + (8, 8), + (0.5, 500), + (1.0, 1000), + (1, 1), + ("0.01", ValueError), + ("8 b", 8), + ("100%", 1000), + ("1gib", 1024**3), + ("2mib", 2 * 1024**2), + ("3kib", 3 * 1024), + ("4 GiB", 4 * 1024**3), + ("5 MiB", 5 * 1024**2), + ("6 KiB", 6 * 1024), + ("1gb", 1000**3), + ("2mb", 2 * 1000**2), + ("3kb", 3 * 1000), + ("4 GB", 4 * 1000**3), + ("5 MB", 5 * 1000**2), + ("6 KB", 6 * 1000), + ("6e2 KB", 600 * 1000), + ("1e-1 Kb", 100), + ("0.1 Kb", 100), + ("123 megabytes", ValueError), + (-1, ValueError), + (-0.1, ValueError), + ("-1%", ValueError), + ("-1gib", ValueError), + ( + "-1", + ( + ValueError, + "The memory limit must be specified in one of the following forms", + ), + ), + ), +) +def test_memory_limit_parsing(memory_limit, expected_result): + """ + Tests if various forms of memory limits are parsed correctly. + """ + if isinstance(expected_result, int): + assert expected_result == nvmath.internal.utils._get_memory_limit(memory_limit, 1_000) + else: + if isinstance(expected_result, tuple): + exception, pattern = expected_result + else: + exception, pattern = expected_result, None + + with pytest.raises(exception, match=pattern): + nvmath.internal.utils._get_memory_limit(memory_limit, 1_000) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support memory_limit options.") +def test_memory_limit(): + """ + Tests if specifying a memory limit doesn't break anything + """ + options = MatmulOptions() + options.memory_limit = 0.9 + check_matmul_with_options(10, options) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support memory_limit options.") +def test_memory_limit_filtering(): + """ + Tests if some algorithms are filtered with memory limit set. + """ + a = b = sample_matrix("numpy/cupy", "float32", (1000, 1000), True) + + def get_memory_requirements(algos): + return [alg.algorithm.workspace_size for alg in algos] + + all_memory = get_memory_requirements(Matmul(a, b).plan()) + + filtered = get_memory_requirements(Matmul(a, b, options=MatmulOptions(memory_limit="1 b")).plan()) + + assert max(filtered) < max(all_memory) + + +def test_logger(): + """ + Tests if specifying a custom logger works as expected. 
+ """ + import logging + from io import StringIO + + log_stream = StringIO() + logger = logging.Logger("test_logger", level=logging.DEBUG) + logger.addHandler(logging.StreamHandler(log_stream)) + options = MatmulOptions(logger=logger) + check_matmul_with_options(10, options) + assert len(log_stream.getvalue()) > 0 + + +def test_allocator(): + """ + Tests if manually specifying an allocator works + """ + if not is_torch_available(): + pytest.skip("no pytorch") + + from nvmath.memory import _MEMORY_MANAGER + + allocator = _MEMORY_MANAGER["torch"](0, logging.getLogger()) + options = MatmulOptions(allocator=allocator) + check_matmul_with_options(10, options) + + +def test_different_allocator(): + """ + Tests if matmul of torch tensors can be performed with cupy allocator + """ + from nvmath.memory import _MEMORY_MANAGER + + if not HAS_CUPY: + pytest.skip("cupy is required for this test") + + allocator = _MEMORY_MANAGER["cupy"](0, logging.getLogger()) + options = MatmulOptions(allocator=allocator) + check_matmul_with_options(10, options) + + +def test_custom_allocator(): + """ + Checks if custom allocator is actually used + """ + if not is_torch_available(): + pytest.skip("no pytorch") + + from nvmath.memory import _MEMORY_MANAGER + + class MockAllocator(_MEMORY_MANAGER["torch"]): + def __init__(self, device_id, logger): + super().__init__(device_id, logger) + self.counter = 0 + + def memalloc(self, size, *args, **kwargs): + print("ALLOC", size) + self.counter += 1 + return super().memalloc(size, *args, **kwargs) + + allocator = MockAllocator(0, logging.getLogger()) + options = MatmulOptions(allocator=allocator) + check_matmul_with_options(10, options) + assert allocator.counter >= 0 + + +def test_invalid_allocator(): + """ + Tests if reasonable error is produced when an invalid allocator is specified + """ + with pytest.raises(TypeError): + MatmulOptions(allocator="Hello, I'm a real allocator!") + + +def test_uninstantiated_allocator(): + """ + Tests if reasonable error is produced when an allocator class is provided instead of an + instance + """ + if not is_torch_available(): + pytest.skip("no pytorch") + + from nvmath.memory import _MEMORY_MANAGER + + try: + # This may not fail if allocator won't be used + options = MatmulOptions(allocator=_MEMORY_MANAGER["torch"]) + check_matmul_with_options(10, options) + except TypeError: + pass + + +@pytest.mark.skip(reason="The generic matmul API uses ExecutionCUDA instead of MatmulOptions for execution space.") +def test_device_id(): + """ + Tests if specifying a device id works as expected. + """ + options = MatmulOptions(device_id=0) + check_matmul_with_options(10, options, use_cuda=False) + + +@pytest.mark.skip(reason="The generic matmul API uses ExecutionCUDA instead of MatmulOptions for execution space.") +def test_invalid_device_id(): + """ + Tests if specifying negative device id raises an error + """ + if not HAS_CUPY: + pytest.skip("cupy is required for this test") + + options = MatmulOptions(device_id=-1) + with pytest.raises((RuntimeError, cupy_backends.cuda.api.runtime.CUDARuntimeError, ValueError), match="device"): + check_matmul_with_options(10, options) diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_perf.py b/tests/nvmath_tests/linalg/generic/matmul/test_perf.py new file mode 100644 index 0000000..fee710e --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_perf.py @@ -0,0 +1,76 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +try: + import cupy +except ModuleNotFoundError: + pytest.skip("cupy required for matmul tests", allow_module_level=True) +import nvmath +import numpy as np + +from nvmath_tests.helpers import time_cupy, print_aligned_table, matmul_perf_GFlops, matmul_flops + + +def run_test(data, precision, m, n, k, autotune=False, ncycles=10): + A = cupy.random.rand(m, k).astype(precision) + B = cupy.random.rand(k, n).astype(precision) + + with nvmath.linalg.generic.Matmul(A, B) as mm: + mm.plan() + if autotune: + raise NotImplementedError("Generic matmul APIs do not support autotune.") + mm.autotune() + + time_nvmath = time_cupy(lambda: mm.execute(), ncycles) + + time_cp = time_cupy(lambda: cupy.matmul(A, B), ncycles) + + data.append( + { + "precision": precision.__name__, + "autotune": "yes" if autotune else "no", + "m": m, + "n": n, + "k": k, + "nvmath-python [ms]": time_nvmath["time_ms"], + "cupy [ms]": time_cp["time_ms"], + "dataset_size [MiB]": (m * k + k * n + m * n) * precision(1).itemsize / (2**20), + "cupy [GFlop/s]": matmul_perf_GFlops(m, n, k, time_cp["time_ms"], precision), + "nvmath-python [GFlop/s]": matmul_perf_GFlops(m, n, k, time_nvmath["time_ms"], precision), + "speedup nvmath-python over cupy": time_cp["time_ms"] / time_nvmath["time_ms"], + } + ) + + return data + + +def test_matmul_perf(): + data = [] + + for precision in [np.float32, np.float64]: + for m in [2**i for i in range(14)]: + for k in [m, m // 2, m // 4]: + n = m + if matmul_flops(m, n, k, precision) < 1e8: + continue # skip small cases + + run_test(data, precision, m, n, k) + + print("\n") + cols = [ + "precision", + "autotune", + "m", + "n", + "k", + "dataset_size [MiB]", + "cupy [ms]", + "nvmath-python [ms]", + "cupy [GFlop/s]", + "nvmath-python [GFlop/s]", + "speedup nvmath-python over cupy", + ] + print_aligned_table(cols, data) diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_planning.py b/tests/nvmath_tests/linalg/generic/matmul/test_planning.py new file mode 100644 index 0000000..891b334 --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_planning.py @@ -0,0 +1,250 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This set of tests checks basic properties of separated planning. 
+""" + +from nvmath.bindings import cublasLt as cublaslt +from nvmath.linalg.generic import Matmul +import numpy as np +import pytest + +from ...utils import sample_matrix, allow_cublas_unsupported, assert_tensors_equal + +try: + import cupy +except ModuleNotFoundError: + pytest.skip("cupy required for matmul tests", allow_module_level=True) + + +def MatmulPlanPreferences(*args, **kwargs): + """Placeholder used to silence linter""" + pass + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize("dtype", ("float32", "complex64", "float64", "complex128")) +@pytest.mark.parametrize( + "n,m,k", + ( + (2, 3, 4), + (50, 51, 52), + (64, 32, 32), + (200, 100, 50), + ), +) +@pytest.mark.parametrize("max_waves_count", (0.99, 1.0)) +@pytest.mark.parametrize("iterations", (1, 5)) +@pytest.mark.parametrize("prune", (1, 5, 9)) +@pytest.mark.parametrize("use_cuda", (True, False)) +def test_autotuning( + framework, + dtype, + n, + m, + k, + max_waves_count, + iterations, + prune, + use_cuda, +): + a = sample_matrix(framework, dtype, (n, k), use_cuda) + b = sample_matrix(framework, dtype, (k, m), use_cuda) + c = sample_matrix(framework, dtype, (n, m), use_cuda) + mm = Matmul(a, b, beta=0.7, c=c) + with allow_cublas_unsupported( + allow_invalid_value=False, + message=( + f"Unsupported configuration: {framework}-{dtype}-{n}-{m}-{k}-{max_waves_count}-{iterations}-{prune}-{use_cuda}." + ), + ): + mm.plan(preferences=MatmulPlanPreferences(limit=9, max_waves_count=max_waves_count)) + num_algorithms = len(mm.algorithms) + mm.autotune(iterations=iterations, prune=prune) + assert len(mm.algorithms) == min(prune, num_algorithms) + assert_tensors_equal(mm.execute(), a @ b + c * 0.7) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize("dtype", ("float64", "complex128")) +@pytest.mark.parametrize( + "n,m,k", + ( + (1, 1, 1), + (64, 32, 96), + ), +) +@pytest.mark.parametrize("max_waves_count", (0.0, 1.0, 2.0)) +@pytest.mark.parametrize("use_cuda", (True, False)) +def test_plan(framework, dtype, n, m, k, max_waves_count, use_cuda): + a = sample_matrix(framework, dtype, (n, k), use_cuda) + b = sample_matrix(framework, dtype, (k, m), use_cuda) + c = sample_matrix(framework, dtype, (n, m), use_cuda) + mm = Matmul(a, b, beta=0.7, c=c) + mm.plan(preferences=MatmulPlanPreferences(limit=6, max_waves_count=max_waves_count)) + assert_tensors_equal(mm.execute(), a @ b + c * 0.7) + + +def test_multiple_executions(): + """ + Tests if single Matmul object can be reused. 
+ """ + a = cupy.zeros((10, 10)) + b = cupy.zeros((10, 10)) + mm = Matmul(a, b) + mm.plan() + for _ in range(5): + cupy.copyto(a, cupy.random.rand(*a.shape)) + cupy.copyto(b, cupy.random.rand(*b.shape)) + result = mm.execute() + assert_tensors_equal(result, a @ b) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +def test_limit(): + """ + Tests if limiting the number of algorithms works as expected + """ + a = cupy.zeros((10, 10)) + b = cupy.zeros((10, 10)) + mm = Matmul(a, b) + mm.plan(preferences=MatmulPlanPreferences(limit=3)) + assert len(mm.algorithms) <= 3 + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +def test_reduction_scheme(): + """ + Tests if one can specify reduction scheme + """ + a = cupy.zeros((1000, 1000)) + b = cupy.zeros((1000, 1000)) + mm = Matmul(a, b) + algos = mm.plan(preferences=MatmulPlanPreferences(reduction_scheme_mask=cublaslt.ReductionScheme.NONE, limit=64)) + assert not any(a.reduction_scheme for a in algos) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +def test_capabilities(): + """ + Tests if one can modify algorithm capabilities + """ + a = cupy.random.rand(1000, 1000, dtype=np.float32) + b = cupy.random.rand(1000, 1000, dtype=np.float32) + mm = Matmul(a, b) + mm.plan() + best = mm.algorithms[0] + best.tile = best.capabilities.tile_ids[-1] + with allow_cublas_unsupported(message=f"Unsupported tile: {best.tile}"): + # The chosen tile size might not be supported on some platforms + result = mm.execute() + assert_tensors_equal(result, a @ b) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize("serialize", (True, False)) +@pytest.mark.parametrize("use_cuda", (True, False)) +def test_algorithms(framework, serialize, use_cuda): + a = b = sample_matrix(framework, "float32", (20, 20), use_cuda) + mm = Matmul(a, b) + algos = mm.plan(preferences=MatmulPlanPreferences(limit=10)) + if serialize: + import pickle + + algos = pickle.loads(pickle.dumps(algos)) + c = d = sample_matrix(framework, "float32", (20, 20), use_cuda) + + # Test providing multiple algorithms + mm2 = Matmul(c, d) + mm2.plan(algorithms=algos) + assert_tensors_equal(mm2.execute(), c @ d) + + # Test executing a specified algorithm + mm3 = Matmul(c, d) + mm3.plan(algorithms=algos) + assert_tensors_equal(mm3.execute(algorithm=algos[0]), c @ d) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +@pytest.mark.parametrize("value", (None, 0, "algo")) +def test_algorithms_invalid(value): + a = b = sample_matrix("torch", "float32", (20, 20), True) + mm = Matmul(a, b) + with pytest.raises(AssertionError): + mm.plan(algorithms=[value]) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize("use_cuda", (True, False)) +def test_algorithm_not_planned(framework, use_cuda): + a = b = sample_matrix(framework, "float32", (20, 20), use_cuda) + mm = Matmul(a, b) + algos = mm.plan(preferences=MatmulPlanPreferences(limit=10)) + + mm2 = Matmul(a, b) + mm2.plan(algorithms=algos[1:]) + with pytest.raises( + ValueError, + match=r"Algorithm passed to execute\(\) has to be included in the plan\(\) algorithms", + ): + mm2.execute(algorithm=algos[0]) + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +def 
test_algorithm_ids(): + a = cupy.zeros((10, 10)) + b = cupy.zeros((10, 10)) + with Matmul(a, b) as mm: + assert len(mm.applicable_algorithm_ids(limit=4)) <= 4 + + +@pytest.mark.skip(reason="Generic matmul APIs do not support plan preferences.") +def test_algo_attributes(): + """ + Test Algorithm class setter/property + """ + m, n, k = 24, 24, 24 + a = cupy.random.rand(m, k) + b = cupy.random.rand(k, n) + + with Matmul(a, b) as mm: + algos = mm.plan() + best = algos[0] + + # An attribute may not be supported in all cuBLASLt versions (INVALID_VALUE). + + message = "The attribute '{attr}' is not supported in this version." + with allow_cublas_unsupported(allow_invalid_value=True, message=message.format(attr="stages")): + if best.capabilities.stages_ids: + best.stages = best.capabilities.stages_ids[-1] + assert best.stages == best.capabilities.stages_ids[-1] + + with allow_cublas_unsupported(allow_invalid_value=True, message=message.format(attr="split_k")): + best.split_k = 4 + assert best.split_k == 4 + + with allow_cublas_unsupported(allow_invalid_value=True, message=message.format(attr="reduction_scheme")): + best.reduction_scheme = best.capabilities.reduction_scheme_mask + assert best.reduction_scheme == best.capabilities.reduction_scheme_mask + + with allow_cublas_unsupported(allow_invalid_value=True, message=message.format(attr="cta_swizzling")): + best.cta_swizzling = True + assert best.cta_swizzling + + with allow_cublas_unsupported(allow_invalid_value=True, message=message.format(attr="custom_option")): + best.custom_option = 1 + assert best.custom_option == 1 + + with allow_cublas_unsupported(allow_invalid_value=True, message=message.format(attr="inner_shape")): + best.inner_shape = cublaslt.MatmulInnerShape.MMA884 + assert best.inner_shape == cublaslt.MatmulInnerShape.MMA884 + + with allow_cublas_unsupported(allow_invalid_value=True, message=message.format(attr="cluster_shape")): + best.cluster_shape = (1, 1, 1) + assert best.cluster_shape == (1, 1, 1) diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_qualifier.py b/tests/nvmath_tests/linalg/generic/matmul/test_qualifier.py new file mode 100644 index 0000000..db0fa3d --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_qualifier.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +"""Concept exploration for NumPy custom-dtype-backed MatrixQualifiers.""" + +import numpy as np +import pytest + +import nvmath.bindings.cublas as cublas + +from nvmath.linalg.generic import ( + DiagonalMatrixQualifier, + GeneralMatrixQualifier, + HermitianMatrixQualifier, + matrix_qualifiers_dtype, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, +) + + +@pytest.mark.parametrize( + "constructor", + [ + DiagonalMatrixQualifier, + GeneralMatrixQualifier, + HermitianMatrixQualifier, + SymmetricMatrixQualifier, + TriangularMatrixQualifier, + ], +) +def test_matrix_qualifier_constructor(constructor): + q = constructor.create() + assert isinstance(q, np.ndarray) + assert q.size == 1 + assert not q.shape + assert q.dtype == matrix_qualifiers_dtype + print(q) + + +def test_matrix_qualifier_set_ranges(): + q = np.empty(10, dtype=matrix_qualifiers_dtype) + print(q) + q[...] 
= GeneralMatrixQualifier.create(conjugate=True) + print(q) + q[6] = HermitianMatrixQualifier.create(conjugate=False, transpose=True) + print(q) + + print(q["abbreviation"]) + + print(q.dtype) + + +def test_matrix_qualifier_validity(): + g = GeneralMatrixQualifier.create() + assert GeneralMatrixQualifier.is_valid(g) + + h = HermitianMatrixQualifier.create() + assert HermitianMatrixQualifier.is_valid(h) + + assert not HermitianMatrixQualifier.is_valid(g) + assert not GeneralMatrixQualifier.is_valid(h) + + g["abbreviation"] = "xx" + assert not GeneralMatrixQualifier.is_valid(g) + + h["uplo"] = cublas.FillMode.UPPER + assert HermitianMatrixQualifier.is_valid(h) + h["uplo"] = -1 + assert not HermitianMatrixQualifier.is_valid(h) + + +def test_matrix_qualifier_attributes(): + t = TriangularMatrixQualifier.create( + conjugate=False, + transpose=True, + uplo=cublas.FillMode.UPPER, + diag=cublas.DiagType.UNIT, + ) + assert t["transpose"] + assert not t["conjugate"] + assert t["uplo"] == cublas.FillMode.UPPER + assert t["diag"] == cublas.DiagType.UNIT diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_reset.py b/tests/nvmath_tests/linalg/generic/matmul/test_reset.py new file mode 100644 index 0000000..1cc4228 --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_reset.py @@ -0,0 +1,378 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This set of tests checks reset_operands +""" + +import nvmath +import pytest + +from ...utils import assert_tensors_equal, random_torch_complex, sample_matrix, skip_if_cublas_before + +from . import CUBLAS_AVAILABLE, NVPL_AVAILABLE + +use_cuda_options = ( + *((True,) if CUBLAS_AVAILABLE else ()), + *((False,) if NVPL_AVAILABLE else ()), +) + + +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize("dtype", ("float32",)) +@pytest.mark.parametrize( + "reset_a", + ( + True, + False, + ), +) +@pytest.mark.parametrize( + "reset_b", + ( + True, + False, + ), +) +@pytest.mark.parametrize("with_alpha, reset_alpha", ((False, False), (True, False), (True, True))) +@pytest.mark.parametrize( + "with_c, reset_c, reset_beta, with_epilog, reset_epilog", + ( + # No c, no epilog + (False, False, False, False, False), + # With c, no epilog + (True, False, False, False, False), + (True, False, True, False, False), + (True, True, False, False, False), + (True, True, True, False, False), + # No c, with epilog + # (False, False, False, True, False), + # (False, False, False, True, True), + ), +) +@pytest.mark.parametrize("reset_to_none", (True, False)) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_reset( + framework, + dtype, + reset_a, + reset_b, + with_alpha, + reset_alpha, + with_c, + reset_c, + reset_beta, + with_epilog, + reset_epilog, + reset_to_none, + use_cuda, +): + """ + Tests resetting particular operands + """ + if not any((reset_a, reset_b, reset_c, reset_alpha, reset_beta, reset_epilog)): + pytest.skip("No operand will be reset in this test") + + m, n, k = 12, 34, 56 + a = sample_matrix(framework, dtype, (m, k), use_cuda) + b = sample_matrix(framework, dtype, (k, n), use_cuda) + c = sample_matrix(framework, dtype, (m, n), use_cuda) + alpha = 0.12 + beta = 0.34 + + if with_epilog: + skip_if_cublas_before(11501) # Epilog inputs not fully supported + + matmul_kwargs = {} + + if with_alpha: + matmul_kwargs["alpha"] = alpha + + if with_c: + matmul_kwargs["c"] = c + matmul_kwargs["beta"] = beta + + with 
nvmath.linalg.generic.Matmul(a, b, **matmul_kwargs) as mm: + mm.plan() + + reference1 = a @ b * (alpha if with_alpha else 1) + if with_c: + reference1 += c * beta + + result1 = mm.execute() + assert_tensors_equal(result1, reference1) + + if reset_to_none: + mm.reset_operands(None) + + new_a = sample_matrix(framework, dtype, (m, k), use_cuda) + new_b = sample_matrix(framework, dtype, (k, n), use_cuda) + new_c = sample_matrix(framework, dtype, (m, n), use_cuda) + new_alpha = 0.56 + new_beta = 0.78 + new_epilog_inputs = {"bias": sample_matrix(framework, dtype, (m, 1), use_cuda)} + + reset_kwargs = {} + if reset_a: + reset_kwargs["a"] = new_a + if reset_b: + reset_kwargs["b"] = new_b + if reset_c: + reset_kwargs["c"] = new_c + if reset_alpha: + reset_kwargs["alpha"] = new_alpha + if reset_beta: + reset_kwargs["beta"] = new_beta + if reset_epilog: + reset_kwargs["epilog_inputs"] = new_epilog_inputs + + all_operands_reset = reset_a and reset_b and (reset_c or not with_c) and (reset_epilog or not with_epilog) + if reset_to_none and not all_operands_reset: + with pytest.raises(ValueError): + mm.reset_operands(**reset_kwargs) + else: + mm.reset_operands(**reset_kwargs) + + reference2 = (new_a if reset_a else a) @ (new_b if reset_b else b) + reference2 *= new_alpha if reset_alpha else alpha if with_alpha else 1 + if with_c: + reference2 += (new_c if reset_c else c) * (new_beta if reset_beta else beta) + + result2 = mm.execute() + assert_tensors_equal(result2, reference2) + + +@pytest.mark.parametrize("framework", ("numpy/cupy",)) +@pytest.mark.parametrize("dtype", ("float64",)) +@pytest.mark.parametrize("a_mismatch", (True, False)) +@pytest.mark.parametrize("b_mismatch", (True, False)) +@pytest.mark.parametrize( + "with_c, c_mismatch, with_epilog, bias_mismatch", + ( + (False, False, False, False), + (True, False, False, False), + (True, True, False, False), + (False, False, False, False), + # (False, False, True, False), + # (False, False, True, True), + ), +) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_shape_mismatch( + framework, + dtype, + a_mismatch, + b_mismatch, + with_c, + c_mismatch, + with_epilog, + bias_mismatch, + use_cuda, +): + """ + Checks if resetting operands to ones of different shapes results in appropriate error + message + """ + m, n, k = 54, 32, 10 + a = sample_matrix(framework, dtype, (m, k), use_cuda) + b = sample_matrix(framework, dtype, (k, n), use_cuda) + c = sample_matrix(framework, dtype, (m, n), use_cuda) if with_c else None + + if with_epilog: + skip_if_cublas_before(11501) # Epilog inputs not fully supported + + with nvmath.linalg.generic.Matmul(a, b, c=c, beta=2 if with_c else None) as mm: + mm.plan() + mm.execute() + + new_a = sample_matrix(framework, dtype, (m, k + 1) if a_mismatch else (m, k), use_cuda) + new_b = sample_matrix(framework, dtype, (k, n + 3) if b_mismatch else (k, n), use_cuda) + new_c = sample_matrix(framework, dtype, (m + 9, n - 3) if c_mismatch else (m, n), use_cuda) if with_c else None + + if any((a_mismatch, b_mismatch, c_mismatch, bias_mismatch)): + with pytest.raises(ValueError, match="The extents .* must match"): + mm.reset_operands(a=new_a, b=new_b, c=new_c) + else: + pytest.skip("All shapes match") + + +@pytest.mark.parametrize("framework", ("numpy/cupy",)) +@pytest.mark.parametrize("dtype, bad_dtype", (("float64", "float32"), ("float32", "float64"))) +@pytest.mark.parametrize("a_mismatch", (True, False)) +@pytest.mark.parametrize("b_mismatch", (True, False)) +@pytest.mark.parametrize( + "with_c, c_mismatch, 
with_epilog, bias_mismatch", + ( + (False, False, False, False), + (True, False, False, False), + (True, True, False, False), + (False, False, False, False), + # (False, False, True, False), + # (False, False, True, True), + ), +) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_dtype_mismatch( + framework, + dtype, + bad_dtype, + a_mismatch, + b_mismatch, + with_c, + c_mismatch, + with_epilog, + bias_mismatch, + use_cuda, +): + """ + Checks if resetting operands to ones with different dtypes results in appropriate error + message + """ + + m, n, k = 19, 28, 37 + a = sample_matrix(framework, dtype, (m, k), use_cuda) + b = sample_matrix(framework, dtype, (k, n), use_cuda) + c = sample_matrix(framework, dtype, (m, n), use_cuda) if with_c else None + + if with_epilog: + skip_if_cublas_before(11501) # Epilog inputs not fully supported + + with nvmath.linalg.generic.Matmul(a, b, c=c, beta=2 if with_c else None) as mm: + mm.plan() + mm.execute() + + new_a = sample_matrix(framework, bad_dtype if a_mismatch else dtype, (m, k), use_cuda) + new_b = sample_matrix(framework, bad_dtype if b_mismatch else dtype, (k, n), use_cuda) + new_c = sample_matrix(framework, bad_dtype if c_mismatch else dtype, (m, n), use_cuda) if with_c else None + + if any((a_mismatch, b_mismatch, c_mismatch, bias_mismatch)): + with pytest.raises( + ValueError, + match="The data type of the new operand must match the data type of the original operand.", + ): + mm.reset_operands(a=new_a, b=new_b, c=new_c) + else: + # All shapes match, just check if nothing explodes here + mm.reset_operands(a=new_a, b=new_b, c=new_c) + mm.execute() + + +@pytest.mark.parametrize("framework, bad_framework", (("numpy/cupy", "torch"), ("torch", "numpy/cupy"))) +@pytest.mark.parametrize("dtype", ("float64",)) +@pytest.mark.parametrize("a_mismatch", (True, False)) +@pytest.mark.parametrize("b_mismatch", (True, False)) +@pytest.mark.parametrize( + "with_c, c_mismatch, with_epilog, bias_mismatch", + ( + (False, False, False, False), + (True, False, False, False), + (True, True, False, False), + (False, False, False, False), + # (False, False, True, False), + # (False, False, True, True), + ), +) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_framework_mismatch( + framework, + bad_framework, + dtype, + a_mismatch, + b_mismatch, + with_c, + c_mismatch, + with_epilog, + bias_mismatch, + use_cuda, +): + """ + Checks if resetting operands to ones from different framework results in appropriate + error message + """ + + m, n, k = 10, 11, 12 + a = sample_matrix(framework, dtype, (m, k), use_cuda) + b = sample_matrix(framework, dtype, (k, n), use_cuda) + c = sample_matrix(framework, dtype, (m, n), use_cuda) if with_c else None + + if with_epilog: + skip_if_cublas_before(11501) # Epilog inputs not fully supported + + with nvmath.linalg.generic.Matmul(a, b, c=c, beta=2 if with_c else None) as mm: + mm.plan() + mm.execute() + + new_a = sample_matrix(bad_framework if a_mismatch else framework, dtype, (m, k), use_cuda) + new_b = sample_matrix(bad_framework if b_mismatch else framework, dtype, (k, n), use_cuda) + new_c = sample_matrix(bad_framework if c_mismatch else framework, dtype, (m, n), use_cuda) if with_c else None + + if any((a_mismatch, b_mismatch, c_mismatch, bias_mismatch)): + with pytest.raises(TypeError, match="Library package mismatch"): + mm.reset_operands(a=new_a, b=new_b, c=new_c) + else: + # All dtypes match, just check if nothing explodes here + mm.reset_operands(a=new_a, b=new_b, c=new_c) + mm.execute() + + 
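+# The tests below cover two further reset_operands() constraints: the new operands must
+# keep the original strides (layout), and for complex torch tensors the conjugate flag is
+# inferred again on reset. A rough sketch of the stateful pattern they exercise
+# (hypothetical operand names; same-layout replacements assumed):
+#
+#     with nvmath.linalg.generic.Matmul(a, b, c=c, beta=1) as mm:
+#         mm.plan()
+#         first = mm.execute()
+#         mm.reset_operands(a=new_a, b=new_b, c=new_c)  # layouts must match the originals
+#         second = mm.execute()
+
+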
+@pytest.mark.parametrize("framework", ("numpy/cupy",)) +@pytest.mark.parametrize("dtype", ("float32",)) +@pytest.mark.parametrize("ta", (True, False)) +@pytest.mark.parametrize("tb", (True, False)) +@pytest.mark.parametrize("tc", (True, False)) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_layout_change(framework, dtype, ta, tb, tc, use_cuda): + """ + Check if layout change of the input matrix is handled correctly + """ + m = n = k = 5 + a = sample_matrix(framework, dtype, (m, k), use_cuda=use_cuda) + b = sample_matrix(framework, dtype, (k, n), use_cuda=use_cuda) + c = sample_matrix(framework, dtype, (m, n), use_cuda=use_cuda) + + with nvmath.linalg.generic.Matmul(a, b, c=c, beta=1) as mm: + mm.plan() + result1 = mm.execute() + assert_tensors_equal(result1, a @ b + c) + + a = a.T if ta else a + b = b.T if tb else b + c = c.T if tc else c + + if ta or tb or tc: + with pytest.raises(ValueError, match="The strides .* must match"): + mm.reset_operands(a=a, b=b, c=c) + + +@pytest.mark.parametrize("b_conj_init, b_conj_reset", ((True, False), (False, True))) +@pytest.mark.parametrize("use_cuda", use_cuda_options) +def test_conjugate_flag(b_conj_init, b_conj_reset, use_cuda): + """ + Tests if conjugate flag of torch tensors is inferred again on reset. + + Only checks GPU tensors, because conj flag is reset on H2D copy. + + Only checks B, because changing conj flag of A requires transposing it due to cublas + requirements, which causes stride mismatch. + """ + m, k, n = 3, 4, 5 + + a = random_torch_complex((m, k), use_cuda, True) + b = random_torch_complex((k, n), use_cuda, True) + c = random_torch_complex((m, n), use_cuda, False) + + if b_conj_init: + b = b.conj() + + with nvmath.linalg.generic.Matmul(a, b, c=c, beta=1) as mm: + mm.plan() + result1 = mm.execute() + assert_tensors_equal(result1, a @ b + c) + + b = random_torch_complex((k, n), use_cuda, True) + if b_conj_reset: + b = b.conj() + + with pytest.raises(ValueError): + mm.reset_operands(b=b) diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_stateful.py b/tests/nvmath_tests/linalg/generic/matmul/test_stateful.py new file mode 100644 index 0000000..33c476c --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_stateful.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest + +from nvmath.linalg.generic import ( + ExecutionCUDA, + GeneralMatrixQualifier, + Matmul, + matrix_qualifiers_dtype, +) + + +def test_unplanned(): + a = np.random.rand(4, 4) + b = np.random.rand(4, 4) + q = np.empty(2, dtype=matrix_qualifiers_dtype) + q[:] = GeneralMatrixQualifier.create() + + with ( + pytest.raises( + RuntimeError, + match=r"Execution cannot be performed before plan\(\) has been called", + ), + Matmul( + a, + b, + qualifiers=q, + execution=ExecutionCUDA(), + ) as mm, + ): + mm.execute() + + +def test_reset_operands(): + a = np.ones(shape=(4, 4)) + b = np.ones(shape=(4, 4)) + a1 = np.ones(shape=(4, 4)) * 2 + b1 = np.ones(shape=(4, 4)) * 3 + q = np.empty(2, dtype=matrix_qualifiers_dtype) + q[:] = GeneralMatrixQualifier.create() + + with Matmul( + a, + b, + qualifiers=q, + execution=ExecutionCUDA(), + ) as mm: + mm.plan() + r = mm.execute() + mm.reset_operands(a1, b1) + r1 = mm.execute() + np.testing.assert_equal(6 * r, r1) + + +def test_reset_operands_new_shape(): + a = np.random.rand(4, 4) + b = np.random.rand(4, 4) + a1 = np.random.rand(4, 4) + b1 = np.random.rand(4, 5) + q = np.empty(2, dtype=matrix_qualifiers_dtype) + q[:] = GeneralMatrixQualifier.create() + + with ( + pytest.raises( + ValueError, + 
match=r"The extents of the new operand must match the extents of the original operand.", + ), + Matmul( + a, + b, + qualifiers=q, + execution=ExecutionCUDA(), + ) as mm, + ): + mm.plan() + mm.execute() + mm.reset_operands(a1, b1) + mm.execute() + + +def test_reset_operands_new_dtype(): + a = np.random.rand(4, 4) + b = np.random.rand(4, 4).astype(np.double) + a1 = np.random.rand(4, 4) + b1 = np.random.rand(4, 4).astype(np.single) + q = np.empty(2, dtype=matrix_qualifiers_dtype) + q[:] = GeneralMatrixQualifier.create() + + with ( + pytest.raises( + ValueError, + match=r"The data type of the new operand must match the data type of the original operand.", + ), + Matmul( + a, + b, + qualifiers=q, + execution=ExecutionCUDA(), + ) as mm, + ): + mm.plan() + mm.execute() + mm.reset_operands(a1, b1) + mm.execute() diff --git a/tests/nvmath_tests/linalg/generic/matmul/test_wrap.py b/tests/nvmath_tests/linalg/generic/matmul/test_wrap.py new file mode 100644 index 0000000..252cdbc --- /dev/null +++ b/tests/nvmath_tests/linalg/generic/matmul/test_wrap.py @@ -0,0 +1,35 @@ +import logging +import pytest + +from nvmath.internal.typemaps import cudaDataType +from nvmath.linalg.generic import ExecutionCPU, ExecutionCUDA +from nvmath.linalg.generic._configuration import wrap + + +from . import CUBLAS_AVAILABLE, NVPL_AVAILABLE + + +@pytest.mark.skipif(not NVPL_AVAILABLE, reason="NVPL BLAS required for this test.") +def test_nvpl_blas_function_not_found(): + logger = logging.getLogger() + with pytest.raises(NotImplementedError): + wrap.nvpl_mm_function( + execution=ExecutionCPU(), + dtype=cudaDataType.CUDA_R_32F, + matrix_descr_abbreviation="xx", + logger=logger, + batch_type="group", + ) + + +@pytest.mark.skipif(not CUBLAS_AVAILABLE, reason="cuBLAS required for this test.") +def test_cublas_function_not_found(): + logger = logging.getLogger() + with pytest.raises(NotImplementedError): + wrap.cublas_mm_function( + execution=ExecutionCUDA(), + dtype=cudaDataType.CUDA_R_32F, + matrix_descr_abbreviation="xx", + logger=logger, + batch_type="group", + ) diff --git a/tests/nvmath_tests/linalg/test_layout.py b/tests/nvmath_tests/linalg/test_layout.py new file mode 100644 index 0000000..6dec6d1 --- /dev/null +++ b/tests/nvmath_tests/linalg/test_layout.py @@ -0,0 +1,93 @@ +import pytest + +from nvmath.bindings import cublasLt as cublaslt +from nvmath.linalg._internal.layout import BLASMatrixTraits + +COL = cublaslt.Order.COL +ROW = cublaslt.Order.ROW + + +order_cases = [ + # 0D + ((), (), COL, None), + # 1D - overlapping + ((1,), (0,), COL, None), + ((4,), (0,), COL, None), + # 1D - dense + ((1,), (1,), COL, None), + ((4,), (1,), COL, None), + # 1D - strided + ((1,), (8,), COL, None), + ((4,), (8,), COL, None), + # + # 2D - overlapping, overlapping + ((1, 1), (0, 0), COL, 1), + ((1, 4), (0, 0), COL, None), + ((4, 1), (0, 0), COL, None), + ((4, 4), (0, 0), COL, None), + # 2D - overlapping, dense + ((1, 1), (0, 1), COL, 1), + ((1, 4), (0, 1), COL, 1), + ((4, 1), (0, 1), COL, None), + ((4, 4), (0, 1), COL, None), + # 2D - overlapping, strided + ((1, 1), (0, 8), COL, 1), + ((1, 4), (0, 8), COL, 8), + ((4, 1), (0, 8), COL, None), + ((4, 4), (0, 8), COL, None), + # + # 2D - dense, overlapping + ((1, 1), (1, 0), COL, 1), + ((1, 4), (1, 0), COL, None), + ((4, 1), (1, 0), ROW, None), + ((4, 4), (1, 0), ROW, None), + # 2D - dense, dense + ((1, 1), (1, 1), COL, 1), + ((1, 4), (1, 1), COL, 1), + ((4, 1), (1, 4), ROW, None), + ((4, 4), (1, 4), COL, 4), + ((1, 4), (4, 1), COL, 1), + ((4, 1), (1, 1), ROW, None), + ((4, 4), (4, 1), 
ROW, None), + # 2D - dense, strided + ((1, 1), (1, 8), COL, 1), + ((1, 4), (1, 8), COL, 8), + ((4, 1), (1, 8), ROW, None), + ((4, 4), (1, 8), COL, 8), + # + # 2D - strided, overlapping + ((1, 1), (8, 0), COL, 1), + ((1, 4), (8, 0), COL, None), + ((4, 1), (8, 0), ROW, None), + ((4, 4), (8, 0), ROW, None), + # 2D - strided, dense + ((1, 1), (8, 1), COL, 1), + ((1, 4), (8, 1), COL, 1), + ((4, 1), (8, 1), ROW, None), + ((4, 4), (8, 1), ROW, None), + # 2D - strided, strided + ((1, 1), (8, 128), COL, 1), + ((1, 4), (8, 128), COL, 128), + ((4, 1), (8, 128), ROW, None), + ((4, 4), (8, 128), None, None), + ((1, 1), (128, 8), COL, 1), + ((1, 4), (128, 8), COL, 8), + ((4, 1), (128, 8), ROW, None), + ((4, 4), (128, 8), None, None), +] + + +@pytest.mark.parametrize("shape, strides, order, ld", order_cases) +def test_matrix_traits_order(shape, strides, order, ld): + t = BLASMatrixTraits(dtype=0, shape=shape, strides=strides, is_conjugate=False, is_lower=False, is_transpose=False) + if order is None: + with pytest.raises(ValueError, match="Unsupported layout"): + _ = t.order + return + assert t.order == order + + if ld is None: + with pytest.raises((ValueError, AssertionError)): + _ = t.ld + return + assert t.ld == ld diff --git a/tests/nvmath_tests/linalg/advanced/matmul/utils.py b/tests/nvmath_tests/linalg/utils.py similarity index 98% rename from tests/nvmath_tests/linalg/advanced/matmul/utils.py rename to tests/nvmath_tests/linalg/utils.py index cc04d98..bff0e43 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/utils.py +++ b/tests/nvmath_tests/linalg/utils.py @@ -71,6 +71,8 @@ def to_numpy(tensor): return cupy.asnumpy(tensor) elif isinstance(tensor, np.ndarray): return tensor + elif isinstance(tensor, np.number): + return np.array(tensor) else: msg = f"Cannot convert to numpy from {type(tensor)}" raise AssertionError(msg) diff --git a/tests/nvmath_tests/ndbuffer/test_ndbuffer.py b/tests/nvmath_tests/ndbuffer/test_ndbuffer.py index ccf8b5d..d8810e8 100644 --- a/tests/nvmath_tests/ndbuffer/test_ndbuffer.py +++ b/tests/nvmath_tests/ndbuffer/test_ndbuffer.py @@ -634,6 +634,12 @@ def test_wide_strides_large_volume_copy(caplog, shape, slice, permutation, dtype def test_unsupported_ndim(): with pytest.raises(ValueError, match="Max supported ndim is 32"): ndb.empty(shape=(1,) * 33, dtype_name="int8", itemsize=1, device_id=ndb.CPU_DEVICE_ID) + # For numpy==1.*, the maximum numpy.ndarray ndim is also 32, so we cannot test + # conversion + if int(np.__version__.split(".")[0]) < 2: + with pytest.raises(ValueError, match=("maximum supported dimension for an ndarray is 32")): + np.zeros(shape=(1,) * 34, dtype="float32") + return with pytest.raises(ValueError, match="Max supported ndim is 32"): wrap_operand(np.zeros(shape=(1,) * 34, dtype="float32")).asndbuffer() diff --git a/tests/nvmath_tests/sparse/advanced/test_sparse.py b/tests/nvmath_tests/sparse/advanced/test_sparse.py index fada337..d8ce310 100644 --- a/tests/nvmath_tests/sparse/advanced/test_sparse.py +++ b/tests/nvmath_tests/sparse/advanced/test_sparse.py @@ -28,7 +28,7 @@ supported_dtypes, supported_sparse_array_types, supported_exec_space_dense_rhs, - supported_index_dtype, + supported_index_dtypes, supported_sparse_type_dtype, ) from .utils.common_axes import np, cp @@ -175,18 +175,21 @@ def get_alg_matrix_type_and_view(sparse_type, sparse_view): "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", "density", ), [ - (framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k, 
Param("density", density)) + (framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k, Param("density", density)) for framework in Framework.enabled() if framework in sparse_supporting_frameworks for exec_space in ExecutionSpace for operand_placement in framework2operand_placement[framework] for sparse_array_type in supported_sparse_array_types + for index_type in supported_index_dtypes + if index_type in framework2index_dtype[framework] for dtype in supported_dtypes if dtype in framework2dtype[framework] for n in [1, 10] @@ -197,9 +200,11 @@ def get_alg_matrix_type_and_view(sparse_type, sparse_view): ], ids=idfn, ) -def test_matrix_solve(framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k, density): +def test_matrix_solve(framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k, density): density = density.value - a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, density, dtype, seed=42) + a = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, density, dtype, seed=42, index_dtype=index_type + ) tensor_framework = framework2tensor_framework[framework] b = create_dense_rhs(tensor_framework, operand_placement, rhs_k, dtype) x = nvmath.sparse.advanced.direct_solver(a, b, execution=exec_space.nvname) @@ -213,6 +218,7 @@ def test_matrix_solve(framework, exec_space, operand_placement, sparse_array_typ "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", @@ -225,6 +231,7 @@ def test_matrix_solve(framework, exec_space, operand_placement, sparse_array_typ exec_space, operand_placement, sparse_array_type, + index_type, dtype, n, rhs_k, @@ -236,6 +243,9 @@ def test_matrix_solve(framework, exec_space, operand_placement, sparse_array_typ for exec_space in ExecutionSpace for operand_placement in [rng.choice(framework2operand_placement[framework])] for sparse_array_type in supported_sparse_array_types + for index_type in [ + rng.choice([index_type for index_type in supported_index_dtypes if index_type in framework2index_dtype[framework]]) + ] for dtype in [rng.choice([dtype for dtype in supported_dtypes if dtype in framework2dtype[framework]])] for n in [16] for rhs_k in [RHSVector(n), RHSMatrix(n, 5)] @@ -245,12 +255,16 @@ def test_matrix_solve(framework, exec_space, operand_placement, sparse_array_typ ids=idfn, ) def test_matrix_unsupported_reset_density_change( - framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k, density_0, density_1 + framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k, density_0, density_1 ): density_0 = density_0.value density_1 = density_1.value - a_0 = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, density_0, dtype, seed=42) - a_1 = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, density_1, dtype, seed=44) + a_0 = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, density_0, dtype, seed=42, index_dtype=index_type + ) + a_1 = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, density_1, dtype, seed=44, index_dtype=index_type + ) tensor_framework = framework2tensor_framework[framework] b = create_dense_rhs(tensor_framework, operand_placement, rhs_k, dtype) with nvmath.sparse.advanced.DirectSolver(a_0, b, execution=exec_space.nvname) as solver: @@ -273,6 +287,7 @@ def 
test_matrix_unsupported_reset_density_change( "operand_placement", "device_id", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", @@ -284,6 +299,7 @@ def test_matrix_unsupported_reset_density_change( operand_placement, Param("device_id", 1), sparse_array_type, + index_type, dtype, n, rhs_k, @@ -293,6 +309,8 @@ def test_matrix_unsupported_reset_density_change( for exec_space in ExecutionSpace for operand_placement in framework2operand_placement[framework] for sparse_array_type in supported_sparse_array_types + for index_type in supported_index_dtypes + if index_type in framework2index_dtype[framework] for dtype in supported_dtypes if dtype in framework2dtype[framework] for n in [3, 12] @@ -303,7 +321,7 @@ def test_matrix_unsupported_reset_density_change( ) @multi_gpu_only def test_matrix_solve_non_default_device_id( - framework, exec_space, operand_placement, device_id, sparse_array_type, dtype, n, rhs_k + framework, exec_space, operand_placement, device_id, sparse_array_type, index_type, dtype, n, rhs_k ): density = 0.5 device_id = device_id.value @@ -313,7 +331,16 @@ def test_matrix_solve_non_default_device_id( device_id = "cpu" tensor_framework = framework2tensor_framework[framework] a_0 = create_random_sparse_matrix( - framework, operand_placement, sparse_array_type, n, n, density, dtype, seed=42, device_id=device_id + framework, + operand_placement, + sparse_array_type, + n, + n, + density, + dtype, + seed=42, + index_dtype=index_type, + device_id=device_id, ) b = create_dense_rhs(tensor_framework, operand_placement, rhs_k, dtype, device_id=device_id, start=1) with nvmath.sparse.advanced.DirectSolver(a_0, b, execution=options) as solver: @@ -324,7 +351,16 @@ def test_matrix_solve_non_default_device_id( check(a_0, b, x) del a_0 a_1 = create_random_sparse_matrix( - framework, operand_placement, sparse_array_type, n, n, density, dtype, seed=44, device_id=device_id + framework, + operand_placement, + sparse_array_type, + n, + n, + density, + dtype, + seed=44, + index_dtype=index_type, + device_id=device_id, ) solver.reset_operands(a=a_1) solver.plan() @@ -429,6 +465,7 @@ def test_matrix_solve_device_id( "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", @@ -443,6 +480,7 @@ def test_matrix_solve_device_id( exec_space, operand_placement, sparse_array_type, + index_type, dtype, n, rhs_k, @@ -456,6 +494,9 @@ def test_matrix_solve_device_id( for exec_space in [ExecutionSpace.cudss_cuda] for operand_placement in [rng.choice(framework2operand_placement[framework])] for sparse_array_type in supported_sparse_array_types + for index_type in [ + rng.choice([index_type for index_type in supported_index_dtypes if index_type in framework2index_dtype[framework]]) + ] for dtype in [rng.choice([dtype for dtype in supported_dtypes if dtype in framework2dtype[framework]])] for n in [15] for rhs_k in [rng.choice([RHSVector(n), RHSMatrix(n, 3)])] @@ -471,6 +512,7 @@ def test_matrix_solve_cuda_options( exec_space, operand_placement, sparse_array_type, + index_type, dtype, n, rhs_k, @@ -478,7 +520,9 @@ def test_matrix_solve_cuda_options( memory_mode_format, ): assert exec_space == ExecutionSpace.cudss_cuda - a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, 0.5, dtype, seed=42) + a = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, 0.5, dtype, seed=42, index_dtype=index_type + ) tensor_framework = framework2tensor_framework[framework] b = create_dense_rhs(tensor_framework, 
operand_placement, rhs_k, dtype) host_memory_estimates = [] @@ -516,6 +560,7 @@ def test_matrix_solve_cuda_options( "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", @@ -529,6 +574,7 @@ def test_matrix_solve_cuda_options( exec_space, operand_placement, sparse_array_type, + index_type, dtype, n, rhs_k, @@ -541,6 +587,9 @@ def test_matrix_solve_cuda_options( for exec_space in ExecutionSpace for operand_placement in [rng.choice(framework2operand_placement[framework])] for sparse_array_type in supported_sparse_array_types + for index_type in [ + rng.choice([index_type for index_type in supported_index_dtypes if index_type in framework2index_dtype[framework]]) + ] for dtype in [ rng.choice([dtype for dtype in supported_sparse_type_dtype[sparse_type] if dtype in framework2dtype[framework]]) ] @@ -558,6 +607,7 @@ def test_solver_matrix_type_options( exec_space, operand_placement, sparse_array_type, + index_type, dtype, n, rhs_k, @@ -576,6 +626,7 @@ def test_solver_matrix_type_options( None, dtype, seed=42, + index_dtype=index_type, alg_matrix_type=alg_matrix_type, alg_matrix_view=alg_matrix_view, ) @@ -640,6 +691,10 @@ def test_solver_matrix_options_external_handle( continue sparse_type = rng.choice(list(DirectSolverMatrixType)) sparse_view = rng.choice(list(DirectSolverMatrixViewType)) + index_types = [ + index_type for index_type in supported_index_dtypes if index_type in framework2index_dtype[framework] + ] + index_type = rng.choice(index_types) dtypes = [dtype for dtype in supported_sparse_type_dtype[sparse_type] if dtype in framework2dtype[framework]] dtype = rng.choice(dtypes) alg_matrix_type, alg_matrix_view = get_alg_matrix_type_and_view(sparse_type, sparse_view) @@ -652,6 +707,7 @@ def test_solver_matrix_options_external_handle( None, dtype, seed=42, + index_dtype=index_type, alg_matrix_type=alg_matrix_type, alg_matrix_view=alg_matrix_view, ) @@ -695,6 +751,7 @@ def test_solver_matrix_options_external_handle( "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", @@ -710,6 +767,7 @@ def test_solver_matrix_options_external_handle( exec_space, operand_placement, sparse_array_type, + index_type, dtype, n, rhs_k, @@ -725,6 +783,9 @@ def test_solver_matrix_options_external_handle( for exec_space in [ExecutionSpace.cudss_cuda] for operand_placement in [rng.choice(framework2operand_placement[framework])] for sparse_array_type in supported_sparse_array_types + for index_type in [ + rng.choice([index_type for index_type in supported_index_dtypes if index_type in framework2index_dtype[framework]]) + ] for dtype in [rng.choice([dtype for dtype in supported_dtypes if dtype in framework2dtype[framework]])] for n in [15] for rhs_k in [rng.choice([RHSVector(n), RHSMatrix(n, 3)])] @@ -741,6 +802,7 @@ def test_matrix_solve_cuda_options_too_tight_limit( exec_space, operand_placement, sparse_array_type, + index_type, dtype, n, rhs_k, @@ -748,7 +810,9 @@ def test_matrix_solve_cuda_options_too_tight_limit( memory_mode_format, ): assert exec_space == ExecutionSpace.cudss_cuda - a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42) + a = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42, index_dtype=index_type + ) tensor_framework = framework2tensor_framework[framework] b = create_dense_rhs(tensor_framework, operand_placement, rhs_k, dtype) execution = get_exec_cuda_options( @@ -768,17 +832,21 @@ def 
test_matrix_solve_cuda_options_too_tight_limit( "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", ), [ - (framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k) + (framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k) for framework in Framework.enabled() if framework in sparse_supporting_frameworks for exec_space in [ExecutionSpace.cudss_hybrid] for operand_placement in framework2operand_placement[framework] for sparse_array_type in supported_sparse_array_types + for index_type in [ + rng.choice([index_type for index_type in supported_index_dtypes if index_type in framework2index_dtype[framework]]) + ] for dtype in [rng.choice([dtype for dtype in supported_dtypes if dtype in framework2dtype[framework]])] for n in [11] for rhs_k in [RHSMatrix(n, 1), RHSMatrix(n, 11)] @@ -786,10 +854,12 @@ def test_matrix_solve_cuda_options_too_tight_limit( ids=idfn, ) def test_matrix_solve_hybrid_multiple_rhs_unsupported( - framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k + framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k ): assert rhs_k.type not in supported_exec_space_dense_rhs[exec_space] - a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42) + a = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42, index_dtype=index_type + ) tensor_framework = framework2tensor_framework[framework] b = create_dense_rhs(tensor_framework, operand_placement, rhs_k, dtype) with pytest.raises(TypeError, match="multiple RHS"): @@ -847,7 +917,7 @@ def test_matrix_solve_unsupported_dtype(framework, exec_space, operand_placement for exec_space in ExecutionSpace for operand_placement in framework2operand_placement[framework] for sparse_array_type in supported_sparse_array_types - for index_types in [[dtype for dtype in framework2index_dtype[framework] if dtype not in supported_index_dtype]] + for index_types in [[dtype for dtype in framework2index_dtype[framework] if dtype not in supported_index_dtypes]] if index_types for index_type in [rng.choice(index_types)] for dtype in [rng.choice([dtype for dtype in supported_dtypes if dtype in framework2dtype[framework]])] @@ -859,7 +929,7 @@ def test_matrix_solve_unsupported_dtype(framework, exec_space, operand_placement def test_matrix_solve_unsupported_index_dtype( framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k ): - assert index_type not in supported_index_dtype + assert index_type not in supported_index_dtypes a = create_random_sparse_matrix( framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42, index_dtype=index_type ) @@ -889,7 +959,16 @@ def _generate_rhs_k(batch_size, lhs_batching_mode, rhs_batching_mode, max_value) def _generate_lhs_batch( - lhs_batching_mode, batch_size, framework, exec_space, operand_placement, sparse_array_type, dtype, ns, seed=42 + lhs_batching_mode, + batch_size, + framework, + exec_space, + operand_placement, + sparse_array_type, + dtype, + ns, + seed=42, + index_dtype=DType.int32, ): assert isinstance(ns, int) or len(ns) == batch_size @@ -897,7 +976,9 @@ def _generate_lhs_batch( if isinstance(ns, int): ns = [ns] * batch_size a = [ - create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, 0.5, dtype, seed=seed + i) + create_random_sparse_matrix( + framework, operand_placement, 
sparse_array_type, n, n, 0.5, dtype, seed=seed + i, index_dtype=index_dtype + ) for i, n in enumerate(ns) ] else: @@ -908,7 +989,16 @@ def _generate_lhs_batch( assert all(m == n for m in ns) assert framework == Framework.torch a = create_random_sparse_matrix( - framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42, batch_dims=batch_size + framework, + operand_placement, + sparse_array_type, + n, + n, + None, + dtype, + seed=42, + index_dtype=index_dtype, + batch_dims=batch_size, ) return a @@ -974,6 +1064,7 @@ def _check_batched_result( "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "batch_size", "lhs_batching_mode", @@ -987,6 +1078,7 @@ def _check_batched_result( exec_space, operand_placement, sparse_array_type, + index_type, dtype, batch_size, lhs_batching_mode, @@ -999,6 +1091,8 @@ def _check_batched_result( for exec_space in ExecutionSpace for operand_placement in framework2operand_placement[framework] for sparse_array_type in supported_sparse_array_types + for index_type in [DType.int32] # int64 is not supported with batching. + if index_type in framework2index_dtype[framework] for dtype in supported_dtypes if dtype in framework2dtype[framework] for lhs_batching_mode in (["sequence", "tensor"] if framework == Framework.torch else ["sequence"]) @@ -1015,6 +1109,7 @@ def test_batching( exec_space, operand_placement, sparse_array_type, + index_type, dtype, batch_size: int | tuple[int, int], lhs_batching_mode: typing.Literal["sequence", "tensor"], @@ -1023,7 +1118,15 @@ def test_batching( rhs_ks, ): a = _generate_lhs_batch( - lhs_batching_mode, batch_size, framework, exec_space, operand_placement, sparse_array_type, dtype, ns + lhs_batching_mode, + batch_size, + framework, + exec_space, + operand_placement, + sparse_array_type, + dtype, + ns, + index_dtype=index_type, ) b = _generate_rhs_batch(rhs_batching_mode, batch_size, framework, exec_space, operand_placement, dtype, ns, rhs_ks) @@ -1043,17 +1146,20 @@ def test_batching( "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", ), [ - (framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k) + (framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k) for framework in Framework.enabled() if framework in sparse_supporting_frameworks for exec_space in ExecutionSpace for operand_placement in framework2operand_placement[framework] for sparse_array_type in supported_sparse_array_types + for index_type in supported_index_dtypes + if index_type in framework2index_dtype[framework] for dtype in supported_dtypes if dtype in framework2dtype[framework] for n in [1, 10] @@ -1072,12 +1178,18 @@ def test_batching( if not free or (lhs and rhs) # both operands need to be set after freeing ], ) -def test_reset(framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k, reset_lhs, reset_rhs, free): - a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42) +def test_reset( + framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k, reset_lhs, reset_rhs, free +): + a = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42, index_dtype=index_type + ) tensor_framework = framework2tensor_framework[framework] b = create_dense_rhs(tensor_framework, operand_placement, rhs_k, dtype) - a2 = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, 
n, n, None, dtype, seed=42 + 1) + a2 = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42 + 1, index_dtype=index_type + ) b2 = create_dense_rhs(tensor_framework, operand_placement, rhs_k, dtype, start=-100) with nvmath.sparse.advanced.DirectSolver(a, b, execution=exec_space.nvname) as solver: @@ -1113,6 +1225,7 @@ def test_reset(framework, exec_space, operand_placement, sparse_array_type, dtyp "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "batch_size", "lhs_batching_mode", @@ -1126,6 +1239,7 @@ def test_reset(framework, exec_space, operand_placement, sparse_array_type, dtyp exec_space, operand_placement, sparse_array_type, + index_type, dtype, batch_size, lhs_batching_mode, @@ -1138,6 +1252,8 @@ def test_reset(framework, exec_space, operand_placement, sparse_array_type, dtyp for exec_space in ExecutionSpace for operand_placement in framework2operand_placement[framework] for sparse_array_type in supported_sparse_array_types + for index_type in [DType.int32] # int64 is not supported with batching. + if index_type in framework2index_dtype[framework] for dtype in supported_dtypes if dtype in framework2dtype[framework] for lhs_batching_mode in (["sequence", "tensor"] if framework == Framework.torch else ["sequence"]) @@ -1157,6 +1273,7 @@ def test_reset_batched( exec_space, operand_placement, sparse_array_type, + index_type, dtype, batch_size: int | tuple[int, int], lhs_batching_mode: typing.Literal["sequence", "tensor"], @@ -1167,12 +1284,30 @@ def test_reset_batched( reset_rhs, ): a = _generate_lhs_batch( - lhs_batching_mode, batch_size, framework, exec_space, operand_placement, sparse_array_type, dtype, ns, seed=42 + lhs_batching_mode, + batch_size, + framework, + exec_space, + operand_placement, + sparse_array_type, + dtype, + ns, + seed=42, + index_dtype=index_type, ) b = _generate_rhs_batch(rhs_batching_mode, batch_size, framework, exec_space, operand_placement, dtype, ns, rhs_ks) a2 = _generate_lhs_batch( - lhs_batching_mode, batch_size, framework, exec_space, operand_placement, sparse_array_type, dtype, ns, seed=42 + 1 + lhs_batching_mode, + batch_size, + framework, + exec_space, + operand_placement, + sparse_array_type, + dtype, + ns, + seed=42 + 1, + index_dtype=index_type, ) b2 = _generate_rhs_batch( rhs_batching_mode, batch_size, framework, exec_space, operand_placement, dtype, ns, rhs_ks, start=-100 @@ -1285,12 +1420,13 @@ def test_invalid_sparse_format( "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", ), [ - (framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k) + (framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k) for framework in Framework.enabled() if framework in sparse_supporting_frameworks for exec_space in ExecutionSpace @@ -1299,6 +1435,8 @@ def test_invalid_sparse_format( # won't be visible to the solver. 
if exec_space != ExecutionSpace.cudss_cuda or operand_placement != OperandPlacement.host for sparse_array_type in supported_sparse_array_types + for index_type in supported_index_dtypes + if index_type in framework2index_dtype[framework] for dtype in [DType.float64, DType.complex128] if dtype in framework2dtype[framework] for n in [105, 333] @@ -1307,11 +1445,15 @@ def test_invalid_sparse_format( ], ids=idfn, ) -def test_matrix_solve_inplace_reset_blocking_auto(framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k): +def test_matrix_solve_inplace_reset_blocking_auto( + framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k +): stream = get_custom_stream(framework) if operand_placement == OperandPlacement.device else None with use_stream_or_dummy_ctx(framework, stream): - a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42) + a = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42, index_dtype=index_type + ) a_orig = a.copy() if framework != Framework.torch else a.clone() a_modifed = a.copy() if framework != Framework.torch else a.clone() b = create_dense_rhs(framework2tensor_framework[framework], operand_placement, rhs_k, dtype) @@ -1350,12 +1492,13 @@ def test_matrix_solve_inplace_reset_blocking_auto(framework, exec_space, operand "exec_space", "operand_placement", "sparse_array_type", + "index_type", "dtype", "n", "rhs_k", ), [ - (framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k) + (framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k) for framework in Framework.enabled() if framework in sparse_supporting_frameworks for exec_space in ExecutionSpace @@ -1364,6 +1507,8 @@ def test_matrix_solve_inplace_reset_blocking_auto(framework, exec_space, operand # won't be visible to the solver. 
if exec_space != ExecutionSpace.cudss_cuda or operand_placement != OperandPlacement.host for sparse_array_type in supported_sparse_array_types + for index_type in supported_index_dtypes + if index_type in framework2index_dtype[framework] for dtype in [DType.float64, DType.complex128] if dtype in framework2dtype[framework] for n in [105, 333] @@ -1372,11 +1517,13 @@ def test_matrix_solve_inplace_reset_blocking_auto(framework, exec_space, operand ], ids=idfn, ) -def test_matrix_solve_always_blocking(framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k): +def test_matrix_solve_always_blocking(framework, exec_space, operand_placement, sparse_array_type, index_type, dtype, n, rhs_k): stream = get_custom_stream(framework) if operand_placement == OperandPlacement.device else None other_stream = get_custom_stream(framework) if operand_placement == OperandPlacement.device else None - a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42) + a = create_random_sparse_matrix( + framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42, index_dtype=index_type + ) ref_a = a.copy() if framework != Framework.torch else a.clone() b = create_dense_rhs(framework2tensor_framework[framework], operand_placement, rhs_k, dtype) b_ref = b.copy() if framework != Framework.torch else b.clone() diff --git a/tests/nvmath_tests/sparse/advanced/utils/support_matrix.py b/tests/nvmath_tests/sparse/advanced/utils/support_matrix.py index cb706ae..b97c298 100644 --- a/tests/nvmath_tests/sparse/advanced/utils/support_matrix.py +++ b/tests/nvmath_tests/sparse/advanced/utils/support_matrix.py @@ -13,7 +13,7 @@ supported_dtypes = (DType.float32, DType.float64, DType.complex64, DType.complex128) -supported_index_dtypes = (DType.int32,) +supported_index_dtypes = (DType.int32, DType.int64) supported_sparse_array_types = (SparseArrayType.CSR,) @@ -29,5 +29,3 @@ DirectSolverMatrixType.SPD: [DType.float32, DType.float64], DirectSolverMatrixType.HPD: [DType.complex64, DType.complex128], } - -supported_index_dtype = (DType.int32,) diff --git a/tests/nvmath_tests/tensor/__init__.py b/tests/nvmath_tests/tensor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/nvmath_tests/tensor/conftest.py b/tests/nvmath_tests/tensor/conftest.py new file mode 100644 index 0000000..1262aa2 --- /dev/null +++ b/tests/nvmath_tests/tensor/conftest.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 +import sys +import pytest +import cuda.core.experimental as ccx + +if sys.platform == "win32": + pytest.skip("Skipping tensor contraction tests because they are not supported on Windows.", allow_module_level=True) + + +# starting cutensor 2.3.0, support only compute capability > 7.0 +def pytest_collection_modifyitems(config, items): + """Skip all tests in this directory if compute capability <= 7.0""" + if ccx.Device().compute_capability <= (7, 0): + skip_marker = pytest.mark.skip(reason="cuTensor 2.3.1+ requires compute capability > 7.0") + for item in items: + item.add_marker(skip_marker) diff --git a/tests/nvmath_tests/tensor/test_stateful_contraction.py b/tests/nvmath_tests/tensor/test_stateful_contraction.py new file mode 100644 index 0000000..0ce51fa --- /dev/null +++ b/tests/nvmath_tests/tensor/test_stateful_contraction.py @@ -0,0 +1,143 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from nvmath.tensor import BinaryContraction, TernaryContraction, ContractionCacheMode, ContractionAutotuneMode +from .utils.check_helpers import get_contraction_ref, assert_all_close, get_contraction_tolerance +from .utils.base_testers import BaseStatefulTester + +from .utils.common_axes import Framework, JitOption, AlgoOption, KernelRankOption +from .utils.data import contraction_test_cases +from .utils.support_matrix import framework_backend_support, framework_type_support + + +@pytest.mark.parametrize( + ( + "test_case", + "framework", + "mem_backend", + "dtype", + ), + [ + ( + test_case, + framework, + mem_backend, + dtype, + ) + for test_case in contraction_test_cases + for framework in Framework.enabled() + for mem_backend in framework_backend_support[framework] + for dtype in framework_type_support[framework] + ], +) +class TestStatefulContraction(BaseStatefulTester): + @pytest.mark.parametrize("alpha", [False, 0, 0.3, 0.2 + 0.3j]) + @pytest.mark.parametrize("beta", [False, 0, 0.5, 0.4 + 0.5j]) + @pytest.mark.parametrize("use_offset", [False, True]) + def test_coefficients(self, alpha, beta, use_offset, test_case, framework, mem_backend, dtype): + self._test_coefficients(alpha, beta, use_offset, test_case, framework, mem_backend, dtype) + + def test_autotune(self, test_case, framework, mem_backend, dtype): + alpha, beta = 0.3, 0.4 + if test_case.num_inputs == 2: + a, b = test_case.gen_input_operands(framework, dtype, mem_backend, 23) + c = test_case.gen_random_output(framework, dtype, mem_backend, 24) + contraction = BinaryContraction(test_case.equation, a, b, c=c) + reference = get_contraction_ref(test_case.equation, a, b, c=c, alpha=alpha, beta=beta) + elif test_case.num_inputs == 3: + a, b, c = test_case.gen_input_operands(framework, dtype, mem_backend, 23) + d = test_case.gen_random_output(framework, dtype, mem_backend, 24) + contraction = TernaryContraction(test_case.equation, a, b, c, d=d) + reference = get_contraction_ref(test_case.equation, a, b, c=c, d=d, alpha=alpha, beta=beta) + else: + raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}") + + tolerance = get_contraction_tolerance(dtype.name, None) + + with contraction: + plan_preference = contraction.plan_preference + plan_preference.autotune_mode = ContractionAutotuneMode.INCREMENTAL + plan_preference.incremental_count = 3 + contraction.plan() + for _ in range(5): + result = contraction.execute(alpha=alpha, beta=beta) + assert_all_close(result, reference, **tolerance) + + def test_non_caching(self, test_case, framework, mem_backend, dtype): + alpha, beta = 0.3, 0.4 + if test_case.num_inputs == 2: + a, b = test_case.gen_input_operands(framework, dtype, mem_backend, 23) + c = test_case.gen_random_output(framework, dtype, mem_backend, 24) + contraction = BinaryContraction(test_case.equation, a, b, c=c) + reference = get_contraction_ref(test_case.equation, a, b, c=c, alpha=alpha, beta=beta) + elif test_case.num_inputs == 3: + a, b, c = test_case.gen_input_operands(framework, dtype, mem_backend, 23) + d = test_case.gen_random_output(framework, dtype, mem_backend, 24) + contraction = TernaryContraction(test_case.equation, a, b, c, d=d) + reference = get_contraction_ref(test_case.equation, a, b, c=c, d=d, alpha=alpha, beta=beta) + else: + raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}") + + tolerance = get_contraction_tolerance(dtype.name, None) + + with contraction: + plan_preference = contraction.plan_preference + 
plan_preference.cache_mode = ContractionCacheMode.NONE
+            contraction.plan()
+            result = contraction.execute(alpha=alpha, beta=beta)
+            assert_all_close(result, reference, **tolerance)
+
+    @pytest.mark.parametrize("algo", AlgoOption)
+    @pytest.mark.parametrize("kernel_rank", KernelRankOption)
+    def test_algorithm_kernel_rank(self, algo, kernel_rank, test_case, framework, mem_backend, dtype):
+        alpha, beta = 0.3, 0.4
+        if test_case.num_inputs == 2:
+            a, b = test_case.gen_input_operands(framework, dtype, mem_backend, 23)
+            c = test_case.gen_random_output(framework, dtype, mem_backend, 24)
+            contraction = BinaryContraction(test_case.equation, a, b, c=c)
+            reference = get_contraction_ref(test_case.equation, a, b, c=c, alpha=alpha, beta=beta)
+        elif test_case.num_inputs == 3:
+            a, b, c = test_case.gen_input_operands(framework, dtype, mem_backend, 23)
+            d = test_case.gen_random_output(framework, dtype, mem_backend, 24)
+            contraction = TernaryContraction(test_case.equation, a, b, c, d=d)
+            reference = get_contraction_ref(test_case.equation, a, b, c=c, d=d, alpha=alpha, beta=beta)
+        else:
+            raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}")
+
+        tolerance = get_contraction_tolerance(dtype.name, None)
+
+        with contraction:
+            plan_preference = contraction.plan_preference
+            plan_preference.algo = algo.value
+            plan_preference.kernel_rank = kernel_rank.value
+            contraction.plan()
+            result = contraction.execute(alpha=alpha, beta=beta)
+            assert_all_close(result, reference, **tolerance)
+
+    @pytest.mark.parametrize("jit", JitOption.enabled())
+    def test_jit(self, jit, test_case, framework, mem_backend, dtype):
+        alpha, beta = 0.3, 0.4
+        if test_case.num_inputs == 2:
+            a, b = test_case.gen_input_operands(framework, dtype, mem_backend, 23)
+            c = test_case.gen_random_output(framework, dtype, mem_backend, 24)
+            contraction = BinaryContraction(test_case.equation, a, b, c=c)
+            reference = get_contraction_ref(test_case.equation, a, b, c=c, alpha=alpha, beta=beta)
+        elif test_case.num_inputs == 3:
+            a, b, c = test_case.gen_input_operands(framework, dtype, mem_backend, 23)
+            d = test_case.gen_random_output(framework, dtype, mem_backend, 24)
+            contraction = TernaryContraction(test_case.equation, a, b, c, d=d)
+            reference = get_contraction_ref(test_case.equation, a, b, c=c, d=d, alpha=alpha, beta=beta)
+        else:
+            raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}")
+
+        tolerance = get_contraction_tolerance(dtype.name, None)
+
+        with contraction:
+            plan_preference = contraction.plan_preference
+            plan_preference.jit = jit.value
+            contraction.plan()
+            result = contraction.execute(alpha=alpha, beta=beta)
+            assert_all_close(result, reference, **tolerance)
diff --git a/tests/nvmath_tests/tensor/test_stateless_contraction.py b/tests/nvmath_tests/tensor/test_stateless_contraction.py
new file mode 100644
index 0000000..7e0ca10
--- /dev/null
+++ b/tests/nvmath_tests/tensor/test_stateless_contraction.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import itertools +import logging + +import cuda.core.experimental as ccx +import numpy as np + +try: + import cupy as cp +except ImportError: + cp = None +try: + import torch +except ImportError: + torch = None + +from nvmath.bindings import cutensor +from nvmath.internal import tensor_wrapper +from nvmath.memory import _MEMORY_MANAGER +from nvmath.tensor import binary_contraction, ExecutionCUDA, Operator, tensor_qualifiers_dtype + +from .utils.common_axes import Framework, ComputeType, BlockingOption, MemBackend +from .utils.check_helpers import get_contraction_tolerance, assert_all_close +from .utils.data import contraction_test_cases +from .utils.support_matrix import framework_backend_support, framework_type_support +from .utils.base_testers import BaseStatelessTester + + +@pytest.mark.parametrize( + ( + "test_case", + "framework", + "mem_backend", + "dtype", + ), + [ + ( + test_case, + framework, + mem_backend, + dtype, + ) + for test_case in contraction_test_cases + for framework in Framework.enabled() + for mem_backend in framework_backend_support[framework] + for dtype in framework_type_support[framework] + ], +) +class TestStatelessContraction(BaseStatelessTester): + @pytest.mark.parametrize("alpha", [False, 0, 0.3, 0.2 + 0.3j]) + @pytest.mark.parametrize("beta", [False, 0, 0.5, 0.4 + 0.5j]) + @pytest.mark.parametrize("use_offset", [False, True]) + def test_coefficients(self, alpha, beta, use_offset, test_case, framework, mem_backend, dtype): + self._test_coefficients(alpha, beta, use_offset, test_case, framework, mem_backend, dtype) + + @pytest.mark.parametrize("offset_format", ["out", "new", False]) + def test_inplace_output(self, offset_format, test_case, framework, mem_backend, dtype): + self._test_inplace_output(offset_format, test_case, framework, mem_backend, dtype) + + def test_qualifiers(self, test_case, framework, mem_backend, dtype): + for ops in itertools.product([Operator.OP_IDENTITY, Operator.OP_CONJ], repeat=test_case.num_inputs + 1): + qualifiers = np.asarray(ops, dtype=tensor_qualifiers_dtype) + self._test_qualifiers(qualifiers, test_case, framework, mem_backend, dtype) + + @pytest.mark.parametrize("stream", [None, True]) + def test_stream(self, stream, test_case, framework, mem_backend, dtype): + self.run_test(test_case, framework, mem_backend, dtype, 13, use_offset=True, beta=0.6, stream=stream) + + @pytest.mark.parametrize("compute_type", ComputeType) + def test_compute_type(self, compute_type, test_case, framework, mem_backend, dtype): + self._test_compute_type(compute_type, test_case, framework, mem_backend, dtype) + + +@pytest.mark.parametrize( + ( + "framework", + "mem_backend", + ), + [ + ( + framework, + mem_backend, + ) + for framework in Framework.enabled() + for mem_backend in framework_backend_support[framework] + ], +) +class TestMiscellaneous: + def _run_test(self, framework, mem_backend, *, execution=None, options=None): + if isinstance(execution, ExecutionCUDA): + device_id = execution.device_id + else: + device_id = execution.get("device_id", 0) if execution is not None else 0 + if framework == Framework.numpy: + a = np.random.rand(10, 10) + elif framework == Framework.cupy: + with cp.cuda.Device(device_id): + a = cp.random.rand(10, 10) + elif framework == Framework.torch: + if mem_backend == MemBackend.cuda: + a = torch.rand(10, 10, device=f"cuda:{device_id}") + else: + a = torch.rand(10, 10, device="cpu") + result = binary_contraction("ij,jk->ik", a, a, execution=execution, 
options=options) + reference = a @ a + tolerance = get_contraction_tolerance("float32", None) + assert_all_close(result, reference, **tolerance) + + @pytest.mark.parametrize("device_id", range(ccx.system.num_devices)) + def test_execution_device_id(self, framework, mem_backend, device_id): + self._run_test(framework, mem_backend, execution={"name": "cuda", "device_id": device_id}) + + @pytest.mark.parametrize("memory_limit", [1024**2, "1GB", "60%"]) + def test_memory_limit(self, framework, mem_backend, memory_limit): + self._run_test(framework, mem_backend, options={"memory_limit": memory_limit}) + + def test_handle(self, framework, mem_backend): + try: + handle = cutensor.create() + self._run_test(framework, mem_backend, options={"handle": handle}) + finally: + cutensor.destroy(handle) + + @pytest.mark.parametrize("blocking", BlockingOption) + def test_blocking(self, blocking, framework, mem_backend): + self._run_test(framework, mem_backend, options={"blocking": blocking.value}) + + def test_allocator(self, framework, mem_backend): + framework_name = { + Framework.cupy: "cupy", + Framework.torch: "torch", + Framework.numpy: "cuda", + }[framework] + tensor_wrapper.maybe_register_package(framework_name) + BaseAllocatorClass = _MEMORY_MANAGER[framework_name] + + class MockAllocator(BaseAllocatorClass): + def __init__(self, device_id, logger): + super().__init__(device_id, logger) + self.counter = 0 + + def memalloc(self, size, *args, **kwargs): + self.counter += 1 + return super().memalloc(size, *args, **kwargs) + + for cls in [BaseAllocatorClass, MockAllocator]: + allocator = cls(0, logging.getLogger()) + self._run_test(framework, mem_backend, options={"allocator": allocator}) diff --git a/tests/nvmath_tests/tensor/utils/__init__.py b/tests/nvmath_tests/tensor/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/nvmath_tests/tensor/utils/axes_utils.py b/tests/nvmath_tests/tensor/utils/axes_utils.py new file mode 100644 index 0000000..188a233 --- /dev/null +++ b/tests/nvmath_tests/tensor/utils/axes_utils.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np + +try: + import cupy as cp + + CP_NDARRAY = cp.ndarray +except ImportError: + cp = CP_NDARRAY = None +try: + import torch + + TORCH_TENSOR = torch.Tensor +except ImportError: + torch = TORCH_TENSOR = None + +from .common_axes import Framework, DType + + +numpy_dtype = { + DType.float16: np.float16, + DType.float32: np.float32, + DType.float64: np.float64, + DType.complex64: np.complex64, + DType.complex128: np.complex128, +} + +if cp is None: + cupy_dtype = {} +else: + cupy_dtype = { + DType.float16: cp.float16, + DType.float32: cp.float32, + DType.float64: cp.float64, + DType.complex64: cp.complex64, + DType.complex128: cp.complex128, + } + +if torch is not None: + torch_dtype = { + DType.float16: torch.float16, + DType.bfloat16: torch.bfloat16, + DType.float32: torch.float32, + DType.float64: torch.float64, + DType.complex32: torch.complex32, + DType.complex64: torch.complex64, + DType.complex128: torch.complex128, + } +else: + torch_dtype = {} + +framework_dtype = { + Framework.numpy: numpy_dtype, + Framework.cupy: cupy_dtype, + Framework.torch: torch_dtype, +} + + +def is_complex(dtype: DType): + assert isinstance(dtype, DType) + return dtype in [DType.complex32, DType.complex64, DType.complex128] + + +def get_framework_dtype(framework: Framework, dtype: DType): + return framework_dtype[framework][dtype] + + +def get_framework_module(framework: Framework): + if framework == Framework.numpy: + return np + elif framework == Framework.cupy: + return cp + elif framework == Framework.torch: + return torch + else: + raise ValueError(f"Unknown framework {framework}") diff --git a/tests/nvmath_tests/tensor/utils/base_testers.py b/tests/nvmath_tests/tensor/utils/base_testers.py new file mode 100644 index 0000000..c7e0b67 --- /dev/null +++ b/tests/nvmath_tests/tensor/utils/base_testers.py @@ -0,0 +1,244 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from contextlib import nullcontext + +import pytest + +from .axes_utils import is_complex +from .common_axes import MemBackend +from .input_fixtures import get_custom_stream + +from nvmath.tensor import ( + binary_contraction, + ternary_contraction, + ContractionOptions, + BinaryContraction, + TernaryContraction, + Operator, +) +from .check_helpers import assert_all_close, get_contraction_ref, get_contraction_tolerance +from .support_matrix import compute_type_support + + +class BaseStatelessTester: + def _test_coefficients(self, alpha, beta, use_offset, test_case, framework, mem_backend, dtype): + if (use_offset and beta is False) or (not use_offset and beta is not False): + context = pytest.raises(ValueError) + elif not is_complex(dtype) and (isinstance(alpha, complex) or isinstance(beta, complex)): + context = pytest.raises(TypeError) + else: + context = nullcontext() + + coeffs = {} + if alpha is not False: + coeffs["alpha"] = alpha + if beta is not False: + coeffs["beta"] = beta + self.run_test(test_case, framework, mem_backend, dtype, 23, use_offset=use_offset, context=context, **coeffs) + + def _test_inplace_output(self, offset_format, test_case, framework, mem_backend, dtype): + kwargs = { + "alpha": 0.3, + "beta": 0.5, + } + + offset_name = "c" if test_case.num_inputs == 2 else "d" + + out = test_case.gen_random_output(framework, dtype, mem_backend, 3) + + if offset_format == "out": + kwargs[offset_name] = out + elif offset_format == "new": + kwargs[offset_name] = test_case.gen_random_output(framework, dtype, mem_backend, 7) + elif offset_format is False: + kwargs[offset_name] = None + kwargs["beta"] = None + else: + raise ValueError(f"Invalid offset_format: {offset_format}") + + self.run_test(test_case, framework, mem_backend, dtype, 23, out=out, **kwargs) + + def _test_qualifiers(self, qualifiers, test_case, framework, mem_backend, dtype): + if not is_complex(dtype) and any(op == Operator.OP_CONJ for op in qualifiers): + context = pytest.raises(ValueError) + elif qualifiers[test_case.num_inputs] != Operator.OP_IDENTITY: + context = pytest.raises(ValueError) # output operand must be the identity operator + else: + context = nullcontext() + self.run_test(test_case, framework, mem_backend, dtype, 23, context=context, qualifiers=qualifiers) + + def _test_compute_type(self, compute_type, test_case, framework, mem_backend, dtype): + if compute_type in compute_type_support[dtype]: + context = nullcontext() + else: + context = pytest.raises(ValueError) + compute_type = compute_type.value + self.run_test(test_case, framework, mem_backend, dtype, 11, context=context, options={"compute_type": compute_type}) + + def _parse_operands(self, test_case, framework, mem_backend, dtype, seed, use_offset, c=None, d=None, out=None, **kwargs): + if test_case.num_inputs == 2: + a, b = test_case.gen_input_operands(framework, dtype, mem_backend, seed) + if use_offset: + assert c is None, "c cannot be provided if use_offset is True" + c = test_case.gen_random_output(framework, dtype, mem_backend, seed + 1) + elif test_case.num_inputs == 3: + assert c is None, "c can not be provided as a keyword argument for ternary contraction" + a, b, c = test_case.gen_input_operands(framework, dtype, mem_backend, seed) + if use_offset: + assert d is None, "d cannot be provided if use_offset is True" + d = test_case.gen_random_output(framework, dtype, mem_backend, seed + 1) + else: + raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}") + return a, b, c, d, out 
+ + def _parse_options(self, options): + options = {} if options is None else options + if isinstance(options, ContractionOptions): + blocking = options.blocking + compute_type = options.compute_type + else: + blocking = "auto" + compute_type = options.get("compute_type", None) + return blocking, compute_type + + def run_test( + self, + test_case, + framework, + mem_backend, + dtype, + seed, + *, + use_offset=False, + context=None, + stream=None, + options=None, + **kwargs, + ): + if context is None: + context = nullcontext() + + if stream is True: + stream = get_custom_stream(framework) + + a, b, c, d, out = self._parse_operands(test_case, framework, mem_backend, dtype, seed, use_offset, **kwargs) + for key in ["c", "d", "out"]: + kwargs.pop(key, None) + + blocking, compute_type = self._parse_options(options) + sync_needed = blocking == "auto" and mem_backend == MemBackend.cuda and stream is not None + + tolerance = get_contraction_tolerance(dtype.name, compute_type) + + with context: + # reference must be computed first as out may be modified + # by the contraction, e.g, when c is the same as out + ref = get_contraction_ref(test_case.equation, a, b, c=c, d=d, **kwargs) + if test_case.num_inputs == 2: + result = binary_contraction(test_case.equation, a, b, c=c, stream=stream, options=options, out=out, **kwargs) + elif test_case.num_inputs == 3: + result = ternary_contraction( + test_case.equation, a, b, c, d=d, stream=stream, options=options, out=out, **kwargs + ) + else: + raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}") + if sync_needed: + # stream is guaranteed to be either a + # cupy.cuda.Stream or a torch.cuda.Stream object + stream.synchronize() + + assert_all_close(result, ref, **tolerance) + if out is not None: + assert result is out + + +class BaseStatefulTester: + _parse_options = BaseStatelessTester._parse_options + _parse_operands = BaseStatelessTester._parse_operands + _test_coefficients = BaseStatelessTester._test_coefficients + + def run_test( + self, + test_case, + framework, + mem_backend, + dtype, + seed, + *, + use_offset=False, + context=None, + stream=None, + options=None, + plan_preferences=None, + **kwargs, + ): + if context is None: + context = nullcontext() + + if stream is True: + stream = get_custom_stream(framework) + + a, b, c, d, out = self._parse_operands(test_case, framework, mem_backend, dtype, seed, use_offset, **kwargs) + for key in ["c", "d", "out"]: + kwargs.pop(key, None) + + blocking, compute_type = self._parse_options(options) + sync_needed = blocking == "auto" and mem_backend == MemBackend.cuda and stream is not None + + tolerance = get_contraction_tolerance(dtype.name, compute_type) + + if test_case.num_inputs == 2: + contraction = BinaryContraction(test_case.equation, a, b, c=c, stream=stream, options=options, out=out) + elif test_case.num_inputs == 3: + contraction = TernaryContraction(test_case.equation, a, b, c, d=d, stream=stream, options=options, out=out) + else: + raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}") + + with context: + ref = get_contraction_ref(test_case.equation, a, b, c=c, d=d, **kwargs) + with contraction: + contraction.plan() + result = contraction.execute(**kwargs, stream=stream) + if sync_needed: + # stream is guaranteed to be either a cupy.cuda.Stream + # or a torch.cuda.Stream object + stream.synchronize() + assert_all_close(result, ref, **tolerance) + if out is not None: + assert result is out + + if plan_preferences is not None: + preference = contraction.plan_preference + for 
key, value in plan_preferences.items(): + setattr(preference, key, value) + contraction.plan() + if test_case.num_inputs == 2: + a, b = test_case.gen_input_operands(framework, dtype, mem_backend, seed + 23) + if c is not None: + c = test_case.gen_random_output(framework, dtype, mem_backend, seed + 24) + elif test_case.num_inputs == 3: + a, b, c = test_case.gen_input_operands(framework, dtype, mem_backend, seed + 23) + if d is not None: + d = test_case.gen_random_output(framework, dtype, mem_backend, seed + 24) + else: + raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}") + + kwargs["alpha"] = -0.3 * kwargs.get("alpha", 1.0) + 0.2 + if "beta" in kwargs: + # NOTE: beta can only be updated/specified when offset is specified + kwargs["beta"] = -0.4 * kwargs["beta"] + 0.1 if kwargs["beta"] is not None else None + ref = get_contraction_ref(test_case.equation, a, b, c=c, d=d, **kwargs) + if test_case.num_inputs == 2: + contraction.reset_operands(a=a, b=b, c=c) + elif test_case.num_inputs == 3: + contraction.reset_operands(a=a, b=b, c=c, d=d) + else: + raise ValueError(f"Invalid number of inputs: {test_case.num_inputs}") + result = contraction.execute(**kwargs, stream=stream) + if sync_needed: + stream.synchronize() + assert_all_close(result, ref, **tolerance) + if out is not None: + assert result is out diff --git a/tests/nvmath_tests/tensor/utils/check_helpers.py b/tests/nvmath_tests/tensor/utils/check_helpers.py new file mode 100644 index 0000000..8e0f79b --- /dev/null +++ b/tests/nvmath_tests/tensor/utils/check_helpers.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import importlib +import numpy as np +from collections.abc import Sequence + +try: + import cupy as cp + + CP_NDARRAY = cp.ndarray +except ImportError: + cp = CP_NDARRAY = None + +try: + import torch +except ImportError: + torch = None + + +from nvmath.internal import tensor_wrapper +from nvmath.tensor import ComputeDesc, Operator + +from .axes_utils import TORCH_TENSOR + + +def get_contraction_ref( + eq: str, + a: np.ndarray | CP_NDARRAY | TORCH_TENSOR, + b: np.ndarray | CP_NDARRAY | TORCH_TENSOR, + *, + c: np.ndarray | CP_NDARRAY | TORCH_TENSOR | None = None, + d: np.ndarray | CP_NDARRAY | TORCH_TENSOR | None = None, + alpha: float = 1.0, + beta: float | None = None, + qualifiers: Sequence[Operator] = [], +): + num_inputs = eq.count(",") + 1 + if len(qualifiers) == 0: + qualifiers = [Operator.OP_IDENTITY] * (num_inputs + 1) + else: + assert len(qualifiers) == num_inputs + 1, f"The qualifiers must be a sequence of length {num_inputs + 1}" + if num_inputs == 2: + iterator = zip([a, b], qualifiers[:num_inputs], strict=False) + if c is None and beta is not None: + raise ValueError("beta can only be set if c is specified in a binary contraction") + elif c is not None and beta is None: + raise ValueError("beta must be set when c is specified in a binary contraction") + else: + iterator = zip([a, b, c], qualifiers[:num_inputs], strict=False) + if d is None and beta is not None: + raise ValueError("beta can only be set if d is specified in a ternary contraction") + elif d is not None and beta is None: + raise ValueError("beta must be set when d is specified in a ternary contraction") + operands = [] + for op, qualifier in iterator: + if qualifier not in {Operator.OP_IDENTITY, Operator.OP_CONJ}: + raise ValueError(f"Invalid operator: {qualifier}") + if op is not None: + if qualifier == Operator.OP_CONJ: + op = op.conj() + 
+            operands.append(op)
+
+    offset = None
+    match num_inputs:
+        case 2:
+            offset = c
+            assert d is None, "d cannot be set for binary contractions"
+        case 3:
+            assert c is not None, "c must be set for ternary contractions"
+            offset = d
+        case _:
+            raise ValueError(f"Invalid number of inputs: {num_inputs}")
+
+    # make sure operands are compatible (package, device_id)
+    wrapped_operands = tensor_wrapper.wrap_operands(operands)
+    package = wrapped_operands[0].name
+    module = importlib.import_module(package)
+    output = module.einsum(eq, *operands) * alpha
+    if offset is not None:
+        output = output + offset * beta
+    return output
+
+
+dtype_names = [
+    "float16",
+    "float32",
+    "float64",
+    "complex64",
+    "complex128",
+]
+machine_epsilon_values = [np.finfo(dtype).eps for dtype in dtype_names]
+
+rtol_mapper = dict(zip(dtype_names, [np.sqrt(m_eps) for m_eps in machine_epsilon_values], strict=False))
+
+atol_mapper = dict(zip(dtype_names, [10 * m_eps for m_eps in machine_epsilon_values], strict=False))
+
+
+def get_contraction_tolerance(dtype_name, compute_type):
+    if compute_type == ComputeDesc.COMPUTE_32F() and dtype_name in {"float64", "complex128"}:
+        return {"atol": atol_mapper["float32"], "rtol": rtol_mapper["float32"]}
+    elif compute_type in {ComputeDesc.COMPUTE_16F(), ComputeDesc.COMPUTE_16BF()}:
+        return {"atol": atol_mapper["float16"], "rtol": rtol_mapper["float16"]}
+    else:
+        tolerance = {"atol": atol_mapper[dtype_name], "rtol": rtol_mapper[dtype_name]}
+        if compute_type in {ComputeDesc.COMPUTE_TF32(), ComputeDesc.COMPUTE_3XTF32()}:
+            tolerance["rtol"] *= 100
+            tolerance["atol"] *= 100
+        return tolerance
+
+
+def assert_all_close(a, b, rtol, atol):
+    if isinstance(a, np.ndarray):
+        return np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
+    elif CP_NDARRAY is not None and isinstance(a, CP_NDARRAY):
+        return cp.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
+    elif TORCH_TENSOR is not None and isinstance(a, TORCH_TENSOR):
+        return torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
+    else:
+        raise ValueError(f"Unknown array type {a}")
diff --git a/tests/nvmath_tests/tensor/utils/common_axes.py b/tests/nvmath_tests/tensor/utils/common_axes.py
new file mode 100644
index 0000000..24d919f
--- /dev/null
+++ b/tests/nvmath_tests/tensor/utils/common_axes.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from enum import Enum
+
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
+try:
+    import torch
+except ImportError:
+    torch = None
+
+import cuda.core.experimental as ccx
+
+from nvmath.tensor import ComputeDesc, ContractionJitMode, ContractionAlgo
+
+
+class Framework(Enum):
+    numpy = 1
+    cupy = 2
+    torch = 3
+
+    @classmethod
+    def enabled(cls):
+        yield cls.numpy
+        if cp is not None:
+            yield cls.cupy
+        if torch is not None:
+            yield cls.torch
+
+
+class MemBackend(Enum):
+    cuda = 1
+    cpu = 2
+
+
+class DType(Enum):
+    float16 = 100
+    bfloat16 = 101
+    float32 = 102
+    float64 = 103
+
+    complex32 = 200
+    complex64 = 201
+    complex128 = 202
+
+
+class ComputeType(Enum):
+    float16 = ComputeDesc.COMPUTE_16F()
+    bfloat16 = ComputeDesc.COMPUTE_16BF()
+    float32 = ComputeDesc.COMPUTE_32F()
+    float64 = ComputeDesc.COMPUTE_64F()
+
+    tf32 = ComputeDesc.COMPUTE_TF32()
+    three_xtf32 = ComputeDesc.COMPUTE_3XTF32()
+
+
+class BlockingOption(Enum):
+    true = True
+    auto = "auto"
+
+
+class JitOption(Enum):
+    off = ContractionJitMode.NONE
+    on = ContractionJitMode.DEFAULT
+
+    @classmethod
+    def enabled(cls):
+        yield cls.off
+        # https://docs.nvidia.com/cuda/cutensor/latest/api/types.html#_CPPv4N17cutensorJitMode_t25CUTENSOR_JIT_MODE_DEFAULTE # noqa
+        # Only supported for GPUs with compute capability >= 8.0
+        device = ccx.Device()
+
+        if device.compute_capability.major >= 8:
+            yield cls.on
+        del device
+
+
+class AlgoOption(Enum):
+    default_patient = ContractionAlgo.DEFAULT_PATIENT
+    gett = ContractionAlgo.GETT
+    tgett = ContractionAlgo.TGETT
+    ttgt = ContractionAlgo.TTGT
+    default = ContractionAlgo.DEFAULT
+
+
+class KernelRankOption(Enum):
+    zero = 0
+    one = 1
diff --git a/tests/nvmath_tests/tensor/utils/data.py b/tests/nvmath_tests/tensor/utils/data.py
new file mode 100644
index 0000000..adc637b
--- /dev/null
+++ b/tests/nvmath_tests/tensor/utils/data.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from collections.abc import Sequence
+
+import opt_einsum as oe
+
+from .common_axes import Framework, DType, MemBackend
+from .input_fixtures import get_random_input_data
+
+
+class ContractionTestCase:
+    def __init__(self, equation: str, shapes: Sequence[Sequence[int]]):
+        # normalize the equation using opt_einsum to handle the ellipses
+        if "..." in equation:
+            info = oe.contract_path(equation, *shapes, shapes=True)[1]
+            equation = info.eq
+        else:
+            equation = equation
+        self._equation = equation
+        self._shapes = shapes
+
+    @property
+    def equation(self):
+        return self._equation
+
+    @property
+    def shapes(self):
+        return self._shapes
+
+    @property
+    def num_inputs(self):
+        return len(self.shapes)
+
+    def gen_input_operands(self, framework: Framework, dtype: DType, mem_backend: MemBackend, seed: int):
+        operands = []
+        for i, shape in enumerate(self.shapes):
+            operands.append(get_random_input_data(framework, shape, dtype, mem_backend, seed + i))
+        return operands
+
+    def _get_output_shape(self):
+        if "->" in self.equation:
+            output_str = self.equation.split("->")[1]
+        else:
+            output_str = oe.parser.find_output_str(self.equation)
+        inputs = self.equation.split("->")[0].split(",")
+        output_shape = oe.parser.find_output_shape(inputs, self.shapes, output_str)
+        return output_shape
+
+    def gen_random_output(self, framework: Framework, dtype: DType, mem_backend: MemBackend, seed: int):
+        output_shape = self._get_output_shape()
+        return get_random_input_data(framework, output_shape, dtype, mem_backend, seed)
+
+
+contraction_test_cases = (
+    # binary tensor contraction
+    ContractionTestCase(equation="ij,jk->ik", shapes=[(2, 3), (3, 4)]),
+    ContractionTestCase(equation="a,a->", shapes=[(4,), (4,)]),
+    ContractionTestCase(equation="ax,a->ax", shapes=[(3, 5), (3,)]),
+    ContractionTestCase(equation="ac,bd->bcda", shapes=[(2, 3), (4, 1)]),
+    ContractionTestCase(equation="...,...->...", shapes=[(2, 3, 4), (2, 3, 4)]),
+    # ternary tensor contraction
+    ContractionTestCase(equation="ijkl,klmn,mnp->ijp", shapes=[(2, 2, 4, 5), (4, 5, 2, 3), (2, 3, 3)]),
+    ContractionTestCase(equation="...,...,...->...", shapes=[(2, 4), (2, 4), (2, 4)]),
+    ContractionTestCase(equation="...,...,ab->a", shapes=[(2, 4), (2, 4), (3, 5)]),
+    ContractionTestCase(equation="abc,bc,x->", shapes=[(2, 3, 4), (3, 4), (5,)]),
+    ContractionTestCase(equation="a,b,cd->abcd", shapes=[(2,), (3,), (5, 4)]),
+)
diff --git a/tests/nvmath_tests/tensor/utils/input_fixtures.py b/tests/nvmath_tests/tensor/utils/input_fixtures.py
new file mode 100644
index 0000000..936a897
--- /dev/null
+++ b/tests/nvmath_tests/tensor/utils/input_fixtures.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import contextlib
+
+import numpy as np
+
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+try:
+    import torch
+except ImportError:
+    torch = None
+
+import cuda.core.experimental as ccx
+
+
+from .common_axes import MemBackend, Framework, DType
+from .axes_utils import get_framework_dtype, is_complex
+
+
+def get_random_input_data(
+    framework: Framework,
+    shape: int | tuple[int],
+    dtype: DType,
+    mem_backend: MemBackend,
+    seed: int,
+    lo: float = -0.5,
+    hi: float = 0.5,
+    device_id=None,
+):
+    assert lo < hi
+    framework_dtype = get_framework_dtype(framework, dtype)
+    if framework in [Framework.numpy, Framework.cupy]:
+
+        def _create_array():
+            if framework == Framework.numpy:
+                assert mem_backend == MemBackend.cpu
+                rng = np.random.default_rng(seed)
+            else:
+                assert mem_backend == MemBackend.cuda
+                rng = cp.random.default_rng(seed)
+            if not is_complex(dtype):
+                a = rng.uniform(lo, hi, size=shape).astype(framework_dtype)
+            else:
+                real = rng.uniform(lo, hi, size=shape)
+                imag = rng.uniform(lo, hi, size=shape)
+                a = (real + 1j * imag).astype(framework_dtype)
+            if len(shape) == 0:
+                # real + 1j * imag will convert this to a scalar object,
+                # not a ndarray, here we convert it back to a ndarray
+                if framework == Framework.numpy:
+                    a = np.array(a)
+                else:
+                    a = cp.array(a)
+            assert a.dtype == framework_dtype, f"{a.dtype} vs {framework_dtype}"
+            assert a.shape == shape, f"{a.shape} vs {shape}"
+            return a
+
+        if mem_backend == MemBackend.cuda and device_id is not None:
+            with get_framework_device_ctx(device_id, framework):
+                return _create_array()
+        else:
+            return _create_array()
+
+    elif framework == Framework.torch:
+        if mem_backend == MemBackend.cpu:
+            device = "cpu"
+        elif device_id is not None:
+            device = f"cuda:{device_id}"
+        else:
+            device = "cuda"
+        g = torch.Generator(device=device)
+        g = g.manual_seed(seed)
+        t = torch.rand(size=shape, generator=g, device=device, dtype=framework_dtype)
+        scale = torch.tensor(hi - lo, dtype=framework_dtype)
+        if not is_complex(dtype):
+            shift = torch.tensor(lo, dtype=framework_dtype)
+        else:
+            shift = torch.tensor(lo + 1j * lo, dtype=framework_dtype)
+        t = t.mul_(scale).add_(shift)
+        assert t.dtype == framework_dtype
+        return t
+    else:
+        raise ValueError(f"Unknown framework {framework}")
+
+
+def get_custom_stream(framework: Framework, device_id=None, is_numpy_stream_oriented=False):
+    if framework == Framework.numpy:
+        if is_numpy_stream_oriented:
+            old_device = ccx.Device()
+            device = ccx.Device(device_id)
+            try:
+                device.set_current()
+                return device.create_stream()
+            finally:
+                old_device.set_current()
+        else:
+            return None
+    elif framework == Framework.cupy:
+        if device_id is None:
+            return cp.cuda.Stream(non_blocking=True)
+        else:
+            with get_framework_device_ctx(device_id, framework):
+                return cp.cuda.Stream(non_blocking=True)
+    elif framework == Framework.torch:
+        device = None if device_id is None else f"cuda:{device_id}"
+        return torch.cuda.Stream(device=device)
+    else:
+        raise ValueError(f"Unknown GPU framework {framework}")
+
+
+def get_framework_device_ctx(device_id: int, framework: Framework):
+    if framework == Framework.numpy:
+        return contextlib.nullcontext()
+    elif framework == Framework.cupy:
+        return cp.cuda.Device(device_id)
+    elif framework == Framework.torch:
+        return torch.cuda.device(device_id)
diff --git a/tests/nvmath_tests/tensor/utils/support_matrix.py b/tests/nvmath_tests/tensor/utils/support_matrix.py
new file mode 100644
index 0000000..0a3b337
--- /dev/null
+++ b/tests/nvmath_tests/tensor/utils/support_matrix.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from packaging.version import Version
+
+try:
+    import torch
+except ModuleNotFoundError:
+
+    class torch:
+        __version__ = "0.0.0"
+
+
+from .common_axes import (
+    Framework,
+    MemBackend,
+    DType,
+    ComputeType,
+)
+
+
+framework_backend_support = {
+    Framework.cupy: [MemBackend.cuda],
+    Framework.numpy: [MemBackend.cpu],
+    Framework.torch: [MemBackend.cpu, MemBackend.cuda],
+}
+
+framework_type_support = {
+    Framework.cupy: [
+        DType.float16,
+        DType.float32,
+        DType.float64,
+        DType.complex64,
+        DType.complex128,
+    ],
+    Framework.numpy: [
+        DType.float16,
+        DType.float32,
+        DType.float64,
+        DType.complex64,
+        DType.complex128,
+    ],
+    Framework.torch: [
+        *((DType.float16,) if Version(torch.__version__) >= Version("2.2.0") else ()),
+        DType.float32,
+        DType.float64,
+        DType.complex64,
+        DType.complex128,
+    ],
+}
+
+compute_type_support = {
+    DType.float16: [
+        ComputeType.float32,
+    ],
+    DType.bfloat16: [
+        ComputeType.float32,
+    ],
+    DType.float32: [
+        ComputeType.float32,
+        ComputeType.tf32,
+        ComputeType.three_xtf32,
+        ComputeType.float16,
+        ComputeType.bfloat16,
+    ],
+    DType.float64: [
+        ComputeType.float64,
+        ComputeType.float32,
+    ],
+    DType.complex64: [
+        ComputeType.float32,
+        ComputeType.tf32,
+        ComputeType.three_xtf32,
+    ],
+    DType.complex128: [
+        ComputeType.float64,
+        ComputeType.float32,
+    ],
+}