From 7557967b2ddc43bcb04cea633ac22de732e0d484 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 27 Sep 2025 04:51:20 +0000
Subject: [PATCH 01/27] set up build system for targeting different cuda-bindings major versions

---
 cuda_core/build_hooks.py | 59 ++++++++++++++++++++++++++++++++++++++++
 cuda_core/pyproject.toml |  3 +-
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 cuda_core/build_hooks.py

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
new file mode 100644
index 000000000..71354e57c
--- /dev/null
+++ b/cuda_core/build_hooks.py
@@ -0,0 +1,59 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# This module implements basic PEP 517 backend support, see e.g.
+# - https://peps.python.org/pep-0517/
+# - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks
+# Specifically, there are 5 APIs required to create a proper build backend, see below.
+# For now it's mostly a pass-through to setuptools, except that we need to determine
+# some dependencies at build time.
+#
+# TODO: also implement PEP-660 API hooks
+
+import os
+import re
+import subprocess  # nosec: B404
+
+from setuptools import build_meta as _build_meta
+
+prepare_metadata_for_build_wheel = _build_meta.prepare_metadata_for_build_wheel
+build_wheel = _build_meta.build_wheel
+build_sdist = _build_meta.build_sdist
+get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist
+
+
+def _get_proper_cuda_bindings_major_version() -> str:
+    # for local development (with/without build isolation)
+    try:
+        import cuda.bindings
+
+        return cuda.bindings.__version__.split(".")[0]
+    except ImportError:
+        pass
+
+    # for custom overwrite, e.g. in CI
+    cuda_major = os.environ.get("CUDA_CORE_BUILD_MAJOR")
+    if cuda_major is not None:
+        return cuda_major
+
+    # also for local development
+    try:
+        out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True)  # nosec: B603, B607
+        m = re.search(r"CUDA Version:\s*([\d\.]+)", out.stdout.decode())
+        if m:
+            return m.group(1).split(".")[0]
+    except FileNotFoundError:
+        # the build machine has no driver installed
+        pass
+
+    # default fallback
+    return "13"
+
+
+# Note: this function returns a list of *build-time* dependencies, so it's not affected
+# by "--no-deps" based on the PEP-517 design.
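+#
+# For illustration (assuming a machine where CUDA 13 is detected): a plain
+# `pip wheel cuda_core/` first calls get_requires_for_build_wheel() below, installs
+# the returned extra requirement "cuda-bindings==13.*" into the isolated build
+# environment (on top of the static requires in pyproject.toml), and only then
+# invokes build_wheel().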
+def get_requires_for_build_wheel(config_settings=None): + cuda_major = _get_proper_cuda_bindings_major_version() + cuda_bindings_require = [f"cuda-bindings=={cuda_major}.*"] + return _build_meta.get_requires_for_build_wheel(config_settings) + cuda_bindings_require diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 9e3be132f..27fa5ce19 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -4,7 +4,8 @@ [build-system] requires = ["setuptools>=77.0.0", "Cython>=3.1"] -build-backend = "setuptools.build_meta" +build-backend = "build_hooks" +backend-path = ["."] [project] From 19765972a528d318db76dca3fc843decea4a6566 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 27 Sep 2025 05:53:13 +0000 Subject: [PATCH 02/27] defer cythonization until cuda-bindings is installed --- cuda_core/build_hooks.py | 44 +++++++++++++++++++++++++++++++++++----- cuda_core/setup.py | 29 +++----------------------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 71354e57c..fd1692ad3 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -6,23 +6,25 @@ # - https://peps.python.org/pep-0517/ # - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks # Specifically, there are 5 APIs required to create a proper build backend, see below. -# For now it's mostly a pass-through to setuptools, except that we need to determine -# some dependencies at build time. # # TODO: also implement PEP-660 API hooks +import functools +import glob import os import re import subprocess # nosec: B404 +from Cython.Build import cythonize +from setuptools import Extension from setuptools import build_meta as _build_meta prepare_metadata_for_build_wheel = _build_meta.prepare_metadata_for_build_wheel -build_wheel = _build_meta.build_wheel build_sdist = _build_meta.build_sdist get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist +@functools.cache def _get_proper_cuda_bindings_major_version() -> str: # for local development (with/without build isolation) try: @@ -51,8 +53,40 @@ def _get_proper_cuda_bindings_major_version() -> str: return "13" -# Note: this function returns a list of *build-time* dependencies, so it's not affected -# by "--no-deps" based on the PEP-517 design. +# used later by setup() +_extensions = None + + +def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): + # Customizing this hook is needed because we must defer cythonization until cuda-bindings, + # now a required build-time dependency that's dynamically installed via the other hook below, + # is installed. Otherwise, cimport any cuda.bindings modules would fail! + + # It seems setuptools' wildcard support has problems for namespace packages, + # so we explicitly spell out all Extension instances. 
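+    # (illustration, not exhaustive) e.g. the glob + name-stripping logic below turns
+    # cuda/core/experimental/_stream.pyx into
+    # Extension("cuda.core.experimental._stream", sources=["cuda/core/experimental/_stream.pyx"], ...)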
+ root_module = "cuda.core.experimental" + root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep + ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) + + def strip_prefix_suffix(filename): + return filename[len(root_path) : -4] + + module_names = (strip_prefix_suffix(f) for f in ext_files) + ext_modules = tuple( + Extension( + f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", + sources=[f"cuda/core/experimental/{mod}.pyx"], + language="c++", + ) + for mod in module_names + ) + + global _extensions + _extensions = cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}) + + return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) + + def get_requires_for_build_wheel(config_settings=None): cuda_major = _get_proper_cuda_bindings_major_version() cuda_bindings_require = [f"cuda-bindings=={cuda_major}.*"] diff --git a/cuda_core/setup.py b/cuda_core/setup.py index d93eec45d..4a501edc1 100644 --- a/cuda_core/setup.py +++ b/cuda_core/setup.py @@ -2,38 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 -import glob import os -from Cython.Build import cythonize -from setuptools import Extension, setup +import build_hooks # our build backend +from setuptools import setup from setuptools.command.build_ext import build_ext as _build_ext nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2)) -# It seems setuptools' wildcard support has problems for namespace packages, -# so we explicitly spell out all Extension instances. -root_module = "cuda.core.experimental" -root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep -ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) - - -def strip_prefix_suffix(filename): - return filename[len(root_path) : -4] - - -module_names = (strip_prefix_suffix(f) for f in ext_files) -ext_modules = tuple( - Extension( - f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - language="c++", - ) - for mod in module_names -) - - class build_ext(_build_ext): def build_extensions(self): self.parallel = nthreads @@ -41,7 +18,7 @@ def build_extensions(self): setup( - ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}), + ext_modules=build_hooks._extensions, cmdclass={ "build_ext": build_ext, }, From 67db25e9ea4ce36712f876f03a8f7b88dbc87e7e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 02:17:58 +0000 Subject: [PATCH 03/27] cythonize stream module --- .../{_launcher.py => _launcher.pyx} | 8 +- cuda_core/cuda/core/experimental/_stream.pxd | 9 ++ cuda_core/cuda/core/experimental/_stream.pyx | 85 +++++++++++++------ 3 files changed, 72 insertions(+), 30 deletions(-) rename cuda_core/cuda/core/experimental/{_launcher.py => _launcher.pyx} (93%) create mode 100644 cuda_core/cuda/core/experimental/_stream.pxd diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.pyx similarity index 93% rename from cuda_core/cuda/core/experimental/_launcher.py rename to cuda_core/cuda/core/experimental/_launcher.pyx index 2d0c274c7..ae808be89 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -2,12 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport uintptr_t + +from cuda.core.experimental._stream cimport _try_to_get_stream_ptr + from typing import Union from 
cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import IsStreamT, Stream, _try_to_get_stream_ptr +from cuda.core.experimental._stream import IsStreamT, Stream from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, @@ -60,7 +64,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne stream_handle = stream.handle except AttributeError: try: - stream_handle = _try_to_get_stream_ptr(stream) + stream_handle = driver.CUstream((_try_to_get_stream_ptr(stream))) except Exception: raise ValueError( f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd new file mode 100644 index 000000000..f7d97de33 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + + +cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index a2c1a90b9..284831cd6 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -4,10 +4,16 @@ from __future__ import annotations +from libc.stdint cimport uintptr_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, ) + import sys import cython @@ -59,7 +65,7 @@ class IsStreamT(Protocol): ... -def _try_to_get_stream_ptr(obj: IsStreamT): +cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*: try: cuda_stream_attr = obj.__cuda_stream__ except AttributeError: @@ -86,7 +92,7 @@ def _try_to_get_stream_ptr(obj: IsStreamT): raise RuntimeError( f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" ) - return driver.CUstream(info[1]) + return (info[1]) cdef class Stream: @@ -108,7 +114,7 @@ cdef class Stream: """ cdef: - object _handle + cydriver.CUstream _handle object _owner object _builtin object _nonblocking @@ -116,6 +122,9 @@ cdef class Stream: object _device_id object _ctx_handle + def __cinit__(self, *args, **kwargs): + self._handle = (NULL) + def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. 
" @@ -125,7 +134,7 @@ cdef class Stream: @classmethod def _legacy_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = driver.CUstream(driver.CU_STREAM_LEGACY) + self._handle = (cydriver.CU_STREAM_LEGACY) self._owner = None self._builtin = True self._nonblocking = None # delayed @@ -137,7 +146,7 @@ cdef class Stream: @classmethod def _per_thread_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = driver.CUstream(driver.CU_STREAM_PER_THREAD) + self._handle = (cydriver.CU_STREAM_PER_THREAD) self._owner = None self._builtin = True self._nonblocking = None # delayed @@ -149,7 +158,6 @@ cdef class Stream: @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) - self._handle = None self._owner = None self._builtin = False @@ -169,16 +177,20 @@ cdef class Stream: nonblocking = opts.nonblocking priority = opts.priority - flags = driver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else driver.CUstream_flags.CU_STREAM_DEFAULT - err, high, low = driver.cuCtxGetStreamPriorityRange() - raise_if_driver_error(err) + flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + # TODO: use HANDLE_RETURN + cdef int high, low + err = cydriver.cuCtxGetStreamPriorityRange(&high, &low) if priority is not None: if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: priority = high - self._handle = handle_return(driver.cuStreamCreateWithPriority(flags, priority)) + cdef cydriver.CUstream s + # TODO: add HANDLE_RETURN macro to check driver error code? + err = cydriver.cuStreamCreateWithPriority(&s, flags, priority) + self._handle = s self._owner = None self._nonblocking = nonblocking self._priority = priority @@ -195,10 +207,11 @@ cdef class Stream: if self._owner is None: if self._handle and not self._builtin: - handle_return(driver.cuStreamDestroy(self._handle)) + # TODO: use HANDLE_RETURN + err = cydriver.cuStreamDestroy(self._handle) else: self._owner = None - self._handle = None + self._handle = (NULL) cpdef close(self): """Destroy the stream. @@ -222,14 +235,16 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. 
""" - return self._handle + return driver.CUstream((self._handle)) @property def is_nonblocking(self) -> bool: """Return True if this is a nonblocking stream, otherwise False.""" + cdef unsigned int flags if self._nonblocking is None: - flag = handle_return(driver.cuStreamGetFlags(self._handle)) - if flag == driver.CUstream_flags.CU_STREAM_NON_BLOCKING: + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamGetFlags(self._handle, &flags) + if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: self._nonblocking = False @@ -238,14 +253,17 @@ cdef class Stream: @property def priority(self) -> int: """Return the stream priority.""" + cdef int prio if self._priority is None: - prio = handle_return(driver.cuStreamGetPriority(self._handle)) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamGetPriority(self._handle, &prio) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - handle_return(driver.cuStreamSynchronize(self._handle)) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamSynchronize(self._handle) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -272,8 +290,9 @@ cdef class Stream: if event is None: self._get_device_and_context() event = Event._init(self._device_id, self._ctx_handle, options) - err, = driver.cuEventRecord(event.handle, self._handle) - raise_if_driver_error(err) + # TODO: switch to HANDLE_RETURN + # TODO: revisit after Event is cythonized + err = cydriver.cuEventRecord((event.handle), self._handle) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -286,28 +305,35 @@ cdef class Stream: on the stream and then waiting on it. """ + cdef cydriver.CUevent event + cdef cydriver.CUstream stream + cdef bint discard_event + if isinstance(event_or_stream, Event): - event = event_or_stream.handle + event = (event_or_stream.handle) discard_event = False else: if isinstance(event_or_stream, Stream): - stream = event_or_stream + stream = (event_or_stream.handle) else: try: - stream = Stream._init(obj=event_or_stream) + s = Stream._init(obj=event_or_stream) except Exception as e: raise ValueError( "only an Event, Stream, or object supporting __cuda_stream__ can be waited," f" got {type(event_or_stream)}" ) from e - event = handle_return(driver.cuEventCreate(driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(event, stream.handle)) + stream = (s.handle) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + err = cydriver.cuEventRecord(event, stream) discard_event = True # TODO: support flags other than 0? - handle_return(driver.cuStreamWaitEvent(self._handle, event, 0)) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamWaitEvent(self._handle, event, 0) if discard_event: - handle_return(driver.cuEventDestroy(event)) + err = cydriver.cuEventDestroy(event) @property def device(self) -> Device: @@ -325,9 +351,12 @@ cdef class Stream: return Device(self._device_id) cdef int _get_context(Stream self) except?-1: + # TODO: consider making self._ctx_handle typed? 
+ cdef cydriver.CUcontext ctx if self._ctx_handle is None: - err, self._ctx_handle = driver.cuStreamGetCtx(self._handle) - raise_if_driver_error(err) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamGetCtx(self._handle, &ctx) + self._ctx_handle = driver.CUcontext(ctx) return 0 cdef int _get_device_and_context(Stream self) except?-1: From 07df441f8e7e9b9c8080ea2a123600efb7c0a977 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 19:39:51 +0000 Subject: [PATCH 04/27] nit: move dlpack.h to the include dir --- cuda_core/cuda/core/experimental/_dlpack.pxd | 2 +- cuda_core/cuda/core/experimental/{ => include}/dlpack.h | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cuda_core/cuda/core/experimental/{ => include}/dlpack.h (100%) diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd b/cuda_core/cuda/core/experimental/_dlpack.pxd index 843beb873..d61b6a2bc 100644 --- a/cuda_core/cuda/core/experimental/_dlpack.pxd +++ b/cuda_core/cuda/core/experimental/_dlpack.pxd @@ -14,7 +14,7 @@ from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t -cdef extern from "dlpack.h" nogil: +cdef extern from "include/dlpack.h" nogil: """ #define DLPACK_TENSOR_UNUSED_NAME "dltensor" #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" diff --git a/cuda_core/cuda/core/experimental/dlpack.h b/cuda_core/cuda/core/experimental/include/dlpack.h similarity index 100% rename from cuda_core/cuda/core/experimental/dlpack.h rename to cuda_core/cuda/core/experimental/include/dlpack.h From 6be8e7d0b6f0523ff63ed291b73680b7d6b0f503 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 19:40:19 +0000 Subject: [PATCH 05/27] purge cu11 --- cuda_core/pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 27fa5ce19..ee0030f1c 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -39,22 +39,20 @@ classifiers = [ "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 11", "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 13", ] dependencies = [ "numpy", ] [project.optional-dependencies] -cu11 = ["cuda-bindings[all]==11.8.*"] cu12 = ["cuda-bindings[all]==12.*"] cu13 = ["cuda-bindings[all]==13.*"] # TODO: these should all be in development dependencies; optional dependencies # are for features exposed to *users*, not a dumping ground for all tooling # needed to build and test the project test = ["cython>=3.1", "setuptools", "pytest>=6.2.4"] -test-cu11 = ["cuda-core[test]", "cupy-cuda11x; python_version < '3.14'", "cuda-toolkit[cudart]==11.*"] # runtime headers needed by CuPy test-cu12 = ["cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy test-cu13 = ["cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy # free threaded build, cupy doesn't support free-threaded builds yet, so avoid installing it for now From 021e0f3406aceaccb1828abb392be5c0de681775 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 19:47:55 +0000 Subject: [PATCH 06/27] check in a working merger script --- ci/tools/merge_cuda_core_wheels.py | 200 +++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 ci/tools/merge_cuda_core_wheels.py diff --git 
a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py new file mode 100644 index 000000000..3b47a9f7f --- /dev/null +++ b/ci/tools/merge_cuda_core_wheels.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +Script to merge CUDA-specific wheels into a single multi-CUDA wheel. + +This script takes wheels built for different CUDA versions (cu12, cu13) and merges them +into a single wheel that supports both CUDA versions. + +In particular, each wheel contains a CUDA-specific build of the `cuda.core` library +and the associated bindings. This script merges these directories into a single wheel +that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12` +and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py` +is used to import the appropriate CUDA-specific bindings. +""" + +import argparse +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import List + + +def run_command( + cmd: List[str], cwd: Path = None, env: dict = None +) -> subprocess.CompletedProcess: + """Run a command with error handling.""" + print(f"Running: {' '.join(cmd)}") + if cwd: + print(f" Working directory: {cwd}") + + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) + + if result.returncode != 0: + print(f"Command failed with return code {result.returncode}") + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + result.check_returncode() + + return result + + +def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: + """Merge multiple wheels into a single wheel with version-specific binaries.""" + print("\n=== Merging wheels ===") + print(f"Input wheels: {[w.name for w in wheels]}") + + if len(wheels) == 1: + raise RuntimeError("only one wheel is provided, nothing to merge") + + # Extract all wheels to temporary directories + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + extracted_wheels = [] + + for i, wheel in enumerate(wheels): + print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}") + # Extract wheel - wheel unpack creates the directory itself + run_command( + [ + "python", + "-m", + "wheel", + "unpack", + str(wheel), + "--dest", + str(temp_path), + ] + ) + + # Find the extracted directory (wheel unpack creates a subdirectory) + extract_dir = None + for item in temp_path.iterdir(): + if item.is_dir() and item.name.startswith("cuda_core"): + extract_dir = item + break + + if not extract_dir: + raise RuntimeError( + f"Could not find extracted wheel directory for {wheel.name}" + ) + + # Rename to our expected name + expected_name = temp_path / f"wheel_{i}" + extract_dir.rename(expected_name) + extract_dir = expected_name + + extracted_wheels.append(extract_dir) + + # Use the first wheel as the base and merge binaries from others + base_wheel = extracted_wheels[0] + + # now copy the version-specific directory from other wheels + # into the appropriate place in the base wheel + for i, wheel_dir in enumerate(extracted_wheels): + cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] + base_dir = ( + Path("cuda") + / "core" + / "experimental" + ) + # Copy from other wheels + print(f" Copying {wheel_dir} to {base_wheel}") + shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") + + # Overwrite the __init__.py in versioned dirs + open(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", "w").close() + + # The base dir should only contain __init__.py, 
the include dir, and the versioned dirs + files_to_remove = os.listdir(base_wheel / base_dir) + for f in files_to_remove: + f_abspath = base_wheel / base_dir / f + if f not in ("__init__.py", "cu12", "cu13", "include"): + if os.path.isdir(f_abspath): + shutil.rmtree(f_abspath) + else: + os.remove(f_abspath) + + # Repack the merged wheel + output_dir.mkdir(parents=True, exist_ok=True) + + # Create a clean wheel name without CUDA version suffixes + base_wheel_name = wheels[0].name + # Remove any .cu* suffix from the wheel name + if ".cu" in base_wheel_name: + base_wheel_name = base_wheel_name.split(".cu")[0] + ".whl" + + print(f"Repacking merged wheel as: {base_wheel_name}") + run_command( + [ + "python", + "-m", + "wheel", + "pack", + str(base_wheel), + "--dest-dir", + str(output_dir), + ] + ) + + # Find the output wheel + output_wheels = list(output_dir.glob("*.whl")) + if not output_wheels: + raise RuntimeError("Failed to create merged wheel") + + merged_wheel = output_wheels[0] + print(f"Successfully merged wheel: {merged_wheel}") + return merged_wheel + + +def main(): + """Main merge script.""" + parser = argparse.ArgumentParser( + description="Merge CUDA-specific wheels into a single multi-CUDA wheel" + ) + parser.add_argument( + "wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge" + ) + parser.add_argument( + "--output-dir", "-o", default="dist", help="Output directory for merged wheel" + ) + + args = parser.parse_args() + + print("cuda.core Wheel Merger") + print("======================") + + # Convert wheel paths to Path objects and validate + wheels = [] + for wheel_path in args.wheels: + wheel = Path(wheel_path) + if not wheel.exists(): + print(f"Error: Wheel not found: {wheel}") + sys.exit(1) + if not wheel.name.endswith(".whl"): + print(f"Error: Not a wheel file: {wheel}") + sys.exit(1) + wheels.append(wheel) + + if not wheels: + print("Error: No wheels provided") + sys.exit(1) + + output_dir = Path(args.output_dir) + + # Check that we have wheel tool available + try: + run_command(["python", "-m", "wheel", "--help"]) + except Exception: + print("Error: wheel package not available. Install with: pip install wheel") + sys.exit(1) + + # Merge the wheels + merged_wheel = merge_wheels(wheels, output_dir) + print(f"\nMerge complete! 
Output: {merged_wheel}") + + +if __name__ == "__main__": + main() From 19020b2768cf60087bb316996077affd7cf66b8d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 20:21:58 +0000 Subject: [PATCH 07/27] support loading from the versioned module if any exists --- cuda_core/cuda/core/experimental/__init__.py | 23 ++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index a06119321..40d10c3aa 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,6 +2,29 @@ # # SPDX-License-Identifier: Apache-2.0 +try: + import cuda.bindings +except ImportError as e: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") +else: + cuda_major, cuda_minor = cuda.bindings.__version__.split(".")[:2] + if cuda_major not in ("12", "13"): + raise ImportError("cuda.bindings 12.x or 13.x must be installed") + +import importlib +subdir = f"cu{cuda_major}" +try: + verioned_mod = importlib.import_module(f".{subdir}", __package__) + # Import all symbols from the module + globals().update(verioned_mod.__dict__) +except ImportError: + # This is not a wheel build, but a conda or local build, do nothing + pass +else: + del verioned_mod +finally: + del cuda.bindings, importlib, subdir, cuda_major, cuda_minor + from cuda.core.experimental import utils from cuda.core.experimental._device import Device from cuda.core.experimental._event import Event, EventOptions From e51f9107fdabd6bb4ba6b1283304fa0f6b4491bd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 20:53:03 +0000 Subject: [PATCH 08/27] fix linter errors --- .spdx-ignore | 2 +- ci/tools/merge_cuda_core_wheels.py | 36 ++++++++------------ cuda_core/cuda/core/experimental/__init__.py | 29 ++++++++-------- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/.spdx-ignore b/.spdx-ignore index 60435ebb5..84f051faf 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -10,4 +10,4 @@ requirements*.txt cuda_bindings/examples/* # Vendored -cuda_core/cuda/core/experimental/dlpack.h +cuda_core/cuda/core/experimental/include/dlpack.h diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 3b47a9f7f..14c380b76 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + #!/usr/bin/env python3 """ Script to merge CUDA-specific wheels into a single multi-CUDA wheel. @@ -10,27 +14,27 @@ that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12` and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py` is used to import the appropriate CUDA-specific bindings. + +This script is based on the one in NVIDIA/CCCL. 
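+
+Example invocation (illustrative file names):
+
+    python ci/tools/merge_cuda_core_wheels.py \
+        wheelhouse/cu12/cuda_core-0.4.0-cp312-cp312-manylinux_x86_64.cu12.whl \
+        wheelhouse/cu13/cuda_core-0.4.0-cp312-cp312-manylinux_x86_64.cu13.whl \
+        --output-dir dist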
""" import argparse import os import shutil -import subprocess +import subprocess # nosec: B404 import sys import tempfile from pathlib import Path from typing import List -def run_command( - cmd: List[str], cwd: Path = None, env: dict = None -) -> subprocess.CompletedProcess: +def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> subprocess.CompletedProcess: """Run a command with error handling.""" print(f"Running: {' '.join(cmd)}") if cwd: print(f" Working directory: {cwd}") - result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) # nosec: B603 if result.returncode != 0: print(f"Command failed with return code {result.returncode}") @@ -77,9 +81,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: break if not extract_dir: - raise RuntimeError( - f"Could not find extracted wheel directory for {wheel.name}" - ) + raise RuntimeError(f"Could not find extracted wheel directory for {wheel.name}") # Rename to our expected name expected_name = temp_path / f"wheel_{i}" @@ -95,11 +97,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # into the appropriate place in the base wheel for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] - base_dir = ( - Path("cuda") - / "core" - / "experimental" - ) + base_dir = Path("cuda") / "core" / "experimental" # Copy from other wheels print(f" Copying {wheel_dir} to {base_wheel}") shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") @@ -151,15 +149,9 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: def main(): """Main merge script.""" - parser = argparse.ArgumentParser( - description="Merge CUDA-specific wheels into a single multi-CUDA wheel" - ) - parser.add_argument( - "wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge" - ) - parser.add_argument( - "--output-dir", "-o", default="dist", help="Output directory for merged wheel" - ) + parser = argparse.ArgumentParser(description="Merge CUDA-specific wheels into a single multi-CUDA wheel") + parser.add_argument("wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge") + parser.add_argument("--output-dir", "-o", default="dist", help="Output directory for merged wheel") args = parser.parse_args() diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 40d10c3aa..90d520d78 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -4,14 +4,15 @@ try: import cuda.bindings -except ImportError as e: - raise ImportError("cuda.bindings 12.x or 13.x must be installed") +except ImportError: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None else: cuda_major, cuda_minor = cuda.bindings.__version__.split(".")[:2] if cuda_major not in ("12", "13"): raise ImportError("cuda.bindings 12.x or 13.x must be installed") import importlib + subdir = f"cu{cuda_major}" try: verioned_mod = importlib.import_module(f".{subdir}", __package__) @@ -25,29 +26,29 @@ finally: del cuda.bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core.experimental import utils -from cuda.core.experimental._device import Device -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import ( +from cuda.core.experimental import utils # noqa: E402 +from 
cuda.core.experimental._device import Device # noqa: E402 +from cuda.core.experimental._event import Event, EventOptions # noqa: E402 +from cuda.core.experimental._graph import ( # noqa: E402 Graph, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, ) -from cuda.core.experimental._launch_config import LaunchConfig -from cuda.core.experimental._launcher import launch -from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import ( +from cuda.core.experimental._launch_config import LaunchConfig # noqa: E402 +from cuda.core.experimental._launcher import launch # noqa: E402 +from cuda.core.experimental._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core.experimental._memory import ( # noqa: E402 Buffer, DeviceMemoryResource, IPCChannel, LegacyPinnedMemoryResource, MemoryResource, ) -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._stream import Stream, StreamOptions -from cuda.core.experimental._system import System +from cuda.core.experimental._module import Kernel, ObjectCode # noqa: E402 +from cuda.core.experimental._program import Program, ProgramOptions # noqa: E402 +from cuda.core.experimental._stream import Stream, StreamOptions # noqa: E402 +from cuda.core.experimental._system import System # noqa: E402 system = System() __import__("sys").modules[__spec__.name + ".system"] = system From 61617cfbf1e6b083693aa6ba9c983642b46b83cb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 23:26:33 +0000 Subject: [PATCH 09/27] set up double-build CI workflow --- .github/actions/fetch_ctk/action.yml | 17 +++- .github/workflows/build-wheel.yml | 136 +++++++++++++++++++++++---- .github/workflows/ci.yml | 9 +- ci/tools/env-vars | 3 + ci/versions.json | 3 + cuda_bindings/pyproject.toml | 4 - cuda_core/pyproject.toml | 2 +- 7 files changed, 145 insertions(+), 29 deletions(-) diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 83b447f0c..be7536c63 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -18,6 +18,11 @@ inputs: required: false type: string default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + cuda-path: + description: "where the CTK components will be installed to, relative to $PWD" + required: false + type: string + default: "./cuda_toolkit" runs: using: composite @@ -159,18 +164,24 @@ runs: exit 1 fi + - name: Move CTK to the specified location + if: ${{ inputs.cuda-path != './cuda_toolkit' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + mv ./cuda_toolkit ${{ inputs.cuda-path }} + - name: Set output environment variables shell: bash --noprofile --norc -xeuo pipefail {0} run: | # mimics actual CTK installation if [[ "${{ inputs.host-platform }}" == linux* ]]; then - CUDA_PATH=$(realpath "./cuda_toolkit") - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${CUDA_PATH}/lib" >> $GITHUB_ENV + CUDA_PATH=$(realpath "${{ inputs.cuda-path }}") + echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV elif [[ "${{ inputs.host-platform }}" == win* ]]; then function normpath() { echo "$(echo $(cygpath -w $1) | sed 's/\\/\\\\/g')" } - CUDA_PATH=$(normpath $(realpath "./cuda_toolkit")) + CUDA_PATH=$(normpath $(realpath "${{ inputs.cuda-path }}")) echo "$(normpath ${CUDA_PATH}/bin)" >> $GITHUB_PATH fi echo "CUDA_PATH=${CUDA_PATH}" >> 
$GITHUB_ENV diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index db006be32..fa11ba76f 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -11,6 +11,9 @@ on: cuda-version: required: true type: string + prev-cuda-version: + required: true + type: string defaults: run: @@ -109,13 +112,33 @@ jobs: path: cuda_pathfinder/*.whl if-no-files-found: error + - name: Set up mini CTK + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ inputs.cuda-version }} + + # TODO: this currently builds against the public cuda.bindings wheel. Consider + # building against the wheel from main instead (the below step). - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: package-dir: ./cuda_core/ output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - - name: List the cuda.core artifacts directory + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ENVIRONMENT: > + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + + - name: List the cuda.core artifacts directory and rename run: | if [[ "${{ inputs.host-platform }}" == win* ]]; then export CHOWN=chown @@ -123,31 +146,34 @@ jobs: export CHOWN="sudo chown" fi $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - - name: Check cuda.core wheel - run: | - twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl - - name: Upload cuda.core build artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} - path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl - if-no-files-found: error + # Rename wheel to include CUDA version suffix + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done - - name: Set up mini CTK - uses: ./.github/actions/fetch_ctk - continue-on-error: false - with: - host-platform: ${{ inputs.host-platform }} - cuda-version: ${{ inputs.cuda-version }} + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - name: Build cuda.bindings wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: package-dir: ./cuda_bindings/ output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} - name: List the cuda.bindings artifacts directory run: | @@ -241,7 +267,7 @@ jobs: - name: Build cuda.core Cython tests run: | - pip install $(ls ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl)[test] + pip install $(ls ${{ 
env.CUDA_CORE_ARTIFACTS_DIR }}/"cu${BUILD_CUDA_MAJOR}"/*.whl)[test] pushd ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }} bash build_tests.sh popd @@ -252,3 +278,73 @@ jobs: name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-tests path: ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}/test_*${{ env.PY_EXT_SUFFIX }} if-no-files-found: error + + # Note: This overwrites CUDA_PATH etc + - name: Set up mini CTK + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ inputs.prev-cuda-version }} + cuda-path: "./cuda_toolkit_prev" + + # TODO: this currently builds against the public cuda.bindings wheel. Consider + # building against the wheel from the backport branch instead. + - name: Build cuda.core wheel + uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 + with: + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ENVIRONMENT: > + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + + - name: List the cuda.core artifacts directory and rename + run: | + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + + # Rename wheel to include CUDA version suffix + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_PREV_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done + + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + + - name: Merge cuda.core wheels + run: | + pip install wheel + python ci/tools/merge_cuda_core_wheels.py \ + "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_CUDA_MAJOR}"/cuda_core*.whl \ + "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_PREV_CUDA_MAJOR}"/cuda_core*.whl \ + --output-dir "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + + - name: Check cuda.core wheel + run: | + twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + + - name: Upload cuda.core build artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} + path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1b2bb241f..fbc267135 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,17 +21,21 @@ jobs: runs-on: ubuntu-latest outputs: CUDA_BUILD_VER: ${{ steps.get-vars.outputs.cuda_build_ver }} + CUDA_PREV_BUILD_VER: ${{ steps.get-vars.outputs.cuda_prev_build_ver }} steps: - name: Checkout repository uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 - - name: Get CUDA build version + - name: Get CUDA build versions id: get-vars run: | cuda_build_ver=$(jq -r .cuda.build.version ci/versions.json) echo "cuda_build_ver=$cuda_build_ver" >> $GITHUB_OUTPUT + 
cuda_prev_build_ver=$(jq -r .cuda.prev_build.version ci/versions.json) + echo "cuda_prev_build_ver=$cuda_prev_build_ver" >> $GITHUB_OUTPUT + # WARNING: make sure all of the build jobs are in sync build-linux-64: needs: @@ -48,6 +52,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure all of the build jobs are in sync build-linux-aarch64: @@ -65,6 +70,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure all of the build jobs are in sync build-windows: @@ -82,6 +88,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure both Linux test jobs are in sync test-linux-64: diff --git a/ci/tools/env-vars b/ci/tools/env-vars index de4a5a6b9..f7db5179d 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -41,6 +41,9 @@ if [[ "${1}" == "build" ]]; then # platform is handled by the default value of platform (`auto`) in cibuildwheel # here we only need to specify the python version we want echo "CIBW_BUILD=cp${PYTHON_VERSION_FORMATTED}-*" >> $GITHUB_ENV + BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})" + echo "BUILD_CUDA_MAJOR=${BUILD_CUDA_MAJOR}" >> $GITHUB_ENV + echo "BUILD_PREV_CUDA_MAJOR=$((${BUILD_CUDA_MAJOR} - 1))" >> $GITHUB_ENV CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${CUDA_VER}-${HOST_PLATFORM}" elif [[ "${1}" == "test" ]]; then BUILD_CUDA_MAJOR="$(cut -d '.' 
-f 1 <<< ${BUILD_CUDA_VER})" diff --git a/ci/versions.json b/ci/versions.json index 271c69ac3..2acfae1e3 100644 --- a/ci/versions.json +++ b/ci/versions.json @@ -2,6 +2,9 @@ "cuda": { "build": { "version": "13.0.1" + }, + "prev_build": { + "version": "12.9.1" } } } diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 97901678a..dc6c87eef 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -63,12 +63,8 @@ environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" -# CIBW mounts the host filesystem under /host -environment-pass = ["CUDA_PATH"] -environment = { CUDA_HOME = "/host/$CUDA_PATH" } [tool.cibuildwheel.windows] archs = "AMD64" before-build = "pip install delvewheel" repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}" -environment = { CUDA_HOME = "$(cygpath -w $CUDA_PATH)" } diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index ee0030f1c..75cc43abf 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -79,7 +79,7 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PYTHON_PARALLEL_LEVEL"] +environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" From 9e799e40785eac42e9bd9ed63e0ef331445b5622 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 23:44:37 +0000 Subject: [PATCH 10/27] ensure CUDA_PATH is honored by the build backend --- cuda_core/build_hooks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index fd1692ad3..82bb77869 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -24,7 +24,6 @@ get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist -@functools.cache def _get_proper_cuda_bindings_major_version() -> str: # for local development (with/without build isolation) try: @@ -72,10 +71,21 @@ def strip_prefix_suffix(filename): return filename[len(root_path) : -4] module_names = (strip_prefix_suffix(f) for f in ext_files) + + @functools.cache + def get_cuda_paths(): + CUDA_PATH = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME", None)) + if not CUDA_PATH: + raise RuntimeError("Environment variable CUDA_PATH or CUDA_HOME is not set") + CUDA_PATH = CUDA_PATH.split(os.pathsep) + print("CUDA paths:", CUDA_PATH) + return CUDA_PATH + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", sources=[f"cuda/core/experimental/{mod}.pyx"], + include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", ) for mod in module_names From d5001d4f8a4ef3eb9db2dcd238fda4a037100b77 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 01:26:05 +0000 Subject: [PATCH 11/27] try to reuse cuda-bindings wheels for 3.13t/3.14/3.14t --- .github/workflows/build-wheel.yml | 114 ++++++++++++++++++++---------- cuda_core/pyproject.toml | 1 - 2 files changed, 75 insertions(+), 40 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index fa11ba76f..3b47f4615 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -119,17 +119,13 @@ jobs: host-platform: ${{ inputs.host-platform }} cuda-version: ${{ inputs.cuda-version }} - # TODO: this currently builds against the public cuda.bindings wheel. 
Consider - # building against the wheel from main instead (the below step). - - name: Build cuda.core wheel + - name: Build cuda.bindings wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: - package-dir: ./cuda_core/ - output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + package-dir: ./cuda_bindings/ + output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} env: CIBW_BUILD: ${{ env.CIBW_BUILD }} - CIBW_ENVIRONMENT: > - CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} # CIBW mounts the host filesystem under /host CIBW_ENVIRONMENT_LINUX: > CUDA_PATH=/host/${{ env.CUDA_PATH }} @@ -138,63 +134,68 @@ jobs: CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} - - name: List the cuda.core artifacts directory and rename + - name: List the cuda.bindings artifacts directory run: | if [[ "${{ inputs.host-platform }}" == win* ]]; then export CHOWN=chown else export CHOWN="sudo chown" fi - $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - # Rename wheel to include CUDA version suffix - mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}" - for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do - if [[ -f "${wheel}" ]]; then - base_name=$(basename "${wheel}" .whl) - new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl" - mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}" - echo "Renamed wheel to: ${new_name}" - fi - done + - name: Check cuda.bindings wheel + run: | + twine check --strict ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl - ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + - name: Upload cuda.bindings build artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} + path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error - - name: Build cuda.bindings wheel + # TODO: ideally we want to build against public cuda-bindings + - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: - package-dir: ./cuda_bindings/ - output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} env: CIBW_BUILD: ${{ env.CIBW_BUILD }} # CIBW mounts the host filesystem under /host CIBW_ENVIRONMENT_LINUX: > CUDA_PATH=/host/${{ env.CUDA_PATH }} CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} + PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} CIBW_ENVIRONMENT_WINDOWS: > CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} + PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})" - - name: List the cuda.bindings artifacts directory + - name: List the cuda.core artifacts directory and rename run: | if [[ "${{ inputs.host-platform }}" == win* ]]; then export CHOWN=chown else export CHOWN="sudo chown" fi - $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - name: Check cuda.bindings wheel - run: | - twine check --strict ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl + # Rename wheel to include CUDA version suffix + mkdir -p "${{ 
env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done - - name: Upload cuda.bindings build artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} - path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl - if-no-files-found: error + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} # We only need/want a single pure python wheel, pick linux-64 index 0. - name: Build and check cuda-python wheel @@ -288,8 +289,41 @@ jobs: cuda-version: ${{ inputs.prev-cuda-version }} cuda-path: "./cuda_toolkit_prev" - # TODO: this currently builds against the public cuda.bindings wheel. Consider - # building against the wheel from the backport branch instead. + # TODO: ideally we want to build against public cuda-bindings + - name: Download cuda.bindings build artifacts from the prior branch + if: ${{ matrix.python-version == '3.13t' + || matrix.python-version == '3.14' + || matrix.python-version == '3.14t' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + if ! (command -v gh 2>&1 >/dev/null); then + # See https://github.com/cli/cli/blob/trunk/docs/install_linux.md#debian-ubuntu-linux-raspberry-pi-os-apt. + # gh is needed for artifact fetching. + mkdir -p -m 755 /etc/apt/keyrings \ + && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \ + && cat $out | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ + && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ + && apt update \ + && apt install gh -y + fi + + OLD_BRANCH=$(cat .github/BACKPORT_BRANCH) + OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" + LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') + if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then + echo "LATEST_PRIOR_RUN_ID not found!" 
+ exit 1 + fi + + gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python + rm -rf ${OLD_BASENAME}-tests # exclude cython test artifacts + ls -al $OLD_BASENAME + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + mv $OLD_BASENAME/*.whl "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + rmdir $OLD_BASENAME + - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: @@ -297,15 +331,17 @@ jobs: output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} env: CIBW_BUILD: ${{ env.CIBW_BUILD }} - CIBW_ENVIRONMENT: > - CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} # CIBW mounts the host filesystem under /host CIBW_ENVIRONMENT_LINUX: > CUDA_PATH=/host/${{ env.CUDA_PATH }} CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} CIBW_ENVIRONMENT_WINDOWS: > CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})" - name: List the cuda.core artifacts directory and rename run: | diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 75cc43abf..d107f0d6e 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -79,7 +79,6 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" From 1180ab6bf2fd5d4d03dd1376e1db4985171f2145 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 02:32:11 +0000 Subject: [PATCH 12/27] disable building/testing 313t/314/314t for now --- .github/workflows/build-wheel.yml | 6 +++--- ci/test-matrix.json | 16 ++-------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index 3b47f4615..c0b2a421b 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -33,9 +33,9 @@ jobs: - "3.11" - "3.12" - "3.13" - - "3.13t" - - "3.14" - - "3.14t" +# - "3.13t" +# - "3.14" +# - "3.14t" name: py${{ matrix.python-version }} runs-on: ${{ (inputs.host-platform == 'linux-64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || diff --git a/ci/test-matrix.json b/ci/test-matrix.json index 10721659b..41cf03018 100644 --- a/ci/test-matrix.json +++ b/ci/test-matrix.json @@ -14,9 +14,6 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", 
"DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, @@ -26,10 +23,7 @@ { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } + { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" }, @@ -94,13 +88,7 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, From 8cbf40c6597358fddd3a9224016b7ea1381a42d3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 03:06:20 +0000 Subject: [PATCH 13/27] deprecate PY39 as per #846 --- cuda_core/cuda/core/experimental/__init__.py | 11 +++++++++++ cuda_core/docs/source/release/0.X.Y-notes.rst | 1 + 2 files changed, 12 insertions(+) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 90d520d78..fb0a2f469 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -26,6 +26,17 @@ finally: del cuda.bindings, importlib, subdir, cuda_major, cuda_minor +import sys # noqa: E402 +import warnings # noqa: E402 + +if sys.version_info < (3, 10): + warnings.warn( + "support for Python 3.9 and below is deprecated and subject to future removal", + category=UserWarning, + stacklevel=1, + ) +del sys, warnings + from cuda.core.experimental import utils # 
noqa: E402 from cuda.core.experimental._device import Device # noqa: E402 from cuda.core.experimental._event import Event, EventOptions # noqa: E402 diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 433e34353..551cbe65c 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -12,6 +12,7 @@ Released on TBD Highlights ---------- +- This is the last release that officially supports Python 3.9. - Fix for :class:`LaunchConfig` grid parameter unit conversion when thread block clusters are used. From 9519904d0914be3009540bbbb78e56bed0d42bd1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 03:12:13 +0000 Subject: [PATCH 14/27] also turn on parallel cythonization --- cuda_core/build_hooks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 82bb77869..73bfbe4a9 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -91,8 +91,12 @@ def get_cuda_paths(): for mod in module_names ) + nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2)) + global _extensions - _extensions = cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}) + _extensions = cythonize( + ext_modules, verbose=True, language_level=3, nthreads=nthreads, compiler_directives={"embedsignature": True} + ) return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) From 96ce48055c5175047077f1e49cde00b232d4d106 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 03:32:29 +0000 Subject: [PATCH 15/27] cythonize event --- cuda_core/cuda/core/experimental/_event.pyx | 59 +++++++++++--------- cuda_core/cuda/core/experimental/_stream.pyx | 4 +- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 41c0b1ce6..14f65a90d 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -4,6 +4,11 @@ from __future__ import annotations +from libc.stdint cimport uintptr_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -78,12 +83,15 @@ cdef class Event: """ cdef: - object _handle + cydriver.CUevent _handle bint _timing_disabled bint _busy_waited int _device_id object _ctx_handle + def __cinit__(self): + self._handle = (NULL) + def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. 
Please use Stream APIs (record).") @@ -91,19 +99,19 @@ cdef class Event: def _init(cls, device_id: int, ctx_handle: Context, options=None): cdef Event self = Event.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") - flags = 0x0 + cdef unsigned int flags = 0x0 self._timing_disabled = False self._busy_waited = False if not opts.enable_timing: - flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING + flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING self._timing_disabled = True if opts.busy_waited_sync: - flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC + flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC self._busy_waited = True if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - err, self._handle = driver.cuEventCreate(flags) - raise_if_driver_error(err) + # TODO: use HANDLE_RETURN + err = cydriver.cuEventCreate(&self._handle, flags) self._device_id = device_id self._ctx_handle = ctx_handle return self @@ -111,10 +119,10 @@ cdef class Event: cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing): if is_shutting_down and is_shutting_down(): return - if self._handle is not None: - err, = driver.cuEventDestroy(self._handle) - self._handle = None - raise_if_driver_error(err) + if self._handle != NULL: + # TODO: use HANDLE_RETURN + err = cydriver.cuEventDestroy(self._handle) + self._handle = (NULL) cpdef close(self): """Destroy the event.""" @@ -129,14 +137,14 @@ cdef class Event: def __rsub__(self, other): return NotImplemented - def __sub__(self, other): + def __sub__(self, other: Event): # return self - other (in milliseconds) - err, timing = driver.cuEventElapsedTime(other.handle, self._handle) - try: - raise_if_driver_error(err) + cdef float timing + err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + if err == 0: return timing - except CUDAError as e: - if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE: + else: + if err == cydriver.CUresult.CUDA_ERROR_INVALID_HANDLE: if self.is_timing_disabled or other.is_timing_disabled: explanation = ( "Both Events must be created with timing enabled in order to subtract them; " @@ -147,15 +155,15 @@ cdef class Event: "Both Events must be recorded before they can be subtracted; " "use Stream.record() to record both events to a stream." ) - elif err == driver.CUresult.CUDA_ERROR_NOT_READY: + elif err == cydriver.CUresult.CUDA_ERROR_NOT_READY: explanation = ( "One or both events have not completed; " "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete " "before subtracting them." ) else: - raise e - raise RuntimeError(explanation) from e + raise CUDAError(err) + raise RuntimeError(explanation) @property def is_timing_disabled(self) -> bool: @@ -182,17 +190,18 @@ cdef class Event: has been completed. 
""" - handle_return(driver.cuEventSynchronize(self._handle)) + # TODO: use HANDLE_RETURN + err = cydriver.cuEventSynchronize(self._handle) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - result, = driver.cuEventQuery(self._handle) - if result == driver.CUresult.CUDA_SUCCESS: + result = cydriver.cuEventQuery(self._handle) + if result == cydriver.CUresult.CUDA_SUCCESS: return True - if result == driver.CUresult.CUDA_ERROR_NOT_READY: + if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: return False - handle_return(result) + # TODO: use HANDLE_RETURN @property def handle(self) -> cuda.bindings.driver.CUevent: @@ -203,7 +212,7 @@ cdef class Event: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. """ - return self._handle + return driver.CUevent((self._handle)) @property def device(self) -> Device: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 284831cd6..cad5612fa 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -122,7 +122,7 @@ cdef class Stream: object _device_id object _ctx_handle - def __cinit__(self, *args, **kwargs): + def __cinit__(self): self._handle = (NULL) def __init__(self, *args, **kwargs): @@ -235,7 +235,7 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. """ - return driver.CUstream((self._handle)) + return driver.CUstream((self._handle)) @property def is_nonblocking(self) -> bool: From e702b5ef328819336affb0221b10cd480e73e9b2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 04:14:01 +0000 Subject: [PATCH 16/27] fix error handling --- cuda_core/cuda/core/experimental/_event.pyx | 13 +++---- cuda_core/cuda/core/experimental/_stream.pyx | 36 +++++++------------ .../core/experimental/_utils/cuda_utils.pxd | 18 ++++++++-- .../core/experimental/_utils/cuda_utils.pyx | 6 ++++ 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 14f65a90d..0d5737e37 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -10,8 +10,8 @@ from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN ) from dataclasses import dataclass @@ -110,8 +110,7 @@ cdef class Event: self._busy_waited = True if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - # TODO: use HANDLE_RETURN - err = cydriver.cuEventCreate(&self._handle, flags) + HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) self._device_id = device_id self._ctx_handle = ctx_handle return self @@ -120,8 +119,7 @@ cdef class Event: if is_shutting_down and is_shutting_down(): return if self._handle != NULL: - # TODO: use HANDLE_RETURN - err = cydriver.cuEventDestroy(self._handle) + HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) self._handle = (NULL) cpdef close(self): @@ -190,8 +188,7 @@ cdef class Event: has been completed. 
""" - # TODO: use HANDLE_RETURN - err = cydriver.cuEventSynchronize(self._handle) + HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: @@ -201,7 +198,7 @@ cdef class Event: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: return False - # TODO: use HANDLE_RETURN + HANDLE_RETURN(result) @property def handle(self) -> cuda.bindings.driver.CUevent: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index cad5612fa..ee6f6be01 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -10,8 +10,8 @@ from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN, ) import sys @@ -178,9 +178,8 @@ cdef class Stream: priority = opts.priority flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT - # TODO: use HANDLE_RETURN cdef int high, low - err = cydriver.cuCtxGetStreamPriorityRange(&high, &low) + HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) if priority is not None: if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") @@ -188,8 +187,7 @@ cdef class Stream: priority = high cdef cydriver.CUstream s - # TODO: add HANDLE_RETURN macro to check driver error code? - err = cydriver.cuStreamCreateWithPriority(&s, flags, priority) + HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, priority)) self._handle = s self._owner = None self._nonblocking = nonblocking @@ -207,8 +205,7 @@ cdef class Stream: if self._owner is None: if self._handle and not self._builtin: - # TODO: use HANDLE_RETURN - err = cydriver.cuStreamDestroy(self._handle) + HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) else: self._owner = None self._handle = (NULL) @@ -242,8 +239,7 @@ cdef class Stream: """Return True if this is a nonblocking stream, otherwise False.""" cdef unsigned int flags if self._nonblocking is None: - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamGetFlags(self._handle, &flags) + HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: @@ -255,15 +251,13 @@ cdef class Stream: """Return the stream priority.""" cdef int prio if self._priority is None: - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamGetPriority(self._handle, &prio) + HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamSynchronize(self._handle) + HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. 
@@ -290,9 +284,8 @@ cdef class Stream: if event is None: self._get_device_and_context() event = Event._init(self._device_id, self._ctx_handle, options) - # TODO: switch to HANDLE_RETURN # TODO: revisit after Event is cythonized - err = cydriver.cuEventRecord((event.handle), self._handle) + HANDLE_RETURN(cydriver.cuEventRecord((event.handle), self._handle)) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -324,16 +317,14 @@ cdef class Stream: f" got {type(event_or_stream)}" ) from e stream = (s.handle) - # TODO: switch to HANDLE_RETURN - err = cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) - err = cydriver.cuEventRecord(event, stream) + HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) + HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) discard_event = True # TODO: support flags other than 0? - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamWaitEvent(self._handle, event, 0) + HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) if discard_event: - err = cydriver.cuEventDestroy(event) + HANDLE_RETURN(cydriver.cuEventDestroy(event)) @property def device(self) -> Device: @@ -354,8 +345,7 @@ cdef class Stream: # TODO: consider making self._ctx_handle typed? cdef cydriver.CUcontext ctx if self._ctx_handle is None: - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamGetCtx(self._handle, &ctx) + HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx)) self._ctx_handle = driver.CUcontext(ctx) return 0 diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index 601736c47..c58f32610 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -2,18 +2,30 @@ # # SPDX-License-Identifier: Apache-2.0 - cimport cpython -cimport libc.stdint +from libc.stdint cimport int64_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + + +ctypedef fused supported_error_type: + cydriver.CUresult +cdef int HANDLE_RETURN(supported_error_type err) except?-1 + + +# TODO: stop exposing these within the codebase? 
cpdef int _check_driver_error(error) except?-1 cpdef int _check_runtime_error(error) except?-1 cpdef int _check_nvrtc_error(error) except?-1 + + cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*) -cdef inline tuple carray_int64_t_to_tuple(libc.stdint.int64_t *ptr, int length): +cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length): # Construct shape and strides tuples using the Python/C API for speed result = cpython.PyTuple_New(length) for i in range(length): diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 86588f733..c095e7564 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -52,6 +52,12 @@ def _reduce_3_tuple(t: tuple): return t[0] * t[1] * t[2] +cdef int HANDLE_RETURN(supported_error_type err) except?-1: + if supported_error_type is cydriver.CUresult: + if err != cydriver.CUresult.CUDA_SUCCESS: + return _check_driver_error(err) + + cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS From 1f5159e231d409b9604ec50625a100a688f8ec75 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 20:33:37 +0000 Subject: [PATCH 17/27] Revert "disable building/testing 313t/314/314t for now" This reverts commit 1180ab6bf2fd5d4d03dd1376e1db4985171f2145. --- .github/workflows/build-wheel.yml | 6 +++--- ci/test-matrix.json | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index c0b2a421b..3b47f4615 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -33,9 +33,9 @@ jobs: - "3.11" - "3.12" - "3.13" -# - "3.13t" -# - "3.14" -# - "3.14t" + - "3.13t" + - "3.14" + - "3.14t" name: py${{ matrix.python-version }} runs-on: ${{ (inputs.host-platform == 'linux-64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || diff --git a/ci/test-matrix.json b/ci/test-matrix.json index 41cf03018..10721659b 100644 --- a/ci/test-matrix.json +++ b/ci/test-matrix.json @@ -14,6 +14,9 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, @@ -23,7 +26,10 @@ { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": 
"a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } + { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" }, @@ -88,7 +94,13 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, From dc8d076ffa7a76524aa25d9507ee0a076f218643 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 20:55:27 +0000 Subject: [PATCH 18/27] fix artifact location --- .github/workflows/build-wheel.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index 3b47f4615..4e97d36fb 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -320,8 +320,8 @@ jobs: gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python rm -rf ${OLD_BASENAME}-tests # exclude cython test artifacts ls -al $OLD_BASENAME - mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" - mv $OLD_BASENAME/*.whl "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + mkdir -p "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" + mv $OLD_BASENAME/*.whl "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" rmdir $OLD_BASENAME - name: Build cuda.core wheel From a501cc751c0d3bdd5c04b13efe2993f446b328eb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 13:39:08 +0000 Subject: [PATCH 19/27] cythonize device --- cuda_core/build_hooks.py | 6 +- .../experimental/{_device.py => _device.pyx} | 164 +++++++++--------- 2 files changed, 91 insertions(+), 79 deletions(-) rename cuda_core/cuda/core/experimental/{_device.py => 
_device.pyx} (91%) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 73bfbe4a9..c712e92cb 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -24,6 +24,7 @@ get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist +@functools.cache def _get_proper_cuda_bindings_major_version() -> str: # for local development (with/without build isolation) try: @@ -92,10 +93,13 @@ def get_cuda_paths(): ) nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2)) + compile_time_env = {"CUDA_CORE_BUILD_MAJOR": _get_proper_cuda_bindings_major_version()} global _extensions _extensions = cythonize( - ext_modules, verbose=True, language_level=3, nthreads=nthreads, compiler_directives={"embedsignature": True} + ext_modules, verbose=True, language_level=3, nthreads=nthreads, + compiler_directives={"embedsignature": True, "warn.deprecated.IF": False}, + compile_time_env=compile_time_env ) return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.pyx similarity index 91% rename from cuda_core/cuda/core/experimental/_device.py rename to cuda_core/cuda/core/experimental/_device.pyx index 0499baa58..589d5a42c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport uintptr_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + import threading from typing import Optional, Union @@ -14,41 +21,44 @@ from cuda.core.experimental._utils.cuda_utils import ( ComputeCapability, CUDAError, - _check_driver_error, driver, handle_return, runtime, ) + _tls = threading.local() _lock = threading.Lock() -_is_cuInit = False +cdef bint _is_cuInit = False -class DeviceProperties: +cdef class DeviceProperties: """ A class to query various attributes of a CUDA device. Attributes are read-only and provide information about the device. """ + cdef: + int _handle + dict _cache - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("DeviceProperties cannot be instantiated directly. 
Please use Device APIs.") - __slots__ = ("_handle", "_cache") - @classmethod def _init(cls, handle): - self = super().__new__(cls) + cdef DeviceProperties self = DeviceProperties.__new__(cls) self._handle = handle self._cache = {} return self - def _get_attribute(self, attr): + cdef inline _get_attribute(self, cydriver.CUdevice_attribute attr): """Retrieve the attribute value directly from the driver.""" - return handle_return(driver.cuDeviceGetAttribute(attr, self._handle)) + cdef int val + HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle)) + return val - def _get_cached_attribute(self, attr): + cdef _get_cached_attribute(self, attr): """Retrieve the attribute value, using cache if applicable.""" if attr not in self._cache: self._cache[attr] = self._get_attribute(attr) @@ -931,8 +941,17 @@ def multicast_supported(self) -> bool: return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED)) -_SUCCESS = driver.CUresult.CUDA_SUCCESS -_INVALID_CTX = driver.CUresult.CUDA_ERROR_INVALID_CONTEXT +cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL: + try: + primary_ctxs = _tls.primary_ctxs + except AttributeError: + total = len(_tls.devices) + primary_ctxs = _tls.primary_ctxs = [0] * total + cdef cydriver.CUcontext ctx = (primary_ctxs[dev_id]) + if ctx == NULL: + HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) + primary_ctxs[dev_id] = (ctx) + return ctx class Device: @@ -961,55 +980,56 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_id", "_mr", "_has_inited", "_properties") def __new__(cls, device_id: Optional[int] = None): global _is_cuInit if _is_cuInit is False: with _lock: - handle_return(driver.cuInit(0)) + HANDLE_RETURN(cydriver.cuInit(0)) _is_cuInit = True # important: creating a Device instance does not initialize the GPU! + cdef cydriver.CUdevice dev + cdef cydriver.CUcontext ctx if device_id is None: - err, dev = driver.cuCtxGetDevice() - if err == _SUCCESS: + err = cydriver.cuCtxGetDevice(&dev) + if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) - elif err == _INVALID_CTX: - ctx = handle_return(driver.cuCtxGetCurrent()) - assert int(ctx) == 0 + elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + assert (ctx) == NULL device_id = 0 # cudart behavior else: - _check_driver_error(err) + HANDLE_RETURN(err) elif device_id < 0: raise ValueError(f"device_id must be >= 0, got {device_id}") # ensure Device is singleton + cdef int total, attr try: devices = _tls.devices except AttributeError: - total = handle_return(driver.cuDeviceGetCount()) + HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] for dev_id in range(total): - dev = super().__new__(cls) - dev._id = dev_id + device = super().__new__(cls) + device._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. 
- if ( - handle_return( - driver.cuDeviceGetAttribute( - driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id - ) + HANDLE_RETURN( + cydriver.cuDeviceGetAttribute( + &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id ) - ) == 1: - dev._mr = DeviceMemoryResource(dev_id) + ) + if attr == 1: + device._mr = DeviceMemoryResource(dev_id) else: - dev._mr = _SynchronousMemoryResource(dev_id) + device._mr = _SynchronousMemoryResource(dev_id) - dev._has_inited = False - dev._properties = None - devices.append(dev) + device._has_inited = False + device._properties = None + devices.append(device) try: return devices[device_id] @@ -1022,36 +1042,17 @@ def _check_context_initialized(self): f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" ) - def _get_primary_context(self) -> driver.CUcontext: - try: - primary_ctxs = _tls.primary_ctxs - except AttributeError: - total = len(_tls.devices) - primary_ctxs = _tls.primary_ctxs = [None] * total - ctx = primary_ctxs[self._id] - if ctx is None: - ctx = handle_return(driver.cuDevicePrimaryCtxRetain(self._id)) - primary_ctxs[self._id] = ctx - return ctx - def _get_current_context(self, check_consistency=False) -> driver.CUcontext: - err, ctx = driver.cuCtxGetCurrent() - - # TODO: We want to just call this: - # _check_driver_error(err) - # but even the simplest success check causes 50-100 ns. Wait until we cythonize this file... - if ctx is None: - _check_driver_error(err) - - if int(ctx) == 0: + cdef cydriver.CUcontext ctx + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + if ctx == NULL: raise CUDAError("No context is bound to the calling CPU thread.") + cdef cydriver.CUdevice dev if check_consistency: - err, dev = driver.cuCtxGetDevice() - if err != _SUCCESS: - handle_return((err,)) - if int(dev) != self._id: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + if (dev) != self._id: raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return ctx + return driver.CUcontext(ctx) @property def device_id(self) -> int: @@ -1078,20 +1079,23 @@ def uuid(self) -> str: driver is older than CUDA 11.4. 
""" - driver_ver = handle_return(driver.cuDriverGetVersion()) - if 11040 <= driver_ver < 13000: - uuid = handle_return(driver.cuDeviceGetUuid_v2(self._id)) - else: - uuid = handle_return(driver.cuDeviceGetUuid(self._id)) - uuid = uuid.bytes.hex() + cdef cydriver.CUuuid uuid + IF CUDA_CORE_BUILD_MAJOR == "12": + HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) + ELSE: # 13.0+ + HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) + cdef bytes uuid_b = uuid.bytes + cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 - return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}" @property def name(self) -> str: """Return the device name.""" # Use 256 characters to be consistent with CUDA Runtime - name = handle_return(driver.cuDeviceGetName(256, self._id)) + cdef int LENGTH = 256 + cdef bytes name = bytes(LENGTH) + HANDLE_RETURN(cydriver.cuDeviceGetName(name, LENGTH, self._id)) name = name.split(b"\0")[0] return name.decode() @@ -1106,10 +1110,11 @@ def properties(self) -> DeviceProperties: @property def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and minor.""" - if "compute_capability" in self.properties._cache: - return self.properties._cache["compute_capability"] - cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor) - self.properties._cache["compute_capability"] = cc + cdef DeviceProperties prop = self.properties + if "compute_capability" in prop._cache: + return prop._cache["compute_capability"] + cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor) + prop._cache["compute_capability"] = cc return cc @property @@ -1190,22 +1195,25 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]: >>> # ... do work on device 0 ... 
""" + cdef cydriver.CUcontext _ctx if ctx is not None: + # TODO: revisit once Context is cythonized assert_type(ctx, Context) if ctx._id != self._id: raise RuntimeError( "the provided context was created on the device with" f" id={ctx._id}, which is different from the target id={self._id}" ) - prev_ctx = handle_return(driver.cuCtxPopCurrent()) - handle_return(driver.cuCtxPushCurrent(ctx._handle)) + # _ctx is the previous context + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&_ctx)) + HANDLE_RETURN(cydriver.cuCtxPushCurrent((ctx._handle))) self._has_inited = True - if int(prev_ctx) != 0: - return Context._from_ctx(prev_ctx, self._id) + if _ctx != NULL: + return Context._from_ctx((_ctx), self._id) else: # use primary ctx - ctx = self._get_primary_context() - handle_return(driver.cuCtxSetCurrent(ctx)) + _ctx = _get_primary_context(self._id) + HANDLE_RETURN(cydriver.cuCtxSetCurrent(_ctx)) self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: From 04d3f5dc58c88a832a79477541d9ec8ac652841b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 13:44:20 +0000 Subject: [PATCH 20/27] making the linter happy, again --- ci/tools/merge_cuda_core_wheels.py | 4 ++-- cuda_core/build_hooks.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 14c380b76..51cc97ba3 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -21,7 +21,7 @@ import argparse import os import shutil -import subprocess # nosec: B404 +import subprocess import sys import tempfile from pathlib import Path @@ -34,7 +34,7 @@ def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> sub if cwd: print(f" Working directory: {cwd}") - result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) # nosec: B603 + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) # noqa: S603 if result.returncode != 0: print(f"Command failed with return code {result.returncode}") diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index c712e92cb..7c5fd4672 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -13,7 +13,7 @@ import glob import os import re -import subprocess # nosec: B404 +import subprocess from Cython.Build import cythonize from setuptools import Extension @@ -41,7 +41,7 @@ def _get_proper_cuda_bindings_major_version() -> str: # also for local development try: - out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True) # nosec: B603, B607 + out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True) # noqa: S603, S607 m = re.search(r"CUDA Version:\s*([\d\.]+)", out.stdout.decode()) if m: return m.group(1).split(".")[0] @@ -97,9 +97,12 @@ def get_cuda_paths(): global _extensions _extensions = cythonize( - ext_modules, verbose=True, language_level=3, nthreads=nthreads, + ext_modules, + verbose=True, + language_level=3, + nthreads=nthreads, compiler_directives={"embedsignature": True, "warn.deprecated.IF": False}, - compile_time_env=compile_time_env + compile_time_env=compile_time_env, ) return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) From a5d6826ac129879febd07abb19f413a1cdfca81c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 14:31:03 +0000 Subject: [PATCH 21/27] fix uuid handling --- cuda_core/cuda/core/experimental/_device.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 589d5a42c..0d05679f1 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -1084,7 +1084,8 @@ class Device: HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) ELSE: # 13.0+ HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) - cdef bytes uuid_b = uuid.bytes + cdef bytearray uuid_b = bytearray(sizeof(uuid.bytes)) + uuid_b[:] = uuid.bytes cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}" From 578984dcac4d6ef8f3465277930e75498683e59d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 14:43:39 +0000 Subject: [PATCH 22/27] update release notes to note about compatibility requirement --- cuda_core/cuda/core/experimental/_device.pyx | 1 - cuda_core/cuda/core/experimental/_event.pyx | 1 - cuda_core/cuda/core/experimental/_stream.pxd | 1 - cuda_core/cuda/core/experimental/_stream.pyx | 1 - cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd | 1 - cuda_core/docs/source/release/0.X.Y-notes.rst | 2 ++ 6 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 0d05679f1..d3cd4bf4c 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -4,7 +4,6 @@ from libc.stdint cimport uintptr_t -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 0d5737e37..db243717f 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -6,7 +6,6 @@ from __future__ import annotations from libc.stdint cimport uintptr_t -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index f7d97de33..6b8a7f0f6 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index ee6f6be01..737fd13f9 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -6,7 +6,6 @@ from __future__ import annotations from libc.stdint cimport uintptr_t -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index c58f32610..bf570965f 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -5,7 +5,6 @@ cimport cpython from libc.stdint cimport int64_t -# TODO: how about cuda.bindings < 12.6.2? 
from cuda.bindings cimport cydriver diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 551cbe65c..5ed53c723 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -20,6 +20,7 @@ Breaking Changes ---------------- - **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. +- Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout `_. As per the ``cuda-bindings`` `support policy `_), CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y. - **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. - When :class:`Buffer` is closed, :attr:`Buffer.handle` is now set to ``None``. It was previously set to ``0`` by accident. @@ -49,3 +50,4 @@ Fixes and enhancements - Make :class:`Buffer` creation more performant. - Enabled :class:`MemoryResource` subclasses to accept :class:`Device` objects, in addition to previously supported device ordinals. - Fixed a bug in :class:`Stream` and other classes where object cleanup would error during interpreter shutdown. +- General performance improvement. 
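The release note above pins a hard floor on cuda-bindings. A minimal sketch, assuming cuda.bindings exposes __version__ and that the third-party packaging module is available, of how a build script or CI step could verify that floor before building; the helper name is illustrative and not part of this series:

    # Illustrative check of the >= 12.6.2 floor described in the release notes.
    from packaging.version import Version

    import cuda.bindings  # assumed to expose __version__, e.g. "12.9.1" or "13.0.1"


    def bindings_meets_floor(minimum: str = "12.6.2") -> bool:
        # Version() compares dotted version numbers numerically, unlike a
        # plain string comparison.
        return Version(cuda.bindings.__version__) >= Version(minimum)


    if not bindings_meets_floor():
        raise RuntimeError(
            f"cuda-bindings {cuda.bindings.__version__} is too old; "
            "cuda.core requires >= 12.6.2 (new binding module layout)"
        )
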
From 0e6f9278739d75c2f62495a6770c4cc2c2395804 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 14:48:23 +0000 Subject: [PATCH 23/27] fix env vars being passed twice --- cuda_bindings/pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index dc6c87eef..f6a3c5f40 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -59,7 +59,6 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" From 0e28aa19e2eb8c5d0ab9acb61ee37b081c833448 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 15:24:47 +0000 Subject: [PATCH 24/27] fix uuid handling, again --- cuda_core/cuda/core/experimental/_device.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index d3cd4bf4c..2808d025b 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +cimport cpython from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver @@ -1083,8 +1084,7 @@ class Device: HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) ELSE: # 13.0+ HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) - cdef bytearray uuid_b = bytearray(sizeof(uuid.bytes)) - uuid_b[:] = uuid.bytes + cdef bytes uuid_b = cpython.PyBytes_FromStringAndSize(uuid.bytes, sizeof(uuid.bytes)) cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}" From 4ba0090a740cac0743186aa1bca6dd7cfc0c04fd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 12:51:17 -0400 Subject: [PATCH 25/27] Apply suggestions from code review Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> --- ci/tools/merge_cuda_core_wheels.py | 47 ++++++++++++++---------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 51cc97ba3..992a13185 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -47,8 +47,8 @@ def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> sub def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: """Merge multiple wheels into a single wheel with version-specific binaries.""" - print("\n=== Merging wheels ===") - print(f"Input wheels: {[w.name for w in wheels]}") + print("\n=== Merging wheels ===", file=sys.stderr) + print(f"Input wheels: {[w.name for w in wheels]}", file=sys.stderr) if len(wheels) == 1: raise RuntimeError("only one wheel is provided, nothing to merge") @@ -59,11 +59,11 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: extracted_wheels = [] for i, wheel in enumerate(wheels): - print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}") + print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}", file=sys.stderr) # Extract wheel - wheel unpack creates the directory itself run_command( [ - "python", + sys.executable, "-m", "wheel", "unpack", @@ -99,18 +99,18 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] base_dir = Path("cuda") / "core" / 
"experimental" # Copy from other wheels - print(f" Copying {wheel_dir} to {base_wheel}") + print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr) shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") # Overwrite the __init__.py in versioned dirs - open(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", "w").close() + os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0) # The base dir should only contain __init__.py, the include dir, and the versioned dirs - files_to_remove = os.listdir(base_wheel / base_dir) + files_to_remove = os.scandir(base_wheel / base_dir) for f in files_to_remove: - f_abspath = base_wheel / base_dir / f - if f not in ("__init__.py", "cu12", "cu13", "include"): - if os.path.isdir(f_abspath): + f_abspath = f.path + if f.name not in ("__init__.py", "cu12", "cu13", "include"): + if f.is_dir(): shutil.rmtree(f_abspath) else: os.remove(f_abspath) @@ -119,15 +119,12 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: output_dir.mkdir(parents=True, exist_ok=True) # Create a clean wheel name without CUDA version suffixes - base_wheel_name = wheels[0].name - # Remove any .cu* suffix from the wheel name - if ".cu" in base_wheel_name: - base_wheel_name = base_wheel_name.split(".cu")[0] + ".whl" + base_wheel_name = wheels[0].with_suffix(".whl").name - print(f"Repacking merged wheel as: {base_wheel_name}") + print(f"Repacking merged wheel as: {base_wheel_name}", file=sys.stderr) run_command( [ - "python", + sys.executable, "-m", "wheel", "pack", @@ -143,7 +140,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: raise RuntimeError("Failed to create merged wheel") merged_wheel = output_wheels[0] - print(f"Successfully merged wheel: {merged_wheel}") + print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr) return merged_wheel @@ -155,32 +152,32 @@ def main(): args = parser.parse_args() - print("cuda.core Wheel Merger") - print("======================") + print("cuda.core Wheel Merger", file=sys.stderr) + print("======================", file=sys.stderr) # Convert wheel paths to Path objects and validate wheels = [] for wheel_path in args.wheels: wheel = Path(wheel_path) if not wheel.exists(): - print(f"Error: Wheel not found: {wheel}") + print(f"Error: Wheel not found: {wheel}", file=sys.stderr) sys.exit(1) if not wheel.name.endswith(".whl"): - print(f"Error: Not a wheel file: {wheel}") + print(f"Error: Not a wheel file: {wheel}", file=sys.stderr) sys.exit(1) wheels.append(wheel) if not wheels: - print("Error: No wheels provided") + print("Error: No wheels provided", file=sys.stderr) sys.exit(1) output_dir = Path(args.output_dir) # Check that we have wheel tool available try: - run_command(["python", "-m", "wheel", "--help"]) - except Exception: - print("Error: wheel package not available. Install with: pip install wheel") + import wheel + except ImportError: + print("Error: wheel package not available. 
Install with: pip install wheel", file=sys.stderr) sys.exit(1) # Merge the wheels From 20c5a99130aa5a6eec640b88ace6d8a7ae19dda1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Oct 2025 23:13:44 +0000 Subject: [PATCH 26/27] address review comments --- .github/workflows/build-wheel.yml | 2 -- ci/tools/merge_cuda_core_wheels.py | 3 ++- cuda_core/cuda/core/experimental/__init__.py | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index ab7672e7f..2c4be6695 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -155,7 +155,6 @@ jobs: path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl if-no-files-found: error - # TODO: ideally we want to build against public cuda-bindings - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: @@ -289,7 +288,6 @@ jobs: cuda-version: ${{ inputs.prev-cuda-version }} cuda-path: "./cuda_toolkit_prev" - # TODO: ideally we want to build against public cuda-bindings - name: Download cuda.bindings build artifacts from the prior branch if: ${{ matrix.python-version == '3.13t' || matrix.python-version == '3.14' diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 51cc97ba3..71ff069f3 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -1,8 +1,9 @@ +#!/usr/bin/env python3 + # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 -#!/usr/bin/env python3 """ Script to merge CUDA-specific wheels into a single multi-CUDA wheel. diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index fb0a2f469..b0383e408 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -15,14 +15,14 @@ subdir = f"cu{cuda_major}" try: - verioned_mod = importlib.import_module(f".{subdir}", __package__) + versioned_mod = importlib.import_module(f".{subdir}", __package__) # Import all symbols from the module - globals().update(verioned_mod.__dict__) + globals().update(versioned_mod.__dict__) except ImportError: # This is not a wheel build, but a conda or local build, do nothing pass else: - del verioned_mod + del versioned_mod finally: del cuda.bindings, importlib, subdir, cuda_major, cuda_minor From d79e317cd11e450a8013da6cb66ebaeffed64d3b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Oct 2025 10:14:28 -0400 Subject: [PATCH 27/27] switch to use FutureWarning --- cuda_core/cuda/core/experimental/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index b0383e408..2bdcc4f83 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -32,7 +32,7 @@ if sys.version_info < (3, 10): warnings.warn( "support for Python 3.9 and below is deprecated and subject to future removal", - category=UserWarning, + category=FutureWarning, stacklevel=1, ) del sys, warnings
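
The final patch switches the Python 3.9 deprecation to a FutureWarning. A minimal usage sketch (not part of the series) of how a downstream test suite could escalate that warning to an error; note the warning is only emitted on the first import of cuda.core.experimental in a process, and on Python 3.10+ nothing is emitted at all:

    # Illustrative only: fail fast when running on a deprecated interpreter.
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("error", FutureWarning)
        import cuda.core.experimental  # noqa: F401  # raises FutureWarning on Python 3.9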