diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index 83b447f0c..be7536c63 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -18,6 +18,11 @@ inputs:
     required: false
     type: string
     default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile"
+  cuda-path:
+    description: "where the CTK components will be installed to, relative to $PWD"
+    required: false
+    type: string
+    default: "./cuda_toolkit"
 
 runs:
   using: composite
@@ -159,18 +164,24 @@ runs:
           exit 1
         fi
 
+    - name: Move CTK to the specified location
+      if: ${{ inputs.cuda-path != './cuda_toolkit' }}
+      shell: bash --noprofile --norc -xeuo pipefail {0}
+      run: |
+        mv ./cuda_toolkit "${{ inputs.cuda-path }}"  # quoted so a path with spaces stays one argument
+
     - name: Set output environment variables
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: |
         # mimics actual CTK installation
         if [[ "${{ inputs.host-platform }}" == linux* ]]; then
-          CUDA_PATH=$(realpath "./cuda_toolkit")
-          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${CUDA_PATH}/lib" >> $GITHUB_ENV
+          CUDA_PATH=$(realpath "${{ inputs.cuda-path }}")
+          echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV  # no empty entry
         elif [[ "${{ inputs.host-platform }}" == win* ]]; then
           function normpath() {
             echo "$(echo $(cygpath -w $1) | sed 's/\\/\\\\/g')"
           }
-          CUDA_PATH=$(normpath $(realpath "./cuda_toolkit"))
+          CUDA_PATH=$(normpath $(realpath "${{ inputs.cuda-path }}"))
           echo "$(normpath ${CUDA_PATH}/bin)" >> $GITHUB_PATH
         fi
         echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV
diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml
index 472833faa..90bd005d4 100644
--- a/.github/workflows/build-wheel.yml
+++ b/.github/workflows/build-wheel.yml
@@ -11,6 +11,9 @@ on:
       cuda-version:
         required: true
         type: string
+      prev-cuda-version:
+        required: true
+        type: string
 
 defaults:
   run:
@@ -109,33 +112,6 @@ jobs:
           path: cuda_pathfinder/*.whl
           if-no-files-found: error
 
-      - name: Build cuda.core wheel
-        uses: pypa/cibuildwheel@7c619efba910c04005a835b110b057fc28fd6e93  # v3.2.0
-        with:
-          package-dir: ./cuda_core/
-          output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
-
-      - name: List the cuda.core artifacts directory
-        run: |
-          if [[ "${{ inputs.host-platform }}" == win* ]]; then
-            export CHOWN=chown
-          else
-            export CHOWN="sudo chown"
-          fi
-          $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
-          ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
-
-      - name: Check cuda.core wheel
-        run: |
-          twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
-
-      - name: Upload cuda.core build artifacts
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
-        with:
-          name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
-          path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
-          if-no-files-found: error
-
       - name: Set up mini CTK
         uses: ./.github/actions/fetch_ctk
         continue-on-error: false
@@ -148,6 +124,15 @@ jobs:
         with:
           package-dir: ./cuda_bindings/
           output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+        env:
+          CIBW_BUILD: ${{ env.CIBW_BUILD }}
+          # CIBW mounts the host filesystem under /host
+          CIBW_ENVIRONMENT_LINUX: >
+            CUDA_PATH=/host/${{ env.CUDA_PATH }}
+            CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }}
+          CIBW_ENVIRONMENT_WINDOWS: >
+            CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})"
+            CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }}
 
       - name: List the cuda.bindings artifacts directory
         run: |
@@ -170,6 +155,47 @@ jobs:
           path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
           if-no-files-found: error
 
+      - name: Build cuda.core wheel
+        uses: pypa/cibuildwheel@7c619efba910c04005a835b110b057fc28fd6e93  # v3.2.0
+        with:
+          package-dir: ./cuda_core/
+          output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
+        env:
+          CIBW_BUILD: ${{ env.CIBW_BUILD }}
+          # CIBW mounts the host filesystem under /host
+          CIBW_ENVIRONMENT_LINUX: >
+            CUDA_PATH=/host/${{ env.CUDA_PATH }}
+            CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }}
+            CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }}
+            PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+          CIBW_ENVIRONMENT_WINDOWS: >
+            CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})"
+            CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }}
+            CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }}
+            PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})"
+
+      - name: List the cuda.core artifacts directory and rename
+        run: |
+          if [[ "${{ inputs.host-platform }}" == win* ]]; then
+            export CHOWN=chown
+          else
+            export CHOWN="sudo chown"
+          fi
+          $CHOWN -R "$(whoami)" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"  # $CHOWN unquoted: it must word-split
+
+          # Rename wheel to include CUDA version suffix
+          mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}"
+          for wheel in "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/*.whl; do
+            if [[ -f "${wheel}" ]]; then
+              base_name=$(basename "${wheel}" .whl)
+              new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl"
+              mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}"
+              echo "Renamed wheel to: ${new_name}"
+            fi
+          done
+
+          ls -lahR "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"
+
       # We only need/want a single pure python wheel, pick linux-64 index 0.
       - name: Build and check cuda-python wheel
         if: ${{ strategy.job-index == 0 && inputs.host-platform == 'linux-64' }}
@@ -241,7 +267,7 @@ jobs:
 
       - name: Build cuda.core Cython tests
         run: |
-          pip install ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl --group ./cuda_core/pyproject.toml:test
+          pip install ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/"cu${BUILD_CUDA_MAJOR}"/*.whl --group ./cuda_core/pyproject.toml:test
           pushd ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}
           bash build_tests.sh
           popd
@@ -252,3 +278,107 @@ jobs:
           name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-tests
           path: ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}/test_*${{ env.PY_EXT_SUFFIX }}
           if-no-files-found: error
+
+      # Note: This second fetch_ctk run overwrites CUDA_PATH (and prepends to LD_LIBRARY_PATH / PATH)
+      - name: Set up mini CTK
+        uses: ./.github/actions/fetch_ctk
+        continue-on-error: false
+        with:
+          host-platform: ${{ inputs.host-platform }}
+          cuda-version: ${{ inputs.prev-cuda-version }}
+          cuda-path: "./cuda_toolkit_prev"
+
+      - name: Download cuda.bindings build artifacts from the prior branch
+        if: ${{ matrix.python-version == '3.13t'
+                || matrix.python-version == '3.14'
+                || matrix.python-version == '3.14t' }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if ! command -v gh >/dev/null 2>&1; then  # stdout first, then stderr, so both are silenced
+            # See https://github.com/cli/cli/blob/trunk/docs/install_linux.md#debian-ubuntu-linux-raspberry-pi-os-apt.
+            # gh is needed for artifact fetching.
+            mkdir -p -m 755 /etc/apt/keyrings \
+                  && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \
+                  && cat $out | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \
+            && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
+            && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
+            && apt update \
+            && apt install gh -y
+          fi
+
+          OLD_BRANCH=$(cat .github/BACKPORT_BRANCH)
+          OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
+          LATEST_PRIOR_RUN_ID=$(gh run list -b "${OLD_BRANCH}" -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
+          if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then
+            echo "LATEST_PRIOR_RUN_ID not found!"
+            exit 1
+          fi
+
+          gh run download "$LATEST_PRIOR_RUN_ID" -p "${OLD_BASENAME}" -R NVIDIA/cuda-python  # gh expands the pattern
+          rm -rf ${OLD_BASENAME}-tests  # exclude cython test artifacts
+          ls -al $OLD_BASENAME
+          mkdir -p "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"
+          mv $OLD_BASENAME/*.whl "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"
+          rmdir $OLD_BASENAME
+
+      - name: Build cuda.core wheel
+        uses: pypa/cibuildwheel@7c619efba910c04005a835b110b057fc28fd6e93  # v3.2.0
+        with:
+          package-dir: ./cuda_core/
+          output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
+        env:
+          CIBW_BUILD: ${{ env.CIBW_BUILD }}
+          # CIBW mounts the host filesystem under /host
+          CIBW_ENVIRONMENT_LINUX: >
+            CUDA_PATH=/host/${{ env.CUDA_PATH }}
+            CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }}
+            CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }}
+            PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+          CIBW_ENVIRONMENT_WINDOWS: >
+            CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})"
+            CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }}
+            CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }}
+            PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})"
+
+      - name: List the cuda.core artifacts directory and rename
+        run: |
+          if [[ "${{ inputs.host-platform }}" == win* ]]; then
+            export CHOWN=chown
+          else
+            export CHOWN="sudo chown"
+          fi
+          $CHOWN -R "$(whoami)" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"  # $CHOWN unquoted: it must word-split
+          ls -lahR "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"
+
+          # Rename wheel to include CUDA version suffix
+          mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}"
+          for wheel in "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/*.whl; do
+            if [[ -f "${wheel}" ]]; then
+              base_name=$(basename "${wheel}" .whl)
+              new_name="${base_name}.cu${BUILD_PREV_CUDA_MAJOR}.whl"
+              mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}/${new_name}"
+              echo "Renamed wheel to: ${new_name}"
+            fi
+          done
+
+          ls -lahR "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"
+
+      - name: Merge cuda.core wheels
+        run: |
+          pip install wheel
+          python ci/tools/merge_cuda_core_wheels.py \
+            "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_CUDA_MAJOR}"/cuda_core*.whl \
+            "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_PREV_CUDA_MAJOR}"/cuda_core*.whl \
+            --output-dir "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"
+
+      - name: Check cuda.core wheel
+        run: |
+          twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
+
+      - name: Upload cuda.core build artifacts
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
+        with:
+          name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
+          path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
+          if-no-files-found: error
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cfe8bc7a6..e8641119a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,17 +21,21 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       CUDA_BUILD_VER: ${{ steps.get-vars.outputs.cuda_build_ver }}
+      CUDA_PREV_BUILD_VER: ${{ steps.get-vars.outputs.cuda_prev_build_ver }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
         with:
           fetch-depth: 0
-      - name: Get CUDA build version
+      - name: Get CUDA build versions
         id: get-vars
         run: |
           cuda_build_ver=$(jq -r .cuda.build.version ci/versions.json)
           echo "cuda_build_ver=$cuda_build_ver" >> $GITHUB_OUTPUT
 
+          cuda_prev_build_ver=$(jq -r .cuda.prev_build.version ci/versions.json)
+          echo "cuda_prev_build_ver=$cuda_prev_build_ver" >> $GITHUB_OUTPUT
+
   should-skip:
     runs-on: ubuntu-latest
     outputs:
@@ -69,6 +73,7 @@ jobs:
     with:
       host-platform: ${{ matrix.host-platform }}
       cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
+      prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }}
 
   # WARNING: make sure all of the build jobs are in sync
   build-linux-aarch64:
@@ -87,6 +92,7 @@ jobs:
     with:
       host-platform: ${{ matrix.host-platform }}
       cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
+      prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }}
 
   # WARNING: make sure all of the build jobs are in sync
   build-windows:
@@ -105,6 +111,7 @@ jobs:
     with:
       host-platform: ${{ matrix.host-platform }}
       cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
+      prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }}
 
   # WARNING: make sure both Linux test jobs are in sync
   test-linux-64:
diff --git a/.spdx-ignore b/.spdx-ignore
index 60435ebb5..84f051faf 100644
--- a/.spdx-ignore
+++ b/.spdx-ignore
@@ -10,4 +10,4 @@ requirements*.txt
 cuda_bindings/examples/*
 
 # Vendored
-cuda_core/cuda/core/experimental/dlpack.h
+cuda_core/cuda/core/experimental/include/dlpack.h
diff --git a/ci/tools/env-vars b/ci/tools/env-vars
index de4a5a6b9..f7db5179d 100755
--- a/ci/tools/env-vars
+++ b/ci/tools/env-vars
@@ -41,6 +41,9 @@ if [[ "${1}" == "build" ]]; then
   # platform is handled by the default value of platform (`auto`) in cibuildwheel
   # here we only need to specify the python version we want
   echo "CIBW_BUILD=cp${PYTHON_VERSION_FORMATTED}-*" >> $GITHUB_ENV
+  BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})"
+  echo "BUILD_CUDA_MAJOR=${BUILD_CUDA_MAJOR}" >> $GITHUB_ENV
+  echo "BUILD_PREV_CUDA_MAJOR=$((${BUILD_CUDA_MAJOR} - 1))" >> $GITHUB_ENV
   CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${CUDA_VER}-${HOST_PLATFORM}"
 elif [[ "${1}" == "test" ]]; then
   BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${BUILD_CUDA_VER})"
diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py
new file mode 100644
index 000000000..359b98d6a
--- /dev/null
+++ b/ci/tools/merge_cuda_core_wheels.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Script to merge CUDA-specific wheels into a single multi-CUDA wheel.
+
+This script takes wheels built for different CUDA versions (cu12, cu13) and merges them
+into a single wheel that supports both CUDA versions.
+
+In particular, each wheel contains a CUDA-specific build of the `cuda.core` library
+and the associated bindings. This script merges these directories into a single wheel
+that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12`
+and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py`
+is used to import the appropriate CUDA-specific bindings.
+
+This script is based on the one in NVIDIA/CCCL.
+"""
+
+import argparse
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from typing import List
+
+
+def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> subprocess.CompletedProcess:
+    """Echo ``cmd``, run it with captured text output, and raise CalledProcessError on a non-zero exit."""
+    print(f"Running: {' '.join(cmd)}")
+    if cwd:
+        print(f"  Working directory: {cwd}")
+
+    completed = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True)  # noqa: S603
+    failed = completed.returncode != 0
+    if failed:
+        # Dump the captured streams before raising so the log shows why the command failed.
+        print(f"Command failed with return code {completed.returncode}")
+        print("STDOUT:", completed.stdout)
+        print("STDERR:", completed.stderr)
+        completed.check_returncode()  # always raises here: the return code is non-zero
+    return completed
+
+
+def merge_wheels(wheels: List[Path], output_dir: Path) -> Path:
+    """Merge per-CUDA wheels (named ``*.cu<major>.whl``) into one wheel; return the packed wheel's path."""
+    print("\n=== Merging wheels ===", file=sys.stderr)
+    print(f"Input wheels: {[w.name for w in wheels]}", file=sys.stderr)
+
+    if len(wheels) < 2:
+        raise RuntimeError("only one wheel is provided, nothing to merge")
+
+    # Extract all wheels to temporary directories
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_path = Path(temp_dir)
+        extracted_wheels = []
+
+        for i, wheel in enumerate(wheels):
+            print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}", file=sys.stderr)
+            # Extract wheel - wheel unpack creates the directory itself
+            run_command(
+                [
+                    sys.executable,
+                    "-m",
+                    "wheel",
+                    "unpack",
+                    str(wheel),
+                    "--dest",
+                    str(temp_path),
+                ]
+            )
+
+            # Find the extracted directory (wheel unpack creates a subdirectory). Earlier
+            # extractions were already renamed to wheel_<i>, so at most one entry matches.
+            candidates = (p for p in temp_path.iterdir() if p.is_dir() and p.name.startswith("cuda_core"))
+            extract_dir = next(candidates, None)
+            if not extract_dir:
+                raise RuntimeError(f"Could not find extracted wheel directory for {wheel.name}")
+
+            # Rename to our expected name
+            expected_name = temp_path / f"wheel_{i}"
+            extract_dir.rename(expected_name)
+            extracted_wheels.append(expected_name)
+
+        # Use the first wheel as the base and merge binaries from others
+        base_wheel = extracted_wheels[0]
+        base_dir = Path("cuda") / "core" / "experimental"
+        # Entries allowed to stay in the base dir; the versioned dirs are added below.
+        keep = {"__init__.py", "include"}
+
+        # now copy the version-specific directory from every wheel (the base included)
+        # into the appropriate place in the base wheel
+        for wheel, wheel_dir in zip(wheels, extracted_wheels):
+            # Each input is expected to be named "<name>.cu<major>.whl"; recover <major> from it.
+            _, sep, suffix = wheel.name.partition(".cu")
+            cuda_version = suffix.split(".")[0]
+            if not (sep and cuda_version.isdigit()):
+                raise RuntimeError(f"Cannot determine CUDA version from wheel name: {wheel.name}")
+            versioned_dir = base_wheel / base_dir / f"cu{cuda_version}"
+            print(f"  Copying {wheel_dir} to {base_wheel}", file=sys.stderr)
+            shutil.copytree(wheel_dir / base_dir, versioned_dir)
+            keep.add(versioned_dir.name)
+
+            # Overwrite the __init__.py in versioned dirs
+            os.truncate(versioned_dir / "__init__.py", 0)
+
+        # The base dir should only contain __init__.py, the include dir, and the versioned dirs.
+        # Snapshot the listing first so entries are not deleted while the directory is iterated.
+        for entry in list((base_wheel / base_dir).iterdir()):
+            if entry.name in keep:
+                continue
+            if entry.is_dir():
+                shutil.rmtree(entry)
+            else:
+                entry.unlink()
+
+        # Repack the merged wheel
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        print(f"Repacking merged wheel into: {output_dir}", file=sys.stderr)
+        run_command(
+            [
+                sys.executable,
+                "-m",
+                "wheel",
+                "pack",
+                str(base_wheel),
+                "--dest-dir",
+                str(output_dir),
+            ]
+        )
+
+        # Find the output wheel. output_dir may already hold other wheels, so take the
+        # most recently written one rather than whichever the glob happens to list first.
+        output_wheels = list(output_dir.glob("*.whl"))
+        if not output_wheels:
+            raise RuntimeError("Failed to create merged wheel")
+
+        merged_wheel = max(output_wheels, key=lambda p: p.stat().st_mtime)
+        print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr)
+        return merged_wheel
+
+
+def main():
+    """Parse the CLI arguments, validate the input wheels, and write the merged wheel to --output-dir."""
+    parser = argparse.ArgumentParser(description="Merge CUDA-specific wheels into a single multi-CUDA wheel")
+    parser.add_argument("wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge")
+    parser.add_argument("--output-dir", "-o", default="dist", help="Output directory for merged wheel")
+
+    args = parser.parse_args()
+
+    print("cuda.core Wheel Merger", file=sys.stderr)
+    print("======================", file=sys.stderr)
+
+    # Convert wheel paths to Path objects and validate
+    wheels = []
+    for wheel_path in args.wheels:
+        wheel_file = Path(wheel_path)
+        if not wheel_file.exists():
+            print(f"Error: Wheel not found: {wheel_file}", file=sys.stderr)
+            sys.exit(1)
+        if not wheel_file.name.endswith(".whl"):
+            print(f"Error: Not a wheel file: {wheel_file}", file=sys.stderr)
+            sys.exit(1)
+        wheels.append(wheel_file)
+
+    if not wheels:
+        print("Error: No wheels provided", file=sys.stderr)
+        sys.exit(1)
+
+    output_dir = Path(args.output_dir)
+
+    # Check that we have wheel tool available. It is only ever run as "python -m wheel",
+    # so probe for it with find_spec instead of importing a module we never use here.
+    import importlib.util
+    if importlib.util.find_spec("wheel") is None:
+        print("Error: wheel package not available. Install with: pip install wheel", file=sys.stderr)
+        sys.exit(1)
+
+    # Merge the wheels
+    merged_wheel = merge_wheels(wheels, output_dir)
+    print(f"\nMerge complete! Output: {merged_wheel}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ci/versions.json b/ci/versions.json
index 271c69ac3..2acfae1e3 100644
--- a/ci/versions.json
+++ b/ci/versions.json
@@ -2,6 +2,9 @@
   "cuda": {
     "build": {
       "version": "13.0.1"
+    },
+    "prev_build": {
+      "version": "12.9.1"
     }
   }
 }
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index 5545c966c..250f8e407 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -59,16 +59,11 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" }
 skip = "*-musllinux_*"
 enable = "cpython-freethreading"
 build-verbosity = 1
-environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"]
 
 [tool.cibuildwheel.linux]
 archs = "native"
-# CIBW mounts the host filesystem under /host
-environment-pass = ["CUDA_PATH"]
-environment = { CUDA_HOME = "/host/$CUDA_PATH" }
 
 [tool.cibuildwheel.windows]
 archs = "AMD64"
 before-build = "pip install delvewheel"
 repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}"
-environment = { CUDA_HOME = "$(cygpath -w $CUDA_PATH)" }
diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
new file mode 100644
index 000000000..7c5fd4672
--- /dev/null
+++ b/cuda_core/build_hooks.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# This module implements basic PEP 517 backend support, see e.g.
+# - https://peps.python.org/pep-0517/
+# - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks
+# Specifically, there are 5 APIs required to create a proper build backend, see below.
+#
+# TODO: also implement PEP-660 API hooks
+
+import functools
+import glob
+import os
+import re
+import subprocess
+
+from Cython.Build import cythonize
+from setuptools import Extension
+from setuptools import build_meta as _build_meta
+
+prepare_metadata_for_build_wheel = _build_meta.prepare_metadata_for_build_wheel
+build_sdist = _build_meta.build_sdist
+get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist
+
+
+@functools.cache
+def _get_proper_cuda_bindings_major_version() -> str:
+    """Pick the CUDA major: installed cuda.bindings > CUDA_CORE_BUILD_MAJOR > nvidia-smi > "13"."""
+    try:  # for local development (with/without build isolation)
+        import cuda.bindings
+
+        return cuda.bindings.__version__.split(".")[0]
+    except ImportError:
+        pass
+
+    # for custom overwrite, e.g. in CI
+    cuda_major = os.environ.get("CUDA_CORE_BUILD_MAJOR")
+    if cuda_major is not None:
+        return cuda_major
+
+    # also for local development
+    try:
+        out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True)  # noqa: S603, S607
+        m = re.search(r"CUDA Version:\s*([\d\.]+)", out.stdout.decode())
+        if m:
+            return m.group(1).split(".")[0]
+    except (OSError, subprocess.CalledProcessError):
+        # nvidia-smi is missing/not runnable, or it ran but failed: fall through to the default
+        pass
+
+    # default fallback
+    return "13"
+
+
+# used later by setup()
+_extensions = None
+
+
+def build_wheel(wheel_directory, config_settings=None, metadata_directory=None):
+    # Customizing this hook is needed because we must defer cythonization until cuda-bindings,
+    # now a required build-time dependency that's dynamically installed via the other hook below,
+    # is installed. Otherwise, cimport any cuda.bindings modules would fail!
+
+    # It seems setuptools' wildcard support has problems for namespace packages,
+    # so we explicitly spell out all Extension instances.
+    root_module = "cuda.core.experimental"
+    root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep
+    ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True)
+
+    def strip_prefix_suffix(filename):
+        return filename[len(root_path) : -4]
+
+    module_names = (strip_prefix_suffix(f) for f in ext_files)
+
+    @functools.cache
+    def get_cuda_paths():
+        CUDA_PATH = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME", None))
+        if not CUDA_PATH:
+            raise RuntimeError("Environment variable CUDA_PATH or CUDA_HOME is not set")
+        CUDA_PATH = CUDA_PATH.split(os.pathsep)
+        print("CUDA paths:", CUDA_PATH)
+        return CUDA_PATH
+
+    ext_modules = tuple(
+        Extension(
+            f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
+            sources=[f"cuda/core/experimental/{mod}.pyx"],
+            include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()),
+            language="c++",
+        )
+        for mod in module_names
+    )
+
+    nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", (os.cpu_count() or 1) // 2))  # cpu_count() can be None
+    compile_time_env = {"CUDA_CORE_BUILD_MAJOR": _get_proper_cuda_bindings_major_version()}
+
+    global _extensions
+    _extensions = cythonize(
+        ext_modules,
+        verbose=True,
+        language_level=3,
+        nthreads=nthreads,
+        compiler_directives={"embedsignature": True, "warn.deprecated.IF": False},
+        compile_time_env=compile_time_env,
+    )
+
+    return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory)
+
+
+def get_requires_for_build_wheel(config_settings=None):
+    major = _get_proper_cuda_bindings_major_version()  # cuda-bindings must match the CUDA major being built
+    base_requires = _build_meta.get_requires_for_build_wheel(config_settings)
+    return base_requires + [f"cuda-bindings=={major}.*"]
diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index a01134373..dc4d20222 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -2,29 +2,64 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from cuda.core.experimental import utils
-from cuda.core.experimental._device import Device
-from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._graph import (
+try:
+    import cuda.bindings
+except ImportError:
+    raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None
+else:
+    cuda_major, cuda_minor = cuda.bindings.__version__.split(".")[:2]
+    if cuda_major not in ("12", "13"):
+        raise ImportError("cuda.bindings 12.x or 13.x must be installed")
+
+import importlib
+
+subdir = f"cu{cuda_major}"
+try:
+    versioned_mod = importlib.import_module(f".{subdir}", __package__)
+    # Import all symbols from the module
+    globals().update(versioned_mod.__dict__)
+except ImportError:
+    # This is not a wheel build, but a conda or local build, do nothing
+    pass
+else:
+    del versioned_mod
+finally:  # unbind our local names only; "del cuda.bindings" would strip the attribute off the cuda package
+    del cuda, importlib, subdir, cuda_major, cuda_minor
+
+import sys  # noqa: E402
+import warnings  # noqa: E402
+
+if sys.version_info < (3, 10):
+    warnings.warn(
+        "support for Python 3.9 and below is deprecated and subject to future removal",
+        category=FutureWarning,
+        stacklevel=1,
+    )
+del sys, warnings
+
+from cuda.core.experimental import utils  # noqa: E402
+from cuda.core.experimental._device import Device  # noqa: E402
+from cuda.core.experimental._event import Event, EventOptions  # noqa: E402
+from cuda.core.experimental._graph import (  # noqa: E402
     Graph,
     GraphBuilder,
     GraphCompleteOptions,
     GraphDebugPrintOptions,
 )
-from cuda.core.experimental._launch_config import LaunchConfig
-from cuda.core.experimental._launcher import launch
-from cuda.core.experimental._linker import Linker, LinkerOptions
-from cuda.core.experimental._memory import (
+from cuda.core.experimental._launch_config import LaunchConfig  # noqa: E402
+from cuda.core.experimental._launcher import launch  # noqa: E402
+from cuda.core.experimental._linker import Linker, LinkerOptions  # noqa: E402
+from cuda.core.experimental._memory import (  # noqa: E402
     Buffer,
     DeviceMemoryResource,
     DeviceMemoryResourceOptions,
     LegacyPinnedMemoryResource,
     MemoryResource,
 )
-from cuda.core.experimental._module import Kernel, ObjectCode
-from cuda.core.experimental._program import Program, ProgramOptions
-from cuda.core.experimental._stream import Stream, StreamOptions
-from cuda.core.experimental._system import System
+from cuda.core.experimental._module import Kernel, ObjectCode  # noqa: E402
+from cuda.core.experimental._program import Program, ProgramOptions  # noqa: E402
+from cuda.core.experimental._stream import Stream, StreamOptions  # noqa: E402
+from cuda.core.experimental._system import System  # noqa: E402
 
 system = System()
 __import__("sys").modules[__spec__.name + ".system"] = system
diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.pyx
similarity index 91%
rename from cuda_core/cuda/core/experimental/_device.py
rename to cuda_core/cuda/core/experimental/_device.pyx
index be8c5170a..ae1c7f38c 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -2,6 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+cimport cpython
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings cimport cydriver
+
+from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
+
 import threading
 from typing import Optional, Union
 
@@ -14,41 +21,44 @@
 from cuda.core.experimental._utils.cuda_utils import (
     ComputeCapability,
     CUDAError,
-    _check_driver_error,
     driver,
     handle_return,
     runtime,
 )
 
+
 _tls = threading.local()
 _lock = threading.Lock()
-_is_cuInit = False
+cdef bint _is_cuInit = False
 
 
-class DeviceProperties:
+cdef class DeviceProperties:
     """
     A class to query various attributes of a CUDA device.
 
     Attributes are read-only and provide information about the device.
     """
+    cdef:
+        int _handle
+        dict _cache
 
-    def __new__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs):
         raise RuntimeError("DeviceProperties cannot be instantiated directly. Please use Device APIs.")
 
-    __slots__ = ("_handle", "_cache")
-
     @classmethod
     def _init(cls, handle):
-        self = super().__new__(cls)
+        cdef DeviceProperties self = DeviceProperties.__new__(cls)
         self._handle = handle
         self._cache = {}
         return self
 
-    def _get_attribute(self, attr):
+    cdef inline _get_attribute(self, cydriver.CUdevice_attribute attr):
         """Retrieve the attribute value directly from the driver."""
-        return handle_return(driver.cuDeviceGetAttribute(attr, self._handle))
+        cdef int val
+        HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle))
+        return val
 
-    def _get_cached_attribute(self, attr):
+    cdef _get_cached_attribute(self, attr):
         """Retrieve the attribute value, using cache if applicable."""
         if attr not in self._cache:
             self._cache[attr] = self._get_attribute(attr)
@@ -931,8 +941,17 @@ def multicast_supported(self) -> bool:
         return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED))
 
 
-_SUCCESS = driver.CUresult.CUDA_SUCCESS
-_INVALID_CTX = driver.CUresult.CUDA_ERROR_INVALID_CONTEXT
+cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL:
+    try:
+        primary_ctxs = _tls.primary_ctxs
+    except AttributeError:
+        total = len(_tls.devices)
+        primary_ctxs = _tls.primary_ctxs = [0] * total
+    cdef cydriver.CUcontext ctx = <cydriver.CUcontext><uintptr_t>(primary_ctxs[dev_id])
+    if ctx == NULL:
+        HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
+        primary_ctxs[dev_id] = <uintptr_t>(ctx)
+    return ctx
 
 
 class Device:
@@ -961,55 +980,56 @@ class Device:
         Default value of `None` return the currently used device.
 
     """
-
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id: Optional[int] = None):
         global _is_cuInit
         if _is_cuInit is False:
             with _lock:
-                handle_return(driver.cuInit(0))
+                HANDLE_RETURN(cydriver.cuInit(0))
                 _is_cuInit = True
 
         # important: creating a Device instance does not initialize the GPU!
+        cdef cydriver.CUdevice dev
+        cdef cydriver.CUcontext ctx
         if device_id is None:
-            err, dev = driver.cuCtxGetDevice()
-            if err == _SUCCESS:
+            err = cydriver.cuCtxGetDevice(&dev)
+            if err == cydriver.CUresult.CUDA_SUCCESS:
                 device_id = int(dev)
-            elif err == _INVALID_CTX:
-                ctx = handle_return(driver.cuCtxGetCurrent())
-                assert int(ctx) == 0
+            elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT:
+                HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+                assert <void*>(ctx) == NULL
                 device_id = 0  # cudart behavior
             else:
-                _check_driver_error(err)
+                HANDLE_RETURN(err)
         elif device_id < 0:
             raise ValueError(f"device_id must be >= 0, got {device_id}")
 
         # ensure Device is singleton
+        cdef int total, attr
         try:
             devices = _tls.devices
         except AttributeError:
-            total = handle_return(driver.cuDeviceGetCount())
+            HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
             devices = _tls.devices = []
             for dev_id in range(total):
-                dev = super().__new__(cls)
-                dev._id = dev_id
+                device = super().__new__(cls)
+                device._id = dev_id
                 # If the device is in TCC mode, or does not support memory pools for some other reason,
                 # use the SynchronousMemoryResource which does not use memory pools.
-                if (
-                    handle_return(
-                        driver.cuDeviceGetAttribute(
-                            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id
-                        )
+                HANDLE_RETURN(
+                    cydriver.cuDeviceGetAttribute(
+                        &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id
                     )
-                ) == 1:
-                    dev._mr = DeviceMemoryResource(dev_id)
+                )
+                if attr == 1:
+                    device._mr = DeviceMemoryResource(dev_id)
                 else:
-                    dev._mr = _SynchronousMemoryResource(dev_id)
+                    device._mr = _SynchronousMemoryResource(dev_id)
 
-                dev._has_inited = False
-                dev._properties = None
-                devices.append(dev)
+                device._has_inited = False
+                device._properties = None
+                devices.append(device)
 
         try:
             return devices[device_id]
@@ -1022,36 +1042,17 @@ def _check_context_initialized(self):
                 f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
             )
 
-    def _get_primary_context(self) -> driver.CUcontext:
-        try:
-            primary_ctxs = _tls.primary_ctxs
-        except AttributeError:
-            total = len(_tls.devices)
-            primary_ctxs = _tls.primary_ctxs = [None] * total
-        ctx = primary_ctxs[self._id]
-        if ctx is None:
-            ctx = handle_return(driver.cuDevicePrimaryCtxRetain(self._id))
-            primary_ctxs[self._id] = ctx
-        return ctx
-
     def _get_current_context(self, check_consistency=False) -> driver.CUcontext:
-        err, ctx = driver.cuCtxGetCurrent()
-
-        # TODO: We want to just call this:
-        # _check_driver_error(err)
-        # but even the simplest success check causes 50-100 ns. Wait until we cythonize this file...
-        if ctx is None:
-            _check_driver_error(err)
-
-        if int(ctx) == 0:
+        cdef cydriver.CUcontext ctx
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+        if ctx == NULL:
             raise CUDAError("No context is bound to the calling CPU thread.")
+        cdef cydriver.CUdevice dev
         if check_consistency:
-            err, dev = driver.cuCtxGetDevice()
-            if err != _SUCCESS:
-                handle_return((err,))
-            if int(dev) != self._id:
+            HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+            if <int>(dev) != self._id:
                 raise CUDAError("Internal error (current device is not equal to Device.device_id)")
-        return ctx
+        return driver.CUcontext(<uintptr_t>ctx)
 
     @property
     def device_id(self) -> int:
@@ -1078,20 +1079,23 @@ def uuid(self) -> str:
         driver is older than CUDA 11.4.
 
         """
-        driver_ver = handle_return(driver.cuDriverGetVersion())
-        if 11040 <= driver_ver < 13000:
-            uuid = handle_return(driver.cuDeviceGetUuid_v2(self._id))
-        else:
-            uuid = handle_return(driver.cuDeviceGetUuid(self._id))
-        uuid = uuid.bytes.hex()
+        cdef cydriver.CUuuid uuid
+        IF CUDA_CORE_BUILD_MAJOR == "12":
+            HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id))
+        ELSE:  # 13.0+
+            HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id))
+        cdef bytes uuid_b = cpython.PyBytes_FromStringAndSize(uuid.bytes, sizeof(uuid.bytes))
+        cdef str uuid_hex = uuid_b.hex()
         # 8-4-4-4-12
-        return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"
+        return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}"
 
     @property
     def name(self) -> str:
         """Return the device name."""
         # Use 256 characters to be consistent with CUDA Runtime
-        name = handle_return(driver.cuDeviceGetName(256, self._id))
+        cdef int LENGTH = 256
+        cdef bytes name = bytes(LENGTH)
+        HANDLE_RETURN(cydriver.cuDeviceGetName(<char*>name, LENGTH, self._id))
         name = name.split(b"\0")[0]
         return name.decode()
 
@@ -1106,10 +1110,11 @@ def properties(self) -> DeviceProperties:
     @property
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
-        if "compute_capability" in self.properties._cache:
-            return self.properties._cache["compute_capability"]
-        cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor)
-        self.properties._cache["compute_capability"] = cc
+        cdef DeviceProperties prop = self.properties
+        if "compute_capability" in prop._cache:
+            return prop._cache["compute_capability"]
+        cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor)
+        prop._cache["compute_capability"] = cc
         return cc
 
     @property
@@ -1193,22 +1198,25 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]:
         >>> # ... do work on device 0 ...
 
         """
+        cdef cydriver.CUcontext _ctx
         if ctx is not None:
+            # TODO: revisit once Context is cythonized
             assert_type(ctx, Context)
             if ctx._id != self._id:
                 raise RuntimeError(
                     "the provided context was created on the device with"
                     f" id={ctx._id}, which is different from the target id={self._id}"
                 )
-            prev_ctx = handle_return(driver.cuCtxPopCurrent())
-            handle_return(driver.cuCtxPushCurrent(ctx._handle))
+            # _ctx is the previous context
+            HANDLE_RETURN(cydriver.cuCtxPopCurrent(&_ctx))
+            HANDLE_RETURN(cydriver.cuCtxPushCurrent(<cydriver.CUcontext>(ctx._handle)))
             self._has_inited = True
-            if int(prev_ctx) != 0:
-                return Context._from_ctx(prev_ctx, self._id)
+            if _ctx != NULL:
+                return Context._from_ctx(<uintptr_t>(_ctx), self._id)
         else:
             # use primary ctx
-            ctx = self._get_primary_context()
-            handle_return(driver.cuCtxSetCurrent(ctx))
+            _ctx = _get_primary_context(self._id)
+            HANDLE_RETURN(cydriver.cuCtxSetCurrent(_ctx))
             self._has_inited = True
 
     def create_context(self, options: ContextOptions = None) -> Context:
diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd b/cuda_core/cuda/core/experimental/_dlpack.pxd
index 843beb873..d61b6a2bc 100644
--- a/cuda_core/cuda/core/experimental/_dlpack.pxd
+++ b/cuda_core/cuda/core/experimental/_dlpack.pxd
@@ -14,7 +14,7 @@ from libc.stdint cimport uint64_t
 from libc.stdint cimport intptr_t
 
 
-cdef extern from "dlpack.h" nogil:
+cdef extern from "include/dlpack.h" nogil:
     """
     #define DLPACK_TENSOR_UNUSED_NAME "dltensor"
     #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned"
diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index 41c0b1ce6..db243717f 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -4,9 +4,13 @@
 
 from __future__ import annotations
 
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings cimport cydriver
+
 from cuda.core.experimental._utils.cuda_utils cimport (
-    _check_driver_error as raise_if_driver_error,
     check_or_create_options,
+    HANDLE_RETURN
 )
 
 from dataclasses import dataclass
@@ -78,12 +82,15 @@ cdef class Event:
 
     """
     cdef:
-        object _handle
+        cydriver.CUevent _handle
         bint _timing_disabled
         bint _busy_waited
         int _device_id
         object _ctx_handle
 
+    def __cinit__(self):
+        self._handle = <cydriver.CUevent>(NULL)
+
     def __init__(self, *args, **kwargs):
         raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")
 
@@ -91,19 +98,18 @@ cdef class Event:
     def _init(cls, device_id: int, ctx_handle: Context, options=None):
         cdef Event self = Event.__new__(cls)
         cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options")
-        flags = 0x0
+        cdef unsigned int flags = 0x0
         self._timing_disabled = False
         self._busy_waited = False
         if not opts.enable_timing:
-            flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING
+            flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING
             self._timing_disabled = True
         if opts.busy_waited_sync:
-            flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
+            flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
             self._busy_waited = True
         if opts.support_ipc:
             raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
-        err, self._handle = driver.cuEventCreate(flags)
-        raise_if_driver_error(err)
+        HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags))
         self._device_id = device_id
         self._ctx_handle = ctx_handle
         return self
@@ -111,10 +117,9 @@ cdef class Event:
     cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing):
         if is_shutting_down and is_shutting_down():
             return
-        if self._handle is not None:
-            err, = driver.cuEventDestroy(self._handle)
-            self._handle = None
-            raise_if_driver_error(err)
+        if self._handle != NULL:
+            HANDLE_RETURN(cydriver.cuEventDestroy(self._handle))
+            self._handle = <cydriver.CUevent>(NULL)
 
     cpdef close(self):
         """Destroy the event."""
@@ -129,14 +134,14 @@ cdef class Event:
     def __rsub__(self, other):
         return NotImplemented
 
-    def __sub__(self, other):
+    def __sub__(self, other: Event):
         # return self - other (in milliseconds)
-        err, timing = driver.cuEventElapsedTime(other.handle, self._handle)
-        try:
-            raise_if_driver_error(err)
+        cdef float timing
+        err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle)
+        if err == 0:
             return timing
-        except CUDAError as e:
-            if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
+        else:
+            if err == cydriver.CUresult.CUDA_ERROR_INVALID_HANDLE:
                 if self.is_timing_disabled or other.is_timing_disabled:
                     explanation = (
                         "Both Events must be created with timing enabled in order to subtract them; "
@@ -147,15 +152,15 @@ cdef class Event:
                         "Both Events must be recorded before they can be subtracted; "
                         "use Stream.record() to record both events to a stream."
                     )
-            elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
+            elif err == cydriver.CUresult.CUDA_ERROR_NOT_READY:
                 explanation = (
                     "One or both events have not completed; "
                     "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
                     "before subtracting them."
                 )
             else:
-                raise e
-            raise RuntimeError(explanation) from e
+                raise CUDAError(err)
+            raise RuntimeError(explanation)
 
     @property
     def is_timing_disabled(self) -> bool:
@@ -182,17 +187,17 @@ cdef class Event:
         has been completed.
 
         """
-        handle_return(driver.cuEventSynchronize(self._handle))
+        HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle))
 
     @property
     def is_done(self) -> bool:
         """Return True if all captured works have been completed, otherwise False."""
-        result, = driver.cuEventQuery(self._handle)
-        if result == driver.CUresult.CUDA_SUCCESS:
+        result = cydriver.cuEventQuery(self._handle)
+        if result == cydriver.CUresult.CUDA_SUCCESS:
             return True
-        if result == driver.CUresult.CUDA_ERROR_NOT_READY:
+        if result == cydriver.CUresult.CUDA_ERROR_NOT_READY:
             return False
-        handle_return(result)
+        HANDLE_RETURN(result)
 
     @property
     def handle(self) -> cuda.bindings.driver.CUevent:
@@ -203,7 +208,7 @@ cdef class Event:
             This handle is a Python object. To get the memory address of the underlying C
             handle, call ``int(Event.handle)``.
         """
-        return self._handle
+        return driver.CUevent(<uintptr_t>(self._handle))
 
     @property
     def device(self) -> Device:
diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.pyx
similarity index 93%
rename from cuda_core/cuda/core/experimental/_launcher.py
rename to cuda_core/cuda/core/experimental/_launcher.pyx
index 2d0c274c7..ae808be89 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.pyx
@@ -2,12 +2,16 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from libc.stdint cimport uintptr_t
+
+from cuda.core.experimental._stream cimport _try_to_get_stream_ptr
+
 from typing import Union
 
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
 from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config
 from cuda.core.experimental._module import Kernel
-from cuda.core.experimental._stream import IsStreamT, Stream, _try_to_get_stream_ptr
+from cuda.core.experimental._stream import IsStreamT, Stream
 from cuda.core.experimental._utils.clear_error_support import assert_type
 from cuda.core.experimental._utils.cuda_utils import (
     _reduce_3_tuple,
@@ -60,7 +64,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
         stream_handle = stream.handle
     except AttributeError:
         try:
-            stream_handle = _try_to_get_stream_ptr(stream)
+            stream_handle = driver.CUstream(<uintptr_t>(_try_to_get_stream_ptr(stream)))
         except Exception:
             raise ValueError(
                 f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd
new file mode 100644
index 000000000..6b8a7f0f6
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_stream.pxd
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+
+
+cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index a2c1a90b9..737fd13f9 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -4,10 +4,15 @@
 
 from __future__ import annotations
 
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings cimport cydriver
+
 from cuda.core.experimental._utils.cuda_utils cimport (
-    _check_driver_error as raise_if_driver_error,
     check_or_create_options,
+    HANDLE_RETURN,
 )
+
 import sys
 
 import cython
@@ -59,7 +64,7 @@ class IsStreamT(Protocol):
         ...
 
 
-def _try_to_get_stream_ptr(obj: IsStreamT):
+cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*:
     try:
         cuda_stream_attr = obj.__cuda_stream__
     except AttributeError:
@@ -86,7 +91,7 @@ def _try_to_get_stream_ptr(obj: IsStreamT):
         raise RuntimeError(
             f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}"
         )
-    return driver.CUstream(info[1])
+    return <cydriver.CUstream><uintptr_t>(info[1])
 
 
 cdef class Stream:
@@ -108,7 +113,7 @@ cdef class Stream:
     """
 
     cdef:
-        object _handle
+        cydriver.CUstream _handle
         object _owner
         object _builtin
         object _nonblocking
@@ -116,6 +121,9 @@ cdef class Stream:
         object _device_id
         object _ctx_handle
 
+    def __cinit__(self):
+        self._handle = <cydriver.CUstream>(NULL)
+
     def __init__(self, *args, **kwargs):
         raise RuntimeError(
             "Stream objects cannot be instantiated directly. "
@@ -125,7 +133,7 @@ cdef class Stream:
     @classmethod
     def _legacy_default(cls):
         cdef Stream self = Stream.__new__(cls)
-        self._handle = driver.CUstream(driver.CU_STREAM_LEGACY)
+        self._handle = <cydriver.CUstream>(cydriver.CU_STREAM_LEGACY)
         self._owner = None
         self._builtin = True
         self._nonblocking = None  # delayed
@@ -137,7 +145,7 @@ cdef class Stream:
     @classmethod
     def _per_thread_default(cls):
         cdef Stream self = Stream.__new__(cls)
-        self._handle = driver.CUstream(driver.CU_STREAM_PER_THREAD)
+        self._handle = <cydriver.CUstream>(cydriver.CU_STREAM_PER_THREAD)
         self._owner = None
         self._builtin = True
         self._nonblocking = None  # delayed
@@ -149,7 +157,6 @@ cdef class Stream:
     @classmethod
     def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None):
         cdef Stream self = Stream.__new__(cls)
-        self._handle = None
         self._owner = None
         self._builtin = False
 
@@ -169,16 +176,18 @@ cdef class Stream:
         nonblocking = opts.nonblocking
         priority = opts.priority
 
-        flags = driver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else driver.CUstream_flags.CU_STREAM_DEFAULT
-        err, high, low = driver.cuCtxGetStreamPriorityRange()
-        raise_if_driver_error(err)
+        flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT
+        cdef int high, low
+        HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low))
         if priority is not None:
             if not (low <= priority <= high):
                 raise ValueError(f"{priority=} is out of range {[low, high]}")
         else:
             priority = high
 
-        self._handle = handle_return(driver.cuStreamCreateWithPriority(flags, priority))
+        cdef cydriver.CUstream s
+        HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, priority))
+        self._handle = s
         self._owner = None
         self._nonblocking = nonblocking
         self._priority = priority
@@ -195,10 +204,10 @@ cdef class Stream:
 
         if self._owner is None:
             if self._handle and not self._builtin:
-                handle_return(driver.cuStreamDestroy(self._handle))
+                HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle))
         else:
             self._owner = None
-        self._handle = None
+        self._handle = <cydriver.CUstream>(NULL)
 
     cpdef close(self):
         """Destroy the stream.
@@ -222,14 +231,15 @@ cdef class Stream:
             This handle is a Python object. To get the memory address of the underlying C
             handle, call ``int(Stream.handle)``.
         """
-        return self._handle
+        return driver.CUstream(<uintptr_t>(self._handle))
 
     @property
     def is_nonblocking(self) -> bool:
         """Return True if this is a nonblocking stream, otherwise False."""
+        cdef unsigned int flags
         if self._nonblocking is None:
-            flag = handle_return(driver.cuStreamGetFlags(self._handle))
-            if flag == driver.CUstream_flags.CU_STREAM_NON_BLOCKING:
+            HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags))
+            if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING:
                 self._nonblocking = True
             else:
                 self._nonblocking = False
@@ -238,14 +248,15 @@ cdef class Stream:
     @property
     def priority(self) -> int:
         """Return the stream priority."""
+        cdef int prio
         if self._priority is None:
-            prio = handle_return(driver.cuStreamGetPriority(self._handle))
+            HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio))
             self._priority = prio
         return self._priority
 
     def sync(self):
         """Synchronize the stream."""
-        handle_return(driver.cuStreamSynchronize(self._handle))
+        HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle))
 
     def record(self, event: Event = None, options: EventOptions = None) -> Event:
         """Record an event onto the stream.
@@ -272,8 +283,8 @@ cdef class Stream:
         if event is None:
             self._get_device_and_context()
             event = Event._init(self._device_id, self._ctx_handle, options)
-        err, = driver.cuEventRecord(event.handle, self._handle)
-        raise_if_driver_error(err)
+        # TODO: revisit after Event is cythonized
+        HANDLE_RETURN(cydriver.cuEventRecord(<cydriver.CUevent><uintptr_t>(event.handle), self._handle))
         return event
 
     def wait(self, event_or_stream: Union[Event, Stream]):
@@ -286,28 +297,33 @@ cdef class Stream:
         on the stream and then waiting on it.
 
         """
+        cdef cydriver.CUevent event
+        cdef cydriver.CUstream stream
+        cdef bint discard_event
+
         if isinstance(event_or_stream, Event):
-            event = event_or_stream.handle
+            event = <cydriver.CUevent><uintptr_t>(event_or_stream.handle)
             discard_event = False
         else:
             if isinstance(event_or_stream, Stream):
-                stream = event_or_stream
+                stream = <cydriver.CUstream><uintptr_t>(event_or_stream.handle)
             else:
                 try:
-                    stream = Stream._init(obj=event_or_stream)
+                    s = Stream._init(obj=event_or_stream)
                 except Exception as e:
                     raise ValueError(
                         "only an Event, Stream, or object supporting __cuda_stream__ can be waited,"
                         f" got {type(event_or_stream)}"
                     ) from e
-            event = handle_return(driver.cuEventCreate(driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
-            handle_return(driver.cuEventRecord(event, stream.handle))
+                stream = <cydriver.CUstream><uintptr_t>(s.handle)
+            HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
+            HANDLE_RETURN(cydriver.cuEventRecord(event, stream))
             discard_event = True
 
         # TODO: support flags other than 0?
-        handle_return(driver.cuStreamWaitEvent(self._handle, event, 0))
+        HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0))
         if discard_event:
-            handle_return(driver.cuEventDestroy(event))
+            HANDLE_RETURN(cydriver.cuEventDestroy(event))
 
     @property
     def device(self) -> Device:
@@ -325,9 +341,11 @@ cdef class Stream:
         return Device(self._device_id)
 
     cdef int _get_context(Stream self) except?-1:
+        # TODO: consider making self._ctx_handle typed?
+        cdef cydriver.CUcontext ctx
         if self._ctx_handle is None:
-            err, self._ctx_handle = driver.cuStreamGetCtx(self._handle)
-            raise_if_driver_error(err)
+            HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx))
+            self._ctx_handle = driver.CUcontext(<uintptr_t>ctx)
         return 0
 
     cdef int _get_device_and_context(Stream self) except?-1:
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
index 601736c47..bf570965f 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
@@ -2,18 +2,29 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-
 cimport cpython
-cimport libc.stdint
+from libc.stdint cimport int64_t
+
+from cuda.bindings cimport cydriver
+
+
+ctypedef fused supported_error_type:
+    cydriver.CUresult
 
 
+cdef int HANDLE_RETURN(supported_error_type err) except?-1
+
+
+# TODO: stop exposing these within the codebase?
 cpdef int _check_driver_error(error) except?-1
 cpdef int _check_runtime_error(error) except?-1
 cpdef int _check_nvrtc_error(error) except?-1
+
+
 cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*)
 
 
-cdef inline tuple carray_int64_t_to_tuple(libc.stdint.int64_t *ptr, int length):
+cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length):
     # Construct shape and strides tuples using the Python/C API for speed
     result = cpython.PyTuple_New(length)
     for i in range(length):
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx
index 86588f733..c095e7564 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx
@@ -52,6 +52,12 @@ def _reduce_3_tuple(t: tuple):
     return t[0] * t[1] * t[2]
 
 
+cdef int HANDLE_RETURN(supported_error_type err) except?-1:
+    if supported_error_type is cydriver.CUresult:
+        if err != cydriver.CUresult.CUDA_SUCCESS:
+            return _check_driver_error(err)
+
+
 cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS
 cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess
 cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS
diff --git a/cuda_core/cuda/core/experimental/dlpack.h b/cuda_core/cuda/core/experimental/include/dlpack.h
similarity index 100%
rename from cuda_core/cuda/core/experimental/dlpack.h
rename to cuda_core/cuda/core/experimental/include/dlpack.h
diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst
index 454a9d465..e87cbdee3 100644
--- a/cuda_core/docs/source/release/0.X.Y-notes.rst
+++ b/cuda_core/docs/source/release/0.X.Y-notes.rst
@@ -12,6 +12,7 @@ Released on TBD
 Highlights
 ----------
 
+- This is the last release that officially supports Python 3.9.
 - Fix for :class:`LaunchConfig` grid parameter unit conversion when thread block clusters are used.
 
 
@@ -19,6 +20,7 @@ Breaking Changes
 ----------------
 
 - **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x.
+- Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout <https://nvidia.github.io/cuda-python/cuda-bindings/latest/release/12.6.1-notes.html#cuda-namespace-cleanup-with-a-new-module-layout>`_. As per the ``cuda-bindings`` `support policy <https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html>`_, CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y.
 - **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``.
 - When :class:`Buffer` is closed, :attr:`Buffer.handle` is now set to ``None``. It was previously set to ``0`` by accident.
 
@@ -49,3 +51,4 @@ Fixes and enhancements
 - Enabled :class:`MemoryResource` subclasses to accept :class:`Device` objects, in addition to previously supported device ordinals.
 - Fixed a bug in :class:`Stream` and other classes where object cleanup would error during interpreter shutdown.
 - :class:`StridedMemoryView` of an underlying array using the DLPack protocol will no longer leak memory.
+- General performance improvements.
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index 4e4ab5028..8bbfca07d 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -4,7 +4,8 @@
 
 [build-system]
 requires = ["setuptools>=77.0.0", "Cython>=3.1"]
-build-backend = "setuptools.build_meta"
+build-backend = "build_hooks"
+backend-path = ["."]
 
 
 [project]
@@ -38,21 +39,19 @@ classifiers = [
     "Programming Language :: Python :: 3.13",
     "Programming Language :: Python :: Implementation :: CPython",
     "Environment :: GPU :: NVIDIA CUDA",
-    "Environment :: GPU :: NVIDIA CUDA :: 11",
     "Environment :: GPU :: NVIDIA CUDA :: 12",
+    "Environment :: GPU :: NVIDIA CUDA :: 13",
 ]
 dependencies = [
     "numpy",
 ]
 
 [project.optional-dependencies]
-cu11 = ["cuda-bindings[all]==11.8.*"]
 cu12 = ["cuda-bindings[all]==12.*"]
 cu13 = ["cuda-bindings[all]==13.*"]
 
 [dependency-groups]
 test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"]
-test-cu11 = ["cuda-core[test]", "cupy-cuda11x; python_version < '3.14'", "cuda-toolkit[cudart]==11.*"]  # runtime headers needed by CuPy
 test-cu12 = ["cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"]  # runtime headers needed by CuPy
 test-cu13 = ["cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"]  # runtime headers needed by CuPy
 # free threaded build, cupy doesn't support free-threaded builds yet, so avoid installing it for now
@@ -79,7 +78,6 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" }
 skip = "*-musllinux_*"
 enable = "cpython-freethreading"
 build-verbosity = 1
-environment-pass = ["CUDA_PYTHON_PARALLEL_LEVEL"]
 
 [tool.cibuildwheel.linux]
 archs = "native"
diff --git a/cuda_core/setup.py b/cuda_core/setup.py
index d93eec45d..4a501edc1 100644
--- a/cuda_core/setup.py
+++ b/cuda_core/setup.py
@@ -2,38 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import glob
 import os
 
-from Cython.Build import cythonize
-from setuptools import Extension, setup
+import build_hooks  # our build backend
+from setuptools import setup
 from setuptools.command.build_ext import build_ext as _build_ext
 
 nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2))
 
 
-# It seems setuptools' wildcard support has problems for namespace packages,
-# so we explicitly spell out all Extension instances.
-root_module = "cuda.core.experimental"
-root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep
-ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True)
-
-
-def strip_prefix_suffix(filename):
-    return filename[len(root_path) : -4]
-
-
-module_names = (strip_prefix_suffix(f) for f in ext_files)
-ext_modules = tuple(
-    Extension(
-        f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
-        sources=[f"cuda/core/experimental/{mod}.pyx"],
-        language="c++",
-    )
-    for mod in module_names
-)
-
-
 class build_ext(_build_ext):
     def build_extensions(self):
         self.parallel = nthreads
@@ -41,7 +18,7 @@ def build_extensions(self):
 
 
 setup(
-    ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}),
+    ext_modules=build_hooks._extensions,
     cmdclass={
         "build_ext": build_ext,
     },