From 4aadce2b6b0933557c6831eb37dd3cf9419bf47c Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 02:14:28 +0000
Subject: [PATCH 1/9] Add nightly CI pipeline for optional-dependency testing
 (PyTorch, numba-cuda)

Add ci-nightly.yml that downloads wheels from the latest successful CI
run on main and tests them against PyTorch and numba-cuda, without
rebuilding.

Key changes:
- ci-nightly.yml: new orchestrator (schedule 2 AM UTC + workflow_dispatch)
- test-wheel-linux/windows.yml: add run-id input for cross-run artifact
  downloads, and test-mode input (standard/nightly-pytorch/nightly-numba-cuda)
  with conditional test steps
- ci/test-matrix.yml: add nightly entries with MODE field (8 pytorch
  across linux-64 and win-64, plus 6 numba-cuda across linux-64,
  linux-aarch64, and win-64)
- ci/tools/run-tests: add nightly-install mode that installs all wheels
  without running standard tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci-nightly.yml         | 177 +++++++++++++++++++++++
 .github/workflows/test-wheel-linux.yml   |  73 +++++++++-
 .github/workflows/test-wheel-windows.yml |  76 +++++++++-
 ci/test-matrix.yml                       |  22 ++-
 ci/tools/run-tests                       |  35 ++++-
 5 files changed, 376 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/ci-nightly.yml

diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
new file mode 100644
index 00000000000..dda13abb5e2
--- /dev/null
+++ b/.github/workflows/ci-nightly.yml
@@ -0,0 +1,177 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Nightly CI pipeline that tests optional dependencies (PyTorch, numba-cuda)
+# against the latest cuda-python wheels built on main.
+#
+# This workflow does NOT build wheels — it downloads them from the latest
+# successful CI run on main and runs integration tests with optional deps.
+
+name: "CI: Nightly optional-deps"
+
+on:
+  schedule:
+    # 2 AM UTC daily, after the midnight main CI build finishes
+    - cron: "0 2 * * *"
+  workflow_dispatch:
+    inputs:
+      run-id:
+        description: >
+          Override the CI run ID to download artifacts from.
+          Leave empty to auto-detect the latest successful main run.
+        type: string
+        default: ''
+
+jobs:
+  find-wheels:
+    runs-on: ubuntu-latest
+    outputs:
+      RUN_ID: ${{ steps.find.outputs.run_id }}
+      CUDA_BUILD_VER: ${{ steps.get-vars.outputs.cuda_build_ver }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          fetch-depth: 0
+
+      - name: Get CUDA build versions
+        id: get-vars
+        run: |
+          cuda_build_ver=$(yq '.cuda.build.version' ci/versions.yml)
+          echo "cuda_build_ver=$cuda_build_ver" >> $GITHUB_OUTPUT
+
+      - name: Find latest successful CI run on main
+        id: find
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if [[ -n "${{ inputs.run-id }}" ]]; then
+            echo "run_id=${{ inputs.run-id }}" >> $GITHUB_OUTPUT
+            echo "Using manually specified run ID: ${{ inputs.run-id }}"
+            exit 0
+          fi
+
+          RUN_ID=$(gh run list \
+            -b main \
+            -L 1 \
+            -w "CI" \
+            -s success \
+            -R "${{ github.repository }}" \
+            --json databaseId \
+            | jq -r '.[0].databaseId')
+
+          if [[ -z "$RUN_ID" || "$RUN_ID" == "null" ]]; then
+            echo "::error::No successful CI run found on main"
+            exit 1
+          fi
+
+          echo "run_id=$RUN_ID" >> $GITHUB_OUTPUT
+          echo "Using latest successful CI run: $RUN_ID"
+
+  # ── PyTorch interop tests ──
+
+  test-pytorch-linux:
+    name: "Nightly PyTorch (linux-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-linux.yml
+    with:
+      build-type: nightly
+      host-platform: linux-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      test-mode: nightly-pytorch
+      matrix_filter: 'map(select(.MODE == "nightly-pytorch"))'
+
+  test-pytorch-windows:
+    name: "Nightly PyTorch (win-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-windows.yml
+    with:
+      build-type: nightly
+      host-platform: win-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      test-mode: nightly-pytorch
+      matrix_filter: 'map(select(.MODE == "nightly-pytorch"))'
+
+  # ── numba-cuda tests ──
+
+  test-numba-cuda-linux-64:
+    name: "Nightly numba-cuda (linux-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-linux.yml
+    with:
+      build-type: nightly
+      host-platform: linux-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      test-mode: nightly-numba-cuda
+      matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
+
+  test-numba-cuda-linux-aarch64:
+    name: "Nightly numba-cuda (linux-aarch64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-linux.yml
+    with:
+      build-type: nightly
+      host-platform: linux-aarch64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      test-mode: nightly-numba-cuda
+      matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
+
+  test-numba-cuda-windows:
+    name: "Nightly numba-cuda (win-64)"
+    if: ${{ github.repository_owner == 'nvidia' }}
+    needs: find-wheels
+    permissions:
+      contents: read
+      actions: read
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-windows.yml
+    with:
+      build-type: nightly
+      host-platform: win-64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      test-mode: nightly-numba-cuda
+      matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
+
+  # ── Status check ──
+
+  checks:
+    name: Nightly check status
+    if: always()
+    runs-on: ubuntu-latest
+    needs:
+      - test-pytorch-linux
+      - test-pytorch-windows
+      - test-numba-cuda-linux-64
+      - test-numba-cuda-linux-aarch64
+      - test-numba-cuda-windows
+    steps:
+      - name: Exit
+        run: |
+          # if any dependencies were cancelled or failed, that's a failure
+          jq -e 'all(.[]; . == "success" or . == "skipped")' <<< '${{ toJSON(needs.*.result) }}'
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 35c5e6c3734..3e7b5ee97a3 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -29,6 +29,18 @@ on:
       skip-bindings-test:
         type: boolean
         default: false
+      run-id:
+        description: >
+          Workflow run ID to download artifacts from.
+          Defaults to the current run when empty.
+        type: string
+        default: ''
+      test-mode:
+        description: >
+          Test mode: 'standard' (default), 'nightly-pytorch', or
+          'nightly-numba-cuda'.
+        type: string
+        default: 'standard'
 
 defaults:
   run:
@@ -128,6 +140,8 @@ jobs:
         with:
           name: cuda-pathfinder-wheel
           path: ./cuda_pathfinder
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Download cuda-python build artifacts
         if: ${{ env.USE_BACKPORT_BINDINGS == '0' }}
@@ -135,6 +149,8 @@ jobs:
         with:
           name: cuda-python-wheel
           path: .
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Download cuda.bindings build artifacts
         if: ${{ env.USE_BACKPORT_BINDINGS == '0' }}
@@ -142,6 +158,8 @@ jobs:
         with:
           name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
           path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Download cuda-python & cuda.bindings build artifacts from the prior branch
         if: ${{ env.USE_BACKPORT_BINDINGS == '1' }}
@@ -194,6 +212,8 @@ jobs:
         with:
           name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}-tests
           path: ${{ env.CUDA_BINDINGS_CYTHON_TESTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Display structure of downloaded cuda.bindings Cython tests
         if: ${{ env.SKIP_CYTHON_TEST == '0' }}
@@ -206,6 +226,8 @@ jobs:
         with:
           name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
           path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Display structure of downloaded cuda.core build artifacts
         run: |
@@ -218,6 +240,8 @@ jobs:
         with:
           name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-tests
           path: ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Display structure of downloaded cuda.core Cython tests
         if: ${{ env.SKIP_CYTHON_TEST == '0' }}
@@ -256,7 +280,9 @@ jobs:
       - name: Set up test repetition on nightly runs
         run: echo "PYTEST_ADDOPTS=\"--count=${{ inputs.nruns }}\"" >> "$GITHUB_ENV"
 
+      # ── Standard test steps (skipped for nightly modes) ──
       - name: Run cuda.pathfinder tests with see_what_works
+        if: ${{ inputs.test-mode == 'standard' }}
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: see_what_works
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: see_what_works
@@ -264,14 +290,14 @@ jobs:
         run: run-tests pathfinder
 
       - name: Run cuda.bindings tests
-        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        if: ${{ inputs.test-mode == 'standard' && env.SKIP_CUDA_BINDINGS_TEST == '0' }}
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         run: run-tests bindings
 
       - name: Run cuda.bindings benchmarks (smoke test)
-        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        if: ${{ inputs.test-mode == 'standard' && env.SKIP_CUDA_BINDINGS_TEST == '0' }}
         run: |
           pip install pyperf
           pushd benchmarks/cuda_bindings
@@ -279,12 +305,14 @@ jobs:
           popd
 
       - name: Run cuda.core tests
+        if: ${{ inputs.test-mode == 'standard' }}
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         run: run-tests core
 
       - name: Ensure cuda-python installable
+        if: ${{ inputs.test-mode == 'standard' }}
         run: |
           if [[ "${{ matrix.LOCAL_CTK }}" == 1 ]]; then
             pip install --only-binary=:all: cuda_python*.whl
@@ -293,6 +321,7 @@ jobs:
           fi
 
       - name: Install cuda.pathfinder extra wheels for testing
+        if: ${{ inputs.test-mode == 'standard' }}
         run: |
           set -euo pipefail
           pushd cuda_pathfinder
@@ -301,8 +330,48 @@ jobs:
           popd
 
       - name: Run cuda.pathfinder tests with all_must_work
+        if: ${{ inputs.test-mode == 'standard' }}
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: all_must_work
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: all_must_work
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: all_must_work
         run: run-tests pathfinder
+
+      # ── Nightly: install all cuda-python wheels ──
+      - name: Install cuda-python wheels for nightly testing
+        if: ${{ inputs.test-mode != 'standard' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        run: run-tests nightly-install
+
+      # ── Nightly: PyTorch interop tests ──
+      - name: Install PyTorch
+        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
+        run: |
+          TORCH_VER="${{ matrix.TORCH_VER }}"
+          TORCH_CUDA="${{ matrix.TORCH_CUDA }}"
+          if [[ "$TORCH_VER" == "latest" ]]; then
+            pip install torch --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
+          else
+            pip install "torch==${TORCH_VER}" --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
+          fi
+          python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"
+
+      - name: Run PyTorch interop tests
+        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
+        run: |
+          pushd cuda_core
+          pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
+          popd
+
+      # ── Nightly: numba-cuda tests ──
+      - name: Install numba-cuda
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        run: |
+          pip install numba-cuda
+          python -c "import numba_cuda; print(f'numba-cuda installed')"
+
+      - name: Run numba-cuda tests
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        run: python -m numba_cuda.numba.cuda.tests
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 765823c6bfc..ad219a2f30d 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -29,6 +29,18 @@ on:
       skip-bindings-test:
         type: boolean
         default: false
+      run-id:
+        description: >
+          Workflow run ID to download artifacts from.
+          Defaults to the current run when empty.
+        type: string
+        default: ''
+      test-mode:
+        description: >
+          Test mode: 'standard' (default), 'nightly-pytorch', or
+          'nightly-numba-cuda'.
+        type: string
+        default: 'standard'
 
 jobs:
   compute-matrix:
@@ -123,6 +135,8 @@ jobs:
         with:
           name: cuda-pathfinder-wheel
           path: ./cuda_pathfinder
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Download cuda-python build artifacts
         if: ${{ env.USE_BACKPORT_BINDINGS == '0' }}
@@ -130,6 +144,8 @@ jobs:
         with:
           name: cuda-python-wheel
           path: .
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Download cuda.bindings build artifacts
         if: ${{ env.USE_BACKPORT_BINDINGS == '0' }}
@@ -137,6 +153,8 @@ jobs:
         with:
           name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
           path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Download cuda-python & cuda.bindings build artifacts from the prior branch
         if: ${{ env.USE_BACKPORT_BINDINGS == '1' }}
@@ -180,6 +198,8 @@ jobs:
         with:
           name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}-tests
           path: ${{ env.CUDA_BINDINGS_CYTHON_TESTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Display structure of downloaded cuda.bindings Cython tests
         if: ${{ env.SKIP_CYTHON_TEST == '0' }}
@@ -192,6 +212,8 @@ jobs:
         with:
           name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
           path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Display structure of downloaded cuda.core build artifacts
         run: |
@@ -204,6 +226,8 @@ jobs:
         with:
           name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-tests
           path: ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}
+          run-id: ${{ inputs.run-id || github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Display structure of downloaded cuda.core Cython tests
         if: ${{ env.SKIP_CYTHON_TEST == '0' }}
@@ -237,7 +261,9 @@ jobs:
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: echo "PYTEST_ADDOPTS=\"--count=${{ inputs.nruns }}\"" >> "$GITHUB_ENV"
 
+      # ── Standard test steps (skipped for nightly modes) ──
       - name: Run cuda.pathfinder tests with see_what_works
+        if: ${{ inputs.test-mode == 'standard' }}
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: see_what_works
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: see_what_works
@@ -246,7 +272,7 @@ jobs:
         run: run-tests pathfinder
 
       - name: Run cuda.bindings tests
-        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        if: ${{ inputs.test-mode == 'standard' && env.SKIP_CUDA_BINDINGS_TEST == '0' }}
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
@@ -254,6 +280,7 @@ jobs:
         run: run-tests bindings
 
       - name: Run cuda.core tests
+        if: ${{ inputs.test-mode == 'standard' }}
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
@@ -261,6 +288,7 @@ jobs:
         run: run-tests core
 
       - name: Ensure cuda-python installable
+        if: ${{ inputs.test-mode == 'standard' }}
         run: |
           if ('${{ matrix.LOCAL_CTK }}' -eq '1') {
             pip install --only-binary=:all: (Get-ChildItem -Filter cuda_python*.whl).FullName
@@ -269,6 +297,7 @@ jobs:
           }
 
       - name: Install cuda.pathfinder extra wheels for testing
+        if: ${{ inputs.test-mode == 'standard' }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           pushd cuda_pathfinder
@@ -277,9 +306,54 @@ jobs:
           popd
 
       - name: Run cuda.pathfinder tests with all_must_work
+        if: ${{ inputs.test-mode == 'standard' }}
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: all_must_work
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: all_must_work
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: all_must_work
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests pathfinder
+
+      # ── Nightly: install all cuda-python wheels ──
+      - name: Install cuda-python wheels for nightly testing
+        if: ${{ inputs.test-mode != 'standard' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: run-tests nightly-install
+
+      # ── Nightly: PyTorch interop tests ──
+      - name: Install PyTorch
+        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          TORCH_VER="${{ matrix.TORCH_VER }}"
+          TORCH_CUDA="${{ matrix.TORCH_CUDA }}"
+          if [[ "$TORCH_VER" == "latest" ]]; then
+            pip install torch --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
+          else
+            pip install "torch==${TORCH_VER}" --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
+          fi
+          python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"
+
+      - name: Run PyTorch interop tests
+        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          pushd cuda_core
+          pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
+          popd
+
+      # ── Nightly: numba-cuda tests ──
+      - name: Install numba-cuda
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          pip install numba-cuda
+          python -c "import numba_cuda; print(f'numba-cuda installed')"
+
+      - name: Run numba-cuda tests
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: python -m numba_cuda.numba.cuda.tests
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index a402e3e4cf7..d8be9b350a5 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -62,7 +62,17 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'h100',       GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.11',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4',         GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' }
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtx4090',    GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' }
-  nightly: []
+  nightly:
+    # nightly-pytorch (amd64 only — PyTorch does not ship arm64 GPU wheels)
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
+    # nightly-numba-cuda
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
 
 windows:
   pull-request:
@@ -85,4 +95,12 @@ windows:
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
-  nightly: []
+  nightly:
+    # nightly-pytorch
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
+    # nightly-numba-cuda
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index d42634a7073..3500bfa0d28 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -13,8 +13,8 @@ if [[ ${#} -ne 1 ]]; then
   echo "Error: This script requires exactly 1 argument. You provided ${#}"
   exit 1
 fi
-if [[ "${1}" != "bindings" && "${1}" != "core" && "${1}" != "pathfinder" ]]; then
-  echo "Error: Invalid test module '${1}'. Must be 'bindings', 'core', or 'pathfinder'"
+if [[ "${1}" != "bindings" && "${1}" != "core" && "${1}" != "pathfinder" && "${1}" != "nightly-install" ]]; then
+  echo "Error: Invalid test module '${1}'. Must be 'bindings', 'core', 'pathfinder', or 'nightly-install'"
   exit 1
 fi
 
@@ -91,4 +91,35 @@ elif [[ "${test_module}" == "core" ]]; then
     ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython
   fi
   popd
+elif [[ "${test_module}" == "nightly-install" ]]; then
+  # Install all wheels (pathfinder already installed above) without running tests.
+  # Used by nightly optional-dependency pipelines (pytorch, numba-cuda).
+  echo "Installing bindings wheel"
+  pushd ./cuda_bindings
+  if [[ "${LOCAL_CTK}" == 1 ]]; then
+    pip install "${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl
+  else
+    pip install $(ls "${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl)[all]
+  fi
+  popd
+
+  TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})"
+  echo "Installing core wheel"
+
+  FREE_THREADING=""
+  if python -c 'import sys; assert not sys._is_gil_enabled()' 2> /dev/null; then
+    FREE_THREADING+="-ft"
+  fi
+
+  pushd ./cuda_core
+  CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
+  WHL_EXTRA=("${CUDA_CORE_ARTIFACTS_DIR}"/*.whl)
+  if [[ "${LOCAL_CTK}" != 1 ]]; then
+    WHL_EXTRA=("${WHL_EXTRA[0]}[cu${TEST_CUDA_MAJOR}]")
+  fi
+  pip install "${WHL_EXTRA[@]}" --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" "cuda-toolkit==${CUDA_VER_MINOR}.*"
+  popd
+
+  echo "All cuda-python wheels installed for nightly testing"
+  pip list | grep -i "cuda\|pathfinder"
 fi

From 6dea4c3d5acb0542ad8222699ad80dcd3cfe08ce Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 02:33:37 +0000
Subject: [PATCH 2/9] Add concurrency group and fix checks job in
 ci-nightly.yml

- Add concurrency group matching ci.yml's pattern
- Replace jq one-liner with explicit cancelled/failure checks per
  ci.yml's battle-tested pattern (see long comment there for rationale)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci-nightly.yml | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
index dda13abb5e2..f0e76c8d2d8 100644
--- a/.github/workflows/ci-nightly.yml
+++ b/.github/workflows/ci-nightly.yml
@@ -10,6 +10,10 @@
 
 name: "CI: Nightly optional-deps"
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: true
+
 on:
   schedule:
     # 2 AM UTC daily, after the midnight main CI build finishes
@@ -173,5 +177,21 @@ jobs:
     steps:
       - name: Exit
         run: |
-          # if any dependencies were cancelled or failed, that's a failure
-          jq -e 'all(.[]; . == "success" or . == "skipped")' <<< '${{ toJSON(needs.*.result) }}'
+          # If any dependency was cancelled or failed, that's a failure.
+          #
+          # See ci.yml for the full rationale on why we must use always()
+          # and explicitly check each result rather than relying on the
+          # default behaviour.
+          if ${{ needs.test-pytorch-linux.result == 'cancelled' ||
+                 needs.test-pytorch-linux.result == 'failure' ||
+                 needs.test-pytorch-windows.result == 'cancelled' ||
+                 needs.test-pytorch-windows.result == 'failure' ||
+                 needs.test-numba-cuda-linux-64.result == 'cancelled' ||
+                 needs.test-numba-cuda-linux-64.result == 'failure' ||
+                 needs.test-numba-cuda-linux-aarch64.result == 'cancelled' ||
+                 needs.test-numba-cuda-linux-aarch64.result == 'failure' ||
+                 needs.test-numba-cuda-windows.result == 'cancelled' ||
+                 needs.test-numba-cuda-windows.result == 'failure' }}; then
+            exit 1
+          fi
+          exit 0

From ac5238ca67d353c1df06d012432e8eb095ee9334 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 02:35:11 +0000
Subject: [PATCH 3/9] Temporarily add push trigger to ci-nightly.yml for
 testing

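Add a push trigger (main and pull-request/[0-9]+ branches) so the nightly
workflow can be exercised while this change is under review. This trigger
is for testing only; remove it before merging.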

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci-nightly.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
index f0e76c8d2d8..eb40c7979a6 100644
--- a/.github/workflows/ci-nightly.yml
+++ b/.github/workflows/ci-nightly.yml
@@ -15,6 +15,10 @@ concurrency:
   cancel-in-progress: true
 
 on:
+  push:
+    branches:
+      - "main"
+      - "pull-request/[0-9]+"
   schedule:
     # 2 AM UTC daily, after the midnight main CI build finishes
     - cron: "0 2 * * *"

From cb5aefa876b9f1ca94d06345cfd599dd010363bc Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 02:39:29 +0000
Subject: [PATCH 4/9] Use shallow clone (fetch-depth: 1) in ci-nightly.yml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Full history is not needed — we only read ci/versions.yml.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci-nightly.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
index eb40c7979a6..d218ace0531 100644
--- a/.github/workflows/ci-nightly.yml
+++ b/.github/workflows/ci-nightly.yml
@@ -41,7 +41,7 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
-          fetch-depth: 0
+          fetch-depth: 1
 
       - name: Get CUDA build versions
         id: get-vars

From 92865983b14b26c2f8f793f067b2521ac95bd5b7 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 02:42:26 +0000
Subject: [PATCH 5/9] Fix artifact name mismatch in nightly CI by passing
 source SHA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Artifact names embed the commit SHA from the build that created them.
When the nightly workflow downloads artifacts from a different CI run,
it must use that run's SHA — not github.sha (the nightly run's own
SHA) — to construct the correct artifact names.

- ci-nightly.yml: resolve head_sha from the source CI run via
  `gh run view --json headSha`, pass it to test workflows
- test-wheel-linux/windows.yml: add `sha` input (defaults to
  github.sha for backward compatibility), use it in env-vars

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci-nightly.yml         | 45 +++++++++++++++---------
 .github/workflows/test-wheel-linux.yml   |  8 ++++-
 .github/workflows/test-wheel-windows.yml |  8 ++++-
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
index d218ace0531..72778445a91 100644
--- a/.github/workflows/ci-nightly.yml
+++ b/.github/workflows/ci-nightly.yml
@@ -36,6 +36,7 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       RUN_ID: ${{ steps.find.outputs.run_id }}
+      HEAD_SHA: ${{ steps.find.outputs.head_sha }}
       CUDA_BUILD_VER: ${{ steps.get-vars.outputs.cuda_build_ver }}
     steps:
       - name: Checkout repository
@@ -55,27 +56,34 @@ jobs:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           if [[ -n "${{ inputs.run-id }}" ]]; then
-            echo "run_id=${{ inputs.run-id }}" >> $GITHUB_OUTPUT
-            echo "Using manually specified run ID: ${{ inputs.run-id }}"
-            exit 0
+            RUN_ID="${{ inputs.run-id }}"
+            echo "Using manually specified run ID: $RUN_ID"
+          else
+            RUN_ID=$(gh run list \
+              -b main \
+              -L 1 \
+              -w "CI" \
+              -s success \
+              -R "${{ github.repository }}" \
+              --json databaseId \
+              | jq -r '.[0].databaseId')
+
+            if [[ -z "$RUN_ID" || "$RUN_ID" == "null" ]]; then
+              echo "::error::No successful CI run found on main"
+              exit 1
+            fi
+            echo "Using latest successful CI run: $RUN_ID"
           fi
 
-          RUN_ID=$(gh run list \
-            -b main \
-            -L 1 \
-            -w "CI" \
-            -s success \
+          # Resolve the head SHA from the CI run — artifact names embed this.
+          HEAD_SHA=$(gh run view "$RUN_ID" \
             -R "${{ github.repository }}" \
-            --json databaseId \
-            | jq -r '.[0].databaseId')
-
-          if [[ -z "$RUN_ID" || "$RUN_ID" == "null" ]]; then
-            echo "::error::No successful CI run found on main"
-            exit 1
-          fi
+            --json headSha \
+            | jq -r '.headSha')
 
           echo "run_id=$RUN_ID" >> $GITHUB_OUTPUT
-          echo "Using latest successful CI run: $RUN_ID"
+          echo "head_sha=$HEAD_SHA" >> $GITHUB_OUTPUT
+          echo "Source commit: $HEAD_SHA"
 
   # ── PyTorch interop tests ──
 
@@ -93,6 +101,7 @@ jobs:
       host-platform: linux-64
       build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
       run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
       test-mode: nightly-pytorch
       matrix_filter: 'map(select(.MODE == "nightly-pytorch"))'
 
@@ -110,6 +119,7 @@ jobs:
       host-platform: win-64
       build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
       run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
       test-mode: nightly-pytorch
       matrix_filter: 'map(select(.MODE == "nightly-pytorch"))'
 
@@ -129,6 +139,7 @@ jobs:
       host-platform: linux-64
       build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
       run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
       test-mode: nightly-numba-cuda
       matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
 
@@ -146,6 +157,7 @@ jobs:
       host-platform: linux-aarch64
       build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
       run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
       test-mode: nightly-numba-cuda
       matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
 
@@ -163,6 +175,7 @@ jobs:
       host-platform: win-64
       build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
       run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
       test-mode: nightly-numba-cuda
       matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
 
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 3e7b5ee97a3..892bcb19056 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -41,6 +41,12 @@ on:
           'nightly-numba-cuda'.
         type: string
         default: 'standard'
+      sha:
+        description: >
+          Commit SHA used to construct artifact names.
+          Defaults to github.sha (current run) when empty.
+        type: string
+        default: ''
 
 defaults:
   run:
@@ -131,7 +137,7 @@ jobs:
           HOST_PLATFORM: ${{ inputs.host-platform }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
           PY_VER: ${{ matrix.PY_VER }}
-          SHA: ${{ github.sha }}
+          SHA: ${{ inputs.sha || github.sha }}
           SKIP_BINDINGS_TEST_OVERRIDE: ${{ inputs.skip-bindings-test && '1' || '0' }}
         run: ./ci/tools/env-vars test
 
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index ad219a2f30d..300ca157bf8 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -41,6 +41,12 @@ on:
           'nightly-numba-cuda'.
         type: string
         default: 'standard'
+      sha:
+        description: >
+          Commit SHA used to construct artifact names.
+          Defaults to github.sha (current run) when empty.
+        type: string
+        default: ''
 
 jobs:
   compute-matrix:
@@ -125,7 +131,7 @@ jobs:
           HOST_PLATFORM: ${{ inputs.host-platform }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
           PY_VER: ${{ matrix.PY_VER }}
-          SHA: ${{ github.sha }}
+          SHA: ${{ inputs.sha || github.sha }}
           SKIP_BINDINGS_TEST_OVERRIDE: ${{ inputs.skip-bindings-test && '1' || '0' }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: ./ci/tools/env-vars test

From 0b7cc50c5cea3cbac79059365f31578929f45087 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 03:14:36 +0000
Subject: [PATCH 6/9] Fix nightly CI: single pip call, display step, numba cmd,
 Windows VC++

- Install ALL wheels (pathfinder + bindings + core) and optional dep
  (torch/numba-cuda) in a single pip call so pip resolves everything
  together and avoids costly reinstall cycles from version conflicts
- Fix "Display structure" step: show only artifact files (cuda_python*.whl,
  cuda_pathfinder/) instead of ls -lahR . which lists the entire repo
- Fix numba-cuda test command: python -m numba.runtests numba.cuda.tests
- Install Visual C++ Redistributable on Windows before PyTorch
  (https://github.com/pytorch/pytorch/issues/166628)
- run-tests now does pip list at the end of nightly installs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test-wheel-linux.yml   | 40 ++++-------
 .github/workflows/test-wheel-windows.yml | 49 +++++++------
 ci/tools/run-tests                       | 88 +++++++++++++++++-------
 3 files changed, 100 insertions(+), 77 deletions(-)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 892bcb19056..e1a36bc086b 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -205,7 +205,7 @@ jobs:
       - name: Display structure of downloaded cuda-python artifacts
         run: |
           pwd
-          ls -lahR .
+          ls -lah cuda_python*.whl cuda_pathfinder/
 
       - name: Display structure of downloaded cuda.bindings artifacts
         run: |
@@ -343,27 +343,24 @@ jobs:
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: all_must_work
         run: run-tests pathfinder
 
-      # ── Nightly: install all cuda-python wheels ──
-      - name: Install cuda-python wheels for nightly testing
-        if: ${{ inputs.test-mode != 'standard' }}
+      # ── Nightly: install wheels + optional dep together ──
+      - name: Install cuda-python wheels + PyTorch
+        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
-        run: run-tests nightly-install
+          TORCH_VER: ${{ matrix.TORCH_VER }}
+          TORCH_CUDA: ${{ matrix.TORCH_CUDA }}
+        run: run-tests nightly-pytorch
 
-      # ── Nightly: PyTorch interop tests ──
-      - name: Install PyTorch
-        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
-        run: |
-          TORCH_VER="${{ matrix.TORCH_VER }}"
-          TORCH_CUDA="${{ matrix.TORCH_CUDA }}"
-          if [[ "$TORCH_VER" == "latest" ]]; then
-            pip install torch --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
-          else
-            pip install "torch==${TORCH_VER}" --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
-          fi
-          python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"
+      - name: Install cuda-python wheels + numba-cuda
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        run: run-tests nightly-numba-cuda
 
+      # ── Nightly: run tests ──
       - name: Run PyTorch interop tests
         if: ${{ inputs.test-mode == 'nightly-pytorch' }}
         run: |
@@ -371,13 +368,6 @@ jobs:
           pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
           popd
 
-      # ── Nightly: numba-cuda tests ──
-      - name: Install numba-cuda
-        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
-        run: |
-          pip install numba-cuda
-          python -c "import numba_cuda; print(f'numba-cuda installed')"
-
       - name: Run numba-cuda tests
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
-        run: python -m numba_cuda.numba.cuda.tests
+        run: python -m numba.runtests numba.cuda.tests
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 300ca157bf8..eefc9273594 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -191,7 +191,8 @@ jobs:
       - name: Display structure of downloaded cuda-python artifacts
         run: |
           Get-Location
-          Get-ChildItem -Recurse -Force | Select-Object Mode, LastWriteTime, Length, FullName
+          Get-ChildItem cuda_python*.whl | Select-Object Mode, LastWriteTime, Length, FullName
+          Get-ChildItem cuda_pathfinder/ | Select-Object Mode, LastWriteTime, Length, FullName
 
       - name: Display structure of downloaded cuda.bindings artifacts
         run: |
@@ -320,29 +321,33 @@ jobs:
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests pathfinder
 
-      # ── Nightly: install all cuda-python wheels ──
-      - name: Install cuda-python wheels for nightly testing
-        if: ${{ inputs.test-mode != 'standard' }}
+      # ── Nightly: install wheels + optional dep together ──
+      - name: Install Visual C++ Redistributable (required by PyTorch on Windows)
+        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
+        run: |
+          Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vc_redist.x64.exe" -OutFile "vc_redist.x64.exe"
+          Start-Process -FilePath ".\vc_redist.x64.exe" -ArgumentList "/install", "/quiet", "/norestart" -Wait
+          Remove-Item "vc_redist.x64.exe"
+
+      - name: Install cuda-python wheels + PyTorch
+        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+          TORCH_VER: ${{ matrix.TORCH_VER }}
+          TORCH_CUDA: ${{ matrix.TORCH_CUDA }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: run-tests nightly-install
+        run: run-tests nightly-pytorch
 
-      # ── Nightly: PyTorch interop tests ──
-      - name: Install PyTorch
-        if: ${{ inputs.test-mode == 'nightly-pytorch' }}
+      - name: Install cuda-python wheels + numba-cuda
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: |
-          TORCH_VER="${{ matrix.TORCH_VER }}"
-          TORCH_CUDA="${{ matrix.TORCH_CUDA }}"
-          if [[ "$TORCH_VER" == "latest" ]]; then
-            pip install torch --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
-          else
-            pip install "torch==${TORCH_VER}" --index-url "https://download.pytorch.org/whl/${TORCH_CUDA}"
-          fi
-          python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"
+        run: run-tests nightly-numba-cuda
 
+      # ── Nightly: run tests ──
       - name: Run PyTorch interop tests
         if: ${{ inputs.test-mode == 'nightly-pytorch' }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
@@ -351,15 +356,7 @@ jobs:
           pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
           popd
 
-      # ── Nightly: numba-cuda tests ──
-      - name: Install numba-cuda
-        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: |
-          pip install numba-cuda
-          python -c "import numba_cuda; print(f'numba-cuda installed')"
-
       - name: Run numba-cuda tests
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: python -m numba_cuda.numba.cuda.tests
+        run: python -m numba.runtests numba.cuda.tests
diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index 3500bfa0d28..f76c45e325c 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -13,19 +13,22 @@ if [[ ${#} -ne 1 ]]; then
   echo "Error: This script requires exactly 1 argument. You provided ${#}"
   exit 1
 fi
-if [[ "${1}" != "bindings" && "${1}" != "core" && "${1}" != "pathfinder" && "${1}" != "nightly-install" ]]; then
-  echo "Error: Invalid test module '${1}'. Must be 'bindings', 'core', 'pathfinder', or 'nightly-install'"
+if [[ "${1}" != "bindings" && "${1}" != "core" && "${1}" != "pathfinder" && "${1}" != "nightly-pytorch" && "${1}" != "nightly-numba-cuda" ]]; then
+  echo "Error: Invalid test module '${1}'. Must be 'bindings', 'core', 'pathfinder', 'nightly-pytorch', or 'nightly-numba-cuda'"
   exit 1
 fi
 
 test_module=${1}
 
-# Unconditionally install pathfinder wheel
-# (it is a direct dependency of bindings, and a transitive dependency of core)
-pushd ./cuda_pathfinder
-echo "Installing pathfinder wheel"
-pip install ./*.whl --group test
-popd
+# For standard modes, install pathfinder up front (it is a direct dependency
+# of bindings, and a transitive dependency of core).  Nightly modes install
+# all wheels together in a single pip call further below.
+if [[ "${test_module}" != "nightly-pytorch" && "${test_module}" != "nightly-numba-cuda" ]]; then
+  pushd ./cuda_pathfinder
+  echo "Installing pathfinder wheel"
+  pip install ./*.whl --group test
+  popd
+fi
 
 if [[ "${test_module}" == "pathfinder" ]]; then
   pushd ./cuda_pathfinder
@@ -91,35 +94,68 @@ elif [[ "${test_module}" == "core" ]]; then
     ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython
   fi
   popd
-elif [[ "${test_module}" == "nightly-install" ]]; then
-  # Install all wheels (pathfinder already installed above) without running tests.
-  # Used by nightly optional-dependency pipelines (pytorch, numba-cuda).
-  echo "Installing bindings wheel"
-  pushd ./cuda_bindings
-  if [[ "${LOCAL_CTK}" == 1 ]]; then
-    pip install "${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl
-  else
-    pip install $(ls "${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl)[all]
-  fi
-  popd
+elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-numba-cuda" ]]; then
+  # Nightly optional-dependency testing.
+  # Install ALL wheels (pathfinder + bindings + core) and the optional dep
+  # in a single pip call so pip resolves version constraints in one shot
+  # and avoids costly uninstall/reinstall cycles.
+  #
+  # We pushd into cuda_core/ so that --group reads test dependency groups
+  # from cuda_core/pyproject.toml (needed for numpy, cupy, ml_dtypes, etc.).
+  # All other wheel paths use ../ to reach the repo root.
 
   TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})"
-  echo "Installing core wheel"
+  CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
 
   FREE_THREADING=""
   if python -c 'import sys; assert not sys._is_gil_enabled()' 2> /dev/null; then
     FREE_THREADING+="-ft"
   fi
 
+  # Resolve the pathfinder wheel path before pushd (it's relative to repo root).
+  # CUDA_BINDINGS_ARTIFACTS_DIR and CUDA_CORE_ARTIFACTS_DIR are already absolute
+  # (set via realpath in env-vars).
+  PATHFINDER_WHL=($(realpath ./cuda_pathfinder/*.whl))
+
   pushd ./cuda_core
-  CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
-  WHL_EXTRA=("${CUDA_CORE_ARTIFACTS_DIR}"/*.whl)
+
+  # Build wheel specs (paths are absolute, so pushd doesn't affect them)
+  BINDINGS_WHL=("${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl)
   if [[ "${LOCAL_CTK}" != 1 ]]; then
-    WHL_EXTRA=("${WHL_EXTRA[0]}[cu${TEST_CUDA_MAJOR}]")
+    BINDINGS_WHL=("${BINDINGS_WHL[0]}[all]")
   fi
-  pip install "${WHL_EXTRA[@]}" --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" "cuda-toolkit==${CUDA_VER_MINOR}.*"
+
+  CORE_WHL=("${CUDA_CORE_ARTIFACTS_DIR}"/*.whl)
+  if [[ "${LOCAL_CTK}" != 1 ]]; then
+    CORE_WHL=("${CORE_WHL[0]}[cu${TEST_CUDA_MAJOR}]")
+  fi
+
+  # All packages in one pip call: pathfinder + bindings + core + test deps + optional dep
+  PIP_ARGS=(
+    "${PATHFINDER_WHL[@]}"
+    "${BINDINGS_WHL[@]}"
+    "${CORE_WHL[@]}"
+    --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}"
+    "cuda-toolkit==${CUDA_VER_MINOR}.*"
+  )
+
+  if [[ "${test_module}" == "nightly-pytorch" ]]; then
+    # TORCH_VER and TORCH_CUDA must be set by the caller.
+    echo "Installing pathfinder + bindings + core + test deps + PyTorch ${TORCH_VER} (${TORCH_CUDA})"
+    if [[ "${TORCH_VER}" == "latest" ]]; then
+      PIP_ARGS+=(torch)
+    else
+      PIP_ARGS+=("torch==${TORCH_VER}")
+    fi
+    PIP_ARGS+=(--extra-index-url "https://download.pytorch.org/whl/${TORCH_CUDA}")
+  elif [[ "${test_module}" == "nightly-numba-cuda" ]]; then
+    echo "Installing pathfinder + bindings + core + test deps + numba-cuda"
+    PIP_ARGS+=("numba-cuda[cu${TEST_CUDA_MAJOR}]")
+  fi
+
+  pip install "${PIP_ARGS[@]}"
   popd
 
-  echo "All cuda-python wheels installed for nightly testing"
-  pip list | grep -i "cuda\|pathfinder"
+  echo "Nightly install complete — installed packages:"
+  pip list
 fi

From edeaa76015bfa221636feefad7c3b6a7c706f994 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 04:04:16 +0000
Subject: [PATCH 7/9] Match CUDA_VER to TORCH_CUDA in nightly pytorch matrix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CUDA_VER in the test environment should match TORCH_CUDA in
major.minor. BUILD_CUDA_VER (from build-ctk-ver input) is used
for artifact names, so CUDA_VER can differ.

- cu126 → CUDA_VER: 12.6.3 (was 12.9.1)
- cu130 → CUDA_VER: 13.0.2 (was 13.2.1)

For CUDA 12 entries, USE_BACKPORT_BINDINGS kicks in automatically
since BUILD_CUDA_MAJOR (13) != TEST_CUDA_MAJOR (12), pulling
bindings from the backport branch.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test-wheel-linux.yml   |   4 +
 .github/workflows/test-wheel-windows.yml |   5 ++
 ci/test-matrix.yml                       |  16 ++--
 ci/tools/patch-numba-cuda                | 105 +++++++++++++++++++++++
 ci/tools/run-tests                       |  30 ++++---
 5 files changed, 139 insertions(+), 21 deletions(-)
 create mode 100755 ci/tools/patch-numba-cuda

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index e1a36bc086b..a546c8cf47b 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -368,6 +368,10 @@ jobs:
           pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
           popd
 
+      - name: Patch numba-cuda (upstream bug workarounds)
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        run: python ci/tools/patch-numba-cuda
+
       - name: Run numba-cuda tests
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
         run: python -m numba.runtests numba.cuda.tests
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index eefc9273594..0fef1e5b8af 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -356,6 +356,11 @@ jobs:
           pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
           popd
 
+      - name: Patch numba-cuda (upstream bug workarounds)
+        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: python ci/tools/patch-numba-cuda
+
       - name: Run numba-cuda tests
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index d8be9b350a5..19931c3943a 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -64,10 +64,10 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtx4090',    GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' }
   nightly:
     # nightly-pytorch (amd64 only — PyTorch does not ship arm64 GPU wheels)
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
@@ -97,10 +97,10 @@ windows:
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
   nightly:
     # nightly-pytorch
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/patch-numba-cuda b/ci/tools/patch-numba-cuda
new file mode 100755
index 00000000000..4e0b94c2688
--- /dev/null
+++ b/ci/tools/patch-numba-cuda
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Patch known upstream bugs in installed numba-cuda before running tests.
+
+These patches are temporary workarounds; each should be removed once the
+corresponding upstream fix is released.
+"""
+
+import pathlib
+import site
+
+
+def patch_test_linker_indent():
+    """Fix indentation bug in the installed numba-cuda test_linker.py.
+
+    add_from_numba and debuggable_kernel reference test_device_functions_ltoir
+    which is only defined inside ``if TEST_BIN_DIR:``.  They must be indented
+    under that block; test_debug_kernel_with_lto then gets a skipUnless guard.
+
+    Upstream: https://github.com/NVIDIA/numba-cuda/blob/200c2b96/
+              numba_cuda/numba/cuda/tests/cudadrv/test_linker.py#L120
+    """
+    # Find the installed test_linker.py across all site-packages paths
+    rel_path = pathlib.Path("numba_cuda", "numba", "cuda", "tests", "cudadrv", "test_linker.py")
+    target = None
+    for sp in site.getsitepackages():
+        candidate = pathlib.Path(sp) / rel_path
+        if candidate.exists():
+            target = candidate
+            break
+    if target is None:
+        # Fallback: import the package and search relative to its location
+        try:
+            import numba_cuda
+
+            pkg_dir = pathlib.Path(numba_cuda.__file__).parent
+            candidate = pkg_dir / "numba" / "cuda" / "tests" / "cudadrv" / "test_linker.py"
+            if candidate.exists():
+                target = candidate
+        except ImportError:
+            pass
+    if target is None:
+        print("SKIP: test_linker.py not found in any site-packages")
+        return
+    print(f"Found: {target}")
+    # Python source is UTF-8 by default; don't depend on the locale encoding.
+    src = target.read_text(encoding="utf-8")
+
+    old = (
+        "\nadd_from_numba = cuda.declare_device(\n"
+        '    "add_from_numba",\n'
+        '    "int32(int32, int32)",\n'
+        "    link=[test_device_functions_ltoir],\n"
+        ")\n"
+        "\n"
+        "\n"
+        "def debuggable_kernel(result):\n"
+        "    i = cuda.grid(1)\n"
+        "    result[i] = add_from_numba(i, i)"
+    )
+
+    new = (
+        "\n    add_from_numba = cuda.declare_device(\n"
+        '        "add_from_numba",\n'
+        '        "int32(int32, int32)",\n'
+        "        link=[test_device_functions_ltoir],\n"
+        "    )\n"
+        "\n"
+        "    def debuggable_kernel(result):\n"
+        "        i = cuda.grid(1)\n"
+        "        result[i] = add_from_numba(i, i)"
+    )
+
+    if old not in src:
+        print(f"SKIP: indent patch target not found in {target} (already patched?)")
+        return
+
+    src = src.replace(old, new)
+
+    # Part 2: add @unittest.skipUnless to test_debug_kernel_with_lto.
+    # After moving debuggable_kernel inside `if TEST_BIN_DIR:`, the symbol
+    # is undefined when the env var is not set, so the test must be skipped.
+    old_test = "    def test_debug_kernel_with_lto(self):\n"
+    new_test = (
+        '    @unittest.skipUnless(TEST_BIN_DIR, "NUMBA_CUDA_TEST_BIN_DIR not set")\n'
+        "    def test_debug_kernel_with_lto(self):\n"
+    )
+
+    if old_test not in src:
+        print(f"SKIP: skip-decorator patch target not found in {target}")
+    elif new_test in src:
+        print("SKIP: skip decorator already present")
+    else:
+        src = src.replace(old_test, new_test, 1)
+
+    target.write_text(src, encoding="utf-8")
+    print(f"PATCHED: {target}")
+
+
+if __name__ == "__main__":
+    patch_test_linker_indent()
diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index f76c45e325c..a4001b7a001 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -99,10 +99,6 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
   # Install ALL wheels (pathfinder + bindings + core) and the optional dep
   # in a single pip call so pip resolves version constraints in one shot
   # and avoids costly uninstall/reinstall cycles.
-  #
-  # We pushd into cuda_core/ so that --group reads test dependency groups
-  # from cuda_core/pyproject.toml (needed for numpy, cupy, ml_dtypes, etc.).
-  # All other wheel paths use ../ to reach the repo root.
 
   TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})"
   CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
@@ -112,14 +108,11 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
     FREE_THREADING+="-ft"
   fi
 
-  # Resolve the pathfinder wheel path before pushd (it's relative to repo root).
-  # CUDA_BINDINGS_ARTIFACTS_DIR and CUDA_CORE_ARTIFACTS_DIR are already absolute
-  # (set via realpath in env-vars).
+  # Resolve pathfinder wheel to absolute path before pushd.
+  # CUDA_BINDINGS_ARTIFACTS_DIR and CUDA_CORE_ARTIFACTS_DIR are already
+  # absolute (set via realpath in env-vars).
   PATHFINDER_WHL=($(realpath ./cuda_pathfinder/*.whl))
 
-  pushd ./cuda_core
-
-  # Build wheel specs (paths are absolute, so pushd doesn't affect them)
   BINDINGS_WHL=("${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl)
   if [[ "${LOCAL_CTK}" != 1 ]]; then
     BINDINGS_WHL=("${BINDINGS_WHL[0]}[all]")
@@ -130,18 +123,23 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
     CORE_WHL=("${CORE_WHL[0]}[cu${TEST_CUDA_MAJOR}]")
   fi
 
-  # All packages in one pip call: pathfinder + bindings + core + test deps + optional dep
+  # pushd so --group reads test dependency groups from cuda_core/pyproject.toml.
+  # The explicit cuda-toolkit[...]==X.Y.* pin overrides the group's looser ==X.*.
+  pushd ./cuda_core
+
   PIP_ARGS=(
     "${PATHFINDER_WHL[@]}"
     "${BINDINGS_WHL[@]}"
     "${CORE_WHL[@]}"
     --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}"
-    "cuda-toolkit==${CUDA_VER_MINOR}.*"
   )
 
   if [[ "${test_module}" == "nightly-pytorch" ]]; then
     # TORCH_VER and TORCH_CUDA must be set by the caller.
+    # Use cuda-toolkit[cudart] only — torch brings its own nvcc/nvrtc/etc.
+    # This avoids version conflicts between our nvidia-* pins and torch's.
     echo "Installing pathfinder + bindings + core + test deps + PyTorch ${TORCH_VER} (${TORCH_CUDA})"
+    PIP_ARGS+=("cuda-toolkit[cudart]==${CUDA_VER_MINOR}.*")
     if [[ "${TORCH_VER}" == "latest" ]]; then
       PIP_ARGS+=(torch)
     else
@@ -150,7 +148,13 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
     PIP_ARGS+=(--extra-index-url "https://download.pytorch.org/whl/${TORCH_CUDA}")
   elif [[ "${test_module}" == "nightly-numba-cuda" ]]; then
     echo "Installing pathfinder + bindings + core + test deps + numba-cuda"
-    PIP_ARGS+=("numba-cuda[cu${TEST_CUDA_MAJOR}]")
+    # numba-cuda's test-cuXX group deps (can't use --group for a wheel install):
+    PIP_ARGS+=(
+      "cuda-toolkit[curand,cublas]==${CUDA_VER_MINOR}.*"
+      "numba-cuda[cu${TEST_CUDA_MAJOR}]"
+      "cupy-cuda${TEST_CUDA_MAJOR}x"
+      psutil cffi pytest-xdist pytest-benchmark filecheck ml_dtypes statistics
+    )
   fi
 
   pip install "${PIP_ARGS[@]}"

From 6f04205a03f37a1053718c0ccdf93e3ecb1e01fc Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 04:04:16 +0000
Subject: [PATCH 8/9] Match CUDA_VER to TORCH_CUDA in nightly pytorch matrix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CUDA_VER in the test environment should match TORCH_CUDA at the
major.minor level. Artifact names are derived from BUILD_CUDA_VER (the
build-ctk-ver input), not CUDA_VER, so the two can differ.

- cu126 → CUDA_VER: 12.6.3 (was 12.9.1)
- cu130 → CUDA_VER: 13.0.2 (was 13.2.1)

For CUDA 12 entries, USE_BACKPORT_BINDINGS kicks in automatically
since BUILD_CUDA_MAJOR (13) != TEST_CUDA_MAJOR (12), pulling
bindings from the backport branch.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 ci/test-matrix.yml | 16 ++++++++--------
 ci/tools/run-tests | 30 +++++++++++++++++-------------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index d8be9b350a5..19931c3943a 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -64,10 +64,10 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtx4090',    GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' }
   nightly:
     # nightly-pytorch (amd64 only — PyTorch does not ship arm64 GPU wheels)
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
@@ -97,10 +97,10 @@ windows:
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
   nightly:
     # nightly-pytorch
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
-    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: 'latest', TORCH_CUDA: 'cu130' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu126' }
+    - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index f76c45e325c..a4001b7a001 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -99,10 +99,6 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
   # Install ALL wheels (pathfinder + bindings + core) and the optional dep
   # in a single pip call so pip resolves version constraints in one shot
   # and avoids costly uninstall/reinstall cycles.
-  #
-  # We pushd into cuda_core/ so that --group reads test dependency groups
-  # from cuda_core/pyproject.toml (needed for numpy, cupy, ml_dtypes, etc.).
-  # All other wheel paths use ../ to reach the repo root.
 
   TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})"
   CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
@@ -112,14 +108,11 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
     FREE_THREADING+="-ft"
   fi
 
-  # Resolve the pathfinder wheel path before pushd (it's relative to repo root).
-  # CUDA_BINDINGS_ARTIFACTS_DIR and CUDA_CORE_ARTIFACTS_DIR are already absolute
-  # (set via realpath in env-vars).
+  # Resolve pathfinder wheel to absolute path before pushd.
+  # CUDA_BINDINGS_ARTIFACTS_DIR and CUDA_CORE_ARTIFACTS_DIR are already
+  # absolute (set via realpath in env-vars).
   PATHFINDER_WHL=($(realpath ./cuda_pathfinder/*.whl))
 
-  pushd ./cuda_core
-
-  # Build wheel specs (paths are absolute, so pushd doesn't affect them)
   BINDINGS_WHL=("${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl)
   if [[ "${LOCAL_CTK}" != 1 ]]; then
     BINDINGS_WHL=("${BINDINGS_WHL[0]}[all]")
@@ -130,18 +123,23 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
     CORE_WHL=("${CORE_WHL[0]}[cu${TEST_CUDA_MAJOR}]")
   fi
 
-  # All packages in one pip call: pathfinder + bindings + core + test deps + optional dep
+  # pushd so --group reads test dependency groups from cuda_core/pyproject.toml.
+  # The explicit cuda-toolkit[...]==X.Y.* pin overrides the group's looser ==X.*.
+  pushd ./cuda_core
+
   PIP_ARGS=(
     "${PATHFINDER_WHL[@]}"
     "${BINDINGS_WHL[@]}"
     "${CORE_WHL[@]}"
     --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}"
-    "cuda-toolkit==${CUDA_VER_MINOR}.*"
   )
 
   if [[ "${test_module}" == "nightly-pytorch" ]]; then
     # TORCH_VER and TORCH_CUDA must be set by the caller.
+    # Use cuda-toolkit[cudart] only — torch brings its own nvcc/nvrtc/etc.
+    # This avoids version conflicts between our nvidia-* pins and torch's.
     echo "Installing pathfinder + bindings + core + test deps + PyTorch ${TORCH_VER} (${TORCH_CUDA})"
+    PIP_ARGS+=("cuda-toolkit[cudart]==${CUDA_VER_MINOR}.*")
     if [[ "${TORCH_VER}" == "latest" ]]; then
       PIP_ARGS+=(torch)
     else
@@ -150,7 +148,13 @@ elif [[ "${test_module}" == "nightly-pytorch" || "${test_module}" == "nightly-nu
     PIP_ARGS+=(--extra-index-url "https://download.pytorch.org/whl/${TORCH_CUDA}")
   elif [[ "${test_module}" == "nightly-numba-cuda" ]]; then
     echo "Installing pathfinder + bindings + core + test deps + numba-cuda"
-    PIP_ARGS+=("numba-cuda[cu${TEST_CUDA_MAJOR}]")
+    # numba-cuda's test-cuXX group deps (can't use --group for a wheel install):
+    PIP_ARGS+=(
+      "cuda-toolkit[curand,cublas]==${CUDA_VER_MINOR}.*"
+      "numba-cuda[cu${TEST_CUDA_MAJOR}]"
+      "cupy-cuda${TEST_CUDA_MAJOR}x"
+      psutil cffi pytest-xdist pytest-benchmark filecheck ml_dtypes statistics
+    )
   fi
 
   pip install "${PIP_ARGS[@]}"

From cec1d8d9f443374970abca8bf8317661831d37b3 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Thu, 30 Apr 2026 19:49:57 +0000
Subject: [PATCH 9/9] Remove numba-cuda test_linker.py patch (fixed upstream)

The indentation bug in test_linker.py was fixed in the latest
numba-cuda release, so the workaround patch is no longer needed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test-wheel-linux.yml   |   4 -
 .github/workflows/test-wheel-windows.yml |   5 --
 ci/tools/patch-numba-cuda                | 105 -----------------------
 3 files changed, 114 deletions(-)
 delete mode 100755 ci/tools/patch-numba-cuda

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index a546c8cf47b..e1a36bc086b 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -368,10 +368,6 @@ jobs:
           pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
           popd
 
-      - name: Patch numba-cuda (upstream bug workarounds)
-        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
-        run: python ci/tools/patch-numba-cuda
-
       - name: Run numba-cuda tests
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
         run: python -m numba.runtests numba.cuda.tests
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 0fef1e5b8af..eefc9273594 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -356,11 +356,6 @@ jobs:
           pytest -rxXs -v --durations=0 tests/test_utils.py tests/example_tests/
           popd
 
-      - name: Patch numba-cuda (upstream bug workarounds)
-        if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: python ci/tools/patch-numba-cuda
-
       - name: Run numba-cuda tests
         if: ${{ inputs.test-mode == 'nightly-numba-cuda' }}
         shell: bash --noprofile --norc -xeuo pipefail {0}
diff --git a/ci/tools/patch-numba-cuda b/ci/tools/patch-numba-cuda
deleted file mode 100755
index 4e0b94c2688..00000000000
--- a/ci/tools/patch-numba-cuda
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/usr/bin/env python3
-
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-"""Patch known upstream bugs in installed numba-cuda before running tests.
-
-These patches are temporary workarounds; each should be removed once the
-corresponding upstream fix is released.
-"""
-
-import pathlib
-import site
-
-
-def patch_test_linker_indent():
-    """Fix indentation bug in test_linker.py.
-
-    add_from_numba and debuggable_kernel reference test_device_functions_ltoir
-    which is only defined inside ``if TEST_BIN_DIR:``.  They must be indented
-    under that block.
-
-    Upstream: https://github.com/NVIDIA/numba-cuda/blob/200c2b96/
-              numba_cuda/numba/cuda/tests/cudadrv/test_linker.py#L120
-    """
-    # Find the installed test_linker.py across all site-packages paths
-    rel_path = pathlib.Path("numba_cuda", "numba", "cuda", "tests", "cudadrv", "test_linker.py")
-    target = None
-    for sp in site.getsitepackages():
-        candidate = pathlib.Path(sp) / rel_path
-        if candidate.exists():
-            target = candidate
-            break
-    if target is None:
-        # Fallback: locate via importlib
-        try:
-            import numba_cuda
-
-            pkg_dir = pathlib.Path(numba_cuda.__file__).parent
-            candidate = pkg_dir / "numba" / "cuda" / "tests" / "cudadrv" / "test_linker.py"
-            if candidate.exists():
-                target = candidate
-        except ImportError:
-            pass
-    if target is None:
-        print("SKIP: test_linker.py not found in any site-packages")
-        return
-    print(f"Found: {target}")
-
-    src = target.read_text()
-
-    old = (
-        "\nadd_from_numba = cuda.declare_device(\n"
-        '    "add_from_numba",\n'
-        '    "int32(int32, int32)",\n'
-        "    link=[test_device_functions_ltoir],\n"
-        ")\n"
-        "\n"
-        "\n"
-        "def debuggable_kernel(result):\n"
-        "    i = cuda.grid(1)\n"
-        "    result[i] = add_from_numba(i, i)"
-    )
-
-    new = (
-        "\n    add_from_numba = cuda.declare_device(\n"
-        '        "add_from_numba",\n'
-        '        "int32(int32, int32)",\n'
-        "        link=[test_device_functions_ltoir],\n"
-        "    )\n"
-        "\n"
-        "    def debuggable_kernel(result):\n"
-        "        i = cuda.grid(1)\n"
-        "        result[i] = add_from_numba(i, i)"
-    )
-
-    if old not in src:
-        print(f"SKIP: indent patch target not found in {target} (already patched?)")
-        return
-
-    src = src.replace(old, new)
-
-    # Part 2: add @unittest.skipUnless to test_debug_kernel_with_lto.
-    # After moving debuggable_kernel inside `if TEST_BIN_DIR:`, the symbol
-    # is undefined when the env var is not set, so the test must be skipped.
-    old_test = "    def test_debug_kernel_with_lto(self):\n"
-    new_test = (
-        '    @unittest.skipUnless(TEST_BIN_DIR, "NUMBA_CUDA_TEST_BIN_DIR not set")\n'
-        "    def test_debug_kernel_with_lto(self):\n"
-    )
-
-    if old_test not in src:
-        print(f"SKIP: skip-decorator patch target not found in {target}")
-    elif new_test in src:
-        print("SKIP: skip decorator already present")
-    else:
-        src = src.replace(old_test, new_test, 1)
-
-    target.write_text(src)
-    print(f"PATCHED: {target}")
-
-
-if __name__ == "__main__":
-    patch_test_linker_indent()