From b1b6070a823898a01e87c49cb1682f6d65a96eb9 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 01:48:44 +0000
Subject: [PATCH 1/4] CI: allow specifying custom driver versions in test
 matrix

Extends the DRIVER field in ci/test-matrix.yml beyond 'latest'/'earliest'
to accept an explicit version string (e.g. '580.65.06'). For Linux,
ci/tools/install_gpu_driver.sh (adapted from nv-gha-runners/vm-images
PR #256) swaps the driver in-job via nsenter when the row uses a custom
version; for Windows, ci/tools/install_gpu_driver.ps1 is split into
install + configure_driver_mode, with the install step gated on the
DRIVER value and the mode step always running.

The matrix row is routed to a 'latest' runner image when the DRIVER is
a custom version (the install scripts perform the swap themselves).
Container privileges on Linux (--privileged --pid=host) are added only
on rows with a custom DRIVER. Custom DRIVER + FLAVOR=wsl is rejected
eagerly in the compute-matrix step.

Two existing nightly-numba-cuda rows exercise the new path:
- Linux amd64 / 13.3.0 / l4 -> 580.65.06
- Windows amd64 / 13.3.0 / l4 -> 610.47

Closes #293
Closes #1265
---
 .github/workflows/coverage.yml           |   8 +-
 .github/workflows/test-wheel-linux.yml   |  28 +++-
 .github/workflows/test-wheel-windows.yml |  20 ++-
 ci/test-matrix.yml                       |  13 +-
 ci/tools/configure_driver_mode.ps1       |  45 ++++++
 ci/tools/install_gpu_driver.ps1          |  51 +++----
 ci/tools/install_gpu_driver.sh           | 167 +++++++++++++++++++++++
 7 files changed, 288 insertions(+), 44 deletions(-)
 create mode 100644 ci/tools/configure_driver_mode.ps1
 create mode 100755 ci/tools/install_gpu_driver.sh

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 9581cff3088..de1e713e499 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -275,13 +275,15 @@ jobs:
         uses: nv-gha-runners/setup-proxy-cache@main
         continue-on-error: true
 
-      - name: Update driver
+      # DRIVER above is 'latest' so install_gpu_driver.ps1 is intentionally
+      # skipped (it errors on latest/earliest); configure_driver_mode.ps1
+      # still runs to put the pre-installed driver into TCC mode.
+      - name: Configure driver mode
         shell: powershell
         env:
           DRIVER_MODE: "TCC"
-          GPU_TYPE: "a100"
         run: |
-          ci/tools/install_gpu_driver.ps1
+          ci/tools/configure_driver_mode.ps1
 
       - name: Ensure GPU is working
         run: |
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index f8002f5124a..4f56cb57740 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -85,8 +85,13 @@ jobs:
           # Read base matrix from YAML file for the specific architecture
           TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)
 
-          # Apply matrix filter and wrap in include structure
-          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
+          # Apply matrix filter; reject custom DRIVER + FLAVOR=wsl (the
+          # in-container driver swap doesn't work under WSL); add a
+          # RUNNER_DRIVER field that maps any custom version back to
+          # 'latest' (the install script swaps the driver itself, so we
+          # need to land on the runner that ships with the most recent
+          # pre-installed driver); wrap in include structure.
+          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
 
           echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
 
@@ -101,13 +106,16 @@ jobs:
     strategy:
       fail-fast: false
       matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
-    runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
+    runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
     # The build stage could fail but we want the CI to keep moving.
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     # Our self-hosted runners require a container
     # TODO: use a different (nvidia?) container
     container:
-      options: -u root --security-opt seccomp=unconfined --shm-size 16g
+      # Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh
+      # can nsenter to the host for the install + refresh the toolkit bind mounts
+      # back inside the container. Stock options for latest/earliest rows.
+      options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }}
       image: ubuntu:22.04
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
@@ -131,6 +139,18 @@ jobs:
           dependencies: "jq wget libgl1 libegl1 g++"
           dependent_exes: "jq wget"
 
+      - name: Install GPU driver
+        if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
+        env:
+          DRIVER: ${{ matrix.DRIVER }}
+          GPU_TYPE: ${{ matrix.GPU }}
+        run: |
+          # util-linux for nsenter; install_gpu_driver.sh re-execs onto the
+          # host (requires --privileged --pid=host on the container, set
+          # conditionally above) and refreshes the toolkit bind mounts here.
+          apt-get -y install --no-install-recommends util-linux
+          ./ci/tools/install_gpu_driver.sh
+
       - name: Set environment variables
         env:
           BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }}
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 320817177f3..5675b395afe 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -81,8 +81,11 @@ jobs:
           # Read base matrix from YAML file for the specific architecture
           TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)
 
-          # Apply matrix filter and wrap in include structure
-          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
+          # Apply matrix filter; add a RUNNER_DRIVER field that maps any
+          # custom DRIVER version back to 'latest' (install_gpu_driver.ps1
+          # swaps the driver itself, so the runner must be the one that
+          # ships the most recent pre-installed driver); wrap in include.
+          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
 
           echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
 
@@ -95,7 +98,7 @@ jobs:
       fail-fast: false
       matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
-    runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
+    runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
     steps:
       - name: Checkout ${{ github.event.repository.name }}
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
@@ -106,13 +109,20 @@ jobs:
         with:
           enable-apt: true
 
-      - name: Update driver
+      - name: Install GPU driver
+        if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
         env:
-          DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
+          DRIVER: ${{ matrix.DRIVER }}
           GPU_TYPE: ${{ matrix.GPU }}
         run: |
           ci/tools/install_gpu_driver.ps1
 
+      - name: Configure driver mode
+        env:
+          DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
+        run: |
+          ci/tools/configure_driver_mode.ps1
+
       - name: Ensure GPU is working
         run: |
           nvidia-smi
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 95c5e714caa..3d5693a188a 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -13,7 +13,16 @@
 # Windows entries also include DRIVER_MODE.
 #
 # Notes:
+# - DRIVER accepts:
+#     * 'latest'   - use the runner's pre-installed latest driver (no install step)
+#     * 'earliest' - use the runner's pre-installed earliest driver (no install step)
+#     * a version string (e.g. '580.65.06')
+#                  - install that version via ci/tools/install_gpu_driver.sh (Linux)
+#                    or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the
+#                    job. The matrix row is routed to the 'latest' runner image (the
+#                    install scripts swap the driver themselves).
 # - DRIVER: 'earliest' does not work with CUDA 12.9.1
+# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux.
 
 linux:
   pull-request:
@@ -74,7 +83,7 @@ linux:
     - { MODE: 'nightly-pytorch',    ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     # nightly-standard (arm64 l4×2 — nightly-only per runner team request)
@@ -113,4 +122,4 @@ windows:
     - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47',  DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1
new file mode 100644
index 00000000000..280e725e11b
--- /dev/null
+++ b/ci/tools/configure_driver_mode.ps1
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI
+# runner and cycle the display devices so the new mode takes effect
+# without rebooting. Always runs (whether or not install_gpu_driver.ps1
+# just ran). When install_gpu_driver.ps1 has run, this single device
+# cycle also activates the freshly-installed driver.
+#
+# Inputs (env):
+#   DRIVER_MODE  One of WDDM, TCC, MCDM.
+
+function Set-DriverMode {
+    # Apply $env:DRIVER_MODE via nvidia-smi, then disable/enable each NVIDIA display adapter.
+    # Map matrix DRIVER_MODE to nvidia-smi -fdm code.
+    # This assumes prior knowledge of which GPU can use which mode.
+    $driver_mode = $env:DRIVER_MODE
+    if ($driver_mode -eq "WDDM") {
+        Write-Output "Setting driver mode to WDDM..."
+        nvidia-smi -fdm 0
+    } elseif ($driver_mode -eq "TCC") {
+        Write-Output "Setting driver mode to TCC..."
+        nvidia-smi -fdm 1
+    } elseif ($driver_mode -eq "MCDM") {
+        Write-Output "Setting driver mode to MCDM..."
+        nvidia-smi -fdm 2
+    } else {
+        Write-Output "Unknown driver mode: $driver_mode"
+        exit 1
+    }
+
+    # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
+    $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
+    foreach ($device in $nvidia_devices) {
+        Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
+        pnputil /disable-device "$($device.InstanceId)"
+        pnputil /enable-device "$($device.InstanceId)"
+    }
+    # Give the devices a few seconds to settle:
+    Start-Sleep -Seconds 5
+}
+
+# Run the functions
+Set-DriverMode
diff --git a/ci/tools/install_gpu_driver.ps1 b/ci/tools/install_gpu_driver.ps1
index c98416c87e2..e61c6bbdbb1 100644
--- a/ci/tools/install_gpu_driver.ps1
+++ b/ci/tools/install_gpu_driver.ps1
@@ -1,13 +1,30 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
+#
+# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a
+# Windows CI runner. Driver-mode selection and the post-install device
+# power-cycle are the responsibility of configure_driver_mode.ps1, which
+# the workflow runs immediately after this script (or by itself when
+# DRIVER is 'latest'/'earliest' and the runner already brings up the
+# right driver).
+#
+# Inputs (env):
+#   DRIVER    Driver version, e.g. "610.47". Must NOT be 'latest' or
+#             'earliest' -- those are runner-pre-installed and the
+#             workflow is expected to skip this script for them.
+#   GPU_TYPE  Lower-case GPU label from the matrix (e.g. "l4", "rtx4090").
+#             Selects the data-center vs desktop installer variant.
 
 # Install the driver
 function Install-Driver {
 
-    # Set the correct URL, filename, and arguments to the installer
-    # This driver is picked to support Windows 11 & CUDA 13.0
-    $version = '581.15'
+    # Driver version is plumbed from the matrix via the DRIVER env var.
+    $version = $env:DRIVER
+    if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') {
+        Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'."
+        exit 1
+    }
 
     # Get GPU type from environment variable
     $gpu_type = $env:GPU_TYPE
@@ -54,33 +71,7 @@ function Install-Driver {
     # Install the file with the specified path from earlier
     Write-Output 'Running the driver installer...'
     Start-Process -FilePath $filepath -ArgumentList $install_args -Wait
-    Write-Output 'Done!'
-
-    # Handle driver mode configuration
-    # This assumes we have the prior knowledge on which GPU can use which mode.
-    $driver_mode = $env:DRIVER_MODE
-    if ($driver_mode -eq "WDDM") {
-        Write-Output "Setting driver mode to WDDM..."
-        nvidia-smi -fdm 0
-    } elseif ($driver_mode -eq "TCC") {
-        Write-Output "Setting driver mode to TCC..."
-        nvidia-smi -fdm 1
-    } elseif ($driver_mode -eq "MCDM") {
-        Write-Output "Setting driver mode to MCDM..."
-        nvidia-smi -fdm 2
-    } else {
-        Write-Output "Unknown driver mode: $driver_mode"
-        exit 1
-    }
-    # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
-    $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
-    foreach ($device in $nvidia_devices) {
-        Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
-        pnputil /disable-device "$($device.InstanceId)"
-        pnputil /enable-device "$($device.InstanceId)"
-    }
-    # Give it a minute to settle:
-    Start-Sleep -Seconds 5
+    Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.'
 }
 
 # Run the functions
diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
new file mode 100755
index 00000000000..5dff7043487
--- /dev/null
+++ b/ci/tools/install_gpu_driver.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# install_gpu_driver.sh -- install a specific NVIDIA driver version on a
+# Linux CI runner. Adapted from nv-gha-runners/vm-images PR #256
+# (`nvgha-driver` CLI), trimmed and parameterised for cuda-python's CI.
+#
+# !!! ALPHA !!!
+# Performs live modifications to the host driver stack (kernel module
+# reload, package replacement, and -- inside containers -- toolkit
+# bind-mount refresh) and may cause issues.
+#
+# Inputs (env):
+#   DRIVER    Driver version, e.g. "580.65.06". Must NOT be 'latest' or
+#             'earliest' -- those are runner-pre-installed and the
+#             workflow is expected to skip this script for them.
+#   GPU_TYPE  Lower-case GPU label from the matrix (e.g. "v100", "l4",
+#             "h100"). Used only to pick the kernel module flavor
+#             (Volta needs the proprietary/legacy module; everything
+#             newer can use the open module).
+#
+# Arch is detected from `uname -m`.
+#
+# When the script runs inside a container (the cuda-python Linux jobs do)
+# it re-execs itself on the host via `nsenter`. The job must declare
+# `options: --privileged --pid=host` (the workflow only does this for
+# matrix rows with a custom DRIVER). After the host-side install, the
+# container's bind-mounted nvidia libs/binaries are refreshed in-place so
+# the new driver is visible without restarting the container.
+set -euo pipefail
+
+: "${DRIVER:?DRIVER env var is required (e.g. 580.65.06)}"
+: "${GPU_TYPE:?GPU_TYPE env var is required (e.g. l4)}"
+
+case "$DRIVER" in
+  latest|earliest)
+    echo "::error::install_gpu_driver.sh must not be invoked with DRIVER=$DRIVER (runner-pre-installed)" >&2
+    exit 1
+    ;;
+esac
+
+VERSION="$DRIVER"
+
+# Volta (V100) requires the legacy/proprietary kernel module; all newer
+# GPUs in this matrix support the open module. Extend this if/when older
+# GPUs return to the matrix.
+case "$GPU_TYPE" in
+  v100) KMT=proprietary ;;
+  *)    KMT=open ;;
+esac
+
+case "$(uname -m)" in
+  x86_64)
+    ARCH_DIR=Linux-x86_64
+    ARCH_SUFFIX=x86_64
+    ;;
+  aarch64)
+    ARCH_DIR=Linux-aarch64
+    ARCH_SUFFIX=aarch64
+    ;;
+  *)
+    echo "::error::unsupported arch: $(uname -m)" >&2
+    exit 1
+    ;;
+esac
+
+URL="https://us.download.nvidia.com/XFree86/${ARCH_DIR}/${VERSION}/NVIDIA-Linux-${ARCH_SUFFIX}-${VERSION}.run"
+
+# Re-elevate to root if needed (sudo is preinstalled on the runner image).
+if [ "$(id -u)" != 0 ]; then
+  exec sudo -E DRIVER="$DRIVER" GPU_TYPE="$GPU_TYPE" "$0" "$@"
+fi
+
+echo "install_gpu_driver.sh is ALPHA -- it performs live modifications to the host driver stack and may cause issues" >&2
+echo "DRIVER=${VERSION}  GPU_TYPE=${GPU_TYPE}  KMT=${KMT}  ARCH=${ARCH_SUFFIX}" >&2
+echo "URL=${URL}" >&2
+
+# Toolkit packages we keep across the purge: dockerd's --runtime=nvidia
+# resolves nvidia-container-runtime through these, and removing them
+# breaks `docker exec` against any container started with that runtime.
+KEEP_RE='^(nvidia-container-toolkit(-base)?|libnvidia-container1|libnvidia-container-tools)$'
+
+in_container() {  # succeeds if /.dockerenv exists or PID 1's cgroup path names docker/kubepods/containerd
+  [ -f /.dockerenv ] || grep -qE '/(docker|kubepods|containerd)' /proc/1/cgroup 2>/dev/null
+}
+
+host_install() {  # host side: replace the installed NVIDIA driver with the .run build at $URL
+  apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod
+  # Quiesce the GPU first: stop services, kill device holders, unload the old modules.
+  systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true
+  # if-test instead of `fuser ... || true` so a kill failure surfaces
+  # (fuser exits 1 when nothing holds the device, which is the happy path).
+  if fuser /dev/nvidia* >/dev/null 2>&1; then
+    fuser -kv /dev/nvidia*
+  fi
+  sleep 1
+  for m in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
+    rmmod "$m" 2>/dev/null || true
+  done
+
+  # Purge existing nvidia/libnvidia packages, except the toolkit pieces
+  # captured by KEEP_RE. Tolerate apt failures: postrm scripts can trip
+  # and the .run installer is about to replace everything anyway.
+  dpkg-query -W -f='${Package}\n' 'nvidia-*' 'libnvidia-*' 2>/dev/null \
+    | awk -v re="$KEEP_RE" '$0 !~ re' \
+    | xargs -r apt-get -y remove --purge || true
+
+  local d; d=$(mktemp -d)
+  ( cd "$d" \
+    && wget -q -O installer.run "$URL" \
+    && sh installer.run --silent --dkms --no-questions \
+         --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" )
+  # -a is required: without it modprobe passes the extra names to 'nvidia' as parameters.
+  modprobe -a nvidia nvidia_uvm nvidia_modeset
+}
+
+# Replace the toolkit's bind-mounted nvidia libs/binaries inside this
+# container with copies from the host's new install. `cp` (not
+# `mount --bind`) because procfs-routed binds drop the exec bit.
+refresh_container_libs() {
+  # Walk /proc/self/mountinfo and match the toolkit-injected nvidia
+  # binds via mount point (field 5) so deleted source paths -- which
+  # the kernel suffixes field 4 with " (deleted)" once the host unlinks
+  # the old lib -- don't break discovery. Filters skip what we can't or
+  # shouldn't refresh:
+  #   $3 ~ /^0:/                tmpfs/proc/sysfs (e.g. the toolkit hook tmpfs)
+  #   $5 ~ /\.json$/            vulkan/glvnd config remaps (not version-bound)
+  #   $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers
+  local mounts
+  mounts=$(awk '
+    $3 !~ /^0:/                     &&
+    $5 !~ /\.json$/                 &&
+    $5 !~ /\/(firmware|xorg)\//     &&
+    $5 ~ /(nvidia|libcuda)/         { print $5 }
+  ' /proc/self/mountinfo | sort -u)
+
+  for tgt in $mounts; do  # word-splits $mounts; assumes mount points contain no whitespace
+    local src="/proc/1/root$tgt"  # same path under PID 1's root (the host, given --pid=host)
+    if [ ! -e "$src" ]; then
+      # Driver swap rewrites the version suffix (libfoo.so.595.71.05 ->
+      # libfoo.so.580.65.06); strip it and find the new file.
+      local base
+      base=$(basename "$tgt")
+      base="${base%.so.*}.so"
+      src=$(find "/proc/1/root$(dirname "$tgt")" -maxdepth 1 -name "${base}.*" 2>/dev/null \
+            | sort -V | tail -n1)
+      [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; }
+    fi
+    umount "$tgt" 2>/dev/null || true  # best-effort: detach the old bind before replacing the file
+    cp -f --remove-destination "$src" "$tgt" \
+      || echo "WARN: refresh failed for $tgt (src=$src)" >&2
+  done
+  ldconfig  # rebuild the dynamic linker cache after the copies
+}
+
+if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
+  _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \
+    || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; }
+  refresh_container_libs
+else
+  host_install
+fi
+
+nvidia-smi >/dev/null
+grep -qF "$VERSION" /proc/driver/nvidia/version

From 3e016b572dcded7701f9b2f12c25cec3cb7e5b1d Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 03:09:01 +0000
Subject: [PATCH 2/4] CI: fix Linux driver nsenter re-exec, swap Windows
 version, enable ci.yml dispatch

- install_gpu_driver.sh: pipe the script body to the host-side bash via
  stdin (bash -s < "$0") instead of re-execing "$0". The script lives
  in the GH workspace mount (container-only), so the relative path
  doesn't resolve after nsenter switches the mount namespace.
  The < "$0" fd is opened before nsenter and survives the flip.
- test-matrix.yml: Windows nightly-numba-cuda row 610.47 -> 596.36
  (610.47 isn't published on the CDN; install hit 404).
- ci.yml: add workflow_dispatch: trigger so the pipeline can be
  re-run manually. The existing should-skip / detect-changes gates
  already handle non-PR events.
---
 .github/workflows/ci.yml       | 1 +
 ci/test-matrix.yml             | 2 +-
 ci/tools/install_gpu_driver.sh | 8 +++++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9f9236b09fe..82ab7210c92 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,7 @@ on:
   schedule:
     # every 24 hours at midnight UTC
     - cron: "0 0 * * *"
+  workflow_dispatch:
 
 jobs:
   ci-vars:
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 3d5693a188a..730791ac283 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -122,4 +122,4 @@ windows:
     - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47',  DRIVER_MODE: 'TCC' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36',  DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index 5dff7043487..b7aeb3434f4 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -156,7 +156,13 @@ refresh_container_libs() {
 }
 
 if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
-  _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \
+  # Re-exec on the host. The runner-team's `nvgha-driver` script lives at a
+  # host-side absolute path so `"$0"` survives the mount-namespace flip;
+  # ours lives in the GH workspace mount (container-only), so we pipe the
+  # script body in via stdin instead -- the `< "$0"` fd is opened before
+  # nsenter and stays valid across the namespace switch. Env vars (DRIVER,
+  # GPU_TYPE, _NVDRV_NSENTERED) are inherited by the host-side bash.
+  _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \
     || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; }
   refresh_container_libs
 else

From c0ca8696e64c7fb1e275ae628905ba6d86144279 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 03:27:53 +0000
Subject: [PATCH 3/4] CI: move 'Ensure GPU is working' after 'Install GPU
 driver' on Linux

So nvidia-smi validates the post-install driver state on custom-DRIVER
rows. Windows test-wheel already runs Install -> Configure -> Ensure
(coverage: Configure -> Ensure); this brings Linux test-wheel into line.
---
 .github/workflows/test-wheel-linux.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 4f56cb57740..57bc4dc555f 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -121,9 +121,6 @@ jobs:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
         PIP_CACHE_DIR: "/tmp/pip-cache"
     steps:
-      - name: Ensure GPU is working
-        run: nvidia-smi
-
       - name: Checkout ${{ github.event.repository.name }}
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
 
@@ -151,6 +148,9 @@ jobs:
           apt-get -y install --no-install-recommends util-linux
           ./ci/tools/install_gpu_driver.sh
 
+      - name: Ensure GPU is working
+        run: nvidia-smi
+
       - name: Set environment variables
         env:
           BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }}

From 4a23b23a26111a8ead0cc40519a2b558ea9bfe66 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 03:30:30 +0000
Subject: [PATCH 4/4] CI: flip two PR-matrix Linux rows to DRIVER=610.43.02

Exercises the custom-driver install path on every PR (not just nightly).
Both rows are amd64 / 13.3.0 / local-CTK, on l4 and rtxpro6000 -- both
in the 'open' kernel-module flavor (only Volta needs 'proprietary').
---
 ci/test-matrix.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 730791ac283..51f0d3f063f 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -38,10 +38,10 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: '610.43.02' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }