From b1b6070a823898a01e87c49cb1682f6d65a96eb9 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 01:48:44 +0000 Subject: [PATCH 1/4] CI: allow specifying custom driver versions in test matrix Extends the DRIVER field in ci/test-matrix.yml beyond 'latest'/'earliest' to accept an explicit version string (e.g. '580.65.06'). For Linux, ci/tools/install_gpu_driver.sh (adapted from nv-gha-runners/vm-images PR #256) swaps the driver in-job via nsenter when the row uses a custom version; for Windows, ci/tools/install_gpu_driver.ps1 is split into install + configure_driver_mode, with the install step gated on the DRIVER value and the mode step always running. The matrix row is routed to a 'latest' runner image when the DRIVER is a custom version (the install scripts perform the swap themselves). Container privileges on Linux (--privileged --pid=host) are added only on rows with a custom DRIVER. Custom DRIVER + FLAVOR=wsl is rejected eagerly in the compute-matrix step. Two existing nightly-numba-cuda rows exercise the new path: - Linux amd64 / 13.3.0 / l4 -> 580.65.06 - Windows amd64 / 13.3.0 / l4 -> 610.47 Closes #293 Closes #1265 --- .github/workflows/coverage.yml | 8 +- .github/workflows/test-wheel-linux.yml | 28 +++- .github/workflows/test-wheel-windows.yml | 20 ++- ci/test-matrix.yml | 13 +- ci/tools/configure_driver_mode.ps1 | 45 ++++++ ci/tools/install_gpu_driver.ps1 | 51 +++---- ci/tools/install_gpu_driver.sh | 167 +++++++++++++++++++++++ 7 files changed, 288 insertions(+), 44 deletions(-) create mode 100644 ci/tools/configure_driver_mode.ps1 create mode 100755 ci/tools/install_gpu_driver.sh diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 9581cff3088..de1e713e499 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -275,13 +275,15 @@ jobs: uses: nv-gha-runners/setup-proxy-cache@main continue-on-error: true - - name: Update driver + # DRIVER above is 'latest' so install_gpu_driver.ps1 is 
intentionally + # skipped (it errors on latest/earliest); configure_driver_mode.ps1 + # still runs to put the pre-installed driver into TCC mode. + - name: Configure driver mode shell: powershell env: DRIVER_MODE: "TCC" - GPU_TYPE: "a100" run: | - ci/tools/install_gpu_driver.ps1 + ci/tools/configure_driver_mode.ps1 - name: Ensure GPU is working run: | diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index f8002f5124a..4f56cb57740 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -85,8 +85,13 @@ jobs: # Read base matrix from YAML file for the specific architecture TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) - # Apply matrix filter and wrap in include structure - MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') + # Apply matrix filter; reject custom DRIVER + FLAVOR=wsl (the + # in-container driver swap doesn't work under WSL); add a + # RUNNER_DRIVER field that maps any custom version back to + # 'latest' (the install script swaps the driver itself, so we + # need to land on the runner that ships with the most recent + # pre-installed driver); wrap in include structure. + MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" @@ -101,13 +106,16 @@ jobs: strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} - runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" + runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}" # The build stage could fail but we want the CI to keep moving. if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} # Our self-hosted runners require a container # TODO: use a different (nvidia?) container container: - options: -u root --security-opt seccomp=unconfined --shm-size 16g + # Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh + # can nsenter to the host for the install + refresh the toolkit bind mounts + # back inside the container. Stock options for latest/earliest rows. + options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }} image: ubuntu:22.04 env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} @@ -131,6 +139,18 @@ jobs: dependencies: "jq wget libgl1 libegl1 g++" dependent_exes: "jq wget" + - name: Install GPU driver + if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }} + env: + DRIVER: ${{ matrix.DRIVER }} + GPU_TYPE: ${{ matrix.GPU }} + run: | + # util-linux for nsenter; install_gpu_driver.sh re-execs onto the + # host (requires --privileged --pid=host on the container, set + # conditionally above) and refreshes the toolkit bind mounts here. 
+ apt-get -y install --no-install-recommends util-linux + ./ci/tools/install_gpu_driver.sh + - name: Set environment variables env: BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }} diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 320817177f3..5675b395afe 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -81,8 +81,11 @@ jobs: # Read base matrix from YAML file for the specific architecture TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) - # Apply matrix filter and wrap in include structure - MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') + # Apply matrix filter; add a RUNNER_DRIVER field that maps any + # custom DRIVER version back to 'latest' (install_gpu_driver.ps1 + # swaps the driver itself, so the runner must be the one that + # ships the most recent pre-installed driver); wrap in include. + MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" @@ -95,7 +98,7 @@ jobs: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} - runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" + runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}" steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -106,13 +109,20 @@ jobs: with: enable-apt: true - - name: Update driver + - name: Install GPU driver + if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }} env: - DRIVER_MODE: ${{ matrix.DRIVER_MODE }} + DRIVER: ${{ matrix.DRIVER }} GPU_TYPE: ${{ matrix.GPU }} run: | ci/tools/install_gpu_driver.ps1 + - name: Configure driver mode + env: + DRIVER_MODE: ${{ matrix.DRIVER_MODE }} + run: | + ci/tools/configure_driver_mode.ps1 + - name: Ensure GPU is working run: | nvidia-smi diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 95c5e714caa..3d5693a188a 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -13,7 +13,16 @@ # Windows entries also include DRIVER_MODE. # # Notes: +# - DRIVER accepts: +# * 'latest' - use the runner's pre-installed latest driver (no install step) +# * 'earliest' - use the runner's pre-installed earliest driver (no install step) +# * a version string (e.g. '580.65.06') +# - install that version via ci/tools/install_gpu_driver.sh (Linux) +# or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the +# job. The matrix row is routed to the 'latest' runner image (the +# install scripts swap the driver themselves). # - DRIVER: 'earliest' does not work with CUDA 12.9.1 +# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux. 
linux: pull-request: @@ -74,7 +83,7 @@ linux: - { MODE: 'nightly-pytorch', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' } - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } # nightly-standard (arm64 l4×2 — nightly-only per runner team request) @@ -113,4 +122,4 @@ windows: - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47', DRIVER_MODE: 'TCC' } diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 new file mode 100644 index 00000000000..280e725e11b --- /dev/null +++ b/ci/tools/configure_driver_mode.ps1 @@ 
-0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI +# runner and cycle the display devices so the new mode takes effect +# without rebooting. Always runs (whether or not install_gpu_driver.ps1 +# just ran). When install_gpu_driver.ps1 has run, this single device +# cycle also activates the freshly-installed driver. +# +# Inputs (env): +# DRIVER_MODE One of WDDM, TCC, MCDM. + +function Set-DriverMode { + + # Map matrix DRIVER_MODE to nvidia-smi -fdm code. + # This assumes we have the prior knowledge on which GPU can use which mode. + $driver_mode = $env:DRIVER_MODE + if ($driver_mode -eq "WDDM") { + Write-Output "Setting driver mode to WDDM..." + nvidia-smi -fdm 0 + } elseif ($driver_mode -eq "TCC") { + Write-Output "Setting driver mode to TCC..." + nvidia-smi -fdm 1 + } elseif ($driver_mode -eq "MCDM") { + Write-Output "Setting driver mode to MCDM..." + nvidia-smi -fdm 2 + } else { + Write-Output "Unknown driver mode: $driver_mode" + exit 1 + } + + # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) + $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" + foreach ($device in $nvidia_devices) { + Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" + pnputil /disable-device "$($device.InstanceId)" + pnputil /enable-device "$($device.InstanceId)" + } + # Give the re-enabled devices a few seconds to settle: + Start-Sleep -Seconds 5 +} + +# Run the functions +Set-DriverMode diff --git a/ci/tools/install_gpu_driver.ps1 b/ci/tools/install_gpu_driver.ps1 index c98416c87e2..e61c6bbdbb1 100644 --- a/ci/tools/install_gpu_driver.ps1 +++ b/ci/tools/install_gpu_driver.ps1 @@ -1,13 +1,30 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# # SPDX-License-Identifier: Apache-2.0 +# +# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a +# Windows CI runner. Driver-mode selection and the post-install device +# power-cycle are the responsibility of configure_driver_mode.ps1, which +# the workflow runs immediately after this script (or by itself when +# DRIVER is 'latest'/'earliest' and the runner already brings up the +# right driver). +# +# Inputs (env): +# DRIVER Driver version, e.g. "610.47". Must NOT be 'latest' or +# 'earliest' -- those are runner-pre-installed and the +# workflow is expected to skip this script for them. +# GPU_TYPE Lower-case GPU label from the matrix (e.g. "l4", "rtx4090"). +# Selects the data-center vs desktop installer variant. # Install the driver function Install-Driver { - # Set the correct URL, filename, and arguments to the installer - # This driver is picked to support Windows 11 & CUDA 13.0 - $version = '581.15' + # Driver version is plumbed from the matrix via the DRIVER env var. + $version = $env:DRIVER + if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') { + Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'." + exit 1 + } # Get GPU type from environment variable $gpu_type = $env:GPU_TYPE @@ -54,33 +71,7 @@ function Install-Driver { # Install the file with the specified path from earlier Write-Output 'Running the driver installer...' Start-Process -FilePath $filepath -ArgumentList $install_args -Wait - Write-Output 'Done!' - - # Handle driver mode configuration - # This assumes we have the prior knowledge on which GPU can use which mode. - $driver_mode = $env:DRIVER_MODE - if ($driver_mode -eq "WDDM") { - Write-Output "Setting driver mode to WDDM..." - nvidia-smi -fdm 0 - } elseif ($driver_mode -eq "TCC") { - Write-Output "Setting driver mode to TCC..." - nvidia-smi -fdm 1 - } elseif ($driver_mode -eq "MCDM") { - Write-Output "Setting driver mode to MCDM..." 
- nvidia-smi -fdm 2 - } else { - Write-Output "Unknown driver mode: $driver_mode" - exit 1 - } - # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) - $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" - foreach ($device in $nvidia_devices) { - Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" - pnputil /disable-device "$($device.InstanceId)" - pnputil /enable-device "$($device.InstanceId)" - } - # Give it a minute to settle: - Start-Sleep -Seconds 5 + Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.' } # Run the functions diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh new file mode 100755 index 00000000000..5dff7043487 --- /dev/null +++ b/ci/tools/install_gpu_driver.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# install_gpu_driver.sh -- install a specific NVIDIA driver version on a +# Linux CI runner. Adapted from nv-gha-runners/vm-images PR #256 +# (`nvgha-driver` CLI), trimmed and parameterised for cuda-python's CI. +# +# !!! ALPHA !!! +# Performs live modifications to the host driver stack (kernel module +# reload, package replacement, and -- inside containers -- toolkit +# bind-mount refresh) and may cause issues. +# +# Inputs (env): +# DRIVER Driver version, e.g. "580.65.06". Must NOT be 'latest' or +# 'earliest' -- those are runner-pre-installed and the +# workflow is expected to skip this script for them. +# GPU_TYPE Lower-case GPU label from the matrix (e.g. "v100", "l4", +# "h100"). Used only to pick the kernel module flavor +# (Volta needs the proprietary/legacy module; everything +# newer can use the open module). +# +# Arch is detected from `uname -m`. 
+# +# When the script runs inside a container (the cuda-python Linux jobs do) +# it re-execs itself on the host via `nsenter`. The job must declare +# `options: --privileged --pid=host` (the workflow only does this for +# matrix rows with a custom DRIVER). After the host-side install, the +# container's bind-mounted nvidia libs/binaries are refreshed in-place so +# the new driver is visible without restarting the container. +set -euo pipefail + +: "${DRIVER:?DRIVER env var is required (e.g. 580.65.06)}" +: "${GPU_TYPE:?GPU_TYPE env var is required (e.g. l4)}" + +case "$DRIVER" in + latest|earliest) + echo "::error::install_gpu_driver.sh must not be invoked with DRIVER=$DRIVER (runner-pre-installed)" >&2 + exit 1 + ;; +esac + +VERSION="$DRIVER" + +# Volta (V100) requires the legacy/proprietary kernel module; all newer +# GPUs in this matrix support the open module. Extend this if/when older +# GPUs return to the matrix. +case "$GPU_TYPE" in + v100) KMT=proprietary ;; + *) KMT=open ;; +esac + +case "$(uname -m)" in + x86_64) + ARCH_DIR=Linux-x86_64 + ARCH_SUFFIX=x86_64 + ;; + aarch64) + ARCH_DIR=Linux-aarch64 + ARCH_SUFFIX=aarch64 + ;; + *) + echo "::error::unsupported arch: $(uname -m)" >&2 + exit 1 + ;; +esac + +URL="https://us.download.nvidia.com/XFree86/${ARCH_DIR}/${VERSION}/NVIDIA-Linux-${ARCH_SUFFIX}-${VERSION}.run" + +# Re-elevate to root if needed (sudo is preinstalled on the runner image). +if [ "$(id -u)" != 0 ]; then + exec sudo -E DRIVER="$DRIVER" GPU_TYPE="$GPU_TYPE" "$0" "$@" +fi + +echo "install_gpu_driver.sh is ALPHA -- it performs live modifications to the host driver stack and may cause issues" >&2 +echo "DRIVER=${VERSION} GPU_TYPE=${GPU_TYPE} KMT=${KMT} ARCH=${ARCH_SUFFIX}" >&2 +echo "URL=${URL}" >&2 + +# Toolkit packages we keep across the purge: dockerd's --runtime=nvidia +# resolves nvidia-container-runtime through these, and removing them +# breaks `docker exec` against any container started with that runtime. 
+KEEP_RE='^(nvidia-container-toolkit(-base)?|libnvidia-container1|libnvidia-container-tools)$' + +in_container() { + [ -f /.dockerenv ] || grep -qE '/(docker|kubepods|containerd)' /proc/1/cgroup 2>/dev/null +} + +host_install() { + apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod + + systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true + # if-test instead of `fuser ... || true` so a kill failure surfaces + # (fuser exits 1 when nothing holds the device, which is the happy path). + if fuser /dev/nvidia* >/dev/null 2>&1; then + fuser -kv /dev/nvidia* + fi + sleep 1 + for m in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do + rmmod "$m" 2>/dev/null || true + done + + # Purge existing nvidia/libnvidia packages, except the toolkit pieces + # captured by KEEP_RE. Tolerate apt failures: postrm scripts can trip + # and the .run installer is about to replace everything anyway. + dpkg-query -W -f='${Package}\n' 'nvidia-*' 'libnvidia-*' 2>/dev/null \ + | awk -v re="$KEEP_RE" '$0 !~ re' \ + | xargs -r apt-get -y remove --purge || true + + local d + d=$(mktemp -d) + ( cd "$d" \ + && wget -q -O installer.run "$URL" \ + && sh installer.run --silent --dkms --no-questions \ + --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" ) + modprobe -a nvidia nvidia_uvm nvidia_modeset # -a: every argument is a module name; without it the extra names are passed as parameters to 'nvidia' +} + +# Replace the toolkit's bind-mounted nvidia libs/binaries inside this +# container with copies from the host's new install. `cp` (not +# `mount --bind`) because procfs-routed binds drop the exec bit. +refresh_container_libs() { + # Walk /proc/self/mountinfo and match the toolkit-injected nvidia + # binds via mount point (field 5) so deleted source paths -- which + # the kernel suffixes field 4 with " (deleted)" once the host unlinks + # the old lib -- don't break discovery. Filters skip what we can't or + # shouldn't refresh: + # $3 ~ /^0:/ tmpfs/proc/sysfs (e.g.
the toolkit hook tmpfs) + # $5 ~ /\.json$/ vulkan/glvnd config remaps (not version-bound) + # $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers + local mounts + mounts=$(awk ' + $3 !~ /^0:/ && + $5 !~ /\.json$/ && + $5 !~ /\/(firmware|xorg)\// && + $5 ~ /(nvidia|libcuda)/ { print $5 } + ' /proc/self/mountinfo | sort -u) + + for tgt in $mounts; do + local src="/proc/1/root$tgt" + if [ ! -e "$src" ]; then + # Driver swap rewrites the version suffix (libfoo.so.595.71.05 -> + # libfoo.so.580.65.06); strip it and find the new file. + local base + base=$(basename "$tgt") + base="${base%.so.*}.so" + src=$(find "/proc/1/root$(dirname "$tgt")" -maxdepth 1 -name "${base}.*" 2>/dev/null \ + | sort -V | tail -n1) + [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; } + fi + umount "$tgt" 2>/dev/null || true + cp -f --remove-destination "$src" "$tgt" \ + || echo "WARN: refresh failed for $tgt (src=$src)" >&2 + done + ldconfig +} + +if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then + _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \ + || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; } + refresh_container_libs +else + host_install +fi + +nvidia-smi >/dev/null +grep -qF "$VERSION" /proc/driver/nvidia/version From 3e016b572dcded7701f9b2f12c25cec3cb7e5b1d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 03:09:01 +0000 Subject: [PATCH 2/4] CI: fix Linux driver nsenter re-exec, swap Windows version, enable ci.yml dispatch - install_gpu_driver.sh: pipe the script body to the host-side bash via stdin (bash -s < "$0") instead of re-execing "$0". The script lives in the GH workspace mount (container-only), so the relative path doesn't resolve after nsenter switches the mount namespace. The < "$0" fd is opened before nsenter and survives the flip. - test-matrix.yml: Windows nightly-numba-cuda row 610.47 -> 596.36 (610.47 isn't published on the CDN; install hit 404). 
- ci.yml: add workflow_dispatch: trigger so the pipeline can be re-run manually. The existing should-skip / detect-changes gates already handle non-PR events. --- .github/workflows/ci.yml | 1 + ci/test-matrix.yml | 2 +- ci/tools/install_gpu_driver.sh | 8 +++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f9236b09fe..82ab7210c92 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,7 @@ on: schedule: # every 24 hours at midnight UTC - cron: "0 0 * * *" + workflow_dispatch: jobs: ci-vars: diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 3d5693a188a..730791ac283 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -122,4 +122,4 @@ windows: - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47', DRIVER_MODE: 'TCC' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC' } diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index 5dff7043487..b7aeb3434f4 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -156,7 +156,13 @@ refresh_container_libs() { } if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then - _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \ + # Re-exec on the host. 
The runner-team's `nvgha-driver` script lives at a + # host-side absolute path so `"$0"` survives the mount-namespace flip; + # ours lives in the GH workspace mount (container-only), so we pipe the + # script body in via stdin instead -- the `< "$0"` fd is opened before + # nsenter and stays valid across the namespace switch. Env vars (DRIVER, + # GPU_TYPE, _NVDRV_NSENTERED) are inherited by the host-side bash. + _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \ || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; } refresh_container_libs else From c0ca8696e64c7fb1e275ae628905ba6d86144279 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 03:27:53 +0000 Subject: [PATCH 3/4] CI: move 'Ensure GPU is working' after 'Install GPU driver' on Linux So nvidia-smi validates the post-install driver state on custom-DRIVER rows. Windows test-wheel + coverage already use Install -> Configure -> Ensure; this brings the Linux test-wheel job into line. 
--- .github/workflows/test-wheel-linux.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 4f56cb57740..57bc4dc555f 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -121,9 +121,6 @@ jobs: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} PIP_CACHE_DIR: "/tmp/pip-cache" steps: - - name: Ensure GPU is working - run: nvidia-smi - - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -151,6 +148,9 @@ jobs: apt-get -y install --no-install-recommends util-linux ./ci/tools/install_gpu_driver.sh + - name: Ensure GPU is working + run: nvidia-smi + - name: Set environment variables env: BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }} From 4a23b23a26111a8ead0cc40519a2b558ea9bfe66 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 03:30:30 +0000 Subject: [PATCH 4/4] CI: flip two PR-matrix Linux rows to DRIVER=610.43.02 Exercises the custom-driver install path on every PR (not just nightly). Both rows are amd64 / 13.3.0 / local-CTK, on l4 and rtxpro6000 -- both in the 'open' kernel-module flavor (only Volta needs 'proprietary').
--- ci/test-matrix.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 730791ac283..51f0d3f063f 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -38,10 +38,10 @@ linux: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.43.02' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }