Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ on:
schedule:
# every 24 hours at midnight UTC
- cron: "0 0 * * *"
workflow_dispatch:

jobs:
ci-vars:
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -275,13 +275,15 @@ jobs:
uses: nv-gha-runners/setup-proxy-cache@main
continue-on-error: true

- name: Update driver
# DRIVER above is 'latest' so install_gpu_driver.ps1 is intentionally
# skipped (it errors on latest/earliest); configure_driver_mode.ps1
# still runs to put the pre-installed driver into TCC mode.
- name: Configure driver mode
shell: powershell
env:
DRIVER_MODE: "TCC"
GPU_TYPE: "a100"
run: |
ci/tools/install_gpu_driver.ps1
ci/tools/configure_driver_mode.ps1

- name: Ensure GPU is working
run: |
Expand Down
34 changes: 27 additions & 7 deletions .github/workflows/test-wheel-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,13 @@ jobs:
# Read base matrix from YAML file for the specific architecture
TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)

# Apply matrix filter and wrap in include structure
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
# Apply the matrix filter, then, in order:
#   1. reject any row that combines a custom DRIVER with FLAVOR=wsl
#      (the in-container driver swap doesn't work under WSL);
#   2. add a RUNNER_DRIVER field that maps any custom version back to
#      'latest', so the job lands on the runner that ships the most
#      recent pre-installed driver (the install script swaps the
#      driver itself);
#   3. wrap the result in an include structure (an empty matrix is an error).
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')

echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"

Expand All @@ -101,21 +106,21 @@ jobs:
strategy:
fail-fast: false
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
# The build stage could fail but we want the CI to keep moving.
if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
# Our self-hosted runners require a container
# TODO: use a different (nvidia?) container
container:
options: -u root --security-opt seccomp=unconfined --shm-size 16g
# Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh
# can nsenter to the host for the install + refresh the toolkit bind mounts
# back inside the container. Stock options for latest/earliest rows.
options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }}
image: ubuntu:22.04
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
PIP_CACHE_DIR: "/tmp/pip-cache"
steps:
- name: Ensure GPU is working
run: nvidia-smi

- name: Checkout ${{ github.event.repository.name }}
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3

Expand All @@ -131,6 +136,21 @@ jobs:
dependencies: "jq wget libgl1 libegl1 g++"
dependent_exes: "jq wget"

- name: Install GPU driver
if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
env:
DRIVER: ${{ matrix.DRIVER }}
GPU_TYPE: ${{ matrix.GPU }}
run: |
# util-linux for nsenter; install_gpu_driver.sh re-execs onto the
# host (requires --privileged --pid=host on the container, set
# conditionally above) and refreshes the toolkit bind mounts here.
apt-get -y install --no-install-recommends util-linux
./ci/tools/install_gpu_driver.sh

- name: Ensure GPU is working
run: nvidia-smi

- name: Set environment variables
env:
BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }}
Expand Down
20 changes: 15 additions & 5 deletions .github/workflows/test-wheel-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,11 @@ jobs:
# Read base matrix from YAML file for the specific architecture
TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)

# Apply matrix filter and wrap in include structure
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
# Apply matrix filter; add a RUNNER_DRIVER field that maps any
# custom DRIVER version back to 'latest' (install_gpu_driver.ps1
# swaps the driver itself, so the runner must be the one that
# ships the most recent pre-installed driver); wrap in include.
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')

echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"

Expand All @@ -95,7 +98,7 @@ jobs:
fail-fast: false
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
steps:
- name: Checkout ${{ github.event.repository.name }}
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
Expand All @@ -106,13 +109,20 @@ jobs:
with:
enable-apt: true

- name: Update driver
- name: Install GPU driver
if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
env:
DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
DRIVER: ${{ matrix.DRIVER }}
GPU_TYPE: ${{ matrix.GPU }}
run: |
ci/tools/install_gpu_driver.ps1

- name: Configure driver mode
env:
DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
run: |
ci/tools/configure_driver_mode.ps1

- name: Ensure GPU is working
run: |
nvidia-smi
Expand Down
17 changes: 13 additions & 4 deletions ci/test-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,16 @@
# Windows entries also include DRIVER_MODE.
#
# Notes:
# - DRIVER accepts:
# * 'latest' - use the runner's pre-installed latest driver (no install step)
# * 'earliest' - use the runner's pre-installed earliest driver (no install step)
# * a version string (e.g. '580.65.06')
# - install that version via ci/tools/install_gpu_driver.sh (Linux)
# or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the
# job. The matrix row is routed to the 'latest' runner image (the
# install scripts swap the driver themselves).
# - DRIVER: 'earliest' does not work with CUDA 12.9.1
# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux.

linux:
pull-request:
Expand All @@ -29,10 +38,10 @@ linux:
- { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.43.02' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
Expand Down Expand Up @@ -74,7 +83,7 @@ linux:
- { MODE: 'nightly-pytorch', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' }
# nightly-numba-cuda
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' }
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

- { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
# nightly-standard (arm64 l4×2 — nightly-only per runner team request)
Expand Down Expand Up @@ -113,4 +122,4 @@ windows:
- { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' }
# nightly-numba-cuda
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC' }
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

45 changes: 45 additions & 0 deletions ci/tools/configure_driver_mode.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI
# runner and cycle the display devices so the new mode takes effect
# without rebooting. Always runs (whether or not install_gpu_driver.ps1
# just ran). When install_gpu_driver.ps1 has run, this single device
# cycle also activates the freshly-installed driver.
#
# Inputs (env):
# DRIVER_MODE One of WDDM, TCC, MCDM.

function Set-DriverMode {

# Map matrix DRIVER_MODE to nvidia-smi -fdm code.
# This assumes prior knowledge of which GPU supports which driver mode.
$driver_mode = $env:DRIVER_MODE
if ($driver_mode -eq "WDDM") {
Write-Output "Setting driver mode to WDDM..."
nvidia-smi -fdm 0
} elseif ($driver_mode -eq "TCC") {
Write-Output "Setting driver mode to TCC..."
nvidia-smi -fdm 1
} elseif ($driver_mode -eq "MCDM") {
Write-Output "Setting driver mode to MCDM..."
nvidia-smi -fdm 2
} else {
Write-Output "Unknown driver mode: $driver_mode"
exit 1
}

# Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
$nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
foreach ($device in $nvidia_devices) {
Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
pnputil /disable-device "$($device.InstanceId)"
pnputil /enable-device "$($device.InstanceId)"
}
Comment on lines +34 to +39
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems this function now cycles display devices on every Windows job, even when no driver was installed and nvidia-smi -fdm reports the mode was already correct (e.g. Driver model is already set to MCDM for GPU 00000000:0A:00.0. in the log). Is that intentional? In the h100/MCDM CI failure, NVML becomes unavailable right after the pnputil cycle here, so I’m wondering if the device restart should only happen after an install or an actual mode change.

# Give the devices a few seconds to settle:
Start-Sleep -Seconds 5
}

# Run the functions
Set-DriverMode
51 changes: 21 additions & 30 deletions ci/tools/install_gpu_driver.ps1
Original file line number Diff line number Diff line change
@@ -1,13 +1,30 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a
# Windows CI runner. Driver-mode selection and the post-install device
# power-cycle are the responsibility of configure_driver_mode.ps1, which
# the workflow runs immediately after this script (or by itself when
# DRIVER is 'latest'/'earliest' and the runner already brings up the
# right driver).
#
# Inputs (env):
# DRIVER Driver version, e.g. "610.47". Must NOT be 'latest' or
# 'earliest' -- those are runner-pre-installed and the
# workflow is expected to skip this script for them.
# GPU_TYPE Lower-case GPU label from the matrix (e.g. "l4", "rtx4090").
# Selects the data-center vs desktop installer variant.

# Install the driver
function Install-Driver {

# Set the correct URL, filename, and arguments to the installer
# This driver is picked to support Windows 11 & CUDA 13.0
$version = '581.15'
# Driver version is plumbed from the matrix via the DRIVER env var.
$version = $env:DRIVER
if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') {
Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'."
exit 1
}

# Get GPU type from environment variable
$gpu_type = $env:GPU_TYPE
Expand Down Expand Up @@ -54,33 +71,7 @@ function Install-Driver {
# Install the file with the specified path from earlier
Write-Output 'Running the driver installer...'
Start-Process -FilePath $filepath -ArgumentList $install_args -Wait
Write-Output 'Done!'

# Handle driver mode configuration
# This assumes we have the prior knowledge on which GPU can use which mode.
$driver_mode = $env:DRIVER_MODE
if ($driver_mode -eq "WDDM") {
Write-Output "Setting driver mode to WDDM..."
nvidia-smi -fdm 0
} elseif ($driver_mode -eq "TCC") {
Write-Output "Setting driver mode to TCC..."
nvidia-smi -fdm 1
} elseif ($driver_mode -eq "MCDM") {
Write-Output "Setting driver mode to MCDM..."
nvidia-smi -fdm 2
} else {
Write-Output "Unknown driver mode: $driver_mode"
exit 1
}
# Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
$nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
foreach ($device in $nvidia_devices) {
Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
pnputil /disable-device "$($device.InstanceId)"
pnputil /enable-device "$($device.InstanceId)"
}
# Give it a minute to settle:
Start-Sleep -Seconds 5
Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.'
}

# Run the functions
Expand Down
Loading
Loading