From 11822cb67a23c3a40964df9b41763c9cbd34db38 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 20 May 2026 17:32:30 -0700 Subject: [PATCH 1/2] ci: trim PR matrix to PY3.12 + CUDA 12.9.1 only (debug only) Temporary debugging change: cut the linux and windows pull-request matrices down to a single Python 3.12 + CUDA 12.9.1 row each so the TestIpcReexport hang on r595-driver runners reproduces quickly without also burning compute on unrelated configurations. The nightly matrices are intentionally untouched. Revert before merging. --- ci/test-matrix.yml | 66 ++++------------------------------------------ 1 file changed, 5 insertions(+), 61 deletions(-) diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 35f02847ed7..a29f78e8a4e 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -17,51 +17,12 @@ linux: pull-request: + # TEMPORARY: cut down to just Python 3.12 + CUDA 12.9.1 to debug the + # IPC re-export hang on r595 driver runners. Revert before merging. # linux-64 - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } # linux-aarch64 - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - # special runners - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' } nightly: # nightly-pytorch - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.11.0', TORCH_CUDA: 'cu126' } @@ -83,28 +44,11 @@ linux: windows: pull-request: + # TEMPORARY: cut down to just Python 3.12 + CUDA 12.9.1 to keep the + # matrix non-empty while we debug the Linux IPC re-export hang. + # Revert before merging. # win-64 - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtx2080', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - # special runners - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } nightly: # nightly-pytorch - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.11.0', TORCH_CUDA: 'cu126' } From 672d2287a457e2054b81d3cbae8c2de7ca21dedc Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 20 May 2026 18:23:34 -0700 Subject: [PATCH 2/2] ci: force-disable compute-sanitizer on PR matrix (debug only) Wire the existing SETUP_SANITIZER gate behind a literal `false &&` so every PR test job exports SETUP_SANITIZER=0 and SANITIZER_CMD=, which makes run-tests invoke pytest directly instead of under compute-sanitizer. Pairs with the matrix trim in this branch to test the hypothesis that the TestIpcReexport hang on r595-driver runners is specific to running pytest under compute-sanitizer. Revert before merging. --- ci/tools/env-vars | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 30fac1cdce8..19cc80ec700 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -88,7 +88,10 @@ elif [[ "${1}" == "test" ]]; then # We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix # Only local ctk installs have compute-sanitizer; there is no wheel for it - if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then + # TEMPORARY: force-disable compute-sanitizer to test whether it is the + # cause of the TestIpcReexport hang on r595-driver runners. Revert before + # merging. + if false && [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then echo "LATEST_CUDA_VERSION=$(bash ci/tools/guess_latest.sh $TEST_CUDA_MAJOR)" >> $GITHUB_ENV SETUP_SANITIZER=1 else