diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 35f02847ed7..a29f78e8a4e 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -17,51 +17,12 @@ linux: pull-request: + # TEMPORARY: cut down to just Python 3.12 + CUDA 12.9.1 to debug the + # IPC re-export hang on r595 driver runners. Revert before merging. # linux-64 - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } # linux-aarch64 - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - # special runners - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' } nightly: # nightly-pytorch - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.11.0', TORCH_CUDA: 'cu126' } @@ -83,28 +44,11 @@ linux: windows: pull-request: + # TEMPORARY: cut down to just Python 3.12 + CUDA 12.9.1 to keep the + # matrix non-empty while we debug the Linux IPC re-export hang. + # Revert before merging. # win-64 - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtx2080', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - # special runners - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } nightly: # nightly-pytorch - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.11.0', TORCH_CUDA: 'cu126' } diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 30fac1cdce8..19cc80ec700 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -88,7 +88,10 @@ elif [[ "${1}" == "test" ]]; then # We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix # Only local ctk installs have compute-sanitizer; there is no wheel for it - if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then + # TEMPORARY: force-disable compute-sanitizer to test whether it is the + # cause of the TestIpcReexport hang on r595-driver runners. Revert before + # merging. + if false && [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then echo "LATEST_CUDA_VERSION=$(bash ci/tools/guess_latest.sh $TEST_CUDA_MAJOR)" >> $GITHUB_ENV SETUP_SANITIZER=1 else