Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions .github/workflows/ci-gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

# GPU tests live in a separate workflow because NVIDIA self-hosted runners
# block pull_request events entirely. Keeping them here avoids a confusing
# "Skipped" entry with unresolved matrix names on every PR.

name: CI / GPU

# Triggers: manual dispatch, pushes to `main` or to branches matching
# "pull-request/<digits>", and merge-queue `checks_requested` events.
# There is deliberately no `pull_request` trigger in this workflow.
on:
  workflow_dispatch:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
  merge_group:
    types: [checks_requested]

# One concurrency group per workflow + ref; a newer run on the same ref
# cancels the run that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# pip settings shared by every step (values are strings on purpose).
env:
  PIP_NO_CACHE_DIR: "1"
  PIP_DISABLE_PIP_VERSION_CHECK: "1"
  PIP_PREFER_BINARY: "1"

jobs:
  # One GPU job per Python version: install the interpreter, run the test
  # script in train mode with GPU required, then run the smoke script.
  gpu-tests:
    name: "gpu / py${{ matrix.python-version }}"
    runs-on: linux-amd64-gpu-rtxpro6000-latest-1
    timeout-minutes: 45
    strategy:
      # Let the remaining Python versions finish even if one fails.
      fail-fast: false
      matrix:
        python-version:
          - "3.11"
          - "3.12"
          - "3.13"
    container:
      image: ubuntu:22.04
      options: -u root --security-opt seccomp=unconfined --shm-size 16g
      env:
        # Forward the runner-provided device selection into the container.
        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
    steps:
      - name: Setup proxy cache
        uses: nv-gha-runners/setup-proxy-cache@main
        with:
          enable-apt: true

      # The matrix interpreter (plus venv and dev packages) is installed
      # from apt after the deadsnakes PPA has been added.
      - name: Install system dependencies
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt-get update
          apt-get install -y git git-lfs gcc software-properties-common
          add-apt-repository -y ppa:deadsnakes/ppa
          apt-get update
          apt-get install -y \
            python${{ matrix.python-version }} \
            python${{ matrix.python-version }}-venv \
            python${{ matrix.python-version }}-dev
          git lfs install

      # git and git-lfs are installed above, before the LFS checkout.
      - uses: actions/checkout@v4
        with:
          lfs: true

      - name: Verify GPU
        run: nvidia-smi

      - name: Install dependencies and run tests
        env:
          PYTHON_BIN: python${{ matrix.python-version }}
          MODE: train
          SKIP_TESTS: "0"
          REQUIRE_GPU: "1"
        run: bash code/scripts/check_python_compat.sh

      # NOTE(review): presumably the previous step's script creates
      # .venv_train_<python-version>; confirm in check_python_compat.sh.
      - name: Smoke training + inference
        shell: bash
        env:
          EXPERIMENT_NAME: ci_smoke
          PREDECODER_TRAIN_SAMPLES: "4096"
          PREDECODER_VAL_SAMPLES: "512"
          PREDECODER_TEST_SAMPLES: "512"
          PREDECODER_TRAIN_EPOCHS: "1"
        run: |
          source .venv_train_${{ matrix.python-version }}/bin/activate
          bash code/scripts/smoke_run.sh
135 changes: 16 additions & 119 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,9 @@ env:
PIP_DISABLE_PIP_VERSION_CHECK: "1"
PIP_PREFER_BINARY: "1"

# ---------------------------------------------------------------------------
# CPU jobs (GitHub-hosted runners)
# ---------------------------------------------------------------------------
jobs:
spdx-header-check:
runs-on: ubuntu-latest
runs-on: linux-amd64-cpu4
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
Expand All @@ -43,24 +40,29 @@ jobs:
- run: python3 code/scripts/spdx_headers.py --check

unit-tests:
runs-on: ubuntu-latest
runs-on: linux-amd64-cpu4
strategy:
fail-fast: false
matrix:
python-version: ["3.11", "3.12", "3.13"]
name: "unit-tests / py${{ matrix.python-version }}"
steps:
- uses: actions/checkout@v4
with:
lfs: true
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install -r code/requirements_public_inference.txt \
--extra-index-url https://download.pytorch.org/whl/cpu
- name: Run tests
run: PYTHONPATH=code python -m unittest discover -s code/tests -p "test_*.py"
python-version: ${{ matrix.python-version }}
- name: Install dependencies and run tests
run: bash code/scripts/check_python_compat.sh
env:
PYTHON_BIN: python
MODE: inference
SKIP_TESTS: "0"
PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"

unit-tests-coverage:
runs-on: ubuntu-latest
runs-on: linux-amd64-cpu4
steps:
- uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -88,108 +90,3 @@ jobs:
path: |
htmlcov/
coverage.xml

# Runs the compatibility script for every Python version x mode pair,
# with tests skipped (SKIP_TESTS is "1").
python-compat:
  name: "compat / py${{ matrix.python-version }} / ${{ matrix.mode }}"
  runs-on: ubuntu-latest
  strategy:
    # Keep the other matrix cells running when one of them fails.
    fail-fast: false
    matrix:
      python-version:
        - "3.11"
        - "3.12"
        - "3.13"
      mode:
        - inference
        - train
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Check Python compatibility
      env:
        MODE: ${{ matrix.mode }}
        PYTHON_BIN: python
        SKIP_TESTS: "1"
        # Extra pip index pointing at the PyTorch CPU wheel repository.
        PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"
      run: bash code/scripts/check_python_compat.sh

# ---------------------------------------------------------------------------
# GPU jobs (self-hosted NVIDIA runners)
# ---------------------------------------------------------------------------
# Runs unittest discovery over code/tests inside an ubuntu:22.04 container
# on the GPU runner, after asserting that torch can see a CUDA device.
gpu-tests:
  runs-on: linux-amd64-gpu-rtxpro6000-latest-1
  timeout-minutes: 30
  container:
    image: ubuntu:22.04
    options: -u root --security-opt seccomp=unconfined --shm-size 16g
    env:
      # Forward the runner-provided device selection into the container.
      NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
  steps:
    - name: Setup proxy cache
      uses: nv-gha-runners/setup-proxy-cache@main
      with:
        enable-apt: true

    # git and git-lfs are installed here, before the LFS checkout below.
    - name: Install system dependencies
      run: |
        apt-get update
        apt-get install -y git git-lfs python3 python3-pip python3-venv
        git lfs install

    - uses: actions/checkout@v4
      with:
        lfs: true

    - name: Install Python dependencies
      run: |
        python3 -m pip install --upgrade pip setuptools wheel
        pip install -r code/requirements_public_inference.txt

    # Fails the job early if the driver or the CUDA runtime is not visible.
    - name: Verify GPU
      run: |
        nvidia-smi
        python3 -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'; print(torch.cuda.get_device_name(0))"

    - name: Run full test suite (CPU + GPU)
      run: PYTHONPATH=code python3 -m unittest discover -s code/tests -p "test_*.py"

# Short training + inference smoke run on the GPU runner. It is gated on
# the `gpu-tests` job via `needs`, and it installs the train requirements.
smoke-test-gpu:
  needs: gpu-tests
  runs-on: linux-amd64-gpu-rtxpro6000-latest-1
  timeout-minutes: 30
  container:
    image: ubuntu:22.04
    options: -u root --security-opt seccomp=unconfined --shm-size 16g
    env:
      # Forward the runner-provided device selection into the container.
      NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
  steps:
    - name: Setup proxy cache
      uses: nv-gha-runners/setup-proxy-cache@main
      with:
        enable-apt: true

    # git and git-lfs are installed here, before the LFS checkout below.
    - name: Install system dependencies
      run: |
        apt-get update
        apt-get install -y git git-lfs python3 python3-pip python3-venv
        git lfs install

    - uses: actions/checkout@v4
      with:
        lfs: true

    - name: Install Python dependencies
      run: |
        python3 -m pip install --upgrade pip setuptools wheel
        pip install -r code/requirements_public_train.txt

    # Fails the job early if the driver or the CUDA runtime is not visible.
    - name: Verify GPU
      run: |
        nvidia-smi
        python3 -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'"

    # Sample counts and the epoch count are passed as string env values.
    - name: Smoke training + inference
      env:
        EXPERIMENT_NAME: ci_smoke
        PREDECODER_TRAIN_SAMPLES: "4096"
        PREDECODER_VAL_SAMPLES: "512"
        PREDECODER_TEST_SAMPLES: "512"
        PREDECODER_TRAIN_EPOCHS: "1"
      run: bash code/scripts/smoke_run.sh