Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 254 additions & 0 deletions .github/workflows/gpu-e2e.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
name: gpu-e2e

# NVIDIA self-hosted runners refuse workflows triggered by `pull_request`
# events from forks (policy). copy-pr-bot mirrors fork PR branches into
# this repo under `pull-request/<N>`; we trigger on the resulting push.
#
# Trigger matrix:
#   - schedule           daily smoke against main
#   - push               main (post-merge) and pull-request/<N> (bot-mirror),
#                        path-filtered so doc-only changes skip the workflow
#   - workflow_dispatch  manual
on:
  schedule:
    # Once a day at 06:00 (GitHub evaluates cron expressions in UTC).
    - cron: '0 6 * * *'
  push:
    branches:
      - main
      - 'pull-request/[0-9]+'
    # Only changes that can affect the build or the e2e scenarios trigger a run.
    paths:
      - '.github/workflows/gpu-e2e.yaml'
      - 'hack/ci/**'
      - 'cmd/**'
      - 'pkg/**'
      - 'examples/**'
      - 'Makefile'
      - 'go.mod'
      - 'go.sum'
      - 'vendor/modules.txt'
  workflow_dispatch: {}

# The workflow token only needs to read the repository contents.
permissions:
  contents: read

jobs:

  # One GPU end-to-end job per matrix entry (architecture / runner / GPU model).
  e2e:
    name: e2e-${{ matrix.arch }}-${{ matrix.gpu }}
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 45
    strategy:
      # Let the other architecture finish even when one of them fails.
      fail-fast: false
      matrix:
        include:
          - arch: amd64
            gpu: t4
            runner: linux-amd64-gpu-t4-latest-1
            run-dra: true
          - arch: arm64
            gpu: l4
            runner: linux-arm64-gpu-l4-latest-1
            run-dra: false
    # One concurrency group per trigger, ref and architecture. In-flight runs
    # are cancelled only for refs under refs/heads/pull-request/.
    concurrency:
      group: gpu-e2e-${{ github.event_name }}-${{ github.ref }}-${{ matrix.arch }}
      cancel-in-progress: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
    env:
      # Pinned tool, node image and chart versions used by the steps below.
      KIND_VERSION: v0.31.0
      KUBECTL_VERSION: v1.35.1
      HELM_VERSION: v3.18.1
      KIND_NODE_IMAGE: kindest/node:v1.35.1
      GPU_OPERATOR_VERSION: v26.3.1
      DRA_CHART_VERSION: "25.12.0"
      # Every cluster created by this job is named "<CLUSTER_PREFIX>-<scenario>".
      CLUSTER_PREFIX: nv-${{ github.run_id }}-${{ matrix.arch }}
    steps:
# Check out the repository without leaving the token in the local git config.
- uses: actions/checkout@v4
  with:
    persist-credentials: false

# Install the Go version declared in go.mod; the cache is keyed on the
# dependency files listed under cache-dependency-path.
- uses: actions/setup-go@v5
  with:
    cache: true
    cache-dependency-path: |
      go.sum
      vendor/modules.txt
    go-version-file: go.mod

- name: Verify host GPU
  run: |
    # List the GPUs visible to the driver on the runner.
    nvidia-smi -L
    # The NVIDIA control device must exist as a character device.
    [ -c /dev/nvidiactl ]

- name: Configure docker for GPU + CDI
  run: |
    # Register the NVIDIA runtime with docker, make it the default, enable CDI.
    sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
    # Allow GPUs to be requested through volume mounts under
    # /var/run/nvidia-container-devices.
    sudo nvidia-ctk config --set \
      accept-nvidia-visible-devices-as-volume-mounts=true --in-place
    sudo systemctl restart docker
    # Raise the inotify limits on the host.
    sudo sysctl -w fs.inotify.max_user_watches=524288
    sudo sysctl -w fs.inotify.max_user_instances=8192
    # Smoke test: a plain ubuntu container that requests all GPUs through the
    # volume-mount convention must be able to run nvidia-smi.
    docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all \
      ubuntu:22.04 nvidia-smi -L

- name: Install kind / kubectl / helm
  run: |
    # -f makes curl exit non-zero on an HTTP error (e.g. 404 for a bad
    # version/arch) instead of saving the error page and letting it be
    # installed as the binary.
    curl -fsSLo /tmp/kind \
      "https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-${{ matrix.arch }}"
    sudo install -m0755 /tmp/kind /usr/local/bin/kind
    curl -fsSLo /tmp/kubectl \
      "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${{ matrix.arch }}/kubectl"
    sudo install -m0755 /tmp/kubectl /usr/local/bin/kubectl
    # Extract only the helm binary from the release tarball into /usr/local/bin.
    curl -fsSL "https://get.helm.sh/helm-${HELM_VERSION}-linux-${{ matrix.arch }}.tar.gz" \
      | sudo tar xz -C /usr/local/bin --strip-components=1 "linux-${{ matrix.arch }}/helm"

- name: Build nvkind
  run: |
    # Build from the checked-out tree and put the binary on PATH.
    make build
    sudo install -m0755 ./nvkind /usr/local/bin/nvkind
    # Sanity check: the installed binary runs and knows the `cluster` command.
    nvkind cluster --help >/dev/null

# `nvkind cluster create` can exit non-zero with benign umount errors
# in CDI mode; `kubectl wait` below handles the subsequent kubelet-
# registration race. Pattern matches aicr's gpu-cluster-setup.

- name: S1 default cluster lifecycle
  env:
    CLUSTER: ${{ env.CLUSTER_PREFIX }}-default
  run: |
    set -x
    ctx="kind-$CLUSTER"
    # A non-zero exit from create is tolerated; node readiness is the gate.
    nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" || true
    kubectl --context "$ctx" wait --for=condition=Ready node --all --timeout=180s
    # The default cluster is expected to have exactly two nodes.
    node_count=$(kubectl --context "$ctx" get nodes --no-headers | wc -l)
    if ! [ "$node_count" -eq 2 ]; then
      echo "expected 2 nodes, got $node_count"
      exit 1
    fi
    # The `nvidia` RuntimeClass must exist in the new cluster.
    kubectl --context "$ctx" get runtimeclass nvidia > /dev/null
    # `nvkind cluster print-gpus` must report exactly the GPU UUIDs that
    # `nvidia-smi` reports on the host. This catches template / GPU-inject
    # regressions that a bare `grep -q gpu` would let through.
    host_uuids=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | sort)
    kind_uuids=$(nvkind cluster print-gpus --name "$CLUSTER" \
      | jq -r '[.[].gpus[].UUID] | sort | .[]')
    if ! [ "$host_uuids" = "$kind_uuids" ]; then
      echo "GPU UUID mismatch"
      echo "host: $host_uuids"
      echo "kind: $kind_uuids"
      exit 1
    fi
    kind delete cluster --name "$CLUSTER"

- name: S2 GPU Operator + nvidia-smi pod
  env:
    CLUSTER: ${{ env.CLUSTER_PREFIX }}-dp
  run: |
    set -x
    ctx="kind-$CLUSTER"
    nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" || true
    kubectl --context "$ctx" wait --for=condition=Ready node --all --timeout=180s
    # GPU Operator in minimal mode, mirroring aicr's proven path: NFD labels
    # the GPU node and the Operator ships its own preconfigured device-plugin
    # daemonset. Driver, toolkit and DCGM exporter are disabled because they
    # live on the host.
    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia > /dev/null
    helm repo update > /dev/null
    helm --kube-context "$ctx" upgrade -i gpu-operator \
      nvidia/gpu-operator --version "$GPU_OPERATOR_VERSION" \
      -n gpu-operator --create-namespace \
      --set driver.enabled=false --set toolkit.enabled=false \
      --set dcgmExporter.enabled=false --set nfd.enabled=true \
      --wait --timeout=600s
    kubectl --context "$ctx" -n gpu-operator rollout status \
      daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
    # Poll (60 tries, 2s apart) until at least one node advertises a
    # non-zero nvidia.com/gpu capacity.
    for _ in $(seq 1 60); do
      gpu_nodes=$(kubectl --context "$ctx" get nodes \
        -o jsonpath='{.items[*].status.capacity.nvidia\.com/gpu}' \
        | tr ' ' '\n' | grep -cvx 0 || true)
      if [ "${gpu_nodes:-0}" -ge 1 ]; then
        break
      fi
      sleep 2
    done
    if ! [ "${gpu_nodes:-0}" -ge 1 ]; then
      echo "no nvidia.com/gpu capacity advertised"
      exit 1
    fi
    # Run the nvidia-smi pod to completion and check its output.
    kubectl --context "$ctx" apply -f hack/ci/smi-pod.yaml
    kubectl --context "$ctx" wait \
      --for=jsonpath='{.status.phase}'=Succeeded pod/smi --timeout=240s
    kubectl --context "$ctx" logs smi | grep -q NVIDIA-SMI
    kind delete cluster --name "$CLUSTER"

- name: S3 DRA driver + resource claim
  if: matrix.run-dra == true
  env:
    CLUSTER: ${{ env.CLUSTER_PREFIX }}-dra
  run: |
    set -x
    ctx="kind-$CLUSTER"
    nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" \
      --config-template hack/ci/templates/dra.yaml.tmpl || true
    kubectl --context "$ctx" wait --for=condition=Ready node --all --timeout=180s
    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia > /dev/null
    helm repo update > /dev/null
    helm --kube-context "$ctx" upgrade -i dra \
      nvidia/nvidia-dra-driver-gpu --version "$DRA_CHART_VERSION" \
      -n nvidia-dra-driver-gpu --create-namespace \
      --set nvidiaDriverRoot=/ --set gpuResourcesEnabledOverride=true \
      --wait --timeout=300s
    # Poll (60 tries, 5s apart) until at least one ResourceSlice is listed.
    for _ in $(seq 1 60); do
      slices=$(kubectl --context "$ctx" get resourceslices \
        --no-headers 2>/dev/null | wc -l)
      if [ "${slices:-0}" -ge 1 ]; then
        break
      fi
      sleep 5
    done
    if ! [ "${slices:-0}" -ge 1 ]; then
      echo "no ResourceSlice published"
      exit 1
    fi
    kubectl --context "$ctx" apply -f hack/ci/dra-pod.yaml
    kubectl --context "$ctx" wait \
      --for=jsonpath='{.status.phase}'=Succeeded pod/dra-smi --timeout=240s
    # The pod must have seen exactly one GPU, which rules out "all host GPUs
    # leaked into the pod". On a single-GPU runner this is only a lower-bound
    # check; multi-GPU isolation coverage is a follow-up scenario once such a
    # runner class exists.
    pod_log=$(kubectl --context "$ctx" logs dra-smi)
    gpu_lines=$(echo "$pod_log" | grep -c '^GPU [0-9]\+:' || true)
    if ! [ "$gpu_lines" = "1" ]; then
      echo "expected exactly 1 GPU in dra-smi logs, got $gpu_lines"
      echo "$pod_log"
      exit 1
    fi
    # DRA must actually have engaged for this pod. A ResourceClaim created
    # from a template is pod-scoped: it is deallocated and garbage-collected
    # once the pod reaches Succeeded, so `kubectl get resourceclaim` races
    # that GC. The pod's `status.resourceClaimStatuses` is set by the
    # ResourceClaim controller when the claim is created and survives pod
    # completion, which makes it the reliable signal. If DRA is bypassed
    # (gate off, controller not running, etc.) the field stays empty even
    # though the pod can still succeed.
    claim_name=$(kubectl --context "$ctx" get pod dra-smi \
      -o jsonpath='{.status.resourceClaimStatuses[?(@.name=="gpu")].resourceClaimName}')
    if ! [ -n "$claim_name" ]; then
      echo "pod has no status.resourceClaimStatuses[name=gpu] — DRA did not engage"
      kubectl --context "$ctx" get pod dra-smi -o yaml
      exit 1
    fi
    kind delete cluster --name "$CLUSTER"

- name: Collect artifacts
  if: always()
  run: |
    D=/tmp/nvkind-artifacts
    mkdir -p "$D"
    # Only look at clusters created by this job (same prefix filter as the
    # Teardown step), so clusters that other jobs left on the same host are
    # not exported into this job's artifact. Scenarios delete their cluster
    # on success, so whatever matches here belongs to a failed scenario.
    for c in $(kind get clusters 2>/dev/null | grep "^${CLUSTER_PREFIX}-" || true); do
      # Best effort: a broken cluster must not prevent the remaining dumps.
      kind export logs "$D/kind-$c" --name "$c" || true
      kubectl --context "kind-$c" get pods -A -o wide > "$D/pods-$c.txt" || true
      kubectl --context "kind-$c" get events -A \
        --sort-by=.lastTimestamp > "$D/events-$c.txt" || true
    done
    # Host-side docker and NVIDIA container runtime configuration, if present.
    sudo cat /etc/docker/daemon.json > "$D/docker-daemon.json" 2>/dev/null || true
    sudo cat /etc/nvidia-container-runtime/config.toml \
      > "$D/nvidia-ctk.toml" 2>/dev/null || true

# Publish the directory filled by the "Collect artifacts" step, whether or not
# the scenarios passed. The artifact name is unique per architecture and run.
- if: always()
  uses: actions/upload-artifact@v4
  with:
    name: nvkind-e2e-${{ matrix.arch }}-${{ github.run_id }}
    path: /tmp/nvkind-artifacts
    retention-days: 7

- name: Teardown
  if: always()
  run: |
    # Delete every kind cluster whose name starts with this job's prefix;
    # failures are ignored so one stuck cluster does not block the rest.
    for cluster in $(kind get clusters 2>/dev/null | grep "^${CLUSTER_PREFIX}-" || true); do
      kind delete cluster --name "$cluster" || true
    done
    # Best-effort cleanup of unused docker resources on the runner.
    docker system prune -f || true
30 changes: 30 additions & 0 deletions hack/ci/dra-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Template from which the pod below gets its own ResourceClaim: one device of
# class gpu.nvidia.com.
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: rct-gpu
spec:
  spec:
    devices:
      requests:
        - name: gpu
          exactly:
            deviceClassName: gpu.nvidia.com
---
# Pod that lists the GPUs it can see, using the claim generated from rct-gpu.
apiVersion: v1
kind: Pod
metadata:
  name: dra-smi
spec:
  restartPolicy: OnFailure
  resourceClaims:
    - name: gpu
      resourceClaimTemplateName: rct-gpu
  containers:
    - name: smi
      image: nvidia/cuda:12.5.0-devel-ubuntu22.04
      # `-L` prints one line per GPU (`GPU N: ... (UUID: GPU-...)`), which
      # lets the workflow assert that the pod sees exactly one GPU.
      command:
        - nvidia-smi
        - "-L"
      resources:
        claims:
          - name: gpu
13 changes: 13 additions & 0 deletions hack/ci/smi-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Pod that runs nvidia-smi once with a single nvidia.com/gpu resource limit.
apiVersion: v1
kind: Pod
metadata:
  name: smi
spec:
  restartPolicy: OnFailure
  containers:
    - name: smi
      image: nvidia/cuda:12.5.0-devel-ubuntu22.04
      command:
        - nvidia-smi
      resources:
        limits:
          nvidia.com/gpu: 1
45 changes: 45 additions & 0 deletions hack/ci/templates/dra.yaml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# kind cluster config template for the DRA scenario, rendered by nvkind.
# `numGPUs` is provided automatically based on the host GPU count.
# The template enables the DynamicResourceAllocation feature gate on the
# control-plane components and on the kubelet of both nodes, and turns on CDI
# in containerd.
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
featureGates:
  DynamicResourceAllocation: true
containerdConfigPatches:
  - |-
    [plugins."io.containerd.grpc.v1.cri"]
      enable_cdi = true
nodes:
  - role: control-plane
    kubeadmConfigPatches:
      # Feature gate for the API server, controller manager and scheduler.
      - |
        kind: ClusterConfiguration
        apiServer:
          extraArgs:
            feature-gates: "DynamicResourceAllocation=true"
        controllerManager:
          extraArgs:
            feature-gates: "DynamicResourceAllocation=true"
        scheduler:
          extraArgs:
            feature-gates: "DynamicResourceAllocation=true"
      # Feature gate for the control-plane node's kubelet.
      - |
        kind: InitConfiguration
        nodeRegistration:
          kubeletExtraArgs:
            feature-gates: "DynamicResourceAllocation=true"
  - role: worker
    labels:
      nvidia.com/gpu.present: "true"
    kubeadmConfigPatches:
      # Feature gate for the worker node's kubelet.
      - |
        kind: JoinConfiguration
        nodeRegistration:
          kubeletExtraArgs:
            feature-gates: "DynamicResourceAllocation=true"
    # One /dev/null mount per GPU index (0 .. numGPUs-1) under
    # /var/run/nvidia-container-devices on the worker node.
    extraMounts:
    {{- range $gpu := until numGPUs }}
      - hostPath: /dev/null
        containerPath: /var/run/nvidia-container-devices/{{ $gpu }}
    {{- end }}
Loading