NVIDIA · dims · Apr 20, 2026 · Apr 20, 2026
diff --git a/.github/workflows/gpu-e2e.yaml b/.github/workflows/gpu-e2e.yaml
@@ -0,0 +1,254 @@
+name: gpu-e2e  # nvkind end-to-end scenarios on GPU runners
+
+# NVIDIA self-hosted runners refuse workflows triggered by `pull_request`
+# events from forks (policy). copy-pr-bot mirrors fork PR branches into
+# this repo under `pull-request/<N>`; we trigger on the resulting push.
+#
+# Trigger matrix:
+#   - schedule          daily smoke against main
+#   - push              main (post-merge) and pull-request/<N> (bot-mirror),
+#                       path-filtered so doc-only changes skip the workflow
+#   - workflow_dispatch manual
+on:
+  schedule:
+  - cron: '0 6 * * *'  # once a day at 06:00 (GitHub evaluates cron in UTC)
+  push:
+    branches:
+    - main
+    - 'pull-request/[0-9]+'  # branches mirrored by copy-pr-bot
+    paths:
+    - .github/workflows/gpu-e2e.yaml
+    - 'hack/ci/**'
+    - 'cmd/**'
+    - 'pkg/**'
+    - 'examples/**'
+    - Makefile
+    - go.mod
+    - go.sum
+    - vendor/modules.txt
+  workflow_dispatch: {}
+
+permissions:
+  contents: read  # the only scope requested for the workflow token
+
+jobs:
+
+  e2e:  # one job instance per matrix entry below
+    concurrency:  # grouped per event, ref and arch; only pull-request/* refs cancel an in-flight run
+      group: gpu-e2e-${{ github.event_name }}-${{ github.ref }}-${{ matrix.arch }}
+      cancel-in-progress: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
+    strategy:
+      fail-fast: false  # a failure on one architecture does not cancel the other
+      matrix:
+        include:
+        - arch: amd64
+          runner: linux-amd64-gpu-t4-latest-1
+          gpu: t4
+          run-dra: true
+        - arch: arm64
+          runner: linux-arm64-gpu-l4-latest-1
+          gpu: l4
+          run-dra: false  # the S3 DRA step is gated on this flag and is skipped here
+    name: e2e-${{ matrix.arch }}-${{ matrix.gpu }}
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 45
+    env:  # tool, node-image and chart versions consumed by the steps below
+      KIND_VERSION: v0.31.0
+      KUBECTL_VERSION: v1.35.1
+      HELM_VERSION: v3.18.1
+      KIND_NODE_IMAGE: kindest/node:v1.35.1
+      GPU_OPERATOR_VERSION: v26.3.1
+      DRA_CHART_VERSION: "25.12.0"
+      CLUSTER_PREFIX: nv-${{ github.run_id }}-${{ matrix.arch }}  # scenarios append -default / -dp / -dra
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        persist-credentials: false  # do not keep the checkout token in the local git config
+
+    - uses: actions/setup-go@v5
+      with:
+        go-version-file: go.mod  # Go toolchain version is taken from go.mod
+        cache: true  # cache keyed on the files listed in cache-dependency-path
+        cache-dependency-path: |
+          go.sum
+          vendor/modules.txt
+
+    - name: Verify host GPU  # fail early when the runner has no usable GPU
+      run: |
+        nvidia-smi --list-gpus  # the driver must enumerate the GPUs
+        [ -c /dev/nvidiactl ]   # the control device node must be a character device
+
+    - name: Configure docker for GPU + CDI  # nvidia becomes docker's default runtime, then a smoke test
+      run: |
+        sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+        sudo nvidia-ctk config --set \
+          accept-nvidia-visible-devices-as-volume-mounts=true --in-place
+        sudo systemctl restart docker  # restart so docker picks up the runtime configuration written above
+        sudo sysctl -w fs.inotify.max_user_watches=524288  # NOTE(review): presumably headroom for kind nodes — confirm
+        sudo sysctl -w fs.inotify.max_user_instances=8192
+        docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all \
+          ubuntu:22.04 nvidia-smi -L  # plain ubuntu image: passes only if the volume-mount GPU request works
+
+    - name: Install kind / kubectl / helm  # curl -f: fail on HTTP errors instead of installing an error page
+      run: |
+        curl -fsSLo /tmp/kind \
+          "https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-${{ matrix.arch }}"
+        sudo install -m0755 /tmp/kind /usr/local/bin/kind
+        curl -fsSLo /tmp/kubectl \
+          "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${{ matrix.arch }}/kubectl"
+        sudo install -m0755 /tmp/kubectl /usr/local/bin/kubectl
+        curl -fsSL "https://get.helm.sh/helm-${HELM_VERSION}-linux-${{ matrix.arch }}.tar.gz" \
+          | sudo tar xz -C /usr/local/bin --strip-components=1 "linux-${{ matrix.arch }}/helm"
+
+    - name: Build nvkind  # build from the checkout, install on PATH, smoke-test the CLI
+      run: |
+        make build
+        sudo install -m 0755 nvkind /usr/local/bin/nvkind
+        nvkind cluster --help >/dev/null
+
+    # `nvkind cluster create` can exit non-zero with benign umount errors in
+    # CDI mode, hence `|| true`; the `kubectl wait` that follows handles the
+    # kubelet-registration race. Pattern matches aicr's gpu-cluster-setup.
+
+    - name: S1 default cluster lifecycle  # create, check nodes + RuntimeClass + GPU UUIDs, delete
+      env:
+        CLUSTER: ${{ env.CLUSTER_PREFIX }}-default
+      run: |
+        set -x  # trace every command in the job log
+        nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" || true  # exit code ignored on purpose
+        kubectl --context "kind-$CLUSTER" wait --for=condition=Ready node --all --timeout=180s
+        nc=$(kubectl --context "kind-$CLUSTER" get nodes --no-headers | wc -l)
+        [ "$nc" -eq 2 ] || { echo "expected 2 nodes, got $nc"; exit 1; }
+        kubectl --context "kind-$CLUSTER" get runtimeclass nvidia > /dev/null  # only the exit status matters
+        # Assert `nvkind cluster print-gpus` reports the exact same GPU UUIDs
+        # as `nvidia-smi` on the host. Catches template / GPU-inject regressions
+        # that would otherwise slip past a bare `grep -q gpu`.
+        host_uuids=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | sort)
+        kind_uuids=$(nvkind cluster print-gpus --name "$CLUSTER" \
+          | jq -r '[.[].gpus[].UUID] | sort | .[]')  # one UUID per line, sorted
+        [ "$host_uuids" = "$kind_uuids" ] || {
+          echo "GPU UUID mismatch"
+          echo "host: $host_uuids"
+          echo "kind: $kind_uuids"
+          exit 1
+        }
+        kind delete cluster --name "$CLUSTER"  # if the step aborts earlier, Teardown removes the cluster
+
+    - name: S2 GPU Operator + nvidia-smi pod  # operator install, capacity check, then the smi pod
+      env:
+        CLUSTER: ${{ env.CLUSTER_PREFIX }}-dp
+      run: |
+        set -x  # trace every command in the job log
+        nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" || true  # exit code ignored on purpose
+        kubectl --context "kind-$CLUSTER" wait --for=condition=Ready node --all --timeout=180s
+        # GPU Operator (minimal mode) mirrors aicr's proven path: NFD labels
+        # the GPU node, the Operator brings its own preconfigured device-plugin
+        # daemonset. Driver/toolkit/DCGM disabled — they live on the host.
+        helm repo add nvidia https://helm.ngc.nvidia.com/nvidia > /dev/null
+        helm repo update > /dev/null
+        helm --kube-context "kind-$CLUSTER" upgrade -i gpu-operator \
+          nvidia/gpu-operator --version "$GPU_OPERATOR_VERSION" \
+          -n gpu-operator --create-namespace \
+          --set driver.enabled=false --set toolkit.enabled=false \
+          --set dcgmExporter.enabled=false --set nfd.enabled=true \
+          --wait --timeout=600s
+        kubectl --context "kind-$CLUSTER" -n gpu-operator rollout status \
+          daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
+        for i in $(seq 1 60); do  # poll up to 60 times, 2s apart
+          c=$(kubectl --context "kind-$CLUSTER" get nodes \
+              -o jsonpath='{.items[*].status.capacity.nvidia\.com/gpu}' \
+              | tr ' ' '\n' | grep -cvx 0 || true)  # count of reported capacities other than exactly "0"
+          [ "${c:-0}" -ge 1 ] && break
+          sleep 2
+        done
+        [ "${c:-0}" -ge 1 ] || { echo "no nvidia.com/gpu capacity advertised"; exit 1; }
+        kubectl --context "kind-$CLUSTER" apply -f hack/ci/smi-pod.yaml
+        kubectl --context "kind-$CLUSTER" wait \
+          --for=jsonpath='{.status.phase}'=Succeeded pod/smi --timeout=240s
+        kubectl --context "kind-$CLUSTER" logs smi | grep -q NVIDIA-SMI  # pod log must contain the string NVIDIA-SMI
+        kind delete cluster --name "$CLUSTER"  # if the step aborts earlier, Teardown removes the cluster
+
+    - name: S3 DRA driver + resource claim  # DRA chart install, ResourceSlice check, claim-backed pod
+      if: matrix.run-dra == true
+      env:
+        CLUSTER: ${{ env.CLUSTER_PREFIX }}-dra
+      run: |
+        set -x  # trace every command in the job log
+        nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" \
+          --config-template hack/ci/templates/dra.yaml.tmpl || true
+        kubectl --context "kind-$CLUSTER" wait --for=condition=Ready node --all --timeout=180s
+        helm repo add nvidia https://helm.ngc.nvidia.com/nvidia > /dev/null
+        helm repo update > /dev/null
+        helm --kube-context "kind-$CLUSTER" upgrade -i dra \
+          nvidia/nvidia-dra-driver-gpu --version "$DRA_CHART_VERSION" \
+          -n nvidia-dra-driver-gpu --create-namespace \
+          --set nvidiaDriverRoot=/ --set gpuResourcesEnabledOverride=true \
+          --wait --timeout=300s
+        for i in $(seq 1 60); do  # poll up to 60 times, 5s apart
+          c=$(kubectl --context "kind-$CLUSTER" get resourceslices \
+              --no-headers 2>/dev/null | wc -l)
+          [ "${c:-0}" -ge 1 ] && break
+          sleep 5
+        done
+        [ "${c:-0}" -ge 1 ] || { echo "no ResourceSlice published"; exit 1; }
+        kubectl --context "kind-$CLUSTER" apply -f hack/ci/dra-pod.yaml
+        kubectl --context "kind-$CLUSTER" wait \
+          --for=jsonpath='{.status.phase}'=Succeeded pod/dra-smi --timeout=240s
+        # Assert the pod saw exactly one GPU (rules out "all host GPUs leaked
+        # into the pod"). On a single-GPU runner a leak would also show one GPU,
+        # so there this only proves the pod got a GPU; multi-GPU isolation
+        # coverage is a follow-up scenario once such a runner class exists.
+        pod_log=$(kubectl --context "kind-$CLUSTER" logs dra-smi)
+        gpu_lines=$(echo "$pod_log" | grep -c '^GPU [0-9]\+:' || true)  # `|| true`: a zero count must reach the check below
+        [ "$gpu_lines" = "1" ] || {
+          echo "expected exactly 1 GPU in dra-smi logs, got $gpu_lines"
+          echo "$pod_log"
+          exit 1
+        }
+        # Assert DRA actually engaged for this pod. The ResourceClaim
+        # created from a template is pod-scoped, so it is deallocated
+        # and garbage-collected once the pod reaches Succeeded —
+        # `kubectl get resourceclaim` races that GC. The pod's
+        # `status.resourceClaimStatuses` is set by the ResourceClaim
+        # controller when the claim is created and survives pod
+        # completion, so it's the reliable signal that DRA ran.
+        # If DRA is bypassed (gate off, controller not running, etc.)
+        # this field stays empty even though the pod can still succeed.
+        claim_name=$(kubectl --context "kind-$CLUSTER" get pod dra-smi \
+          -o jsonpath='{.status.resourceClaimStatuses[?(@.name=="gpu")].resourceClaimName}')
+        [ -n "$claim_name" ] || {
+          echo "pod has no status.resourceClaimStatuses[name=gpu] — DRA did not engage"
+          kubectl --context "kind-$CLUSTER" get pod dra-smi -o yaml
+          exit 1
+        }
+        kind delete cluster --name "$CLUSTER"  # if the step aborts earlier, Teardown removes the cluster
+
+    - name: Collect artifacts  # diagnostics for clusters of this run that are still present
+      if: always()
+      run: |
+        D=/tmp/nvkind-artifacts
+        mkdir -p "$D"
+        for c in $(kind get clusters 2>/dev/null | grep "^${CLUSTER_PREFIX}-" || true); do  # same filter as Teardown
+          kind export logs "$D/kind-$c" --name "$c" || true
+          kubectl --context "kind-$c" get pods -A -o wide > "$D/pods-$c.txt" || true
+          kubectl --context "kind-$c" get events -A \
+            --sort-by=.lastTimestamp > "$D/events-$c.txt" || true
+        done
+        sudo cat /etc/docker/daemon.json > "$D/docker-daemon.json" 2>/dev/null || true
+        sudo cat /etc/nvidia-container-runtime/config.toml \
+          > "$D/nvidia-ctk.toml" 2>/dev/null || true
+
+    - uses: actions/upload-artifact@v4
+      if: always()  # upload diagnostics even when a scenario failed
+      with:
+        name: nvkind-e2e-${{ matrix.arch }}-${{ github.run_id }}  # arch keeps the two matrix uploads distinct
+        path: /tmp/nvkind-artifacts  # directory filled by the Collect artifacts step
+        retention-days: 7
+
+    - name: Teardown
+      if: always()
+      run: |
+        # Only clusters whose name starts with this run's CLUSTER_PREFIX are removed.
+        mine=$(kind get clusters 2>/dev/null | grep "^${CLUSTER_PREFIX}-" || true)
+        for name in $mine; do kind delete cluster --name "$name" || true; done
+        docker system prune --force || true
diff --git a/hack/ci/dra-pod.yaml b/hack/ci/dra-pod.yaml
@@ -0,0 +1,30 @@
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate  # referenced by the dra-smi Pod through resourceClaimTemplateName
+metadata:
+  name: rct-gpu
+spec:
+  spec:
+    devices:
+      requests:
+      - name: gpu
+        exactly:
+          deviceClassName: gpu.nvidia.com  # NOTE(review): presumably installed by the nvidia-dra-driver-gpu chart — confirm
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: dra-smi
+spec:
+  restartPolicy: OnFailure
+  containers:
+  - name: smi
+    image: nvidia/cuda:12.5.0-base-ubuntu22.04  # base flavor: far smaller pull; the pod only runs nvidia-smi
+    # Use `-L` so the log is one line per GPU (`GPU N: ... (UUID: GPU-...)`),
+    # which lets the workflow assert the pod sees exactly one GPU.
+    command: ["nvidia-smi", "-L"]
+    resources:
+      claims:
+      - name: gpu  # refers to the spec.resourceClaims entry of the same name
+  resourceClaims:
+  - name: gpu  # the workflow looks up status.resourceClaimStatuses by this name
+    resourceClaimTemplateName: rct-gpu
diff --git a/hack/ci/smi-pod.yaml b/hack/ci/smi-pod.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: smi
+spec:
+  restartPolicy: OnFailure
+  containers:
+  - name: smi
+    image: nvidia/cuda:12.5.0-base-ubuntu22.04  # base flavor: far smaller pull; the pod only runs nvidia-smi
+    command: ["nvidia-smi"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1  # the extended resource whose node capacity the workflow polls for
diff --git a/hack/ci/templates/dra.yaml.tmpl b/hack/ci/templates/dra.yaml.tmpl
@@ -0,0 +1,45 @@
+# nvkind / kind cluster config template for DRA.
+# Rendered by nvkind: `numGPUs` is provided automatically based on host GPU count.
+# Enables DynamicResourceAllocation across control-plane components and kubelet,
+# and turns on CDI in containerd.
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+featureGates:
+  DynamicResourceAllocation: true
+containerdConfigPatches:
+- |-
+  [plugins."io.containerd.grpc.v1.cri"]
+    enable_cdi = true
+nodes:
+- role: control-plane
+  kubeadmConfigPatches:  # NOTE(review): may duplicate the top-level featureGates setting — confirm
+  - |
+    kind: ClusterConfiguration
+    apiServer:
+      extraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+    controllerManager:
+      extraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+    scheduler:
+      extraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+  - |
+    kind: InitConfiguration
+    nodeRegistration:
+      kubeletExtraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+- role: worker  # the GPU node: labelled, and given the per-GPU mounts below
+  labels:
+    nvidia.com/gpu.present: "true"
+  kubeadmConfigPatches:
+  - |
+    kind: JoinConfiguration
+    nodeRegistration:
+      kubeletExtraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+  extraMounts:  # one /dev/null mount per value produced by `until numGPUs` (presumably 0..numGPUs-1 — confirm)
+  {{- range $gpu := until numGPUs }}
+  - hostPath: /dev/null
+    containerPath: /var/run/nvidia-container-devices/{{ $gpu }}
+  {{- end }}