
Commit 2bd3206

Add working k8s GRPO recipe

1 parent f111e2f

File tree

3 files changed: +308 −6 lines

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+CUDA_DEVICE_ORDER=PCI_BUS_ID
+CUDA_DEVICE_MAX_CONNECTIONS=16
+VLLM_ENFORCE_EAGER=1
+VLLM_GPU_MEMORY_UTILIZATION=0.7
+VLLM_TENSOR_PARALLEL_SIZE=8
+VLLM_DISTRIBUTED_BACKEND=mp
+VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1
+VLLM_LOAD_FORMAT=dummy
+NCCL_NET_PLUGIN=/opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
+NCCL_TUNER_PLUGIN=none
+MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
+NCCL_CUMEM_ENABLE=0
+NCCL_BUFFSIZE=16777216
+XLA_FLAGS=--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_command_buffer=FUSION,CUBLAS,CUDNN,CUSTOM_CALL --xla_gpu_collective_permute_combine_threshold_bytes=8589934592 --xla_gpu_reduce_scatter_combine_threshold_bytes=8589934592 --xla_gpu_all_gather_combine_threshold_bytes=8589934592 --xla_gpu_all_reduce_combine_threshold_bytes=8589934592
+TRANSFER_MODE=grouped
+USE_POLYMORPHIC_MESH=0
+JAX_COORDINATOR_PORT=3389
+JAX_COORDINATOR_ADDRESS=$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME):$(JAX_COORDINATOR_PORT)
+GATEWAY_PORT=50051
+GATEWAY_URL=$(JOBSET_NAME):$(GATEWAY_PORT)
+OUTPUT_DIR=/opt/output
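
For reference, this dotenv-style file mirrors the env: block of the JobSet manifest below. A minimal sketch of loading it as a ConfigMap (the grpo.env path and grpo-env name are illustrative assumptions, not part of this commit):

# Sketch only: file path and ConfigMap name are assumed.
kubectl create configmap grpo-env --from-env-file=grpo.env
# A pod could then import the static entries with:
#   envFrom:
#   - configMapRef:
#       name: grpo-env

Note that $(JOBSET_NAME)-style references are Kubernetes substitutions, which the kubelet expands only for values written inline in a container's env/command fields (as the jobset.yaml below does); entries injected via envFrom would arrive unexpanded.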
Lines changed: 280 additions & 0 deletions
@@ -0,0 +1,280 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  annotations:
+  name: jax-vllm-grpo
+  namespace: default
+spec:
+  network:
+    enableDNSHostnames: true
+    publishNotReadyAddresses: true
+  replicatedJobs:
+  - name: slice-job
+    replicas: 1
+    template:
+      metadata: {}
+      spec:
+        backoffLimit: 0
+        completionMode: Indexed
+        completions: 2
+        parallelism: 2
+        template:
+          metadata:
+            annotations:
+              devices.gke.io/container.tcpxo-daemon: |
+                - path: /dev/nvidia0
+                - path: /dev/nvidia1
+                - path: /dev/nvidia2
+                - path: /dev/nvidia3
+                - path: /dev/nvidia4
+                - path: /dev/nvidia5
+                - path: /dev/nvidia6
+                - path: /dev/nvidia7
+                - path: /dev/nvidiactl
+                - path: /dev/nvidia-uvm
+                - path: /dev/dmabuf_import_helper
+              networking.gke.io/default-interface: eth0
+              networking.gke.io/interfaces: |-
+                [
+                  {"interfaceName":"eth0","network":"default"},
+                  {"interfaceName":"eth1","network":"jtb-2025-10-07-gpunet-0-subnet"},
+                  {"interfaceName":"eth2","network":"jtb-2025-10-07-gpunet-1-subnet"},
+                  {"interfaceName":"eth3","network":"jtb-2025-10-07-gpunet-2-subnet"},
+                  {"interfaceName":"eth4","network":"jtb-2025-10-07-gpunet-3-subnet"},
+                  {"interfaceName":"eth5","network":"jtb-2025-10-07-gpunet-4-subnet"},
+                  {"interfaceName":"eth6","network":"jtb-2025-10-07-gpunet-5-subnet"},
+                  {"interfaceName":"eth7","network":"jtb-2025-10-07-gpunet-6-subnet"},
+                  {"interfaceName":"eth8","network":"jtb-2025-10-07-gpunet-7-subnet"}
+                ]
+          spec:
+            imagePullSecrets:
+            - name: jax-toolbox-ghcr
+            containers:
+            - name: gpu-image
+              image: ghcr.io/nvidia/jax-toolbox-internal:19751502075-jio-amd64
+              imagePullPolicy: Always
+              command:
+              - bash
+              - -c
+              - |
+                pip install jax[k8s]
+                python -c "
+                import jax
+                jax.distributed.initialize()
+                print(jax.devices())
+                print(jax.local_devices())
+                assert jax.process_count() > 1
+                assert len(jax.devices()) > len(jax.local_devices())"
+
+                PIDS=()
+                # hard-code split of vLLM-JAX on 1x node each on 2x slice jobset
+                if [ ${NODE_RANK} = "0" ]; then
+                  echo "Starting gateway"
+                  cd /opt/jtbx/jax-inference-offloading
+                  python jax_inference_offloading/controller/gateway.py 2>&1 | tee -a gateway.log &
+                  PIDS+=($!)
+
+                  echo "Starting rollout"
+                  cd /opt/jtbx/jax-inference-offloading/examples
+                  python rollout.py 2>&1 | tee -a rollout.log &
+                  PIDS+=($!)
+                else
+                  echo "Starting trainer"
+                  export MODEL_PATH=$(python "download_model.py" --hub=hf --model=${MODEL_NAME} --ignore="*.pth")
+                  python trainer_grpo.py 2>&1 | tee -a trainer_grpo.log &
+                  PIDS+=($!)
+                fi
+
+                wait "${PIDS[@]}"
+                echo "All done"
+              env:
+              # jobset
+              - name: REPLICATED_JOB_NAME
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
+              - name: JOBSET_NAME
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
+              - name: NODE_RANK
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+              - name: USE_GPUDIRECT
+                value: tcpxo
+              - name: GPUS_PER_NODE
+                value: "8"
+
+              - name: LD_LIBRARY_PATH
+                value: "/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.9/compat/lib.real:/usr/local/nvidia/lib64"
+
+              # huggingface
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+              - name: MODEL_NAME
+                value: "meta-llama/Llama-3.1-8B-Instruct"
+              - name: SCRATCHDIR
+                value: "/opt/scratch"
+
+              # gateway
+              - name: GATEWAY_PORT
+                value: "50051"
+              - name: GATEWAY_URL
+                value: "$(JOBSET_NAME):$(GATEWAY_PORT)"
+
+              # JAX
+              - name: JAX_COORDINATOR_PORT
+                value: "3389"
+              - name: JAX_COORDINATOR_ADDRESS
+                value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME):3389
+
+              # CUDA
+              - name: CUDA_VISIBLE_DEVICES
+                value: "0,1,2,3,4,5,6,7"
+              - name: CUDA_DEVICE_ORDER
+                value: "PCI_BUS_ID"
+              - name: CUDA_DEVICE_MAX_CONNECTIONS
+                value: "16"
+
+              # vLLM
+              - name: VLLM_ENFORCE_EAGER
+                value: "1"
+              - name: VLLM_GPU_MEMORY_UTILIZATION
+                value: "0.7"
+              - name: VLLM_TENSOR_PARALLEL_SIZE
+                value: "8"
+              - name: VLLM_DISTRIBUTED_BACKEND
+                value: "mp"
+              - name: VLLM_ATTENTION_BACKEND
+                value: "TRITON_ATTN"
+              - name: VLLM_LOAD_FORMAT
+                value: "dummy"
+
+              # NCCL
+              - name: NCCL_NET_PLUGIN
+                value: "/opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so"
+              - name: NCCL_TUNER_PLUGIN
+                value: "none"
+              - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
+                value: /dev/aperture_devices
+              - name: NCCL_CUMEM_ENABLE
+                value: "0" # https://docs.vllm.ai/en/v0.9.1/usage/troubleshooting.html#known-issues
+              - name: NCCL_BUFFSIZE
+                value: "16777216"
+
+              # XLA
+              - name: XLA_PYTHON_CLIENT_MEM_FRACTION
+                value: "0.95"
+              - name: XLA_FLAGS
+                value: "--xla_gpu_enable_latency_hiding_scheduler=true
+                  --xla_gpu_enable_command_buffer=FUSION,CUBLAS,CUDNN,CUSTOM_CALL
+                  --xla_gpu_collective_permute_combine_threshold_bytes=8589934592
+                  --xla_gpu_reduce_scatter_combine_threshold_bytes=8589934592
+                  --xla_gpu_all_gather_combine_threshold_bytes=8589934592
+                  --xla_gpu_all_reduce_combine_threshold_bytes=8589934592"
+
+              # trainer
+              - name: TRANSFER_MODE
+                value: "grouped"
+              - name: USE_POLYMORPHIC_MESH
+                value: "0"
+              - name: JAX_COMPILATION_CACHE_DIR
+                value: /opt/jax-compilation
+              - name: JAX_PERSISTENT_CACHE_MIN_COMPILE_TIME_SECS
+                value: "0.1"
+              - name: RUN_MODE
+                value: "timing"
+              - name: ROLLOUT_ENGINE
+                value: "vllm_gpu"
+              - name: GRPO_TRAIN_MICRO_BATCH_SIZE
+                value: "2"
+
+              ports:
+              - containerPort: 50051
+                protocol: TCP
+              - containerPort: 3389
+                protocol: TCP
+              resources:
+                limits:
+                  nvidia.com/gpu: "8"
+              securityContext:
+                privileged: true
+              volumeMounts:
+              - mountPath: /dev/aperture_devices
+                name: aperture-devices
+              - mountPath: /usr/local/nvidia
+                name: libraries
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /opt/scratch
+                name: scratch
+            dnsPolicy: ClusterFirstWithHostNet
+            initContainers:
+            - args:
+              - |-
+                set -ex
+                chmod 755 /fts/entrypoint_rxdm_container.sh
+                /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
+              command:
+              - /bin/sh
+              - -c
+              env:
+              - name: LD_LIBRARY_PATH
+                value: /usr/local/nvidia/lib64
+              image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.12
+              imagePullPolicy: Always
+              name: tcpxo-daemon
+              resources: {}
+              restartPolicy: Always
+              securityContext:
+                capabilities:
+                  add:
+                  - NET_ADMIN
+                  - NET_BIND_SERVICE
+              volumeMounts:
+              - mountPath: /usr/local/nvidia
+                name: libraries
+              - mountPath: /hostsysfs
+                name: sys
+              - mountPath: /hostprocsysfs
+                name: proc-sys
+            nodeSelector:
+              cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb
+            priorityClassName: high
+            terminationGracePeriodSeconds: 30
+            tolerations:
+            - key: nvidia.com/gpu
+              operator: Exists
+            - effect: NoSchedule
+              key: user-workload
+              operator: Equal
+              value: "true"
+            volumes:
+            - hostPath:
+                path: /home/kubernetes/bin/nvidia
+              name: libraries
+            - hostPath:
+                path: /sys
+              name: sys
+            - hostPath:
+                path: /proc/sys
+              name: proc-sys
+            - hostPath:
+                path: /dev/aperture_devices
+              name: aperture-devices
+            - emptyDir:
+                medium: Memory
+              name: dshm
+            - emptyDir:
+                sizeLimit: 2Gi
+              name: scratch
+  startupPolicy:
+    startupPolicyOrder: AnyOrder
+  successPolicy:
+    operator: All
+  ttlSecondsAfterFinished: 100000
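
For reference, a minimal sketch of launching and watching this recipe, assuming the manifest above is saved as grpo-jobset.yaml and the JobSet controller is already installed in the cluster:

# Sketch only: the file name is an assumption; jax-vllm-grpo is metadata.name above.
kubectl apply -f grpo-jobset.yaml
kubectl get jobset jax-vllm-grpo -n default
kubectl get pods -n default -l jobset.sigs.k8s.io/jobset-name=jax-vllm-grpo
# Completion index 0 runs the gateway and rollout, index 1 the trainer (see NODE_RANK above):
kubectl logs -n default -f --prefix -l jobset.sigs.k8s.io/jobset-name=jax-vllm-grpo

With successPolicy operator: All, the JobSet succeeds only once both indexed pods exit cleanly.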

.github/gke-workflow/jax-vllm-offloading/transfer/jobset.yaml

Lines changed: 6 additions & 6 deletions
@@ -2,7 +2,7 @@ apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
 metadata:
   annotations:
-  name: jax-vllm-jobset
+  name: jax-vllm-transfer
   namespace: default
 spec:
   network:
@@ -161,11 +161,11 @@
               # XLA
               - name: XLA_FLAGS
                 value: "--xla_gpu_enable_latency_hiding_scheduler=true
-                --xla_gpu_enable_command_buffer=FUSION,CUBLAS,CUDNN,CUSTOM_CALL
-                --xla_gpu_collective_permute_combine_threshold_bytes=8589934592
-                --xla_gpu_reduce_scatter_combine_threshold_bytes=8589934592
-                --xla_gpu_all_gather_combine_threshold_bytes=8589934592
-                --xla_gpu_all_reduce_combine_threshold_bytes=8589934592"
+                  --xla_gpu_enable_command_buffer=FUSION,CUBLAS,CUDNN,CUSTOM_CALL
+                  --xla_gpu_collective_permute_combine_threshold_bytes=8589934592
+                  --xla_gpu_reduce_scatter_combine_threshold_bytes=8589934592
+                  --xla_gpu_all_gather_combine_threshold_bytes=8589934592
+                  --xla_gpu_all_reduce_combine_threshold_bytes=8589934592"

               # trainer
               - name: TRANSFER_MODE
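
The first hunk renames this JobSet from jax-vllm-jobset to jax-vllm-transfer; the XLA_FLAGS hunk is indentation-only. YAML folds the continuation lines of a multi-line double-quoted scalar with single spaces, so the parsed flag string is unchanged. A quick sketch to print the parsed value (assumes PyYAML, and that this file nests its containers the same way as the GRPO manifest above):

# Sketch only: run against the transfer jobset.yaml.
python - <<'EOF'
import yaml
with open("jobset.yaml") as f:  # path assumed
    doc = yaml.safe_load(f)
ctr = doc["spec"]["replicatedJobs"][0]["template"]["spec"]["template"]["spec"]["containers"][0]
print(next(e["value"] for e in ctr["env"] if e["name"] == "XLA_FLAGS"))
EOF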
