diff --git a/docs/quick-start/sriov-network-rdma.rst b/docs/quick-start/sriov-network-rdma.rst index f4e60dfa..8b9612e1 100644 --- a/docs/quick-start/sriov-network-rdma.rst +++ b/docs/quick-start/sriov-network-rdma.rst @@ -66,12 +66,51 @@ Deploy SR-IOV Network with RDMA kubectl apply -f pod.yaml -Verify the deployment: +**Step 6**: Verify the deployment + +Check that the pods are running on different nodes: + +.. code-block:: bash + + kubectl get pods -n default -o wide + +Verify RDMA devices are available in the pods: + +.. code-block:: bash + + kubectl -n default exec sriov-rdma-server -- ibv_devices + kubectl -n default exec sriov-rdma-client -- ibv_devices + +Capture the server IP and RDMA device names in environment variables: + +.. code-block:: bash + + export SERVER_IP=$(kubectl get pod sriov-rdma-server -n default -o jsonpath='{.metadata.annotations.k8s\.v1\.cni\.cncf\.io/network-status}' | jq -r '.[] | select(.name=="default/sriov-rdma-network") | .ips[0]') + export SERVER_RDMA_DEV=$(kubectl -n default exec sriov-rdma-server -- ibv_devices | awk 'NR==3 {print $1}') + export CLIENT_RDMA_DEV=$(kubectl -n default exec sriov-rdma-client -- ibv_devices | awk 'NR==3 {print $1}') + + echo "Server IP: $SERVER_IP" + echo "Server RDMA Device: $SERVER_RDMA_DEV" + echo "Client RDMA Device: $CLIENT_RDMA_DEV" + + +**Step 7**: Test RDMA connectivity + +Start the RDMA bandwidth test server: .. code-block:: bash - kubectl exec -it sriov-test-pod -- ip addr show - kubectl exec -it sriov-test-pod -- ibv_devices + kubectl -n default exec -it sriov-rdma-server -- bash -lc "ib_write_bw -d $SERVER_RDMA_DEV -R -a --report_gbits" + +In a separate terminal, re-export ``SERVER_IP`` and ``CLIENT_RDMA_DEV`` (exported variables do not carry over to a new shell), then run the RDMA bandwidth test client: + +.. code-block:: bash + + kubectl -n default exec -it sriov-rdma-client -- bash -lc "ib_write_bw -d $CLIENT_RDMA_DEV -R -a --report_gbits $SERVER_IP" + +.. note:: + The commands above automatically use the first available RDMA device from each pod. 
+ If you need to use a different device, manually set the environment variables or replace them in the command. **Complete Configuration** diff --git a/examples/processed/sriov-network-rdma/10-nicclusterpolicy.yaml b/examples/processed/sriov-network-rdma/10-nicclusterpolicy.yaml index 1074744f..147ce38c 100644 --- a/examples/processed/sriov-network-rdma/10-nicclusterpolicy.yaml +++ b/examples/processed/sriov-network-rdma/10-nicclusterpolicy.yaml @@ -3,10 +3,6 @@ kind: NicClusterPolicy metadata: name: nic-cluster-policy spec: - sriovDevicePlugin: - image: sriov-network-device-plugin - repository: nvcr.io/nvstaging/mellanox - version: network-operator-v25.10.0-rc.2 nvIpam: image: nvidia-k8s-ipam repository: nvcr.io/nvstaging/mellanox diff --git a/examples/processed/sriov-network-rdma/50-pod.yaml b/examples/processed/sriov-network-rdma/50-pod.yaml index f1d4a1dd..5ea12647 100644 --- a/examples/processed/sriov-network-rdma/50-pod.yaml +++ b/examples/processed/sriov-network-rdma/50-pod.yaml @@ -1,19 +1,69 @@ +--- apiVersion: v1 kind: Pod metadata: - name: sriov-test-pod + name: sriov-rdma-server + namespace: default + labels: + app: sriov-rdma + role: server annotations: k8s.v1.cni.cncf.io/networks: sriov-rdma-network spec: + tolerations: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoSchedule" + restartPolicy: Never containers: - - name: test-container - image: mellanox/rping-test + - name: rdma-test + image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: add: ["IPC_LOCK"] + privileged: true resources: requests: - nvidia.com/sriov_resource: '1' + nvidia.com/sriov_resource: "1" limits: - nvidia.com/sriov_resource: '1' \ No newline at end of file + nvidia.com/sriov_resource: "1" +--- +apiVersion: v1 +kind: Pod +metadata: + name: sriov-rdma-client + namespace: default + labels: 
+ app: sriov-rdma + role: client + annotations: + k8s.v1.cni.cncf.io/networks: sriov-rdma-network +spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: role + operator: In + values: + - server + topologyKey: kubernetes.io/hostname + restartPolicy: Never + containers: + - name: rdma-test + image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host + command: ["/bin/bash", "-c", "sleep infinity"] + securityContext: + capabilities: + add: ["IPC_LOCK"] + privileged: true + resources: + requests: + nvidia.com/sriov_resource: "1" + limits: + nvidia.com/sriov_resource: "1" \ No newline at end of file diff --git a/examples/processed/sriov-network-rdma/complete.yaml b/examples/processed/sriov-network-rdma/complete.yaml index 186f7057..2bbdebfe 100644 --- a/examples/processed/sriov-network-rdma/complete.yaml +++ b/examples/processed/sriov-network-rdma/complete.yaml @@ -3,10 +3,6 @@ kind: NicClusterPolicy metadata: name: nic-cluster-policy spec: - sriovDevicePlugin: - image: sriov-network-device-plugin - repository: nvcr.io/nvstaging/mellanox - version: network-operator-v25.10.0-rc.2 nvIpam: image: nvidia-k8s-ipam repository: nvcr.io/nvstaging/mellanox @@ -64,22 +60,72 @@ spec: networkNamespace: default resourceName: sriov_resource --- +--- +apiVersion: v1 +kind: Pod +metadata: + name: sriov-rdma-server + namespace: default + labels: + app: sriov-rdma + role: server + annotations: + k8s.v1.cni.cncf.io/networks: sriov-rdma-network +spec: + tolerations: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoSchedule" + restartPolicy: Never + containers: + - name: rdma-test + image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host + command: ["/bin/bash", "-c", "sleep infinity"] + securityContext: + capabilities: + add: ["IPC_LOCK"] + privileged: true + resources: + requests: + 
nvidia.com/sriov_resource: "1" + limits: + nvidia.com/sriov_resource: "1" +--- apiVersion: v1 kind: Pod metadata: - name: sriov-test-pod + name: sriov-rdma-client + namespace: default + labels: + app: sriov-rdma + role: client annotations: k8s.v1.cni.cncf.io/networks: sriov-rdma-network spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: role + operator: In + values: + - server + topologyKey: kubernetes.io/hostname + restartPolicy: Never containers: - - name: test-container - image: mellanox/rping-test + - name: rdma-test + image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: add: ["IPC_LOCK"] + privileged: true resources: requests: - nvidia.com/sriov_resource: '1' + nvidia.com/sriov_resource: "1" limits: - nvidia.com/sriov_resource: '1' + nvidia.com/sriov_resource: "1" diff --git a/examples/templates/sriov-network-rdma/10-nicclusterpolicy.yaml b/examples/templates/sriov-network-rdma/10-nicclusterpolicy.yaml index edc69b18..4cfa7a11 100644 --- a/examples/templates/sriov-network-rdma/10-nicclusterpolicy.yaml +++ b/examples/templates/sriov-network-rdma/10-nicclusterpolicy.yaml @@ -3,10 +3,6 @@ kind: NicClusterPolicy metadata: name: nic-cluster-policy spec: - sriovDevicePlugin: - image: sriov-network-device-plugin - repository: |sriovnetop-repository| - version: |sriovnetop-sriov-device-plugin-version| nvIpam: image: nvidia-k8s-ipam repository: |nvidia-ipam-repository| diff --git a/examples/templates/sriov-network-rdma/50-pod.yaml b/examples/templates/sriov-network-rdma/50-pod.yaml index f1d4a1dd..5ea12647 100644 --- a/examples/templates/sriov-network-rdma/50-pod.yaml +++ b/examples/templates/sriov-network-rdma/50-pod.yaml @@ -1,19 +1,69 @@ +--- apiVersion: v1 kind: Pod metadata: - name: sriov-test-pod + name: sriov-rdma-server + namespace: default + labels: + app: sriov-rdma + role: server annotations: 
k8s.v1.cni.cncf.io/networks: sriov-rdma-network spec: + tolerations: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoSchedule" + restartPolicy: Never containers: - - name: test-container - image: mellanox/rping-test + - name: rdma-test + image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: add: ["IPC_LOCK"] + privileged: true resources: requests: - nvidia.com/sriov_resource: '1' + nvidia.com/sriov_resource: "1" limits: - nvidia.com/sriov_resource: '1' \ No newline at end of file + nvidia.com/sriov_resource: "1" +--- +apiVersion: v1 +kind: Pod +metadata: + name: sriov-rdma-client + namespace: default + labels: + app: sriov-rdma + role: client + annotations: + k8s.v1.cni.cncf.io/networks: sriov-rdma-network +spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: role + operator: In + values: + - server + topologyKey: kubernetes.io/hostname + restartPolicy: Never + containers: + - name: rdma-test + image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host + command: ["/bin/bash", "-c", "sleep infinity"] + securityContext: + capabilities: + add: ["IPC_LOCK"] + privileged: true + resources: + requests: + nvidia.com/sriov_resource: "1" + limits: + nvidia.com/sriov_resource: "1" \ No newline at end of file