45 changes: 42 additions & 3 deletions docs/quick-start/sriov-network-rdma.rst
@@ -66,12 +66,51 @@ Deploy SR-IOV Network with RDMA

   kubectl apply -f pod.yaml

Verify the deployment:
**Step 6**: Verify the deployment

Check that the pods are running on different nodes:

.. code-block:: bash

   kubectl get pods -n default -o wide

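If scheduling worked, the ``NODE`` column shows two different nodes. Illustrative output (pod IPs, ages, and node names are placeholders and will differ in your cluster):

.. code-block:: text

   NAME                READY   STATUS    RESTARTS   AGE   IP            NODE
   sriov-rdma-server   1/1     Running   0          1m    10.244.1.12   worker-1
   sriov-rdma-client   1/1     Running   0          1m    10.244.2.9    worker-2
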
Verify RDMA devices are available in the pods:

.. code-block:: bash

   kubectl -n default exec sriov-rdma-server -- ibv_devices
   kubectl -n default exec sriov-rdma-client -- ibv_devices

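Each pod should report at least one RDMA device. The output has a two-line header followed by one row per device, which is why the next step extracts the first device with ``awk 'NR==3'``. Illustrative output (device name and GUID are placeholders):

.. code-block:: text

   device                 node GUID
   ------              ----------------
   mlx5_4              b8cef60300112233
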
Capture the server IP and RDMA device names in environment variables:

.. code-block:: bash

   export SERVER_IP=$(kubectl get pod sriov-rdma-server -n default -o jsonpath='{.metadata.annotations.k8s\.v1\.cni\.cncf\.io/network-status}' | jq -r '.[] | select(.name=="default/sriov-rdma-network") | .ips[0]')
   export SERVER_RDMA_DEV=$(kubectl -n default exec sriov-rdma-server -- ibv_devices | awk 'NR==3 {print $1}')
   export CLIENT_RDMA_DEV=$(kubectl -n default exec sriov-rdma-client -- ibv_devices | awk 'NR==3 {print $1}')

   echo "Server IP: $SERVER_IP"
   echo "Server RDMA Device: $SERVER_RDMA_DEV"
   echo "Client RDMA Device: $CLIENT_RDMA_DEV"

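The ``jq`` filter above works because Multus records every attached network in the pod's ``k8s.v1.cni.cncf.io/network-status`` annotation as a JSON array. A trimmed, illustrative example of that annotation (interface names, IPs, and MAC are placeholders); ``select(.name=="default/sriov-rdma-network")`` matches the SR-IOV attachment and ``.ips[0]`` returns its first address:

.. code-block:: json

   [
     {
       "name": "cbr0",
       "interface": "eth0",
       "ips": ["10.244.1.12"],
       "default": true
     },
     {
       "name": "default/sriov-rdma-network",
       "interface": "net1",
       "ips": ["192.168.1.10"],
       "mac": "0a:58:c0:a8:01:0a"
     }
   ]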

**Step 7**: Test RDMA connectivity

Start the RDMA bandwidth test server. ``-R`` connects the queue pairs through RDMA CM, ``-a`` sweeps all message sizes, and ``--report_gbits`` reports results in Gbit/s:

.. code-block:: bash

   kubectl exec -it sriov-test-pod -- ip addr show
   kubectl exec -it sriov-test-pod -- ibv_devices
   kubectl -n default exec -it sriov-rdma-server -- bash -lc "ib_write_bw -d $SERVER_RDMA_DEV -R -a --report_gbits"

In a separate terminal, run the RDMA bandwidth test client:

.. code-block:: bash

   kubectl -n default exec -it sriov-rdma-client -- bash -lc "ib_write_bw -d $CLIENT_RDMA_DEV -R -a --report_gbits $SERVER_IP"

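When the client connects, both sides print a results table per message size. The numbers below are purely illustrative; actual bandwidth depends on the NIC, link speed, and fabric:

.. code-block:: text

   #bytes     #iterations    BW peak[Gb/sec]    BW average[Gb/sec]    MsgRate[Mpps]
   65536      5000           96.58              96.55                 0.184155
   8388608    5000           97.21              97.20                 0.001449
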
.. note::
   The commands above automatically use the first RDMA device reported by each pod.
   To use a different device, set the environment variables manually (see the example below) or substitute the device name directly in the commands.

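For example, to pin the server to a specific device (``mlx5_5`` is a placeholder for a name from your ``ibv_devices`` output):

.. code-block:: bash

   export SERVER_RDMA_DEV=mlx5_5
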
**Complete Configuration**

@@ -3,10 +3,6 @@ kind: NicClusterPolicy
metadata:
  name: nic-cluster-policy
spec:
  sriovDevicePlugin:
    image: sriov-network-device-plugin
    repository: nvcr.io/nvstaging/mellanox
    version: network-operator-v25.10.0-rc.2
  nvIpam:
    image: nvidia-k8s-ipam
    repository: nvcr.io/nvstaging/mellanox
60 changes: 55 additions & 5 deletions examples/processed/sriov-network-rdma/50-pod.yaml
@@ -1,19 +1,69 @@
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-test-pod
  name: sriov-rdma-server
  namespace: default
  labels:
    app: sriov-rdma
    role: server
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  tolerations:
  - key: "node-role.kubernetes.io/control-plane"
    operator: "Exists"
    effect: "NoSchedule"
  - key: "node-role.kubernetes.io/master"
    operator: "Exists"
    effect: "NoSchedule"
  restartPolicy: Never
  containers:
  - name: test-container
    image: mellanox/rping-test
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: '1'
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: '1'
        nvidia.com/sriov_resource: "1"
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-rdma-client
  namespace: default
  labels:
    app: sriov-rdma
    role: client
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: role
            operator: In
            values:
            - server
        topologyKey: kubernetes.io/hostname
  restartPolicy: Never
  containers:
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: "1"
64 changes: 55 additions & 9 deletions examples/processed/sriov-network-rdma/complete.yaml
@@ -3,10 +3,6 @@ kind: NicClusterPolicy
metadata:
  name: nic-cluster-policy
spec:
  sriovDevicePlugin:
    image: sriov-network-device-plugin
    repository: nvcr.io/nvstaging/mellanox
    version: network-operator-v25.10.0-rc.2
  nvIpam:
    image: nvidia-k8s-ipam
    repository: nvcr.io/nvstaging/mellanox
@@ -64,22 +60,72 @@ spec:
  networkNamespace: default
  resourceName: sriov_resource
---
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-rdma-server
  namespace: default
  labels:
    app: sriov-rdma
    role: server
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  tolerations:
  - key: "node-role.kubernetes.io/control-plane"
    operator: "Exists"
    effect: "NoSchedule"
  - key: "node-role.kubernetes.io/master"
    operator: "Exists"
    effect: "NoSchedule"
  restartPolicy: Never
  containers:
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: "1"
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-test-pod
  name: sriov-rdma-client
  namespace: default
  labels:
    app: sriov-rdma
    role: client
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: role
            operator: In
            values:
            - server
        topologyKey: kubernetes.io/hostname
  restartPolicy: Never
  containers:
  - name: test-container
    image: mellanox/rping-test
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: '1'
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: '1'
        nvidia.com/sriov_resource: "1"
@@ -3,10 +3,6 @@ kind: NicClusterPolicy
metadata:
  name: nic-cluster-policy
spec:
  sriovDevicePlugin:
    image: sriov-network-device-plugin
    repository: |sriovnetop-repository|
    version: |sriovnetop-sriov-device-plugin-version|
  nvIpam:
    image: nvidia-k8s-ipam
    repository: |nvidia-ipam-repository|
60 changes: 55 additions & 5 deletions examples/templates/sriov-network-rdma/50-pod.yaml
@@ -1,19 +1,69 @@
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-test-pod
  name: sriov-rdma-server
  namespace: default
  labels:
    app: sriov-rdma
    role: server
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  tolerations:
  - key: "node-role.kubernetes.io/control-plane"
    operator: "Exists"
    effect: "NoSchedule"
  - key: "node-role.kubernetes.io/master"
    operator: "Exists"
    effect: "NoSchedule"
  restartPolicy: Never
  containers:
  - name: test-container
    image: mellanox/rping-test
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: '1'
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: '1'
        nvidia.com/sriov_resource: "1"
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-rdma-client
  namespace: default
  labels:
    app: sriov-rdma
    role: client
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: role
            operator: In
            values:
            - server
        topologyKey: kubernetes.io/hostname
  restartPolicy: Never
  containers:
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: "1"