diff --git a/.github/workflows/docs-ci.yaml b/.github/workflows/docs-ci.yaml index 5dc28dde..dfe45622 100644 --- a/.github/workflows/docs-ci.yaml +++ b/.github/workflows/docs-ci.yaml @@ -27,7 +27,7 @@ jobs: env: NGC_CLI_API_KEY: ${{ secrets.NVCR_TOKEN }} run: | - make api-docs helm-docs generate-docs-versions-var + make api-docs helm-docs generate-docs-versions-var nic-conf-docs - name: Close any existing documentation PRs run: | for pr_number in $(gh pr list --search "$PR_TITLE_PREFIX" --json number --jq ".[].number"); do diff --git a/Makefile b/Makefile index bd526a8c..6a664dd9 100644 --- a/Makefile +++ b/Makefile @@ -30,11 +30,15 @@ endif # Network Operator source tar location REPO_TAR_URL ?= https://github.com/Mellanox/network-operator/archive/refs/$(TAR_PATH) +# NIC Configuration Operator source tar location (GitHub archive of the ref selected by TAR_PATH) +NIC_CONF_REPO_TAR_URL ?= https://github.com/Mellanox/nic-configuration-operator/archive/refs/$(TAR_PATH) # release.yaml location RELEASE_YAML_URL ?= https://raw.githubusercontent.com/Mellanox/network-operator/$(if $(TAG),$(TAG),$(BRANCH))/hack/release.yaml # Path to download the crd api to. CRD_API_DEP_ROOT = $(BUILDDIR)/crd +# Path to download the nic-conf-operator crd api to; download-nic-conf-api extracts the source tar here. +NIC_CONF_CRD_API_DEP_ROOT = $(BUILDDIR)/nic-conf-crd # Path to download the helm chart to. HELM_CHART_DEP_ROOT = $(BUILDDIR)/helmcharts # Helm chart version and url @@ -42,7 +46,7 @@ HELM_CHART_VERSION ?= 24.4.1 NGC_HELM_CHART_URL ?= https://helm.ngc.nvidia.com/nvidia/charts/network-operator-${HELM_CHART_VERSION}.tgz HELM_CHART_PATH ?= -$(BUILDDIR) $(TOOLSDIR) $(HELM_CHART_DEP_ROOT) $(CRD_API_DEP_ROOT): ; $(info Creating directory $@...) +$(BUILDDIR) $(TOOLSDIR) $(HELM_CHART_DEP_ROOT) $(CRD_API_DEP_ROOT) $(NIC_CONF_CRD_API_DEP_ROOT): ; $(info Creating directory $@...) 
mkdir -p $@ @@ -113,16 +117,56 @@ download-api: | $(CRD_API_DEP_ROOT) curl -sL ${REPO_TAR_URL} \ | tar -xz -C ${CRD_API_DEP_ROOT} +# Download the NIC Configuration Operator source tarball and unpack it into NIC_CONF_CRD_API_DEP_ROOT. +.PHONY: download-nic-conf-api +download-nic-conf-api: | $(NIC_CONF_CRD_API_DEP_ROOT) + curl -sL ${NIC_CONF_REPO_TAR_URL} \ + | tar -xz -C ${NIC_CONF_CRD_API_DEP_ROOT} + gen-crd-api-docs: | $(GEN_CRD_API_REFERENCE_DOCS) download-api cd ${CRD_API_DEP_ROOT}/network-operator-${SRC}/api/v1alpha1 && \ $(GEN_CRD_API_REFERENCE_DOCS) -api-dir=. -config=${CURDIR}/hack/api-docs/config.json \ -template-dir=${CURDIR}/hack/api-docs/templates -out-file=${BUILDDIR}/crds-api.html +# Render the api/v1alpha1 types of the downloaded NIC Configuration Operator sources to $(BUILDDIR)/nic-conf-crds-api.html. +gen-nic-conf-crd-api-docs: | $(GEN_CRD_API_REFERENCE_DOCS) download-nic-conf-api + cd ${NIC_CONF_CRD_API_DEP_ROOT}/nic-configuration-operator-${SRC}/api/v1alpha1 && \ + $(GEN_CRD_API_REFERENCE_DOCS) -api-dir=. -config=${CURDIR}/hack/api-docs/nic-conf-config.json \ + -template-dir=${CURDIR}/hack/api-docs/templates -out-file=${BUILDDIR}/nic-conf-crds-api.html + .PHONY: api-docs api-docs: gen-crd-api-docs docker run --rm --volume "`pwd`:/data:Z" pandoc/minimal -f html -t rst --lua-filter=/data/hack/ref_links.lua \ --columns 200 /data/build/_output/crds-api.html -o /data/docs/customizations/crds.rst + +# Convert the generated HTML API reference to RST with pandoc, writing docs/nic-conf-operator/crds.rst. +.PHONY: nic-conf-api-docs +nic-conf-api-docs: gen-nic-conf-crd-api-docs + docker run --rm --volume "`pwd`:/data:Z" pandoc/minimal -f html -t rst --lua-filter=/data/hack/ref_links.lua \ + --columns 200 /data/build/_output/nic-conf-crds-api.html -o /data/docs/nic-conf-operator/crds.rst + +# Build the API reference for the operator version pinned in docs/common/vars.rst; fails fast if that version cannot be read. Uses $(MAKE) so flags and the jobserver reach the sub-make. +.PHONY: nic-conf-api-docs-versioned +nic-conf-api-docs-versioned: + $(eval NIC_CONF_VERSION := $(shell grep "nic-configuration-operator-version" docs/common/vars.rst | sed 's/.*replace:: //')) + $(if $(strip $(NIC_CONF_VERSION)),,$(error nic-configuration-operator-version not found in docs/common/vars.rst)) + @echo "Using NIC Configuration Operator version: $(NIC_CONF_VERSION)" + TAG=$(NIC_CONF_VERSION) $(MAKE) nic-conf-api-docs + +# Run hack/fetch-config-docs.sh for the operator version pinned in docs/common/vars.rst; an empty version would shift the script arguments, so it is rejected up front. +.PHONY: fetch-config-docs +fetch-config-docs: + $(eval NIC_CONF_VERSION := $(shell grep "nic-configuration-operator-version" docs/common/vars.rst | sed 's/.*replace:: //')) + $(if $(strip $(NIC_CONF_VERSION)),,$(error nic-configuration-operator-version not found in docs/common/vars.rst)) + @echo
"Fetching configuration documentation for version: $(NIC_CONF_VERSION)" + ./hack/fetch-config-docs.sh Mellanox/nic-configuration-operator $(NIC_CONF_VERSION) README.md docs/nic-conf-operator/configuration-details.rst + +# Aggregate target: versioned API reference followed by the fetched configuration details. +.PHONY: nic-conf-docs +nic-conf-docs: nic-conf-api-docs-versioned fetch-config-docs + @echo "Generated all NIC Configuration Operator documentation" + .PHONY: build-cache build-cache: @if [ -d "$(CACHE_DIR)" ]; then \ diff --git a/docs/getting-started-kubernetes.rst b/docs/getting-started-kubernetes.rst index 90ef63cc..2506a5bb 100644 --- a/docs/getting-started-kubernetes.rst +++ b/docs/getting-started-kubernetes.rst @@ -2680,274 +2680,3 @@ The ``pod.yaml`` configuration file for such a deployment: - sh - -c - sleep inf - - -=========================================================================== -Configure NIC Firmware using the NIC Configuration Operator -=========================================================================== -`NVIDIA NIC Configuration Operator `_ provides Kubernetes API (Custom Resource Definition) to allow Firmware update and configuration on NVIDIA NICs in a coordinated manner. It deploys a configuration daemon on each of the desired nodes to configure NVIDIA NICs there. NVIDIA NIC Configuration Operator uses `Maintenance Operator `_ to prepare a node for maintenance before the actual configuration. - -.. warning:: NVIDIA NIC Configuration Operator does not support FW reset flow for DPU mode. Check `limitations `_ - -.. note:: - To perform Firmware validation and update on NIC devices, NIC Configuration Operator requires a persistent storage set up in the cluster. - To set up a persistent NFS storage in the cluster, the `example from the CSI NFS Driver repository `_ might be used. - After deploying the NFS server and NFS CSI driver, the `storage class `_ should become available in the cluster. The name of the storage class should then be passed when configuring the NIC Configuration Operator. 
- -First install the Network Operator helm chart with the Maintenance Operator enabled and deploy a NIC Cluster Policy CRD with NIC Configuration Operator enabled: - -``values.yaml``: - -.. code-block:: yaml - - maintenanceOperator: - enabled: true - -``nicclusterpolicy.yaml``: - -.. code-block:: yaml - :substitutions: - - apiVersion: mellanox.com/v1alpha1 - kind: NicClusterPolicy - metadata: - name: nic-cluster-policy - spec: - nicConfigurationOperator: - operator: - image: nic-configuration-operator - repository: |nic-configuration-operator-repository| - version: |nic-configuration-operator-version| - configurationDaemon: - image: nic-configuration-operator-daemon - repository: |nic-configuration-operator-repository| - version: |nic-configuration-operator-version| - nicFirmwareStorage: - create: true - pvcName: nic-fw-storage-pvc - # Name of the storage class is provided by the user - storageClassName: nfs-csi - availableStorageSize: 1Gi - -Observe the NicDevice CRs detected in the cluster. The name of the CR is composed from the node name, NIC type and its serial number: - -.. code:: bash - - > kubectl get nicdevices -n nvidia-network-operator - - NAME AGE - node1-1015-mt1627x08307 1m - node1-101d-mt1952x03330 1m - node2-1015-mt1627x08305 1m - node2-101d-mt1952x03327 1m - -Discover more information about a specific device: - -.. code:: bash - - kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o yaml - -.. 
code-block:: yaml - - apiVersion: configuration.net.nvidia.com/v1alpha1 - kind: NicDevice - metadata: - creationTimestamp: "2024-09-21T08:43:08Z" - generation: 1 - name: node1-101d-mt1952x03327 - namespace: nvidia-network-operator - ownerReferences: - - apiVersion: v1 - kind: Node - name: node1 - uid: 25c4f4e2-f7ba-4ba9-9a87-8056313ffc79 - resourceVersion: "1177095" - uid: ac6763bf-67c6-4af5-81f8-1aad5da929bf - spec: {} - status: - conditions: - - type: FirmwareUpdateInProgress - status: "False" - reason: DeviceFirmwareSpecEmpty - message: Device firmware spec is empty, cannot update or validate firmware - lastTransitionTime: "2024-09-21T08:43:04Z" - - type: ConfigUpdateInProgress - status: "False" - reason: DeviceConfigSpecEmpty - message: Device configuration spec is empty, cannot update configuration - lastTransitionTime: "2024-09-21T08:43:08Z" - firmwareVersion: 22.39.1015 - node: cloud-dev-41 - partNumber: mcx623106ac-cdat - ports: - - networkInterface: enp3s0f0np0 - pci: "0000:03:00.0" - rdmaInterface: mlx5_0 - - networkInterface: enp3s0f1np1 - pci: "0000:03:00.1" - rdmaInterface: mlx5_1 - psid: mt_0000000436 - serialNumber: mt1952x03327 - type: 101d - -Configure and apply the NICFirmwareSource CR: - -.. code-block:: yaml - - apiVersion: configuration.net.nvidia.com/v1alpha1 - kind: NicFirmwareSource - metadata: - name: connectx6-dx-firmware-22-44-1036 - namespace: nvidia-network-operator - finalizers: - - configuration.net.nvidia.com/nic-configuration-operator - spec: - # a list of firmware binaries zip archives from the Mellanox website, can point to any url accessible from the cluster - binUrlSources: - - https://www.mellanox.com/downloads/firmware/fw-ConnectX6Dx-rel-22_44_1036-MCX623106AC-CDA_Ax-UEFI-14.37.14-FlexBoot-3.7.500.signed.bin.zip - -Observe the NICFirmwareSource status: - -.. code:: bash - - > kubectl get nicfirmwaresource -n nvidia-network-operator connectx6-dx-firmware-22-44-1036 -o yaml - - ... 
- status: - state: Success - versions: - 22.44.1036: - - mt_0000000436 - -Configure and apply the NicFirmwareTemplate CR: - -.. code-block:: yaml - - apiVersion: configuration.net.nvidia.com/v1alpha1 - kind: NicFirmwareTemplate - metadata: - name: connectx6dx-config - namespace: nvidia-network-operator - spec: - nodeSelector: - kubernetes.io/hostname: node1 - nicSelector: - nicType: "101d" - template: - nicFirmwareSourceRef: connectx6dx-firmware-22-44-1036 - updatePolicy: Update - -Configure and apply the NicConfigurationTemplate CR: - -.. code-block:: yaml - - apiVersion: configuration.net.nvidia.com/v1alpha1 - kind: NicConfigurationTemplate - metadata: - name: connectx6-config - namespace: nvidia-network-operator - spec: - nodeSelector: - feature.node.kubernetes.io/network-sriov.capable: "true" - nicSelector: - # nicType selector is mandatory the rest are optional. Only a single type can be specified. - nicType: 101d - pciAddresses: - - "0000:03:00.0" - - “0000:04:00.0” - serialNumbers: - - "mt1952x03327" - resetToDefault: false # if set, template is ignored, device configuration should reset - template: - # numVfs and linkType fields are mandatory, the rest are optional - numVfs: 2 - linkType: Ethernet - pciPerformanceOptimized: - enabled: true - maxReadRequest: 4096 - roceOptimized: - enabled: true - qos: - trust: dscp - pfc: "0,0,0,1,0,0,0,0" - gpuDirectOptimized: - enabled: true - env: Baremetal - -.. note:: It's not possible to apply more than one template of each kind (NICFirmwareTemplate or NICConfigurationTemplate) to a single device. In this case, no template will be applied and an error event will be emitted for the corresponding NicDevice CR. - -.. note:: To use the NIC Configuration Operator functionality together with SR-IOV Network Operator, "mellanox" `plugin should be disabled `_ in the SR-IOV Network Operator. - -For more information about the CRD API, refer to `API documentation `_. 
-For more information, which FW parameter each settings corresponds to, refer to `Configuration details doc section `_. - -Spec of the NicDevice CR is updated in accordance with the NICFirmwareTemplate and NicConfigurationTemplate CRs matching the device - -.. code-block:: bash - - > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.spec}' | yq -P - - template: - firmware: - nicFirmwareSourceRef: connectx6dx-firmware-22-44-1036 - updatePolicy: Update - configuration: - numVfs: 2 - linkType: Ethernet - pciPerformanceOptimized: - enabled: true - roceOptimized: - enabled: true - qos: - trust: dscp - pfc: "0,0,0,1,0,0,0,0" - gpuDirectOptimized: - enabled: true - env: Baremetal - - -Status conditions of the NicDevice CR reflect the status of the configuration update and indicate any errors that might occur during the process - -.. code-block:: bash - - > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.status.conditions}' | yq -P - - - type: FirmwareUpdateInProgress - status: "False" - reason: DeviceFirmwareConfigMatch - message: Firmware matches the requested version - observedGeneration: 4 - lastTransitionTime: "2024-09-21T08:42:23Z" - - type: ConfigUpdateInProgress - status: "True" - reason: UpdateStarted - message: "" - lastTransitionTime: "2024-09-21T08:43:08Z" - ----------------------------------- -NIC Firmware Mismatch Notification ----------------------------------- - -NIC Configuration Operator updates status conditions of the NicDevice CR to set `FirmwareConfigMatch` condition based on a current NIC firmware: - -.. 
code-block:: bash - - > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.status.conditions}' | yq -P - - - type: FirmwareConfigMatch - status: "True" - reason: DeviceFirmwareConfigMatch - message: Device firmware '20.42.1000' matches to recommended version '20.42.1000' - lastTransitionTime: "2024-09-21T08:43:10Z" - -`FirmwareConfigMatch` condition status is set to `Unknown` if DOCA-OFED Driver is not installed otherwise it notifies if current NIC firmware is recommended or not recommended by DOCA-OFED Driver. E.g.: - -.. code-block:: bash - - > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.status.conditions}' | yq -P - - - type: FirmwareConfigMatch - status: "True" - reason: DeviceFirmwareConfigMatch - message: Device firmware '20.42.1000' matches to recommended version '20.42.1000' - lastTransitionTime: "2024-11-08T09:19:41Z" diff --git a/docs/index.rst b/docs/index.rst index b3c62c87..85c033ab 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,6 +25,7 @@ Platform Support Getting Started with Kubernetes Getting Started with Red Hat OpenShift + NIC Configuration Operator Customization Options and CRDs Life Cycle Management Advanced Configurations diff --git a/docs/nic-conf-operator/configuration-details.rst b/docs/nic-conf-operator/configuration-details.rst new file mode 100644 index 00000000..246f0db4 --- /dev/null +++ b/docs/nic-conf-operator/configuration-details.rst @@ -0,0 +1,72 @@ +.. license-header + SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + SPDX-License-Identifier: Apache-2.0 + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. headings # #, * *, =, -, ^, " + +========================================== +Configuration Details +========================================== + + +Configuration details +^^^^^^^^^^^^^^^^^^^^^ + +- ``numVfs``: if provided, configure SR-IOV VFs via nvconfig. + + - This is a mandatory parameter. + - E.g.: if ``numVfs=2`` then ``SRIOV_EN=1`` and ``SRIOV_NUM_OF_VFS=2``. + - If ``numVfs=0`` then ``SRIOV_EN=0`` and ``SRIOV_NUM_OF_VFS=0``. + +- ``linkType``: if provided, configure ``linkType`` for the NIC for all NIC ports. + + - This is a mandatory parameter. + - E.g.: if ``linkType = Infiniband`` then set ``LINK_TYPE_P1=IB``, and ``LINK_TYPE_P2=IB`` if a second PCI function is present + +- ``pciPerformanceOptimized``: performs PCI performance optimizations. If enabled then by default the following will happen: + + - Set the ``MAX_ACC_OUT_READ`` nvconfig parameter to ``0`` (use device defaults) + - Set PCI max read request size for each PF to ``4096`` (note: this is a runtime config and is not persistent) + - Users can override values via ``maxAccOutRead`` and ``maxReadRequest`` + - **IMPORTANT** : + + - According to the PRM, setting ``MAX_ACC_OUT_READ`` to zero enables the auto mode, which applies the most suitable optimizations. However, there is a bug in certain FW versions, where the zero value is not available. + - In this case, until the fix is available, ``MAX_ACC_OUT_READ`` will not be set and a warning event will be emitted for this device’s CR. + +- ``roceOptimized``: performs RoCE related optimizations. 
If enabled, performs the following by default: + + - Nvconfig set for both ports (can be applied from PF0) + + - Conditionally applied for second port if present + + - ``ROCE_CC_PRIO_MASK_P1=255``, ``ROCE_CC_PRIO_MASK_P2=255`` + - ``CNP_DSCP_P1=4``, ``CNP_DSCP_P2=4`` + - ``CNP_802P_PRIO_P1=6``, ``CNP_802P_PRIO_P2=6`` + + - Configure pfc (Priority Flow Control) for priority 3 and set trust to dscp on each PF + + - Non-persistent (needs to be applied after each boot) + - Users can override values via ``trust`` and ``pfc`` parameters + + - Can only be enabled with ``linkType=Ethernet`` + +- ``gpuDirectOptimized``: performs GPU Direct optimizations. Currently, only optimizations for the Baremetal environment are supported. If enabled, performs the following: + + - Set nvconfig ``ATS_ENABLED=0`` + - Can only be enabled when ``pciPerformanceOptimized`` is enabled + - Both the numeric values and their string aliases, supported by NVConfig, are allowed (e.g. ``REAL_TIME_CLOCK_ENABLE=False``, ``REAL_TIME_CLOCK_ENABLE=0``). + - For per-port parameters (suffix ``_P1``, ``_P2``), parameters with the ``_P2`` suffix are ignored if the device is single-port. + +- If a configuration is not set in spec, its non-volatile configuration parameters (if any) should be set to device default. diff --git a/docs/nic-conf-operator/crds.rst b/docs/nic-conf-operator/crds.rst new file mode 100644 index 00000000..75b7d018 --- /dev/null +++ b/docs/nic-conf-operator/crds.rst @@ -0,0 +1,548 @@ +Network Operator API reference v1alpha1 +======================================= + +Packages: + +- :ref:`configuration.net.nvidia.com/v1alpha1 ` + +.. _configuration.net.nvidia.com/v1alpha1: + +configuration.net.nvidia.com/v1alpha1 +------------------------------------- + +Package v1alpha1 contains API Schema definitions for the configuration.net v1alpha1 API group + +Resource Types: + +.. 
_ConfigurationTemplateSpec: + +ConfigurationTemplateSpec +~~~~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicConfigurationTemplateSpec `, :ref:`NicDeviceConfigurationSpec `) + +ConfigurationTemplateSpec is a set of configurations for the NICs + +.. container:: md-typeset__scrollwrap + + .. container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``numVfs`` | Number of VFs to be configured | + | int | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``linkType`` | LinkType to be configured, Ethernet|Infiniband | + | :ref:`LinkTypeEnum ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``pciPerformanceOptimized`` | PCI performance optimization settings | + | :ref:`PciPerformanceOptimizedSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``roceOptimized`` | RoCE optimization settings | + | :ref:`RoceOptimizedSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``gpuDirectOptimized`` | GPU Direct optimization 
settings | + | :ref:`GpuDirectOptimizedSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _FirmwareTemplateSpec: + +FirmwareTemplateSpec +~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicDeviceSpec `, :ref:`NicFirmwareTemplateSpec `) + +FirmwareTemplateSpec specifies a FW update policy for a given FW source ref + +.. container:: md-typeset__scrollwrap + + .. container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``nicFirmwareSourceRef`` | NicFirmwareSourceRef refers to existing NicFirmwareSource CR on where to get the FW from | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``updatePolicy`` | UpdatePolicy indicates whether the operator needs to validate installed FW or upgrade it | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _GpuDirectOptimizedSpec: + +GpuDirectOptimizedSpec +~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`ConfigurationTemplateSpec `) + +GpuDirectOptimizedSpec specifies GPU Direct optimization settings + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``enabled`` | Optimize GPU Direct | + | bool | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``env`` | GPU direct environment, e.g. Baremetal | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _LinkTypeEnum: + +LinkTypeEnum (``string`` alias) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`ConfigurationTemplateSpec `) + +LinkTypeEnum described the link type (Ethernet / Infiniband) + +.. _NicConfigurationTemplate: + +NicConfigurationTemplate +~~~~~~~~~~~~~~~~~~~~~~~~ + +NicConfigurationTemplate is the Schema for the nicconfigurationtemplates API + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +=================================================================================================================+===================================================================================================+ + | ``metadata`` | Refer to the Kubernetes API documentation for the fields of the ``metadata`` field. | + | `Kubernetes | | + | meta/v1.ObjectMeta `__ | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``spec`` | Defines the desired state of NICs | + | :ref:`NicConfigurationTemplateSpec ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``status`` | Defines the observed state of NicConfigurationTemplate | + | :ref:`NicTemplateStatus ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicConfigurationTemplateSpec: + +NicConfigurationTemplateSpec +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicConfigurationTemplate `) + +NicConfigurationTemplateSpec defines the desired state of NicConfigurationTemplate + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``nodeSelector`` | NodeSelector contains labels required on the node. When empty, the template will be applied to | + | map[string]string | matching devices on all nodes. | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``nicSelector`` | NIC selector configuration | + | :ref:`NicSelectorSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``resetToDefault`` | *(Optional)* | + | bool | ResetToDefault specifies whether node agent needs to perform a reset flow The following | + | | operations will be performed: \* Nvconfig reset of all non-volatile configurations - Mstconfig -d | + | | reset for each PF - Mstconfig -d set ADVANCED_PCI_SETTINGS=1 \* Node reboot - Applies new NIC NV | + | | config - Will undo any runtime configuration previously performed for the device/driver | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``template`` | Configuration template to be applied to matching devices | + | :ref:`ConfigurationTemplateSpec ` | | + 
+---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicDevice: + +NicDevice +~~~~~~~~~ + +NicDevice is the Schema for the nicdevices API + +.. container:: md-typeset__scrollwrap + + .. container:: md-typeset__table + + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +=================================================================================================================+===================================================================================================+ + | ``metadata`` | Refer to the Kubernetes API documentation for the fields of the ``metadata`` field. | + | `Kubernetes | | + | meta/v1.ObjectMeta `__ | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``spec`` | | + | :ref:`NicDeviceSpec ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``status`` | | + | :ref:`NicDeviceStatus ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicDeviceConfigurationSpec: + +NicDeviceConfigurationSpec +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicDeviceSpec `) + +NicDeviceConfigurationSpec contains desired configuration of the NIC + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``resetToDefault`` | ResetToDefault specifies whether node agent needs to perform a reset flow. In NIC Configuration | + | bool | Operator template v0.1.14 BF2/BF3 DPUs (not SuperNics) FW reset flow isn’t supported. The | + | | following operations will be performed: \* Nvconfig reset of all non-volatile configurations - | + | | Mstconfig -d reset for each PF - Mstconfig -d set ADVANCED_PCI_SETTINGS=1 \* Node reboot - | + | | Applies new NIC NV config - Will undo any runtime configuration previously performed for the | + | | device/driver | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``template`` | Configuration template applied from the NicConfigurationTemplate CR | + | :ref:`ConfigurationTemplateSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicDevicePortSpec: + +NicDevicePortSpec +~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicDeviceStatus `) + +NicDevicePortSpec describes the ports of the NIC + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``pci`` | PCI is a PCI address of the port, e.g. 0000:3b:00.0 | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``networkInterface`` | NetworkInterface is the name of the network interface for this port, e.g. eth1 | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``rdmaInterface`` | RdmaInterface is the name of the rdma interface for this port, e.g. mlx5_1 | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicDeviceSpec: + +NicDeviceSpec +~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicDevice `) + +NicDeviceSpec defines the desired state of NicDevice + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``configuration`` | Configuration specifies the configuration requested by NicConfigurationTemplate | + | :ref:`NicDeviceConfigurationSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``firmware`` | Firmware specifies the fw upgrade policy requested by NicFirmwareTemplate | + | :ref:`FirmwareTemplateSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicDeviceStatus: + +NicDeviceStatus +~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicDevice `) + +NicDeviceStatus defines the observed state of NicDevice + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===============================================================================================================+===================================================================================================+ + | ``node`` | Node where the device is located | + | string | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``type`` | Type of device, e.g. ConnectX7 | + | string | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``serialNumber`` | Serial number of the device, e.g. MT2116X09299 | + | string | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``partNumber`` | Part number of the device, e.g. MCX713106AEHEA_QP1 | + | string | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``psid`` | Product Serial ID of the device, e.g. 
MT_0000000221 | + | string | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``firmwareVersion`` | Firmware version currently installed on the device, e.g. 22.31.1014 | + | string | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``ports`` | List of ports for the device | + | :ref:`[]NicDevicePortSpec ` | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``conditions`` | List of conditions observed for the device | + | `[]Kubernetes | | + | meta/v1.Condition `__ | | + +---------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicFirmwareSource: + +NicFirmwareSource +~~~~~~~~~~~~~~~~~ + +NicFirmwareSource is the Schema for the nicfirmwaresources API + +.. container:: md-typeset__scrollwrap + + .. container:: md-typeset__table + + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +=================================================================================================================+===================================================================================================+ + | ``metadata`` | Refer to the Kubernetes API documentation for the fields of the ``metadata`` field. 
| + | `Kubernetes | | + | meta/v1.ObjectMeta `__ | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``spec`` | | + | :ref:`NicFirmwareSourceSpec ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``status`` | | + | :ref:`NicFirmwareSourceStatus ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicFirmwareSourceSpec: + +NicFirmwareSourceSpec +~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicFirmwareSource `) + +NicFirmwareSourceSpec represents a list of url sources for FW + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``binUrlSources`` | *(Optional)* | + | []string | BinUrlSources represents a list of url sources for ConnectX Firmware | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``bfbUrlSource`` | *(Optional)* | + | string | BFBUrlSource represents a url source for BlueField Bundle | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicFirmwareSourceStatus: + +NicFirmwareSourceStatus +~~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicFirmwareSource `) + +NicFirmwareSourceStatus represents the status of the FW from given sources, e.g. version available for PSIDs + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``state`` | State represents the firmware processing state | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``reason`` | Reason shows an error message if occurred | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``binaryVersions`` | Versions is a map of available FW binaries versions to PSIDs a PSID should have only a single FW | + | map[string][]string | version available for it | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``bfbVersions`` | BFBVersions represents the FW versions available in the provided BFB bundle | + | map[string]string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicFirmwareTemplate: + +NicFirmwareTemplate +~~~~~~~~~~~~~~~~~~~ + +NicFirmwareTemplate is the Schema for the nicfirmwaretemplates API + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +=================================================================================================================+===================================================================================================+ + | ``metadata`` | Refer to the Kubernetes API documentation for the fields of the ``metadata`` field. | + | `Kubernetes | | + | meta/v1.ObjectMeta `__ | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``spec`` | | + | :ref:`NicFirmwareTemplateSpec ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``status`` | | + | :ref:`NicTemplateStatus ` | | + +-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicFirmwareTemplateSpec: + +NicFirmwareTemplateSpec +~~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicFirmwareTemplate `) + +NicFirmwareTemplateSpec defines the FW templates and node/nic selectors for it + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``nodeSelector`` | NodeSelector contains labels required on the node. When empty, the template will be applied to | + | map[string]string | matching devices on all nodes. | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``nicSelector`` | NIC selector configuration | + | :ref:`NicSelectorSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``template`` | Firmware update template | + | :ref:`FirmwareTemplateSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicSelectorSpec: + +NicSelectorSpec +~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicConfigurationTemplateSpec `, :ref:`NicFirmwareTemplateSpec `) + +NicSelectorSpec is a desired configuration for NICs + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``nicType`` | Type of the NIC to be selected, e.g. 101d,1015,a2d6 etc. | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``pciAddresses`` | Array of PCI addresses to be selected, e.g. “0000:03:00.0” | + | []string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``serialNumbers`` | Serial numbers of the NICs to be selected, e.g. MT2116X09299 | + | []string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _NicTemplateStatus: + +NicTemplateStatus +~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`NicConfigurationTemplate `, :ref:`NicFirmwareTemplate `) + +NicTemplateStatus defines the observed state of NicConfigurationTemplate and NicFirmwareTemplate + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``nicDevices`` | NicDevice CRs matching this configuration / firmware template | + | []string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _PciPerformanceOptimizedSpec: + +PciPerformanceOptimizedSpec +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`ConfigurationTemplateSpec `) + +PciPerformanceOptimizedSpec specifies PCI performance optimization settings + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``enabled`` | Specifies whether to enable PCI performance optimization | + | bool | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``maxAccOutRead`` | Specifies the PCIe Max Accumulative Outstanding read bytes | + | int | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``maxReadRequest`` | Specifies the size of a single PCI read request in bytes | + | int | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _QosSpec: + +QosSpec +~~~~~~~ + +(*Appears on:* :ref:`RoceOptimizedSpec `) + +QosSpec specifies Quality of Service settings + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``trust`` | Trust mode for QoS settings, e.g. trust-dscp | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``pfc`` | Priority-based Flow Control configuration, e.g. “0,0,0,1,0,0,0,0” | + | string | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +.. _RoceOptimizedSpec: + +RoceOptimizedSpec +~~~~~~~~~~~~~~~~~ + +(*Appears on:* :ref:`ConfigurationTemplateSpec `) + +RoceOptimizedSpec specifies RoCE optimization settings + +.. container:: md-typeset__scrollwrap + + .. 
container:: md-typeset__table + + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Field | Description | + +===================================================================================================+===================================================================================================+ + | ``enabled`` | Optimize RoCE | + | bool | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | ``qos`` | Quality of Service settings | + | :ref:`QosSpec ` | | + +---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ diff --git a/docs/nic-conf-operator/nic-configuration-operator.rst b/docs/nic-conf-operator/nic-configuration-operator.rst new file mode 100644 index 00000000..3a81db1b --- /dev/null +++ b/docs/nic-conf-operator/nic-configuration-operator.rst @@ -0,0 +1,26 @@ +.. license-header + SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + SPDX-License-Identifier: Apache-2.0 + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. 
headings # #, * *, =, -, ^, " + +************************** +NIC Configuration Operator +************************** + +.. toctree:: + NIC Firmware Configuration <nic-fw-configuration> + Configuration Details <configuration-details> + CRD API Reference <crds> diff --git a/docs/nic-conf-operator/nic-fw-configuration.rst b/docs/nic-conf-operator/nic-fw-configuration.rst new file mode 100644 index 00000000..1fc6f02b --- /dev/null +++ b/docs/nic-conf-operator/nic-fw-configuration.rst @@ -0,0 +1,299 @@ +.. license-header + SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + SPDX-License-Identifier: Apache-2.0 + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. headings # #, * *, =, -, ^, " + +.. include:: ../common/vars.rst + +************************** +NIC Firmware Configuration +************************** + +.. contents:: On this page + :depth: 4 + :local: + :backlinks: none + + +=========================================================================== +Configure NIC Firmware using the NIC Configuration Operator +=========================================================================== +`NVIDIA NIC Configuration Operator <https://github.com/Mellanox/nic-configuration-operator>`_ provides a Kubernetes API (Custom Resource Definitions) for updating and configuring firmware on NVIDIA NICs in a coordinated manner. It deploys a configuration daemon on each of the desired nodes to configure the NVIDIA NICs there. NVIDIA NIC Configuration Operator uses `Maintenance Operator `_ to prepare a node for maintenance before the actual configuration. + +.. 
warning:: NVIDIA NIC Configuration Operator does not support FW reset flow for DPU mode. Check :doc:`limitations <../release-notes>`. + +.. note:: + To perform Firmware validation and update on NIC devices, NIC Configuration Operator requires a persistent storage set up in the cluster. + To set up a persistent NFS storage in the cluster, the `example from the CSI NFS Driver repository `_ might be used. + After deploying the NFS server and NFS CSI driver, the `storage class `_ should become available in the cluster. The name of the storage class should then be passed when configuring the NIC Configuration Operator. + +First install the Network Operator helm chart with the Maintenance Operator enabled and deploy a NIC Cluster Policy CRD with NIC Configuration Operator enabled: + +``values.yaml``: + +.. code-block:: yaml + + maintenanceOperator: + enabled: true + +``nicclusterpolicy.yaml``: + +.. code-block:: yaml + :substitutions: + + apiVersion: mellanox.com/v1alpha1 + kind: NicClusterPolicy + metadata: + name: nic-cluster-policy + spec: + nicConfigurationOperator: + operator: + image: nic-configuration-operator + repository: |nic-configuration-operator-repository| + version: |nic-configuration-operator-version| + configurationDaemon: + image: nic-configuration-operator-daemon + repository: |nic-configuration-operator-repository| + version: |nic-configuration-operator-version| + nicFirmwareStorage: + create: true + pvcName: nic-fw-storage-pvc + # Name of the storage class is provided by the user + storageClassName: nfs-csi + availableStorageSize: 1Gi + +Observe the NicDevice CRs detected in the cluster. The name of the CR is composed from the node name, NIC type and its serial number: + +.. code:: bash + + > kubectl get nicdevices -n nvidia-network-operator + + NAME AGE + node1-1015-mt1627x08307 1m + node1-101d-mt1952x03330 1m + node2-1015-mt1627x08305 1m + node2-101d-mt1952x03327 1m + +Discover more information about a specific device: + +.. 
code:: bash + + kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o yaml + +.. code-block:: yaml + + apiVersion: configuration.net.nvidia.com/v1alpha1 + kind: NicDevice + metadata: + creationTimestamp: "2024-09-21T08:43:08Z" + generation: 1 + name: node1-101d-mt1952x03327 + namespace: nvidia-network-operator + ownerReferences: + - apiVersion: v1 + kind: Node + name: node1 + uid: 25c4f4e2-f7ba-4ba9-9a87-8056313ffc79 + resourceVersion: "1177095" + uid: ac6763bf-67c6-4af5-81f8-1aad5da929bf + spec: {} + status: + conditions: + - type: FirmwareUpdateInProgress + status: "False" + reason: DeviceFirmwareSpecEmpty + message: Device firmware spec is empty, cannot update or validate firmware + lastTransitionTime: "2024-09-21T08:43:04Z" + - type: ConfigUpdateInProgress + status: "False" + reason: DeviceConfigSpecEmpty + message: Device configuration spec is empty, cannot update configuration + lastTransitionTime: "2024-09-21T08:43:08Z" + firmwareVersion: 22.39.1015 + node: node1 + partNumber: mcx623106ac-cdat + ports: + - networkInterface: enp3s0f0np0 + pci: "0000:03:00.0" + rdmaInterface: mlx5_0 + - networkInterface: enp3s0f1np1 + pci: "0000:03:00.1" + rdmaInterface: mlx5_1 + psid: mt_0000000436 + serialNumber: mt1952x03327 + type: 101d + +Configure and apply the NicFirmwareSource CR: + +.. code-block:: yaml + + apiVersion: configuration.net.nvidia.com/v1alpha1 + kind: NicFirmwareSource + metadata: + name: connectx6-dx-firmware-22-44-1036 + namespace: nvidia-network-operator + finalizers: + - configuration.net.nvidia.com/nic-configuration-operator + spec: + # a list of firmware binaries zip archives from the Mellanox website, can point to any url accessible from the cluster + binUrlSources: + - https://www.mellanox.com/downloads/firmware/fw-ConnectX6Dx-rel-22_44_1036-MCX623106AC-CDA_Ax-UEFI-14.37.14-FlexBoot-3.7.500.signed.bin.zip + +Observe the NicFirmwareSource status: + +.. 
code:: bash + + > kubectl get nicfirmwaresource -n nvidia-network-operator connectx6-dx-firmware-22-44-1036 -o yaml + + ... + status: + state: Success + binaryVersions: + 22.44.1036: + - mt_0000000436 + +Configure and apply the NicFirmwareTemplate CR: + +.. code-block:: yaml + + apiVersion: configuration.net.nvidia.com/v1alpha1 + kind: NicFirmwareTemplate + metadata: + name: connectx6dx-config + namespace: nvidia-network-operator + spec: + nodeSelector: + kubernetes.io/hostname: node1 + nicSelector: + nicType: "101d" + template: + nicFirmwareSourceRef: connectx6-dx-firmware-22-44-1036 + updatePolicy: Update + +Configure and apply the NicConfigurationTemplate CR: + +.. code-block:: yaml + + apiVersion: configuration.net.nvidia.com/v1alpha1 + kind: NicConfigurationTemplate + metadata: + name: connectx6-config + namespace: nvidia-network-operator + spec: + nodeSelector: + feature.node.kubernetes.io/network-sriov.capable: "true" + nicSelector: + # nicType selector is mandatory, the rest are optional. Only a single type can be specified. + nicType: 101d + pciAddresses: + - "0000:03:00.0" + - "0000:04:00.0" + serialNumbers: + - "mt1952x03327" + resetToDefault: false # if set, template is ignored, device configuration should reset + template: + # numVfs and linkType fields are mandatory, the rest are optional + numVfs: 2 + linkType: Ethernet + pciPerformanceOptimized: + enabled: true + maxReadRequest: 4096 + roceOptimized: + enabled: true + qos: + trust: dscp + pfc: "0,0,0,1,0,0,0,0" + gpuDirectOptimized: + enabled: true + env: Baremetal + +.. note:: Only one template of each kind (NicFirmwareTemplate or NicConfigurationTemplate) can be applied to a single device. If several templates of the same kind match a device, none of them is applied and an error event is emitted for the corresponding NicDevice CR. + +.. note:: To use the NIC Configuration Operator functionality together with SR-IOV Network Operator, "mellanox" `plugin should be disabled `_ in the SR-IOV Network Operator. 
+ +For more information about the CRD API, refer to :doc:`CRD API Reference <crds>`. +For detailed information about firmware parameters and configuration settings, refer to :doc:`Configuration Details <configuration-details>`. + +The spec of the NicDevice CR is updated in accordance with the NicFirmwareTemplate and NicConfigurationTemplate CRs matching the device: + +.. code-block:: bash + + > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.spec}' | yq -P + + template: + firmware: + nicFirmwareSourceRef: connectx6-dx-firmware-22-44-1036 + updatePolicy: Update + configuration: + numVfs: 2 + linkType: Ethernet + pciPerformanceOptimized: + enabled: true + roceOptimized: + enabled: true + qos: + trust: dscp + pfc: "0,0,0,1,0,0,0,0" + gpuDirectOptimized: + enabled: true + env: Baremetal + + +The status conditions of the NicDevice CR reflect the progress of the firmware and configuration updates and report any errors that occur during the process: + +.. code-block:: bash + + > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.status.conditions}' | yq -P + + - type: FirmwareUpdateInProgress + status: "False" + reason: DeviceFirmwareConfigMatch + message: Firmware matches the requested version + observedGeneration: 4 + lastTransitionTime: "2024-09-21T08:42:23Z" + - type: ConfigUpdateInProgress + status: "True" + reason: UpdateStarted + message: "" + lastTransitionTime: "2024-09-21T08:43:08Z" + +---------------------------------- +NIC Firmware Mismatch Notification +---------------------------------- + +NIC Configuration Operator sets the ``FirmwareConfigMatch`` status condition of the NicDevice CR based on the firmware currently installed on the NIC: + +.. 
code-block:: bash + + > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.status.conditions}' | yq -P + + - type: FirmwareConfigMatch + status: "True" + reason: DeviceFirmwareConfigMatch + message: Device firmware '20.42.1000' matches to recommended version '20.42.1000' + lastTransitionTime: "2024-09-21T08:43:10Z" + +The ``FirmwareConfigMatch`` condition status is set to ``Unknown`` if the DOCA-OFED Driver is not installed; otherwise, it indicates whether the current NIC firmware is the version recommended by the DOCA-OFED Driver. E.g.: + +.. code-block:: bash + + > kubectl get nicdevice -n nvidia-network-operator node1-101d-mt1952x03327 -o jsonpath='{.status.conditions}' | yq -P + + - type: FirmwareConfigMatch + status: "True" + reason: DeviceFirmwareConfigMatch + message: Device firmware '20.42.1000' matches to recommended version '20.42.1000' + lastTransitionTime: "2024-11-08T09:19:41Z" diff --git a/hack/api-docs/nic-conf-config.json b/hack/api-docs/nic-conf-config.json new file mode 100644 index 00000000..cd2ca823 --- /dev/null +++ b/hack/api-docs/nic-conf-config.json @@ -0,0 +1,38 @@ +{ + "hideMemberFields": [ + "TypeMeta" + ], + "hideTypePatterns": [ + "ParseError$", + "List$" + ], + "externalPackages": [ + { + "typeMatchPrefix": "^k8s\\.io/api/core/v1\\.EnvVar$", + "docsURLTemplate": "https://godoc.org/k8s.io/api/core/v1#EnvVar" + }, + { + "typeMatchPrefix": "^k8s\\.io/api/core/v1\\.ResourceList$", + "docsURLTemplate": "https://godoc.org/k8s.io/api/core/v1#ResourceList" + }, + { + "typeMatchPrefix": "^k8s\\.io/api/core/v1\\.NodeAffinity$", + "docsURLTemplate": "https://godoc.org/k8s.io/api/core/v1#NodeAffinity" + }, + { + "typeMatchPrefix": "^k8s\\.io/api/core/v1\\.Toleration$", + "docsURLTemplate": "https://godoc.org/k8s.io/api/core/v1#Toleration" + }, + { + "typeMatchPrefix": "^k8s\\.io/(api|apimachinery/pkg/apis)/", + "docsURLTemplate": "https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.30/#{{lower 
#!/bin/bash

# Fetch the "Configuration details" section of a markdown file hosted on GitHub
# (by default the NIC Configuration Operator README) and publish it as a
# reStructuredText page prefixed with the license header and a page title.
#
# Usage: ./hack/fetch-config-docs.sh [REPO] [BRANCH] [FILE_PATH] [OUTPUT_PATH]
#   REPO         GitHub "owner/name" slug      (default: Mellanox/nic-configuration-operator)
#   BRANCH       branch or tag to read from    (default: main)
#   FILE_PATH    markdown file inside the repo (default: README.md)
#   OUTPUT_PATH  rst file to create/overwrite  (default: docs/nic-conf-operator/configuration-details.rst)
#
# Requires: curl, awk, sed, pandoc.

# -e: stop on the first failing command; -u: reject unset variables;
# pipefail: a failure anywhere in a pipeline fails the whole pipeline.
set -euo pipefail

REPO=${1:-"Mellanox/nic-configuration-operator"}
BRANCH=${2:-"main"}
FILE_PATH=${3:-"README.md"}
OUTPUT_PATH=${4:-"docs/nic-conf-operator/configuration-details.rst"}

SOURCE_URL="https://raw.githubusercontent.com/${REPO}/${BRANCH}/${FILE_PATH}"

# Fail early with a readable message instead of a bare "command not found".
for tool in curl awk sed pandoc; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "error: required tool '${tool}' was not found in PATH" >&2
        exit 1
    fi
done

echo "Fetching configuration documentation from ${REPO}/${BRANCH}/${FILE_PATH}"

# Create output directory if it doesn't exist
mkdir -p "$(dirname "$OUTPUT_PATH")"

# Use a private scratch directory rather than fixed /tmp file names, so that
# concurrent runs cannot clobber each other; it is removed on every exit path.
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT

# Fetch the content from GitHub.
# -f makes curl exit non-zero on HTTP errors (e.g. 404 for a wrong branch),
# so an error page is never mistaken for documentation.
curl -fsSL "$SOURCE_URL" -o "${WORK_DIR}/config_docs.md"

# Extract the configuration details section from the markdown. The awk range
# includes the heading that starts the next section, so the last line is
# dropped. sed '$d' is the portable equivalent of GNU-only `head -n -1`.
awk '/#### Configuration details/,/### NicFirmwareSource/' "${WORK_DIR}/config_docs.md" \
    | sed '$d' > "${WORK_DIR}/config_section.md"

# Refuse to publish an empty page when the upstream headings were renamed.
if [ ! -s "${WORK_DIR}/config_section.md" ]; then
    echo "error: no '#### Configuration details' section found in ${SOURCE_URL}" >&2
    exit 1
fi

# Convert to reStructuredText format
pandoc "${WORK_DIR}/config_section.md" -f markdown -t rst --wrap=none > "${WORK_DIR}/config_section.rst"

# Assemble the final page in the scratch directory: license header and title,
# one blank separator line, then the converted section. OUTPUT_PATH is only
# replaced after every step above has succeeded.
cat > "${WORK_DIR}/final_output.rst" << 'EOF'
.. license-header
  SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  SPDX-License-Identifier: Apache-2.0

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.

.. headings # #, * *, =, -, ^, "

==========================================
Configuration Details
==========================================

EOF
echo "" >> "${WORK_DIR}/final_output.rst"
cat "${WORK_DIR}/config_section.rst" >> "${WORK_DIR}/final_output.rst"

mv "${WORK_DIR}/final_output.rst" "$OUTPUT_PATH"

echo "Configuration documentation saved to ${OUTPUT_PATH}"
echo "Source: https://github.com/${REPO}/blob/${BRANCH}/${FILE_PATH}#configuration-details"