diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go
index 2e7612cf3..c1f17022a 100644
--- a/api/nvidia/v1/clusterpolicy_types.go
+++ b/api/nvidia/v1/clusterpolicy_types.go
@@ -1632,20 +1632,20 @@ type VGPUDevicesConfigSpec struct {
 
 // CDIConfigSpec defines how the Container Device Interface is used in the cluster.
 type CDIConfigSpec struct {
-	// Enabled indicates whether CDI can be used to make GPUs accessible to containers.
+	// Enabled indicates whether CDI should be used as the mechanism for making GPUs accessible to containers.
 	// +kubebuilder:validation:Optional
 	// +kubebuilder:default=false
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as a mechanism for making GPUs accessible to containers"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as the mechanism for making GPUs accessible to containers"
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
 	Enabled *bool `json:"enabled,omitempty"`
 
-	// Default indicates whether to use CDI as the default mechanism for providing GPU access to containers.
+	// Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers.
 	// +kubebuilder:validation:Optional
 	// +kubebuilder:default=false
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Configure CDI as the default mechanism for making GPUs accessible to containers"
-	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Deprecated: This field is no longer used"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch,urn:alm:descriptor:com.tectonic.ui:hidden"
 	Default *bool `json:"default,omitempty"`
 }
diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml
index 0f8d48728..0836da07a 100644
--- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml
+++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml
@@ -536,6 +536,20 @@ spec:
              path: toolkit.imagePullPolicy
              x-descriptors:
                - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy'
+           - displayName: CDI
+             description: Container Device Interface (CDI) Configuration
+             path: cdi
+           - displayName: Enabled
+             description: 'Enabled indicates whether CDI should be used as the mechanism for making GPUs accessible to containers.'
+             path: cdi.enabled
+             x-descriptors:
+               - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch'
+           - displayName: Default
+             description: 'Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers.'
+             path: cdi.default
+             x-descriptors:
+               - 'urn:alm:descriptor:com.tectonic.ui:hidden'
+               - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch'
            - displayName: NVIDIA DCGM config
              description: NVIDIA DCGM config
              path: dcgm
diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml
index dfa368e9c..ffe836e7d 100644
--- a/bundle/manifests/nvidia.com_clusterpolicies.yaml
+++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml
@@ -136,13 +136,14 @@ spec:
             properties:
               default:
                 default: false
-                description: Default indicates whether to use CDI as the default
-                  mechanism for providing GPU access to containers.
+                description: 'Deprecated: This field is no longer used. Setting
+                  cdi.enabled=true will configure CDI as the default mechanism
+                  for making GPUs accessible to containers.'
                 type: boolean
               enabled:
                 default: false
-                description: Enabled indicates whether CDI can be used to make
-                  GPUs accessible to containers.
+                description: Enabled indicates whether CDI should be used as the
+                  mechanism for making GPUs accessible to containers.
                 type: boolean
             type: object
           daemonsets:
diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml
index dfa368e9c..ffe836e7d 100644
--- a/config/crd/bases/nvidia.com_clusterpolicies.yaml
+++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml
@@ -136,13 +136,14 @@ spec:
             properties:
               default:
                 default: false
-                description: Default indicates whether to use CDI as the default
-                  mechanism for providing GPU access to containers.
+                description: 'Deprecated: This field is no longer used. Setting
+                  cdi.enabled=true will configure CDI as the default mechanism
+                  for making GPUs accessible to containers.'
                 type: boolean
               enabled:
                 default: false
-                description: Enabled indicates whether CDI can be used to make
-                  GPUs accessible to containers.
+                description: Enabled indicates whether CDI should be used as the
+                  mechanism for making GPUs accessible to containers.
                 type: boolean
             type: object
           daemonsets:
diff --git a/controllers/object_controls.go b/controllers/object_controls.go
index 46b65f6df..0b8c57552 100644
--- a/controllers/object_controls.go
+++ b/controllers/object_controls.go
@@ -180,6 +180,8 @@ const (
 	// DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path
 	// of the driver install dir mounted in the container
 	DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
+	// NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime
+	NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"
 )
 
 // ContainerProbe defines container probe types
@@ -939,7 +941,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
 	}
 
 	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)
 
 	// update env required for MIG support
 	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
@@ -1195,6 +1197,28 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar {
 	return envVars
 }
 
+func transformToolkitForCDI(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) {
+	if !config.CDI.IsEnabled() {
+		return
+	}
+
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config")
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi")
+
+	if !n.runtimeSupportsCDI {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/")
+	}
+
+	// When the container runtime supports CDI, we do not configure 'nvidia' as the default runtime.
+	// Instead, we leverage native CDI support in containerd / cri-o to inject GPUs into workloads.
+	// The 'nvidia' runtime will be set as the runtime class for our management containers so that they
+	// get access to all GPUs.
+	if n.runtimeSupportsCDI {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaRuntimeSetAsDefaultEnvName, "false")
+	}
+}
+
 // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy
 func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
 	// update validation container
@@ -1233,14 +1257,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
 	}
 
 	// update env required for CDI support
-	if config.CDI.IsEnabled() {
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config")
-		if config.CDI.IsDefault() {
-			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi")
-		}
-	}
+	transformToolkitForCDI(obj, config, n)
 
 	// set install directory for the toolkit
 	if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir {
@@ -1352,6 +1369,29 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
 	return nil
 }
 
+func transformDevicePluginForCDI(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) {
+	if !config.CDI.IsEnabled() {
+		return
+	}
+
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
+	if config.Toolkit.IsEnabled() {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
+	}
+
+	// When the container runtime supports CDI, we leverage native CDI support in containerd / cri-o
+	// to inject GPUs into workloads. If native CDI is not supported, we leverage CDI support in
+	// NVIDIA Container Toolkit.
+	deviceListStrategy := "cdi-cri"
+	cdiAnnotationPrefix := "cdi.k8s.io/"
+	if !n.runtimeSupportsCDI {
+		deviceListStrategy = "envvar,cdi-annotations"
+		cdiAnnotationPrefix = "nvidia.cdi.k8s.io/"
+	}
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, deviceListStrategy)
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, cdiAnnotationPrefix)
+}
+
 // TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy
 func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
 	// update validation container
@@ -1406,20 +1446,13 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
 	}
 
 	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)
 
 	// update env required for MIG support
 	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
 
 	// update env required for CDI support
-	if config.CDI.IsEnabled() {
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/")
-		if config.Toolkit.IsEnabled() {
-			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
-		}
-	}
+	transformDevicePluginForCDI(obj, config, n)
 
 	// update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured
 	if config.DevicePlugin.MPS != nil && config.DevicePlugin.MPS.Root != "" &&
@@ -1494,7 +1527,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic
 	}
 
 	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)
 
 	// update env required for MIG support
 	applyMIGConfiguration(mainContainer, config.MIG.Strategy)
@@ -1608,7 +1641,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
 	}
 
 	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)
 
 	// mount configmap for custom metrics if provided by user
 	if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
@@ -1725,7 +1758,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
 	}
 
 	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)
 
 	return nil
 }
@@ -1775,7 +1808,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
 	}
 
 	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)
 
 	// set ConfigMap name for "mig-parted-config" Volume
 	for i, vol := range obj.Spec.Template.Spec.Volumes {
@@ -2060,7 +2093,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
 	}
 
 	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)
 
 	var validatorErr error
 	// apply changes for individual component validators(initContainers)
@@ -2392,13 +2425,15 @@ func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string {
 	return DefaultRuntimeClass
 }
 
-func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) {
-	if runtime == gpuv1.Containerd {
-		if runtimeClass == "" {
-			runtimeClass = DefaultRuntimeClass
-		}
-		podSpec.RuntimeClassName = &runtimeClass
+func setRuntimeClass(podSpec *corev1.PodSpec, n ClusterPolicyController, runtimeClass string) {
+	if !n.singleton.Spec.CDI.IsEnabled() && n.runtime != gpuv1.Containerd {
+		return
+	}
+
+	if runtimeClass == "" {
+		runtimeClass = DefaultRuntimeClass
 	}
+	podSpec.RuntimeClassName = &runtimeClass
 }
 
 func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) {
diff --git a/controllers/state_manager.go b/controllers/state_manager.go
index d8f3ef7e6..f7318d383 100644
--- a/controllers/state_manager.go
+++ b/controllers/state_manager.go
@@ -161,10 +161,11 @@ type ClusterPolicyController struct {
 	openshift        string
 	ocpDriverToolkit OpenShiftDriverToolkit
 
-	runtime        gpuv1.Runtime
-	hasGPUNodes    bool
-	hasNFDLabels   bool
-	sandboxEnabled bool
+	runtime            gpuv1.Runtime
+	runtimeSupportsCDI bool
+	hasGPUNodes        bool
+	hasNFDLabels       bool
+	sandboxEnabled     bool
 }
 
 func addState(n *ClusterPolicyController, path string) {
@@ -580,7 +581,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) {
 	return clusterHasNFDLabels, gpuNodesTotal, nil
 }
 
-func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
+func getRuntimeVersionString(node corev1.Node) (gpuv1.Runtime, string, error) {
 	// ContainerRuntimeVersion string will look like <runtime>://<version>
 	runtimeVer := node.Status.NodeInfo.ContainerRuntimeVersion
 	var runtime gpuv1.Runtime
@@ -592,9 +593,11 @@ func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
 	case strings.HasPrefix(runtimeVer, "cri-o"):
 		runtime = gpuv1.CRIO
 	default:
-		return "", fmt.Errorf("runtime not recognized: %s", runtimeVer)
+		return "", "", fmt.Errorf("runtime not recognized: %s", runtimeVer)
 	}
-	return runtime, nil
+	version := strings.SplitAfter(runtimeVer, "//")[1]
+	vVersion := "v" + strings.TrimPrefix(version, "v")
+	return runtime, vVersion, nil
 }
 
 func (n *ClusterPolicyController) setPodSecurityLabelsForNamespace() error {
@@ -706,13 +709,14 @@ func (n *ClusterPolicyController) ocpEnsureNamespaceMonitoring() error {
 	return nil
 }
 
-// getRuntime will detect the container runtime used by nodes in the
-// cluster and correctly set the value for clusterPolicyController.runtime
-// For openshift, set runtime to crio. Otherwise, the default runtime is
-// containerd -- if >=1 node is configured with containerd, set
-// clusterPolicyController.runtime = containerd
-func (n *ClusterPolicyController) getRuntime() error {
+// getContainerRuntimeInfo will detect the container runtime version used by nodes
+// in the cluster and correctly set the value for clusterPolicyController.runtime
+// and clusterPolicyController.runtimeSupportsCDI. On OpenShift, the runtime
+// is always assumed to be cri-o. We assume the runtime supports CDI unless
+// containerd < 1.7.0 is detected.
+func (n *ClusterPolicyController) getContainerRuntimeInfo() error {
 	ctx := n.ctx
+	n.runtimeSupportsCDI = true
 	// assume crio for openshift clusters
 	if n.openshift != "" {
 		n.runtime = gpuv1.CRIO
@@ -725,27 +729,26 @@
 	list := &corev1.NodeList{}
 	err := n.client.List(ctx, list, opts...)
 	if err != nil {
-		return fmt.Errorf("Unable to list nodes prior to checking container runtime: %v", err)
+		return fmt.Errorf("failed to list nodes: %w", err)
 	}
 
 	var runtime gpuv1.Runtime
-	for _, node := range list.Items {
-		rt, err := getRuntimeString(node)
+	for i, node := range list.Items {
+		rt, version, err := getRuntimeVersionString(node)
 		if err != nil {
-			n.logger.Info(fmt.Sprintf("Unable to get runtime info for node %s: %v", node.Name, err))
-			continue
+			return fmt.Errorf("failed to get runtime info for node %s: %w", node.Name, err)
+		}
+		if i == 0 {
+			runtime = rt
+		} else if rt != runtime {
+			n.logger.Error(nil, "Different runtimes on different worker nodes is not supported")
+			return fmt.Errorf("different runtimes on different worker nodes is not supported")
 		}
-		runtime = rt
-		if runtime == gpuv1.Containerd {
-			// default to containerd if >=1 node running containerd
-			break
+		if runtime == gpuv1.Containerd && semver.Compare(version, "v1.7.0") < 0 {
+			n.runtimeSupportsCDI = false
 		}
 	}
-	if runtime.String() == "" {
-		n.logger.Info("Unable to get runtime info from the cluster, defaulting to containerd")
-		runtime = gpuv1.Containerd
-	}
 	n.runtime = runtime
 	return nil
 }
@@ -868,11 +871,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
 	}
 
 	// detect the container runtime on worker nodes
-	err = n.getRuntime()
+	err = n.getContainerRuntimeInfo()
 	if err != nil {
-		return err
+		return fmt.Errorf("failed to get container runtime info: %w", err)
 	}
 	n.logger.Info(fmt.Sprintf("Using container runtime: %s", n.runtime.String()))
+	n.logger.Info(fmt.Sprintf("Container runtime supports CDI: %t", n.runtimeSupportsCDI))
 
 	// fetch all kernel versions from the GPU nodes in the cluster
 	if n.singleton.Spec.Driver.IsEnabled() && n.singleton.Spec.Driver.UsePrecompiledDrivers() {
diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go
index bdec856e0..2dd792526 100644
--- a/controllers/state_manager_test.go
+++ b/controllers/state_manager_test.go
@@ -19,36 +19,56 @@ package controllers
 import (
 	"testing"
 
+	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 
 	gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
 )
 
-func TestGetRuntimeString(t *testing.T) {
+func TestGetRuntimeVersionString(t *testing.T) {
 	testCases := []struct {
 		description     string
 		runtimeVer      string
 		expectedRuntime gpuv1.Runtime
+		expectedVersion string
+		errorExpected   bool
 	}{
 		{
 			"containerd",
 			"containerd://1.0.0",
 			gpuv1.Containerd,
+			"v1.0.0",
+			false,
 		},
 		{
 			"docker",
 			"docker://1.0.0",
 			gpuv1.Docker,
+			"v1.0.0",
+			false,
 		},
 		{
 			"crio",
 			"cri-o://1.0.0",
 			gpuv1.CRIO,
+			"v1.0.0",
+			false,
 		},
 		{
 			"unknown",
 			"unknown://1.0.0",
 			"",
+			"v1.0.0",
+			true,
+		},
+		{
+			"containerd with v prefix",
+			"containerd://v1.0.0",
+			gpuv1.Containerd,
+			"v1.0.0",
+			false,
 		},
 	}
 
@@ -61,11 +81,142 @@ func TestGetRuntimeString(t *testing.T) {
 				},
 			},
 		}
-			runtime, _ := getRuntimeString(node)
-			// TODO: update to use require pkg after MR !311 is merged
-			if runtime != tc.expectedRuntime {
-				t.Errorf("expected %s but got %s", tc.expectedRuntime.String(), runtime.String())
+			runtime, version, err := getRuntimeVersionString(node)
+			if tc.errorExpected {
+				require.Error(t, err)
+				return
+			}
+			require.NoError(t, err)
+			require.EqualValues(t, tc.expectedRuntime, runtime)
+			require.EqualValues(t, tc.expectedVersion, version)
+		})
+	}
+}
+
+func TestGetContainerRuntimeInfo(t *testing.T) {
+	testCases := []struct {
+		description        string
+		ctrl               *ClusterPolicyController
+		expectedRuntime    gpuv1.Runtime
+		runtimeSupportsCDI bool
+		errorExpected      bool
+	}{
+		{
+			description: "containerd",
+			ctrl: &ClusterPolicyController{
+				client: fake.NewFakeClient(
+					&corev1.Node{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:   "node-1",
+							Labels: map[string]string{commonGPULabelKey: "true"},
+						},
+						Status: corev1.NodeStatus{
+							NodeInfo: corev1.NodeSystemInfo{ContainerRuntimeVersion: "containerd://1.7.0"},
+						},
+					}),
+			},
+			expectedRuntime:    gpuv1.Containerd,
+			runtimeSupportsCDI: true,
+			errorExpected:      false,
+		},
+		{
+			description: "cri-o",
+			ctrl: &ClusterPolicyController{
+				client: fake.NewFakeClient(
+					&corev1.Node{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:   "node-1",
+							Labels: map[string]string{commonGPULabelKey: "true"},
+						},
+						Status: corev1.NodeStatus{
+							NodeInfo: corev1.NodeSystemInfo{ContainerRuntimeVersion: "cri-o://1.30.0"},
+						},
+					}),
+			},
+			expectedRuntime:    gpuv1.CRIO,
+			runtimeSupportsCDI: true,
+			errorExpected:      false,
+		},
+		{
+			description: "openshift",
+			ctrl: &ClusterPolicyController{
+				openshift: "1.0.0",
+			},
+			expectedRuntime:    gpuv1.CRIO,
+			runtimeSupportsCDI: true,
+			errorExpected:      false,
+		},
+		{
+			description: "containerd, multiple nodes, cdi not supported",
+			ctrl: &ClusterPolicyController{
+				client: fake.NewFakeClient(
+					&corev1.Node{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:   "node-1",
+							Labels: map[string]string{commonGPULabelKey: "true"},
+						},
+						Status: corev1.NodeStatus{
+							NodeInfo: corev1.NodeSystemInfo{
+								ContainerRuntimeVersion: "containerd://1.7.0",
+							},
+						},
+					},
+					&corev1.Node{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:   "node-2",
+							Labels: map[string]string{commonGPULabelKey: "true"},
+						},
+						Status: corev1.NodeStatus{
+							NodeInfo: corev1.NodeSystemInfo{
+								ContainerRuntimeVersion: "containerd://1.6.30",
+							},
+						},
+					}),
+			},
+			expectedRuntime:    gpuv1.Containerd,
+			runtimeSupportsCDI: false,
+			errorExpected:      false,
+		},
+		{
+			description: "multiple nodes, different runtimes",
+			ctrl: &ClusterPolicyController{
+				client: fake.NewFakeClient(
+					&corev1.Node{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:   "node-1",
+							Labels: map[string]string{commonGPULabelKey: "true"},
+						},
+						Status: corev1.NodeStatus{
+							NodeInfo: corev1.NodeSystemInfo{
+								ContainerRuntimeVersion: "containerd://1.7.0",
+							},
+						},
+					},
+					&corev1.Node{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:   "node-2",
+							Labels: map[string]string{commonGPULabelKey: "true"},
+						},
+						Status: corev1.NodeStatus{
+							NodeInfo: corev1.NodeSystemInfo{
+								ContainerRuntimeVersion: "cri-o://1.30.0",
+							},
+						},
+					}),
+			},
+			errorExpected: true,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			err := tc.ctrl.getContainerRuntimeInfo()
+			if tc.errorExpected {
+				require.Error(t, err)
+				return
 			}
+			require.NoError(t, err)
+			require.EqualValues(t, tc.expectedRuntime, tc.ctrl.runtime)
+			require.EqualValues(t, tc.runtimeSupportsCDI, tc.ctrl.runtimeSupportsCDI)
 		})
 	}
 }
diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go
index 83b504b7e..14458d6b0 100644
--- a/controllers/transforms_test.go
+++ b/controllers/transforms_test.go
@@ -1096,7 +1096,11 @@ func TestTransformValidator(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
-			err := TransformValidator(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{runtime: gpuv1.Containerd, logger: ctrl.Log.WithName("test")})
+			err := TransformValidator(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{
+				runtime:   gpuv1.Containerd,
+				logger:    ctrl.Log.WithName("test"),
+				singleton: &gpuv1.ClusterPolicy{Spec: *tc.cpSpec},
+			})
 			if tc.errorExpected {
 				require.Error(t, err)
 				return
@@ -1163,3 +1167,177 @@ func TestTransformSandboxValidator(t *testing.T) {
 		})
 	}
 }
+
+func TestTransformToolkitForCDI(t *testing.T) {
+	testCases := []struct {
+		description string
+		ds          Daemonset
+		cpSpec      *gpuv1.ClusterPolicySpec
+		ctrl        ClusterPolicyController
+		expectedDs  Daemonset
+	}{
+		{
+			description: "cdi disabled",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(false),
+				},
+			},
+			ctrl:       ClusterPolicyController{},
+			expectedDs: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+		},
+		{
+			description: "cdi enabled, runtime does not support cdi",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(true),
+				},
+			},
+			ctrl: ClusterPolicyController{},
+			expectedDs: NewDaemonset().WithContainer(
+				corev1.Container{
+					Name: "main-ctr",
+					Env: []corev1.EnvVar{
+						{Name: CDIEnabledEnvName, Value: "true"},
+						{Name: CrioConfigModeEnvName, Value: "config"},
+						{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
+						{Name: NvidiaCtrRuntimeCDIPrefixesEnvName, Value: "nvidia.cdi.k8s.io/"},
+					},
+				}),
+		},
+		{
+			description: "cdi enabled, runtime supports cdi",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(true),
+				},
+			},
+			ctrl: ClusterPolicyController{
+				runtimeSupportsCDI: true,
+			},
+			expectedDs: NewDaemonset().WithContainer(
+				corev1.Container{
+					Name: "main-ctr",
+					Env: []corev1.EnvVar{
+						{Name: CDIEnabledEnvName, Value: "true"},
+						{Name: CrioConfigModeEnvName, Value: "config"},
+						{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
+						{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
+					},
+				}),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			transformToolkitForCDI(tc.ds.DaemonSet, tc.cpSpec, tc.ctrl)
+			require.EqualValues(t, tc.expectedDs, tc.ds)
+		})
+	}
+}
+
+func TestTransformDevicePluginForCDI(t *testing.T) {
+	testCases := []struct {
+		description string
+		ds          Daemonset
+		cpSpec      *gpuv1.ClusterPolicySpec
+		ctrl        ClusterPolicyController
+		expectedDs  Daemonset
+	}{
+		{
+			description: "cdi disabled",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(false),
+				},
+			},
+			ctrl:       ClusterPolicyController{},
+			expectedDs: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+		},
+		{
+			description: "cdi enabled, toolkit disabled, runtime supports cdi",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(true),
+				},
+				Toolkit: gpuv1.ToolkitSpec{
+					Enabled: newBoolPtr(false),
+				},
+			},
+			ctrl: ClusterPolicyController{
+				runtimeSupportsCDI: true,
+			},
+			expectedDs: NewDaemonset().WithContainer(
+				corev1.Container{
+					Name: "main-ctr",
+					Env: []corev1.EnvVar{
+						{Name: CDIEnabledEnvName, Value: "true"},
+						{Name: DeviceListStrategyEnvName, Value: "cdi-cri"},
+						{Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"},
+					},
+				}),
+		},
+		{
+			description: "cdi enabled, toolkit disabled, runtime does not support cdi",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(true),
+				},
+				Toolkit: gpuv1.ToolkitSpec{
+					Enabled: newBoolPtr(false),
+				},
+			},
+			ctrl: ClusterPolicyController{
+				runtimeSupportsCDI: false,
+			},
+			expectedDs: NewDaemonset().WithContainer(
+				corev1.Container{
+					Name: "main-ctr",
+					Env: []corev1.EnvVar{
+						{Name: CDIEnabledEnvName, Value: "true"},
+						{Name: DeviceListStrategyEnvName, Value: "envvar,cdi-annotations"},
+						{Name: CDIAnnotationPrefixEnvName, Value: "nvidia.cdi.k8s.io/"},
+					},
+				}),
+		},
+		{
+			description: "cdi enabled, toolkit enabled, runtime supports cdi",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(true),
+				},
+				Toolkit: gpuv1.ToolkitSpec{
+					Enabled:    newBoolPtr(true),
+					InstallDir: "/path/to/install",
+				},
+			},
+			ctrl: ClusterPolicyController{
+				runtimeSupportsCDI: true,
+			},
+			expectedDs: NewDaemonset().WithContainer(
+				corev1.Container{
+					Name: "main-ctr",
+					Env: []corev1.EnvVar{
+						{Name: CDIEnabledEnvName, Value: "true"},
+						{Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"},
+						{Name: DeviceListStrategyEnvName, Value: "cdi-cri"},
+						{Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"},
+					},
+				}),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			transformDevicePluginForCDI(tc.ds.DaemonSet, tc.cpSpec, tc.ctrl)
+			require.EqualValues(t, tc.expectedDs, tc.ds)
+		})
+	}
+}
diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
index dfa368e9c..ffe836e7d 100644
--- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
+++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
@@ -136,13 +136,14 @@ spec:
             properties:
               default:
                 default: false
-                description: Default indicates whether to use CDI as the default
-                  mechanism for providing GPU access to containers.
+                description: 'Deprecated: This field is no longer used. Setting
+                  cdi.enabled=true will configure CDI as the default mechanism
+                  for making GPUs accessible to containers.'
                 type: boolean
               enabled:
                 default: false
-                description: Enabled indicates whether CDI can be used to make
-                  GPUs accessible to containers.
+                description: Enabled indicates whether CDI should be used as the
+                  mechanism for making GPUs accessible to containers.
                 type: boolean
             type: object
          daemonsets:
diff --git a/tests/scripts/enable-cdi.sh b/tests/scripts/enable-cdi.sh
new file mode 100755
index 000000000..3647aa5e1
--- /dev/null
+++ b/tests/scripts/enable-cdi.sh
@@ -0,0 +1,12 @@
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source ${SCRIPT_DIR}/.definitions.sh
+
+# Import the check definitions
+source ${SCRIPT_DIR}/checks.sh
+
+kubectl patch clusterpolicy/cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/cdi/enabled", "value": true}]'
+if [ "$?" -ne 0 ]; then
+    echo "failed to enable CDI in clusterpolicy"
+    exit 1
+fi
+sleep 5
diff --git a/tests/scripts/end-to-end.sh b/tests/scripts/end-to-end.sh
index 46fceb949..39c384dd9 100755
--- a/tests/scripts/end-to-end.sh
+++ b/tests/scripts/end-to-end.sh
@@ -19,6 +19,7 @@ echo "--------------------------------------------------------------------------"
 # Install a workload and verify that this works as expected
 "${SCRIPT_DIR}"/install-workload.sh
 "${SCRIPT_DIR}"/verify-workload.sh
+"${SCRIPT_DIR}"/uninstall-workload.sh
 
 echo ""
 echo ""
@@ -27,6 +28,16 @@ echo "--------------------------------------------------------------------------"
 # Test updates through ClusterPolicy
 "${SCRIPT_DIR}"/update-clusterpolicy.sh
 
+echo ""
+echo ""
+echo "--------------CDI Tests--------------------------------------------"
+echo "------------------------------------------------------------------------------------"
+"${SCRIPT_DIR}"/enable-cdi.sh
+"${SCRIPT_DIR}"/verify-operator.sh
+"${SCRIPT_DIR}"/install-workload.sh
+"${SCRIPT_DIR}"/verify-workload.sh
+"${SCRIPT_DIR}"/uninstall-workload.sh
+
 echo ""
 echo ""
 echo "--------------GPU Operator Restart Test---------------------------------------------"
@@ -43,8 +54,7 @@ test_restart_operator ${TEST_NAMESPACE} ${CONTAINER_RUNTIME}
 "${SCRIPT_DIR}"/enable-operands.sh
 "${SCRIPT_DIR}"/verify-operator.sh
 
-# Uninstall the workload and operator
-"${SCRIPT_DIR}"/uninstall-workload.sh
+# Uninstall the operator
 "${SCRIPT_DIR}"/uninstall-operator.sh
 
 echo ""
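
With this change, enabling CDI on a cluster comes down to the single cdi.enabled flag on the ClusterPolicy; cdi.default is deprecated and ignored. A minimal illustrative excerpt is sketched below, assuming only the field names introduced by the CRD changes in this diff and the cluster-policy resource name used by tests/scripts/enable-cdi.sh; it is not part of the diff itself.

    apiVersion: nvidia.com/v1
    kind: ClusterPolicy
    metadata:
      name: cluster-policy
    spec:
      cdi:
        enabled: true   # cdi.default no longer needs to be set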