diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 7d4577928..7fdb2613e 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -958,7 +958,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol return err } - setRuntimeClassName(&obj.Spec.Template.Spec, config) + setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1241,6 +1241,7 @@ func transformToolkitCtrForCDI(container *corev1.Container) { setContainerEnv(container, CDIEnabledEnvName, "true") setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") + setContainerEnv(container, CRIOConfigModeEnvName, "config") } // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy @@ -1283,6 +1284,18 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { transformToolkitCtrForCDI(toolkitMainContainer) + } else if n.runtime == gpuv1.CRIO { + // (cdesiniotis) When CDI is not enabled and cri-o is the container runtime, + // we continue to install the OCI prestart hook as opposed to adding nvidia + // runtime handlers to the cri-o configuration. Users can override this behavior + // and have nvidia runtime handlers added to the cri-o configuration by setting + // the 'CRIO_CONFIG_MODE' environment variable to 'config' in the toolkit container. + // However, one should note setting 'CRIO_CONFIG_MODE' to 'config' in this case + // (when CDI is not enabled) would result in the 'nvidia' runtime being set as + // the default runtime. While this should work in theory, it is a significant + // change -- which was the primary motivation to continue using the OCI prestart + // hook by default in this case. + setContainerEnv(toolkitMainContainer, CRIOConfigModeEnvName, "hook") } // set install directory for the toolkit @@ -1337,11 +1350,6 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config)) } - if runtime == gpuv1.CRIO.String() { - // We add the nvidia runtime to the cri-o config by default instead of installing the OCI prestart hook - setContainerEnv(container, CRIOConfigModeEnvName, "config") - } - // For runtime config files we have top-level configs and drop-in files. // These are supported as follows: // * Docker only supports top-level config files. @@ -1517,7 +1525,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe return err } - setRuntimeClassName(&obj.Spec.Template.Spec, config) + setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // update env required for MIG support applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy) @@ -1597,7 +1605,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic return err } - setRuntimeClassName(&obj.Spec.Template.Spec, config) + setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // update env required for MIG support applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy) @@ -1705,7 +1713,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } } - setRuntimeClassName(&obj.Spec.Template.Spec, config) + setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // set hostPID if specified for DCGM Exporter if config.DCGMExporter.IsHostPIDEnabled() { @@ -1830,7 +1838,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } } - setRuntimeClassName(&obj.Spec.Template.Spec, config) + setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) return nil } @@ -1872,7 +1880,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args } - setRuntimeClassName(&obj.Spec.Template.Spec, config) + setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -2166,7 +2174,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return fmt.Errorf("%v", err) } - setRuntimeClassName(&obj.Spec.Template.Spec, config) + setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) var validatorErr error // apply changes for individual component validators(initContainers) @@ -2540,7 +2548,10 @@ func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string { return DefaultRuntimeClass } -func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec) { +func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) { + if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO { + return + } runtimeClassName := getRuntimeClassName(config) podSpec.RuntimeClassName = &runtimeClassName } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 35300c6c3..3fee7d685 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -478,7 +478,6 @@ func TestTransformForRuntime(t *testing.T) { Name: "test-ctr", Env: []corev1.EnvVar{ {Name: "RUNTIME", Value: gpuv1.CRIO.String()}, - {Name: CRIOConfigModeEnvName, Value: "config"}, {Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"}, @@ -775,12 +774,14 @@ func TestTransformToolkit(t *testing.T) { description string ds Daemonset // Input DaemonSet cpSpec *gpuv1.ClusterPolicySpec // Input configuration - expectedDs Daemonset // Expected output DaemonSet + runtime gpuv1.Runtime + expectedDs Daemonset // Expected output DaemonSet }{ { description: "transform nvidia-container-toolkit-ctr container", ds: NewDaemonset(). WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}), + runtime: gpuv1.Containerd, cpSpec: &gpuv1.ClusterPolicySpec{ Toolkit: gpuv1.ToolkitSpec{ Repository: "nvcr.io/nvidia/cloud-native", @@ -822,6 +823,7 @@ func TestTransformToolkit(t *testing.T) { {Name: CDIEnabledEnvName, Value: "true"}, {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, + {Name: CRIOConfigModeEnvName, Value: "config"}, {Name: "foo", Value: "bar"}, {Name: "RUNTIME", Value: "containerd"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -847,6 +849,7 @@ func TestTransformToolkit(t *testing.T) { description: "transform nvidia-container-toolkit-ctr container with custom ctr runtime socket", ds: NewDaemonset(). WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}), + runtime: gpuv1.Containerd, cpSpec: &gpuv1.ClusterPolicySpec{ Toolkit: gpuv1.ToolkitSpec{ Repository: "nvcr.io/nvidia/cloud-native", @@ -899,6 +902,7 @@ func TestTransformToolkit(t *testing.T) { {Name: CDIEnabledEnvName, Value: "true"}, {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, + {Name: CRIOConfigModeEnvName, Value: "config"}, {Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -920,12 +924,84 @@ func TestTransformToolkit(t *testing.T) { WithHostPathVolume("containerd-socket", "/run/k3s/containerd", nil). WithPullSecret("pull-secret"), }, + { + description: "transform nvidia-container-toolkit-ctr container, cri-o runtime, cdi enabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}), + runtime: gpuv1.CRIO, + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "nvidia-container-toolkit", + Version: "v1.0.0", + }, + }, + expectedDs: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "nvidia-container-toolkit-ctr", + Image: "nvcr.io/nvidia/cloud-native/nvidia-container-toolkit:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, + {Name: CRIOConfigModeEnvName, Value: "config"}, + {Name: "RUNTIME", Value: gpuv1.CRIO.String()}, + {Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"}, + {Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"}, + {Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"}, + {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"}, + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir}, + {Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"}, + }, + }). + WithHostPathVolume("crio-config", "/etc/crio", newHostPathType(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", newHostPathType(corev1.HostPathDirectoryOrCreate)), + }, + { + description: "transform nvidia-container-toolkit-ctr container, cri-o runtime, cdi disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}), + runtime: gpuv1.CRIO, + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "nvidia-container-toolkit", + Version: "v1.0.0", + }, + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectedDs: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "nvidia-container-toolkit-ctr", + Image: "nvcr.io/nvidia/cloud-native/nvidia-container-toolkit:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: CRIOConfigModeEnvName, Value: "hook"}, + {Name: "RUNTIME", Value: gpuv1.CRIO.String()}, + {Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"}, + {Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"}, + {Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"}, + {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"}, + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir}, + {Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"}, + }, + }). + WithHostPathVolume("crio-config", "/etc/crio", newHostPathType(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", newHostPathType(corev1.HostPathDirectoryOrCreate)), + }, } for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { controller := ClusterPolicyController{ - runtime: gpuv1.Containerd, + runtime: tc.runtime, logger: ctrl.Log.WithName("test"), } @@ -2587,6 +2663,7 @@ func TestTransformToolkitCtrForCDI(t *testing.T) { {Name: CDIEnabledEnvName, Value: "true"}, {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, + {Name: CRIOConfigModeEnvName, Value: "config"}, }, }), },