Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 24 additions & 13 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
return err
}

setRuntimeClassName(&obj.Spec.Template.Spec, config)
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// update env required for MIG support
applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
Expand Down Expand Up @@ -1241,6 +1241,7 @@ func transformToolkitCtrForCDI(container *corev1.Container) {
setContainerEnv(container, CDIEnabledEnvName, "true")
setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false")
setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi")
setContainerEnv(container, CRIOConfigModeEnvName, "config")
}

// TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy
Expand Down Expand Up @@ -1283,6 +1284,18 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
// update env required for CDI support
if config.CDI.IsEnabled() {
transformToolkitCtrForCDI(toolkitMainContainer)
} else if n.runtime == gpuv1.CRIO {
// (cdesiniotis) When CDI is not enabled and cri-o is the container runtime,
// we continue to install the OCI prestart hook as opposed to adding nvidia
// runtime handlers to the cri-o configuration. Users can override this behavior
// and have nvidia runtime handlers added to the cri-o configuration by setting
// the 'CRIO_CONFIG_MODE' environment variable to 'config' in the toolkit container.
// However, note that setting 'CRIO_CONFIG_MODE' to 'config' in this case
// (when CDI is not enabled) would result in the 'nvidia' runtime being set as
// the default runtime. While this should work in theory, it is a significant
// change -- which was the primary motivation to continue using the OCI prestart
// hook by default in this case.
setContainerEnv(toolkitMainContainer, CRIOConfigModeEnvName, "hook")
}

// set install directory for the toolkit
Expand Down Expand Up @@ -1337,11 +1350,6 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config))
}

if runtime == gpuv1.CRIO.String() {
// We add the nvidia runtime to the cri-o config by default instead of installing the OCI prestart hook
setContainerEnv(container, CRIOConfigModeEnvName, "config")
}

// For runtime config files we have top-level configs and drop-in files.
// These are supported as follows:
// * Docker only supports top-level config files.
Expand Down Expand Up @@ -1517,7 +1525,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
return err
}

setRuntimeClassName(&obj.Spec.Template.Spec, config)
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// update env required for MIG support
applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy)
Expand Down Expand Up @@ -1597,7 +1605,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic
return err
}

setRuntimeClassName(&obj.Spec.Template.Spec, config)
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// update env required for MIG support
applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy)
Expand Down Expand Up @@ -1705,7 +1713,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
}
}

setRuntimeClassName(&obj.Spec.Template.Spec, config)
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// set hostPID if specified for DCGM Exporter
if config.DCGMExporter.IsHostPIDEnabled() {
Expand Down Expand Up @@ -1830,7 +1838,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
}
}

setRuntimeClassName(&obj.Spec.Template.Spec, config)
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

return nil
}
Expand Down Expand Up @@ -1872,7 +1880,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args
}

setRuntimeClassName(&obj.Spec.Template.Spec, config)
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// set ConfigMap name for "mig-parted-config" Volume
for i, vol := range obj.Spec.Template.Spec.Volumes {
Expand Down Expand Up @@ -2166,7 +2174,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
return fmt.Errorf("%v", err)
}

setRuntimeClassName(&obj.Spec.Template.Spec, config)
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

var validatorErr error
// apply changes for individual component validators(initContainers)
Expand Down Expand Up @@ -2540,7 +2548,10 @@ func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string {
return DefaultRuntimeClass
}

func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec) {
// setRuntimeClassName assigns the runtime class name resolved by
// getRuntimeClassName to podSpec.RuntimeClassName.
//
// The pod spec is left untouched when CDI is not enabled and the container
// runtime is cri-o. NOTE(review): presumably this is because in that
// configuration the toolkit is set to CRIO_CONFIG_MODE=hook (OCI prestart
// hook) rather than registering nvidia runtime handlers with cri-o — confirm
// against TransformToolkit.
func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) {
	if config.CDI.IsEnabled() || runtime != gpuv1.CRIO {
		className := getRuntimeClassName(config)
		podSpec.RuntimeClassName = &className
	}
}
Expand Down
83 changes: 80 additions & 3 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,6 @@ func TestTransformForRuntime(t *testing.T) {
Name: "test-ctr",
Env: []corev1.EnvVar{
{Name: "RUNTIME", Value: gpuv1.CRIO.String()},
{Name: CRIOConfigModeEnvName, Value: "config"},
{Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"},
{Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"},
{Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"},
Expand Down Expand Up @@ -775,12 +774,14 @@ func TestTransformToolkit(t *testing.T) {
description string
ds Daemonset // Input DaemonSet
cpSpec *gpuv1.ClusterPolicySpec // Input configuration
expectedDs Daemonset // Expected output DaemonSet
runtime gpuv1.Runtime
expectedDs Daemonset // Expected output DaemonSet
}{
{
description: "transform nvidia-container-toolkit-ctr container",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
runtime: gpuv1.Containerd,
cpSpec: &gpuv1.ClusterPolicySpec{
Toolkit: gpuv1.ToolkitSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Expand Down Expand Up @@ -822,6 +823,7 @@ func TestTransformToolkit(t *testing.T) {
{Name: CDIEnabledEnvName, Value: "true"},
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
{Name: CRIOConfigModeEnvName, Value: "config"},
{Name: "foo", Value: "bar"},
{Name: "RUNTIME", Value: "containerd"},
{Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"},
Expand All @@ -847,6 +849,7 @@ func TestTransformToolkit(t *testing.T) {
description: "transform nvidia-container-toolkit-ctr container with custom ctr runtime socket",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
runtime: gpuv1.Containerd,
cpSpec: &gpuv1.ClusterPolicySpec{
Toolkit: gpuv1.ToolkitSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Expand Down Expand Up @@ -899,6 +902,7 @@ func TestTransformToolkit(t *testing.T) {
{Name: CDIEnabledEnvName, Value: "true"},
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
{Name: CRIOConfigModeEnvName, Value: "config"},
{Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"},
{Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"},
{Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"},
Expand All @@ -920,12 +924,84 @@ func TestTransformToolkit(t *testing.T) {
WithHostPathVolume("containerd-socket", "/run/k3s/containerd", nil).
WithPullSecret("pull-secret"),
},
{
description: "transform nvidia-container-toolkit-ctr container, cri-o runtime, cdi enabled",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
runtime: gpuv1.CRIO,
cpSpec: &gpuv1.ClusterPolicySpec{
Toolkit: gpuv1.ToolkitSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "nvidia-container-toolkit",
Version: "v1.0.0",
},
},
expectedDs: NewDaemonset().
WithContainer(corev1.Container{
Name: "nvidia-container-toolkit-ctr",
Image: "nvcr.io/nvidia/cloud-native/nvidia-container-toolkit:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: CDIEnabledEnvName, Value: "true"},
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
{Name: CRIOConfigModeEnvName, Value: "config"},
{Name: "RUNTIME", Value: gpuv1.CRIO.String()},
{Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"},
{Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"},
{Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"},
{Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"},
},
VolumeMounts: []corev1.VolumeMount{
{Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir},
{Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"},
},
}).
WithHostPathVolume("crio-config", "/etc/crio", newHostPathType(corev1.HostPathDirectoryOrCreate)).
WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", newHostPathType(corev1.HostPathDirectoryOrCreate)),
},
{
description: "transform nvidia-container-toolkit-ctr container, cri-o runtime, cdi disabled",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
runtime: gpuv1.CRIO,
cpSpec: &gpuv1.ClusterPolicySpec{
Toolkit: gpuv1.ToolkitSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "nvidia-container-toolkit",
Version: "v1.0.0",
},
CDI: gpuv1.CDIConfigSpec{
Enabled: newBoolPtr(false),
},
},
expectedDs: NewDaemonset().
WithContainer(corev1.Container{
Name: "nvidia-container-toolkit-ctr",
Image: "nvcr.io/nvidia/cloud-native/nvidia-container-toolkit:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: CRIOConfigModeEnvName, Value: "hook"},
{Name: "RUNTIME", Value: gpuv1.CRIO.String()},
{Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"},
{Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"},
{Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"},
{Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"},
},
VolumeMounts: []corev1.VolumeMount{
{Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir},
{Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"},
},
}).
WithHostPathVolume("crio-config", "/etc/crio", newHostPathType(corev1.HostPathDirectoryOrCreate)).
WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", newHostPathType(corev1.HostPathDirectoryOrCreate)),
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
controller := ClusterPolicyController{
runtime: gpuv1.Containerd,
runtime: tc.runtime,
logger: ctrl.Log.WithName("test"),
}

Expand Down Expand Up @@ -2587,6 +2663,7 @@ func TestTransformToolkitCtrForCDI(t *testing.T) {
{Name: CDIEnabledEnvName, Value: "true"},
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
{Name: CRIOConfigModeEnvName, Value: "config"},
},
}),
},
Expand Down