Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions cmd/nvidia-container-runtime-hook/hook_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,26 @@ func (c *hookConfig) getSwarmResourceEnvvars() []string {

return envvars
}

// nvidiaContainerCliCUDACompatModeFlags returns required --cuda-compat-mode
// flag(s) depending on the hook and runtime configurations.
func (c *hookConfig) nvidiaContainerCliCUDACompatModeFlags() []string {
var flag string
switch c.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode {
case config.CUDACompatModeLdconfig:
flag = "--cuda-compat-mode=ldconfig"
case config.CUDACompatModeMount:
flag = "--cuda-compat-mode=mount"
case config.CUDACompatModeDisabled, config.CUDACompatModeHook:
flag = "--cuda-compat-mode=disabled"
default:
if !c.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
flag = "--cuda-compat-mode=disabled"
}
}

if flag == "" {
return nil
}
return []string{flag}
}
5 changes: 2 additions & 3 deletions cmd/nvidia-container-runtime-hook/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,8 @@ func doPrestart() {
}
args = append(args, "configure")

if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
args = append(args, "--no-cntlibs")
}
args = append(args, hook.nvidiaContainerCliCUDACompatModeFlags()...)

if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" {
args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath))
}
Expand Down
15 changes: 15 additions & 0 deletions cmd/nvidia-ctk-installer/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.legacy]
cuda-compat-mode = "ldconfig"

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -140,6 +143,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.legacy]
cuda-compat-mode = "ldconfig"

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -204,6 +210,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.legacy]
cuda-compat-mode = "ldconfig"

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -265,6 +274,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.legacy]
cuda-compat-mode = "ldconfig"

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -348,6 +360,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.legacy]
cuda-compat-mode = "ldconfig"

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down
3 changes: 3 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
SpecDirs: cdi.DefaultSpecDirs,
},
Legacy: legacyModeConfig{
CUDACompatMode: defaultCUDACompatMode,
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down
23 changes: 23 additions & 0 deletions internal/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
Legacy: legacyModeConfig{
CUDACompatMode: "ldconfig",
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand All @@ -93,6 +96,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-cli.load-kmods = false",
"nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"",
"nvidia-container-cli.user = \"foo:bar\"",
"nvidia-container-cli.cuda-compat-mode = \"mount\"",
"nvidia-container-runtime.debug = \"/foo/bar\"",
"nvidia-container-runtime.discover-mode = \"not-legacy\"",
"nvidia-container-runtime.log-level = \"debug\"",
Expand All @@ -102,6 +106,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"nvidia-container-runtime.modes.legacy.cuda-compat-mode = \"mount\"",
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
},
Expand Down Expand Up @@ -134,6 +139,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
Legacy: legacyModeConfig{
CUDACompatMode: "mount",
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -178,6 +186,9 @@ func TestGetConfig(t *testing.T) {
"/var/run/cdi",
},
},
Legacy: legacyModeConfig{
CUDACompatMode: "ldconfig",
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand All @@ -200,6 +211,7 @@ func TestGetConfig(t *testing.T) {
"root = \"/bar/baz\"",
"load-kmods = false",
"ldconfig = \"@/foo/bar/ldconfig\"",
"cuda-compat-mode = \"mount\"",
"user = \"foo:bar\"",
"[nvidia-container-runtime]",
"debug = \"/foo/bar\"",
Expand All @@ -213,6 +225,8 @@ func TestGetConfig(t *testing.T) {
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"[nvidia-container-runtime.modes.csv]",
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"[nvidia-container-runtime.modes.legacy]",
"cuda-compat-mode = \"mount\"",
"[nvidia-container-runtime-hook]",
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
"[nvidia-ctk]",
Expand Down Expand Up @@ -247,6 +261,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
Legacy: legacyModeConfig{
CUDACompatMode: "mount",
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -283,6 +300,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
Legacy: legacyModeConfig{
CUDACompatMode: "ldconfig",
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -322,6 +342,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
Legacy: legacyModeConfig{
CUDACompatMode: "ldconfig",
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down
33 changes: 31 additions & 2 deletions internal/config/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ type RuntimeConfig struct {

// modesConfig defines (optional) per-mode configs
type modesConfig struct {
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
Legacy legacyModeConfig `toml:"legacy"`
}

type cdiModeConfig struct {
Expand All @@ -45,3 +46,31 @@ type cdiModeConfig struct {
type csvModeConfig struct {
MountSpecPath string `toml:"mount-spec-path"`
}

type legacyModeConfig struct {
// CUDACompatMode sets the mode to be used to make CUDA Forward Compat
// libraries discoverable in the container.
CUDACompatMode cudaCompatMode `toml:"cuda-compat-mode,omitempty"`
}

type cudaCompatMode string

const (
defaultCUDACompatMode = CUDACompatModeLdconfig
// CUDACompatModeDisabled explicitly disables the handling of CUDA Forward
// Compatibility in the NVIDIA Container Runtime and NVIDIA Container
// Runtime Hook.
CUDACompatModeDisabled = cudaCompatMode("disabled")
// CUDACompatModeHook uses a container lifecycle hook to implement CUDA
// Forward Compatibility support. This requires the use of the NVIDIA
// Container Runtime and is not compatible with use cases where only the
// NVIDIA Container Runtime Hook is used (e.g. the Docker --gpus flag).
CUDACompatModeHook = cudaCompatMode("hook")
// CUDACompatModeLdconfig adds the folders containing CUDA Forward Compat
// libraries to the ldconfig command invoked from the NVIDIA Container
// Runtime Hook.
CUDACompatModeLdconfig = cudaCompatMode("ldconfig")
// CUDACompatModeMount mounts CUDA Forward Compat folders from the container
// to the container when using the NVIDIA Container Runtime Hook.
CUDACompatModeMount = cudaCompatMode("mount")
)
3 changes: 3 additions & 0 deletions internal/config/toml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.legacy]
cuda-compat-mode = "ldconfig"

[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false
Expand Down
49 changes: 33 additions & 16 deletions internal/modifier/gated.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
discoverers = append(discoverers, d)
}

if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
discoverers = append(discoverers, compatLibHookDiscoverer)
// For legacy mode, we also need to inject a hook to update the LDCache
// after we have modifed the configuration.
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
logger,
discover.None{},
cfg.NVIDIACTKConfig.Path,
"",
)
if err != nil {
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
}
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
// If the feature flag has explicitly been toggled, we don't make any modification.
if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
if err != nil {
return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
}
discoverers = append(discoverers, cudaCompatDiscoverer)
}

return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
}

func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode != config.CUDACompatModeHook {
return nil, nil
}

compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
return compatLibHookDiscoverer, nil
}

// For legacy mode, we also need to inject a hook to update the LDCache
// after we have modifed the configuration.
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
logger,
discover.None{},
cfg.NVIDIACTKConfig.Path,
"",
)
if err != nil {
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
}

return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
}