Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 7 additions & 11 deletions assets/state-operator-validation/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ spec:
initContainers:
- name: driver-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "true"
Expand Down Expand Up @@ -58,8 +57,7 @@ spec:
mountPath: /host-dev-char
- name: toolkit-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
Expand All @@ -75,8 +73,7 @@ spec:
mountPropagation: Bidirectional
- name: cuda-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "false"
Expand All @@ -98,8 +95,7 @@ spec:
mountPropagation: Bidirectional
- name: plugin-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: COMPONENT
value: plugin
Expand All @@ -126,14 +122,14 @@ spec:
containers:
- image: "FILLED BY THE OPERATOR"
name: nvidia-operator-validator
command: ['sh', '-c']
args: ["echo all validations are successful; while true; do sleep 86400; done"]
command: ["nvidia-validator"]
args: ["--sleep"]
securityContext:
privileged: true
lifecycle:
preStop:
exec:
command: ["sh", "-c", "rm -f /run/nvidia/validations/*-ready"]
command: ["/usr/bin/rmglob", "/run/nvidia/validations/*-ready"]
volumeMounts:
- name: run-nvidia-validations
mountPath: "/run/nvidia/validations"
Expand Down
18 changes: 7 additions & 11 deletions assets/state-sandbox-validation/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ spec:
initContainers:
- name: cc-manager-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "true"
Expand All @@ -49,8 +48,7 @@ spec:
mountPropagation: Bidirectional
- name: vfio-pci-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "true"
Expand All @@ -74,8 +72,7 @@ spec:
mountPropagation: Bidirectional
- name: vgpu-manager-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "true"
Expand All @@ -102,8 +99,7 @@ spec:
mountPropagation: Bidirectional
- name: vgpu-devices-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
args: ["nvidia-validator"]
command: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "true"
Expand All @@ -122,14 +118,14 @@ spec:
containers:
- image: "FILLED BY THE OPERATOR"
name: nvidia-sandbox-validator
command: ['sh', '-c']
args: ["echo all validations are successful; while true; do sleep 86400; done"]
command: ["nvidia-validator"]
args: ["--sleep"]
securityContext:
privileged: true
lifecycle:
preStop:
exec:
command: ["sh", "-c", "rm -f /run/nvidia/validations/*"]
command: ["/usr/bin/rmglob", "/run/nvidia/validations/*"]
volumeMounts:
- name: run-nvidia-validations
mountPath: "/run/nvidia/validations"
Expand Down
75 changes: 60 additions & 15 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ var (
hostRootFlag string
driverInstallDirFlag string
driverInstallDirCtrPathFlag string
sleepFlag bool
)

// defaultGPUWorkloadConfig is "vm-passthrough" unless
Expand Down Expand Up @@ -375,14 +376,17 @@ func main() {
Destination: &driverInstallDirCtrPathFlag,
Sources: cli.EnvVars("DRIVER_INSTALL_DIR_CTR_PATH"),
},
&cli.BoolFlag{
Name: "sleep",
Usage: "after any other action, print the validator-success message and block until SIGTERM/SIGINT/SIGHUP, then exit 0",
Destination: &sleepFlag,
Sources: cli.EnvVars("SLEEP"),
},
}

// Log version info
log.Infof("version: %s", c.Version)

// Handle signals
go handleSignal()

// invoke command
err := c.Run(context.Background(), os.Args)
if err != nil {
Expand All @@ -404,6 +408,10 @@ func handleSignal() {

func validateFlags(ctx context.Context, cli *cli.Command) (context.Context, error) {
if componentFlag == "" {
// Standalone --sleep mode does not require a component.
if sleepFlag {
return ctx, nil
}
return ctx, fmt.Errorf("invalid -c <component-name> flag: must not be empty string")
}
if !isValidComponent() {
Expand Down Expand Up @@ -509,24 +517,59 @@ func getWorkloadConfig(ctx context.Context) (string, error) {
}

func start(ctx context.Context, cli *cli.Command) error {
// if cleanup is requested, delete all existing status files(default)
if cleanupAllFlag {
// cleanup output directory and create again each time
err := os.RemoveAll(outputDirFlag)
if err != nil {
if !os.IsNotExist(err) {
return err
// In sleep mode, runSleep installs its own signal handler. Otherwise
// preserve legacy behavior: any signal terminates the process.
if !sleepFlag {
go handleSignal()
}

if componentFlag != "" {
// if cleanup is requested, delete all existing status files(default)
if cleanupAllFlag {
// cleanup output directory and create again each time
err := os.RemoveAll(outputDirFlag)
if err != nil {
if !os.IsNotExist(err) {
return err
}
}
}

// create status directory
err := os.Mkdir(outputDirFlag, 0755)
if err != nil && !os.IsExist(err) {
return err
}

if err := validateComponent(ctx, componentFlag); err != nil {
return err
}
}

// create status directory
err := os.Mkdir(outputDirFlag, 0755)
if err != nil && !os.IsExist(err) {
return err
if sleepFlag {
return runSleep(ctx)
}
return nil
}

return validateComponent(ctx, componentFlag)
// runSleep announces that validation succeeded and then parks the process.
// It writes the success message to stdout, subscribes to SIGTERM, SIGINT and
// SIGHUP, and waits until either one of those signals is delivered or ctx is
// done. Whichever event wakes it is logged, and the function always returns
// nil so the caller exits cleanly. Removing status markers is not done here;
// the manifests run the rmglob binary from `lifecycle.preStop` for that.
func runSleep(ctx context.Context) error {
	fmt.Println("all validations are successful")

	// Buffer of one so a signal that arrives before the select is not dropped.
	termination := make(chan os.Signal, 1)
	signal.Notify(termination, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP)
	// Unregister on the way out so the channel stops receiving signals.
	defer signal.Stop(termination)

	select {
	case sig := <-termination:
		log.Infof("received signal %s", sig)
	case <-ctx.Done():
		log.Infof("context canceled")
	}
	return nil
}

func validateComponent(ctx context.Context, componentFlag string) error {
Expand Down Expand Up @@ -1368,6 +1411,7 @@ func (p *Plugin) runWorkload() error {
if err != nil {
return err
}
fmt.Println("device-plugin workload validation is successful")
return nil
}

Expand Down Expand Up @@ -1621,6 +1665,7 @@ func (c *CUDA) runWorkload() error {
if err != nil {
return err
}
fmt.Println("cuda workload validation is successful")
return nil
}

Expand Down
71 changes: 71 additions & 0 deletions cmd/nvidia-validator/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ package main
import (
"context"
"os"
"syscall"
"testing"
"time"
)

func Test_isValidComponent(t *testing.T) {
Expand Down Expand Up @@ -216,3 +218,72 @@ UNKNOWN_FEATURE: true`,
})
}
}

// Test_validateFlags_standaloneSleep drives validateFlags with each
// combination of an empty or "driver" component and the sleep flag on or off.
// Only the combination with neither a component nor sleep is expected to
// return an error.
func Test_validateFlags_standaloneSleep(t *testing.T) {
	type scenario struct {
		name      string
		component string
		sleep     bool
		wantErr   bool
	}
	scenarios := []scenario{
		{name: "no component, no sleep: error", wantErr: true},
		{name: "no component, sleep: ok", sleep: true},
		{name: "valid component, no sleep: ok", component: "driver"},
		{name: "valid component, sleep: ok", component: "driver", sleep: true},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			// validateFlags reads package-level flag variables, so remember
			// their current values and put them back when the subtest ends.
			savedComponent, savedSleep := componentFlag, sleepFlag
			restore := func() {
				componentFlag, sleepFlag = savedComponent, savedSleep
			}
			defer restore()
			componentFlag, sleepFlag = sc.component, sc.sleep

			_, err := validateFlags(context.Background(), nil)
			switch {
			case sc.wantErr && err == nil:
				t.Errorf("validateFlags() expected error, got nil")
			case !sc.wantErr && err != nil:
				t.Errorf("validateFlags() unexpected error: %v", err)
			}
		})
	}
}

// Test_runSleep_returnsOnSignal starts runSleep in a goroutine, sends SIGTERM
// to the test process itself, and requires runSleep to return a nil error
// within two seconds.
func Test_runSleep_returnsOnSignal(t *testing.T) {
	result := make(chan error, 1)
	go func() {
		result <- runSleep(context.Background())
	}()

	// runSleep registers for signals from its own goroutine; pause briefly so
	// the registration is likely in place before SIGTERM is sent.
	time.Sleep(50 * time.Millisecond)
	killErr := syscall.Kill(syscall.Getpid(), syscall.SIGTERM)
	if killErr != nil {
		t.Fatalf("kill: %v", killErr)
	}

	deadline := time.After(2 * time.Second)
	select {
	case <-deadline:
		t.Fatalf("runSleep did not return within 2s of SIGTERM")
	case got := <-result:
		if got != nil {
			t.Errorf("runSleep returned error: %v", got)
		}
	}
}

// Test_runSleep_contextCancel starts runSleep with a cancellable context,
// cancels it shortly afterwards, and requires runSleep to return a nil error
// within two seconds.
func Test_runSleep_contextCancel(t *testing.T) {
	sleepCtx, stop := context.WithCancel(context.Background())

	result := make(chan error, 1)
	go func() {
		result <- runSleep(sleepCtx)
	}()

	// Let runSleep reach its blocking wait before cancelling.
	time.Sleep(50 * time.Millisecond)
	stop()

	deadline := time.After(2 * time.Second)
	select {
	case <-deadline:
		t.Fatalf("runSleep did not return within 2s of context cancel")
	case got := <-result:
		if got != nil {
			t.Errorf("runSleep returned error: %v", got)
		}
	}
}
60 changes: 60 additions & 0 deletions cmd/rmglob/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
Copyright (c) NVIDIA CORPORATION. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// rmglob is a tiny static helper binary that expands one or more glob
// patterns and removes the matching paths. It exists so that distroless
// gpu-operator container images can run path cleanup from a Kubernetes
// `lifecycle.preStop` hook without needing a shell on the image.
//
// It is the path-cleanup analog of k8s-cc-manager's vendored static `/bin/rm`.
package main

import (
"fmt"
"os"
"path/filepath"
)

// main interprets each command-line argument as a glob pattern and deletes
// every path the pattern expands to. With no arguments it prints a usage line
// to stderr and exits with status 2. A malformed pattern or a failed removal
// is reported on stderr without stopping the remaining work; in that case the
// process exits with status 1 once all patterns have been processed.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: rmglob <glob>...")
		os.Exit(2)
	}

	exitCode := 0
	for _, arg := range os.Args[1:] {
		if !removeMatches(arg) {
			exitCode = 1
		}
	}
	if exitCode != 0 {
		os.Exit(exitCode)
	}
}

// removeMatches expands a single glob pattern and removes each path it
// matches, writing a diagnostic to stderr for anything that goes wrong. It
// returns false when the pattern is invalid or at least one match could not
// be removed, and true otherwise (including when nothing matched).
func removeMatches(pattern string) bool {
	paths, globErr := filepath.Glob(pattern)
	if globErr != nil {
		//#nosec G705 -- stderr diagnostic, not a network-reachable sink
		fmt.Fprintf(os.Stderr, "rmglob: invalid pattern %q: %v\n", pattern, globErr)
		return false
	}

	ok := true
	for _, p := range paths {
		// Path removal is the binary's sole purpose; the patterns come from
		// gpu-operator-rendered manifests, not external user input.
		//#nosec G703 -- intentional path removal
		rmErr := os.RemoveAll(p)
		if rmErr == nil {
			continue
		}
		//#nosec G705 -- stderr diagnostic, not a network-reachable sink
		fmt.Fprintf(os.Stderr, "rmglob: remove %q: %v\n", p, rmErr)
		ok = false
	}
	return ok
}
Loading
Loading