Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/NVIDIA/go-nvlib/pkg/nvmdev"
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-dev-char-symlinks"
"github.com/cyphar/filepath-securejoin/pathrs-lite"
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert/yaml"
cli "github.com/urfave/cli/v3"
Expand Down Expand Up @@ -742,16 +743,33 @@ func isDriverManagedByOperator(ctx context.Context) (bool, error) {
return false, nil
}

// resolveHostNvidiaSMI opens and stats nvidia-smi within the mounted host root.
//
// hostRootCtrPath is the directory at which the host root filesystem is
// mounted; the callers visible in this file pass "/host". Only the single
// location /usr/bin/nvidia-smi is looked up beneath that root.
//
// On success it returns the os.FileInfo obtained from the opened file. If the
// file cannot be opened or stat'ed, it returns a nil FileInfo and an error that
// wraps the underlying failure.
//
// NOTE(review): the lookup goes through pathrs.OpenInRoot; presumably this is
// what keeps symlinks in the path (for example a symlinked /usr/bin) resolved
// relative to hostRootCtrPath rather than the container's own root — confirm
// against the pathrs-lite documentation.
func resolveHostNvidiaSMI(hostRootCtrPath string) (os.FileInfo, error) {
// The path is given as it appears on the host; hostRootCtrPath supplies the root.
f, err := pathrs.OpenInRoot(hostRootCtrPath, "/usr/bin/nvidia-smi")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also attempt paths like /usr/sbin/nvidia-smi and /usr/lib/wsl/lib/nvidia-smi?

cc @cdesiniotis

Copy link
Copy Markdown
Contributor Author

@JunAr7112 JunAr7112 May 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tariq1890 In what systems would we see /usr/sbin/nvidia-smi and /usr/lib/wsl/lib/nvidia-smi symlinks? I think the original error was wrt NixOS systems, but I can generalize to include this as well if its a common bug

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/usr/lib/wsl/lib/nvidia-smi is used in WSL. I believe Talos and other similar distros use /usr/sbin/nvidia-smi

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should expand the search paths here, but we could do that in a follow-up. For reference, we have precedent for searching for nvidia-smi in multiple directories, e.g. https://github.com/kubernetes-sigs/dra-driver-nvidia-gpu/blob/510086482c09c9cdde2886c292fc0006fc9df926/hack/kubelet-plugin-prestart.sh#L44-L52

if err != nil {
return nil, fmt.Errorf("failed to open 'nvidia-smi' on the host: %w", err)
}
// The handle is only needed for the Stat call below; release it on return.
defer f.Close()

// Stat the opened handle itself rather than looking the path up a second time.
fileInfo, err := f.Stat()
if err != nil {
return nil, fmt.Errorf("failed to stat 'nvidia-smi' on the host: %w", err)
}

return fileInfo, nil
}

func validateHostDriver(silent bool) error {
log.Info("Attempting to validate a pre-installed driver on the host")
if fileInfo, err := os.Lstat(filepath.Join("/host", wslNvidiaSMIPath)); err == nil && fileInfo.Size() != 0 {
log.Infof("WSL2 system detected, assuming driver is pre-installed")
disableDevCharSymlinkCreation = true
return nil
}
fileInfo, err := os.Lstat("/host/usr/bin/nvidia-smi")

fileInfo, err := resolveHostNvidiaSMI("/host")
if err != nil {
return fmt.Errorf("no 'nvidia-smi' file present on the host: %w", err)
return err
}
if fileInfo.Size() == 0 {
return fmt.Errorf("empty 'nvidia-smi' file found on the host")
Expand Down Expand Up @@ -1753,7 +1771,7 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
args := []string{"/run/nvidia/driver", "nvidia-smi"}

// check if driver is pre-installed on the host and use host path for validation
if _, err := os.Lstat("/host/usr/bin/nvidia-smi"); err == nil {
if _, err := resolveHostNvidiaSMI("/host"); err == nil {
args = []string{"/host", "nvidia-smi"}
hostDriver = true
}
Expand Down
74 changes: 74 additions & 0 deletions cmd/nvidia-validator/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,84 @@ package main

import (
"context"
"maps"
"os"
"path/filepath"
"slices"
"strings"
"testing"

"github.com/stretchr/testify/require"
)

// TestResolveHostNvidiaSMI runs resolveHostNvidiaSMI against temporary
// directory trees that stand in for the mounted host root.
//
// Each case describes its tree as a map from a path under the temporary root
// to either regular-file contents or, when the value starts with "symlink=",
// the target of a symlink to create at that path.
func TestResolveHostNvidiaSMI(t *testing.T) {
testCases := []struct {
// description is used as the subtest name.
description string
// contents maps a path under the fake host root to file contents, or to
// "symlink=<target>" to create a symlink with that target instead.
contents map[string]string
// expectsError is true when resolveHostNvidiaSMI must return an error.
expectsError bool
}{
{
description: "nvidia-smi exists in /usr/bin",
contents: map[string]string{
"/usr/bin/nvidia-smi": "fake nvidia-smi",
},
},
{
description: "nvidia-smi exists through absolute /usr/bin symlink",
Comment thread
cdesiniotis marked this conversation as resolved.
contents: map[string]string{
"/run/current-system/sw/bin/nvidia-smi": "fake nvidia-smi",
// The absolute target is expected to be interpreted inside the fake root,
// since this case must succeed.
"/usr/bin": "symlink=/run/current-system/sw/bin",
},
},
{
description: "nvidia-smi exists through relative /usr/bin symlink",
contents: map[string]string{
"/run/current-system/sw/bin/nvidia-smi": "fake nvidia-smi",
"/usr/bin": "symlink=../run/current-system/sw/bin",
},
},
{
description: "parent dir is symlink to path not within root",
contents: map[string]string{
// The target climbs two levels from /usr, i.e. lexically above the
// temporary root. No nvidia-smi file is created in this case.
"/usr/bin": "symlink=../../",
},
expectsError: true,
},
{
// No contents at all: the fake host root stays empty.
description: "nvidia-smi does not exist",
expectsError: true,
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
// Every subtest gets its own empty directory to act as the host root.
hostRoot := t.TempDir()
// Iterate in sorted order so parent symlinks are created before paths under them.
for _, name := range slices.Sorted(maps.Keys(tc.contents)) {
contents := tc.contents[name]
target := filepath.Join(hostRoot, name)
require.NoError(t, os.MkdirAll(filepath.Dir(target), 0755))

// A "symlink=" value creates a link at target; the remainder is the link target.
if strings.HasPrefix(contents, "symlink=") {
require.NoError(t, os.Symlink(strings.TrimPrefix(contents, "symlink="), target))
continue
}

// Any other value is written out as the contents of a regular file.
require.NoError(t, os.WriteFile(target, []byte(contents), 0600))
}

fileInfo, err := resolveHostNvidiaSMI(hostRoot)
if tc.expectsError {
require.Error(t, err)
return
}

require.NoError(t, err)
// Every success case above writes a non-empty fake file, so the size must be non-zero.
require.NotZero(t, fileInfo.Size())
})
}
}

func Test_isValidComponent(t *testing.T) {
tests := []struct {
name string
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ require (
github.com/NVIDIA/k8s-kata-manager v0.2.3
github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441
github.com/NVIDIA/nvidia-container-toolkit v1.19.1
github.com/cyphar/filepath-securejoin v0.6.1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
github.com/go-logr/logr v1.4.3
github.com/go-logr/zapr v1.3.0
Expand Down Expand Up @@ -35,6 +36,7 @@ require (
)

require (
cyphar.com/go-pathrs v0.2.4 // indirect
dario.cat/mergo v1.0.1 // indirect
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
github.com/MakeNowJust/heredoc v1.0.0 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
cyphar.com/go-pathrs v0.2.4 h1:iD/mge36swa1UFKdINkr1Frkpp6wZsy3YYEildj9cLY=
cyphar.com/go-pathrs v0.2.4/go.mod h1:y8f1EMG7r+hCuFf/rXsKqMJrJAUoADZGNh5/vZPKcGc=
dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s=
dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg=
Expand Down Expand Up @@ -31,6 +33,8 @@ github.com/chai2010/gettext-go v1.0.2/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHe
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY=
github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
github.com/cyphar/filepath-securejoin v0.6.1 h1:5CeZ1jPXEiYt3+Z6zqprSAgSWiggmpVyciv8syjIpVE=
github.com/cyphar/filepath-securejoin v0.6.1/go.mod h1:A8hd4EnAeyujCJRrICiOWqjS1AX0a9kM5XL+NwKoYSc=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
Expand Down
43 changes: 43 additions & 0 deletions vendor/cyphar.com/go-pathrs/.golangci.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading