Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ var alertEvaluator *alert.AlertEvaluator
var schedulerConfigPath string
var alertEvaluatorReady chan struct{}
// enableAutoExpander is bound to the --enable-auto-expander command-line flag
// (default false).
var enableAutoExpander bool

// compatibleWithNvidiaOperator is bound to the --compatible-with-nvidia-operator
// command-line flag (default false). When true, main exports it to the process
// environment as constants.CompatibleWithNvidiaOperatorEnv after flag parsing.
var compatibleWithNvidiaOperator bool

func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
Expand Down Expand Up @@ -150,9 +151,16 @@ func main() {
"refer https://prometheus.io/docs/alerting/latest/configuration")
flag.BoolVar(&enableAutoExpander, "enable-auto-expander", false, "if turn on auto expander, "+
"TensorFusion will auto expand Nodes then Pending Pods which caused by insufficient GPU resources found")
flag.BoolVar(&compatibleWithNvidiaOperator, "compatible-with-nvidia-operator", false,
"if enabled, node discovery will wait for NVIDIA GPU Operator toolkit-ready validation before starting")

klog.InitFlags(nil)
flag.Parse()

// Set environment variable for utils package to read
if compatibleWithNvidiaOperator {
_ = os.Setenv(constants.CompatibleWithNvidiaOperatorEnv, constants.TrueStringValue)
}
ctrl.SetLogger(klog.NewKlogr())
ctx := context.Background()

Expand Down
3 changes: 2 additions & 1 deletion internal/constants/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ const (
NvidiaOperatorProgressiveMigrationEnv = "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION"
RunHypervisorUtilGPUAllocatable = "RUN_HYPERVISOR_UTIL_GPU_ALLOCATABLE"

UsingCommercialComponentEnv = "COMMERCIAL_PLAN"
UsingCommercialComponentEnv = "COMMERCIAL_PLAN"
CompatibleWithNvidiaOperatorEnv = "COMPATIBLE_WITH_NVIDIA_OPERATOR"
)

// General envs used in compose components manifest
Expand Down
41 changes: 41 additions & 0 deletions internal/utils/compose.go
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,47 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
tmpl.Spec.Containers[0].Image = pool.Spec.ComponentConfig.NodeDiscovery.Image
}

// Add initContainer to wait for NVIDIA GPU Operator toolkit-ready validation
if IsCompatibleWithNvidiaOperator() {
initContainerImage := pool.Spec.ComponentConfig.NodeDiscovery.Image
if initContainerImage == "" {
// Use the same image as the main container if not specified
initContainerImage = tmpl.Spec.Containers[0].Image
}

initContainer := v1.Container{
Name: "toolkit-validation",
Image: initContainerImage,
Command: []string{"sh", "-c"},
Args: []string{
"until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done",
},
SecurityContext: &v1.SecurityContext{
Privileged: ptr.To(true),
},
VolumeMounts: []v1.VolumeMount{
{
Name: "run-nvidia-validations",
MountPath: "/run/nvidia/validations",
MountPropagation: ptr.To(v1.MountPropagationHostToContainer),
},
},
}

tmpl.Spec.InitContainers = append(tmpl.Spec.InitContainers, initContainer)

// Add volume for NVIDIA validations
tmpl.Spec.Volumes = append(tmpl.Spec.Volumes, v1.Volume{
Name: "run-nvidia-validations",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/run/nvidia/validations",
Type: ptr.To(v1.HostPathDirectoryOrCreate),
},
},
})
}

tmpl.Spec.Containers[0].Env = append(tmpl.Spec.Containers[0].Env, v1.EnvVar{
Name: constants.NodeDiscoveryReportGPUNodeEnvName,
Value: gpuNodeName,
Expand Down
6 changes: 6 additions & 0 deletions internal/utils/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ var nvidiaOperatorProgressiveMigrationEnv = os.Getenv(constants.NvidiaOperatorPr

var isLicensedEnv = os.Getenv(constants.UsingCommercialComponentEnv) == constants.TrueStringValue

// compatibleWithNvidiaOperatorEnv records whether the environment variable
// named by constants.CompatibleWithNvidiaOperatorEnv equals
// constants.TrueStringValue. It is evaluated once, during package
// initialization, so it reflects only the environment the process was started
// with; changes made later via os.Setenv are not picked up by this variable.
var compatibleWithNvidiaOperatorEnv = os.Getenv(constants.CompatibleWithNvidiaOperatorEnv) == constants.TrueStringValue

func init() {
if isLicensedEnv {
ctrl.Log.Info("Enabling none open source components, please make sure you are in trial stage or have bought commercial license. Contact us: support@tensor-fusion.com")
Expand All @@ -195,6 +197,10 @@ func IsProgressiveMigration() bool {
return nvidiaOperatorProgressiveMigrationEnv
}

// IsCompatibleWithNvidiaOperator reports whether NVIDIA GPU Operator
// compatibility mode is enabled.
//
// It returns true when the value captured at package initialization is true,
// or when the environment variable named by
// constants.CompatibleWithNvidiaOperatorEnv currently equals
// constants.TrueStringValue.
//
// The environment is consulted on every call on purpose: package-level
// variables are initialized before main starts, so a value that main exports
// with os.Setenv after flag parsing (--compatible-with-nvidia-operator) would
// never be visible through the cached variable alone.
func IsCompatibleWithNvidiaOperator() bool {
	if compatibleWithNvidiaOperatorEnv {
		return true
	}
	return os.Getenv(constants.CompatibleWithNvidiaOperatorEnv) == constants.TrueStringValue
}

// For test purpose only
func SetProgressiveMigration(isProgressiveMigration bool) {
nvidiaOperatorProgressiveMigrationEnv = isProgressiveMigration
Expand Down
Loading