From a1b8b2d424fd513ca4e9b3564037e4bba67fa1e0 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Mon, 17 Nov 2025 19:19:55 +0800 Subject: [PATCH 1/2] fix: use function parameter instead of env var for Container Toolkit config --- cmd/main.go | 22 ++++++++------------ internal/autoscaler/autoscaler_suite_test.go | 7 ++++--- internal/controller/gpunode_controller.go | 11 +++++----- internal/controller/suite_test.go | 9 ++++---- internal/utils/compose.go | 6 +++--- internal/utils/config.go | 6 ------ 6 files changed, 27 insertions(+), 34 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 8a7e5b0d..436155a4 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -99,7 +99,7 @@ var alertEvaluator *alert.AlertEvaluator var schedulerConfigPath string var alertEvaluatorReady chan struct{} var enableAutoExpander bool -var compatibleWithNvidiaOperator bool +var compatibleWithNvidiaContainerToolkit bool func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -151,16 +151,11 @@ func main() { "refer https://prometheus.io/docs/alerting/latest/configuration") flag.BoolVar(&enableAutoExpander, "enable-auto-expander", false, "if turn on auto expander, "+ "TensorFusion will auto expand Nodes then Pending Pods which caused by insufficient GPU resources found") - flag.BoolVar(&compatibleWithNvidiaOperator, "compatible-with-nvidia-operator", false, - "if enabled, node discovery will wait for NVIDIA GPU Operator toolkit-ready validation before starting") + flag.BoolVar(&compatibleWithNvidiaContainerToolkit, "compatible-with-nvidia-container-toolkit", false, + "if enabled, node discovery will wait for NVIDIA Container Toolkit toolkit-ready validation before starting") klog.InitFlags(nil) flag.Parse() - - // Set environment variable for utils package to read - if compatibleWithNvidiaOperator { - _ = os.Setenv(constants.CompatibleWithNvidiaOperatorEnv, constants.TrueStringValue) - } ctrl.SetLogger(klog.NewKlogr()) ctx := context.Background() @@ -401,11 +396,12 @@ func startCustomResourceController( } if err = (&controller.GPUNodeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("GPUNode"), - Allocator: allocator, - Expander: nodeExpander, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + Allocator: allocator, + Expander: nodeExpander, + CompatibleWithNvidiaContainerToolkit: compatibleWithNvidiaContainerToolkit, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "GPUNode") os.Exit(1) diff --git a/internal/autoscaler/autoscaler_suite_test.go b/internal/autoscaler/autoscaler_suite_test.go index c51d18db..0595acce 100644 --- a/internal/autoscaler/autoscaler_suite_test.go +++ b/internal/autoscaler/autoscaler_suite_test.go @@ -181,9 +181,10 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&controller.GPUNodeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("GPUNode"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + CompatibleWithNvidiaContainerToolkit: false, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index 5fb0d3dd..4a6c235f 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -48,10 +48,11 @@ import ( // GPUNodeReconciler reconciles a GPUNode object type GPUNodeReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder - Allocator *gpuallocator.GpuAllocator - Expander *expander.NodeExpander + Scheme *runtime.Scheme + Recorder record.EventRecorder + Allocator *gpuallocator.GpuAllocator + Expander *expander.NodeExpander + CompatibleWithNvidiaContainerToolkit bool } // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete @@ -287,7 +288,7 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( }) tmpl.Spec.EnableServiceLinks = ptr.To(false) - utils.AddTFNodeDiscoveryConfAfterTemplate(ctx, &tmpl, pool, gpunode.Name) + utils.AddTFNodeDiscoveryConfAfterTemplate(ctx, &tmpl, pool, gpunode.Name, r.CompatibleWithNvidiaContainerToolkit) // create node-discovery job job := &batchv1.Job{ diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 0cc58a38..d0efb92e 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -182,10 +182,11 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&GPUNodeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("GPUNode"), - Allocator: allocator, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + Allocator: allocator, + CompatibleWithNvidiaContainerToolkit: false, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) diff --git a/internal/utils/compose.go b/internal/utils/compose.go index 1dd6167f..8db2f52f 100644 --- a/internal/utils/compose.go +++ b/internal/utils/compose.go @@ -724,7 +724,7 @@ func composeVectorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) { } } -func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTemplateSpec, pool *tfv1.GPUPool, gpuNodeName string) { +func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTemplateSpec, pool *tfv1.GPUPool, gpuNodeName string, compatibleWithNvidiaContainerToolkit bool) { tmpl.Spec.RestartPolicy = v1.RestartPolicyOnFailure serviceAccountName := GetSelfServiceAccountNameShort() if serviceAccountName == "" { @@ -745,8 +745,8 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla tmpl.Spec.Containers[0].Image = pool.Spec.ComponentConfig.NodeDiscovery.Image } - // Add initContainer to wait for NVIDIA GPU Operator toolkit-ready validation - if IsCompatibleWithNvidiaOperator() { + // Add initContainer to wait for NVIDIA Container Toolkit toolkit-ready validation + if compatibleWithNvidiaContainerToolkit { initContainerImage := pool.Spec.ComponentConfig.NodeDiscovery.Image if initContainerImage == "" { // Use the same image as the main container if not specified diff --git a/internal/utils/config.go b/internal/utils/config.go index 4c7fde95..ba2e732b 100644 --- a/internal/utils/config.go +++ b/internal/utils/config.go @@ -181,8 +181,6 @@ var nvidiaOperatorProgressiveMigrationEnv = os.Getenv(constants.NvidiaOperatorPr var isLicensedEnv = os.Getenv(constants.UsingCommercialComponentEnv) == constants.TrueStringValue -var compatibleWithNvidiaOperatorEnv = os.Getenv(constants.CompatibleWithNvidiaOperatorEnv) == constants.TrueStringValue - func init() { if isLicensedEnv { ctrl.Log.Info("Enabling none open source components, please make sure you are in trial stage or have bought commercial license. Contact us: support@tensor-fusion.com") @@ -197,10 +195,6 @@ func IsProgressiveMigration() bool { return nvidiaOperatorProgressiveMigrationEnv } -func IsCompatibleWithNvidiaOperator() bool { - return compatibleWithNvidiaOperatorEnv -} - // For test purpose only func SetProgressiveMigration(isProgressiveMigration bool) { nvidiaOperatorProgressiveMigrationEnv = isProgressiveMigration From 861e6133493082ba4132e5102396b7cae39e1d23 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Mon, 17 Nov 2025 19:22:02 +0800 Subject: [PATCH 2/2] go fmt --- internal/controller/suite_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index d0efb92e..2f61b9f2 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -182,10 +182,10 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&GPUNodeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("GPUNode"), - Allocator: allocator, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + Allocator: allocator, CompatibleWithNvidiaContainerToolkit: false, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred())