Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 9 additions & 13 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ var alertEvaluator *alert.AlertEvaluator
var schedulerConfigPath string
var alertEvaluatorReady chan struct{}
var enableAutoExpander bool
var compatibleWithNvidiaOperator bool
var compatibleWithNvidiaContainerToolkit bool

func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
Expand Down Expand Up @@ -151,16 +151,11 @@ func main() {
"refer https://prometheus.io/docs/alerting/latest/configuration")
flag.BoolVar(&enableAutoExpander, "enable-auto-expander", false, "if turn on auto expander, "+
"TensorFusion will auto expand Nodes then Pending Pods which caused by insufficient GPU resources found")
flag.BoolVar(&compatibleWithNvidiaOperator, "compatible-with-nvidia-operator", false,
"if enabled, node discovery will wait for NVIDIA GPU Operator toolkit-ready validation before starting")
flag.BoolVar(&compatibleWithNvidiaContainerToolkit, "compatible-with-nvidia-container-toolkit", false,
"if enabled, node discovery will wait for NVIDIA Container Toolkit toolkit-ready validation before starting")

klog.InitFlags(nil)
flag.Parse()

// Set environment variable for utils package to read
if compatibleWithNvidiaOperator {
_ = os.Setenv(constants.CompatibleWithNvidiaOperatorEnv, constants.TrueStringValue)
}
ctrl.SetLogger(klog.NewKlogr())
ctx := context.Background()

Expand Down Expand Up @@ -401,11 +396,12 @@ func startCustomResourceController(
}

if err = (&controller.GPUNodeReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GPUNode"),
Allocator: allocator,
Expander: nodeExpander,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GPUNode"),
Allocator: allocator,
Expander: nodeExpander,
CompatibleWithNvidiaContainerToolkit: compatibleWithNvidiaContainerToolkit,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
os.Exit(1)
Expand Down
7 changes: 4 additions & 3 deletions internal/autoscaler/autoscaler_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,10 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())

err = (&controller.GPUNodeReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GPUNode"),
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GPUNode"),
CompatibleWithNvidiaContainerToolkit: false,
}).SetupWithManager(mgr)
Expect(err).ToNot(HaveOccurred())

Expand Down
11 changes: 6 additions & 5 deletions internal/controller/gpunode_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ import (
// GPUNodeReconciler reconciles a GPUNode object
type GPUNodeReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Allocator *gpuallocator.GpuAllocator
Expander *expander.NodeExpander
Scheme *runtime.Scheme
Recorder record.EventRecorder
Allocator *gpuallocator.GpuAllocator
Expander *expander.NodeExpander
CompatibleWithNvidiaContainerToolkit bool
}

// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -287,7 +288,7 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob(
})
tmpl.Spec.EnableServiceLinks = ptr.To(false)

utils.AddTFNodeDiscoveryConfAfterTemplate(ctx, &tmpl, pool, gpunode.Name)
utils.AddTFNodeDiscoveryConfAfterTemplate(ctx, &tmpl, pool, gpunode.Name, r.CompatibleWithNvidiaContainerToolkit)

// create node-discovery job
job := &batchv1.Job{
Expand Down
9 changes: 5 additions & 4 deletions internal/controller/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,11 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())

err = (&GPUNodeReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GPUNode"),
Allocator: allocator,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GPUNode"),
Allocator: allocator,
CompatibleWithNvidiaContainerToolkit: false,
}).SetupWithManager(mgr)
Expect(err).ToNot(HaveOccurred())

Expand Down
6 changes: 3 additions & 3 deletions internal/utils/compose.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,7 @@ func composeVectorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) {
}
}

func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTemplateSpec, pool *tfv1.GPUPool, gpuNodeName string) {
func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTemplateSpec, pool *tfv1.GPUPool, gpuNodeName string, compatibleWithNvidiaContainerToolkit bool) {
tmpl.Spec.RestartPolicy = v1.RestartPolicyOnFailure
serviceAccountName := GetSelfServiceAccountNameShort()
if serviceAccountName == "" {
Expand All @@ -745,8 +745,8 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
tmpl.Spec.Containers[0].Image = pool.Spec.ComponentConfig.NodeDiscovery.Image
}

// Add initContainer to wait for NVIDIA GPU Operator toolkit-ready validation
if IsCompatibleWithNvidiaOperator() {
// Add initContainer to wait for NVIDIA Container Toolkit toolkit-ready validation
if compatibleWithNvidiaContainerToolkit {
initContainerImage := pool.Spec.ComponentConfig.NodeDiscovery.Image
if initContainerImage == "" {
// Use the same image as the main container if not specified
Expand Down
6 changes: 0 additions & 6 deletions internal/utils/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,6 @@ var nvidiaOperatorProgressiveMigrationEnv = os.Getenv(constants.NvidiaOperatorPr

var isLicensedEnv = os.Getenv(constants.UsingCommercialComponentEnv) == constants.TrueStringValue

var compatibleWithNvidiaOperatorEnv = os.Getenv(constants.CompatibleWithNvidiaOperatorEnv) == constants.TrueStringValue

func init() {
if isLicensedEnv {
ctrl.Log.Info("Enabling none open source components, please make sure you are in trial stage or have bought commercial license. Contact us: support@tensor-fusion.com")
Expand All @@ -197,10 +195,6 @@ func IsProgressiveMigration() bool {
return nvidiaOperatorProgressiveMigrationEnv
}

func IsCompatibleWithNvidiaOperator() bool {
return compatibleWithNvidiaOperatorEnv
}

// For test purpose only
func SetProgressiveMigration(isProgressiveMigration bool) {
nvidiaOperatorProgressiveMigrationEnv = isProgressiveMigration
Expand Down
Loading