From 7586e2e35c7bf70a82cf90041789b4499477b12a Mon Sep 17 00:00:00 2001 From: Joey <569475269@qq.com> Date: Thu, 27 Nov 2025 09:36:01 +0800 Subject: [PATCH] fix: avoid dummy device causing node scale up issue --- internal/webhook/v1/pod_webhook.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 4c6d9322..150c71a6 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -29,6 +29,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/strategicpatch" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -331,7 +332,9 @@ func (m *TensorFusionPodMutator) patchTFClient( container.Resources.Limits = make(corev1.ResourceList) } // Limit is set to actual index value (1-512) for Device Plugin to match Pod - // container.Resources.Limits[constants.PodIndexAnnotation] = resource.MustParse(strconv.Itoa(index)) + // ResourceFit of dummy device already ignored in TF scheduler + container.Resources.Limits[constants.PodIndexAnnotation] = resource.MustParse(strconv.Itoa(index)) + container.Resources.Requests[constants.PodIndexAnnotation] = resource.MustParse("0") if !isLocalGPU { addConnectionForRemoteFixedReplicaVirtualGPU(pod, container, clientConfig)