diff --git a/demos/cuj1-gke-config.md b/demos/cuj1-gke-config.md index d51d2c506..04dfbbc62 100644 --- a/demos/cuj1-gke-config.md +++ b/demos/cuj1-gke-config.md @@ -153,33 +153,6 @@ to render upgrade diffs): helm plugin install https://github.com/databus23/helm-diff ``` -GKE's `ResourceQuota` admission plugin blocks pods that declare -`priorityClassName: system-*-critical` outside `kube-system`. The gpu-operator -controller (and its DaemonSets) request `system-node-critical`, so apply a -permissive quota in the gpu-operator namespace before the helmfile run. -See https://github.com/NVIDIA/aicr/issues/915. - -```shell -kubectl create namespace gpu-operator --dry-run=client -o yaml | kubectl apply -f - -kubectl apply -f - <<'EOF' -apiVersion: v1 -kind: ResourceQuota -metadata: - name: gpu-operator-critical-pods - namespace: gpu-operator -spec: - hard: - pods: "16" - scopeSelector: - matchExpressions: - - operator: In - scopeName: PriorityClass - values: - - system-node-critical - - system-cluster-critical -EOF -``` - `helmfile.yaml` carries the release graph and ordering; helmfile handles parallelism and idempotent re-apply. This will take a few min. diff --git a/pkg/bundler/bundler.go b/pkg/bundler/bundler.go index 9f62e4126..e58e32c7b 100644 --- a/pkg/bundler/bundler.go +++ b/pkg/bundler/bundler.go @@ -43,6 +43,7 @@ import ( "github.com/NVIDIA/aicr/pkg/component" "github.com/NVIDIA/aicr/pkg/errors" "github.com/NVIDIA/aicr/pkg/recipe" + "github.com/NVIDIA/aicr/pkg/serializer" ) // digestAlgoSHA256 is the algorithm key used in attestation digest maps. @@ -1177,7 +1178,157 @@ func (b *DefaultBundler) collectComponentManifests(ctx context.Context, recipeRe // collectComponentPreManifests gathers the pre-phase manifests (those // the bundler will emit BEFORE each component's primary chart). Wired // into each deployer call site in buildDeployer alongside the -// post-phase collector. +// post-phase collector. Also folds in any synthesized pre-manifests +// (e.g. GKE critical-priority ResourceQuotas — see issue #915) so +// every deployer benefits from the same fix without per-deployer +// branching. func (b *DefaultBundler) collectComponentPreManifests(ctx context.Context, recipeResult *recipe.RecipeResult) (map[string]map[string][]byte, error) { - return b.collectComponentManifestsByPhase(ctx, recipeResult, phasePreManifests) + pre, err := b.collectComponentManifestsByPhase(ctx, recipeResult, phasePreManifests) + if err != nil { + return nil, err + } + return b.injectGKECriticalPriorityQuotas(pre, recipeResult) +} + +// gkeCriticalPriorityQuotaPodFloor is the smallest `pods` cap the +// synthesized ResourceQuota carries. The cap is an admission allowlist +// — not a real capacity gate — so the value is intentionally generous. +// The floor handles recipes that did not declare a node count (Nodes +// defaults to 0 in CriteriaSpec when --nodes is omitted on both recipe +// and bundle), so demos and small clusters do not need to specify it. +const gkeCriticalPriorityQuotaPodFloor = 32 + +// gkeCriticalPriorityQuotaPodsPerNode is the multiplier applied to the +// recipe's declared node count. gpu-operator alone runs ~8-10 +// critical-priority DaemonSet pods per GPU node (driver, toolkit, +// device-plugin, GFD, DCGM, DCGM exporter, MIG manager, validator) +// plus the controller Deployment; 32× covers steady-state plus +// rolling-update churn (old + new pods during a chart upgrade) with a +// ~3× safety margin. +const gkeCriticalPriorityQuotaPodsPerNode = 32 + +// gkeCriticalPriorityQuotaName is the metadata.name of the synthesized +// ResourceQuota. Stable across runs so idempotent re-apply by the +// deployer (helmfile / argocd / flux) updates the existing object +// rather than creating duplicates. +const gkeCriticalPriorityQuotaName = "aicr-gke-critical-pods" + +// gkeCriticalPriorityQuotaFilename is the manifest filename injected +// into the pre-manifests map. It is unique to this synthesized object +// and namespaced under a directory prefix so it cannot collide with a +// real PreManifestFiles path declared in a component overlay. +const gkeCriticalPriorityQuotaFilename = "aicr/synthesized/gke-critical-pods-quota.yaml" + +// injectGKECriticalPriorityQuotas appends a synthesized ResourceQuota +// pre-manifest to every component whose ComponentConfig declares +// GKECriticalPriority=true, when the recipe targets GKE. GKE Standard +// ships a kube-system ResourceQuota scoped to the system-*-critical +// PriorityClasses; per the Kubernetes spec, once any cluster-wide +// quota scopes by PriorityClass for those values, pods that request a +// matching priority class can only be created in namespaces that have +// a matching quota. Without the synthesized quota, gpu-operator (and +// any other marked component) hits a 10-minute helmfile-apply timeout +// when its first pod is rejected by admission. See issue #915. +// +// Non-GKE recipes return the input map unchanged, so the additive +// nature of the fix is preserved across services. +func (b *DefaultBundler) injectGKECriticalPriorityQuotas( + pre map[string]map[string][]byte, + recipeResult *recipe.RecipeResult, +) (map[string]map[string][]byte, error) { + + if recipeResult == nil || recipeResult.Criteria == nil || + recipeResult.Criteria.Service != recipe.CriteriaServiceGKE { + + return pre, nil + } + + registry, err := recipe.GetComponentRegistry() + if err != nil { + return nil, errors.Wrap(errors.ErrCodeInternal, + "failed to load component registry for GKE quota synthesis", err) + } + + pods := computeGKECriticalPriorityQuotaPods(recipeResult.Criteria.Nodes) + + if pre == nil { + pre = make(map[string]map[string][]byte) + } + + for _, ref := range recipeResult.ComponentRefs { + cfg := registry.Get(ref.Name) + if cfg == nil || !cfg.GKECriticalPriority { + continue + } + if ref.Namespace == "" { + // Defensive — the recipe resolver fills Namespace from the + // registry's defaultNamespace before bundling. An empty + // namespace here would produce an invalid ResourceQuota, + // so skip with a warning rather than emit broken YAML. + slog.Warn("skipping GKE critical-priority quota: component has no namespace", + "component", ref.Name) + continue + } + manifest, err := renderGKECriticalPriorityQuota(ref.Namespace, pods) + if err != nil { + return nil, errors.Wrap(errors.ErrCodeInternal, + fmt.Sprintf("failed to render GKE critical-priority quota for %s", ref.Name), err) + } + if pre[ref.Name] == nil { + pre[ref.Name] = make(map[string][]byte) + } + pre[ref.Name][gkeCriticalPriorityQuotaFilename] = manifest + } + + return pre, nil +} + +// computeGKECriticalPriorityQuotaPods returns the `hard.pods` value for +// the synthesized ResourceQuota. nodeCount of 0 (the CriteriaSpec +// default when --nodes is omitted) falls through to the floor. +func computeGKECriticalPriorityQuotaPods(nodeCount int) int { + if nodeCount <= 0 { + return gkeCriticalPriorityQuotaPodFloor + } + pods := nodeCount * gkeCriticalPriorityQuotaPodsPerNode + if pods < gkeCriticalPriorityQuotaPodFloor { + return gkeCriticalPriorityQuotaPodFloor + } + return pods +} + +// renderGKECriticalPriorityQuota returns the YAML for a ResourceQuota +// that admits pods with system-*-critical priority classes in the +// given namespace. Uses serializer.MarshalYAMLDeterministic so the +// bytes are stable across runs — the synthesized manifest is part of +// the bundle artifact (checksummed and optionally attested), and +// yaml.v3 walks randomized Go map order, which would otherwise +// produce a different SHA on every invocation. +func renderGKECriticalPriorityQuota(namespace string, pods int) ([]byte, error) { + quota := map[string]any{ + "apiVersion": "v1", + "kind": "ResourceQuota", + "metadata": map[string]any{ + "name": gkeCriticalPriorityQuotaName, + "namespace": namespace, + }, + "spec": map[string]any{ + "hard": map[string]any{ + "pods": strconv.Itoa(pods), + }, + "scopeSelector": map[string]any{ + "matchExpressions": []map[string]any{ + { + "operator": "In", + "scopeName": "PriorityClass", + "values": []string{ + "system-node-critical", + "system-cluster-critical", + }, + }, + }, + }, + }, + } + return serializer.MarshalYAMLDeterministic(quota) } diff --git a/pkg/bundler/bundler_gke_quota_test.go b/pkg/bundler/bundler_gke_quota_test.go new file mode 100644 index 000000000..65d116e23 --- /dev/null +++ b/pkg/bundler/bundler_gke_quota_test.go @@ -0,0 +1,310 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bundler + +import ( + "context" + "strings" + "testing" + + "github.com/NVIDIA/aicr/pkg/recipe" + "gopkg.in/yaml.v3" +) + +// TestComputeGKECriticalPriorityQuotaPods pins the pods cap arithmetic: +// the floor (32) applies when the recipe declared no node count or a +// non-positive count; otherwise nodeCount × 32. +func TestComputeGKECriticalPriorityQuotaPods(t *testing.T) { + tests := []struct { + name string + nodeCount int + want int + }{ + {"unset (zero) → floor", 0, 32}, + {"negative → floor", -1, 32}, + {"one node → 32 (floor matches)", 1, 32}, + {"small cluster", 8, 256}, + {"medium cluster", 100, 3200}, + {"large cluster", 2000, 64000}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := computeGKECriticalPriorityQuotaPods(tt.nodeCount) + if got != tt.want { + t.Errorf("computeGKECriticalPriorityQuotaPods(%d) = %d, want %d", + tt.nodeCount, got, tt.want) + } + }) + } +} + +// TestRenderGKECriticalPriorityQuota verifies the YAML shape so callers +// can rely on a stable manifest (helmfile / argocd / flux re-apply uses +// name + namespace as the identity key). +func TestRenderGKECriticalPriorityQuota(t *testing.T) { + bytes, err := renderGKECriticalPriorityQuota("gpu-operator", 1024) + if err != nil { + t.Fatalf("renderGKECriticalPriorityQuota: %v", err) + } + var doc map[string]any + if err := yaml.Unmarshal(bytes, &doc); err != nil { + t.Fatalf("rendered manifest is not valid YAML: %v", err) + } + if doc["apiVersion"] != "v1" { + t.Errorf("apiVersion = %v, want v1", doc["apiVersion"]) + } + if doc["kind"] != "ResourceQuota" { + t.Errorf("kind = %v, want ResourceQuota", doc["kind"]) + } + meta, _ := doc["metadata"].(map[string]any) + if meta["name"] != gkeCriticalPriorityQuotaName { + t.Errorf("metadata.name = %v, want %q", meta["name"], gkeCriticalPriorityQuotaName) + } + if meta["namespace"] != "gpu-operator" { + t.Errorf("metadata.namespace = %v, want gpu-operator", meta["namespace"]) + } + spec, _ := doc["spec"].(map[string]any) + hard, _ := spec["hard"].(map[string]any) + // pods is emitted as a string ("1024") so yaml.Unmarshal returns a + // string; matches the upstream ResourceQuota quantity convention. + if hard["pods"] != "1024" { + t.Errorf("spec.hard.pods = %v, want \"1024\"", hard["pods"]) + } + scopes, _ := spec["scopeSelector"].(map[string]any) + matches, _ := scopes["matchExpressions"].([]any) + if len(matches) != 1 { + t.Fatalf("expected exactly 1 matchExpression, got %d", len(matches)) + } + expr, _ := matches[0].(map[string]any) + if expr["scopeName"] != "PriorityClass" || expr["operator"] != "In" { + t.Errorf("matchExpression operator/scopeName = %v/%v, want In/PriorityClass", + expr["operator"], expr["scopeName"]) + } + values, _ := expr["values"].([]any) + if len(values) != 2 || values[0] != "system-node-critical" || values[1] != "system-cluster-critical" { + t.Errorf("scope values = %v, want [system-node-critical system-cluster-critical]", values) + } +} + +// TestRenderGKECriticalPriorityQuota_Deterministic guards the +// serializer.MarshalYAMLDeterministic call: the bundle is checksummed +// and optionally attested, so two renders of the same inputs must +// produce byte-identical output. yaml.v3 walks randomized Go map +// order, which would silently regress this if the call ever flipped +// back to the stdlib yaml.Marshal. +func TestRenderGKECriticalPriorityQuota_Deterministic(t *testing.T) { + // 50 iterations is generous; map-order non-determinism typically + // surfaces within ~5 with a multi-key spec. + first, err := renderGKECriticalPriorityQuota("gpu-operator", 320) + if err != nil { + t.Fatalf("first render: %v", err) + } + for i := range 50 { + got, err := renderGKECriticalPriorityQuota("gpu-operator", 320) + if err != nil { + t.Fatalf("render %d: %v", i, err) + } + if string(got) != string(first) { + t.Fatalf("non-deterministic render at iter %d:\nfirst:\n%s\ngot:\n%s", + i, string(first), string(got)) + } + } +} + +// TestInjectGKECriticalPriorityQuotas covers the integration of the +// synthesizer with collectComponentPreManifests. Each case asserts both +// presence/absence and (where applicable) the namespace + pods cap that +// landed in the synthesized manifest. +func TestInjectGKECriticalPriorityQuotas(t *testing.T) { + bundler, err := New() + if err != nil { + t.Fatalf("New() error = %v", err) + } + + const markedComponent = "gpu-operator" // registry has gkeCriticalPriority: true + + tests := []struct { + name string + service recipe.CriteriaServiceType + nodes int + refs []recipe.ComponentRef + wantSynthesized bool // expect the synthesized manifest in the map + wantPodsContains string // substring to look for in the rendered YAML + }{ + { + name: "gke + marked component + node count → synthesized with computed cap", + service: recipe.CriteriaServiceGKE, + nodes: 10, + refs: []recipe.ComponentRef{ + {Name: markedComponent, Namespace: "gpu-operator"}, + }, + wantSynthesized: true, + wantPodsContains: "pods: \"320\"", + }, + { + name: "gke + marked component + zero nodes → floor (32)", + service: recipe.CriteriaServiceGKE, + nodes: 0, + refs: []recipe.ComponentRef{ + {Name: markedComponent, Namespace: "gpu-operator"}, + }, + wantSynthesized: true, + wantPodsContains: "pods: \"32\"", + }, + { + name: "non-gke (eks) + marked component → no synthesis", + service: recipe.CriteriaServiceEKS, + nodes: 100, + refs: []recipe.ComponentRef{ + {Name: markedComponent, Namespace: "gpu-operator"}, + }, + wantSynthesized: false, + }, + { + name: "gke + unmarked component → no synthesis", + service: recipe.CriteriaServiceGKE, + nodes: 10, + refs: []recipe.ComponentRef{ + {Name: "cert-manager", Namespace: "cert-manager"}, + }, + wantSynthesized: false, + }, + { + name: "gke + marked component but empty namespace → skipped (defensive)", + service: recipe.CriteriaServiceGKE, + nodes: 10, + refs: []recipe.ComponentRef{ + {Name: markedComponent, Namespace: ""}, + }, + wantSynthesized: false, + }, + { + name: "gke + mixed: marked + unmarked → only marked is synthesized", + service: recipe.CriteriaServiceGKE, + nodes: 5, + refs: []recipe.ComponentRef{ + {Name: markedComponent, Namespace: "gpu-operator"}, + {Name: "cert-manager", Namespace: "cert-manager"}, + }, + wantSynthesized: true, + wantPodsContains: "pods: \"160\"", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rr := &recipe.RecipeResult{ + Criteria: &recipe.Criteria{Service: tt.service, Nodes: tt.nodes}, + ComponentRefs: tt.refs, + } + pre, err := bundler.collectComponentPreManifests(context.Background(), rr) + if err != nil { + t.Fatalf("collectComponentPreManifests: %v", err) + } + + got, ok := pre[markedComponent][gkeCriticalPriorityQuotaFilename] + if !tt.wantSynthesized { + if ok { + t.Errorf("expected no synthesized quota, got:\n%s", string(got)) + } + return + } + if !ok { + t.Fatalf("expected synthesized quota at %q, got map=%v", + gkeCriticalPriorityQuotaFilename, pre) + } + if !strings.Contains(string(got), tt.wantPodsContains) { + t.Errorf("rendered quota missing %q:\n%s", tt.wantPodsContains, string(got)) + } + if !strings.Contains(string(got), "namespace: gpu-operator") { + t.Errorf("rendered quota missing namespace line:\n%s", string(got)) + } + }) + } +} + +// TestInjectGKECriticalPriorityQuotas_CoexistsWithExistingPreManifests +// pins the additive behavior: the synthesized manifest sits alongside +// any manifests already declared via PreManifestFiles on the same +// component, not replacing them. +func TestInjectGKECriticalPriorityQuotas_CoexistsWithExistingPreManifests(t *testing.T) { + bundler, err := New() + if err != nil { + t.Fatalf("New() error = %v", err) + } + + // Pre-populate the per-component pre-manifest map as + // collectComponentManifestsByPhase would, then run the injector + // directly so we exercise the merge logic without needing a real + // PreManifestFiles path on disk. + pre := map[string]map[string][]byte{ + "gpu-operator": { + "existing/pre/manifest.yaml": []byte("kind: ConfigMap\n"), + }, + } + rr := &recipe.RecipeResult{ + Criteria: &recipe.Criteria{ + Service: recipe.CriteriaServiceGKE, + Nodes: 4, + }, + ComponentRefs: []recipe.ComponentRef{ + {Name: "gpu-operator", Namespace: "gpu-operator"}, + }, + } + out, err := bundler.injectGKECriticalPriorityQuotas(pre, rr) + if err != nil { + t.Fatalf("injectGKECriticalPriorityQuotas: %v", err) + } + + gpu := out["gpu-operator"] + if _, ok := gpu["existing/pre/manifest.yaml"]; !ok { + t.Error("pre-existing manifest entry was dropped") + } + if _, ok := gpu[gkeCriticalPriorityQuotaFilename]; !ok { + t.Error("synthesized manifest not added") + } + if len(gpu) != 2 { + t.Errorf("expected exactly 2 entries (existing + synthesized), got %d: %v", + len(gpu), gpu) + } +} + +// TestInjectGKECriticalPriorityQuotas_NilInputs documents the +// nil-tolerant contract: a nil RecipeResult or nil Criteria returns +// the input unchanged (no panic, no synthesis). +func TestInjectGKECriticalPriorityQuotas_NilInputs(t *testing.T) { + bundler, err := New() + if err != nil { + t.Fatalf("New() error = %v", err) + } + tests := []struct { + name string + in *recipe.RecipeResult + }{ + {"nil recipe result", nil}, + {"nil criteria", &recipe.RecipeResult{}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + out, err := bundler.injectGKECriticalPriorityQuotas(nil, tt.in) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(out) != 0 { + t.Errorf("expected empty map, got %v", out) + } + }) + } +} diff --git a/pkg/recipe/components.go b/pkg/recipe/components.go index 714435b08..e0d5444a5 100644 --- a/pkg/recipe/components.go +++ b/pkg/recipe/components.go @@ -67,6 +67,23 @@ type ComponentConfig struct { // HealthCheck defines custom health check configuration for this component. HealthCheck HealthCheckConfig `yaml:"healthCheck,omitempty"` + + // GKECriticalPriority signals that the component's default chart manifests + // include pods with `priorityClassName: system-node-critical` or + // `system-cluster-critical`. When true and the recipe's + // `criteria.service` is "gke", the bundler synthesizes a permissive + // ResourceQuota into the component's namespace (PreManifestFiles phase) + // so GKE Standard's ResourceQuota admission plugin admits the pods. + // + // GKE Standard ships a kube-system ResourceQuota scoped to + // `system-*-critical` PriorityClasses; per the Kubernetes spec, once + // any quota in the cluster scopes by PriorityClass for those values, + // pods that request a matching priority class can only be created in + // namespaces that have a matching quota. Other services (EKS, AKS, + // OKE, bare-metal) do not ship this default and are unaffected. + // + // See https://github.com/NVIDIA/aicr/issues/915. + GKECriticalPriority bool `yaml:"gkeCriticalPriority,omitempty"` } // HealthCheckConfig defines custom health check settings for a component. diff --git a/recipes/components/gpu-operator/manifests/gke-resource-quota.yaml b/recipes/components/gpu-operator/manifests/gke-resource-quota.yaml deleted file mode 100644 index 554d87cfc..000000000 --- a/recipes/components/gpu-operator/manifests/gke-resource-quota.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# GKE ResourceQuota for GPU Operator namespace. -# -# GKE requires a ResourceQuota scoped to system-critical PriorityClasses -# in namespaces where pods use system-node-critical or system-cluster-critical. -# Without this, pod creation fails with: -# "insufficient quota to match these scopes: [{PriorityClass In -# [system-node-critical system-cluster-critical]}]" ---- -apiVersion: v1 -kind: ResourceQuota -metadata: - name: gcp-critical-pods - namespace: gpu-operator -spec: - hard: - pods: "1000" - scopeSelector: - matchExpressions: - - operator: In - scopeName: PriorityClass - values: - - system-node-critical - - system-cluster-critical diff --git a/recipes/overlays/gke-cos.yaml b/recipes/overlays/gke-cos.yaml index 551bb38d1..35a30c711 100644 --- a/recipes/overlays/gke-cos.yaml +++ b/recipes/overlays/gke-cos.yaml @@ -32,13 +32,15 @@ spec: componentRefs: # GKE-COS-specific GPU Operator overrides - # GKE preinstalls NVIDIA drivers on COS; disable driver installation + # GKE preinstalls NVIDIA drivers on COS; disable driver installation. + # The GKE ResourceQuota that admits system-*-critical pods is synthesized + # by the bundler from registry.yaml (gkeCriticalPriority: true); no + # manifestFiles entry needed here. Issue #915. - name: gpu-operator type: Helm valuesFile: components/gpu-operator/values-gke-cos.yaml manifestFiles: - components/gpu-operator/manifests/dcgm-exporter.yaml - - components/gpu-operator/manifests/gke-resource-quota.yaml # Enable Prometheus persistent storage for GKE (GKE PD CSI is built-in) - name: kube-prometheus-stack diff --git a/recipes/registry.yaml b/recipes/registry.yaml index 1b892f7ee..4f3eb991e 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -79,6 +79,11 @@ components: displayName: gpu-operator valueOverrideKeys: - gpuoperator + # Default chart sets priorityClassName: system-node-critical on the + # controller Deployment and the DaemonSets it manages. On GKE the + # bundler synthesizes a permissive ResourceQuota in the gpu-operator + # namespace so admission control admits these pods. See issue #915. + gkeCriticalPriority: true healthCheck: assertFile: checks/gpu-operator/health-check.yaml helm: