From 179687e1bf430a6b2a272f02119b3992f8b8b34c Mon Sep 17 00:00:00 2001 From: Mark Chmarny Date: Wed, 20 May 2026 09:15:41 -0700 Subject: [PATCH 1/5] test(recipe): enforce per-intent validation phase floor on leaves MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walk each leaf overlay's spec.base chain, resolve the merged ValidationConfig per pkg/recipe/metadata.go Merge semantics, classify by intent/service/platform, and assert the resolved validation contains the required phases. Per-intent floor: - Training (non-Kind) : deployment + conformance - Inference Dynamo/NIM (non-Kind) : deployment + conformance - Inference (plain) : deployment + conformance - Kind (any intent) : deployment + conformance - Performance is warn-only globally until #969 closes the data gap and Azure/OCI testbeds exist; AICR_VALIDATION_FLOOR_STRICT=1 promotes it to required. A knownGaps allowlist tracks the 7 overlays that fail today so the contract can land independently of #969. New overlays added without the floor will fail because they are not allowlisted. Criteria-wildcard overlays (intent: any or service: any) are excluded — they are cross-cutting fragments, not user-selectable leaves. Refs #970 Refs #969 --- pkg/recipe/validation_phase_floor_test.go | 323 ++++++++++++++++++++++ 1 file changed, 323 insertions(+) create mode 100644 pkg/recipe/validation_phase_floor_test.go diff --git a/pkg/recipe/validation_phase_floor_test.go b/pkg/recipe/validation_phase_floor_test.go new file mode 100644 index 000000000..8731df5ad --- /dev/null +++ b/pkg/recipe/validation_phase_floor_test.go @@ -0,0 +1,323 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// validation_phase_floor_test.go enforces a per-intent validation phase +// floor on every leaf overlay. For each user-selectable overlay it walks +// the spec.base chain, resolves the merged ValidationConfig per +// pkg/recipe/metadata.go Merge semantics, classifies the overlay by +// intent/service/platform, and asserts the resolved validation contains +// the required phases. +// +// Closes the loophole that let 27 of 41 GPU overlays drift to +// conformance-only without a CI gate (see issue #970, companion #969). +// +// Per-intent floor: +// Training (non-Kind) : deployment + conformance [performance recommended] +// Inference Dynamo / NIM (non-Kind) : deployment + conformance [performance recommended] +// Inference (plain) : deployment + conformance +// Kind (any intent) : deployment + conformance +// +// Strict toggle: AICR_VALIDATION_FLOOR_STRICT=1 promotes the recommended +// performance phase from warn-only to required. Default OFF until #969 +// closes the data gap and Azure/OCI performance testbeds land. +// +// knownGaps allowlist: overlays tracked by #969 are listed below so the +// contract can land independently of the data fix. Entries downgrade +// failures to logs. Empty this map as #969 closes; the test fails any +// NEW overlay added without the floor because it is not allowlisted. + +package recipe + +import ( + "context" + "fmt" + "os" + "sort" + "strings" + "testing" +) + +const strictEnvVar = "AICR_VALIDATION_FLOOR_STRICT" + +// knownGaps lists leaf overlays that fail the floor today and are tracked +// under #969. Each entry downgrades an Errorf to a Logf prefixed with +// "KNOWN GAP (#969):". Drain this map as #969 lands; delete the map and +// gap handling once empty. +var knownGaps = map[string]bool{ + "gb200-oke-ubuntu-inference-dynamo": true, + "gb200-oke-ubuntu-training-kubeflow": true, + "h100-kind-inference-dynamo": true, + "h100-kind-training-kubeflow": true, + "h100-kind-training-slurm": true, + "rtx-pro-6000-lke-ubuntu-inference": true, + "rtx-pro-6000-lke-ubuntu-training": true, +} + +// classification captures the inputs that drive the per-intent floor. +type classification struct { + Intent CriteriaIntentType + Service CriteriaServiceType + Platform CriteriaPlatformType + IsKind bool +} + +// String renders a classification for failure messages. +func (c classification) String() string { + return fmt.Sprintf("intent=%s service=%s platform=%s kind=%t", + c.Intent, c.Service, c.Platform, c.IsKind) +} + +// requiresPerformance reports whether the per-intent floor recommends +// the performance phase for this classification. +func (c classification) requiresPerformance() bool { + if c.IsKind { + return false + } + if c.Intent == CriteriaIntentTraining { + return true + } + dynamoOrNIM := c.Platform == CriteriaPlatformDynamo || c.Platform == CriteriaPlatformNIM + return c.Intent == CriteriaIntentInference && dynamoOrNIM +} + +// classifyOverlay derives the classification from resolved criteria. +func classifyOverlay(criteria *Criteria) classification { + return classification{ + Intent: criteria.Intent, + Service: criteria.Service, + Platform: criteria.Platform, + IsKind: criteria.Service == CriteriaServiceKind, + } +} + +// resolvedPhases returns the names of phases that are set on v. +func resolvedPhases(v *ValidationConfig) []string { + if v == nil { + return nil + } + var out []string + if v.Readiness != nil { + out = append(out, "readiness") + } + if v.Deployment != nil { + out = append(out, "deployment") + } + if v.Performance != nil { + out = append(out, "performance") + } + if v.Conformance != nil { + out = append(out, "conformance") + } + return out +} + +// resolveValidation walks the base: chain for the named recipe and +// returns the merged ValidationConfig using the same Merge semantics +// the production resolver uses. +func resolveValidation(s *MetadataStore, name string) (*ValidationConfig, error) { + chain, err := s.resolveInheritanceChain(name) + if err != nil { + return nil, err + } + merged := &RecipeMetadataSpec{} + for _, recipe := range chain { + merged.Merge(&recipe.Spec) + } + return merged.Validation, nil +} + +// discoverLeafOverlays returns the names of every user-selectable leaf +// overlay. A leaf has criteria, is not referenced as another overlay's +// spec.base, and is not a criteria-wildcard fragment (one whose +// criteria.intent or criteria.service is "any"). Wildcard overlays are +// cross-cutting fragments applied via the resolver's wildcard-match +// path — see docs/contributor/data.md#criteria-wildcard-overlays — not +// user-selectable entry points subject to the per-intent floor. +// Sorted for deterministic test output. +func discoverLeafOverlays(s *MetadataStore) []string { + referencedAsBase := make(map[string]bool) + for _, overlay := range s.Overlays { + if overlay.Spec.Base != "" { + referencedAsBase[overlay.Spec.Base] = true + } + } + var leaves []string + for name, overlay := range s.Overlays { + if referencedAsBase[name] { + continue + } + c := overlay.Spec.Criteria + if c == nil { + continue + } + if c.Intent == CriteriaIntentAny || c.Service == CriteriaServiceAny { + continue + } + leaves = append(leaves, name) + } + sort.Strings(leaves) + return leaves +} + +// TestOverlayValidationPhaseFloor asserts every leaf overlay's resolved +// validation block contains the per-intent required phases. See file +// header for the floor matrix and the strict-mode toggle. +func TestOverlayValidationPhaseFloor(t *testing.T) { + ctx := context.Background() + store, err := loadMetadataStore(ctx) + if err != nil { + t.Fatalf("loadMetadataStore: %v", err) + } + + strict := os.Getenv(strictEnvVar) == "1" + t.Logf("strict mode (%s=1): %t", strictEnvVar, strict) + t.Logf("knownGaps allowlist size (#969): %d", len(knownGaps)) + + leaves := discoverLeafOverlays(store) + t.Logf("leaf overlays discovered: %d", len(leaves)) + + for _, name := range leaves { + t.Run(name, func(t *testing.T) { + overlay := store.Overlays[name] + class := classifyOverlay(overlay.Spec.Criteria) + + validation, err := resolveValidation(store, name) + if err != nil { + t.Fatalf("resolveValidation: %v", err) + } + phases := resolvedPhases(validation) + + report := func(severity, kind, phase string) string { + return fmt.Sprintf( + "%s overlay %q [%s]\n resolved phases: %s\n missing %s: %s", + severity, name, class, + strings.Join(phases, ", "), + kind, phase, + ) + } + + // fail records a missing required phase. Overlays in knownGaps + // are downgraded to logs so the contract can land before #969 + // finishes closing the data gap. + fail := func(phase string) { + msg := report("FAIL", "required", phase) + if knownGaps[name] { + t.Logf("KNOWN GAP (#969): %s", msg) + return + } + t.Error(msg) + } + + // Required: deployment + conformance for every classification. + if validation == nil || validation.Deployment == nil { + fail("deployment") + } + if validation == nil || validation.Conformance == nil { + fail("conformance") + } + + // Performance: warn-only by default; strict mode promotes to + // required. Either way, knownGaps downgrades the result to + // preserve the allowlist contract. + if class.requiresPerformance() && (validation == nil || validation.Performance == nil) { + if strict { + fail("performance (strict)") + } else { + t.Log(report("WARN", "recommended", "performance")) + } + } + }) + } +} + +// TestClassifyOverlay exercises the classification function across the +// intent x service x platform matrix. +func TestClassifyOverlay(t *testing.T) { + tests := []struct { + name string + intent CriteriaIntentType + service CriteriaServiceType + platform CriteriaPlatformType + wantIsKind bool + wantRequiresPerf bool + }{ + {"training-eks-plain", CriteriaIntentTraining, CriteriaServiceEKS, CriteriaPlatformAny, false, true}, + {"training-aks-kubeflow", CriteriaIntentTraining, CriteriaServiceAKS, CriteriaPlatformKubeflow, false, true}, + {"training-kind", CriteriaIntentTraining, CriteriaServiceKind, CriteriaPlatformAny, true, false}, + {"inference-eks-plain", CriteriaIntentInference, CriteriaServiceEKS, CriteriaPlatformAny, false, false}, + {"inference-eks-dynamo", CriteriaIntentInference, CriteriaServiceEKS, CriteriaPlatformDynamo, false, true}, + {"inference-eks-nim", CriteriaIntentInference, CriteriaServiceEKS, CriteriaPlatformNIM, false, true}, + {"inference-kind-dynamo", CriteriaIntentInference, CriteriaServiceKind, CriteriaPlatformDynamo, true, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := &Criteria{Intent: tt.intent, Service: tt.service, Platform: tt.platform} + class := classifyOverlay(c) + if class.IsKind != tt.wantIsKind { + t.Errorf("IsKind = %v, want %v", class.IsKind, tt.wantIsKind) + } + if class.requiresPerformance() != tt.wantRequiresPerf { + t.Errorf("requiresPerformance() = %v, want %v", + class.requiresPerformance(), tt.wantRequiresPerf) + } + }) + } +} + +// TestResolvedPhases verifies the phase-name extractor for ValidationConfig. +func TestResolvedPhases(t *testing.T) { + tests := []struct { + name string + in *ValidationConfig + want []string + }{ + {"nil config", nil, nil}, + {"empty config", &ValidationConfig{}, nil}, + { + "deployment + conformance", + &ValidationConfig{Deployment: &ValidationPhase{}, Conformance: &ValidationPhase{}}, + []string{"deployment", "conformance"}, + }, + { + "all four", + &ValidationConfig{ + Readiness: &ValidationPhase{}, + Deployment: &ValidationPhase{}, + Performance: &ValidationPhase{}, + Conformance: &ValidationPhase{}, + }, + []string{"readiness", "deployment", "performance", "conformance"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := resolvedPhases(tt.in) + if !equalStringSlice(got, tt.want) { + t.Errorf("got %v, want %v", got, tt.want) + } + }) + } +} + +func equalStringSlice(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} From c06e5a38306dcc5734f817a631c055ea4c55afde Mon Sep 17 00:00:00 2001 From: Mark Chmarny Date: Wed, 20 May 2026 11:40:41 -0700 Subject: [PATCH 2/5] test(recipe): add knownGaps staleness check + non-empty leaf assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback on #989: - Track which knownGaps entries actually downgrade a failure during the run. Fail the test if any entry never fires — that means the overlay has been fixed but the allowlist entry was not removed, which would silently mask future regressions. - Hard-fail when discoverLeafOverlays returns zero leaves. Without this guard, a regression in discovery (e.g., refactor breaks leaf detection) would let the gate pass trivially. Verified by injecting a bogus knownGaps entry — test fails with the expected 'stale knownGaps entries' message and lists the entry to remove. Refs #970 --- pkg/recipe/validation_phase_floor_test.go | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pkg/recipe/validation_phase_floor_test.go b/pkg/recipe/validation_phase_floor_test.go index 8731df5ad..83cc4ccb1 100644 --- a/pkg/recipe/validation_phase_floor_test.go +++ b/pkg/recipe/validation_phase_floor_test.go @@ -186,6 +186,15 @@ func TestOverlayValidationPhaseFloor(t *testing.T) { leaves := discoverLeafOverlays(store) t.Logf("leaf overlays discovered: %d", len(leaves)) + if len(leaves) == 0 { + t.Fatal("no leaf overlays discovered; the floor check would be vacuous — " + + "verify discoverLeafOverlays and the recipes/overlays/ directory") + } + + // triggeredKnownGaps tracks which knownGaps entries actually downgraded + // a failure during this run. Subtests run sequentially (no t.Parallel), + // so plain map writes from the fail closure are safe. + triggeredKnownGaps := make(map[string]bool, len(knownGaps)) for _, name := range leaves { t.Run(name, func(t *testing.T) { @@ -213,6 +222,7 @@ func TestOverlayValidationPhaseFloor(t *testing.T) { fail := func(phase string) { msg := report("FAIL", "required", phase) if knownGaps[name] { + triggeredKnownGaps[name] = true t.Logf("KNOWN GAP (#969): %s", msg) return } @@ -239,6 +249,22 @@ func TestOverlayValidationPhaseFloor(t *testing.T) { } }) } + + // Hygiene: every entry in knownGaps must have downgraded at least one + // failure during this run. Stale entries indicate the allowlist has + // drifted out of sync with the data (e.g., #969 fixed the overlay but + // forgot to remove the entry) — fail so the next reader cleans it up. + var stale []string + for name := range knownGaps { + if !triggeredKnownGaps[name] { + stale = append(stale, name) + } + } + if len(stale) > 0 { + sort.Strings(stale) + t.Errorf("stale knownGaps entries — overlay now meets the floor; "+ + "remove from knownGaps: %s", strings.Join(stale, ", ")) + } } // TestClassifyOverlay exercises the classification function across the From 05b6ae3782b40ec7cc61785265505043990ee684 Mon Sep 17 00:00:00 2001 From: Mark Chmarny Date: Wed, 20 May 2026 13:14:22 -0700 Subject: [PATCH 3/5] fix(recipe): deep-copy Validation in Merge to prevent cached store pollution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RecipeMetadataSpec.Merge aliased other.Validation when the destination was nil, then later wrote phase pointers through that alias. Across successive calls on the cached MetadataStore, this mutated whichever overlay's cached ValidationConfig the alias happened to point at — so `BuildRecipeResult` was order-dependent, and an overlay that genuinely lacked a phase could appear compliant after a prior query planted that phase via pollution. Fix: - Add cloneValidationConfig + cloneValidationPhase helpers that deep- copy phases and their backing slices / NodeSelection. - Clone in Merge whenever a phase pointer is assigned (both the nil-destination branch and the per-phase overwrite branch). - Clone in initBaseMergedSpec so the merged spec never aliases s.Base.Spec.Validation. Regression test in metadata_test.go: two sequential merges into a fresh destination must not mutate the first source. Without the fix the test detects Deployment leaking from the second merge into the first source's ValidationConfig. Refs #970 --- pkg/recipe/metadata.go | 17 ++++++++----- pkg/recipe/metadata_store.go | 4 ++- pkg/recipe/metadata_test.go | 33 ++++++++++++++++++++++++ pkg/recipe/validation.go | 49 ++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 7 deletions(-) diff --git a/pkg/recipe/metadata.go b/pkg/recipe/metadata.go index e3665506a..5cba91b9a 100644 --- a/pkg/recipe/metadata.go +++ b/pkg/recipe/metadata.go @@ -456,22 +456,27 @@ func (s *RecipeMetadataSpec) Merge(other *RecipeMetadataSpec) { return s.ComponentRefs[i].Name < s.ComponentRefs[j].Name }) - // Merge validation config - overlay phases take precedence + // Merge validation config - overlay phases take precedence. Phase + // pointers are cloned (not aliased) so successive merges cannot mutate + // the source's cached ValidationConfig — a previous version aliased + // other.Validation when s.Validation was nil, then later writes through + // s.Validation.Deployment etc. corrupted whichever overlay's cached + // metadata the alias pointed at. if other.Validation != nil { if s.Validation == nil { - s.Validation = other.Validation + s.Validation = cloneValidationConfig(other.Validation) } else { if other.Validation.Readiness != nil { - s.Validation.Readiness = other.Validation.Readiness + s.Validation.Readiness = cloneValidationPhase(other.Validation.Readiness) } if other.Validation.Deployment != nil { - s.Validation.Deployment = other.Validation.Deployment + s.Validation.Deployment = cloneValidationPhase(other.Validation.Deployment) } if other.Validation.Performance != nil { - s.Validation.Performance = other.Validation.Performance + s.Validation.Performance = cloneValidationPhase(other.Validation.Performance) } if other.Validation.Conformance != nil { - s.Validation.Conformance = other.Validation.Conformance + s.Validation.Conformance = cloneValidationPhase(other.Validation.Conformance) } } } diff --git a/pkg/recipe/metadata_store.go b/pkg/recipe/metadata_store.go index 7a1a79baa..9b6bfdffc 100644 --- a/pkg/recipe/metadata_store.go +++ b/pkg/recipe/metadata_store.go @@ -636,11 +636,13 @@ func (s *MetadataStore) buildMixinConstraintCandidateIndex(candidateOverlays []s } // initBaseMergedSpec creates a copy of the base spec for overlay merging. +// Validation is deep-cloned so downstream Merge calls cannot reach back +// into the cached base ValidationConfig and mutate it. func (s *MetadataStore) initBaseMergedSpec() (RecipeMetadataSpec, []string) { mergedSpec := RecipeMetadataSpec{ Constraints: make([]Constraint, len(s.Base.Spec.Constraints)), ComponentRefs: make([]ComponentRef, len(s.Base.Spec.ComponentRefs)), - Validation: s.Base.Spec.Validation, + Validation: cloneValidationConfig(s.Base.Spec.Validation), } copy(mergedSpec.Constraints, s.Base.Spec.Constraints) copy(mergedSpec.ComponentRefs, s.Base.Spec.ComponentRefs) diff --git a/pkg/recipe/metadata_test.go b/pkg/recipe/metadata_test.go index 09149bf62..fc4cf1679 100644 --- a/pkg/recipe/metadata_test.go +++ b/pkg/recipe/metadata_test.go @@ -1027,6 +1027,39 @@ func TestMergeValidationConfig(t *testing.T) { t.Fatal("validation should be preserved from base when overlay has nil") } }) + + // Regression: a prior version of Merge aliased other.Validation when the + // destination's was nil. Subsequent merges then wrote through that alias + // into whichever overlay's cached ValidationConfig the alias pointed at, + // polluting it across calls. The fix deep-copies via cloneValidationConfig. + t.Run("merge does not alias source validation", func(t *testing.T) { + source := &ValidationConfig{ + Conformance: &ValidationPhase{Checks: []string{"check-from-source"}}, + } + first := RecipeMetadataSpec{ + Validation: source, + } + second := RecipeMetadataSpec{ + Validation: &ValidationConfig{ + Deployment: &ValidationPhase{Checks: []string{"deployment-from-second"}}, + }, + } + + // dest starts with nil Validation, so without the fix it would alias + // source. Merging second then plants Deployment via the alias into + // the source — corrupting subsequent reads of the source. + var dest RecipeMetadataSpec + dest.Merge(&first) + dest.Merge(&second) + + if source.Deployment != nil { + t.Errorf("source.Deployment leaked to %v — Merge aliased the source ValidationConfig", + source.Deployment.Checks) + } + if dest.Validation.Conformance == source.Conformance { + t.Error("dest.Validation.Conformance aliases source.Conformance — phase pointers must be cloned") + } + }) } func TestFinalizeRecipeResultIncludesValidation(t *testing.T) { diff --git a/pkg/recipe/validation.go b/pkg/recipe/validation.go index 9225ce643..f1244f800 100644 --- a/pkg/recipe/validation.go +++ b/pkg/recipe/validation.go @@ -14,6 +14,8 @@ package recipe +import "maps" + // ValidationConfig defines validation phases and settings. type ValidationConfig struct { // Readiness defines readiness validation phase settings. @@ -59,3 +61,50 @@ type NodeSelection struct { // ExcludeNodes lists node names to exclude from validation. ExcludeNodes []string `json:"excludeNodes,omitempty" yaml:"excludeNodes,omitempty"` } + +// cloneValidationConfig returns a deep copy of v. RecipeMetadataSpec.Merge +// uses this to avoid aliasing the source's nested phase pointers — without +// it, successive merges mutate whichever cached overlay's ValidationConfig +// the destination aliased. +func cloneValidationConfig(v *ValidationConfig) *ValidationConfig { + if v == nil { + return nil + } + return &ValidationConfig{ + Readiness: cloneValidationPhase(v.Readiness), + Deployment: cloneValidationPhase(v.Deployment), + Performance: cloneValidationPhase(v.Performance), + Conformance: cloneValidationPhase(v.Conformance), + } +} + +// cloneValidationPhase returns a deep copy of p with independent backing +// slices and a freshly allocated NodeSelection, so callers writing through +// the clone cannot reach the source's cached metadata. +func cloneValidationPhase(p *ValidationPhase) *ValidationPhase { + if p == nil { + return nil + } + out := *p + if p.Constraints != nil { + out.Constraints = make([]Constraint, len(p.Constraints)) + copy(out.Constraints, p.Constraints) + } + if p.Checks != nil { + out.Checks = make([]string, len(p.Checks)) + copy(out.Checks, p.Checks) + } + if p.NodeSelection != nil { + ns := *p.NodeSelection + if p.NodeSelection.Selector != nil { + ns.Selector = make(map[string]string, len(p.NodeSelection.Selector)) + maps.Copy(ns.Selector, p.NodeSelection.Selector) + } + if p.NodeSelection.ExcludeNodes != nil { + ns.ExcludeNodes = make([]string, len(p.NodeSelection.ExcludeNodes)) + copy(ns.ExcludeNodes, p.NodeSelection.ExcludeNodes) + } + out.NodeSelection = &ns + } + return &out +} From 853dbeb03d81b329f0b43f37e11954dfd391add7 Mon Sep 17 00:00:00 2001 From: Mark Chmarny Date: Wed, 20 May 2026 13:14:35 -0700 Subject: [PATCH 4/5] test(recipe): gate via production resolver; phase-specific knownGaps Address review feedback on PR #989: - Use store.BuildRecipeResult (the production code path) instead of an ad-hoc base-chain walk. The previous helper missed wildcard overlay contributions and mixins, and its leaf detection excluded concrete intermediate overlays (h100-gke-cos-training, eks-training, etc.) that production returns as maximal-leaf candidates for queries that do not narrow further. - Gateable set expanded from 17 to 56 overlays. Per-overlay queries exercise the same resolver path the CLI/API use. Wildcard-fragment overlays (criteria.intent or criteria.service set to 'any') remain excluded because their criteria are not meaningful user queries. - knownGaps is now map[string]map[string]bool keyed by (overlay, phase). An overlay allowlisted for one missing phase no longer silently suppresses a regression in a different phase. - Allowlist repopulated under production semantics: 39 (overlay, phase) pairs missing deployment. The companion fix in 5d4f4cad eliminates cross-subtest pollution, so the failure set is stable across full and isolated runs. Refs #970 Refs #969 --- pkg/recipe/validation_phase_floor_test.go | 225 +++++++++++++--------- 1 file changed, 129 insertions(+), 96 deletions(-) diff --git a/pkg/recipe/validation_phase_floor_test.go b/pkg/recipe/validation_phase_floor_test.go index 83cc4ccb1..7799cab1b 100644 --- a/pkg/recipe/validation_phase_floor_test.go +++ b/pkg/recipe/validation_phase_floor_test.go @@ -13,14 +13,16 @@ // limitations under the License. // validation_phase_floor_test.go enforces a per-intent validation phase -// floor on every leaf overlay. For each user-selectable overlay it walks -// the spec.base chain, resolves the merged ValidationConfig per -// pkg/recipe/metadata.go Merge semantics, classifies the overlay by -// intent/service/platform, and asserts the resolved validation contains -// the required phases. +// floor on every overlay production can return as a maximal-leaf +// candidate for some query. For each candidate it calls BuildRecipeResult +// with the overlay's own criteria — the same code path the CLI and API +// use — and asserts the resolved validation contains the required phases +// per the candidate's classification. Wildcard fragments (intent or +// service "any") are excluded because their criteria do not correspond +// to a meaningful user query. // -// Closes the loophole that let 27 of 41 GPU overlays drift to -// conformance-only without a CI gate (see issue #970, companion #969). +// Closes the loophole that let GPU overlays drift to conformance-only +// without a CI gate (see issue #970, companion #969). // // Per-intent floor: // Training (non-Kind) : deployment + conformance [performance recommended] @@ -32,10 +34,9 @@ // performance phase from warn-only to required. Default OFF until #969 // closes the data gap and Azure/OCI performance testbeds land. // -// knownGaps allowlist: overlays tracked by #969 are listed below so the -// contract can land independently of the data fix. Entries downgrade -// failures to logs. Empty this map as #969 closes; the test fails any -// NEW overlay added without the floor because it is not allowlisted. +// knownGaps allowlist: keyed by (overlay, phase) so a regression in a +// different phase is not silently masked. Drain as #969 lands; new +// overlay/phase failures that are not allowlisted block CI. package recipe @@ -50,18 +51,51 @@ import ( const strictEnvVar = "AICR_VALIDATION_FLOOR_STRICT" -// knownGaps lists leaf overlays that fail the floor today and are tracked -// under #969. Each entry downgrades an Errorf to a Logf prefixed with -// "KNOWN GAP (#969):". Drain this map as #969 lands; delete the map and -// gap handling once empty. -var knownGaps = map[string]bool{ - "gb200-oke-ubuntu-inference-dynamo": true, - "gb200-oke-ubuntu-training-kubeflow": true, - "h100-kind-inference-dynamo": true, - "h100-kind-training-kubeflow": true, - "h100-kind-training-slurm": true, - "rtx-pro-6000-lke-ubuntu-inference": true, - "rtx-pro-6000-lke-ubuntu-training": true, +// knownGaps lists (overlay, phase) pairs that fail the floor today and +// are tracked under #969. Each entry downgrades an Errorf to a Logf +// prefixed with "KNOWN GAP (#969):". Drain this map as #969 lands per- +// overlay fixes; delete the map entirely once empty. New (overlay, phase) +// failures not in this map block CI. +var knownGaps = map[string]map[string]bool{ + "aks": {"deployment": true}, + "aks-inference": {"deployment": true}, + "aks-training": {"deployment": true}, + "eks": {"deployment": true}, + "eks-inference": {"deployment": true}, + "eks-training": {"deployment": true}, + "gb200-oke-inference": {"deployment": true}, + "gb200-oke-training": {"deployment": true}, + "gb200-oke-ubuntu-inference": {"deployment": true}, + "gb200-oke-ubuntu-inference-dynamo": {"deployment": true}, + "gb200-oke-ubuntu-training": {"deployment": true}, + "gb200-oke-ubuntu-training-kubeflow": {"deployment": true}, + "gke-cos": {"deployment": true}, + "gke-cos-inference": {"deployment": true}, + "gke-cos-training": {"deployment": true}, + "h100-aks-inference": {"deployment": true}, + "h100-aks-ubuntu-inference": {"deployment": true}, + "h100-eks-inference": {"deployment": true}, + "h100-eks-ubuntu-inference": {"deployment": true}, + "h100-gke-cos-inference": {"deployment": true}, + "h100-gke-cos-training": {"deployment": true}, + "h100-gke-cos-training-kubeflow": {"deployment": true}, + "h100-kind-inference": {"deployment": true}, + "h100-kind-inference-dynamo": {"deployment": true}, + "h100-kind-training": {"deployment": true}, + "h100-kind-training-kubeflow": {"deployment": true}, + "h100-kind-training-slurm": {"deployment": true}, + "kind": {"deployment": true}, + "kind-inference": {"deployment": true}, + "lke": {"deployment": true}, + "lke-inference": {"deployment": true}, + "lke-training": {"deployment": true}, + "oke": {"deployment": true}, + "oke-inference": {"deployment": true}, + "oke-training": {"deployment": true}, + "rtx-pro-6000-lke-inference": {"deployment": true}, + "rtx-pro-6000-lke-training": {"deployment": true}, + "rtx-pro-6000-lke-ubuntu-inference": {"deployment": true}, + "rtx-pro-6000-lke-ubuntu-training": {"deployment": true}, } // classification captures the inputs that drive the per-intent floor. @@ -122,41 +156,21 @@ func resolvedPhases(v *ValidationConfig) []string { return out } -// resolveValidation walks the base: chain for the named recipe and -// returns the merged ValidationConfig using the same Merge semantics -// the production resolver uses. -func resolveValidation(s *MetadataStore, name string) (*ValidationConfig, error) { - chain, err := s.resolveInheritanceChain(name) - if err != nil { - return nil, err - } - merged := &RecipeMetadataSpec{} - for _, recipe := range chain { - merged.Merge(&recipe.Spec) - } - return merged.Validation, nil -} - -// discoverLeafOverlays returns the names of every user-selectable leaf -// overlay. A leaf has criteria, is not referenced as another overlay's -// spec.base, and is not a criteria-wildcard fragment (one whose -// criteria.intent or criteria.service is "any"). Wildcard overlays are -// cross-cutting fragments applied via the resolver's wildcard-match -// path — see docs/contributor/data.md#criteria-wildcard-overlays — not -// user-selectable entry points subject to the per-intent floor. -// Sorted for deterministic test output. -func discoverLeafOverlays(s *MetadataStore) []string { - referencedAsBase := make(map[string]bool) - for _, overlay := range s.Overlays { - if overlay.Spec.Base != "" { - referencedAsBase[overlay.Spec.Base] = true - } - } - var leaves []string +// enumerateGateableOverlays returns the names of every overlay production +// can return as a maximal-leaf candidate for some query — every overlay +// with concrete criteria, minus wildcard fragments whose intent or service +// is "any". Wildcard fragments are cross-cutting overlays composed onto +// specific queries — see docs/contributor/data.md#criteria-wildcard-overlays — +// not standalone user-facing entry points. +// +// Concrete intermediate overlays (e.g., h100-gke-cos-training) are NOT +// excluded merely because another overlay references them as spec.base. +// Production's filterToMaximalLeaves is per-query, so an intermediate is +// the maximal leaf for queries that don't narrow further (e.g., no +// platform specified); the gate must cover that case too. +func enumerateGateableOverlays(s *MetadataStore) []string { + var out []string for name, overlay := range s.Overlays { - if referencedAsBase[name] { - continue - } c := overlay.Spec.Criteria if c == nil { continue @@ -164,15 +178,26 @@ func discoverLeafOverlays(s *MetadataStore) []string { if c.Intent == CriteriaIntentAny || c.Service == CriteriaServiceAny { continue } - leaves = append(leaves, name) + out = append(out, name) + } + sort.Strings(out) + return out +} + +// knownGapEntries totals the number of (overlay, phase) downgrade pairs +// in the allowlist for logging. +func knownGapEntries() int { + n := 0 + for _, phases := range knownGaps { + n += len(phases) } - sort.Strings(leaves) - return leaves + return n } -// TestOverlayValidationPhaseFloor asserts every leaf overlay's resolved -// validation block contains the per-intent required phases. See file -// header for the floor matrix and the strict-mode toggle. +// TestOverlayValidationPhaseFloor asserts every gateable overlay's +// production-resolved validation block contains the per-intent required +// phases. See file header for the floor matrix, the strict-mode toggle, +// and the allowlist contract. func TestOverlayValidationPhaseFloor(t *testing.T) { ctx := context.Background() store, err := loadMetadataStore(ctx) @@ -182,30 +207,33 @@ func TestOverlayValidationPhaseFloor(t *testing.T) { strict := os.Getenv(strictEnvVar) == "1" t.Logf("strict mode (%s=1): %t", strictEnvVar, strict) - t.Logf("knownGaps allowlist size (#969): %d", len(knownGaps)) + t.Logf("knownGaps allowlist entries (#969): %d", knownGapEntries()) - leaves := discoverLeafOverlays(store) - t.Logf("leaf overlays discovered: %d", len(leaves)) - if len(leaves) == 0 { - t.Fatal("no leaf overlays discovered; the floor check would be vacuous — " + - "verify discoverLeafOverlays and the recipes/overlays/ directory") + overlays := enumerateGateableOverlays(store) + t.Logf("gateable overlays discovered: %d", len(overlays)) + if len(overlays) == 0 { + t.Fatal("no gateable overlays discovered; the floor check would be vacuous — " + + "verify enumerateGateableOverlays and the recipes/overlays/ directory") } - // triggeredKnownGaps tracks which knownGaps entries actually downgraded - // a failure during this run. Subtests run sequentially (no t.Parallel), - // so plain map writes from the fail closure are safe. - triggeredKnownGaps := make(map[string]bool, len(knownGaps)) + // triggered tracks which (overlay, phase) knownGaps entries actually + // downgraded a failure during this run. Subtests run sequentially + // (no t.Parallel), so plain map writes from the fail closure are safe. + triggered := make(map[string]map[string]bool, len(knownGaps)) - for _, name := range leaves { + for _, name := range overlays { t.Run(name, func(t *testing.T) { overlay := store.Overlays[name] class := classifyOverlay(overlay.Spec.Criteria) - validation, err := resolveValidation(store, name) + // Use the production resolver so the test gates the same + // ValidationConfig the CLI and API actually produce — wildcard + // overlay contributions and mixins included. + result, err := store.BuildRecipeResult(ctx, overlay.Spec.Criteria) if err != nil { - t.Fatalf("resolveValidation: %v", err) + t.Fatalf("BuildRecipeResult: %v", err) } - phases := resolvedPhases(validation) + phases := resolvedPhases(result.Validation) report := func(severity, kind, phase string) string { return fmt.Sprintf( @@ -216,13 +244,16 @@ func TestOverlayValidationPhaseFloor(t *testing.T) { ) } - // fail records a missing required phase. Overlays in knownGaps - // are downgraded to logs so the contract can land before #969 - // finishes closing the data gap. + // fail records a missing required phase. (overlay, phase) + // pairs in knownGaps are downgraded to logs so the contract + // can land before #969 closes the data gap. fail := func(phase string) { msg := report("FAIL", "required", phase) - if knownGaps[name] { - triggeredKnownGaps[name] = true + if knownGaps[name][phase] { + if triggered[name] == nil { + triggered[name] = map[string]bool{} + } + triggered[name][phase] = true t.Logf("KNOWN GAP (#969): %s", msg) return } @@ -230,19 +261,19 @@ func TestOverlayValidationPhaseFloor(t *testing.T) { } // Required: deployment + conformance for every classification. - if validation == nil || validation.Deployment == nil { + if result.Validation == nil || result.Validation.Deployment == nil { fail("deployment") } - if validation == nil || validation.Conformance == nil { + if result.Validation == nil || result.Validation.Conformance == nil { fail("conformance") } // Performance: warn-only by default; strict mode promotes to - // required. Either way, knownGaps downgrades the result to - // preserve the allowlist contract. - if class.requiresPerformance() && (validation == nil || validation.Performance == nil) { + // required. Either way, the knownGaps lookup downgrades the + // result so the allowlist contract holds in both modes. + if class.requiresPerformance() && (result.Validation == nil || result.Validation.Performance == nil) { if strict { - fail("performance (strict)") + fail("performance") } else { t.Log(report("WARN", "recommended", "performance")) } @@ -250,19 +281,21 @@ func TestOverlayValidationPhaseFloor(t *testing.T) { }) } - // Hygiene: every entry in knownGaps must have downgraded at least one - // failure during this run. Stale entries indicate the allowlist has - // drifted out of sync with the data (e.g., #969 fixed the overlay but - // forgot to remove the entry) — fail so the next reader cleans it up. + // Hygiene: every (overlay, phase) entry in knownGaps must have + // downgraded at least one failure. Stale entries indicate the data + // has caught up — remove them so a future regression in that phase + // is not silently masked. var stale []string - for name := range knownGaps { - if !triggeredKnownGaps[name] { - stale = append(stale, name) + for name, phases := range knownGaps { + for phase := range phases { + if !triggered[name][phase] { + stale = append(stale, fmt.Sprintf("%s:%s", name, phase)) + } } } if len(stale) > 0 { sort.Strings(stale) - t.Errorf("stale knownGaps entries — overlay now meets the floor; "+ + t.Errorf("stale knownGaps entries — overlay/phase now meets the floor; "+ "remove from knownGaps: %s", strings.Join(stale, ", ")) } } From 027afb662b6aca7dd9ed98c481c5075d12855d35 Mon Sep 17 00:00:00 2001 From: Mark Chmarny Date: Wed, 20 May 2026 13:15:47 -0700 Subject: [PATCH 5/5] test(recipe): drain h100-gke-cos-training from knownGaps after #991 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yuan's #991 added the deployment phase to h100-gke-cos-training; the fix cascades through the base chain to h100-gke-cos-training-kubeflow. Both overlays now meet the floor on their own — drop both entries so a future regression is no longer silently masked. Refs #970 Refs #969 --- pkg/recipe/validation_phase_floor_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/recipe/validation_phase_floor_test.go b/pkg/recipe/validation_phase_floor_test.go index 7799cab1b..609885b53 100644 --- a/pkg/recipe/validation_phase_floor_test.go +++ b/pkg/recipe/validation_phase_floor_test.go @@ -77,8 +77,6 @@ var knownGaps = map[string]map[string]bool{ "h100-eks-inference": {"deployment": true}, "h100-eks-ubuntu-inference": {"deployment": true}, "h100-gke-cos-inference": {"deployment": true}, - "h100-gke-cos-training": {"deployment": true}, - "h100-gke-cos-training-kubeflow": {"deployment": true}, "h100-kind-inference": {"deployment": true}, "h100-kind-inference-dynamo": {"deployment": true}, "h100-kind-training": {"deployment": true},