diff --git a/.github/actions/kwok-test/action.yml b/.github/actions/kwok-test/action.yml index 02c2f8a3d..76f9f46e5 100644 --- a/.github/actions/kwok-test/action.yml +++ b/.github/actions/kwok-test/action.yml @@ -19,6 +19,10 @@ inputs: recipe: description: 'Recipe name to test' required: true + deployer: + description: 'Deployer to exercise: helm | argocd-oci | argocd-helm-oci | flux-oci' + required: false + default: 'helm' go_version: description: 'Go version to install' required: true @@ -108,7 +112,9 @@ runs: env: KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} run: | - KWOK_CLUSTER=aicr-kwok-test bash kwok/scripts/run-all-recipes.sh ${{ inputs.recipe }} + KWOK_CLUSTER=aicr-kwok-test bash kwok/scripts/run-all-recipes.sh \ + --deployer ${{ inputs.deployer }} \ + ${{ inputs.recipe }} - name: Collect debug artifacts if: failure() @@ -122,6 +128,14 @@ runs: kubectl get nodes -o wide > "/tmp/kwok-debug-artifacts/${cluster}-nodes.txt" || true kubectl get events --all-namespaces --sort-by='.lastTimestamp' > "/tmp/kwok-debug-artifacts/${cluster}-events.txt" || true kubectl get pods --all-namespaces -o wide > "/tmp/kwok-debug-artifacts/${cluster}-pods.txt" || true + kubectl get applications -n argocd -o yaml > "/tmp/kwok-debug-artifacts/${cluster}-argo-apps.yaml" 2>/dev/null || true + kubectl logs -n argocd deploy/argocd-repo-server --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-argo-reposerver.log" 2>/dev/null || true + kubectl logs -n argocd statefulset/argocd-application-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-argo-appcontroller.log" 2>/dev/null || true + kubectl get ocirepositories,kustomizations,helmreleases -A -o yaml > "/tmp/kwok-debug-artifacts/${cluster}-flux-resources.yaml" 2>/dev/null || true + kubectl logs -n flux-system deploy/source-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-flux-source-controller.log" 2>/dev/null || true + kubectl logs -n flux-system deploy/kustomize-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-flux-kustomize-controller.log" 2>/dev/null || true + kubectl logs -n flux-system deploy/helm-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-flux-helm-controller.log" 2>/dev/null || true + kubectl logs -n aicr-registry deploy/registry --tail=200 > "/tmp/kwok-debug-artifacts/${cluster}-registry.log" 2>/dev/null || true done - name: Export Kind logs @@ -137,7 +151,7 @@ runs: if: failure() && inputs.upload_artifacts == 'true' uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: - name: kwok-debug-${{ inputs.recipe }}-${{ github.run_id }} + name: kwok-debug-${{ inputs.recipe }}-${{ inputs.deployer }}-${{ github.run_id }} path: | /tmp/kwok-debug-artifacts/ /tmp/kwok-kind-logs/ diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml index 5ca4be25d..eb43e66c2 100644 --- a/.github/actions/load-versions/action.yml +++ b/.github/actions/load-versions/action.yml @@ -85,6 +85,12 @@ outputs: yq: description: 'yq version' value: ${{ steps.versions.outputs.yq }} + registry_image: + description: 'OCI registry image (in-cluster registry for KWOK deployer-matrix CI)' + value: ${{ steps.versions.outputs.registry_image }} + argocd_chart: + description: 'Argo CD Helm chart version (for KWOK deployer-matrix CI)' + value: ${{ steps.versions.outputs.argocd_chart }} coverage_threshold: description: 'Minimum test coverage percentage' value: ${{ steps.versions.outputs.coverage_threshold }} @@ -148,6 +154,8 @@ runs: echo "chainsaw_sha256_linux_amd64=$(yq eval '.testing_tools.chainsaw_checksums.linux_amd64' .settings.yaml)" >> $GITHUB_OUTPUT echo "chainsaw_sha256_linux_arm64=$(yq eval '.testing_tools.chainsaw_checksums.linux_arm64' .settings.yaml)" >> $GITHUB_OUTPUT echo "yq=$(yq eval '.testing_tools.yq' .settings.yaml)" >> $GITHUB_OUTPUT + echo "registry_image=$(yq eval '.testing_tools.registry_image' .settings.yaml)" >> $GITHUB_OUTPUT + echo "argocd_chart=$(yq eval '.testing_tools.argocd_chart' .settings.yaml)" >> $GITHUB_OUTPUT # Quality thresholds echo "coverage_threshold=$(yq eval '.quality.coverage_threshold' .settings.yaml)" >> $GITHUB_OUTPUT @@ -188,6 +196,8 @@ runs: echo " chainsaw_sha256_linux_amd64: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }}" echo " chainsaw_sha256_linux_arm64: ${{ steps.versions.outputs.chainsaw_sha256_linux_arm64 }}" echo " yq: ${{ steps.versions.outputs.yq }}" + echo " registry_image: ${{ steps.versions.outputs.registry_image }}" + echo " argocd_chart: ${{ steps.versions.outputs.argocd_chart }}" echo " coverage_threshold: ${{ steps.versions.outputs.coverage_threshold }}" echo " lint_timeout: ${{ steps.versions.outputs.lint_timeout }}" echo " test_timeout: ${{ steps.versions.outputs.test_timeout }}" diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index ee09381e0..f5adffee8 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -224,7 +224,7 @@ jobs: # ── Tier 1: PR gate — generic overlays (PR + push, skip on schedule) ── test-tier1: - name: 'Tier 1: ${{ matrix.recipe }}' + name: 'Tier 1: ${{ matrix.recipe }} (${{ matrix.deployer }})' needs: discover if: >- github.event_name != 'schedule' && @@ -236,6 +236,7 @@ jobs: fail-fast: false matrix: recipe: ${{ fromJSON(needs.discover.outputs.tier1) }} + deployer: [helm, argocd-oci, argocd-helm-oci, flux-oci] steps: - name: Checkout Code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -250,6 +251,7 @@ jobs: uses: ./.github/actions/kwok-test with: recipe: ${{ matrix.recipe }} + deployer: ${{ matrix.deployer }} go_version: ${{ steps.versions.outputs.go }} goreleaser_version: ${{ steps.versions.outputs.goreleaser }} kind_version: ${{ steps.versions.outputs.kind }} @@ -300,7 +302,7 @@ jobs: # Per ADR-003: separate concurrency group per SHA so successive merges # to main never cancel in-flight Tier 3 runs. test-tier3: - name: 'Tier 3: ${{ matrix.recipe }}' + name: 'Tier 3: ${{ matrix.recipe }} (${{ matrix.deployer }})' needs: discover concurrency: group: kwok-tier3-${{ github.sha }} @@ -315,6 +317,7 @@ jobs: fail-fast: false matrix: recipe: ${{ fromJSON(needs.discover.outputs.tier3) }} + deployer: [helm, argocd-oci, argocd-helm-oci, flux-oci] steps: - name: Checkout Code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -329,6 +332,7 @@ jobs: uses: ./.github/actions/kwok-test with: recipe: ${{ matrix.recipe }} + deployer: ${{ matrix.deployer }} go_version: ${{ steps.versions.outputs.go }} goreleaser_version: ${{ steps.versions.outputs.goreleaser }} kind_version: ${{ steps.versions.outputs.kind }} diff --git a/.settings.yaml b/.settings.yaml index 448444cfc..130a04b43 100644 --- a/.settings.yaml +++ b/.settings.yaml @@ -85,6 +85,19 @@ testing_tools: yq: 'v4.53.2' # renovate: datasource=github-releases depName=kubernetes-sigs/karpenter depType=testing_tools karpenter: 'v1.12.0' + # In-cluster OCI registry for KWOK deployer-matrix CI (issue #843). + # renovate: datasource=docker depName=registry depType=testing_tools + registry_image: 'registry:2.8.3' + # Argo CD Helm chart for KWOK deployer-matrix CI. v9.5.x pins app v3.4.x, + # the first stable line with native OCI artifact source (issue #843). + # renovate: datasource=helm depName=argo-cd registryUrl=https://argoproj.github.io/argo-helm depType=testing_tools + argocd_chart: '9.5.14' + # Flux 2 release for KWOK deployer-matrix CI flux-oci lane (issue #843). + # install.yaml is downloaded from the GitHub release; source-controller, + # kustomize-controller and helm-controller are the only controllers the + # bundle consumes (OCIRepository -> Kustomization -> HelmRelease). + # renovate: datasource=github-releases depName=fluxcd/flux2 depType=testing_tools + flux_version: 'v2.8.7' # Quality Thresholds quality: diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 7b22113a7..d70008d31 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -522,10 +522,13 @@ make kwok-e2e RECIPE=gb200-eks-training # Test single recipe Recipes with `spec.criteria.service` defined are auto-discovered. KWOK validates scheduling (node selectors, tolerations, resource requests) but not runtime behavior (no container execution or GPU functionality). +For the deployer matrix (argocd / argocd-helm OCI lanes), see [Deployer Matrix Testing](docs/contributor/kwok-testing.md). + | Command | Description | |---------|-------------| | `make kwok-test-all` | Test all recipes in shared cluster (serial) | | `make kwok-e2e RECIPE=` | Full e2e: cluster, nodes, validate | +| `make kwok-test-deployer RECIPE= DEPLOYER=` | Validate single recipe under a specific deployer (`helm`, `argocd-oci`, `argocd-helm-oci`) | | `make kwok-cluster` | Create Kind cluster with KWOK | | `make kwok-status` | Show cluster and node status | | `make kwok-cluster-delete` | Delete cluster | diff --git a/Makefile b/Makefile index c126935ca..ab4f365ed 100644 --- a/Makefile +++ b/Makefile @@ -687,6 +687,20 @@ endif kwok-test-all: build ## Run all KWOK recipe tests in a shared cluster @bash kwok/scripts/run-all-recipes.sh +.PHONY: kwok-test-deployer +kwok-test-deployer: build ## Validate scheduling under a specific deployer (RECIPE=… DEPLOYER=helm|argocd-oci|argocd-helm-oci|flux-oci) +ifndef RECIPE + @echo "Error: RECIPE is required" + @echo "Usage: make kwok-test-deployer RECIPE=eks-training DEPLOYER=argocd-oci" + @exit 1 +endif +ifndef DEPLOYER + @echo "Error: DEPLOYER is required (helm | argocd-oci | argocd-helm-oci | flux-oci)" + @exit 1 +endif + @echo "Validating $(RECIPE) under deployer=$(DEPLOYER)" + bash kwok/scripts/run-all-recipes.sh --deployer $(DEPLOYER) $(RECIPE) + # ============================================================================= # Talos local test harness # ============================================================================= diff --git a/docs/contributor/kwok-testing.md b/docs/contributor/kwok-testing.md new file mode 100644 index 000000000..196bcd1de --- /dev/null +++ b/docs/contributor/kwok-testing.md @@ -0,0 +1,209 @@ +# KWOK Deployer Matrix Testing + +KWOK (Kubernetes WithOut Kubelet) simulates a GPU cluster without real +hardware so CI can validate scheduling shape — node selectors, +tolerations, resource requests — for every recipe. The **deployer +matrix** extends that coverage by re-running the same recipes through +three additional output adapters (`argocd-oci`, `argocd-helm-oci`, +`flux-oci`) in addition to the existing `helm` path. This catches +Argo CD / Flux template regressions and OCI-source compatibility +breaks that the `helm`-only lane could never see. + +For the design rationale and the spike findings that justify the chart +pin and Repository-secret shape, see +[ADR-008](https://github.com/NVIDIA/aicr/blob/main/docs/design/008-kwok-deployer-matrix.md). +For the cluster-level KWOK setup (node profiles, recipe +auto-discovery), see [kwok/README.md](https://github.com/NVIDIA/aicr/blob/main/kwok/README.md). + +## Deployer Coverage Matrix + +| Tier | Trigger | Deployers exercised | +|------|---------|----------------------| +| Tier 1 — generic overlays | every PR + push | `helm`, `argocd-oci`, `argocd-helm-oci`, `flux-oci` | +| Tier 2 — diff-aware accelerator overlays | PR only, conditional on changed files | `helm` only | +| Tier 3 — full overlay set | push to `main` + nightly schedule | `helm`, `argocd-oci`, `argocd-helm-oci`, `flux-oci` | + +### What each lane validates + +| Lane | Pull artifact | Apply manifests | Reconcile to Ready | +|---|---|---|---| +| `helm` | n/a (filesystem) | ✅ (`helm install`) | ✅ pods scheduled | +| `argocd-oci` | ✅ (repo-server OCI pull) | ✅ (Argo CD sync) | ✅ `Synced+Healthy` (or documented partial-pass states) | +| `argocd-helm-oci` | ✅ (`helm pull` OCI) | ✅ (wrapper chart install) | ✅ `Synced+Healthy` (or documented partial-pass states) | +| `flux-oci` | ✅ (source-controller OCI pull) | ✅ (kustomize-controller apply) | ⏳ **deferred** — see [#964](https://github.com/NVIDIA/aicr/issues/964) | + +`flux-oci` intentionally stops at "bundle applied + HelmRelease CRs created" because the bundler's flux deployer emits local-chart HelmReleases with `GitRepository` sources pointing at the placeholder URL `https://github.com/YOUR_ORG/YOUR_REPO.git`. Under OCI consumption helm-controller cannot fetch these charts, and Flux's `dependsOn` cascade then stalls every downstream HelmRelease. Issue [#964](https://github.com/NVIDIA/aicr/issues/964) tracks the proper fix (render local-chart manifests at bundle time OR push each local chart as its own OCI helm artifact). + +For filesystem (Git-source) round-trip coverage of `argocd` / `flux`, see [#963](https://github.com/NVIDIA/aicr/issues/963). + +Tier 2 stays `helm`-only because its job is to verify that an +accelerator-specific overlay still produces a correct bundle when the +diff touches that overlay's inputs. The deployer shape is orthogonal +to that question — re-running the same recipe under Argo CD would only +re-exercise template rendering, which Tier 1 and Tier 3 already cover +on the generic overlays. + +## Running Locally + +`make kwok-test-deployer` is the single entry point that mirrors what +CI runs per matrix cell. It expects a KWOK cluster to already exist. + +```bash +unset GITLAB_TOKEN +make build +make kwok-cluster + +# Single recipe + single deployer +make kwok-test-deployer RECIPE=eks-training DEPLOYER=argocd-oci +``` + +Valid `DEPLOYER` values: `helm`, `argocd-oci`, `argocd-helm-oci`, +`flux-oci`. The target invokes `kwok/scripts/run-all-recipes.sh +--deployer `, which calls `install-infra.sh` once with +`DEPLOYER` exported (in-cluster `registry:2` always; Argo CD for +`argocd-*`; Flux 2 controllers for `flux-oci`), then runs +`validate-scheduling.sh` for the recipe. + +### Registry host port + +The Kind cluster exposes the in-cluster `registry:2` Service on **host +port 5500** (`kwok/kind-config.yaml`'s `extraPortMappings`). The +unconventional choice avoids Apple ControlCenter (AirPlay / Handoff), +which listens on host port 5000 by default on macOS and would otherwise +fail `kind create cluster` with a port-bind error. Linux CI runners have +5500 free as well, so the same default works everywhere. + +The in-cluster NodePort (`30500`) and the Service `containerPort` +(`5000`) are hardcoded and independent of the host-side mapping — +Argo CD's repo-server reaches the registry via Service DNS +(`registry.aicr-registry.svc.cluster.local:5000`) regardless. + +Override `KWOK_REGISTRY_HOST_PORT` only when running against a +non-standard cluster topology (e.g., port 5500 is already in use on +your host). The variable adjusts the reachability probe; you still need +to update `kwok/kind-config.yaml`'s `hostPort` to match. + +## Sweeping All Deployers Locally + +`make kwok-test-all` still defaults to `helm` and there is no +matrix-aware make target — CI does the fan-out via the workflow +matrix. To reproduce the matrix locally, loop in shell: + +```bash +for d in helm argocd-oci argocd-helm-oci flux-oci; do + make kwok-test-deployer RECIPE=eks-training DEPLOYER="$d" || break +done +``` + +For the full recipe set under a single deployer, call the script +directly: + +```bash +bash kwok/scripts/run-all-recipes.sh --deployer argocd-oci +``` + +## Failure Modes and Exit Codes + +The three scripts emit distinct exit codes so callers (CI, the Make +target, local loops) can branch on failure mode rather than parsing +logs. + +| Script | Code | Meaning | +|--------|------|---------| +| `install-infra.sh` | 10 | `yq` missing or required `.settings.yaml` field absent | +| `install-infra.sh` | 20 | Registry Deployment not Ready within 120 s | +| `install-infra.sh` | 21 | Registry not reachable on host port within 60 s | +| `install-infra.sh` | 30 | Argo CD Helm install failed | +| `install-infra.sh` | 31 | `applications.argoproj.io` CRD not Established within 120 s | +| `install-infra.sh` | 40 | Repository secret apply failed | +| `install-infra.sh` | 60 | Flux install manifest apply failed | +| `install-infra.sh` | 61 | Flux controller (source/kustomize/helm) not Ready within 180 s | +| `install-infra.sh` | 62 | Flux CRDs (OCIRepository/Kustomization/HelmRelease) not Established within 60 s | +| `validate-scheduling.sh` | 50 | GitOps sync deadline hit (`KWOK_ARGOCD_SYNC_TIMEOUT` for `argocd-*`, `KWOK_FLUX_SYNC_TIMEOUT` for `flux-oci`); only GitOps lanes can return 50 | +| `run-all-recipes.sh` | 50 | Three consecutive GitOps sync timeouts; tripped the ADR-008 3-strike rule and bailed the whole job | + +Exit code 50 from `validate-scheduling.sh` is intentionally distinct +from generic non-zero exits: a sync-deadline timeout is qualitatively +different from a bundle-render failure or a scheduling-shape mismatch, +and the 3-strike rule in `run-all-recipes.sh` only counts code-50 +strikes. + +## Tuning the Sync Deadline + +Four environment variables shape how long the GitOps lanes wait before +declaring a sync timeout. The Argo CD pair is independent of the Flux +pair so the two GitOps lanes can be tuned separately when a recipe has +deployer-specific reconciliation overhead. + +| Variable | Default | Purpose | +|----------|---------|---------| +| `KWOK_ARGOCD_SYNC_TIMEOUT` | `300` (seconds) | Total deadline for all child Argo CD Applications to reach `Synced` + `Healthy` | +| `KWOK_ARGOCD_ROOT_GRACE` | `30` (seconds) | Grace period for the root Application to appear before deadline accounting starts | +| `KWOK_FLUX_SYNC_TIMEOUT` | `300` (seconds) | Total deadline for the outer `OCIRepository` to fetch + `Kustomization` to apply + `HelmRelease` CRs to be created. `flux-oci` does NOT wait for `HelmRelease` reconciliation to terminal state — see [issue #964](https://github.com/NVIDIA/aicr/issues/964) | +| `KWOK_FLUX_ROOT_GRACE` | `30` (seconds) | Grace period for the outer Kustomization to appear before deadline accounting starts | + +On a clean local Kind cluster the Phase-0 spike observed +Synced+Healthy in roughly 30 seconds. CI runners are slower and +contended, so the 300-second default exists to absorb that variance. +If a local run trips code 50 but the cluster is otherwise healthy, +raise `KWOK_ARGOCD_SYNC_TIMEOUT` / `KWOK_FLUX_SYNC_TIMEOUT` before +assuming the recipe is broken — image pulls on a cold cluster are the +most common cause. + +## Debugging CI Failures + +When the `kwok-test` composite action fails, it uploads an artifact +named `kwok-debug---` containing: + +- `-resources.txt` — `kubectl get all --all-namespaces` +- `-nodes.txt` — `kubectl get nodes -o wide` +- `-events.txt` — events sorted by `lastTimestamp` +- `-pods.txt` — pod listing across all namespaces +- `-argo-apps.yaml` — `applications.argoproj.io` YAML in `argocd` namespace (argocd lanes only) +- `-argo-reposerver.log` — last 500 lines of `argocd-repo-server` +- `-argo-appcontroller.log` — last 500 lines of `argocd-application-controller` +- `-flux-resources.yaml` — YAML for `ocirepositories`, `kustomizations`, and `helmreleases` across all namespaces (flux-oci lane only) +- `-flux-source-controller.log` — last 500 lines of `source-controller` +- `-flux-kustomize-controller.log` — last 500 lines of `kustomize-controller` +- `-flux-helm-controller.log` — last 500 lines of `helm-controller` +- `-registry.log` — last 200 lines of the in-cluster `registry:2` Deployment + +The repo-server log (Argo CD) and source-controller log (Flux) are +the first places to look for OCI-pull failures (plain-HTTP scheme +errors, mediaType mismatches). The application-controller (Argo CD) +and kustomize-controller (Flux) logs show reconciliation decisions +and prune behavior; helm-controller logs surface per-`HelmRelease` +install and upgrade outcomes. + +## Adding a New Deployer Value + +The deployer set is intentionally finite and matches what +`pkg/bundler` emits. To add a new value (say, `argocd-git`): + +1. Add a `case` branch in `kwok/scripts/validate-scheduling.sh`'s + `resolve_argocd_root_app()` (or the equivalent `resolve_flux_root_names()` + if the new lane reconciles via Flux) mapping the new deployer to its + root-resource name, plus any new branches in `generate_bundle` and + `deploy_bundle`. Reuse the existing `argocd-oci` / `flux-oci` branches + as templates. +2. Extend the `DEPLOYER` allowlist in `kwok/scripts/run-all-recipes.sh` + (the `case "$DEPLOYER" in` block in `main()`). +3. Extend the `case "${DEPLOYER}"` branches in + `kwok/scripts/install-infra.sh`'s `main()` so the right controller + stack is installed for the new deployer. +4. Extend the `deployer:` input description in + `.github/actions/kwok-test/action.yml` so callers see the new + value in the input docs. +5. Add the value to the `deployer:` matrix in Tier 1 and Tier 3 of + `.github/workflows/kwok-recipes.yaml`. Leave Tier 2 alone — the + orthogonality rationale above still applies. +6. Add a row to the [Deployer Coverage Matrix](#deployer-coverage-matrix) + table on this page so contributors can discover the new lane. + +If the new value requires changes to the in-cluster infra (a different +registry, a different Argo CD chart, additional CRDs), update +`install-infra.sh` and pin any new versions in `.settings.yaml` rather +than hardcoding. The exit-code taxonomy in +[Failure Modes and Exit Codes](#failure-modes-and-exit-codes) is +contiguous — pick the next free code if a new distinct failure mode +appears. diff --git a/docs/design/008-kwok-deployer-matrix.md b/docs/design/008-kwok-deployer-matrix.md index e4ee0fa96..f1e31d2c7 100644 --- a/docs/design/008-kwok-deployer-matrix.md +++ b/docs/design/008-kwok-deployer-matrix.md @@ -57,23 +57,37 @@ round-trip. ## Decision Add a `deployer` dimension to the existing `kwok-recipes.yaml` matrix -with values `{helm, argocd-oci, argocd-helm-oci}`. Per matrix cell, -boot a shared in-cluster OCI registry and Argo CD; push the bundle -via OCI; observe Argo CD reconciliation to `Synced+Healthy`; then -reuse the existing pod-Running verification. +with values `{helm, argocd-oci, argocd-helm-oci, flux-oci}`. Per matrix +cell, boot a shared in-cluster OCI registry plus the GitOps controllers +needed by the selected deployer (Argo CD for `argocd-*`, Flux 2 for +`flux-oci`); push the bundle via OCI; observe reconciliation to a +controller-specific terminal-pass state; then reuse the existing +pod-Running verification. | Tier | Trigger | Deployer values | |---|---|---| -| Tier 1 — generic overlays | every PR + push | `{helm, argocd-oci, argocd-helm-oci}` | +| Tier 1 — generic overlays | every PR + push | `{helm, argocd-oci, argocd-helm-oci, flux-oci}` | | Tier 2 — diff-aware accelerator overlays | PR only, conditional | `helm` only (unchanged) | -| Tier 3 — full overlay set | push to main + nightly | `{helm, argocd-oci, argocd-helm-oci}` | - -**Rationale for tier scope.** Argo CD template regressions surface on -generic overlays (Tier 1) — that is where the issue's bug class -lives. Full accelerator-specific Argo CD coverage runs nightly and on -push to main without inflating PR latency. Tier 2 stays helm-only -because its purpose is diff-aware accelerator-config validation; -Argo CD shape is orthogonal. +| Tier 3 — full overlay set | push to main + nightly | `{helm, argocd-oci, argocd-helm-oci, flux-oci}` | + +**Rationale for tier scope.** Argo CD / Flux template regressions +surface on generic overlays (Tier 1) — that is where the issue's bug +class lives. Full accelerator-specific GitOps coverage runs nightly +and on push to main without inflating PR latency. Tier 2 stays +helm-only because its purpose is diff-aware accelerator-config +validation; GitOps shape is orthogonal. + +**Rationale for adding `flux-oci`.** AICR ships a `flux` deployer +alongside `argocd` / `argocd-helm`. Without an end-to-end CI lane, +Flux-side bundle-format regressions ship to `main` undetected (same +class of bug `argocd-oci` was added to catch). Flux's source-controller +can consume the AICR generic OCI artifact via an outer `OCIRepository` ++ `Kustomization` wrapper (`spec.layerSelector.mediaType: +application/vnd.oci.image.layer.v1.tar+gzip`, `operation: extract`); +the inner `HelmRelease` resources then reconcile their per-component +charts via helm-controller. The lane validates that bundle shape, not +chart reconciliation completeness under KWOK fidelity gaps (see the +terminal-pass framing below). **Rationale for OCI over Git source.** The user-confirmed production path is OCI. A local Gitea / dumb-HTTP git server would test the @@ -111,9 +125,9 @@ One Kind cluster per matrix cell. Inside each cluster: │ │ │ (KWOK fake pods → Running) │ │ │ │ └─────────────────────────────────┘ │ └───────────┼───────────────────────────────────────────────────────┘ - │ kind extraPortMappings: host 5000 → node 30500 + │ kind extraPortMappings: host 5500 → node 30500 ┌────────┴────────┐ - │ CI runner │ aicr bundle --output oci://localhost:5000/... + │ CI runner │ aicr bundle --output oci://localhost:5500/... └─────────────────┘ ``` @@ -165,24 +179,45 @@ Per `(recipe, deployer)` matrix cell: └── if deployer in {argocd-oci, argocd-helm-oci}: aicr bundle --recipe ... \ --deployer \ - --output oci://localhost:5000/aicr/:- + --output oci://localhost:5500/aicr/:- kubectl apply -f bundle/app-of-apps.yaml (argocd) # or: helm install oci://... (argocd-helm) - wait_for_argocd_sync (new helper) + wait_for_argocd_sync (helper) + verify_pods (existing) + + └── if deployer == flux-oci: + aicr bundle --recipe ... \ + --deployer flux \ + --output oci://localhost:5500/aicr/:-flux-oci + kubectl apply -f - # OCIRepository (layerSelector + + # spec.insecure for plain-HTTP) + kubectl apply -f - # Kustomization (sourceRef -> OCIRepository, + # path: ./, wait: false) + wait_for_flux_sync (helper) verify_pods (existing) 3. Per-recipe teardown - └── cleanup_between_tests [argocd/registry survive] + └── cleanup_between_tests [argocd/flux-system/registry survive] ``` -**`wait_for_argocd_sync` (new helper)** polls +**`wait_for_argocd_sync` (helper)** polls `kubectl get application -n argocd -o json` until every `Application` has `status.sync.status=Synced` and `status.health.status=Healthy` (or `Progressing` after a bounded retry window — KWOK pods may take a few stage-fast cycles to settle). On failure, dumps specific Argo CD conditions for actionable logs. -**OCI tag convention:** `oci://localhost:5000/aicr/:-` +**`wait_for_flux_sync` (helper)** polls the outer `Kustomization` +status until `Ready=True`, then polls `kubectl get helmrelease -A -o +json` until every `HelmRelease` reaches a terminal pass state: +either `Ready=True` (canonical pass) or `Released=True` AND +`status.history[0].status=="deployed"` AND `Stalled!=True` ("bundle +applied; KWOK can't drive readiness probes" pass — analog of Argo CD's +`Synced + Progressing` arm). Reuses the EXIT_ARGOCD_SYNC_TIMEOUT (50) +exit code so `run-all-recipes.sh`'s 3-strike rule fires symmetrically +across both GitOps lanes. + +**OCI tag convention:** `oci://localhost:5500/aicr/:-` avoids stale-tag confusion when running the same recipe across deployer values back-to-back in local development. @@ -193,12 +228,12 @@ deployer values back-to-back in local development. | File | Change | |---|---| | `.settings.yaml` | Pin `argocd_chart` and `registry_image` under `versions:` | -| `kwok/kind-config.yaml` | Add `extraPortMappings: {containerPort: 30500, hostPort: 5000}` | +| `kwok/kind-config.yaml` | Add `extraPortMappings: {containerPort: 30500, hostPort: 5500}` (avoids macOS ControlCenter port-5000 collision) | | `kwok/scripts/install-infra.sh` (new) | Idempotent install of registry + Argo CD + repo-creds Secret | | `kwok/scripts/validate-scheduling.sh` | Add `--deployer ` flag; branch on deployer | | `kwok/scripts/run-all-recipes.sh` | Accept deployer arg; allowlist `aicr-registry` / `argocd` in cleanup | | `.github/actions/kwok-test/action.yml` | New input `deployer`; pass through to `run-all-recipes.sh` | -| `.github/workflows/kwok-recipes.yaml` | Add `deployer: [helm, argocd-oci, argocd-helm-oci]` to Tier 1 and Tier 3 matrices | +| `.github/workflows/kwok-recipes.yaml` | Add `deployer: [helm, argocd-oci, argocd-helm-oci, flux-oci]` to Tier 1 and Tier 3 matrices | | `Makefile` | New `kwok-test-deployer RECIPE=... DEPLOYER=...` target | | `docs/contributor/.md` (KWOK testing page) | Document the matrix and local repro | @@ -220,7 +255,7 @@ deployer values back-to-back in local development. | # | Failure | Detection | Remediation | |---|---|---|---| -| 1 | Registry push fails | `aicr bundle --output oci://...` exits non-zero | `wait_for_registry_ready` polls `curl -sf http://localhost:5000/v2/` before any bundle step; dump registry Pod logs | +| 1 | Registry push fails | `aicr bundle --output oci://...` exits non-zero | `wait_for_registry_ready` polls `curl -sf http://localhost:5500/v2/` before any bundle step; dump registry Pod logs | | 2 | Argo CD install / CRD not ready | `kubectl wait --for=condition=Established crd/applications.argoproj.io --timeout=120s` | Bounded wait; fail fast with `kubectl describe deploy -n argocd` | | 3 | Argo CD cannot pull from local OCI | `Application.status.conditions[].type=ComparisonError` | `dump_argocd_failures` runs `kubectl get applications -n argocd -o json \| jq '.items[].status.conditions'` + repo-server logs | | 4 | Application stuck `OutOfSync` / `Progressing` past deadline | `wait_for_argocd_sync` deadline (default 300 s, override via `KWOK_ARGOCD_SYNC_TIMEOUT`) | Dump each Application's `status.operationState.message` and `status.health.message` | diff --git a/docs/index.yml b/docs/index.yml index 45a87e58c..bacc90d46 100644 --- a/docs/index.yml +++ b/docs/index.yml @@ -76,3 +76,5 @@ navigation: path: contributor/validator.md - page: Maintaining Recipe Contributions path: contributor/maintaining.md + - page: KWOK Deployer Matrix Testing + path: contributor/kwok-testing.md diff --git a/go.mod b/go.mod index a8cc48df2..310b26720 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.26.3 require ( github.com/CycloneDX/cyclonedx-go v0.11.0 + github.com/Masterminds/semver/v3 v3.5.0 github.com/Masterminds/sprig/v3 v3.3.0 github.com/coreos/go-systemd/v22 v22.7.0 github.com/distribution/reference v0.6.0 @@ -42,7 +43,6 @@ require ( dario.cat/mergo v1.0.2 // indirect github.com/IGLOU-EU/go-wildcard v1.0.3 // indirect github.com/Masterminds/goutils v1.1.1 // indirect - github.com/Masterminds/semver/v3 v3.5.0 // indirect github.com/antlr4-go/antlr/v4 v4.13.1 // indirect github.com/aquilax/truncate v1.0.1 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect diff --git a/kwok/kind-config.yaml b/kwok/kind-config.yaml index 66d896ab3..e31291d59 100644 --- a/kwok/kind-config.yaml +++ b/kwok/kind-config.yaml @@ -16,7 +16,10 @@ # # Features: # - DRA (Dynamic Resource Allocation) enabled for GPU scheduling tests -# - CoreDNS skipped - not needed for scheduling tests +# - CoreDNS enabled — required by the Argo CD deployer-matrix lanes so +# in-cluster pods (argocd-server, repo-server, redis, etc.) can resolve +# each other via Service DNS. Costs ~15s extra boot vs the historical +# skip; trivial for KWOK CI. Issue #843. # - Control-plane tainted to prevent test workloads from scheduling there kind: Cluster @@ -27,11 +30,27 @@ runtimeConfig: "resource.k8s.io/v1beta1": "true" nodes: - role: control-plane + # Host port 5500 → NodePort 30500 exposes the in-cluster OCI registry + # (registry.aicr-registry.svc) to the runner for `aicr bundle --output oci://...` + # pushes during the KWOK deployer-matrix lanes. Plain HTTP — gated by + # `--plain-http` on the bundler side and `insecureOCIForceHttp` on the + # Argo CD repo secret. + # + # Why 5500 and not 5000: on macOS, Apple ControlCenter (AirPlay / + # Handoff) listens on host port 5000 by default, and `kind create + # cluster` then fails to bind the extraPortMapping. Linux CI + # runners have port 5500 free as well, so the unconventional + # choice costs nothing there. The Service `containerPort: 5000` + # and NodePort `30500` are unchanged — Argo CD's repo-server + # reaches the registry in-cluster via Service DNS on port 5000 + # regardless of the host-side mapping. Issue #843. + extraPortMappings: + - containerPort: 30500 + hostPort: 5500 + protocol: TCP kubeadmConfigPatches: - | kind: InitConfiguration - skipPhases: - - addon/coredns nodeRegistration: ignorePreflightErrors: - SystemVerification diff --git a/kwok/scripts/install-infra.sh b/kwok/scripts/install-infra.sh new file mode 100755 index 000000000..8c4cc33ed --- /dev/null +++ b/kwok/scripts/install-infra.sh @@ -0,0 +1,544 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# install-infra.sh - Install in-cluster OCI registry + the controllers the +# selected DEPLOYER needs (Argo CD or Flux) for the KWOK deployer-matrix CI +# lane (issue #843). +# +# Idempotent: re-running is a no-op when state already matches. All versions +# are sourced from .settings.yaml (`testing_tools.registry_image`, +# `testing_tools.argocd_chart`, `testing_tools.flux_version`) — never hardcoded. +# +# Components installed (controller installs branch on $DEPLOYER): +# 1. ALWAYS: `registry:2` Deployment + NodePort Service in the +# `aicr-registry` namespace. Service is exposed on nodePort 30500; +# kwok/kind-config.yaml maps host port 5500 -> nodePort 30500 so +# `aicr bundle --output oci://localhost:5500/...` works from the +# runner. Host port 5500 is used (not the more conventional 5000) +# because Apple ControlCenter holds 5000 on macOS by default; Linux +# CI runners are unaffected. +# 2. DEPLOYER=argocd-*: Argo CD via Helm, chart pinned to .settings.yaml. +# Server runs with `server.insecure=true` (no TLS termination needed +# inside the cluster). RepoCreds template secret +# `aicr-oci-repo-creds` with `insecureOCIForceHttp: "true"` so Argo +# CD's repo-server pulls the plain-HTTP in-cluster registry over +# HTTP. Phase 0 spike confirmed `repo-creds` shape does NOT propagate +# this field for OCI repos — see docs/plans/2026-05-18-kwok-deployer- +# matrix.md (F4). +# 3. DEPLOYER=flux-oci: Flux 2 install manifest applied from the pinned +# GitHub release; we only need source-controller (OCIRepository), +# kustomize-controller (Kustomization wrapper) and helm-controller +# (HelmRelease per AICR component). notification-controller and +# image-reflector-controller are tolerated but unused. Flux does NOT +# require a separate repo-creds-style secret for the in-cluster +# registry: per-OCIRepository `spec.insecure: true` toggles plain-HTTP +# at the source level. +# +# Usage: +# DEPLOYER=helm|argocd-oci|argocd-helm-oci|flux-oci ./install-infra.sh +# (DEPLOYER defaults to "helm"; under helm only the registry is installed, +# which is harmless if unused.) +# +# Environment variables: +# KUBECTL_CONTEXT (optional) - kubectl context to use; defaults to +# the current context if unset. +# KWOK_REGISTRY_HOST_PORT (optional, default 5500) - Host port the script +# probes when verifying the registry is +# reachable. MUST match the +# `extraPortMappings[*].hostPort` in +# the Kind cluster config — this script +# does not edit Kind config, so the +# variable only adjusts the local +# reachability check, not the actual +# port mapping. The default matches +# `kwok/kind-config.yaml`; override +# only when running against a +# non-standard cluster topology. The +# in-cluster NodePort (30500) and the +# Service containerPort (5000) are +# hardcoded and not affected by this +# variable. +# +# Environment variables: +# DEPLOYER (optional, default "helm") - Selects which +# controller(s) to install on top of +# the registry. Recognized values: +# helm - registry only +# argocd-oci - registry + Argo CD +# argocd-helm-oci - registry + Argo CD +# flux-oci - registry + Flux 2 +# Unknown values fall back to the +# registry-only path; the caller is +# expected to validate first. +# +# Exit codes (distinct so callers can branch on failure mode): +# 0 success +# 10 `yq` missing or required .settings.yaml field absent +# 20 registry Deployment not Ready in 120s +# 21 registry not reachable on host port within 60s +# 30 Argo CD Helm install failed +# 31 `applications.argoproj.io` CRD not Established in 120s +# 40 Repository secret apply failed +# 60 Flux install manifest apply failed +# 61 Flux controller (source/kustomize/helm) not Ready in 180s +# 62 Flux CRDs (OCIRepository/Kustomization/HelmRelease) not Established in 60s + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +KWOK_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" +REPO_ROOT="$(cd "${KWOK_DIR}/.." && pwd)" +SETTINGS_FILE="${REPO_ROOT}/.settings.yaml" + +# Colors for output (match validate-scheduling.sh / install-karpenter-kwok.sh) +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_debug() { echo -e "${BLUE}[DEBUG]${NC} $*"; } + +# Configuration +REGISTRY_NAMESPACE="aicr-registry" +REGISTRY_NAME="registry" +REGISTRY_PORT=5000 +REGISTRY_NODEPORT=30500 +REGISTRY_HOST_PORT="${KWOK_REGISTRY_HOST_PORT:-5500}" + +ARGOCD_NAMESPACE="argocd" +ARGOCD_RELEASE="argocd" +ARGOCD_REPO_NAME="argo" +ARGOCD_REPO_URL="https://argoproj.github.io/argo-helm" +ARGOCD_HELM_TIMEOUT="5m" + +# Flux 2 configuration. install.yaml is fetched from the pinned GitHub +# release; only source/kustomize/helm controllers are required by the +# flux-oci lane, but the upstream manifest also installs notification- +# controller and image-reflector-controller — they're tolerated. +FLUX_NAMESPACE="flux-system" +FLUX_CONTROLLERS=(source-controller kustomize-controller helm-controller) +FLUX_REQUIRED_CRDS=( + ocirepositories.source.toolkit.fluxcd.io + kustomizations.kustomize.toolkit.fluxcd.io + helmreleases.helm.toolkit.fluxcd.io +) + +# Selected deployer (env var). Branches controller install in main(). +DEPLOYER="${DEPLOYER:-helm}" + +# Wrapper that honors KUBECTL_CONTEXT when set; otherwise uses current context. +kc() { + if [[ -n "${KUBECTL_CONTEXT:-}" ]]; then + kubectl --context="${KUBECTL_CONTEXT}" "$@" + else + kubectl "$@" + fi +} + +hc() { + if [[ -n "${KUBECTL_CONTEXT:-}" ]]; then + helm --kube-context "${KUBECTL_CONTEXT}" "$@" + else + helm "$@" + fi +} + +# Read a required field from .settings.yaml via yq. Exit 10 if yq is missing +# or the field is empty / null / absent. +read_setting() { + local field="$1" + if ! command -v yq &>/dev/null; then + log_error "yq is required but not installed" + log_error "Missing field: ${field}" + exit 10 + fi + local value + value=$(yq eval "${field} // \"\"" "${SETTINGS_FILE}" 2>/dev/null || echo "") + if [[ -z "${value}" || "${value}" == "null" ]]; then + log_error "Required field ${field} is missing or empty in ${SETTINGS_FILE}" + exit 10 + fi + echo "${value}" +} + +# Diagnostic dump for registry failures. +dump_registry_diagnostics() { + log_error "--- registry diagnostics ---" + kc describe deployment -n "${REGISTRY_NAMESPACE}" "${REGISTRY_NAME}" 2>&1 || true + log_error "--- registry pod logs (tail=200) ---" + kc logs -n "${REGISTRY_NAMESPACE}" "deploy/${REGISTRY_NAME}" --tail=200 2>&1 || true +} + +# Diagnostic dump for Argo CD failures. +dump_argocd_diagnostics() { + log_error "--- argocd deployments ---" + kc describe deployment -n "${ARGOCD_NAMESPACE}" 2>&1 || true + log_error "--- argocd events (last 20) ---" + kc get events -n "${ARGOCD_NAMESPACE}" --sort-by=.lastTimestamp 2>&1 | tail -20 || true +} + +# Diagnostic dump for Flux controller failures. +dump_flux_diagnostics() { + log_error "--- flux-system deployments ---" + kc describe deployment -n "${FLUX_NAMESPACE}" 2>&1 || true + log_error "--- flux-system pods ---" + kc get pods -n "${FLUX_NAMESPACE}" -o wide 2>&1 || true + log_error "--- flux-system events (last 20) ---" + kc get events -n "${FLUX_NAMESPACE}" --sort-by=.lastTimestamp 2>&1 | tail -20 || true + log_error "--- flux CRDs ---" + kc get crd -l app.kubernetes.io/instance=flux-system 2>&1 || true +} + +# ------------------------------------------------------------------- +# Step 1: Install in-cluster registry:2 +# ------------------------------------------------------------------- +install_registry() { + local image="$1" + + log_info "Installing in-cluster OCI registry (${image}) in namespace ${REGISTRY_NAMESPACE}..." + + # Namespace — apply, not create, so re-runs upsert cleanly. + kc apply -f - <&1 || true + exit 21 +} + +# ------------------------------------------------------------------- +# Step 2: Install Argo CD via Helm +# ------------------------------------------------------------------- +install_argocd() { + local chart_version="$1" + + log_info "Installing Argo CD (chart ${chart_version}) in namespace ${ARGOCD_NAMESPACE}..." + + if ! helm repo add "${ARGOCD_REPO_NAME}" "${ARGOCD_REPO_URL}" --force-update >/dev/null 2>&1; then + log_warn "helm repo add returned non-zero; continuing (repo may already exist)" + fi + helm repo update >/dev/null + + # `server.insecure=true` disables TLS on the API server. Suitable for an + # in-cluster CI install — not for production. The `\.` escape in the key + # is required so Helm preserves the dot in `server.insecure`. Use + # `--set-string` (not `--set`) so the value renders as the literal string + # "true" in the `argocd-cmd-params-cm` ConfigMap — `configs.params` is a + # stringly-typed map and the chart's `tpl` step drops bool-typed values, + # which would leave the API server without `--insecure`. + if ! hc upgrade --install "${ARGOCD_RELEASE}" "${ARGOCD_REPO_NAME}/argo-cd" \ + --namespace "${ARGOCD_NAMESPACE}" --create-namespace \ + --version "${chart_version}" \ + --set-string 'configs.params.server\.insecure=true' \ + --wait --timeout "${ARGOCD_HELM_TIMEOUT}"; then + log_error "Argo CD Helm install failed" + dump_argocd_diagnostics + exit 30 + fi + + log_info "Waiting for applications.argoproj.io CRD to be Established (timeout 120s)..." + if ! kc wait --for=condition=Established \ + crd/applications.argoproj.io --timeout=120s; then + log_error "applications.argoproj.io CRD did not reach Established within 120s" + kc describe crd applications.argoproj.io 2>&1 || true + exit 31 + fi + + log_info "Argo CD ready" +} + +# ------------------------------------------------------------------- +# Step 3: Apply RepoCreds template for the in-cluster OCI registry. +# +# IMPORTANT: secret-type MUST be `repo-creds` (NOT `repository`). +# Repository-type secrets in Argo CD use EXACT URL match +# (util/db/repository_secrets.go::getRepositorySecret → git.SameURL). +# Different recipes produce different repoURLs (oci://…/aicr/), +# so a single Repository secret cannot cover them all. RepoCreds-type +# secrets use longest-prefix match +# (util/db/repository_secrets.go::getRepositoryCredentialIndex → +# strings.HasPrefix), so one template at the …/aicr prefix covers +# every recipe pushed by validate-scheduling.sh. Argo CD enriches the +# synthetic Repository for each Application with this template's +# insecureOCIForceHttp=true, forcing plain HTTP for the in-cluster +# registry. The Phase 0 spike on chart 7.7.0 (= app v2.13) saw +# repo-creds + type=helm + enableOCI=true NOT propagate +# insecureOCIForceHttp; chart 9.5.x (= app v3.x) with type=oci does. +# ------------------------------------------------------------------- +apply_repo_secret() { + log_info "Applying Argo CD repo-creds template aicr-oci-repo-creds (type=oci, insecureOCIForceHttp=true)..." + + # Best-effort cleanup of the older repository-typed secret from + # previous install-infra runs so an idempotent re-run converges + # cleanly on the prefix-match shape. + kc delete secret -n "${ARGOCD_NAMESPACE}" aicr-oci-repo --ignore-not-found >/dev/null 2>&1 || true + + if ! kc apply -f - <&1 || true + exit 40 + fi + + log_info "Repo-creds template applied (prefix-matches every oci://…/aicr/)" +} + +# ------------------------------------------------------------------- +# Step 4 (flux-oci only): Install Flux 2 controllers via the pinned +# release's install.yaml. +# +# Why install.yaml and not the community Helm chart: install.yaml is the +# canonical, pre-rendered, version-locked manifest the Flux project ships +# for every release. No third-party chart drift, no `helm install` step +# (so no failure mode where Flux installs while helm-controller crashloops +# searching for its own CRDs). Apply, then wait for the three controllers +# the bundle pipeline actually uses to roll out. +# +# OCIRepository.spec.insecure handles plain-HTTP for the in-cluster +# registry, so there's no Flux equivalent of apply_repo_secret(). +# ------------------------------------------------------------------- +install_flux() { + local version="$1" + local install_url="https://github.com/fluxcd/flux2/releases/download/${version}/install.yaml" + + log_info "Installing Flux ${version} from ${install_url}..." + # GitHub releases CDN occasionally returns transient 502s on + # release-asset fetches (same flake class that motivated the + # kwok-stage-fast retry hardening in run-all-recipes.sh). install- + # infra.sh runs in its own bash and doesn't import the retry helper, + # so retry inline. 5 attempts with doubling backoff (5+10+20+40s + # cumulative sleep = 75s) covers the 30-60s typical CDN hiccup + # without slowing the happy path. + local max_attempts=5 + local delay=5 + local attempt=1 + while true; do + if kc apply -f "${install_url}"; then + break + fi + if (( attempt >= max_attempts )); then + log_error "Failed to apply Flux install manifest after ${attempt} attempt(s)" + dump_flux_diagnostics + exit 60 + fi + log_warn "Flux install manifest apply failed (attempt ${attempt}/${max_attempts}); retrying in ${delay}s..." + sleep "${delay}" + attempt=$(( attempt + 1 )) + delay=$(( delay * 2 )) + done + + log_info "Waiting for Flux controllers (source/kustomize/helm) to roll out (timeout 180s each)..." + local ctrl + for ctrl in "${FLUX_CONTROLLERS[@]}"; do + if ! kc rollout status deployment/"${ctrl}" \ + -n "${FLUX_NAMESPACE}" --timeout=180s; then + log_error "Flux controller ${ctrl} did not become Ready within 180s" + dump_flux_diagnostics + exit 61 + fi + done + + log_info "Waiting for Flux CRDs (OCIRepository / Kustomization / HelmRelease) to be Established..." + local crd + for crd in "${FLUX_REQUIRED_CRDS[@]}"; do + if ! kc wait --for=condition=Established \ + "crd/${crd}" --timeout=60s; then + log_error "Flux CRD ${crd} did not reach Established within 60s" + kc describe crd "${crd}" 2>&1 || true + exit 62 + fi + done + + log_info "Flux ready" +} + +# ------------------------------------------------------------------- +# Main +# ------------------------------------------------------------------- +main() { + log_info "==========================================" + log_info "AICR KWOK lane infra install (issue #843)" + log_info "==========================================" + + if [[ ! -f "${SETTINGS_FILE}" ]]; then + log_error "Settings file not found: ${SETTINGS_FILE}" + exit 10 + fi + + local registry_image + registry_image=$(read_setting '.testing_tools.registry_image') + + log_info "registry_image: ${registry_image}" + log_info "DEPLOYER: ${DEPLOYER}" + log_info "registry host port: ${REGISTRY_HOST_PORT}" + if [[ -n "${KUBECTL_CONTEXT:-}" ]]; then + log_info "kubectl context: ${KUBECTL_CONTEXT}" + else + local current_ctx + current_ctx=$(kubectl config current-context 2>/dev/null || echo "") + log_info "kubectl context: ${current_ctx} (current)" + fi + + log_debug "Step 1: install in-cluster registry" + install_registry "${registry_image}" + + # Controller install branches on DEPLOYER. Unknown / "helm" values stop + # after the registry — they don't need a GitOps controller. This avoids + # the ~30-90s of unconditional Argo CD install the prior version paid + # on the helm lane. + case "${DEPLOYER}" in + argocd-oci|argocd-helm-oci) + local argocd_chart + argocd_chart=$(read_setting '.testing_tools.argocd_chart') + log_info "argocd_chart: ${argocd_chart}" + log_debug "Step 2 (argocd): install Argo CD" + install_argocd "${argocd_chart}" + log_debug "Step 3 (argocd): apply repository secret" + apply_repo_secret + ;; + flux-oci) + local flux_version + flux_version=$(read_setting '.testing_tools.flux_version') + log_info "flux_version: ${flux_version}" + log_debug "Step 2 (flux): install Flux 2" + install_flux "${flux_version}" + ;; + helm|"") + log_info "DEPLOYER=${DEPLOYER:-helm}: skipping GitOps controller install" + ;; + *) + log_warn "Unknown DEPLOYER=${DEPLOYER}; installing registry only" + ;; + esac + + log_info "==========================================" + log_info "Infra install complete" + log_info "==========================================" +} + +main "$@" diff --git a/kwok/scripts/lib/cleanup.sh b/kwok/scripts/lib/cleanup.sh new file mode 100644 index 000000000..14bf97854 --- /dev/null +++ b/kwok/scripts/lib/cleanup.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# shellcheck shell=bash +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 + +# Shared cleanup helpers for the KWOK deployer-matrix CI lane. +# +# Sourced by validate-scheduling.sh and run-all-recipes.sh so the +# system-namespace allowlist (the only thing standing between the +# force-finalize sweep and the install-infra-owned ns argocd / +# flux-system / aicr-registry) and the KWOK-context safety guard +# live in exactly one place. Drift in either across scripts has +# silently broken the test infra before — keep it centralized. +# +# Source guard: this file uses constants and functions only, no +# side effects at source time. + +# SYSTEM_NS_PATTERN matches the namespaces that MUST NOT be touched by +# the test cleanup paths: +# - default, kube-*, local-path-storage, kwok-system: cluster +# control-plane and CNI / storage plumbing. Wiping any of them +# bricks the cluster. +# - argocd, flux-system, aicr-registry: owned by install-infra.sh, +# reused across recipes. Wiping any of them forces a full re- +# install (~3-5 min) on every subsequent recipe. +# +# Used in `grep -vE "^(${SYSTEM_NS_PATTERN})$"` and +# `grep -vE "^(${SYSTEM_NS_PATTERN})\s"` call sites. Anchors are NOT +# baked in here so callers can choose space-anchor vs eol-anchor +# depending on the kubectl output they're filtering. +# shellcheck disable=SC2034 # consumed by sourcing scripts +readonly SYSTEM_NS_PATTERN="default|kube-node-lease|kube-public|kube-system|kwok-system|local-path-storage|argocd|aicr-registry|flux-system" + +# ensure_kwok_context_loose checks that the kubectl context is a known +# KWOK Kind cluster name. Cheap check, no node lookup. Use this from +# cold-start paths where kwok nodes may not exist yet (e.g. run-all- +# recipes.sh's initial cleanup before any recipe has applied nodes). +# +# Hard-fail on mismatch — the cleanup paths the caller is about to run +# force-finalize namespaces, and we'd rather refuse to act than risk +# wiping a real cluster. +ensure_kwok_context_loose() { + local ctx + ctx=$(kubectl config current-context 2>/dev/null || true) + case "$ctx" in + kind-aicr-kwok|kind-aicr-kwok-test) + return 0 + ;; + *) + echo "[ERROR] ensure_kwok_context: refusing to run — current kubectl context '${ctx:-}' is not a known KWOK Kind cluster (expected kind-aicr-kwok or kind-aicr-kwok-test)" >&2 + echo "[ERROR] If you intended to run against a different cluster, double-check KUBECONFIG and the cluster's purpose — this script force-finalizes namespaces" >&2 + exit 1 + ;; + esac +} + +# ensure_kwok_context strict-checks BOTH the context name AND that at +# least one node carries `type=kwok` label. Use this from in-recipe +# paths where apply-nodes.sh has already populated the cluster. The +# node-label check is load-bearing: a developer could `kind create +# cluster --name aicr-kwok-test` and then point at a different cluster +# whose context name happens to be the same — only the kwok-typed +# nodes prove apply-nodes.sh ran against THIS cluster. +ensure_kwok_context() { + ensure_kwok_context_loose + + if ! kubectl get nodes -l type=kwok -o name 2>/dev/null | grep -q . ; then + echo "[ERROR] ensure_kwok_context: refusing to run — current context has no kwok-typed nodes" >&2 + echo "[ERROR] Run kwok/scripts/apply-nodes.sh first, or verify you're against the right cluster" >&2 + exit 1 + fi +} diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index b2c43fd31..753768270 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -15,16 +15,58 @@ # Run all KWOK recipe tests sequentially in a shared cluster. # Usage: -# ./run-all-recipes.sh # Run all testable recipes -# ./run-all-recipes.sh recipe1 # Run specific recipe(s) +# ./run-all-recipes.sh # Run all testable recipes (helm) +# ./run-all-recipes.sh recipe1 # Run specific recipe(s) +# ./run-all-recipes.sh --deployer # Select deployer for all recipes +# ./run-all-recipes.sh --deployer= recipe1 # `=` form supported +# +# Flags: +# --deployer Deployer to exercise for every recipe. One of: +# helm (default — original Helm path) +# argocd-oci (in-cluster registry + Argo CD +# app-of-apps) +# argocd-helm-oci (in-cluster registry + Argo CD +# via OCI Helm chart wrapper) +# flux-oci (in-cluster registry + Flux 2 +# OCIRepository → Kustomization → +# HelmRelease) +# When != helm, install-infra.sh is run ONCE before the +# recipe loop with DEPLOYER exported so it installs the +# shared in-cluster OCI registry plus the GitOps +# controllers the selected deployer needs (Argo CD or +# Flux). Their owning namespaces (`aicr-registry`, +# `argocd`, `flux-system`) survive across recipe +# iterations. +# +# Exit codes: +# 0 all recipes passed +# 1 one or more recipes failed (non-sync-timeout); install-infra.sh failed; +# or invalid arguments +# 50 Argo CD sync deadline hit on 3 consecutive recipes (3-strike rule, +# ADR-008 §"Error Handling and Failure Modes"). Distinct so CI can +# distinguish infra/controller-wide failure from per-recipe issues. +# Mirrors validate-scheduling.sh's EXIT_ARGOCD_SYNC_TIMEOUT. set -euo pipefail +# Mirrors validate-scheduling.sh's EXIT_ARGOCD_SYNC_TIMEOUT. Kept in sync by +# the matching documentation in both script headers. +readonly EXIT_ARGOCD_SYNC_TIMEOUT=50 +# Max consecutive sync timeouts before bailing the whole job (ADR-008 +# §"Error Handling and Failure Modes"). Only applied for argocd-* deployers. +readonly ARGOCD_SYNC_STRIKE_LIMIT=3 + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" KWOK_DIR="${SCRIPT_DIR}/.." REPO_ROOT="${KWOK_DIR}/.." OVERLAYS_DIR="${REPO_ROOT}/recipes/overlays" +# Shared cleanup helpers (SYSTEM_NS_PATTERN, ensure_kwok_context). Same +# source for validate-scheduling.sh so the system-ns allowlist regex +# lives in one place. +# shellcheck source=lib/cleanup.sh +source "${SCRIPT_DIR}/lib/cleanup.sh" + CLUSTER_NAME="${KWOK_CLUSTER:-aicr-kwok-test}" CONTEXT="kind-${CLUSTER_NAME}" @@ -41,7 +83,11 @@ retry_command() { local description="$1" shift - local max_attempts="${KWOK_COMMAND_RETRIES:-3}" + # 5 attempts with doubling backoff: 5+10+20+40 = 75s cumulative sleep. + # The prior default (3 attempts → 15s cumulative) was not enough headroom + # for transient GitHub releases CDN 502s on kwok-stage-fast chart fetch, + # which observably persist 30-60s. Overridable via env for local tuning. + local max_attempts="${KWOK_COMMAND_RETRIES:-5}" local delay="${KWOK_COMMAND_RETRY_DELAY:-5}" local attempt=1 @@ -124,13 +170,59 @@ cleanup_between_tests() { # Clean up orphaned CRDs from cert-manager (cluster-scoped, not cleaned by ns delete) kubectl delete crd -l app.kubernetes.io/instance=aicr-test --ignore-not-found 2>/dev/null || true - # Wait for any still-terminating namespaces before next recipe - local system_ns="default|kube-node-lease|kube-public|kube-system|kwok-system|local-path-storage" - local terminating - terminating=$(kubectl get ns -o jsonpath='{range .items[?(@.status.phase=="Terminating")]}{.metadata.name}{"\n"}{end}' 2>/dev/null | grep -vE "^(${system_ns})$" || true) - for ns in $terminating; do - log_info "Waiting for namespace $ns to terminate..." - kubectl wait --for=delete "ns/$ns" --timeout=120s 2>/dev/null || true + # Force-finalize any still-terminating namespaces before the next recipe. + # `argocd`, `flux-system`, and `aicr-registry` are owned by install- + # infra.sh and MUST survive between recipes for the GitOps deployer + # lanes — installing them once and reusing avoids 5+ minutes of Helm + # install + CRD Established + controller-ready overhead per recipe. + # The three namespaces are listed unconditionally (even under --deployer + # helm) because they simply won't exist on the helm path, so the + # allowlist is harmless there. Do not gate by DEPLOYER. + # + # See the long comment in validate-scheduling.sh cleanup() for the + # rationale: KWOK cannot run real controller finalizers, so + # `kubectl wait --for=delete` against a Terminating namespace blocks + # the full --timeout=120s per namespace. Recipes touch ~10 namespaces; + # the previous version spent ~20 minutes of cleanup between recipes + # while doing zero useful work. Force-finalize bypasses the protocol + # by PATCHing spec.finalizers to []; safe in the ephemeral KWOK + # cluster (no real workloads to leak; cluster destroyed at job end). + local system_ns="${SYSTEM_NS_PATTERN}" # see SYSTEM_NS_PATTERN in lib/cleanup.sh + + # Two-phase cleanup, matching validate-scheduling.sh::cleanup_old_tests: + # the previous version only iterated namespaces ALREADY in + # status.phase=Terminating, which silently missed `Active` namespaces + # carrying a `deletionTimestamp` (Helm hook resources mid-uninstall, + # for example) and let them collide on the next recipe. + # + # Phase 1 — issue async deletes for every non-system namespace so the + # controller starts tearing down. Phase 2 — sleep briefly, then + # force-finalize anything still present. The two phases together + # match the in-recipe cleanup so the between-test path can't drift + # away from it over time. + local test_namespaces + test_namespaces=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}' 2>/dev/null \ + | tr ' ' '\n' \ + | { grep -vE "^(${system_ns})$" || true; }) + if [[ -z "$test_namespaces" ]]; then + return 0 + fi + + for ns in $test_namespaces; do + kubectl delete ns "$ns" --ignore-not-found --wait=false 2>/dev/null || true + done + + # Brief grace for graceful deletion (real finalizers run in well + # under a second when there are no real workloads). + sleep 2 + for ns in $test_namespaces; do + if kubectl get ns "$ns" >/dev/null 2>&1; then + log_info "Force-finalizing stuck namespace: $ns" + kubectl get ns "$ns" -o json 2>/dev/null \ + | jq '.spec.finalizers = [] | .metadata.finalizers = []' \ + | kubectl replace --raw "/api/v1/namespaces/${ns}/finalize" -f - >/dev/null 2>&1 \ + || true + fi done } @@ -138,7 +230,7 @@ run_recipe_test() { local recipe="$1" echo "" log_info "========================================" - log_info "Testing recipe: ${recipe}" + log_info "Testing recipe: ${recipe} (deployer=${DEPLOYER})" log_info "========================================" cleanup_between_tests @@ -146,30 +238,171 @@ run_recipe_test() { # Create nodes (pass recipe name, script infers from overlay) bash "${SCRIPT_DIR}/apply-nodes.sh" "${recipe}" || return 1 - # Run validation - bash "${SCRIPT_DIR}/validate-scheduling.sh" "${recipe}" || return 1 + # Run validation. Preserve validate-scheduling.sh's exit code so callers + # can distinguish EXIT_ARGOCD_SYNC_TIMEOUT (50) from generic failures (1) + # for the 3-strike rule. + local rc=0 + bash "${SCRIPT_DIR}/validate-scheduling.sh" --deployer "${DEPLOYER}" "${recipe}" || rc=$? + return "$rc" +} + +# Print usage to stderr. +usage() { + cat >&2 <<'EOF' +Usage: run-all-recipes.sh [--deployer ] [recipe1 recipe2 ...] + +Flags: + --deployer Deployer to exercise for every recipe. One of: + helm (default), argocd-oci, argocd-helm-oci, flux-oci + +Examples: + run-all-recipes.sh + run-all-recipes.sh eks-training + run-all-recipes.sh --deployer argocd-oci eks-training + run-all-recipes.sh --deployer=argocd-helm-oci + run-all-recipes.sh --deployer flux-oci eks-training +EOF } +# Deployer selection (issue #843). Set by --deployer in main(); default helm. +# Under --deployer helm, this script's behavior is byte-identical to pre-#843 +# except the cleanup_between_tests system_ns allowlist gains two harmless +# entries (argocd|aicr-registry) that never exist on the helm path. +DEPLOYER="helm" + main() { local recipes failed=() passed=() + local positional=() + + # Parse flags + positional recipe names. Supports `--deployer X` and + # `--deployer=X`. Anything else with a leading `-` is rejected. + while [[ $# -gt 0 ]]; do + case "$1" in + --deployer) + if [[ $# -lt 2 ]]; then + log_error "--deployer requires a value" + usage + return 1 + fi + DEPLOYER="$2" + shift 2 + ;; + --deployer=*) + DEPLOYER="${1#--deployer=}" + shift + ;; + -h|--help) + usage + return 0 + ;; + -*) + log_error "Unknown option: $1" + usage + return 1 + ;; + *) + positional+=("$1") + shift + ;; + esac + done - if [[ $# -gt 0 ]]; then - recipes="$*" + case "$DEPLOYER" in + helm|argocd-oci|argocd-helm-oci|flux-oci) + ;; + *) + log_error "Invalid --deployer value: '${DEPLOYER}'" + log_error "Must be one of: helm, argocd-oci, argocd-helm-oci, flux-oci" + usage + return 1 + ;; + esac + + if [[ ${#positional[@]} -gt 0 ]]; then + recipes="${positional[*]}" else recipes=$(get_recipes) fi - log_info "Found $(echo "${recipes}" | wc -w | tr -d ' ') recipe(s) to test" + log_info "Found $(echo "${recipes}" | wc -w | tr -d ' ') recipe(s) to test (deployer=${DEPLOYER})" ensure_cluster + # Safety: refuse to start the cleanup sweep unless the kubectl + # context points at a known KWOK Kind cluster. ensure_cluster has + # just created/reused the cluster, but it does not validate the + # context name — a developer running locally with a stale + # KUBECONFIG could otherwise direct the upcoming force-finalize + # sweep at a real cluster. Loose check only: kwok nodes don't + # exist yet on the initial run (apply-nodes.sh runs per recipe + # inside run_recipe_test). + ensure_kwok_context_loose + # Clean up any stale resources from previous runs cleanup_between_tests + # Install shared in-cluster registry + the controller(s) the selected + # deployer needs (Argo CD for argocd-*, Flux for flux-oci). install- + # infra.sh is idempotent but unnecessary work per recipe; a failure + # here is fatal — the lane cannot run without it. DEPLOYER is exported + # so install-infra.sh can branch on it. Its exit code map: 10=yq/ + # settings, 20=registry Deployment not Ready, 21=registry not reachable + # on host port, 30=Argo CD Helm install failed, 31=Application CRD not + # Established, 40=Repository secret apply failed, 60=Flux install + # manifest apply failed, 61=Flux controller not Ready, 62=Flux CRDs not + # Established. Surface the raw rc in the log line so an operator can + # map it. + if [[ "${DEPLOYER}" != "helm" ]]; then + log_info "Installing shared infra (in-cluster registry + controllers for ${DEPLOYER})..." + local infra_rc=0 + DEPLOYER="${DEPLOYER}" bash "${SCRIPT_DIR}/install-infra.sh" || infra_rc=$? + if (( infra_rc != 0 )); then + log_error "install-infra.sh failed (exit code ${infra_rc}); cannot run ${DEPLOYER} deployer lane" + log_error "See kwok/scripts/install-infra.sh header for exit-code taxonomy" + return 1 + fi + fi + + # 3-strike rule for Argo CD sync timeouts (ADR-008 §"Error Handling and + # Failure Modes"). Tracks CONSECUTIVE EXIT_ARGOCD_SYNC_TIMEOUT failures. + # The counter resets on any rc != 50 (pass OR generic failure) so only + # genuinely consecutive sync timeouts trip the bail. A pass-fail-pass- + # timeout-timeout-timeout sequence is 3 consecutive; a timeout-pass- + # timeout-timeout sequence is 2 consecutive. + # Only argocd-* deployers can hit 50; helm path never trips this. + local consecutive_sync_timeouts=0 + for recipe in ${recipes}; do - if run_recipe_test "${recipe}"; then + local rc=0 + run_recipe_test "${recipe}" || rc=$? + if (( rc == 0 )); then passed+=("${recipe}") + consecutive_sync_timeouts=0 else failed+=("${recipe}") + # 3-strike rule is GitOps-only: helm path never returns 50, so + # the gate is currently implicit. Make it explicit by checking + # DEPLOYER too — keeps the contract auditable from this site + # alone instead of a chain of "but only X can produce 50" + # comments scattered across files. + if (( rc == EXIT_ARGOCD_SYNC_TIMEOUT )) \ + && [[ "$DEPLOYER" == argocd-* || "$DEPLOYER" == flux-oci ]]; then + consecutive_sync_timeouts=$(( consecutive_sync_timeouts + 1 )) + log_warn "GitOps sync timeout strike ${consecutive_sync_timeouts}/${ARGOCD_SYNC_STRIKE_LIMIT} on recipe ${recipe}" + if (( consecutive_sync_timeouts >= ARGOCD_SYNC_STRIKE_LIMIT )); then + log_error "========================================" + log_error "3-strike rule tripped: ${consecutive_sync_timeouts} consecutive GitOps sync timeouts" + log_error "Bailing remainder of recipe loop (partial coverage > no coverage is false here)" + log_error "Failed recipes so far:" + for r in "${failed[@]:-}"; do [[ -n "$r" ]] && log_error " - ${r}"; done + log_error "========================================" + cleanup_between_tests + return "$EXIT_ARGOCD_SYNC_TIMEOUT" + fi + else + # Any other failure (including a hypothetical rc=50 from a + # path that shouldn't produce one) breaks the streak. + consecutive_sync_timeouts=0 + fi fi done diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index 8c54adb31..a740a31bd 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -16,22 +16,90 @@ # validate-scheduling.sh - Validate bundle scheduling on KWOK cluster # # Usage: -# ./validate-scheduling.sh +# ./validate-scheduling.sh [--keep-namespace] [--deployer ] # ./validate-scheduling.sh h100-eks-ubuntu-training-kubeflow +# ./validate-scheduling.sh --deployer argocd-oci eks-training +# +# Flags: +# --keep-namespace Preserve releases and namespaces after the run for +# inspection (skips cleanup). +# --deployer Deployer to exercise. One of: +# helm (default — original Helm path) +# argocd-oci (aicr bundle --deployer argocd -> +# OCI push -> kubectl apply +# app-of-apps.yaml) +# argocd-helm-oci (aicr bundle --deployer argocd-helm +# -> OCI push -> helm install OCI +# chart in argocd namespace) +# flux-oci (aicr bundle --deployer flux -> +# OCI push -> apply OCIRepository + +# Kustomization wrappers and let +# Flux reconcile) +# For argocd-* values the in-cluster registry + Argo CD +# must already be installed; for flux-oci the registry +# + Flux 2 controllers must be installed. Both are +# managed by `kwok/scripts/install-infra.sh` keyed on +# the DEPLOYER env var. +# +# Environment variables (GitOps deployers only): +# KWOK_REGISTRY_HOST_PORT Host port the in-cluster registry is reachable on +# from the runner. MUST match the value used when +# running install-infra.sh / kind-config.yaml. +# Default: 5500 (avoids the macOS ControlCenter +# port-5000 collision; Linux CI runners have 5500 +# free as well). +# KWOK_ARGOCD_SYNC_TIMEOUT Seconds to wait for all Argo CD Applications to +# reach Synced + Healthy (or Progressing) before +# failing. Default: 300. +# KWOK_ARGOCD_ROOT_GRACE Seconds to wait for the root Argo CD Application +# (nvidia-stack for argocd-oci, aicr-stack for +# argocd-helm-oci) to appear in the argocd +# namespace before failing. A missing root after +# this grace window indicates `kubectl apply` / +# `helm install` silently produced no Application +# resource. Default: 30. +# KWOK_FLUX_SYNC_TIMEOUT Seconds to wait for the outer Kustomization + +# all HelmReleases to reach a terminal state +# before failing. Default: 300. +# KWOK_FLUX_ROOT_GRACE Seconds to wait for the outer Kustomization to +# appear in the cluster before failing. A missing +# Kustomization after the grace window indicates +# kubectl apply silently produced no resource +# (RBAC denial, CRD missing). Default: 30. # # This script: # 1. Generates a recipe from the cluster config # 2. Generates a bundle from the recipe -# 3. Deploys the bundle to the KWOK cluster +# 3. Deploys the bundle to the KWOK cluster (Helm, Argo CD, or Flux per --deployer) # 4. Verifies all pods reach Running state (KWOK auto-transitions them) # 5. Reports success/failure +# +# Exit codes: +# 0 success +# 1 generic failure (bundle gen, deploy, pod scheduling, RBAC, CRD missing, +# apiserver unreachable, etc.) +# 50 GitOps sync deadline hit (KWOK_ARGOCD_SYNC_TIMEOUT for argocd-*, +# KWOK_FLUX_SYNC_TIMEOUT for flux-oci). Distinct so run-all-recipes.sh +# can apply the 3-strike rule (ADR-008 §"Error Handling and Failure +# Modes") without grepping stderr. Only emitted for GitOps deployers; +# the helm path never returns 50. set -euo pipefail +# Distinct exit code for Argo CD sync timeout. Documented in the header above +# AND in kwok/scripts/run-all-recipes.sh so callers can branch on it. +readonly EXIT_ARGOCD_SYNC_TIMEOUT=50 + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" KWOK_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" REPO_ROOT="$(cd "${KWOK_DIR}/.." && pwd)" +# Shared cleanup helpers (SYSTEM_NS_PATTERN constant, ensure_kwok_context +# safety guard). The same constants and functions are sourced by +# run-all-recipes.sh so the system-ns allowlist lives in one place. +# shellcheck source=lib/cleanup.sh +source "${SCRIPT_DIR}/lib/cleanup.sh" + # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' @@ -51,6 +119,77 @@ WORK_DIR="" AICR_BIN="" KEEP_NAMESPACE=false +# Deployer selection (issue #843). Set by --deployer flag in main(). +# When DEPLOYER != "helm", generate_bundle pushes to OCI and deploy_bundle +# routes through Argo CD instead of `deploy.sh`. The helm path stays +# byte-identical to pre-#843 behavior. +DEPLOYER="helm" +# OCI ref populated in generate_bundle when DEPLOYER != "helm"; consumed by +# deploy_bundle (helm install path) and cleanup. +OCI_REF="" +# In-cluster pull URL for argocd-* deployers (no tag suffix). Populated in +# generate_bundle and consumed by deploy_bundle for the argocd-helm-oci lane, +# where the wrapper chart's `helm install` must be invoked with +# `--set repoURL=` so the Argo CD Applications it renders +# point repo-server at the in-cluster registry Service DNS, not the +# runner-side localhost:5500 (which is unreachable from inside the cluster). +# The argocd-oci lane derives repoURL via `aicr bundle --repo` at bundle +# time, so it doesn't read this variable. Kept as a script-global instead +# of being re-derived in deploy_bundle so the runner→service-DNS rewrite +# rule lives in exactly one place. +OCI_IN_CLUSTER_REF="" +# Root Argo CD Application name emitted by the bundler. Set per-deployer in +# resolve_argocd_root_app(): +# - argocd-oci -> "nvidia-stack" (pkg/bundler/deployer/argocd/ +# templates/app-of-apps.yaml.tmpl) +# - argocd-helm-oci -> "aicr-stack" (pkg/bundler/deployer/argocdhelm/ +# argocdhelm.go: parentAppTemplate) +# Default empty when DEPLOYER=helm; cleanup guards against empty before +# issuing `kubectl delete application`. +ARGOCD_ROOT_APP="" + +# Resolve ARGOCD_ROOT_APP from DEPLOYER. Must be called before any code path +# that consults ARGOCD_ROOT_APP (cleanup, wait_for_argocd_sync). +resolve_argocd_root_app() { + case "$DEPLOYER" in + argocd-oci) ARGOCD_ROOT_APP="nvidia-stack" ;; + argocd-helm-oci) ARGOCD_ROOT_APP="aicr-stack" ;; + *) ARGOCD_ROOT_APP="" ;; + esac +} + +# Flux-specific globals (set in generate_bundle / deploy_bundle for flux-oci). +# Both live in the flux-system namespace (Flux's default control plane). +# +# FLUX_OCIREPOSITORY_NAME is fixed to "aicr-bundle" — the value the bundler +# embeds into local-chart HelmRelease sourceRefs when --deployer flux is +# paired with an OCI --output (see pkg/bundler/config/Config.bundleOCISourceName +# and pkg/cli/bundle.go). Changing the name in one place WITHOUT the other +# breaks chart resolution at reconcile time, so the constant is the contract. +# +# FLUX_KUSTOMIZATION_NAME varies by recipe so back-to-back recipes on the +# same cluster don't collide (one Kustomization per recipe pointing at +# the same shared OCIRepository over time). +# +# Default empty when DEPLOYER != flux-oci; cleanup guards against empty. +FLUX_KUSTOMIZATION_NAME="" +FLUX_OCIREPOSITORY_NAME="" + +# Resolve FLUX_* names from the recipe. Must be called before any code path +# that consults them (cleanup, deploy_bundle, wait_for_flux_sync). +resolve_flux_root_names() { + local recipe="$1" + if [[ "$DEPLOYER" == "flux-oci" ]]; then + # Fixed name — see comment above. Must match what + # pkg/cli/bundle.go sets as BundleOCISourceName. + FLUX_OCIREPOSITORY_NAME="aicr-bundle" + FLUX_KUSTOMIZATION_NAME="aicr-${recipe}" + else + FLUX_OCIREPOSITORY_NAME="" + FLUX_KUSTOMIZATION_NAME="" + fi +} + # Cleanup function cleanup() { local exit_code=$? @@ -64,9 +203,87 @@ cleanup() { log_info "Preserving releases for inspection" log_info "Clean up with: helm list -A (then uninstall each release)" else - # Uninstall all Helm releases from component namespaces + # Argo CD deployers: delete the root Application FIRST so prune + # semantics tear down children before we touch the namespaces. + # The Helm-OCI release (argocd-helm-oci) also needs to come down so + # the next recipe gets a clean argocd namespace. + if [[ "$DEPLOYER" == "argocd-oci" || "$DEPLOYER" == "argocd-helm-oci" ]]; then + if [[ -n "$ARGOCD_ROOT_APP" ]]; then + log_info "Deleting Argo CD root Application ${ARGOCD_ROOT_APP} (prune cascade)..." + timeout 120s kubectl delete application "$ARGOCD_ROOT_APP" -n argocd \ + --ignore-not-found --wait=true 2>/dev/null || true + fi + if [[ "$DEPLOYER" == "argocd-helm-oci" ]]; then + # Defensive guard: the install-infra.sh-managed Argo CD release + # is named "argocd" in the same namespace. RELEASE_NAME-stack + # must never collide with that — uninstalling the infra release + # would tear down Argo CD itself and break subsequent recipes. + local wrapper="${RELEASE_NAME}-stack" + if [[ "$wrapper" == "argocd" ]]; then + log_error "Refusing to uninstall wrapper release named 'argocd'" + log_error "It collides with install-infra.sh's Argo CD release." + log_error "Override KWOK_RELEASE to something other than empty/'argocd'." + else + log_info "Uninstalling argocd-helm-oci wrapper release ${wrapper}..." + log_info "Releases in argocd namespace before uninstall:" + helm list -n argocd 2>/dev/null || true + timeout 60s helm uninstall "$wrapper" -n argocd --wait 2>/dev/null || true + fi + fi + fi + + # Flux deployer: delete the Kustomization FIRST so its prune + # GC removes the per-component HelmReleases before namespace + # cleanup races them. The OCIRepository then has nothing to feed + # and can be removed safely. + if [[ "$DEPLOYER" == "flux-oci" ]]; then + if [[ -n "$FLUX_KUSTOMIZATION_NAME" ]]; then + log_info "Deleting Flux Kustomization ${FLUX_KUSTOMIZATION_NAME} (prune cascade)..." + timeout 120s kubectl delete kustomization "$FLUX_KUSTOMIZATION_NAME" \ + -n flux-system --ignore-not-found --wait=true 2>/dev/null || true + fi + if [[ -n "$FLUX_OCIREPOSITORY_NAME" ]]; then + log_info "Deleting Flux OCIRepository ${FLUX_OCIREPOSITORY_NAME}..." + timeout 30s kubectl delete ocirepository "$FLUX_OCIREPOSITORY_NAME" \ + -n flux-system --ignore-not-found --wait=true 2>/dev/null || true + fi + # Best-effort: clear any HelmReleases the bundle declared in + # component namespaces. helm-controller's finalizer can hang + # under KWOK; force-remove if needed before the namespace + # sweep below force-finalizes the namespace. + local stale_hrs + stale_hrs=$(kubectl get helmrelease -A -o json 2>/dev/null \ + | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name)"' \ + || true) + if [[ -n "$stale_hrs" ]]; then + while IFS=' ' read -r ns name; do + if [[ -n "$ns" && "$ns" != "flux-system" ]]; then + log_info "Deleting HelmRelease ${name} in ${ns}..." + timeout 30s kubectl delete helmrelease "$name" -n "$ns" \ + --ignore-not-found --wait=false 2>/dev/null || true + # Strip finalizers if helm-controller's reconciler is + # stuck. Merge patch (NOT --type=json remove): RFC 6902 + # remove errors when the target path is absent, and the + # `|| true` would then silently swallow the exact case + # this guard exists for — a HelmRelease whose + # helm-controller crashed before attaching its + # finalizer. Merge patch with `null` is idempotent + # whether finalizers are present or absent. + kubectl patch helmrelease "$name" -n "$ns" \ + --type=merge -p '{"metadata":{"finalizers":null}}' \ + >/dev/null 2>&1 || true + fi + done <<< "$stale_hrs" + fi + fi + + # Uninstall all Helm releases from component namespaces (excludes + # argocd because we want the install-infra.sh-managed release to + # survive recipe-to-recipe). local releases - releases=$(helm list -A -o json 2>/dev/null | jq -r '.[] | select(.namespace != "kube-system") | "\(.name) \(.namespace)"' || true) + releases=$(helm list -A -o json 2>/dev/null \ + | jq -r '.[] | select(.namespace != "kube-system") | select(.namespace != "argocd") | "\(.name) \(.namespace)"' \ + || true) if [[ -n "$releases" ]]; then while IFS=' ' read -r name ns; do if [[ -n "$name" ]]; then @@ -77,13 +294,51 @@ cleanup() { fi # Clean up stale APIServices before namespace deletion to prevent hangs cleanup_stale_apiservices - # Delete all non-system namespaces (dynamically covers any recipe) - local system_ns="default|kube-node-lease|kube-public|kube-system|kwok-system|local-path-storage" + # Delete all non-system namespaces (dynamically covers any recipe). + # `argocd`, `flux-system`, and `aicr-registry` are owned by + # install-infra.sh and must survive between recipes for the GitOps + # deployer matrix to work back-to-back. Listed unconditionally + # (harmless when the corresponding controller isn't installed on + # the current lane). + local system_ns="${SYSTEM_NS_PATTERN}" # see SYSTEM_NS_PATTERN in lib/cleanup.sh local test_namespaces test_namespaces=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep -vE "^(${system_ns})$" || true) + # Issue all delete requests asynchronously (--wait=false), then + # force-finalize any namespace stuck in Terminating. + # + # Why force-finalize: KWOK fake controllers can drive Pods through + # phases but cannot run real finalizer reconcilers (Argo CD's + # resources-finalizer.argocd.argoproj.io, kubernetes.io/legacy- + # finalizer, operator-injected finalizers). With --wait=true a + # `kubectl delete ns` blocks until the namespace's finalizers are + # cleared — which never happens in KWOK — and the previous + # --timeout=120s loop spent the full 2 minutes per namespace, + # exhausting the 15-minute job budget on cleanup *after* the + # actual recipe validation had already PASSED. Observed: every + # argocd-oci CI lane on commit c459b51f cancelled in post-test + # cleanup with `##[error]The operation was canceled.` + # + # Force-finalize bypasses the controller protocol by PATCHing the + # namespace's spec.finalizers to []. This is safe in the KWOK + # ephemeral cluster: there are no real workloads to leak, and the + # cluster is destroyed at job end regardless. Do NOT port this + # pattern to non-KWOK / production cleanup. for ns in $test_namespaces; do log_info "Deleting namespace $ns..." - kubectl delete ns "$ns" --ignore-not-found --wait=true --timeout=120s 2>/dev/null || true + kubectl delete ns "$ns" --ignore-not-found --wait=false 2>/dev/null || true + done + # Give graceful deletion a brief window first (real finalizers run + # in well under a second in tests with no real workloads); then + # force-finalize anything left over. + sleep 2 + for ns in $test_namespaces; do + if kubectl get ns "$ns" >/dev/null 2>&1; then + log_debug "Force-finalizing stuck namespace: $ns" + kubectl get ns "$ns" -o json 2>/dev/null \ + | jq '.spec.finalizers = [] | .metadata.finalizers = []' \ + | kubectl replace --raw "/api/v1/namespaces/${ns}/finalize" -f - >/dev/null 2>&1 \ + || true + fi done fi @@ -223,9 +478,13 @@ cleanup_old_tests() { fi fi - # Find and uninstall old Helm releases from component namespaces + # Find and uninstall old Helm releases from component namespaces. + # Skip the argocd namespace so the install-infra.sh-managed Argo CD + # release survives recipe-to-recipe. local releases - releases=$(helm list -A -o json 2>/dev/null | jq -r '.[] | select(.namespace != "kube-system") | "\(.namespace) \(.name)"' || true) + releases=$(helm list -A -o json 2>/dev/null \ + | jq -r '.[] | select(.namespace != "kube-system") | select(.namespace != "argocd") | "\(.namespace) \(.name)"' \ + || true) if [[ -n "$releases" ]]; then log_info "Uninstalling old releases..." echo "$releases" | while read -r ns release; do @@ -240,21 +499,51 @@ cleanup_old_tests() { # These can cause namespace deletion to hang with "stale GroupVersion discovery" errors cleanup_stale_apiservices - # Delete all non-system namespaces (dynamically covers any recipe) - local system_ns="default|kube-node-lease|kube-public|kube-system|kwok-system|local-path-storage" + # Delete all non-system namespaces (dynamically covers any recipe). + # `argocd` and `aicr-registry` are owned by install-infra.sh and must + # survive between recipes. + # + # See the long comment in cleanup() for the rationale: --wait=true on + # a KWOK cluster blocks the full --timeout=120s per namespace because + # KWOK can't run real finalizers (Argo CD's resources-finalizer, + # operator-injected finalizers, etc.). Issue async + force-finalize. + local system_ns="${SYSTEM_NS_PATTERN}" # see SYSTEM_NS_PATTERN in lib/cleanup.sh local test_namespaces test_namespaces=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep -vE "^(${system_ns})$" || true) for ns in $test_namespaces; do log_info "Removing namespace $ns..." - kubectl delete ns "$ns" --ignore-not-found --wait=true --timeout=120s 2>/dev/null || true + kubectl delete ns "$ns" --ignore-not-found --wait=false 2>/dev/null || true + done + sleep 2 + for ns in $test_namespaces; do + if kubectl get ns "$ns" >/dev/null 2>&1; then + log_debug "Force-finalizing stuck namespace: $ns" + kubectl get ns "$ns" -o json 2>/dev/null \ + | jq '.spec.finalizers = [] | .metadata.finalizers = []' \ + | kubectl replace --raw "/api/v1/namespaces/${ns}/finalize" -f - >/dev/null 2>&1 \ + || true + fi done - # Also clean up legacy aicr-kwok-test namespaces + # Also clean up legacy aicr-kwok-test namespaces (async + force-finalize). local old_namespaces old_namespaces=$(kubectl get ns -o name 2>/dev/null | grep "namespace/aicr-kwok-test" || true) if [[ -n "$old_namespaces" ]]; then log_info "Removing old test namespaces..." - echo "$old_namespaces" | xargs kubectl delete --wait=true --timeout=120s 2>/dev/null || true + echo "$old_namespaces" | xargs kubectl delete --wait=false 2>/dev/null || true + sleep 2 + # Force-finalize any of those that linger. + while read -r ns_ref; do + local ns_name="${ns_ref#namespace/}" + if [[ -z "$ns_name" ]]; then continue; fi + if kubectl get ns "$ns_name" >/dev/null 2>&1; then + log_debug "Force-finalizing legacy namespace: $ns_name" + kubectl get ns "$ns_name" -o json 2>/dev/null \ + | jq '.spec.finalizers = [] | .metadata.finalizers = []' \ + | kubectl replace --raw "/api/v1/namespaces/${ns_name}/finalize" -f - >/dev/null 2>&1 \ + || true + fi + done <<< "$old_namespaces" fi log_info "Cleanup complete" @@ -372,85 +661,460 @@ generate_bundle() { # node the pod schedules on — and KWOK fakes can't actually back a # local-path volume, leaving the pod stuck Pending with NominatedNodeName # set. Disabling persistence lets the controller pod bind. - log_info "Generating bundle..." + log_info "Generating bundle (deployer=${DEPLOYER})..." local bundle_output - if ! bundle_output=$("$AICR_BIN" bundle \ - --recipe "${WORK_DIR}/recipe.yaml" \ - --output "${WORK_DIR}/bundle" \ - --system-node-selector "aicr.nvidia.com/node-type=system" \ - --accelerated-node-selector "aicr.nvidia.com/node-type=accelerated" \ - --system-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ - --accelerated-node-toleration "nvidia.com/gpu=present:NoSchedule" \ - --accelerated-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ - --set "certmanager:startupapicheck.enabled=false" \ - --set "slinkyslurmoperator:webhook.enabled=false" \ - --set "slinkyslurmoperator:certManager.enabled=false" \ - --set "slurmcluster:controller.persistence.enabled=false" \ - --set "kubeprometheusstack:defaultRules.create=false" \ - --set "kubeprometheusstack:alertmanager.enabled=false" \ - --set "nodewright-customizations:enabled=false" \ - --set "dynamoplatform:etcd.persistence.enabled=false" \ - --set "dynamoplatform:nats.config.jetstream.fileStore.enabled=false" 2>&1); then - log_error "Bundle generation failed" - log_error "Bundle command output:" - echo "$bundle_output" - return 1 - fi + case "$DEPLOYER" in + helm) + # Original Helm path — DO NOT change this invocation. Byte-identical + # to pre-#843 behavior is a non-regression requirement. + if ! bundle_output=$("$AICR_BIN" bundle \ + --recipe "${WORK_DIR}/recipe.yaml" \ + --output "${WORK_DIR}/bundle" \ + --system-node-selector "aicr.nvidia.com/node-type=system" \ + --accelerated-node-selector "aicr.nvidia.com/node-type=accelerated" \ + --system-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ + --accelerated-node-toleration "nvidia.com/gpu=present:NoSchedule" \ + --accelerated-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ + --set "certmanager:startupapicheck.enabled=false" \ + --set "slinkyslurmoperator:webhook.enabled=false" \ + --set "slinkyslurmoperator:certManager.enabled=false" \ + --set "slurmcluster:controller.persistence.enabled=false" \ + --set "kubeprometheusstack:defaultRules.create=false" \ + --set "kubeprometheusstack:alertmanager.enabled=false" \ + --set "nodewright-customizations:enabled=false" \ + --set "dynamoplatform:etcd.persistence.enabled=false" \ + --set "dynamoplatform:nats.config.jetstream.fileStore.enabled=false" 2>&1); then + log_error "Bundle generation failed" + log_error "Bundle command output:" + echo "$bundle_output" + return 1 + fi - if [[ ! -d "${WORK_DIR}/bundle" ]]; then - log_error "Bundle directory not created: ${WORK_DIR}/bundle" - return 1 - fi + if [[ ! -d "${WORK_DIR}/bundle" ]]; then + log_error "Bundle directory not created: ${WORK_DIR}/bundle" + return 1 + fi - log_info "Bundle generated at ${WORK_DIR}/bundle" + # KWOK clusters use emptyDir for Prometheus storage (no PVC / + # StorageClass). Cloud overlays (EKS, AKS) set emptyDir: null + + # volumeClaimTemplate, which the Prometheus CRD rejects in a + # KWOK environment. Restore emptyDir and remove the PVC. + # + # This fix-up is intentionally helm-only: the argocd-* deployers + # consume the bundle from OCI (the artifact was already pushed + # above), so local-filesystem edits would be discarded by + # Argo CD's repo-server when it re-pulls the artifact. See + # SHOULD-FIX 2 of the #843 follow-up review. + local prom_values + prom_values=$(find "${WORK_DIR}/bundle" -mindepth 2 -maxdepth 2 \ + -path '*[0-9][0-9][0-9]-kube-prometheus-stack/values.yaml' \ + -type f 2>/dev/null | head -1) + if [[ -n "$prom_values" && -f "$prom_values" ]] && \ + yq eval '.prometheus.prometheusSpec.storageSpec.emptyDir' "$prom_values" 2>/dev/null | grep -q 'null'; then + log_info "Fixing kube-prometheus-stack storageSpec for KWOK (emptyDir instead of PVC)" + yq eval -i ' + .prometheus.prometheusSpec.storageSpec.emptyDir = {"medium": "", "sizeLimit": "10Gi"} | + del(.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate) + ' "$prom_values" + fi + ;; + argocd-oci|argocd-helm-oci) + # Compute OCI ref. Tag includes the deployer so back-to-back runs + # with different deployer values on the same recipe don't collide + # in the registry. + # + # Tag-format constraint: helm v3's OCI client filters tags via + # `semver.StrictNewVersion` (registry/client.go::Tags()) — a tag + # like `fe346a05-argocd-helm-oci` is silently dropped and helm + # reports `unable to locate any tags in provided repository` + # even though the artifact is in the registry with correct + # helm.chart.content.v1.tar+gzip mediaType (see #961 root + # cause). Wrap the sha-and-deployer suffix as a semver pre- + # release so `0.0.0-fe346a05-argocd-helm-oci` parses cleanly. + # Argo CD's OCI Helm source does not have this filter, so the + # other two lanes (helm, argocd-oci) keep the plain + # sha-deployer tag. + local short_sha + short_sha=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo "local") + local host_port="${KWOK_REGISTRY_HOST_PORT:-5500}" + local tag="${short_sha}-${DEPLOYER}" + if [[ "$DEPLOYER" == "argocd-helm-oci" ]]; then + tag="0.0.0-${short_sha}-${DEPLOYER}" + fi + OCI_REF="oci://localhost:${host_port}/aicr/${recipe}:${tag}" + + # The OCI ref above is the runner's view (push side) — host port + # exposed by Kind's extraPortMappings. Argo CD's repo-server runs + # IN-CLUSTER and reaches the same registry via Service DNS, on the + # Service's own port 5000 (not the runner's host port). The push + # and pull URLs MUST differ; passing --repo to `aicr bundle` + # overrides the default auto-derivation (which would otherwise + # set repoURL = the runner-view URL and Argo CD would try to dial + # localhost:5500 from inside its repo-server pod). + # Script-global so deploy_bundle's argocd-helm-oci branch can + # pass it through to `helm install --set repoURL=…` without + # duplicating the runner→service-DNS rewrite rule. + OCI_IN_CLUSTER_REF="oci://registry.aicr-registry.svc.cluster.local:5000/aicr/${recipe}" + local in_cluster_repo="$OCI_IN_CLUSTER_REF" + + # Map our deployer-matrix name to aicr's --deployer value. + local deployer_arg="argocd" + [[ "$DEPLOYER" == "argocd-helm-oci" ]] && deployer_arg="argocd-helm" + + log_info "Bundling for ${deployer_arg}, pushing to ${OCI_REF}" + log_info "Argo CD will pull from ${in_cluster_repo}:${tag}" + # When --output is an oci:// reference, `aicr bundle` writes the + # local bundle to ./bundle (relative to CWD) — there's no way to + # redirect it to an absolute path. cd into WORK_DIR so the local + # bundle lands at ${WORK_DIR}/bundle/ where downstream apply paths + # expect it (kubectl apply -f .../app-of-apps.yaml for argocd-oci). + if ! bundle_output=$(cd "${WORK_DIR}" && "$AICR_BIN" bundle \ + --recipe "${WORK_DIR}/recipe.yaml" \ + --deployer "$deployer_arg" \ + --output "$OCI_REF" \ + --repo "$in_cluster_repo" \ + --plain-http \ + --system-node-selector "aicr.nvidia.com/node-type=system" \ + --accelerated-node-selector "aicr.nvidia.com/node-type=accelerated" \ + --system-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ + --accelerated-node-toleration "nvidia.com/gpu=present:NoSchedule" \ + --accelerated-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ + --set "certmanager:startupapicheck.enabled=false" \ + --set "slinkyslurmoperator:webhook.enabled=false" \ + --set "slinkyslurmoperator:certManager.enabled=false" \ + --set "slurmcluster:controller.persistence.enabled=false" \ + --set "kubeprometheusstack:defaultRules.create=false" \ + --set "kubeprometheusstack:alertmanager.enabled=false" \ + --set "nodewright-customizations:enabled=false" \ + --set "dynamoplatform:etcd.persistence.enabled=false" \ + --set "dynamoplatform:nats.config.jetstream.fileStore.enabled=false" 2>&1); then + log_error "Bundle generation failed" + log_error "Bundle command output:" + echo "$bundle_output" + return 1 + fi - # KWOK clusters use emptyDir for Prometheus storage (no PVC/StorageClass). - # Cloud overlays (EKS, AKS) set emptyDir: null + volumeClaimTemplate which - # the Prometheus CRD rejects. Restore emptyDir and remove PVC for KWOK. - # Bundle layout uses NNN-prefixed folders, so glob to find the kube-prometheus-stack folder. - local prom_values - prom_values=$(ls -1 "${WORK_DIR}/bundle"/[0-9][0-9][0-9]-kube-prometheus-stack/values.yaml 2>/dev/null | head -1) - if [[ -n "$prom_values" && -f "$prom_values" ]] && yq eval '.prometheus.prometheusSpec.storageSpec.emptyDir' "$prom_values" 2>/dev/null | grep -q 'null'; then - log_info "Fixing kube-prometheus-stack storageSpec for KWOK (emptyDir instead of PVC)" - yq eval -i ' - .prometheus.prometheusSpec.storageSpec.emptyDir = {"medium": "", "sizeLimit": "10Gi"} | - del(.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate) - ' "$prom_values" - fi + # `--deployer argocd*` still renders the local bundle directory + # (app-of-apps.yaml lives there for argocd-oci). For argocd-helm-oci + # the bundle is also pushed as an OCI Helm chart we'll install via + # `helm upgrade --install`. + if [[ "$DEPLOYER" == "argocd-oci" ]] && [[ ! -f "${WORK_DIR}/bundle/app-of-apps.yaml" ]]; then + log_error "Expected app-of-apps.yaml not found in bundle: ${WORK_DIR}/bundle" + log_error "Bundle contents:" + list_bundle_entries "${WORK_DIR}/bundle" + return 1 + fi + # NOTE: Prometheus emptyDir fix-up (see helm) branch above) is + # intentionally NOT applied for argocd-* deployers because the + # bundle is consumed from OCI, not the local filesystem. KWOK + # clusters without a StorageClass may see Prometheus PVCs hang + # Pending. Disable Prometheus persistence at bundle time via + # `--set kubeprometheusstack:prometheus.prometheusSpec.storageSpec=null` + # if running these recipes through argocd-*. See SHOULD-FIX 2 of + # the #843 follow-up review and docs/plans/2026-05-18-kwok- + # deployer-matrix.md (Unresolved Questions). + ;; + flux-oci) + # Flux's source-controller does NOT filter OCI tags via semver + # (it's an artifact pull, not a chart-version lookup), so the + # plain sha-deployer tag is fine here — no `0.0.0-…` wrapping. + local short_sha + short_sha=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo "local") + local host_port="${KWOK_REGISTRY_HOST_PORT:-5500}" + OCI_REF="oci://localhost:${host_port}/aicr/${recipe}:${short_sha}-${DEPLOYER}" + # In-cluster URL that the Flux OCIRepository will pull from. + # The flux deployer does NOT bake repoURL into the bundle (the + # bundle is a Kustomize tree of HelmRelease + HelmRepository + # source CRs; the OUTER OCIRepository is what we apply at + # deploy time, and its URL is set then). We still derive it + # here so deploy_bundle can reuse the rewrite rule from + # exactly one place. + OCI_IN_CLUSTER_REF="oci://registry.aicr-registry.svc.cluster.local:5000/aicr/${recipe}" + + log_info "Bundling for flux, pushing to ${OCI_REF}" + log_info "Flux will pull from ${OCI_IN_CLUSTER_REF}:${short_sha}-${DEPLOYER}" + # cd into WORK_DIR for the same reason as the argocd-* branch + # (relative ./bundle output path under `--output oci://...`). + if ! bundle_output=$(cd "${WORK_DIR}" && "$AICR_BIN" bundle \ + --recipe "${WORK_DIR}/recipe.yaml" \ + --deployer flux \ + --output "$OCI_REF" \ + --plain-http \ + --system-node-selector "aicr.nvidia.com/node-type=system" \ + --accelerated-node-selector "aicr.nvidia.com/node-type=accelerated" \ + --system-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ + --accelerated-node-toleration "nvidia.com/gpu=present:NoSchedule" \ + --accelerated-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ + --set "certmanager:startupapicheck.enabled=false" \ + --set "slinkyslurmoperator:webhook.enabled=false" \ + --set "slinkyslurmoperator:certManager.enabled=false" \ + --set "slurmcluster:controller.persistence.enabled=false" \ + --set "kubeprometheusstack:defaultRules.create=false" \ + --set "kubeprometheusstack:alertmanager.enabled=false" \ + --set "nodewright-customizations:enabled=false" \ + --set "dynamoplatform:etcd.persistence.enabled=false" \ + --set "dynamoplatform:nats.config.jetstream.fileStore.enabled=false" 2>&1); then + log_error "Bundle generation failed" + log_error "Bundle command output:" + echo "$bundle_output" + return 1 + fi + + # Flux bundle root must contain kustomization.yaml — Flux's + # Kustomization CR points its `path: ./` at it. Fail fast + # otherwise (the OCIRepository pull would succeed but the + # Kustomization apply would no-op silently). + if [[ ! -f "${WORK_DIR}/bundle/kustomization.yaml" ]]; then + log_error "Expected kustomization.yaml not found in bundle: ${WORK_DIR}/bundle" + log_error "Bundle contents:" + list_bundle_entries "${WORK_DIR}/bundle" + return 1 + fi + ;; + *) + log_error "Unknown deployer: ${DEPLOYER}" + return 1 + ;; + esac + + log_info "Bundle generated at ${WORK_DIR}/bundle" log_debug "Bundle contents:" - ls -1 "${WORK_DIR}/bundle" | head -10 + list_bundle_entries "${WORK_DIR}/bundle" | head -10 } -# Deploy bundle to cluster using the generated deploy.sh +# Print top-level entries of a bundle directory, one per line, basename only. +# Portable replacement for `ls -1 ` (SC2012) that does not depend on +# GNU find's `-printf` extension. Bash NULLGLOB is enabled locally so an +# empty directory prints nothing instead of the literal pattern. +# +# SIGPIPE handling: callers commonly pipe this to `head -N`. When the bundle +# has more than N entries, `head` closes stdin after reading N lines. The next +# `printf` in this subshell then writes to a closed FD; bash forwards SIGPIPE +# (exit 141), which under `set -euo pipefail` aborts the whole script even +# though listing succeeded. Ignoring SIGPIPE in the subshell turns the write +# into a plain EPIPE on `printf` (returns non-zero, but does not kill the +# shell); breaking out of the loop on that first failed write produces the +# same "listed up to N then stopped" behavior the caller intends, without +# leaking a 141 exit through the pipeline. Observed bite: OKE argocd-helm-oci +# CI cancellation on commit 263f4433 — bundle generated cleanly but `… | head +# -10` raced the producer past the 10th line and SIGPIPE killed the run. +list_bundle_entries() { + local dir="$1" + [[ -d "$dir" ]] || return 0 + ( + trap '' PIPE + shopt -s nullglob dotglob + local entry + for entry in "$dir"/*; do + printf '%s\n' "${entry##*/}" || break + done + ) +} + +# Like list_bundle_entries but excludes the bundle's generated deploy.sh. +# Used by the helm deploy path to log component folder names. +list_bundle_components() { + local dir="$1" + list_bundle_entries "$dir" | { grep -v '^deploy\.sh$' || true; } +} + +# Deploy bundle to cluster. +# +# For DEPLOYER=helm: runs the bundle's generated deploy.sh (unchanged +# pre-#843 behavior). +# +# For DEPLOYER=argocd-oci: kubectl apply -f /app-of-apps.yaml, then +# wait for every Argo CD Application in the argocd namespace to reach +# Synced + (Healthy|Progressing). +# +# For DEPLOYER=argocd-helm-oci: helm upgrade --install the OCI chart into +# the argocd namespace, then wait the same way. deploy_bundle() { - log_info "Deploying per-component bundle..." + log_info "Deploying per-component bundle (deployer=${DEPLOYER})..." local bundle_dir="${WORK_DIR}/bundle" - if [[ ! -f "${bundle_dir}/deploy.sh" ]]; then - log_error "deploy.sh not found in bundle directory: $bundle_dir" - log_error "Bundle generation may have failed" - return 1 - fi + case "$DEPLOYER" in + helm) + if [[ ! -f "${bundle_dir}/deploy.sh" ]]; then + log_error "deploy.sh not found in bundle directory: $bundle_dir" + log_error "Bundle generation may have failed" + return 1 + fi - log_debug "Bundle directory: $bundle_dir" - log_debug "Components in bundle:" - ls -1 "$bundle_dir" | grep -v "deploy.sh" | head -10 + log_debug "Bundle directory: $bundle_dir" + log_debug "Components in bundle:" + list_bundle_components "$bundle_dir" | head -10 + + # Run the generated deploy script without --wait since KWOK clusters + # only validate scheduling, not pod readiness + chmod +x "${bundle_dir}/deploy.sh" + log_info "Running deploy.sh --no-wait..." + + local deploy_output + if ! deploy_output=$("${bundle_dir}/deploy.sh" --no-wait 2>&1); then + log_error "Deploy script failed" + log_error "Last 50 lines of deploy output:" + echo "$deploy_output" | tail -50 + return 1 + fi + ;; + argocd-oci) + log_info "Applying app-of-apps manifest: ${bundle_dir}/app-of-apps.yaml" + if ! kubectl apply -f "${bundle_dir}/app-of-apps.yaml"; then + log_error "kubectl apply -f app-of-apps.yaml failed" + return 1 + fi + # Preserve wait_for_argocd_sync's exit code (50 == sync timeout) + # so run-all-recipes.sh can apply the 3-strike rule. + local sync_rc=0 + wait_for_argocd_sync || sync_rc=$? + if (( sync_rc != 0 )); then + return "$sync_rc" + fi + ;; + argocd-helm-oci) + if [[ -z "$OCI_REF" ]]; then + log_error "OCI_REF unset for argocd-helm-oci path (bundle step skipped?)" + return 1 + fi + if [[ -z "$OCI_IN_CLUSTER_REF" ]]; then + log_error "OCI_IN_CLUSTER_REF unset for argocd-helm-oci path (bundle step skipped?)" + return 1 + fi + # Helm OCI syntax: the chart reference must NOT include a ':' + # suffix. The tag is passed via --version. OCI_REF is shaped as + # oci://host/path:tag; split into ref (oci://host/path) + tag. + local helm_ref="${OCI_REF%:*}" + local helm_tag="${OCI_REF##*:}" + # repoURL + targetRevision are REQUIRED by the argocdhelm wrapper + # chart at install time (pkg/bundler/deployer/argocdhelm). The + # chart templates path-based child Applications (e.g. + # gpu-operator-post for manifest-only / mixed components) whose + # spec.source.repoURL must resolve to a remote Argo CD's + # repo-server can pull from. The wrapper enforces this with + # `{{ required }}` on .Values.repoURL — without these flags + # `helm install` fails template-render with + # `repoURL is required: pass --set repoURL=`. + # + # repoURL is the IN-CLUSTER URL (Service DNS) because Argo CD's + # repo-server runs inside the cluster; targetRevision is the + # chart's OCI tag, which by convention also names the chart + # version (loadAndRewriteChartYAML in pkg/oci/helm.go rewrites + # Chart.yaml.version to the OCI tag at push time). + log_info "helm upgrade --install ${RELEASE_NAME}-stack ${helm_ref} --version ${helm_tag} -n argocd --plain-http --set repoURL=${OCI_IN_CLUSTER_REF} --set targetRevision=${helm_tag}" + # --plain-http: in-cluster registry is HTTP-only; helm defaults to HTTPS. + if ! helm upgrade --install "${RELEASE_NAME}-stack" "$helm_ref" \ + --version "$helm_tag" \ + --namespace argocd \ + --plain-http \ + --set "repoURL=${OCI_IN_CLUSTER_REF}" \ + --set "targetRevision=${helm_tag}"; then + log_error "helm upgrade --install failed for ${OCI_REF}" + return 1 + fi + # Preserve wait_for_argocd_sync's exit code (50 == sync timeout) + # so run-all-recipes.sh can apply the 3-strike rule. + local sync_rc=0 + wait_for_argocd_sync || sync_rc=$? + if (( sync_rc != 0 )); then + return "$sync_rc" + fi + ;; + flux-oci) + if [[ -z "$OCI_IN_CLUSTER_REF" ]]; then + log_error "OCI_IN_CLUSTER_REF unset for flux-oci path (bundle step skipped?)" + return 1 + fi + if [[ -z "$OCI_REF" ]]; then + log_error "OCI_REF unset for flux-oci path (bundle step skipped?)" + return 1 + fi + # OCI_REF is "oci://host:port/path:tag"; the tag we need for the + # OCIRepository's `ref.tag` is the rightmost ':'. Split. + local oci_tag="${OCI_REF##*:}" + + # Render OCIRepository + Kustomization wrappers. + # + # OCIRepository: + # - spec.insecure: true — in-cluster registry is HTTP-only. + # - spec.layerSelector.mediaType — AICR's generic OCI push + # uses application/vnd.oci.image.layer.v1.tar+gzip (see + # pkg/oci/push.go::artifactType for the manifest's + # artifactType; source-controller doesn't filter on that + # field but DOES need a layerSelector for non-Flux-CLI + # artifacts per fluxcd/source-controller docs). + # - spec.layerSelector.operation: extract — unpack the + # layer into the source workspace. + # + # Kustomization: + # - sourceRef → the OCIRepository above + # - path: ./ — the bundle's root contains kustomization.yaml + # - prune: true — let Flux GC removed resources between runs + # - wait: false — KWOK can't drive Pod readiness for arbitrary + # workloads, so `wait: true` would block the Kustomization's + # Ready condition indefinitely. We assert terminal state via + # HelmRelease conditions below instead. + # - timeout: 5m — matches KWOK_FLUX_SYNC_TIMEOUT budget. + log_info "Applying Flux OCIRepository ${FLUX_OCIREPOSITORY_NAME} (ref=${oci_tag})..." + if ! kubectl apply -f - <&1); then - log_error "Deploy script failed" - log_error "Last 50 lines of deploy output:" - echo "$deploy_output" | tail -50 - return 1 - fi + # Preserve wait_for_flux_sync's exit code (50 == sync timeout) + # so run-all-recipes.sh can apply the 3-strike rule. + local sync_rc=0 + wait_for_flux_sync || sync_rc=$? + if (( sync_rc != 0 )); then + return "$sync_rc" + fi + ;; + *) + log_error "Unknown deployer: ${DEPLOYER}" + return 1 + ;; + esac # Brief wait for scheduler to place pods log_info "Waiting for pods to be scheduled..." @@ -459,14 +1123,519 @@ deploy_bundle() { log_info "Bundle deployed successfully" } +# Assert the deployer's root Argo CD Application exists within +# KWOK_ARGOCD_ROOT_GRACE seconds (default 30). Without this gate the script +# treats "kubectl apply / helm install silently produced no Application" +# (webhook reject, CRD scope mismatch, RBAC denial) as a slow controller +# and burns the full KWOK_ARGOCD_SYNC_TIMEOUT before failing with a +# misleading "deadline hit" message. +# +# On failure, dumps recent events in the argocd namespace and returns 1. +wait_for_argocd_root_app() { + if [[ -z "$ARGOCD_ROOT_APP" ]]; then + log_error "ARGOCD_ROOT_APP is empty (DEPLOYER=${DEPLOYER})" + return 1 + fi + + local grace="${KWOK_ARGOCD_ROOT_GRACE:-30}" + local start=$SECONDS + + log_info "Waiting for root Application ${ARGOCD_ROOT_APP} to appear (grace ${grace}s)..." + + while (( SECONDS - start < grace )); do + if kubectl get application "$ARGOCD_ROOT_APP" -n argocd \ + >/dev/null 2>&1; then + log_info "Root Application ${ARGOCD_ROOT_APP} present" + return 0 + fi + sleep 2 + done + + log_error "Root Application ${ARGOCD_ROOT_APP} not present after ${grace}s" + log_error "kubectl apply / helm install for DEPLOYER=${DEPLOYER} produced no Application resource." + log_error "--- argocd namespace events (last 20) ---" + kubectl get events -n argocd --sort-by=.lastTimestamp 2>&1 | tail -20 || true + return 1 +} + +# Wait until every Argo CD Application in the argocd namespace reaches a +# terminal pass state. The pass set is intentionally broader than just +# "Synced + Healthy" so KWOK simulation gaps and the real-world +# operator-mutation-after-sync pattern do not mask a healthy deployment: +# +# 1. Synced + Healthy ........................ canonical pass +# 2. Synced + Progressing .................... KWOK fake pods often sit in +# Progressing because the +# health controller wants a +# deeper readiness signal +# that stage-fast cannot +# simulate (DaemonSet pods, +# Prometheus StatefulSet +# under the operator, +# etc.); accepted per +# ADR-008. +# 3. OutOfSync + Healthy + last op succeeded . Operator-mutation drift: +# charts like gpu-operator +# and nvidia-dra-driver-gpu +# ship CRs that their own +# controller updates post- +# sync. Argo CD initiates +# the sync, the operation +# completes ("successfully +# synced (all tasks run)"), +# resources are Healthy, +# then drift reappears as +# the operator reconciles. +# In production these are +# handled with per-App +# ignoreDifferences; for +# KWOK we accept the +# Healthy-post-success +# state as a real pass and +# rely on operationState +# .phase=Succeeded to +# distinguish it from a +# still-broken OutOfSync. +# 4. Synced + Degraded + last op succeeded ... Chart applied but health +# controller marks at +# least one managed +# resource Degraded — +# typically a Pod whose +# Ready signal KWOK cannot +# produce (leader election +# like kai-scheduler, +# charts with strict probe +# semantics) or a Deployment +# that cannot reach +# desired replicas under +# stage-fast. The sync +# operation succeeded — the +# bundle was applied. The +# health gradient is the +# chart's runtime contract +# + KWOK fidelity, not a +# bundle defect. Same +# operationState.phase= +# Succeeded guard as #3 +# prevents masking a +# still-broken sync. +# +# Returns 0 on success. On deadline hit OR on a `kubectl get applications` +# hard error (CRD missing, RBAC denied, apiserver unreachable), calls +# dump_argocd_failures and returns 1. +# +# Guards (issue #843 follow-up review): +# +# MUST-FIX 1 — Asserts the deployer's root Application exists (call site +# pre-checks via wait_for_argocd_root_app). Then derives the expected +# set of child Applications from the root's status.resources so a +# root-only "Synced + Healthy" state cannot masquerade as a true pass +# (the chart wrapping in argocd-helm-oci can land just the root +# Application without children if the OCI pull silently degrades). +# +# MUST-FIX 2 — Distinguishes a `kubectl get applications` non-zero exit +# (CRD missing, permission denied, apiserver unreachable) from a +# well-formed empty list. A hard error fails the loop immediately +# instead of spinning to the deadline. +# +# Deadline: KWOK_ARGOCD_SYNC_TIMEOUT env var (default 300s). +wait_for_argocd_sync() { + if ! wait_for_argocd_root_app; then + dump_argocd_failures + return 1 + fi + + local deadline="${KWOK_ARGOCD_SYNC_TIMEOUT:-300}" + local start=$SECONDS + + log_info "Waiting for Argo CD Applications to sync (deadline ${deadline}s)..." + + while (( SECONDS - start < deadline )); do + local apps_json apps_rc total pending root_sync root_health + # MUST-FIX 2: distinguish a hard error from an empty list. We + # cannot use command substitution + `|| echo '{"items":[]}'` + # because that swallows the exit code. + # + # Subtle bash semantics: a bare `apps_json=$(kubectl ...)` + # assignment under `set -euo pipefail` (line 72) DOES trip + # set -e when the substituted command exits non-zero — bash + # exits before the next line's `apps_rc=$?` can capture the + # status. Wrap in an `if cmd; then ok; else capture; fi` + # block: command-substitution failures inside an `if` + # condition are exempt from set -e (per the manual), so we + # get the exit code into `apps_rc` safely. + if apps_json=$(kubectl get applications -n argocd -o json 2>/dev/null); then + apps_rc=0 + else + apps_rc=$? + fi + if (( apps_rc != 0 )); then + log_error "kubectl get applications -n argocd failed (rc=${apps_rc})" + log_error "Likely cause: applications.argoproj.io CRD missing, RBAC denied, or apiserver unreachable." + log_error "--- kubectl get applications stderr ---" + kubectl get applications -n argocd 2>&1 | tail -20 || true + dump_argocd_failures + return 1 + fi + + total=$(echo "$apps_json" | jq '.items | length') + + # MUST-FIX 1: root must be Synced+Healthy AND its expected + # children (status.resources[].name) must all be reconciled. We + # use status.resources because it self-adjusts per recipe — no + # static min-count env var to maintain. + root_sync=$(echo "$apps_json" \ + | jq -r --arg n "$ARGOCD_ROOT_APP" \ + '.items[] | select(.metadata.name == $n) | .status.sync.status // "Unknown"') + root_health=$(echo "$apps_json" \ + | jq -r --arg n "$ARGOCD_ROOT_APP" \ + '.items[] | select(.metadata.name == $n) | .status.health.status // "Unknown"') + + if [[ "$root_sync" != "Synced" || \ + ( "$root_health" != "Healthy" && "$root_health" != "Progressing" ) ]]; then + log_debug "Root ${ARGOCD_ROOT_APP} not ready yet (sync=${root_sync} health=${root_health}; total=${total})..." + sleep 5 + continue + fi + + # Children expected (per the root's status.resources). For the + # argocd-oci app-of-apps shape, status.resources lists the child + # Applications by name. For argocd-helm-oci the wrapper App + # references chart subresources — fall back to "any non-root + # Application is a child" if status.resources is empty. + local expected_children + expected_children=$(echo "$apps_json" \ + | jq -r --arg n "$ARGOCD_ROOT_APP" ' + .items[] + | select(.metadata.name == $n) + | (.status.resources // []) + | map(select(.kind == "Application")) + | length') + + # Documented fallback: when status.resources does not list any + # Application children but other Applications exist in the + # namespace, treat the count of non-root Applications as the + # observed-children count. The argocd-helm-oci wrapper's + # status.resources can take a moment to populate while the + # children it created are already reconciling; without this + # fallback the loop sleeps to the deadline even when every + # child is Synced+Healthy. + if [[ "$expected_children" -eq 0 ]]; then + local observed_children + observed_children=$(echo "$apps_json" \ + | jq -r --arg n "$ARGOCD_ROOT_APP" \ + '[.items[] | select(.metadata.name != $n)] | length') + if [[ "$observed_children" -gt 0 ]]; then + expected_children="$observed_children" + fi + fi + + if [[ "$expected_children" -eq 0 ]]; then + # No children declared by the root yet. If we're the only + # Application in the namespace and the controller has been + # given a chance to reconcile, treat as root-only success + # only when status.resources has resolved (controller has + # observed the source). Otherwise keep waiting. + local resources_populated + resources_populated=$(echo "$apps_json" \ + | jq --arg n "$ARGOCD_ROOT_APP" ' + .items[] + | select(.metadata.name == $n) + | (.status.resources // []) | length > 0') + if [[ "$total" -eq 1 && "$resources_populated" == "true" ]]; then + log_warn "Root Application ${ARGOCD_ROOT_APP} reconciled with zero child Applications" + log_warn "Bundle may have been wrapped-only (no expansion). total=${total}" + # Surfacing as success would mask the bug from MUST-FIX 1. + # Fail closed: a child-less root is never a real PASS for + # the deployer matrix (every recipe ships >= 1 component). + dump_argocd_failures + return 1 + fi + log_debug "Root Application present but children not yet declared (total=${total})..." + sleep 5 + continue + fi + + # Pending = NOT one of the four terminal-pass states described + # in the function docstring. Arms 3 & 4 are the load-bearing + # ones for KWOK: they distinguish "bundle applied; chart-side + # runtime gap" (pass) from "bundle never applied" (fail) via + # operationState.phase=Succeeded. + pending=$(echo "$apps_json" \ + | jq '[.items[] + | . as $app + | select( + # NOT (Synced + Healthy) + ($app.status.sync.status != "Synced" + or $app.status.health.status != "Healthy") + # AND NOT (Synced + Progressing) + and ($app.status.sync.status != "Synced" + or $app.status.health.status != "Progressing") + # AND NOT (OutOfSync + Healthy + last op Succeeded) + and ($app.status.sync.status != "OutOfSync" + or $app.status.health.status != "Healthy" + or ($app.status.operationState.phase // "") != "Succeeded") + # AND NOT (Synced + Degraded + last op Succeeded) + and ($app.status.sync.status != "Synced" + or $app.status.health.status != "Degraded" + or ($app.status.operationState.phase // "") != "Succeeded") + )] + | length') + if [[ "$pending" == "0" ]]; then + # MUST-FIX 1: log observed counts so a low total is auditable. + log_info "Argo CD sync PASS: ${total} Applications reached terminal pass state (root=${ARGOCD_ROOT_APP}, expected_children=${expected_children})" + return 0 + fi + log_debug "Argo CD sync progress: ${pending}/${total} Applications still pending (expected_children=${expected_children})..." + sleep 5 + done + + log_error "Argo CD sync deadline (${deadline}s) hit" + dump_argocd_failures + # Distinct rc so run-all-recipes.sh can apply the 3-strike rule without + # parsing stderr. Other failure paths in this function still return 1. + return "$EXIT_ARGOCD_SYNC_TIMEOUT" +} + +# Dump diagnostics on Argo CD sync failure: per-Application status fields +# plus repo-server and application-controller log tails. +dump_argocd_failures() { + log_error "--- Argo CD Applications (name / sync / health / conditions / op.message) ---" + kubectl get applications -n argocd -o json 2>/dev/null \ + | jq -r '.items[] | { + name: .metadata.name, + sync: .status.sync.status, + health: .status.health.status, + conditions: .status.conditions, + operation: .status.operationState.message + }' \ + 2>&1 || true + log_error "--- argocd-repo-server (tail=200) ---" + kubectl logs -n argocd deploy/argocd-repo-server --tail=200 2>&1 || true + log_error "--- argocd-application-controller (tail=200) ---" + # Argo CD chart 9.x ships application-controller as a StatefulSet. Try + # StatefulSet first to avoid emitting a misleading "Deployment not + # found" error into the diagnostic dump on the common case; fall back + # to Deployment for older chart versions. + kubectl logs -n argocd statefulset/argocd-application-controller --tail=200 2>&1 \ + || kubectl logs -n argocd deploy/argocd-application-controller --tail=200 2>&1 \ + || true +} + +# Assert the outer Flux Kustomization exists within KWOK_FLUX_ROOT_GRACE +# seconds (default 30). Mirrors wait_for_argocd_root_app: a silently-missing +# Kustomization (RBAC denial, CRD scope mismatch, webhook reject) gets +# surfaced fast instead of burning the KWOK_FLUX_SYNC_TIMEOUT budget. +wait_for_flux_root_app() { + if [[ -z "$FLUX_KUSTOMIZATION_NAME" ]]; then + log_error "FLUX_KUSTOMIZATION_NAME is empty (DEPLOYER=${DEPLOYER})" + return 1 + fi + + local grace="${KWOK_FLUX_ROOT_GRACE:-30}" + local start=$SECONDS + + log_info "Waiting for Flux Kustomization ${FLUX_KUSTOMIZATION_NAME} to appear (grace ${grace}s)..." + + while (( SECONDS - start < grace )); do + if kubectl get kustomization "$FLUX_KUSTOMIZATION_NAME" -n flux-system \ + >/dev/null 2>&1; then + log_info "Kustomization ${FLUX_KUSTOMIZATION_NAME} present" + return 0 + fi + sleep 2 + done + + log_error "Kustomization ${FLUX_KUSTOMIZATION_NAME} not present after ${grace}s" + log_error "kubectl apply for DEPLOYER=${DEPLOYER} produced no Kustomization resource." + log_error "--- flux-system namespace events (last 20) ---" + kubectl get events -n flux-system --sort-by=.lastTimestamp 2>&1 | tail -20 || true + return 1 +} + +# Validate the flux-oci bundle was successfully APPLIED. Intentionally +# narrower than wait_for_argocd_sync: we do NOT wait for HelmReleases to +# reach a terminal Ready/Released state, because the bundler's flux deployer +# currently emits local-chart HelmReleases with sourceRef.kind=GitRepository +# pointing at the placeholder URL `https://github.com/YOUR_ORG/YOUR_REPO.git`. +# Under OCI consumption helm-controller cannot fetch these charts, so those +# HelmReleases stall — AND any HelmRelease downstream in the Flux dependsOn +# chain stays "dependency not ready", cascading the stall through the entire +# bundle. +# +# Flux's HelmRelease v2 API blocks the obvious fix: spec.chart.spec.sourceRef +# rejects OCIRepository ("supported values: HelmRepository, GitRepository, +# Bucket"), and spec.chartRef points at the whole OCI artifact with no path +# support — so a single wrapper OCIRepository cannot address inner local +# charts. Proper fixes (render local-chart manifests at bundle time OR push +# each local chart as its own helm OCI artifact) are out of scope here and +# tracked in a follow-up issue referenced from the PR description. +# +# What this lane DOES validate (the bundle-format regression surface this +# PR is meant to catch): +# 1. Flux source-controller could PULL the AICR bundle artifact (the layer +# mediaType + insecure HTTP wiring is correct). +# 2. Flux kustomize-controller could PARSE the bundle's root +# kustomization.yaml and APPLY all manifests (no template-render bugs, +# no missing files, no schema violations). +# 3. The applied resources include per-component HelmRelease CRs (the +# flux deployer produced the expected GitOps shape). +# +# What this lane does NOT validate (deferred): +# - HelmRelease reconciliation to Ready / Released +# - Helm install success +# - Pod-Running for the workloads the bundle declares +# +# Returns 0 on success, EXIT_ARGOCD_SYNC_TIMEOUT (50) on deadline (reused so +# run-all-recipes.sh's 3-strike rule fires symmetrically across argocd-* and +# flux-oci), and 1 on hard errors (CRD missing, RBAC denial, apiserver +# unreachable). +wait_for_flux_sync() { + local deadline="${KWOK_FLUX_SYNC_TIMEOUT:-300}" + local start=$SECONDS + + if ! wait_for_flux_root_app; then + return 1 + fi + + log_info "Waiting for Flux Kustomization to apply the bundle (deadline ${deadline}s)..." + + local kust_json oci_json hr_json kust_ready oci_ready total + local applied_rev + local apps_rc + + while (( SECONDS - start < deadline )); do + # 1. OCIRepository must reach Ready=True — proves source-controller + # could PULL the bundle artifact (mediaType, insecure-HTTP, layer + # selector all wired correctly). This is the artifact-shape + # regression surface argocd-oci catches via repo-server; flux-oci + # catches the same class of bug via source-controller. + if oci_json=$(kubectl get ocirepository "$FLUX_OCIREPOSITORY_NAME" \ + -n flux-system -o json 2>/dev/null); then + apps_rc=0 + else + apps_rc=$? + fi + if (( apps_rc != 0 )); then + log_error "kubectl get ocirepository failed (rc=${apps_rc})" + log_error "Likely cause: OCIRepository CRD missing, RBAC denied, or apiserver unreachable." + dump_flux_failures + return 1 + fi + oci_ready=$(echo "$oci_json" \ + | jq -r '.status.conditions[]? | select(.type == "Ready") | .status // "Unknown"') + if [[ "$oci_ready" != "True" ]]; then + log_debug "OCIRepository ${FLUX_OCIREPOSITORY_NAME} not Ready yet (status=${oci_ready:-Unknown})..." + sleep 5 + continue + fi + + # 2. Kustomization must reach Ready=True — proves kustomize-controller + # could PARSE the bundle's root kustomization.yaml and APPLY all + # manifests inside (no template-render bugs, no missing files, no + # schema violations). + if kust_json=$(kubectl get kustomization "$FLUX_KUSTOMIZATION_NAME" \ + -n flux-system -o json 2>/dev/null); then + apps_rc=0 + else + apps_rc=$? + fi + if (( apps_rc != 0 )); then + log_error "kubectl get kustomization failed (rc=${apps_rc})" + log_error "Likely cause: Kustomization CRD missing, RBAC denied, or apiserver unreachable." + dump_flux_failures + return 1 + fi + kust_ready=$(echo "$kust_json" \ + | jq -r '.status.conditions[]? | select(.type == "Ready") | .status // "Unknown"') + if [[ "$kust_ready" != "True" ]]; then + log_debug "Kustomization ${FLUX_KUSTOMIZATION_NAME} not Ready yet (status=${kust_ready:-Unknown})..." + sleep 5 + continue + fi + applied_rev=$(echo "$kust_json" \ + | jq -r '.status.lastAppliedRevision // ""') + + # 3. Per-component HelmRelease CRs must exist (the bundle declared + # the expected GitOps shape). We do NOT wait for them to reach + # terminal state — see function docstring for the dependsOn-cascade + # rationale. + if hr_json=$(kubectl get helmrelease -A -o json 2>/dev/null); then + apps_rc=0 + else + apps_rc=$? + fi + if (( apps_rc != 0 )); then + log_error "kubectl get helmrelease -A failed (rc=${apps_rc})" + dump_flux_failures + return 1 + fi + total=$(echo "$hr_json" | jq '.items | length') + if [[ "$total" -eq 0 ]]; then + log_debug "Kustomization Ready but no HelmReleases yet — waiting for resources to appear..." + sleep 5 + continue + fi + + log_info "Flux bundle-apply PASS: OCIRepository pulled (rev=$(echo "$oci_json" | jq -r '.status.artifact.revision // "unknown"')), Kustomization applied (rev=${applied_rev}), ${total} HelmReleases created" + log_info "(flux-oci validates bundle-format application only; HelmRelease reconciliation is deferred — see follow-up issue)" + return 0 + done + + log_error "Flux bundle-apply deadline (${deadline}s) hit" + dump_flux_failures + # Reuse EXIT_ARGOCD_SYNC_TIMEOUT so run-all-recipes.sh's 3-strike rule + # treats GitOps sync deadlines symmetrically across argocd-* and flux-oci. + return "$EXIT_ARGOCD_SYNC_TIMEOUT" +} + +# Dump diagnostics on Flux sync failure: outer Kustomization status, all +# OCIRepository + HelmRelease statuses, plus controller log tails. +dump_flux_failures() { + log_error "--- Flux Kustomization ${FLUX_KUSTOMIZATION_NAME} ---" + kubectl get kustomization "$FLUX_KUSTOMIZATION_NAME" -n flux-system \ + -o json 2>/dev/null \ + | jq -r '{ + ready: ((.status.conditions // []) | map(select(.type == "Ready")) | .[0]), + lastAppliedRevision: .status.lastAppliedRevision, + inventory: ((.status.inventory.entries // []) | length) + }' 2>&1 || true + log_error "--- Flux OCIRepositories ---" + kubectl get ocirepository -A -o json 2>/dev/null \ + | jq -r '.items[] | { + namespace: .metadata.namespace, + name: .metadata.name, + ready: ((.status.conditions // []) | map(select(.type == "Ready")) | .[0]), + artifact: .status.artifact.revision + }' 2>&1 || true + log_error "--- Flux HelmReleases (name / ready / released / stalled / hist[0]) ---" + kubectl get helmrelease -A -o json 2>/dev/null \ + | jq -r '.items[] | { + namespace: .metadata.namespace, + name: .metadata.name, + ready: ((.status.conditions // []) | map(select(.type == "Ready")) | .[0].status), + released: ((.status.conditions // []) | map(select(.type == "Released")) | .[0].status), + stalled: ((.status.conditions // []) | map(select(.type == "Stalled")) | .[0].status), + history: (.status.history // [])[0] + }' 2>&1 || true + log_error "--- source-controller (tail=200) ---" + kubectl logs -n flux-system deploy/source-controller --tail=200 2>&1 || true + log_error "--- kustomize-controller (tail=200) ---" + kubectl logs -n flux-system deploy/kustomize-controller --tail=200 2>&1 || true + log_error "--- helm-controller (tail=200) ---" + kubectl logs -n flux-system deploy/helm-controller --tail=200 2>&1 || true +} + # Verify pod scheduling verify_pods() { log_info "Verifying pod scheduling..." # Get pod status across all component namespaces (per-component deployment) - # Exclude system namespaces that aren't part of our bundle + # Exclude system namespaces that aren't part of our bundle (control-plane + # infra plus GitOps controller namespaces — argocd/flux-system/aicr- + # registry are owned by install-infra.sh, not the bundle under test). local ns_filter="--all-namespaces" - local exclude_ns="kube-system|kube-node-lease|kube-public|local-path-storage|kwok-system" + local exclude_ns="kube-system|kube-node-lease|kube-public|local-path-storage|kwok-system|argocd|flux-system|aicr-registry" local total_pods pending_pods failed_pods running_pods unscheduled_pods total_pods=$(kubectl get pods ${ns_filter} --no-headers 2>/dev/null | { grep -vE "^(${exclude_ns})\s" || true; } | wc -l | tr -d ' ') @@ -485,10 +1654,18 @@ verify_pods() { log_info "Pod status: $total_pods total, $running_pods running, $pending_pods pending ($unscheduled_pods unscheduled), $failed_pods failed" - # Show pod distribution across nodes + # Show pod distribution across nodes. + # grep -vE returns exit 1 when no lines match (e.g. all pods are in + # excluded namespaces — happens on the flux-oci lane where every pod + # the controller has placed so far lives in flux-system / kube-system). + # Under `set -euo pipefail` that exit 1 propagates as the pipeline's + # exit code and kills the script silently AFTER Pod-status logging, + # masquerading as a pass-then-fail. Wrap with `{ ... || true; }` so + # an empty result is a no-op, matching the total_pods / unscheduled_pods + # extraction above. log_info "Pod distribution:" kubectl get pods --all-namespaces -o wide --no-headers 2>/dev/null | \ - grep -vE "^(${exclude_ns})\s" | \ + { grep -vE "^(${exclude_ns})\s" || true; } | \ awk '{print $8}' | sort | uniq -c | \ while read -r count node; do echo " $node: $count pods" @@ -541,6 +1718,24 @@ verify_pods() { return 0 } +# Print usage to stderr and exit non-zero. +usage() { + cat >&2 <<'EOF' +Usage: validate-scheduling.sh [--keep-namespace] [--deployer ] + +Flags: + --keep-namespace Preserve releases/namespaces after the run. + --deployer One of: helm (default), argocd-oci, argocd-helm-oci. + +Examples: + validate-scheduling.sh h100-eks-ubuntu-training-kubeflow + validate-scheduling.sh --keep-namespace eks-training + validate-scheduling.sh --deployer argocd-oci eks-training + validate-scheduling.sh --deployer argocd-helm-oci eks-training +EOF + exit 1 +} + # Main main() { local recipe="" @@ -552,12 +1747,21 @@ main() { KEEP_NAMESPACE=true shift ;; + --deployer) + if [[ $# -lt 2 ]]; then + log_error "--deployer requires a value" + usage + fi + DEPLOYER="$2" + shift 2 + ;; + --deployer=*) + DEPLOYER="${1#--deployer=}" + shift + ;; -*) - echo "Unknown option: $1" - echo "Usage: $0 [--keep-namespace] " - echo "Example: $0 h100-eks-ubuntu-training-kubeflow" - echo " $0 --keep-namespace eks-training" - exit 1 + log_error "Unknown option: $1" + usage ;; *) recipe="$1" @@ -567,17 +1771,39 @@ main() { done if [[ -z "$recipe" ]]; then - echo "Usage: $0 [--keep-namespace] " - echo "Example: $0 h100-eks-ubuntu-training-kubeflow" - echo " $0 --keep-namespace eks-training" - exit 1 + usage fi + case "$DEPLOYER" in + helm|argocd-oci|argocd-helm-oci|flux-oci) + ;; + *) + log_error "Invalid --deployer value: '${DEPLOYER}'" + log_error "Must be one of: helm, argocd-oci, argocd-helm-oci, flux-oci" + exit 1 + ;; + esac + + # Must run before cleanup trap so the trap sees the resolved root name. + resolve_argocd_root_app + resolve_flux_root_names "$recipe" + + # Safety: refuse to set up the cleanup trap (which force-finalizes + # namespaces under KWOK) unless the current kubectl context is a + # known KWOK Kind cluster with kwok-typed nodes. A misconfigured + # KUBECONFIG must NOT silently strip finalizers from whatever + # cluster the user happens to be authenticated to. apply-nodes.sh + # has already populated the cluster by this point (the script is + # invoked from run-all-recipes.sh::run_recipe_test after apply- + # nodes.sh), so the node-label check is safe here. + ensure_kwok_context + # Set up cleanup trap trap cleanup EXIT log_info "==========================================" log_info "Starting validation for recipe: $recipe" + log_info "Deployer: $DEPLOYER" log_info "==========================================" log_debug "Step 1: Checking dependencies..." @@ -605,7 +1831,7 @@ main() { verify_pods log_info "==========================================" - log_info "✓ Validation PASSED for recipe: $recipe" + log_info "✓ Validation PASSED for recipe: $recipe (deployer=$DEPLOYER)" log_info "==========================================" } diff --git a/pkg/bundler/bundler.go b/pkg/bundler/bundler.go index f775d3749..14ac2db11 100644 --- a/pkg/bundler/bundler.go +++ b/pkg/bundler/bundler.go @@ -390,6 +390,10 @@ func (b *DefaultBundler) buildDeployer(ctx context.Context, recipeResult *recipe ComponentPreManifests: componentPreManifests, ComponentPostManifests: componentPostManifests, VendorCharts: b.Config.VendorCharts(), + // Inline values when the bundle repo is OCI: Argo CD's $values + // multi-source ref is Git-only (see #960), so an OCI repoURL + // must use single-source with helm.valuesObject embedded. + InlineUpstreamValues: strings.HasPrefix(b.Config.RepoURL(), "oci://"), }, nil case config.DeployerHelm: diff --git a/pkg/bundler/deployer/argocd/argocd.go b/pkg/bundler/deployer/argocd/argocd.go index 47531740c..b2057cf5f 100644 --- a/pkg/bundler/deployer/argocd/argocd.go +++ b/pkg/bundler/deployer/argocd/argocd.go @@ -54,6 +54,7 @@ import ( "github.com/NVIDIA/aicr/pkg/bundler/deployer/localformat" "github.com/NVIDIA/aicr/pkg/errors" "github.com/NVIDIA/aicr/pkg/recipe" + "github.com/NVIDIA/aicr/pkg/serializer" ) //go:embed templates/application.yaml.tmpl @@ -74,6 +75,12 @@ var readmeTemplate string // pure-Helm KindUpstreamHelm folders). BundleDir carries the NNN- // directory name for both kinds: it is the chart path for KindLocalHelm and // the values $ref path for KindUpstreamHelm. +// +// InlineValues drives the Application shape for KindUpstreamHelm when the +// bundle repo is OCI: instead of the multi-source $values pattern (Git-only +// in Argo CD), the Application is rendered single-source with helm.valuesObject +// inlined from ValuesYAML. See InlineUpstreamValues on Generator for the +// reason. type ApplicationData struct { Name string Namespace string @@ -85,6 +92,8 @@ type ApplicationData struct { TargetRevision string // Target revision for the user's repo BundleDir string // NNN- directory inside the bundle IsLocalChart bool // true → path-based single-source; false → multi-source upstream-helm + InlineValues bool // KindUpstreamHelm + OCI → single-source with helm.valuesObject inlined + ValuesYAML string // Pre-indented YAML (8 spaces) for helm.valuesObject; used when InlineValues } // AppOfAppsData contains data for rendering the App of Apps manifest. @@ -171,6 +180,19 @@ type Generator struct { // vendoring shape. VendorCharts bool + // InlineUpstreamValues replaces the multi-source $values pattern for + // KindUpstreamHelm Applications with a single source whose helm.valuesObject + // is inlined from ComponentValues. Required when RepoURL is OCI because + // Argo CD's $values multi-source ref is Git-only — the repo-server + // attempts a `git ListRefs` against the OCI URL and fails with + // `unsupported scheme "oci"`. Off by default to preserve the existing + // multi-source shape for Git-backed deployments (operators retain + // per-component values.yaml edit ergonomics). Wired by the bundler + // when --deployer argocd targets oci://. Should remain off for + // argocdhelm's inner Generator — that path relies on the multi-source + // shape to transform it into a Helm template with dynamic merging. + InlineUpstreamValues bool + // vendorRecords is populated by Generate when VendorCharts is on. // Captured here so VendorRecords() can expose it to callers // (currently argocdhelm) that need to write provenance.yaml without @@ -371,7 +393,15 @@ func (g *Generator) Generate(ctx context.Context, outputDir string) (*deployer.O fmt.Sprintf("localformat returned folder for unknown component %q", f.Parent)) } - appData := buildApplicationData(*comp, f, i, repoURL, targetRevision) + var inlineValues map[string]any + if g.InlineUpstreamValues && f.Kind == localformat.KindUpstreamHelm { + inlineValues = g.ComponentValues[comp.Name] + } + + appData, err := buildApplicationData(*comp, f, i, repoURL, targetRevision, inlineValues, g.InlineUpstreamValues) + if err != nil { + return nil, err + } appDataList = append(appDataList, appData) folderDir, joinErr := deployer.SafeJoin(outputDir, f.Dir) @@ -507,11 +537,17 @@ func findComponentRef(refs []recipe.ComponentRef, parent string) *recipe.Compone } // buildApplicationData constructs ApplicationData for a single folder. The -// FolderKind drives the Application shape — KindLocalHelm sets LocalChartPath +// FolderKind drives the Application shape — KindLocalHelm sets IsLocalChart // (path-based single-source); KindUpstreamHelm leaves it empty (multi-source // upstream-helm). The folder's name is used as the Application name to keep // primary and injected -post folders distinct in Argo. -func buildApplicationData(comp recipe.ComponentRef, f localformat.Folder, syncWave int, repoURL, targetRevision string) ApplicationData { +// +// When inline is true and the folder is KindUpstreamHelm, values are +// marshaled to deterministic YAML and indented 8 spaces for embedding under +// helm.valuesObject. The result is a single-source Application that avoids +// Argo CD's Git-only $values multi-source ref. Errors from value marshaling +// surface as ErrCodeInternal. +func buildApplicationData(comp recipe.ComponentRef, f localformat.Folder, syncWave int, repoURL, targetRevision string, values map[string]any, inline bool) (ApplicationData, error) { chart := comp.Chart if chart == "" { chart = comp.Name @@ -548,6 +584,47 @@ func buildApplicationData(comp recipe.ComponentRef, f localformat.Folder, syncWa data.Version = deployer.NormalizeVersion(comp.Version) } data.Chart = chart + + if inline { + data.InlineValues = true + yamlStr, err := renderInlineValuesYAML(values) + if err != nil { + return ApplicationData{}, errors.PropagateOrWrap(err, errors.ErrCodeInternal, + fmt.Sprintf("failed to render inline values for %s", comp.Name)) + } + data.ValuesYAML = yamlStr + } + } + return data, nil +} + +// renderInlineValuesYAML marshals values to deterministic YAML and indents +// every line by 8 spaces so the result drops cleanly under +// `spec.source.helm.valuesObject:` (column 6) at column 8. +// +// Empty / nil values produce " {}\n" so the template always emits a +// well-formed map node — Argo CD's schema validation rejects a bare key with +// no value. Trailing-blank lines from yaml.v3 are stripped to keep the +// rendered Application stable across runs. +func renderInlineValuesYAML(values map[string]any) (string, error) { + if len(values) == 0 { + return " {}\n", nil + } + yamlBytes, err := serializer.MarshalYAMLDeterministic(values) + if err != nil { + return "", err + } + const indent = " " + lines := strings.Split(strings.TrimRight(string(yamlBytes), "\n"), "\n") + var b strings.Builder + for _, line := range lines { + if line == "" { + b.WriteString("\n") + continue + } + b.WriteString(indent) + b.WriteString(line) + b.WriteString("\n") } - return data + return b.String(), nil } diff --git a/pkg/bundler/deployer/argocd/argocd_test.go b/pkg/bundler/deployer/argocd/argocd_test.go index 2f91c2bd2..6b5f33eef 100644 --- a/pkg/bundler/deployer/argocd/argocd_test.go +++ b/pkg/bundler/deployer/argocd/argocd_test.go @@ -1180,7 +1180,10 @@ func TestBuildApplicationData_OCIHandling(t *testing.T) { Kind: localformat.KindUpstreamHelm, Parent: "test-component", } - data := buildApplicationData(comp, folder, 0, "https://github.com/example/repo.git", "main") + data, err := buildApplicationData(comp, folder, 0, "https://github.com/example/repo.git", "main", nil, false) + if err != nil { + t.Fatalf("buildApplicationData() error = %v", err) + } if data.Repository != tt.wantRepository { t.Errorf("Repository: got %q, want %q (source=%q chart=%q)", data.Repository, tt.wantRepository, tt.source, tt.chart) @@ -1192,6 +1195,159 @@ func TestBuildApplicationData_OCIHandling(t *testing.T) { } } +// TestGenerate_InlineUpstreamValues verifies that an OCI-bound bundle +// inlines per-component values under helm.valuesObject and drops the +// multi-source $values ref. Closes #960: Argo CD's $values multi-source +// pattern is Git-only — over OCI, the repo-server attempts a git ListRefs +// against the OCI URL and fails with `unsupported scheme "oci"`. +func TestGenerate_InlineUpstreamValues(t *testing.T) { + ctx := context.Background() + outputDir := t.TempDir() + + recipeResult := &recipe.RecipeResult{} + recipeResult.Metadata.Version = testVersion + recipeResult.ComponentRefs = []recipe.ComponentRef{ + { + Name: "cert-manager", + Namespace: "cert-manager", + Chart: "cert-manager", + Version: "v1.20.2", + Type: recipe.ComponentTypeHelm, + Source: "https://charts.jetstack.io", + }, + } + recipeResult.DeploymentOrder = []string{"cert-manager"} + + values := map[string]any{ + "installCRDs": true, + "resources": map[string]any{ + "requests": map[string]any{ + "cpu": "100m", + "memory": "128Mi", + }, + }, + } + + g := &Generator{ + RecipeResult: recipeResult, + ComponentValues: map[string]map[string]any{"cert-manager": values}, + Version: "v0.9.0", + RepoURL: "oci://registry.aicr-registry.svc.cluster.local:5000/aicr/test", + TargetRevision: "v0.9.0", + // Mirrors the bundler wiring at pkg/bundler/bundler.go for an + // OCI-bound --deployer argocd target. + InlineUpstreamValues: true, + } + + if _, err := g.Generate(ctx, outputDir); err != nil { + t.Fatalf("Generate() error = %v", err) + } + + appPath := filepath.Join(outputDir, "001-cert-manager", "application.yaml") + content, err := os.ReadFile(appPath) + if err != nil { + t.Fatalf("failed to read application.yaml: %v", err) + } + assertValidYAML(t, appPath) + + var doc map[string]any + if err := yaml.Unmarshal(content, &doc); err != nil { + t.Fatalf("failed to parse application.yaml: %v", err) + } + spec, ok := doc["spec"].(map[string]any) + if !ok { + t.Fatal("spec is not a map") + } + + if _, hasSources := spec["sources"]; hasSources { + t.Error("spec.sources should NOT be set when InlineUpstreamValues is true (would re-introduce Git-only $values ref)") + } + source, ok := spec["source"].(map[string]any) + if !ok { + t.Fatal("spec.source is not a map") + } + if source["repoURL"] != "https://charts.jetstack.io" { + t.Errorf("source.repoURL = %v, want https://charts.jetstack.io", source["repoURL"]) + } + if source["chart"] != "cert-manager" { + t.Errorf("source.chart = %v, want cert-manager", source["chart"]) + } + // HTTPS Helm-chart-repo sources strip the `v` prefix (index.yaml + // convention); OCI sources preserve it. cert-manager here is HTTPS. + if source["targetRevision"] != "1.20.2" { + t.Errorf("source.targetRevision = %v, want 1.20.2", source["targetRevision"]) + } + + helm, ok := source["helm"].(map[string]any) + if !ok { + t.Fatal("source.helm is not a map") + } + valuesObj, ok := helm["valuesObject"].(map[string]any) + if !ok { + t.Fatalf("helm.valuesObject is not a map: %T", helm["valuesObject"]) + } + if valuesObj["installCRDs"] != true { + t.Errorf("valuesObject.installCRDs = %v, want true", valuesObj["installCRDs"]) + } + + // $values literals must not appear anywhere — that token is the symptom + // of the bug being fixed; if it leaks back in, the regression goes + // silent until a live Argo CD parse. + if strings.Contains(string(content), "$values") { + t.Errorf("application.yaml still references $values:\n%s", string(content)) + } + if strings.Contains(string(content), "ref: values") { + t.Errorf("application.yaml still has ref: values source:\n%s", string(content)) + } +} + +// TestRenderInlineValuesYAML covers the edge cases for the inline-values +// helper: empty map produces a well-formed `{}` (Argo CD's schema rejects a +// bare key with no value), and non-empty maps are 8-space indented to drop +// cleanly under `helm.valuesObject:` at column 6. +func TestRenderInlineValuesYAML(t *testing.T) { + tests := []struct { + name string + values map[string]any + want string + }{ + { + name: "empty map → explicit {} (Argo CD rejects bare valuesObject:)", + values: nil, + want: " {}\n", + }, + { + name: "scalar value at top level", + values: map[string]any{"replicaCount": 3}, + want: " replicaCount: 3\n", + }, + { + name: "nested map preserves indentation under 8-space base", + values: map[string]any{ + "resources": map[string]any{ + "requests": map[string]any{ + "cpu": "100m", + }, + }, + }, + // yaml.v3 uses 2-space indent for nested keys; 8-space base + // stacks on top: col 8 = resources, col 10 = requests, col 12 = cpu. + want: " resources:\n requests:\n cpu: 100m\n", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := renderInlineValuesYAML(tt.values) + if err != nil { + t.Fatalf("renderInlineValuesYAML() error = %v", err) + } + if got != tt.want { + t.Errorf("renderInlineValuesYAML() got:\n%q\nwant:\n%q", got, tt.want) + } + }) + } +} + // TestBundleGolden_HelmAndManifestOnly freezes the bundle output for a recipe // containing both component shapes that this deployer must handle: // diff --git a/pkg/bundler/deployer/argocd/templates/application.yaml.tmpl b/pkg/bundler/deployer/argocd/templates/application.yaml.tmpl index 1ecc68c7f..8215e53f8 100644 --- a/pkg/bundler/deployer/argocd/templates/application.yaml.tmpl +++ b/pkg/bundler/deployer/argocd/templates/application.yaml.tmpl @@ -12,6 +12,14 @@ spec: repoURL: '{{ .RepoURL }}' targetRevision: {{ .TargetRevision }} path: {{ .BundleDir }} +{{- else if .InlineValues }} + source: + repoURL: {{ .Repository }} + chart: {{ .Chart }} + targetRevision: {{ .Version }} + helm: + valuesObject: +{{ .ValuesYAML -}} {{- else }} sources: - repoURL: {{ .Repository }} diff --git a/pkg/bundler/deployer/argocdhelm/argocdhelm.go b/pkg/bundler/deployer/argocdhelm/argocdhelm.go index b021e06f0..37e7afe28 100644 --- a/pkg/bundler/deployer/argocdhelm/argocdhelm.go +++ b/pkg/bundler/deployer/argocdhelm/argocdhelm.go @@ -462,15 +462,27 @@ func (g *Generator) processFolders(ctx context.Context, tmpDir, outputDir, templ continue } - // Mixed components emit a `-post` folder for raw manifests, alongside - // the primary upstream-helm folder. The -post folder is not a - // registered component on its own — its overrides flow from the - // parent (e.g., 010-gpu-operator-post → gpu-operator). Resolve the - // parent before looking up the override key. + // Mixed components emit synthetic `-pre` and `-post` folders for raw + // manifests that bracket the primary upstream-helm folder. Neither + // suffixed folder is a registered component on its own — both + // inherit overrides from the parent (e.g., 006-gpu-operator-pre and + // 010-gpu-operator-post both flow from gpu-operator). Resolve the + // parent before looking up the override key; if neither suffix + // matches a registered component, keep the original name and let + // resolveOverrideKey surface the registry miss. + // + // Symmetry note: the previous version handled only `-post`. Recipes + // with PreManifestFiles (e.g. the gke-cos OS overlay's gpu-driver + // toolkit prereq) synthesize a `-pre` folder; without the matching + // strip, resolveOverrideKey was called with "gpu-operator-pre" and + // failed `component %q not found in registry` at bundle time. parentComponent := folderComponent - if base, ok := strings.CutSuffix(folderComponent, "-post"); ok { - if findComponentByName(g.RecipeResult.ComponentRefs, base) { - parentComponent = base + for _, suffix := range []string{"-pre", "-post"} { + if base, ok := strings.CutSuffix(folderComponent, suffix); ok { + if findComponentByName(g.RecipeResult.ComponentRefs, base) { + parentComponent = base + break + } } } diff --git a/pkg/bundler/deployer/argocdhelm/argocdhelm_test.go b/pkg/bundler/deployer/argocdhelm/argocdhelm_test.go index 31cfb6593..0b7996f45 100644 --- a/pkg/bundler/deployer/argocdhelm/argocdhelm_test.go +++ b/pkg/bundler/deployer/argocdhelm/argocdhelm_test.go @@ -235,6 +235,125 @@ func TestGenerate(t *testing.T) { } } +// TestGenerate_PreManifestParentResolution exercises the synthetic +// `-pre` folder path that recipes with ComponentPreManifests emit +// (e.g. the gke-cos OS overlay, which injects a Talos-style driver +// prerequisite Namespace ahead of gpu-operator). The folder is named +// `NNN-gpu-operator-pre`; the bundler must strip the `-pre` suffix +// when resolving the override key, otherwise resolveOverrideKey is +// called with "gpu-operator-pre" and fails `component %q not found +// in registry` at bundle time. Caught in the KWOK gke-cos-training +// CI lane after the pre-fix; before this commit the bug only +// surfaced for recipes that combined Helm + pre-manifest overlays. +func TestGenerate_PreManifestParentResolution(t *testing.T) { + ctx := context.Background() + outputDir := t.TempDir() + + g := &Generator{ + RecipeResult: newRecipeResult("1.0.0", []recipe.ComponentRef{ + {Name: "gpu-operator", Namespace: "gpu-operator", Source: "https://helm.ngc.nvidia.com/nvidia", Chart: "gpu-operator", Version: "v24.9.0"}, + }), + ComponentValues: map[string]map[string]any{ + "gpu-operator": {"driver": map[string]any{"version": "580"}}, + }, + Version: "test", + RepoURL: "https://github.com/example/repo.git", + ComponentPreManifests: map[string]map[string][]byte{ + "gpu-operator": { + "components/gpu-operator/manifests/cos-namespace.yaml": []byte( + "apiVersion: v1\nkind: Namespace\nmetadata:\n name: gpu-operator\n", + ), + }, + }, + } + + if _, err := g.Generate(ctx, outputDir); err != nil { + t.Fatalf("Generate() error = %v (regression: -pre suffix not stripped before resolveOverrideKey)", err) + } + + // The -pre folder content is copied into the bundle so Argo CD's + // repo-server can resolve the path-based child Application's + // `path: NNN--pre` reference. Confirm it survived the + // argocdhelm wrapper transformation. + preDir := filepath.Join(outputDir, "001-gpu-operator-pre") + if _, err := os.Stat(preDir); err != nil { + t.Errorf("expected -pre folder copied into bundle at %s: %v", preDir, err) + } + + // The transformed template under templates/ should be keyed by the + // PARENT component (gpu-operator), not the -pre folder name, + // because the override key comes from the parent's registry entry. + tmplPath := filepath.Join(outputDir, "templates", "gpu-operator-pre.yaml") + if _, err := os.Stat(tmplPath); err != nil { + t.Errorf("expected wrapper template for -pre folder at %s: %v", tmplPath, err) + } +} + +// TestGenerate_PreAndPostManifestParentResolution covers the actual +// gke-cos OS overlay shape: a single registered component (`gpu-operator`) +// with BOTH preManifestFiles (cos-namespace prep) and postManifestFiles +// (dcgm-exporter). The argocdhelm processFolders loop sees 3 entries +// (`-gpu-operator-pre`, `-gpu-operator`, `-gpu-operator-post`) +// and runs the suffix-strip logic twice across two distinct folders. +// Both wrapper templates must resolve back to the same parent override +// key so the values applied to the chart and the manifest hooks stay +// consistent. Earlier revisions handled only `-post`; this test guards +// against re-introducing that asymmetry. +func TestGenerate_PreAndPostManifestParentResolution(t *testing.T) { + ctx := context.Background() + outputDir := t.TempDir() + + g := &Generator{ + RecipeResult: newRecipeResult("1.0.0", []recipe.ComponentRef{ + {Name: "gpu-operator", Namespace: "gpu-operator", Source: "https://helm.ngc.nvidia.com/nvidia", Chart: "gpu-operator", Version: "v24.9.0"}, + }), + ComponentValues: map[string]map[string]any{ + "gpu-operator": {"driver": map[string]any{"version": "580"}}, + }, + Version: "test", + RepoURL: "https://github.com/example/repo.git", + ComponentPreManifests: map[string]map[string][]byte{ + "gpu-operator": { + "components/gpu-operator/manifests/cos-namespace.yaml": []byte( + "apiVersion: v1\nkind: Namespace\nmetadata:\n name: gpu-operator\n", + ), + }, + }, + ComponentPostManifests: map[string]map[string][]byte{ + "gpu-operator": { + "components/gpu-operator/manifests/dcgm-exporter.yaml": []byte( + "apiVersion: v1\nkind: ConfigMap\nmetadata:\n name: dcgm-config\n", + ), + }, + }, + } + + if _, err := g.Generate(ctx, outputDir); err != nil { + t.Fatalf("Generate() error = %v", err) + } + + // Both synthetic folders must exist in the bundle so the path-based + // child Applications can resolve them. + preDir := filepath.Join(outputDir, "001-gpu-operator-pre") + if _, err := os.Stat(preDir); err != nil { + t.Errorf("expected -pre folder at %s: %v", preDir, err) + } + postDir := filepath.Join(outputDir, "003-gpu-operator-post") + if _, err := os.Stat(postDir); err != nil { + t.Errorf("expected -post folder at %s: %v", postDir, err) + } + + // Both wrapper templates must be present under templates/, both + // keyed by the PARENT component name (not the -pre / -post folder + // names). Asymmetric handling would lose one of them. + for _, suffix := range []string{"-pre", "-post"} { + tmplPath := filepath.Join(outputDir, "templates", "gpu-operator"+suffix+".yaml") + if _, err := os.Stat(tmplPath); err != nil { + t.Errorf("expected wrapper template for %s folder at %s: %v", suffix, tmplPath, err) + } + } +} + func TestGenerate_DataFiles(t *testing.T) { makeGenerator := func(dataFiles []string, includeChecksums bool) *Generator { return &Generator{ diff --git a/pkg/bundler/deployer/localformat/vendor_write_test.go b/pkg/bundler/deployer/localformat/vendor_write_test.go index adf125529..f0a0abe34 100644 --- a/pkg/bundler/deployer/localformat/vendor_write_test.go +++ b/pkg/bundler/deployer/localformat/vendor_write_test.go @@ -323,9 +323,29 @@ func TestWrite_VendorCharts_KustomizeFallthrough(t *testing.T) { // regardless of whether the downstream kustomize build succeeds in // this test environment. // - // kustomize build may shell out to a git fetch; bound the call so a - // stalled subprocess can't hang the suite. - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + // kustomize build may shell out to a git fetch when Repository is a + // remote URL. Two hermeticity guards: + // + // 1. GIT_TERMINAL_PROMPT=0 — on macOS with a credential helper + // installed, git falls through to a `Username for 'https:// + // github.com':` prompt instead of failing fast on missing + // creds; that prompt blocks `make qualify` until the helper + // times out. Setting this env var makes git exit immediately + // so the test stays deterministic regardless of developer git + // config. + // + // 2. The Repository points at the RFC-2606 reserved `.invalid` + // TLD so DNS fails before any HTTP round trip — no chance of + // accidentally reaching a real github.com 404 (which would + // trigger the credential-helper path again on some setups). + // + // Bound the overall call so a stalled subprocess can't hang the + // suite even if both guards fail somehow. 5s is plenty — DNS for + // .invalid resolves to NXDOMAIN in milliseconds and the only + // reason this would approach the budget is a misbehaving + // kustomize/git-library internal retry loop. + t.Setenv("GIT_TERMINAL_PROMPT", "0") + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() outDir := t.TempDir() @@ -334,7 +354,7 @@ func TestWrite_VendorCharts_KustomizeFallthrough(t *testing.T) { Components: []localformat.Component{{ Name: "kpack", Namespace: "kpack", - Repository: "https://github.com/example/kpack-overlay", + Repository: "https://kpack-overlay.invalid", Path: "config/default", Tag: "v0.1.0", }}, @@ -343,9 +363,9 @@ func TestWrite_VendorCharts_KustomizeFallthrough(t *testing.T) { }) recs := res.VendoredCharts // Unconditional: kustomize must never produce a vendor record, - // build success or not. (The kustomize build itself typically fails - // in the hermetic test environment because we can't reach the git - // remote, but the routing decision is what we're pinning here.) + // build success or not. (The kustomize build itself fails fast + // because the .invalid TLD doesn't resolve, but the routing + // decision is what we're pinning here.) if len(recs) != 0 { t.Errorf("kustomize component should not produce vendor records, got %d (err=%v)", len(recs), err) } diff --git a/pkg/cli/bundle.go b/pkg/cli/bundle.go index af5c63837..9a6415b4a 100644 --- a/pkg/cli/bundle.go +++ b/pkg/cli/bundle.go @@ -167,10 +167,20 @@ func parseBundleCmdOptions(cmd *cli.Command, cfg *appcfg.AICRConfig) (*bundleCmd opts.outputDir = absOut } - // When using Argo CD deployer with OCI output and no explicit --repo, - // auto-populate repoURL from the OCI reference (issue #519). + // When using --deployer argocd with OCI output and no explicit --repo, + // auto-populate repoURL from the OCI reference. The oci:// scheme is + // required so Argo CD routes to its native OCI artifact source instead + // of treating the value as a Git remote. + // + // argocd-helm is intentionally excluded: that bundle is URL-portable + // by design — the publish location is supplied at `helm install` + // time via `--set repoURL=...`, and the bundler's --repo flag is a + // no-op for it (the helper logs `--repo is ignored with --deployer + // argocd-helm`). Auto-filling repoURL here would surface that + // "ignored" warning even when the user never passed --repo, which + // blurs the URL-portable contract of the deployer. if opts.deployer == config.DeployerArgoCD && opts.ociRef != nil && opts.repoURL == "" { - opts.repoURL = opts.ociRef.Registry + "/" + opts.ociRef.Repository + opts.repoURL = oci.EnsureScheme(opts.ociRef.Registry + "/" + opts.ociRef.Repository) } // Derive target revision: use OCI tag when available @@ -636,15 +646,34 @@ func selectAttester(ctx context.Context, opts *bundleCmdOptions) (attestation.At } // pushOCIBundle packages and pushes the bundle to an OCI registry. +// Branches on deployer: --deployer argocd-helm uses the Helm OCI flow +// (helm.config.v1+json config + helm.chart.content.v1.tar+gzip layer) +// so `helm pull oci://…` discovers the chart at pull time; every other +// deployer uses AICR's generic OCI artifactType. See #961. func pushOCIBundle(ctx context.Context, opts *bundleCmdOptions, out *result.Output) error { - pushResult, err := oci.PackageAndPush(ctx, oci.OutputConfig{ - SourceDir: opts.outputDir, - OutputDir: opts.outputDir, - Reference: opts.ociRef, - Version: version, - PlainHTTP: opts.plainHTTP, - InsecureTLS: opts.insecureTLS, - }) + var ( + pushResult *oci.PackageAndPushResult + err error + ) + if opts.deployer == config.DeployerArgoCDHelm { + pushResult, err = oci.PackageAndPushHelmChart(ctx, oci.HelmChartOptions{ + SourceDir: opts.outputDir, + OutputDir: opts.outputDir, + Reference: opts.ociRef, + Version: version, + PlainHTTP: opts.plainHTTP, + InsecureTLS: opts.insecureTLS, + }) + } else { + pushResult, err = oci.PackageAndPush(ctx, oci.OutputConfig{ + SourceDir: opts.outputDir, + OutputDir: opts.outputDir, + Reference: opts.ociRef, + Version: version, + PlainHTTP: opts.plainHTTP, + InsecureTLS: opts.insecureTLS, + }) + } if err != nil { return err } diff --git a/pkg/cli/bundle_test.go b/pkg/cli/bundle_test.go index cf0f62295..75ddf9c59 100644 --- a/pkg/cli/bundle_test.go +++ b/pkg/cli/bundle_test.go @@ -16,6 +16,8 @@ package cli import ( "context" + "os" + "path/filepath" "testing" "github.com/NVIDIA/aicr/pkg/bundler/attestation" @@ -120,3 +122,72 @@ func TestSelectAttester_WiresEnvAndFlags(t *testing.T) { t.Errorf("expected *LazyKeylessAttester (deferred token resolution), got %T", att) } } + +// TestParseBundleCmdOptions_OCIRepoURLDerivation verifies the auto-population +// rules for opts.repoURL when bundling to an OCI target: +// +// - --deployer argocd: derive `oci://...` from --output. Argo CD v3.1+ +// parses a schemeless registry/repo string as a Git remote and fails +// on ssh-agent; the `oci://` prefix routes to its native OCI source. +// - --deployer argocd-helm: do NOT derive. That bundle is URL-portable +// by design — the publish location is supplied at `helm install` +// time via `--set repoURL=...`. Auto-deriving would surface the +// "--repo is ignored" warning even when the user never passed --repo. +// - --deployer helm: never derive. +// - explicit --repo: never overwritten. +func TestParseBundleCmdOptions_OCIRepoURLDerivation(t *testing.T) { + tmp := t.TempDir() + recipePath := filepath.Join(tmp, "recipe.yaml") + if err := os.WriteFile(recipePath, []byte("kind: Recipe\n"), 0o600); err != nil { + t.Fatalf("write recipe: %v", err) + } + + const ociOutput = "oci://reg.example.com:5000/aicr/foo:v1" + + tests := []struct { + name string + args []string + expectedRepo string + expectedTarget string + }{ + { + name: "argocd OCI derives oci:// scheme", + args: []string{"--recipe", recipePath, "--output", ociOutput, "--deployer", "argocd"}, + expectedRepo: "oci://reg.example.com:5000/aicr/foo", + expectedTarget: "v1", + }, + { + name: "argocd-helm OCI does NOT derive repoURL (URL-portable contract)", + args: []string{"--recipe", recipePath, "--output", ociOutput, "--deployer", "argocd-helm"}, + expectedRepo: "", + expectedTarget: "v1", + }, + { + name: "explicit --repo not overwritten", + args: []string{"--recipe", recipePath, "--output", ociOutput, "--deployer", "argocd", "--repo", "https://github.com/x/y"}, + expectedRepo: "https://github.com/x/y", + expectedTarget: "v1", + }, + { + name: "helm deployer leaves repoURL empty", + args: []string{"--recipe", recipePath, "--output", ociOutput, "--deployer", "helm"}, + expectedRepo: "", + expectedTarget: "v1", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + opts := captureBundleOpts(t, tt.args) + if opts == nil { + t.Fatal("captureBundleOpts returned nil") + } + if opts.repoURL != tt.expectedRepo { + t.Errorf("repoURL = %q, want %q", opts.repoURL, tt.expectedRepo) + } + if opts.targetRevision != tt.expectedTarget { + t.Errorf("targetRevision = %q, want %q", opts.targetRevision, tt.expectedTarget) + } + }) + } +} diff --git a/pkg/defaults/timeouts.go b/pkg/defaults/timeouts.go index e177f8f7d..425cb0956 100644 --- a/pkg/defaults/timeouts.go +++ b/pkg/defaults/timeouts.go @@ -453,6 +453,15 @@ const ( // or server to allocate an unbounded buffer. MaxConfigBytes int64 = 1 * 1024 * 1024 // 1 MiB + // MaxChartYAMLBytes caps the size of a Chart.yaml file that + // pkg/oci.PackageAndPushHelmChart reads from a caller-supplied + // SourceDir before pushing as an OCI artifact. Real Chart.yaml + // files are well under 4 KiB (apiVersion + name + version + + // maybe dependencies); 1 MiB is generous headroom while bounding + // an attacker-influenced SourceDir (symlink to /proc, NFS mount, + // FUSE filesystem) before os.ReadFile would OOM the process. + MaxChartYAMLBytes int64 = 1 * 1024 * 1024 // 1 MiB + // MaxChecksumFileBytes caps the size of a bundle checksums.txt file. // One entry is ~80 bytes; 1 MiB allows ~12k entries — well above any // realistic bundle while bounding attacker-influenced inputs at the diff --git a/pkg/oci/helm.go b/pkg/oci/helm.go new file mode 100644 index 000000000..b81fc9e66 --- /dev/null +++ b/pkg/oci/helm.go @@ -0,0 +1,528 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package oci + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "encoding/json" + stderrors "errors" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + "strings" + + "github.com/Masterminds/semver/v3" + "github.com/opencontainers/go-digest" + ociv1 "github.com/opencontainers/image-spec/specs-go/v1" + "gopkg.in/yaml.v3" + oras "oras.land/oras-go/v2" + "oras.land/oras-go/v2/content/oci" + "oras.land/oras-go/v2/registry/remote" + + "github.com/NVIDIA/aicr/pkg/defaults" + apperrors "github.com/NVIDIA/aicr/pkg/errors" + "github.com/NVIDIA/aicr/pkg/serializer" +) + +// Helm OCI registry mediaTypes — pinned per the official Helm OCI spec +// (https://helm.sh/docs/topics/registries/). `helm pull`/`helm install` +// read the manifest config blob's mediaType to discover whether the +// artifact is a Helm chart. When AICR pushes with our generic AICR +// artifactType, helm v3 rejects the artifact during pull discovery and +// reports `unable to locate any tags in provided repository` (see +// #961). Pushing with these constants makes the argocd-helm bundle +// indistinguishable from a chart pushed via `helm push`. +const ( + helmConfigMediaType = "application/vnd.cncf.helm.config.v1+json" + helmLayerMediaType = "application/vnd.cncf.helm.chart.content.v1.tar+gzip" +) + +// chartYAML is intentionally the OCI **config blob** (registry-visible +// metadata), NOT a full Chart.yaml model. Helm itself reads name + +// version off the chart tarball's internal Chart.yaml — the full +// Chart.yaml content ships in the tar layer, so any Helm chart key not +// modeled here is not lost. The config blob mirrors only the fields +// `helm pull` validates and that registries (Harbor, GCR, ghcr) render +// in their artifact-metadata UI. +// +// Do NOT widen this struct to model additional Chart.yaml fields. The +// fix for a missing chart property is to ensure it's in the in-source +// Chart.yaml (which the bundler writes), not to plumb it through the +// config blob. +type chartYAML struct { + APIVersion string `yaml:"apiVersion" json:"apiVersion"` + Name string `yaml:"name" json:"name"` + Version string `yaml:"version" json:"version"` + Description string `yaml:"description,omitempty" json:"description,omitempty"` + Type string `yaml:"type,omitempty" json:"type,omitempty"` + AppVersion string `yaml:"appVersion,omitempty" json:"appVersion,omitempty"` +} + +// HelmChartOptions configures the Helm OCI push flow. +type HelmChartOptions struct { + // SourceDir is the directory containing the Helm chart (Chart.yaml, + // values.yaml, templates/, etc.). Mirrors PackageOptions.SourceDir. + SourceDir string + // OutputDir is where the temporary OCI Image Layout is created; + // callers usually pass the bundle output directory so cleanup is + // scoped to one tree. + OutputDir string + // Reference is the OCI registry reference. The reference's Tag MUST + // match the chart version that helm install / helm pull will look + // for; the push flow rewrites the in-source Chart.yaml so this + // invariant holds even when the recipe metadata version differs + // from the user-supplied --output tag. + Reference *Reference + // PlainHTTP uses HTTP instead of HTTPS (local test registries). + PlainHTTP bool + // InsecureTLS skips TLS certificate verification. + InsecureTLS bool + // Version threads the AICR CLI version into manifest annotations + // alongside the chart-derived ones. Pure metadata; not consulted + // by Helm at install time. + Version string +} + +// PackageAndPushHelmChart packages a Helm chart directory into a +// Helm-OCI compatible artifact and pushes it to a registry. Closes the +// gap described in #961: the standard AICR OCI flow uses +// `application/vnd.nvidia.aicr.artifact` for the manifest's +// artifactType and `application/vnd.oci.image.layer.v1.tar+gzip` for +// the layer; helm v3 rejects those during pull discovery and the user +// sees `unable to locate any tags in provided repository` — even +// though the tag exists and `/v2//tags/list` returns it. +// +// SourceDir is NOT mutated. The Chart.yaml rewrite (so its version +// matches Reference.Tag — see helm's chart-version invariant below) +// happens on a copy inside OutputDir, leaving the caller's source +// tree byte-identical. Earlier revisions of this function rewrote +// SourceDir in place, which (a) leaked the OCI tag back into the +// caller's working copy and (b) was non-atomic — a crash mid-write +// left a corrupt Chart.yaml the next run would refuse to parse. +// +// Helm OCI tags ARE chart versions: `helm install … --version ` +// looks up `:` and verifies the chart manifest's version +// against ``. Without the rewrite, a user who supplies `--output +// oci://…/foo:5bc50950-helm` for a recipe whose metadata version is +// `0.1.0` ends up with `Chart.yaml: 0.1.0` at the `:5bc50950-helm` +// tag, and helm rejects the version mismatch. +func PackageAndPushHelmChart(ctx context.Context, opts HelmChartOptions) (*PackageAndPushResult, error) { + if opts.Reference == nil || !opts.Reference.IsOCI { + return nil, apperrors.New(apperrors.ErrCodeInvalidRequest, "OCI reference is required for PackageAndPushHelmChart") + } + if opts.Reference.Tag == "" { + return nil, apperrors.New(apperrors.ErrCodeInvalidRequest, "tag is required for Helm OCI push") + } + if err := validateHelmTag(opts.Reference.Tag); err != nil { + return nil, err + } + if err := validateRegistryReference(opts.Reference.Registry, opts.Reference.Repository); err != nil { + return nil, err + } + + absSourceDir, err := filepath.Abs(opts.SourceDir) + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to resolve source directory", err) + } + absOutputDir, err := filepath.Abs(opts.OutputDir) + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to resolve output directory", err) + } + + // Stage a working copy of SourceDir so the rewrite + tar steps never + // mutate the caller's tree. MUST be outside SourceDir: the CLI calls + // this function with SourceDir == OutputDir, and putting the work + // dir under either would cause copyDir's recursive walk to copy the + // freshly-created work dir back into itself (observed: ~250 nested + // `helm-chart-work/helm-chart-work/...` levels until ENAMETOOLONG). + // Use the OS temp area instead and clean up unconditionally. + workDir, mkdirErr := os.MkdirTemp("", "aicr-helm-chart-") + if mkdirErr != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to create Helm staging dir", mkdirErr) + } + defer func() { _ = os.RemoveAll(workDir) }() + if copyErr := copyDir(ctx, absSourceDir, workDir); copyErr != nil { + return nil, apperrors.PropagateOrWrap(copyErr, apperrors.ErrCodeInternal, + "failed to stage Helm chart source for OCI push") + } + + chartMeta, err := loadAndRewriteChartYAML(workDir, opts.Reference.Tag) + if err != nil { + return nil, err + } + + slog.Info("packaging Helm chart for OCI push", + "registry", opts.Reference.Registry, + "repository", opts.Reference.Repository, + "tag", opts.Reference.Tag, + "chart_name", chartMeta.Name, + "chart_version", chartMeta.Version, + ) + + chartTGZ, err := buildHelmChartTGZ(ctx, workDir, chartMeta.Name) + if err != nil { + return nil, err + } + + configBlob, err := json.Marshal(chartMeta) + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to marshal Helm chart config", err) + } + + storePath := filepath.Join(absOutputDir, "oci-layout-helm") + if mkdirErr := os.MkdirAll(storePath, 0o755); mkdirErr != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to create Helm OCI store directory", mkdirErr) + } + store, err := oci.New(storePath) + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to create Helm OCI store", err) + } + + manifestDesc, err := stageHelmOCIManifest(ctx, store, chartMeta, chartTGZ, configBlob, opts.Version) + if err != nil { + return nil, err + } + if tagErr := store.Tag(ctx, manifestDesc, opts.Reference.Tag); tagErr != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to tag Helm OCI manifest", tagErr) + } + + pushResult, err := pushHelmOCIFromStore(ctx, opts, store) + if err != nil { + return nil, err + } + + return &PackageAndPushResult{ + Digest: pushResult.Digest, + MediaType: pushResult.MediaType, + Size: pushResult.Size, + Reference: pushResult.Reference, + StorePath: storePath, + }, nil +} + +// validateHelmTag rejects OCI tags that helm v3 won't accept as chart +// versions. Helm's registry client filters tags via +// `semver.StrictNewVersion` (pkg/registry/client.go::Tags()) — any tag +// that doesn't parse as semver is silently dropped, and `helm pull` / +// `helm install --version ` reports `unable to locate any tags in +// provided repository` even when the artifact exists in the registry +// with correct Helm OCI mediaTypes. Returning a typed error here +// converts that silent helm-side failure into an actionable +// bundle-time error with the constraint spelled out. +// +// One concession to common usage: a leading `v` (e.g. `v1.2.3`) is +// stripped before the parse because helm itself strips it (see +// `pkg/registry/client.go::Tags()` → `strings.TrimPrefix(tag, "v")` is +// implicit in the version compare path). Users who reach for `v1.2.3` +// should not be punished for a stylistic choice helm accepts. +func validateHelmTag(tag string) error { + candidate := strings.TrimPrefix(tag, "v") + if _, err := semver.StrictNewVersion(candidate); err != nil { + return apperrors.New(apperrors.ErrCodeInvalidRequest, + fmt.Sprintf("Helm OCI requires a semver-compatible tag; got %q. "+ + "`helm pull` / `helm install --version ` filter tags via "+ + "semver.StrictNewVersion and silently drop non-semver tags, "+ + "surfacing as `unable to locate any tags in provided repository`. "+ + "Wrap arbitrary identifiers as a semver pre-release, e.g. "+ + "`0.0.0-%s`.", tag, tag)) + } + return nil +} + +// loadAndRewriteChartYAML reads Chart.yaml from sourceDir, sets its +// version to tag, and writes the result back. Returns the parsed chart +// metadata so callers can build the OCI config blob without re-parsing. +// +// The rewrite is intentional and idempotent — see +// PackageAndPushHelmChart for the chart-version-equals-OCI-tag +// invariant. +func loadAndRewriteChartYAML(sourceDir, tag string) (*chartYAML, error) { + chartPath := filepath.Join(sourceDir, "Chart.yaml") + + // Bounded read: SourceDir is caller-supplied, so a hostile symlink + // (/proc, NFS, FUSE) could trick os.ReadFile into allocating the + // whole file into memory before yaml.Unmarshal would notice. Open + // + LimitReader against the Chart.yaml-shaped cap from pkg/defaults + // keeps the allocation bounded even under an attacker-influenced + // source path. See CLAUDE.md anti-pattern table for the rule. + f, err := os.Open(chartPath) //nolint:gosec // path is under caller-supplied source dir + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInvalidRequest, "failed to open Chart.yaml from source directory", err) + } + defer func() { _ = f.Close() }() + data, err := io.ReadAll(io.LimitReader(f, defaults.MaxChartYAMLBytes+1)) + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInvalidRequest, "failed to read Chart.yaml from source directory", err) + } + if int64(len(data)) > defaults.MaxChartYAMLBytes { + return nil, apperrors.New(apperrors.ErrCodeInvalidRequest, + fmt.Sprintf("Chart.yaml exceeds %d bytes — refusing to read unbounded source", defaults.MaxChartYAMLBytes)) + } + + var meta chartYAML + if unmarshalErr := yaml.Unmarshal(data, &meta); unmarshalErr != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInvalidRequest, "failed to parse Chart.yaml", unmarshalErr) + } + if meta.Name == "" { + return nil, apperrors.New(apperrors.ErrCodeInvalidRequest, "Chart.yaml: name is required") + } + if nameErr := validateChartName(meta.Name); nameErr != nil { + return nil, nameErr + } + if meta.APIVersion == "" { + meta.APIVersion = "v2" + } + meta.Version = tag + + // The marshaled bytes get written back to Chart.yaml, which is then + // tarred into the chart.tgz that becomes the OCI artifact's layer + // blob. The layer digest is consumed by helm OCI's content- + // addressable cache and by any downstream attestation. Deterministic + // marshal is therefore mandatory — the chartYAML struct itself is + // stable, but `yaml.v3` makes formatting choices (block-vs-flow + // scalar style, line wrapping) that can vary; using the project's + // deterministic serializer is the lighter belt-and-suspenders + // matching the rest of the digest-feeding paths in this repo. + rewritten, err := serializer.MarshalYAMLDeterministic(meta) + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to marshal Chart.yaml", err) + } + if err := os.WriteFile(chartPath, rewritten, 0o600); err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to rewrite Chart.yaml with OCI tag version", err) + } + return &meta, nil +} + +// validateChartName rejects chart names whose layout would let the tar +// entry prefix (`/`) escape the chart root on extraction. +// Helm itself enforces the Chart.yaml name to be `[a-z0-9-]+` at install +// time, but the OCI push path here is exposed as a public function and +// a future caller with a name derived from less-validated input could +// otherwise produce a tarball with absolute or parent-traversal entries. +// +// filepath.IsLocal catches `/`, `..`, drive letters, and parent +// references; the extra ContainsAny adds belt-and-suspenders against +// control characters and slashes (IsLocal accepts an embedded NUL on +// some platforms). +func validateChartName(name string) error { + if name == "" { + return apperrors.New(apperrors.ErrCodeInvalidRequest, "chart name is required") + } + if !filepath.IsLocal(name) || strings.ContainsAny(name, "/\\\x00") { + return apperrors.New(apperrors.ErrCodeInvalidRequest, + fmt.Sprintf("chart name %q contains unsafe characters — must be a single path segment without `/`, `..`, or control chars", name)) + } + return nil +} + +// buildHelmChartTGZ produces a Helm-conformant chart.tgz: a gzipped tar +// where every entry is prefixed with `/` (the Helm "chart +// root" convention). `helm package` produces exactly this layout; helm +// v3's pull / install both depend on the prefix so the chart's +// templates/ resolution roots correctly inside the extracted directory. +// +// Reproducible-build hygiene: mtimes are zeroed and uid/gid/uname/ +// gname are stripped (left at their zero values) so two AICR runs +// against the same source tree produce the same chart.tgz digest, +// which the OCI registry then dedups. +// +// ctx is checked on entry and per directory entry so a parent timeout +// (push deadline, server shutdown) terminates the walk without +// orphaning a partially-written buffer. +func buildHelmChartTGZ(ctx context.Context, sourceDir, chartName string) ([]byte, error) { + if err := ctx.Err(); err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeUnavailable, "chart packaging canceled", err) + } + // Defense-in-depth: loadAndRewriteChartYAML already validated the + // name, but this function is exported and a future caller might + // construct chartName from a different source. Reject here too so + // the tar-entry prefix is always a safe single segment. + if err := validateChartName(chartName); err != nil { + return nil, err + } + + var buf bytes.Buffer + gz := gzip.NewWriter(&buf) + tw := tar.NewWriter(gz) + + walkErr := filepath.Walk(sourceDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if ctxErr := ctx.Err(); ctxErr != nil { + return ctxErr + } + rel, relErr := filepath.Rel(sourceDir, path) + if relErr != nil { + return relErr + } + if rel == "." { + // Skip the synthetic chart-root directory entry; helm v3 + // extracts by file path and tolerates its absence, and + // emitting it adds noise to the tar that defeats the + // reproducible-digest goal whenever a tool inserts/omits it. + return nil + } + entry := chartName + "/" + filepath.ToSlash(rel) + + if info.IsDir() { + hdr := &tar.Header{ + Name: entry + "/", + Mode: 0o755, + Typeflag: tar.TypeDir, + } + return tw.WriteHeader(hdr) + } + // Skip non-regular files (symlinks, sockets, etc.) — chart + // bundles produced by argocdhelm are always plain files; + // surfacing the rest defensively avoids pushing an unsupported + // tar entry the registry would otherwise accept silently. + if !info.Mode().IsRegular() { + return nil + } + hdr := &tar.Header{ + Name: entry, + Mode: int64(info.Mode().Perm()), + Size: info.Size(), + Typeflag: tar.TypeReg, + } + if err := tw.WriteHeader(hdr); err != nil { + return err + } + f, openErr := os.Open(filepath.Clean(path)) //nolint:gosec // walking caller-supplied source dir + if openErr != nil { + return openErr + } + _, copyErr := io.Copy(tw, f) + closeErr := f.Close() + if copyErr != nil { + return copyErr + } + return closeErr + }) + if walkErr != nil { + // filepath.Walk callbacks return ctx.Err() on cancellation + // (see the entry-point check at the top of the closure). + // Surfacing those as ErrCodeInternal would misclassify a + // caller-initiated cancel or a parent-timeout as a bug, and + // flatten the CI's distinction between "chart packaging + // failed" and "parent context expired". Classify by the + // inner error so the same path stays ErrCodeInternal for + // real I/O failures and ErrCodeTimeout for cancellation. + if stderrors.Is(walkErr, context.Canceled) || stderrors.Is(walkErr, context.DeadlineExceeded) { + return nil, apperrors.PropagateOrWrap(walkErr, apperrors.ErrCodeTimeout, "chart packaging canceled") + } + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to build Helm chart tarball", walkErr) + } + if err := tw.Close(); err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to close tar writer", err) + } + if err := gz.Close(); err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to close gzip writer", err) + } + return buf.Bytes(), nil +} + +// stageHelmOCIManifest pushes the chart layer + config blob into store +// and packs an OCI 1.0 manifest with the Helm config mediaType. v1.0 +// is intentional: helm v3's pull path reads manifest.config.mediaType +// to discover the chart artifactType. v1.1's separate `artifactType` +// field with an empty config works for some registries but not all +// (notably older registry:2 builds observed in #961's repro); v1.0 +// keeps the legacy interpretation where the config blob's mediaType +// IS the artifact type. +func stageHelmOCIManifest(ctx context.Context, store *oci.Store, meta *chartYAML, chartTGZ, configBlob []byte, aicrVersion string) (ociv1.Descriptor, error) { + layerDesc := ociv1.Descriptor{ + MediaType: helmLayerMediaType, + Digest: digest.FromBytes(chartTGZ), + Size: int64(len(chartTGZ)), + } + if err := store.Push(ctx, layerDesc, bytes.NewReader(chartTGZ)); err != nil { + return ociv1.Descriptor{}, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to add chart layer to OCI store", err) + } + + configDesc := ociv1.Descriptor{ + MediaType: helmConfigMediaType, + Digest: digest.FromBytes(configBlob), + Size: int64(len(configBlob)), + } + if err := store.Push(ctx, configDesc, bytes.NewReader(configBlob)); err != nil { + return ociv1.Descriptor{}, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to add chart config to OCI store", err) + } + + annotations := map[string]string{ + ociv1.AnnotationCreated: reproducibleTimestamp, + ociv1.AnnotationTitle: meta.Name, + ociv1.AnnotationVersion: meta.Version, + "org.opencontainers.image.vendor": "NVIDIA", + } + if meta.Description != "" { + annotations[ociv1.AnnotationDescription] = meta.Description + } + if aicrVersion != "" { + annotations["com.nvidia.aicr.version"] = aicrVersion + } + + packOpts := oras.PackManifestOptions{ + Layers: []ociv1.Descriptor{layerDesc}, + ConfigDescriptor: &configDesc, + ManifestAnnotations: annotations, + } + manifestDesc, err := oras.PackManifest(ctx, store, oras.PackManifestVersion1_0, helmConfigMediaType, packOpts) + if err != nil { + return ociv1.Descriptor{}, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to pack Helm OCI manifest", err) + } + return manifestDesc, nil +} + +// pushHelmOCIFromStore uploads the staged manifest tree from an OCI +// Image Layout to the remote registry. Wraps the same retry policy +// as the generic AICR push path so the two flows share transient- +// failure handling. +func pushHelmOCIFromStore(ctx context.Context, opts HelmChartOptions, store *oci.Store) (*PushResult, error) { + registryHost := stripProtocol(opts.Reference.Registry) + refString := fmt.Sprintf("%s/%s:%s", registryHost, opts.Reference.Repository, opts.Reference.Tag) + + repo, err := remote.NewRepository(fmt.Sprintf("%s/%s", registryHost, opts.Reference.Repository)) + if err != nil { + return nil, apperrors.Wrap(apperrors.ErrCodeInternal, "failed to initialize remote repository", err) + } + repo.PlainHTTP = opts.PlainHTTP + + authClient, err := createAuthClientForHost(registryHost, opts.PlainHTTP, opts.InsecureTLS) + if err != nil { + slog.Warn("failed to initialize Docker credential store, continuing without authentication", "error", err) + } + repo.Client = authClient + + copyOpts := oras.DefaultCopyOptions + copyOpts.Concurrency = defaults.OCIPushConcurrency + desc, err := copyWithRetry(ctx, store, opts.Reference.Tag, repo, opts.Reference.Tag, copyOpts, oras.Copy) + if err != nil { + return nil, err + } + + return &PushResult{ + Digest: desc.Digest.String(), + MediaType: desc.MediaType, + Size: desc.Size, + Reference: refString, + }, nil +} diff --git a/pkg/oci/helm_test.go b/pkg/oci/helm_test.go new file mode 100644 index 000000000..d5833b0b3 --- /dev/null +++ b/pkg/oci/helm_test.go @@ -0,0 +1,795 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package oci + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "testing" + "time" + + ociv1 "github.com/opencontainers/image-spec/specs-go/v1" + "gopkg.in/yaml.v3" + orasoci "oras.land/oras-go/v2/content/oci" +) + +// openOCILayout creates (and returns) an OCI Image Layout store rooted +// at dir. Mirrors how PackageAndPushHelmChart provisions its staging +// store so the test exercises the production code path. +func openOCILayout(dir string) (*orasoci.Store, error) { + if err := os.MkdirAll(dir, 0o755); err != nil { + return nil, err + } + return orasoci.New(dir) +} + +// readOCIManifest decodes the JSON manifest blob at +// /blobs/sha256/ back into a typed ociv1.Manifest. +// Used by the helm tests to assert mediaTypes were set correctly +// without standing up a registry. +func readOCIManifest(t *testing.T, storeDir, digest string) ociv1.Manifest { + t.Helper() + blobPath := filepath.Join(storeDir, "blobs", "sha256", strings.TrimPrefix(digest, "sha256:")) + raw, err := os.ReadFile(blobPath) + if err != nil { + t.Fatalf("read manifest blob: %v", err) + } + var manifest ociv1.Manifest + if err := json.Unmarshal(raw, &manifest); err != nil { + t.Fatalf("unmarshal manifest: %v", err) + } + return manifest +} + +// writeHelmChartFixture stages a minimal Helm chart on disk for the +// tests below. Returns the chart's root directory. Caller supplies +// chartName so tests can vary it where the chart-root prefix matters +// (buildHelmChartTGZ). Version is fixed at "0.1.0" — the rewrite test +// asserts that the on-disk version is REPLACED by the OCI tag, so the +// starting value just needs to be something different from any tag a +// test uses. +func writeHelmChartFixture(t *testing.T, chartName string) string { + t.Helper() + dir := t.TempDir() + files := map[string]string{ + "Chart.yaml": "apiVersion: v2\nname: " + chartName + "\nversion: 0.1.0" + + "\ndescription: AICR test chart\n", + "values.yaml": "replicaCount: 1\n", + "templates/deployment.yaml": "kind: Deployment\n", + "templates/_helpers.tpl": `{{ define "noop" }}{{ end }}` + "\n", + } + for rel, content := range files { + path := filepath.Join(dir, rel) + if mkdirErr := os.MkdirAll(filepath.Dir(path), 0o755); mkdirErr != nil { + t.Fatalf("mkdir %s: %v", filepath.Dir(path), mkdirErr) + } + if writeErr := os.WriteFile(path, []byte(content), 0o600); writeErr != nil { + t.Fatalf("write %s: %v", path, writeErr) + } + } + return dir +} + +// TestLoadAndRewriteChartYAML_RewritesVersion verifies the OCI tag +// invariant: helm OCI tags ARE chart versions, so the push flow must +// rewrite Chart.yaml.version to match Reference.Tag. Catches +// regressions where the rewrite is silently skipped and helm install +// rejects the chart with a version-mismatch error. +func TestLoadAndRewriteChartYAML_RewritesVersion(t *testing.T) { + dir := writeHelmChartFixture(t, "aicr-bundle") + + meta, err := loadAndRewriteChartYAML(dir, "5bc50950-argocd-helm-oci") + if err != nil { + t.Fatalf("loadAndRewriteChartYAML() error = %v", err) + } + if meta.Version != "5bc50950-argocd-helm-oci" { + t.Errorf("meta.Version = %q, want %q", meta.Version, "5bc50950-argocd-helm-oci") + } + if meta.Name != "aicr-bundle" { + t.Errorf("meta.Name = %q, want aicr-bundle", meta.Name) + } + + raw, err := os.ReadFile(filepath.Join(dir, "Chart.yaml")) + if err != nil { + t.Fatalf("read Chart.yaml: %v", err) + } + var onDisk chartYAML + if err := yaml.Unmarshal(raw, &onDisk); err != nil { + t.Fatalf("unmarshal rewritten Chart.yaml: %v", err) + } + if onDisk.Version != "5bc50950-argocd-helm-oci" { + t.Errorf("on-disk Chart.yaml version = %q, want rewrite to tag", onDisk.Version) + } +} + +// TestLoadAndRewriteChartYAML_RejectsMissingName surfaces a malformed +// Chart.yaml (missing name) as ErrCodeInvalidRequest so the CLI exits +// non-zero before the (more expensive) tar.gz + push steps. +func TestLoadAndRewriteChartYAML_RejectsMissingName(t *testing.T) { + dir := t.TempDir() + if err := os.WriteFile(filepath.Join(dir, "Chart.yaml"), + []byte("apiVersion: v2\nversion: 0.1.0\n"), 0o600); err != nil { + t.Fatalf("write Chart.yaml: %v", err) + } + _, err := loadAndRewriteChartYAML(dir, "v1") + if err == nil { + t.Fatal("loadAndRewriteChartYAML() expected error for missing name") + } + if !strings.Contains(err.Error(), "name is required") { + t.Errorf("error = %v, want it to mention missing name", err) + } +} + +// TestValidateHelmTag covers the helm-tag semver gate. Helm v3's +// registry client filters tags via semver.StrictNewVersion before +// surfacing them to `helm pull` / `helm install --version `; +// non-semver tags silently disappear and the user sees `unable to +// locate any tags in provided repository`. We catch the bad tag at +// bundle time with a clear remediation hint instead. +// +// `v`-prefixed semver (e.g. `v1.2.3`) is accepted because helm itself +// strips the prefix internally. +func TestValidateHelmTag(t *testing.T) { + tests := []struct { + name string + tag string + wantErr bool + }{ + {"plain semver", "1.2.3", false}, + {"semver pre-release with dashes", "0.0.0-fe346a05-argocd-helm-oci", false}, + {"semver with build metadata", "0.0.0+fe346a05.argocd-helm-oci", false}, + {"v-prefixed semver (helm strips v)", "v1.2.3", false}, + {"v-prefixed semver pre-release", "v0.0.0-rc1", false}, + // Bug-reproducer: the kwok script's original tag scheme. + // helm v3 silently rejects it and reports "no tags found". + {"sha-deployer (kwok bug repro)", "fe346a05-argocd-helm-oci", true}, + {"plain hash", "abc123", true}, + {"empty", "", true}, + {"latest", "latest", true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateHelmTag(tt.tag) + if (err != nil) != tt.wantErr { + t.Errorf("validateHelmTag(%q) error = %v, wantErr %v", tt.tag, err, tt.wantErr) + } + // On rejection, the message must include the remediation + // hint — that's how users find the way out of the + // "no tags found" rabbit hole. + if err != nil && !strings.Contains(err.Error(), "0.0.0-") { + t.Errorf("validateHelmTag(%q) error missing semver-wrap remediation: %v", tt.tag, err) + } + }) + } +} + +// TestBuildHelmChartTGZ_HasChartRootPrefix verifies the tarball obeys +// helm's chart-root-prefix convention: every entry's path is +// `/...`. Helm v3's pull/install rely on this to resolve +// templates/ at the right level inside the extracted directory; an +// unprefixed tarball would deserialize as a flat list of files that +// Helm's loader can't locate templates/ inside. +// +// Uses a chart name that doesn't match the fixture's default +// "aicr-bundle" so the test catches a regression that hardcodes the +// prefix instead of using the chartName argument. +func TestBuildHelmChartTGZ_HasChartRootPrefix(t *testing.T) { + const chartName = "my-renamed-chart" + dir := writeHelmChartFixture(t, chartName) + + tgz, err := buildHelmChartTGZ(context.Background(), dir, chartName) + if err != nil { + t.Fatalf("buildHelmChartTGZ() error = %v", err) + } + entries := readTGZEntries(t, tgz) + + wantPaths := map[string]bool{ + chartName + "/Chart.yaml": false, + chartName + "/values.yaml": false, + chartName + "/templates/": false, + chartName + "/templates/deployment.yaml": false, + chartName + "/templates/_helpers.tpl": false, + } + for _, name := range entries { + if _, want := wantPaths[name]; want { + wantPaths[name] = true + } + if !strings.HasPrefix(name, chartName+"/") { + t.Errorf("entry %q missing chart-root prefix %q", name, chartName+"/") + } + } + for path, seen := range wantPaths { + if !seen { + t.Errorf("entry %q missing from tarball; got %v", path, entries) + } + } +} + +// TestBuildHelmChartTGZ_Reproducible asserts the tar.gz output is +// byte-identical across two builds AND that every tar header satisfies +// the absolute invariants that reproducibility relies on. The +// same-process byte equality is necessary but not sufficient — a +// regression that caches `time.Now()` once at package-init would still +// produce identical bytes across both builds in the same process, but +// the next CI run would diverge. The header-level invariants below +// catch that class of regression by checking concrete absolute values. +func TestBuildHelmChartTGZ_Reproducible(t *testing.T) { + dir := writeHelmChartFixture(t, "aicr-bundle") + a, err := buildHelmChartTGZ(context.Background(), dir, "aicr-bundle") + if err != nil { + t.Fatalf("first build: %v", err) + } + b, err := buildHelmChartTGZ(context.Background(), dir, "aicr-bundle") + if err != nil { + t.Fatalf("second build: %v", err) + } + if !bytes.Equal(a, b) { + t.Fatalf("tarball bytes differ between builds (len %d vs %d)", len(a), len(b)) + } + + // Header-level absolute invariants. Walk the produced tar and + // assert every entry zeroes out all the fields a regression could + // pull from runtime state (mtime / accesstime / changetime, + // uid / gid / uname / gname). A bug that caches time.Now() at + // init would still pass the byte-equality above but trip these + // assertions. + gz, err := gzip.NewReader(bytes.NewReader(a)) + if err != nil { + t.Fatalf("gzip.NewReader: %v", err) + } + defer gz.Close() + // gzip header's ModTime field must be the Go zero value — gzip's + // MTIME field is wall-clock time. If a regression sets it, every + // cross-run digest diverges even if every tar header is clean. + if !gz.ModTime.IsZero() { + t.Errorf("gzip header ModTime = %v, want zero (reproducibility)", gz.ModTime) + } + + // noTime returns true for either Go's zero time.Time (year 1) or + // Unix epoch (year 1970). The tar wire format stores time as + // seconds since epoch, so the Go reader normalizes an unset header + // to time.Unix(0, 0) — which is NOT IsZero() but IS still the + // "no time recorded" sentinel. We accept either. + noTime := func(tm time.Time) bool { return tm.IsZero() || tm.Unix() == 0 } + + tr := tar.NewReader(gz) + var entries int + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("tar.Next: %v", err) + } + entries++ + if !noTime(hdr.ModTime) { + t.Errorf("entry %q ModTime = %v (unix=%d), want zero or epoch", hdr.Name, hdr.ModTime, hdr.ModTime.Unix()) + } + if !noTime(hdr.AccessTime) { + t.Errorf("entry %q AccessTime = %v (unix=%d), want zero or epoch", hdr.Name, hdr.AccessTime, hdr.AccessTime.Unix()) + } + if !noTime(hdr.ChangeTime) { + t.Errorf("entry %q ChangeTime = %v (unix=%d), want zero or epoch", hdr.Name, hdr.ChangeTime, hdr.ChangeTime.Unix()) + } + if hdr.Uid != 0 { + t.Errorf("entry %q Uid = %d, want 0", hdr.Name, hdr.Uid) + } + if hdr.Gid != 0 { + t.Errorf("entry %q Gid = %d, want 0", hdr.Name, hdr.Gid) + } + if hdr.Uname != "" { + t.Errorf("entry %q Uname = %q, want empty", hdr.Name, hdr.Uname) + } + if hdr.Gname != "" { + t.Errorf("entry %q Gname = %q, want empty", hdr.Name, hdr.Gname) + } + } + if entries == 0 { + t.Fatal("walked zero tar entries; fixture or builder produced empty archive") + } +} + +// TestStageHelmOCIManifest_HasHelmMediaTypes stages the chart layer +// + helm.config.v1+json into an OCI Image Layout and asserts the +// manifest carries the Helm mediaTypes. `helm pull` reads +// manifest.config.mediaType to detect the chart artifactType — a +// regression that flips back to the AICR artifactType is exactly the +// bug #961 surfaced. We exercise the staging step in isolation (no +// network push) because the registry-side push path is covered by +// PushFromStore's existing tests. +func TestStageHelmOCIManifest_HasHelmMediaTypes(t *testing.T) { + storeDir := t.TempDir() + store, err := openOCILayout(storeDir) + if err != nil { + t.Fatalf("openOCILayout: %v", err) + } + + meta := &chartYAML{ + APIVersion: "v2", + Name: "aicr-bundle", + Version: "v9.9.9", + } + configBlob, err := json.Marshal(meta) + if err != nil { + t.Fatalf("marshal chartYAML: %v", err) + } + + manifestDesc, err := stageHelmOCIManifest(context.Background(), store, meta, + []byte("fake-chart-tgz-bytes"), configBlob, "v0.9.0-test") + if err != nil { + t.Fatalf("stageHelmOCIManifest() error = %v", err) + } + + manifest := readOCIManifest(t, storeDir, manifestDesc.Digest.String()) + if manifest.Config.MediaType != helmConfigMediaType { + t.Errorf("config.mediaType = %q, want %q (helm pull rejects otherwise — see #961)", + manifest.Config.MediaType, helmConfigMediaType) + } + if len(manifest.Layers) != 1 { + t.Fatalf("len(manifest.Layers) = %d, want 1", len(manifest.Layers)) + } + if manifest.Layers[0].MediaType != helmLayerMediaType { + t.Errorf("layer.mediaType = %q, want %q", + manifest.Layers[0].MediaType, helmLayerMediaType) + } + // AICR's custom artifactType MUST NOT appear anywhere in the + // pushed manifest — its presence is the regression signal. + if manifest.Config.MediaType == artifactType { + t.Errorf("Helm OCI manifest still uses AICR artifactType %q", artifactType) + } +} + +// TestPackageAndPushHelmChart_RewritesChartYAMLOnDisk verifies the +// on-disk side effect required for `helm install --version ` to +// resolve to the same chart bytes that `helm pull` validated. Without +// the rewrite, a recipe-version → OCI-tag mismatch (e.g., recipe +// metadata "0.1.0" vs CI tag "5bc50950-argocd-helm-oci") leaves the +// pushed config.version pointing at "0.1.0" while the tag is the CI +// SHA — helm install would fail with a version mismatch. +// +// Uses a stub OCI layout (no remote push) by exercising +// loadAndRewriteChartYAML directly, then asserting the rewrite is +// visible on disk for the next step. +func TestPackageAndPushHelmChart_RewritesChartYAMLOnDisk(t *testing.T) { + source := writeHelmChartFixture(t, "aicr-bundle") + + meta, err := loadAndRewriteChartYAML(source, "5bc50950-argocd-helm-oci") + if err != nil { + t.Fatalf("loadAndRewriteChartYAML() error = %v", err) + } + if meta.Version != "5bc50950-argocd-helm-oci" { + t.Errorf("returned meta.Version = %q, want rewrite to tag", meta.Version) + } + + raw, err := os.ReadFile(filepath.Join(source, "Chart.yaml")) + if err != nil { + t.Fatalf("read Chart.yaml: %v", err) + } + var diskMeta chartYAML + if err := yaml.Unmarshal(raw, &diskMeta); err != nil { + t.Fatalf("unmarshal Chart.yaml: %v", err) + } + if diskMeta.Version != "5bc50950-argocd-helm-oci" { + t.Errorf("on-disk Chart.yaml version = %q, want rewrite to OCI tag", diskMeta.Version) + } +} + +// TestPackageAndPushHelmChart_RejectsInvalidOptions covers the early +// validation paths in PackageAndPushHelmChart that surface caller errors +// before any registry I/O. Helm OCI bundles get pushed from CI lanes, +// so cheap fast-fail on bad inputs prevents masking real errors with +// remote-side ones. +func TestPackageAndPushHelmChart_RejectsInvalidOptions(t *testing.T) { + source := writeHelmChartFixture(t, "aicr-bundle") + + tests := []struct { + name string + opts HelmChartOptions + want string + }{ + { + name: "nil reference", + opts: HelmChartOptions{ + SourceDir: source, + OutputDir: t.TempDir(), + }, + want: "OCI reference is required", + }, + { + name: "non-OCI reference", + opts: HelmChartOptions{ + SourceDir: source, + OutputDir: t.TempDir(), + Reference: &Reference{ + Registry: "localhost:5000", + Repository: "test/chart", + Tag: "1.0.0", + IsOCI: false, + }, + }, + want: "OCI reference is required", + }, + { + name: "empty tag", + opts: HelmChartOptions{ + SourceDir: source, + OutputDir: t.TempDir(), + Reference: &Reference{ + Registry: "localhost:5000", + Repository: "test/chart", + IsOCI: true, + }, + }, + want: "tag is required", + }, + { + name: "non-semver tag", + opts: HelmChartOptions{ + SourceDir: source, + OutputDir: t.TempDir(), + Reference: &Reference{ + Registry: "localhost:5000", + Repository: "test/chart", + Tag: "abc123-not-semver", + IsOCI: true, + }, + }, + want: "Helm OCI requires a semver-compatible tag", + }, + { + name: "invalid registry", + opts: HelmChartOptions{ + SourceDir: source, + OutputDir: t.TempDir(), + Reference: &Reference{ + Registry: "spaces are bad", + Repository: "test/chart", + Tag: "1.0.0", + IsOCI: true, + }, + }, + want: "invalid", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := PackageAndPushHelmChart(context.Background(), tt.opts) + if err == nil { + t.Fatal("PackageAndPushHelmChart() expected error, got nil") + } + if result != nil { + t.Errorf("PackageAndPushHelmChart() result = %+v, want nil on error", result) + } + if !strings.Contains(err.Error(), tt.want) { + t.Errorf("PackageAndPushHelmChart() error = %q, want substring %q", err.Error(), tt.want) + } + }) + } +} + +// TestPackageAndPushHelmChart_MissingChartYAML covers the source-dir +// validation path: if Chart.yaml is absent the push must fail fast at +// the rewrite step, not after staging an empty manifest. +func TestPackageAndPushHelmChart_MissingChartYAML(t *testing.T) { + emptyDir := t.TempDir() // no Chart.yaml + result, err := PackageAndPushHelmChart(context.Background(), HelmChartOptions{ + SourceDir: emptyDir, + OutputDir: t.TempDir(), + Reference: &Reference{ + Registry: "localhost:5000", + Repository: "test/chart", + Tag: "1.0.0", + IsOCI: true, + }, + }) + if err == nil { + t.Fatal("PackageAndPushHelmChart() expected error for missing Chart.yaml, got nil") + } + if result != nil { + t.Errorf("PackageAndPushHelmChart() result = %+v, want nil on error", result) + } + if !strings.Contains(err.Error(), "Chart.yaml") { + t.Errorf("PackageAndPushHelmChart() error = %q, want to mention Chart.yaml", err.Error()) + } +} + +// TestPackageAndPushHelmChart_SourceEqualsOutputDir guards against the +// `helm-chart-work/helm-chart-work/...` recursion bug observed in +// production: the CLI invokes PackageAndPushHelmChart with SourceDir +// == OutputDir (both point at `./bundle`). An earlier revision placed +// the staging dir under OutputDir, so copyDir's recursive walk picked +// up the freshly-created staging dir and copied it into itself until +// path length hit ENAMETOOLONG. Stage MUST be outside SourceDir. +func TestPackageAndPushHelmChart_SourceEqualsOutputDir(t *testing.T) { + dir := writeHelmChartFixture(t, "aicr-bundle") + registry := newFakeOCIRegistry(t) + defer registry.Close() + registryHost := strings.TrimPrefix(registry.URL, "http://") + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // SourceDir == OutputDir — the CLI's actual call shape. + result, err := PackageAndPushHelmChart(ctx, HelmChartOptions{ + SourceDir: dir, + OutputDir: dir, + Reference: &Reference{ + Registry: registryHost, + Repository: "test/aicr-bundle", + Tag: "1.0.0", + IsOCI: true, + }, + PlainHTTP: true, + Version: "v0.9.0-test", + }) + if err != nil { + t.Fatalf("PackageAndPushHelmChart() error = %v (SourceDir==OutputDir recursion regression)", err) + } + if result == nil { + t.Fatal("PackageAndPushHelmChart() result = nil") + } + + // Caller's source tree must NOT contain a stray staging dir after + // the push — that's the on-disk evidence of the recursion bug. + if _, statErr := os.Stat(filepath.Join(dir, "helm-chart-work")); statErr == nil { + t.Errorf("found leftover helm-chart-work under SourceDir — staging must happen outside SourceDir") + } +} + +// TestPackageAndPushHelmChart_PushesAgainstFakeRegistry exercises the +// full happy path including the network roundtrip against an httptest +// OCI registry. Beyond coverage for pushHelmOCIFromStore (the +// rest-of-the-function dead-after-staging path that was 0%), the +// returned PackageAndPushResult is asserted on so unparam sees the +// result as consumed in-package — see #843's lint follow-up. +func TestPackageAndPushHelmChart_PushesAgainstFakeRegistry(t *testing.T) { + source := writeHelmChartFixture(t, "aicr-bundle") + registry := newFakeOCIRegistry(t) + defer registry.Close() + + registryHost := strings.TrimPrefix(registry.URL, "http://") + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + result, err := PackageAndPushHelmChart(ctx, HelmChartOptions{ + SourceDir: source, + OutputDir: t.TempDir(), + Reference: &Reference{ + Registry: registryHost, + Repository: "test/aicr-bundle", + Tag: "1.0.0", + IsOCI: true, + }, + PlainHTTP: true, + Version: "v0.9.0-test", + }) + if err != nil { + t.Fatalf("PackageAndPushHelmChart() error = %v", err) + } + if result == nil { + t.Fatal("PackageAndPushHelmChart() result = nil, want non-nil on success") + } + if !strings.HasPrefix(result.Digest, "sha256:") { + t.Errorf("result.Digest = %q, want sha256: prefix", result.Digest) + } + if result.MediaType != ociv1.MediaTypeImageManifest { + t.Errorf("result.MediaType = %q, want %q", result.MediaType, ociv1.MediaTypeImageManifest) + } + if !strings.Contains(result.Reference, "test/aicr-bundle:1.0.0") { + t.Errorf("result.Reference = %q, want to contain test/aicr-bundle:1.0.0", result.Reference) + } + if result.Size <= 0 { + t.Errorf("result.Size = %d, want > 0", result.Size) + } + + // #961 regression check: the pushed manifest's config.mediaType + // MUST be the Helm OCI config mediaType. `helm pull` reads this + // field to discover the chart artifactType — if a regression + // flips it back to AICR's generic artifactType, helm silently + // drops the tag from `/v2//tags/list` and the user sees + // "unable to locate any tags in provided repository" even though + // the artifact is in the registry. The push-side fake-registry + // test would have passed silently before this check existed. + // + // Round-trip the manifest: GET it back from the fake registry, + // decode, assert config.mediaType + layer.mediaType. + manifestURL := registry.URL + "/v2/test/aicr-bundle/manifests/1.0.0" + req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, manifestURL, nil) + if reqErr != nil { + t.Fatalf("new manifest GET request: %v", reqErr) + } + req.Header.Set("Accept", ociv1.MediaTypeImageManifest) + resp, getErr := http.DefaultClient.Do(req) + if getErr != nil { + t.Fatalf("manifest GET: %v", getErr) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("manifest GET status = %d, want 200", resp.StatusCode) + } + manifestBytes, readErr := io.ReadAll(resp.Body) + if readErr != nil { + t.Fatalf("manifest GET read: %v", readErr) + } + var manifest ociv1.Manifest + if jsonErr := json.Unmarshal(manifestBytes, &manifest); jsonErr != nil { + t.Fatalf("manifest JSON decode: %v\nbody: %s", jsonErr, manifestBytes) + } + if manifest.Config.MediaType != helmConfigMediaType { + t.Errorf("pushed manifest config.mediaType = %q, want %q (#961 regression — helm pull would drop this tag)", + manifest.Config.MediaType, helmConfigMediaType) + } + if len(manifest.Layers) != 1 { + t.Fatalf("pushed manifest has %d layers, want 1", len(manifest.Layers)) + } + if manifest.Layers[0].MediaType != helmLayerMediaType { + t.Errorf("pushed manifest layer[0].mediaType = %q, want %q", + manifest.Layers[0].MediaType, helmLayerMediaType) + } +} + +// newFakeOCIRegistry stands up an httptest server that handles just +// enough of the OCI distribution-spec to accept a push: blob uploads +// (POST/PATCH/PUT under /v2//blobs/uploads/), blob HEADs (to +// satisfy oras's existence check), and manifest PUTs. Anything else +// returns 404 — the goal isn't a conformant registry, just the +// endpoints PackageAndPushHelmChart's push path actually touches. +func newFakeOCIRegistry(t *testing.T) *httptest.Server { + t.Helper() + // oras's push path uploads multiple blobs concurrently + // (configured via copyOpts.Concurrency = OCIPushConcurrency). The + // handler closure runs on the httptest goroutine pool, so the maps + // below MUST be lock-protected — without the mutex, `go test -race` + // catches concurrent map writes during chart-layer + config-blob + // uploads. + var mu sync.Mutex + blobs := make(map[string][]byte) + manifests := make(map[string][]byte) // keyed by ":" + + mux := http.NewServeMux() + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + path := r.URL.Path + switch { + case path == "/v2/" || path == "/v2": + w.Header().Set("Docker-Distribution-API-Version", "registry/2.0") + w.WriteHeader(http.StatusOK) + return + + case strings.HasSuffix(path, "/blobs/uploads/") && r.Method == http.MethodPost: + // Start an upload session. The session URL is monotonic; + // embed a counter so successive uploads don't collide. + mu.Lock() + session := strconv.Itoa(len(blobs) + 1) + mu.Unlock() + w.Header().Set("Location", path+session) + w.WriteHeader(http.StatusAccepted) + return + + case strings.Contains(path, "/blobs/uploads/") && (r.Method == http.MethodPatch || r.Method == http.MethodPut): + // Drain the body — the digest is in the query string on PUT. + body, _ := io.ReadAll(r.Body) + if r.Method == http.MethodPut { + digest := r.URL.Query().Get("digest") + if digest != "" { + mu.Lock() + blobs[digest] = body + mu.Unlock() + } + w.Header().Set("Docker-Content-Digest", digest) + w.WriteHeader(http.StatusCreated) + return + } + w.Header().Set("Range", fmt.Sprintf("0-%d", len(body)-1)) + w.WriteHeader(http.StatusAccepted) + return + + case strings.Contains(path, "/blobs/sha256:") && (r.Method == http.MethodHead || r.Method == http.MethodGet): + // blob existence check / fetch. + _, digest, _ := strings.Cut(path, "/blobs/") + mu.Lock() + data, ok := blobs[digest] + mu.Unlock() + if ok { + w.Header().Set("Docker-Content-Digest", digest) + w.Header().Set("Content-Length", strconv.Itoa(len(data))) + if r.Method == http.MethodGet { + _, _ = w.Write(data) + } else { + w.WriteHeader(http.StatusOK) + } + return + } + w.WriteHeader(http.StatusNotFound) + return + + case strings.Contains(path, "/manifests/") && r.Method == http.MethodPut: + before, reference, _ := strings.Cut(path, "/manifests/") + name := strings.TrimPrefix(before, "/v2/") + body, _ := io.ReadAll(r.Body) + mu.Lock() + manifests[name+":"+reference] = body + mu.Unlock() + digest := digestSHA256(body) + w.Header().Set("Docker-Content-Digest", digest) + w.WriteHeader(http.StatusCreated) + return + + case strings.Contains(path, "/manifests/") && (r.Method == http.MethodHead || r.Method == http.MethodGet): + before, reference, _ := strings.Cut(path, "/manifests/") + name := strings.TrimPrefix(before, "/v2/") + mu.Lock() + data, ok := manifests[name+":"+reference] + mu.Unlock() + if ok { + w.Header().Set("Docker-Content-Digest", digestSHA256(data)) + w.Header().Set("Content-Type", ociv1.MediaTypeImageManifest) + w.Header().Set("Content-Length", strconv.Itoa(len(data))) + if r.Method == http.MethodGet { + _, _ = w.Write(data) + } else { + w.WriteHeader(http.StatusOK) + } + return + } + w.WriteHeader(http.StatusNotFound) + return + } + w.WriteHeader(http.StatusNotFound) + }) + return httptest.NewServer(mux) +} + +func digestSHA256(data []byte) string { + sum := sha256.Sum256(data) + return "sha256:" + hex.EncodeToString(sum[:]) +} + +func readTGZEntries(t *testing.T, tgz []byte) []string { + t.Helper() + gz, err := gzip.NewReader(strings.NewReader(string(tgz))) + if err != nil { + t.Fatalf("gzip.NewReader: %v", err) + } + defer gz.Close() + tr := tar.NewReader(gz) + var names []string + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("tar.Next: %v", err) + } + names = append(names, hdr.Name) + } + return names +}