Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions .github/actions/kwok-test/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ inputs:
recipe:
description: 'Recipe name to test'
required: true
deployer:
description: 'Deployer to exercise: helm | argocd-oci | argocd-helm-oci | flux-oci'
required: false
default: 'helm'
go_version:
description: 'Go version to install'
required: true
Expand Down Expand Up @@ -108,7 +112,9 @@ runs:
env:
KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
run: |
KWOK_CLUSTER=aicr-kwok-test bash kwok/scripts/run-all-recipes.sh ${{ inputs.recipe }}
KWOK_CLUSTER=aicr-kwok-test bash kwok/scripts/run-all-recipes.sh \
--deployer ${{ inputs.deployer }} \
${{ inputs.recipe }}

- name: Collect debug artifacts
if: failure()
Expand All @@ -122,6 +128,14 @@ runs:
kubectl get nodes -o wide > "/tmp/kwok-debug-artifacts/${cluster}-nodes.txt" || true
kubectl get events --all-namespaces --sort-by='.lastTimestamp' > "/tmp/kwok-debug-artifacts/${cluster}-events.txt" || true
kubectl get pods --all-namespaces -o wide > "/tmp/kwok-debug-artifacts/${cluster}-pods.txt" || true
kubectl get applications -n argocd -o yaml > "/tmp/kwok-debug-artifacts/${cluster}-argo-apps.yaml" 2>/dev/null || true
kubectl logs -n argocd deploy/argocd-repo-server --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-argo-reposerver.log" 2>/dev/null || true
kubectl logs -n argocd statefulset/argocd-application-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-argo-appcontroller.log" 2>/dev/null || true
kubectl get ocirepositories,kustomizations,helmreleases -A -o yaml > "/tmp/kwok-debug-artifacts/${cluster}-flux-resources.yaml" 2>/dev/null || true
kubectl logs -n flux-system deploy/source-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-flux-source-controller.log" 2>/dev/null || true
kubectl logs -n flux-system deploy/kustomize-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-flux-kustomize-controller.log" 2>/dev/null || true
kubectl logs -n flux-system deploy/helm-controller --tail=500 > "/tmp/kwok-debug-artifacts/${cluster}-flux-helm-controller.log" 2>/dev/null || true
kubectl logs -n aicr-registry deploy/registry --tail=200 > "/tmp/kwok-debug-artifacts/${cluster}-registry.log" 2>/dev/null || true
done

- name: Export Kind logs
Expand All @@ -137,7 +151,7 @@ runs:
if: failure() && inputs.upload_artifacts == 'true'
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
with:
name: kwok-debug-${{ inputs.recipe }}-${{ github.run_id }}
name: kwok-debug-${{ inputs.recipe }}-${{ inputs.deployer }}-${{ github.run_id }}
path: |
/tmp/kwok-debug-artifacts/
/tmp/kwok-kind-logs/
Expand Down
10 changes: 10 additions & 0 deletions .github/actions/load-versions/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ outputs:
yq:
description: 'yq version'
value: ${{ steps.versions.outputs.yq }}
registry_image:
description: 'OCI registry image (in-cluster registry for KWOK deployer-matrix CI)'
value: ${{ steps.versions.outputs.registry_image }}
argocd_chart:
description: 'Argo CD Helm chart version (for KWOK deployer-matrix CI)'
value: ${{ steps.versions.outputs.argocd_chart }}
coverage_threshold:
description: 'Minimum test coverage percentage'
value: ${{ steps.versions.outputs.coverage_threshold }}
Expand Down Expand Up @@ -148,6 +154,8 @@ runs:
echo "chainsaw_sha256_linux_amd64=$(yq eval '.testing_tools.chainsaw_checksums.linux_amd64' .settings.yaml)" >> $GITHUB_OUTPUT
echo "chainsaw_sha256_linux_arm64=$(yq eval '.testing_tools.chainsaw_checksums.linux_arm64' .settings.yaml)" >> $GITHUB_OUTPUT
echo "yq=$(yq eval '.testing_tools.yq' .settings.yaml)" >> $GITHUB_OUTPUT
echo "registry_image=$(yq eval '.testing_tools.registry_image' .settings.yaml)" >> $GITHUB_OUTPUT
echo "argocd_chart=$(yq eval '.testing_tools.argocd_chart' .settings.yaml)" >> $GITHUB_OUTPUT

# Quality thresholds
echo "coverage_threshold=$(yq eval '.quality.coverage_threshold' .settings.yaml)" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -188,6 +196,8 @@ runs:
echo " chainsaw_sha256_linux_amd64: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }}"
echo " chainsaw_sha256_linux_arm64: ${{ steps.versions.outputs.chainsaw_sha256_linux_arm64 }}"
echo " yq: ${{ steps.versions.outputs.yq }}"
echo " registry_image: ${{ steps.versions.outputs.registry_image }}"
echo " argocd_chart: ${{ steps.versions.outputs.argocd_chart }}"
echo " coverage_threshold: ${{ steps.versions.outputs.coverage_threshold }}"
echo " lint_timeout: ${{ steps.versions.outputs.lint_timeout }}"
echo " test_timeout: ${{ steps.versions.outputs.test_timeout }}"
Expand Down
8 changes: 6 additions & 2 deletions .github/workflows/kwok-recipes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ jobs:

# ── Tier 1: PR gate — generic overlays (PR + push, skip on schedule) ──
test-tier1:
name: 'Tier 1: ${{ matrix.recipe }}'
name: 'Tier 1: ${{ matrix.recipe }} (${{ matrix.deployer }})'
needs: discover
if: >-
github.event_name != 'schedule' &&
Expand All @@ -236,6 +236,7 @@ jobs:
fail-fast: false
matrix:
recipe: ${{ fromJSON(needs.discover.outputs.tier1) }}
deployer: [helm, argocd-oci, argocd-helm-oci, flux-oci]
steps:
- name: Checkout Code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
Expand All @@ -250,6 +251,7 @@ jobs:
uses: ./.github/actions/kwok-test
with:
recipe: ${{ matrix.recipe }}
deployer: ${{ matrix.deployer }}
go_version: ${{ steps.versions.outputs.go }}
goreleaser_version: ${{ steps.versions.outputs.goreleaser }}
kind_version: ${{ steps.versions.outputs.kind }}
Expand Down Expand Up @@ -300,7 +302,7 @@ jobs:
# Per ADR-003: separate concurrency group per SHA so successive merges
# to main never cancel in-flight Tier 3 runs.
test-tier3:
name: 'Tier 3: ${{ matrix.recipe }}'
name: 'Tier 3: ${{ matrix.recipe }} (${{ matrix.deployer }})'
needs: discover
concurrency:
group: kwok-tier3-${{ github.sha }}
Expand All @@ -315,6 +317,7 @@ jobs:
fail-fast: false
matrix:
recipe: ${{ fromJSON(needs.discover.outputs.tier3) }}
deployer: [helm, argocd-oci, argocd-helm-oci, flux-oci]
steps:
- name: Checkout Code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
Expand All @@ -329,6 +332,7 @@ jobs:
uses: ./.github/actions/kwok-test
with:
recipe: ${{ matrix.recipe }}
deployer: ${{ matrix.deployer }}
go_version: ${{ steps.versions.outputs.go }}
goreleaser_version: ${{ steps.versions.outputs.goreleaser }}
kind_version: ${{ steps.versions.outputs.kind }}
Expand Down
13 changes: 13 additions & 0 deletions .settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,19 @@ testing_tools:
yq: 'v4.53.2'
# renovate: datasource=github-releases depName=kubernetes-sigs/karpenter depType=testing_tools
karpenter: 'v1.12.0'
# In-cluster OCI registry for KWOK deployer-matrix CI (issue #843).
# renovate: datasource=docker depName=registry depType=testing_tools
registry_image: 'registry:2.8.3'
# Argo CD Helm chart for KWOK deployer-matrix CI. v9.5.x pins app v3.4.x,
# the first stable line with native OCI artifact source (issue #843).
# renovate: datasource=helm depName=argo-cd registryUrl=https://argoproj.github.io/argo-helm depType=testing_tools
argocd_chart: '9.5.14'
# Flux 2 release for KWOK deployer-matrix CI flux-oci lane (issue #843).
# install.yaml is downloaded from the GitHub release; source-controller,
# kustomize-controller and helm-controller are the only controllers the
# bundle consumes (OCIRepository -> Kustomization -> HelmRelease).
# renovate: datasource=github-releases depName=fluxcd/flux2 depType=testing_tools
flux_version: 'v2.8.7'

# Quality Thresholds
quality:
Expand Down
3 changes: 3 additions & 0 deletions DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -522,10 +522,13 @@ make kwok-e2e RECIPE=gb200-eks-training # Test single recipe

Recipes with `spec.criteria.service` defined are auto-discovered. KWOK validates scheduling (node selectors, tolerations, resource requests) but not runtime behavior (no container execution or GPU functionality).

For the deployer matrix (`helm`, `argocd-oci`, `argocd-helm-oci`, and `flux-oci` lanes), see [Deployer Matrix Testing](docs/contributor/kwok-testing.md).

| Command | Description |
|---------|-------------|
| `make kwok-test-all` | Test all recipes in shared cluster (serial) |
| `make kwok-e2e RECIPE=<name>` | Full e2e: cluster, nodes, validate |
| `make kwok-test-deployer RECIPE=<name> DEPLOYER=<name>` | Validate a single recipe under a specific deployer (`helm`, `argocd-oci`, `argocd-helm-oci`, `flux-oci`) |
| `make kwok-cluster` | Create Kind cluster with KWOK |
| `make kwok-status` | Show cluster and node status |
| `make kwok-cluster-delete` | Delete cluster |
Expand Down
14 changes: 14 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,20 @@ endif
kwok-test-all: build ## Run all KWOK recipe tests in a shared cluster
@bash kwok/scripts/run-all-recipes.sh

# kwok-test-deployer: run the KWOK recipe runner for one recipe under one deployer.
#
# Required variables:
#   RECIPE   - recipe name, passed as the positional argument to the runner.
#   DEPLOYER - deployer name, passed via --deployer
#              (helm | argocd-oci | argocd-helm-oci | flux-oci).
#
# The ifndef guards are resolved when make parses the file: if a variable is
# undefined, the error/exit lines become part of the recipe and stop it with a
# non-zero status before the runner is invoked. Because the guards are recipe
# lines, the `build` prerequisite still runs before they fire.
.PHONY: kwok-test-deployer
kwok-test-deployer: build ## Validate scheduling under a specific deployer (RECIPE=… DEPLOYER=helm|argocd-oci|argocd-helm-oci|flux-oci)
ifndef RECIPE
@echo "Error: RECIPE is required"
@echo "Usage: make kwok-test-deployer RECIPE=eks-training DEPLOYER=argocd-oci"
@exit 1
endif
ifndef DEPLOYER
@echo "Error: DEPLOYER is required (helm | argocd-oci | argocd-helm-oci | flux-oci)"
@exit 1
endif
@echo "Validating $(RECIPE) under deployer=$(DEPLOYER)"
bash kwok/scripts/run-all-recipes.sh --deployer $(DEPLOYER) $(RECIPE)

# =============================================================================
# Talos local test harness
# =============================================================================
Expand Down
Loading
Loading