diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index a66e7233e..c7d035722 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -63,7 +63,7 @@ The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or ### TLS behaviour -`values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run +`ci/values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run plaintext by default. To test with TLS enabled, comment out that line and redeploy. | Mode | `server.disableTls` | Gateway scheme | @@ -157,7 +157,7 @@ imports the openshell realm from `scripts/keycloak-realm.json`, and prints a por command for acquiring tokens from the CLI. Then activate OIDC in the OpenShell Helm chart: -1. Uncomment `#- values-keycloak.yaml` in `skaffold.yaml` +1. Uncomment `#- ci/values-keycloak.yaml` in `skaffold.yaml` 2. Redeploy: `mise run helm:skaffold:run` To remove Keycloak: @@ -188,10 +188,12 @@ mise run helm:k3s:status |------|---------| | `deploy/helm/openshell/skaffold.yaml` | Skaffold config — images, Helm releases, values overlays | | `deploy/helm/openshell/values.yaml` | Default Helm values | -| `deploy/helm/openshell/values-skaffold.yaml` | Dev overrides (image pull policy, local image names) | -| `deploy/helm/openshell/values-cert-manager.yaml` | cert-manager TLS overlay (opt-in; disables pkiInitJob) | -| `deploy/helm/openshell/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | -| `deploy/helm/openshell/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-skaffold.yaml` | Dev overrides (image pull policy, TLS disabled for local Skaffold) | +| `deploy/helm/openshell/ci/values-cert-manager.yaml` | cert-manager PKI overlay (opt-in; disables pkiInitJob) | +| `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | +| 
`deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) | | `deploy/kube/manifests/envoy-gateway-openshell.yaml` | GatewayClass for Envoy Gateway (`mise run helm:gateway:apply`) | | `tasks/scripts/helm-k3s-local.sh` | k3d cluster create/delete/start/stop/status | +| `tasks/scripts/helm-e2e.sh` | Bootstrap k3d cluster and run Rust + Python e2e via Helm | | `tasks/scripts/keycloak-k8s-setup.sh` | Keycloak deploy + realm import | diff --git a/.github/workflows/helm-lint.yml b/.github/workflows/helm-lint.yml new file mode 100644 index 000000000..8b7184133 --- /dev/null +++ b/.github/workflows/helm-lint.yml @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Helm Lint + +on: + push: + branches: + - "pull-request/[0-9]+" + paths: + - "deploy/helm/**" + workflow_dispatch: + +env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +permissions: + contents: read + packages: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + + helm-lint: + name: Helm Lint + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v6 + + - name: Install tools + run: mise install --locked + + - name: Lint Helm chart + run: mise run helm:lint diff --git a/deploy/helm/openshell/.helmignore 
b/deploy/helm/openshell/.helmignore index 798d0e7c8..a12325802 100644 --- a/deploy/helm/openshell/.helmignore +++ b/deploy/helm/openshell/.helmignore @@ -19,8 +19,4 @@ # Ignore development files skaffold.yaml -values-keycloak.yaml -values-ingress.yaml -values-gateway.yaml -values-cert-manager.yaml -values-skaffold.yaml +ci/ diff --git a/deploy/helm/openshell/values-cert-manager.yaml b/deploy/helm/openshell/ci/values-cert-manager.yaml similarity index 84% rename from deploy/helm/openshell/values-cert-manager.yaml rename to deploy/helm/openshell/ci/values-cert-manager.yaml index bb024d716..ed99c8b46 100644 --- a/deploy/helm/openshell/values-cert-manager.yaml +++ b/deploy/helm/openshell/ci/values-cert-manager.yaml @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # Merge after values.yaml when cert-manager CRDs are installed, e.g.: -# helm install ... -f values.yaml -f values-cert-manager.yaml +# helm install ... -f values.yaml -f ci/values-cert-manager.yaml # Or add this file to skaffold manifests.helm.releases[].valuesFiles. server: disableTls: false diff --git a/deploy/helm/openshell/values-gateway.yaml b/deploy/helm/openshell/ci/values-gateway.yaml similarity index 92% rename from deploy/helm/openshell/values-gateway.yaml rename to deploy/helm/openshell/ci/values-gateway.yaml index c43a4cd45..196192213 100644 --- a/deploy/helm/openshell/values-gateway.yaml +++ b/deploy/helm/openshell/ci/values-gateway.yaml @@ -5,7 +5,7 @@ # # Requires Envoy Gateway in the cluster (installed via skaffold.yaml). # Add this file to the openshell release valuesFiles to activate: -# uncomment values-gateway.yaml in deploy/helm/openshell/skaffold.yaml +# uncomment ci/values-gateway.yaml in deploy/helm/openshell/skaffold.yaml # # Envoy Gateway will create an Envoy proxy Deployment and a LoadBalancer # Service (named envoy-{gateway-namespace}-{gateway-name}-*) in the openshell namespace. 
diff --git a/deploy/helm/openshell/values-keycloak.yaml b/deploy/helm/openshell/ci/values-keycloak.yaml similarity index 95% rename from deploy/helm/openshell/values-keycloak.yaml rename to deploy/helm/openshell/ci/values-keycloak.yaml index 42bb2ad4e..cc6ca658b 100644 --- a/deploy/helm/openshell/values-keycloak.yaml +++ b/deploy/helm/openshell/ci/values-keycloak.yaml @@ -8,7 +8,7 @@ # # Then layer this file on top of values.yaml when deploying: # helm upgrade --install openshell . \ -# -f values.yaml -f values-skaffold.yaml -f values-keycloak.yaml +# -f values.yaml -f ci/values-skaffold.yaml -f ci/values-keycloak.yaml # # Or add this file to skaffold.yaml valuesFiles for iterative dev. # diff --git a/deploy/helm/openshell/values-skaffold.yaml b/deploy/helm/openshell/ci/values-skaffold.yaml similarity index 100% rename from deploy/helm/openshell/values-skaffold.yaml rename to deploy/helm/openshell/ci/values-skaffold.yaml diff --git a/deploy/helm/openshell/ci/values-tls-disabled.yaml b/deploy/helm/openshell/ci/values-tls-disabled.yaml new file mode 100644 index 000000000..ea7c7900c --- /dev/null +++ b/deploy/helm/openshell/ci/values-tls-disabled.yaml @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# CI lint target: TLS disabled (plaintext HTTP, no client cert requirement). +# Typical when a reverse proxy or tunnel terminates TLS at the edge. 
+server: + disableTls: true + disableGatewayAuth: true +pkiInitJob: + enabled: false diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index fe7b96cf2..2de9ee4e6 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -87,16 +87,16 @@ deploy: createNamespace: true valuesFiles: - values.yaml - - values-skaffold.yaml - # Add values-cert-manager.yaml here (and uncomment the cert-manager + - ci/values-skaffold.yaml + # Add ci/values-cert-manager.yaml here (and uncomment the cert-manager # release above) to switch from pkiInitJob to cert-manager for PKI. - #- values-cert-manager.yaml + #- ci/values-cert-manager.yaml # To enable OIDC with a local Keycloak instance, run the one-time # setup task first, then uncomment the line below: # mise run keycloak:k8s:setup - #- values-keycloak.yaml + #- ci/values-keycloak.yaml # To enable the Gateway API HTTPRoute (requires Envoy Gateway above): - #- values-gateway.yaml + #- ci/values-gateway.yaml setValueTemplates: image.repository: '{{.IMAGE_REPO_openshell_gateway}}' image.tag: '{{.IMAGE_TAG_openshell_gateway}}' diff --git a/tasks/helm.toml b/tasks/helm.toml index c7949865b..9ef2ae832 100644 --- a/tasks/helm.toml +++ b/tasks/helm.toml @@ -4,9 +4,18 @@ # Helm chart tasks ["helm:lint"] -description = "Lint the openshell helm chart" -run = "helm lint deploy/helm/openshell" -hide = true +description = "Lint the openshell Helm chart (defaults + all CI configuration variants)" +run = """ + set -e + echo "--- helm lint: defaults ---" + helm lint deploy/helm/openshell + for f in deploy/helm/openshell/ci/values-*.yaml; do + variant=$(basename "$f" .yaml | sed 's/values-//') + echo "--- helm lint: $variant ---" + helm lint deploy/helm/openshell -f "$f" + done + echo "All variants passed." 
+""" ["helm:skaffold:dev"] description = "Run skaffold dev for deploy/helm/openshell (iterative deploy)" @@ -59,3 +68,24 @@ hide = true ["helm:gateway:apply"] description = "Apply the Envoy GatewayClass manifest (run after helm:skaffold:run when gateway routing is enabled)" run = "kubectl apply -f deploy/kube/manifests/envoy-gateway-openshell.yaml" + +# Helm e2e — boots a k3d cluster via the Helm path and runs the Rust + Python suites + +["e2e:helm"] +description = "Bootstrap Helm k3d cluster and run Rust + Python e2e suites" +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:rust"] +description = "Bootstrap Helm k3d cluster and run Rust e2e only" +env = { HELM_E2E_SUITE = "rust" } +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:python"] +description = "Bootstrap Helm k3d cluster and run Python e2e only" +env = { HELM_E2E_SUITE = "python" } +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:cert-manager"] +description = "Bootstrap Helm k3d cluster with cert-manager PKI and run full e2e" +env = { HELM_E2E_PKI = "cert-manager" } +run = "tasks/scripts/helm-e2e.sh" diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh new file mode 100755 index 000000000..e0514cf66 --- /dev/null +++ b/tasks/scripts/helm-e2e.sh @@ -0,0 +1,291 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run Rust and/or Python e2e tests against a gateway deployed via the Helm chart +# on a local k3d cluster (k3s backed by Docker). +# +# The script follows the same preflight → bootstrap → register → test → cleanup +# pattern as e2e/rust/e2e-docker.sh, but uses k3d + Skaffold + Helm instead of +# a standalone gateway process. 
+# +# Usage: +# mise run e2e:helm # full suite, pkiInitJob PKI +# mise run e2e:helm:rust # Rust only +# mise run e2e:helm:python # Python only +# mise run e2e:helm:cert-manager # full suite, cert-manager PKI +# +# Environment variables: +# HELM_E2E_SUITE rust | python | all (default: all) +# HELM_E2E_PKI pki-init | cert-manager (default: pki-init) +# HELM_E2E_KEEP_CLUSTER 1 to skip cluster deletion on exit (default: 0) +# HELM_E2E_CLUSTER_NAME override k3d cluster name (default: derived from branch) +# KUBECONFIG path to kubeconfig (default: kubeconfig in the repo root) +# OPENSHELL_PROVISION_TIMEOUT sandbox ready timeout in seconds (default: 300) + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +SUITE="${HELM_E2E_SUITE:-all}" +PKI_MODE="${HELM_E2E_PKI:-pki-init}" +KEEP_CLUSTER="${HELM_E2E_KEEP_CLUSTER:-0}" + +# Derive cluster name the same way helm-k3s-local.sh does (last path component of branch). +_branch_cluster_name() { + local branch + branch="$(git -C "${ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")" + local suffix="${branch##*/}" + suffix="${suffix:0:24}" + echo "openshell-dev-${suffix}" +} + +CLUSTER_NAME="${HELM_E2E_CLUSTER_NAME:-$(_branch_cluster_name)}" +export KUBECONFIG="${KUBECONFIG:-${ROOT}/kubeconfig}" + +WORKDIR="$(mktemp -d "/tmp/openshell-helm-e2e.XXXXXX")" +GATEWAY_NAME="openshell-helm-e2e-${CLUSTER_NAME}" +GATEWAY_CONFIG_DIR="${HOME}/.config/openshell/gateways/${GATEWAY_NAME}" +PF_PID="" +PORT="" +CLUSTER_CREATED=0 + +cleanup() { + local exit_code=$? + + if [ -n "${PF_PID}" ] && kill -0 "${PF_PID}" 2>/dev/null; then + echo "Stopping kubectl port-forward (pid ${PF_PID})..." + kill "${PF_PID}" 2>/dev/null || true + wait "${PF_PID}" 2>/dev/null || true + fi + + if [ -d "${GATEWAY_CONFIG_DIR}" ]; then + rm -rf "${GATEWAY_CONFIG_DIR}" + fi + + if [ "${KEEP_CLUSTER}" = "1" ]; then + echo "Keeping cluster '${CLUSTER_NAME}' (HELM_E2E_KEEP_CLUSTER=1)." 
+ elif [ "${CLUSTER_CREATED}" = "1" ]; then + echo "Deleting cluster '${CLUSTER_NAME}'..." + HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \ + bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" delete 2>/dev/null || true + fi + + rm -rf "${WORKDIR}" 2>/dev/null || true + + if [ "${exit_code}" -ne 0 ]; then + echo "helm-e2e failed (exit ${exit_code})." + fi +} +trap cleanup EXIT + +# ── Preflight ──────────────────────────────────────────────────────────────── +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "ERROR: '$1' is required but not found in PATH" >&2 + exit 2 + fi +} + +require_cmd k3d +require_cmd helm +require_cmd kubectl +require_cmd docker +require_cmd openssl + +if ! docker info >/dev/null 2>&1; then + echo "ERROR: docker daemon is not reachable" >&2 + exit 2 +fi + +echo "=== helm-e2e: suite=${SUITE} pki=${PKI_MODE} cluster=${CLUSTER_NAME} ===" + +# ── Cluster ────────────────────────────────────────────────────────────────── +if k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then + echo "Reusing existing k3d cluster '${CLUSTER_NAME}'." + # Refresh kubeconfig in case it's stale. + k3d kubeconfig write "${CLUSTER_NAME}" --output "${KUBECONFIG}" >/dev/null +else + echo "Creating k3d cluster '${CLUSTER_NAME}'..." + HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \ + bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" create + CLUSTER_CREATED=1 +fi + +# ── cert-manager (optional) ────────────────────────────────────────────────── +if [ "${PKI_MODE}" = "cert-manager" ]; then + echo "Installing cert-manager..." 
+ helm repo add jetstack https://charts.jetstack.io --force-update >/dev/null 2>&1 || true + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager --create-namespace \ + --set crds.enabled=true \ + --wait 2>&1 +fi + +# ── Build images ───────────────────────────────────────────────────────────── +# Use a fixed local tag so the image names are stable across runs and Helm +# can reference them without Skaffold's digest-based tags. +GATEWAY_IMAGE="openshell/gateway:helm-e2e" +SUPERVISOR_IMAGE="openshell/supervisor:helm-e2e" + +echo "Building gateway image..." +docker buildx build \ + --build-arg BUILD_FROM_SOURCE=1 \ + --target gateway \ + --tag "${GATEWAY_IMAGE}" \ + --load \ + --file "${ROOT}/deploy/docker/Dockerfile.images" \ + "${ROOT}" 2>&1 + +echo "Building supervisor image..." +docker buildx build \ + --build-arg BUILD_FROM_SOURCE=1 \ + --target supervisor \ + --tag "${SUPERVISOR_IMAGE}" \ + --load \ + --file "${ROOT}/deploy/docker/Dockerfile.images" \ + "${ROOT}" 2>&1 + +# Load images into the k3d cluster nodes. +echo "Loading images into k3d cluster..." +k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1 + +# ── Deploy via Helm ─────────────────────────────────────────────────────────── +HELM_VALUES_FLAGS=( + -f "${ROOT}/deploy/helm/openshell/values.yaml" +) +if [ "${PKI_MODE}" = "cert-manager" ]; then + HELM_VALUES_FLAGS+=(-f "${ROOT}/deploy/helm/openshell/ci/values-cert-manager.yaml") +fi + +echo "Deploying OpenShell via Helm (PKI: ${PKI_MODE})..." 
+helm upgrade --install openshell "${ROOT}/deploy/helm/openshell" \ + --namespace openshell --create-namespace \ + "${HELM_VALUES_FLAGS[@]}" \ + --set "image.repository=openshell/gateway" \ + --set "image.tag=helm-e2e" \ + --set "image.pullPolicy=Never" \ + --set "supervisor.image.repository=openshell/supervisor" \ + --set "supervisor.image.tag=helm-e2e" \ + --set "supervisor.image.pullPolicy=Never" \ + --wait --timeout 180s 2>&1 + +# ── Wait for PKI ───────────────────────────────────────────────────────────── +if [ "${PKI_MODE}" = "cert-manager" ]; then + echo "Waiting for cert-manager certificates to be ready..." + kubectl wait --for=condition=Ready certificate/openshell-server certificate/openshell-client \ + -n openshell --timeout=120s +else + echo "Waiting for pkiInitJob secrets..." + elapsed=0 + while [ "${elapsed}" -lt 60 ]; do + if kubectl get secret openshell-client-tls -n openshell >/dev/null 2>&1; then + echo "PKI secrets ready after ${elapsed}s." + break + fi + sleep 3 + elapsed=$((elapsed + 3)) + done + if [ "${elapsed}" -ge 60 ]; then + echo "ERROR: pkiInitJob secrets not created within 60s" >&2 + exit 1 + fi +fi + +# ── Port-forward ───────────────────────────────────────────────────────────── +pick_port() { + python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()' +} +PORT=$(pick_port) + +echo "Port-forwarding openshell service → localhost:${PORT}..." +kubectl port-forward -n openshell svc/openshell "${PORT}:8080" \ + >"${WORKDIR}/pf.log" 2>&1 & +PF_PID=$! 
+ +# ── Register gateway with CLI ───────────────────────────────────────────────── +mkdir -p "${GATEWAY_CONFIG_DIR}/mtls" + +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.ca\.crt}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/ca.crt" +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.tls\.crt}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.crt" +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.tls\.key}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.key" + +cat >"${GATEWAY_CONFIG_DIR}/metadata.json" <&1 +fi + +echo "Waiting for gateway to become healthy (port ${PORT})..." +elapsed=0 +timeout=120 +while [ "${elapsed}" -lt "${timeout}" ]; do + if ! kill -0 "${PF_PID}" 2>/dev/null; then + echo "ERROR: port-forward exited unexpectedly" >&2 + cat "${WORKDIR}/pf.log" || true + exit 1 + fi + if "${CLI_BIN}" status --gateway "${GATEWAY_NAME}" >/dev/null 2>&1; then + echo "Gateway healthy after ${elapsed}s." + break + fi + sleep 3 + elapsed=$((elapsed + 3)) +done +if [ "${elapsed}" -ge "${timeout}" ]; then + echo "ERROR: gateway did not become healthy within ${timeout}s" >&2 + cat "${WORKDIR}/pf.log" || true + exit 1 +fi + +# ── Run test suites ─────────────────────────────────────────────────────────── +run_rust() { + echo "--- Running Rust e2e ---" + cargo build -p openshell-cli --features openshell-core/dev-settings + cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- \ + --skip gateway_resume_scenarios \ + --skip docker_gpu_sandbox_runs_nvidia_smi \ + --skip sandbox_from_custom_dockerfile \ + --skip graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths \ + --skip forward_proxy_allows_l7_permitted_request \ + --skip sandbox_reaches_host_openshell_internal_via_host_gateway_alias \ + --skip sandbox_inference_local_routes_to_host_openshell_internal \ + --nocapture +} + +run_python() { + echo "--- Running Python e2e ---" + mise run --no-deps python:proto + 
UV_NO_SYNC=1 PYTHONPATH=python uv run pytest \ + -o python_files='test_*.py' \ + -m 'not gpu' \ + -n "${E2E_PARALLEL:-5}" \ + e2e/python +} + +case "${SUITE}" in + rust) run_rust ;; + python) run_python ;; + all) run_rust; run_python ;; + *) + echo "ERROR: unknown HELM_E2E_SUITE '${SUITE}' (must be rust, python, or all)" >&2 + exit 2 + ;; +esac + +echo "=== helm-e2e: all suites passed ==="