From 2387d048e5e8f9525685ad988a6386793767860e Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Mon, 11 May 2026 15:38:44 -0700 Subject: [PATCH 1/3] wip --- .../skills/debug-openshell-cluster/SKILL.md | 3 +- .github/workflows/rust-native-build.yml | 53 +++++++- architecture/build.md | 15 ++- crates/openshell-driver-podman/README.md | 6 +- deploy/docker/Dockerfile.ci | 1 + deploy/docker/Dockerfile.gateway | 61 +++++++++ deploy/docker/Dockerfile.images | 122 ------------------ deploy/docker/Dockerfile.supervisor | 64 +++++++++ deploy/helm/openshell/skaffold.yaml | 11 +- deploy/helm/openshell/templates/certgen.yaml | 2 +- tasks/scripts/docker-build-image.sh | 15 ++- tasks/scripts/stage-prebuilt-binaries.sh | 27 +++- 12 files changed, 224 insertions(+), 156 deletions(-) create mode 100644 deploy/docker/Dockerfile.gateway delete mode 100644 deploy/docker/Dockerfile.images create mode 100644 deploy/docker/Dockerfile.supervisor diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 16158c0dc..40d8bbf47 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -74,6 +74,7 @@ Common findings: - Sandbox image missing or pull denied: verify image reference and registry credentials. - Docker driver cannot initialize because it cannot find `openshell-sandbox`: verify `OPENSHELL_DOCKER_SUPERVISOR_BIN`, the sibling binary next to `openshell-gateway`, or the configured supervisor image contains `/openshell-sandbox`. - Sandbox never registers: check gateway logs and supervisor callback endpoint. +- Supervisor image exits before printing `openshell-sandbox --version`: the image should be the scratch supervisor image from `deploy/docker/Dockerfile.supervisor` and must contain a static executable at `/openshell-sandbox`. 
For source checkout development, restart the local gateway with: @@ -126,7 +127,7 @@ kubectl -n openshell get statefulset openshell -o jsonpath="{.spec.template.spec helm -n openshell get values openshell | grep -E 'repository|tag|supervisorImage' ``` -The gateway image and `server.supervisorImage` should use the same build tag in branch and E2E deploys. A stale supervisor image can make sandbox behavior lag behind gateway policy or proto changes. +The gateway image built from `deploy/docker/Dockerfile.gateway` and the scratch supervisor image built from `deploy/docker/Dockerfile.supervisor` should use the same build tag in branch and E2E deploys. A stale supervisor image can make sandbox behavior lag behind gateway policy or proto changes. For plaintext local evaluation, confirm the chart has: diff --git a/.github/workflows/rust-native-build.yml b/.github/workflows/rust-native-build.yml index 40feb1dd0..edb1bfb7a 100644 --- a/.github/workflows/rust-native-build.yml +++ b/.github/workflows/rust-native-build.yml @@ -1,10 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -name: Rust Native Build (openshell-gateway / openshell-sandbox) +name: Rust Image Binary Build (openshell-gateway / openshell-sandbox) -# Build Rust binaries natively per Linux architecture before the Docker image -# build consumes them as prebuilt artifacts. +# Build Rust binaries per Linux architecture before the Docker image build +# consumes them as prebuilt artifacts. Gateway images use GNU-linked binaries +# for the NVIDIA distroless C/C++ runtime; supervisor images use musl/static +# binaries so the final image can remain scratch. 
on: workflow_call: @@ -105,10 +107,12 @@ jobs: gateway) crate=openshell-server binary=openshell-gateway + zig_target= ;; sandbox) crate=openshell-sandbox binary=openshell-sandbox + zig_target= ;; *) echo "unsupported component: $COMPONENT" >&2 @@ -118,10 +122,20 @@ jobs: case "$ARCH" in amd64) - target=x86_64-unknown-linux-gnu + if [[ "$COMPONENT" == "sandbox" ]]; then + target=x86_64-unknown-linux-musl + zig_target=x86_64-linux-musl + else + target=x86_64-unknown-linux-gnu + fi ;; arm64) - target=aarch64-unknown-linux-gnu + if [[ "$COMPONENT" == "sandbox" ]]; then + target=aarch64-unknown-linux-musl + zig_target=aarch64-linux-musl + else + target=aarch64-unknown-linux-gnu + fi ;; *) echo "unsupported arch: $ARCH" >&2 @@ -133,6 +147,7 @@ jobs: echo "crate=$crate" echo "binary=$binary" echo "target=$target" + echo "zig_target=$zig_target" } >> "$GITHUB_OUTPUT" - name: Configure GHA sccache backend @@ -163,6 +178,30 @@ jobs: set -euo pipefail sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ steps.version.outputs.cargo_version }}"'"/}' Cargo.toml + - name: Set up zig musl wrappers + if: contains(steps.target.outputs.target, 'musl') + run: | + set -euo pipefail + ZIG="$(mise which zig)" + ZIG_TARGET="${{ steps.target.outputs.zig_target }}" + mkdir -p /tmp/zig-musl + + # cc-rs injects --target=, which zig does not parse. + # Strip caller-provided --target and use the wrapper's zig target. 
+ for tool in cc c++; do + printf '#!/bin/bash\nargs=()\nfor arg in "$@"; do\n case "$arg" in\n --target=*) ;;\n *) args+=("$arg") ;;\n esac\ndone\nexec "%s" %s --target=%s "${args[@]}"\n' \ + "$ZIG" "$tool" "$ZIG_TARGET" > "/tmp/zig-musl/${tool}" + chmod +x "/tmp/zig-musl/${tool}" + done + + TARGET_ENV=$(echo "${{ steps.target.outputs.target }}" | tr '-' '_') + TARGET_ENV_UPPER=${TARGET_ENV^^} + + echo "CC_${TARGET_ENV}=/tmp/zig-musl/cc" >> "$GITHUB_ENV" + echo "CXX_${TARGET_ENV}=/tmp/zig-musl/c++" >> "$GITHUB_ENV" + echo "CARGO_TARGET_${TARGET_ENV_UPPER}_LINKER=/tmp/zig-musl/cc" >> "$GITHUB_ENV" + echo "CARGO_TARGET_${TARGET_ENV_UPPER}_RUSTFLAGS=-Clink-self-contained=no" >> "$GITHUB_ENV" + - name: Build ${{ steps.target.outputs.binary }} (${{ steps.target.outputs.target }}) env: # Preserve the release-codegen setting used by the old Dockerfile @@ -171,6 +210,7 @@ jobs: OPENSHELL_IMAGE_TAG: ${{ inputs['image-tag'] }} run: | set -euo pipefail + mise x -- rustup target add "${{ steps.target.outputs.target }}" args=( --release --target "${{ steps.target.outputs.target }}" @@ -192,8 +232,7 @@ jobs: OUTPUT="$("$BIN" --version)" echo "$OUTPUT" grep -q "^${{ steps.target.outputs.binary }} " <<<"$OUTPUT" - # Record glibc linkage so drift from the Ubuntu noble runtime base - # image is visible in logs. + # Record linkage so image runtime drift is visible in logs. 
ldd --version ldd "$BIN" || true diff --git a/architecture/build.md b/architecture/build.md index cfe13c4b1..9480a4f73 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -12,7 +12,8 @@ OpenShell builds these main artifacts: |---|---| | Gateway binary | `crates/openshell-server` | | CLI package and Python SDK | `python/openshell` plus Rust binaries where packaged | -| Gateway and supervisor container images | `deploy/docker/Dockerfile.images` | +| Gateway container image | `deploy/docker/Dockerfile.gateway` | +| Supervisor container image | `deploy/docker/Dockerfile.supervisor` | | Helm chart | `deploy/helm/openshell` | | VM driver/runtime assets | `crates/openshell-driver-vm` | | Published docs site | `docs/` rendered by Fern config in `fern/` | @@ -21,10 +22,14 @@ Sandbox community images are built outside this repository. ## Container Builds -The Docker image pipeline stages prebuilt Rust binaries, then builds container -images from `deploy/docker/Dockerfile.images`. CI builds native artifacts on the -target architecture, stages them under `deploy/docker/.build/`, and then uses -Buildx to publish per-architecture images and multi-architecture tags. +The Docker image pipeline stages prebuilt Rust binaries, then builds the gateway +image from `deploy/docker/Dockerfile.gateway` and the supervisor image from +`deploy/docker/Dockerfile.supervisor`. CI builds native artifacts on the target +architecture, stages them under `deploy/docker/.build/`, and then uses Buildx to +publish per-architecture images and multi-architecture tags. The gateway image +uses the NVIDIA distroless C/C++ runtime. The supervisor image remains +`scratch`, so the staged `openshell-sandbox` image binary is built as a static +musl binary. Gateway image builds bake the corresponding supervisor image tag into the gateway binary so Docker sandboxes do not depend on `:latest` by default. 
Package formulas also pin Docker supervisor extraction to the matching release diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index 5b88010e4..6b60613ed 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -86,8 +86,8 @@ sequenceDiagram C->>C: entrypoint: /opt/openshell/bin/openshell-sandbox ``` -The `supervisor` target in `deploy/docker/Dockerfile.images` copies the -`openshell-sandbox` binary to `/openshell-sandbox` in the supervisor image. +The supervisor image from `deploy/docker/Dockerfile.supervisor` copies the static +`openshell-sandbox` binary to `/openshell-sandbox`. Mounting that image at `/opt/openshell/bin` makes the binary available as `/opt/openshell/bin/openshell-sandbox`. @@ -352,4 +352,4 @@ matter compared to cluster or rootful runtimes: netns, proxy, and relay behavior shared by all drivers. - Container engine abstraction: `tasks/scripts/container-engine.sh` for build/deploy support across Docker and Podman. -- Supervisor image build: `deploy/docker/Dockerfile.images`. +- Supervisor image build: `deploy/docker/Dockerfile.supervisor`. diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci index 3c669a96f..ee67f97b2 100644 --- a/deploy/docker/Dockerfile.ci +++ b/deploy/docker/Dockerfile.ci @@ -29,6 +29,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libz3-dev \ pkg-config \ libssl-dev \ + musl-tools \ openssh-client \ python3 \ python3-venv \ diff --git a/deploy/docker/Dockerfile.gateway b/deploy/docker/Dockerfile.gateway new file mode 100644 index 000000000..f39dea409 --- /dev/null +++ b/deploy/docker/Dockerfile.gateway @@ -0,0 +1,61 @@ +# syntax=docker/dockerfile:1.4 + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Gateway image build. 
+# +# Rust binaries are normally built natively before the image build and staged at: +# deploy/docker/.build/prebuilt-binaries//openshell-gateway +# +# Local dev callers such as Skaffold can pass BUILD_FROM_SOURCE=1 to compile the +# gateway inside Docker instead. + +ARG BUILD_FROM_SOURCE=0 +ARG GATEWAY_BASE_IMAGE=nvcr.io/nvidia/distroless/cc:4.0.0 + +FROM rust:1.95.0-slim-bookworm AS rust-builder + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + libssl-dev \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /build + +COPY Cargo.toml Cargo.lock ./ +COPY crates/ crates/ +COPY proto/ proto/ +COPY providers/ providers/ + +RUN --mount=type=cache,target=/usr/local/cargo/registry \ + --mount=type=cache,target=/build/target \ + cargo build --release \ + --features "openshell-core/dev-settings" \ + -p openshell-server \ + --bin openshell-gateway \ + && mkdir -p /build/out \ + && install -m 0755 target/release/openshell-gateway /build/out/openshell-gateway + +FROM scratch AS gateway-binary-0 +ARG TARGETARCH +COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-gateway /build/out/openshell-gateway + +FROM rust-builder AS gateway-binary-1 + +FROM gateway-binary-${BUILD_FROM_SOURCE} AS gateway-binary + +FROM ${GATEWAY_BASE_IMAGE} AS gateway + +WORKDIR /app + +COPY --from=gateway-binary /build/out/openshell-gateway /usr/local/bin/openshell-gateway + +USER 65532:65532 +EXPOSE 8080 + +ENTRYPOINT ["/usr/local/bin/openshell-gateway"] +CMD ["--bind-address", "0.0.0.0", "--port", "8080"] diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images deleted file mode 100644 index 62662fa93..000000000 --- a/deploy/docker/Dockerfile.images +++ /dev/null @@ -1,122 +0,0 @@ -# syntax=docker/dockerfile:1.4 - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# Shared OpenShell image build graph. -# -# Targets: -# gateway Final gateway image -# supervisor Final supervisor image (Ubuntu base, supervisor binary) -# -# Rust binaries are built natively before the image build and staged at: -# deploy/docker/.build/prebuilt-binaries//openshell-{gateway,sandbox} -# -# For local dev (Skaffold), pass --build-arg BUILD_FROM_SOURCE=1 to compile -# binaries inside Docker instead. BuildKit only executes the selected binary -# staging stage, so missing prebuilt files do not cause a build failure. - -# Controls binary source: 0 = prebuilt (release), 1 = compile in Docker (local dev). -# Must be declared here (global scope) so it can be used in FROM instructions below. -ARG BUILD_FROM_SOURCE=0 - -# --------------------------------------------------------------------------- -# Optional in-Docker Rust build (BUILD_FROM_SOURCE=1, local dev only) -# --------------------------------------------------------------------------- -FROM rust:1.95.0-slim-bookworm AS rust-builder - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - pkg-config \ - libssl-dev \ - ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /build - -COPY Cargo.toml Cargo.lock ./ -COPY crates/ crates/ -COPY proto/ proto/ -COPY providers/ providers/ - -RUN --mount=type=cache,target=/usr/local/cargo/registry \ - --mount=type=cache,target=/build/target \ - cargo build --release \ - --features "openshell-core/dev-settings" \ - --bin openshell-gateway \ - --bin openshell-sandbox \ - && mkdir -p /build/out \ - && install -m 0755 target/release/openshell-gateway /build/out/openshell-gateway \ - && install -m 0755 target/release/openshell-sandbox /build/out/openshell-sandbox - -# --------------------------------------------------------------------------- -# Per-arch binary stages -# --------------------------------------------------------------------------- - -# Prebuilt path (release 
default, BUILD_FROM_SOURCE=0) -FROM scratch AS gateway-binary-0 -ARG TARGETARCH -# --chmod=755 preserves the executable bit through actions/upload-artifact + -# download-artifact, which strip exec perms during the roundtrip. -COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-gateway /build/out/openshell-gateway - -# Source-built path (local dev, BUILD_FROM_SOURCE=1) -FROM rust-builder AS gateway-binary-1 - -FROM gateway-binary-${BUILD_FROM_SOURCE} AS gateway-binary - -# Prebuilt path (release default, BUILD_FROM_SOURCE=0) -FROM scratch AS supervisor-binary-0 -ARG TARGETARCH -# --chmod=755 preserves the executable bit through actions/upload-artifact + -# download-artifact, which strip exec perms during the roundtrip. -COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /build/out/openshell-sandbox - -# Source-built path (local dev, BUILD_FROM_SOURCE=1) -FROM rust-builder AS supervisor-binary-1 - -FROM supervisor-binary-${BUILD_FROM_SOURCE} AS supervisor-binary - -# --------------------------------------------------------------------------- -# Final gateway image -# --------------------------------------------------------------------------- -FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 AS gateway - -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates && \ - apt-get install -y --only-upgrade gpgv && \ - rm -rf /var/lib/apt/lists/* - -RUN useradd --create-home --user-group openshell - -WORKDIR /app - -COPY --from=gateway-binary /build/out/openshell-gateway /usr/local/bin/ - -RUN mkdir -p /build/crates/openshell-server -COPY --chmod=755 crates/openshell-server/migrations /build/crates/openshell-server/migrations - -USER openshell -EXPOSE 8080 - -ENTRYPOINT ["openshell-gateway"] -CMD ["--bind-address", "0.0.0.0", "--port", "8080"] - -# --------------------------------------------------------------------------- -# Final supervisor image -# 
--------------------------------------------------------------------------- -# Supervisor image based on the same NVIDIA Ubuntu base used by the gateway. -# -# Used by: -# - Docker driver: binary is extracted from the image and run inside the -# agent container. -# - Podman driver: image is mounted as an OCI volume at /opt/openshell/bin. -# - Kubernetes driver: image runs as an init container that invokes the -# binary's `copy-self` subcommand to seed an emptyDir volume. -# -# An Ubuntu base provides glibc and the dynamic loader needed to exec the -# dynamically linked binary. `FROM scratch` would be smaller but cannot run -# the binary, breaking the Kubernetes init-container path. -FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 AS supervisor -COPY --from=supervisor-binary /build/out/openshell-sandbox /openshell-sandbox diff --git a/deploy/docker/Dockerfile.supervisor b/deploy/docker/Dockerfile.supervisor new file mode 100644 index 000000000..2c63a94d3 --- /dev/null +++ b/deploy/docker/Dockerfile.supervisor @@ -0,0 +1,64 @@ +# syntax=docker/dockerfile:1.4 + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Supervisor image build. +# +# The final image intentionally stays `scratch`: it only carries the static +# openshell-sandbox binary used by Docker extraction, Podman image volumes, and +# the Kubernetes init container copy-self path. +# +# Rust binaries are normally built natively before the image build and staged at: +# deploy/docker/.build/prebuilt-binaries//openshell-sandbox +# +# Local dev callers such as Skaffold can pass BUILD_FROM_SOURCE=1 to compile the +# supervisor inside Docker instead. 
+ +ARG BUILD_FROM_SOURCE=0 + +FROM rust:1.95.0-slim-bookworm AS rust-builder +ARG TARGETARCH +ARG BUILDARCH + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +COPY deploy/docker/cross-build.sh /usr/local/bin/cross-build.sh +RUN . /usr/local/bin/cross-build.sh \ + && install_musl_toolchain \ + && add_musl_target + +WORKDIR /build + +COPY Cargo.toml Cargo.lock ./ +COPY crates/ crates/ +COPY proto/ proto/ +COPY providers/ providers/ + +RUN --mount=type=cache,target=/usr/local/cargo/registry \ + --mount=type=cache,target=/build/target \ + . /usr/local/bin/cross-build.sh \ + && cargo_musl_build --release \ + --features "openshell-core/dev-settings" \ + -p openshell-sandbox \ + --bin openshell-sandbox \ + && mkdir -p /build/out \ + && install -m 0755 "$(musl_output_dir release)/openshell-sandbox" /build/out/openshell-sandbox + +FROM scratch AS supervisor-binary-0 +ARG TARGETARCH +COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /build/out/openshell-sandbox + +FROM rust-builder AS supervisor-binary-1 + +FROM supervisor-binary-${BUILD_FROM_SOURCE} AS supervisor-binary + +FROM scratch AS supervisor + +COPY --from=supervisor-binary /build/out/openshell-sandbox /openshell-sandbox + +ENTRYPOINT ["/openshell-sandbox"] diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index 2de9ee4e6..ccb8fec54 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -# Local dev: builds gateway + supervisor images using Dockerfile.images with +# Local dev: builds gateway + supervisor images using split Dockerfiles with # BUILD_FROM_SOURCE=1, which compiles Rust binaries inside Docker without # requiring pre-staged artifacts. # @@ -28,7 +28,7 @@ build: --target gateway \ --tag "$IMAGE" \ --load \ - --file deploy/docker/Dockerfile.images \ + --file deploy/docker/Dockerfile.gateway \ . dependencies: paths: @@ -36,7 +36,7 @@ build: - Cargo.lock - crates/** - proto/** - - deploy/docker/Dockerfile.images + - deploy/docker/Dockerfile.gateway - crates/openshell-server/migrations/** - image: openshell/supervisor context: ../../.. @@ -47,7 +47,7 @@ build: --target supervisor \ --tag "$IMAGE" \ --load \ - --file deploy/docker/Dockerfile.images \ + --file deploy/docker/Dockerfile.supervisor \ . dependencies: paths: @@ -55,7 +55,8 @@ build: - Cargo.lock - crates/** - proto/** - - deploy/docker/Dockerfile.images + - deploy/docker/Dockerfile.supervisor + - deploy/docker/cross-build.sh deploy: helm: releases: diff --git a/deploy/helm/openshell/templates/certgen.yaml b/deploy/helm/openshell/templates/certgen.yaml index d8136d581..ef4500db6 100644 --- a/deploy/helm/openshell/templates/certgen.yaml +++ b/deploy/helm/openshell/templates/certgen.yaml @@ -95,7 +95,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - command: ["openshell-gateway"] + command: ["/usr/local/bin/openshell-gateway"] args: - generate-certs - --server-secret-name={{ .Values.server.tls.certSecretName }} diff --git a/tasks/scripts/docker-build-image.sh b/tasks/scripts/docker-build-image.sh index 2fb86bc5e..b733ec4a1 100755 --- a/tasks/scripts/docker-build-image.sh +++ b/tasks/scripts/docker-build-image.sh @@ -93,31 +93,29 @@ ensure_prebuilt_binaries() { TARGET=${1:?"Usage: docker-build-image.sh [extra-args...]"} shift -DOCKERFILE="deploy/docker/Dockerfile.images" -if [[ ! 
-f "${DOCKERFILE}" ]]; then - echo "Error: Dockerfile not found: ${DOCKERFILE}" >&2 - exit 1 -fi - IS_FINAL_IMAGE=0 IMAGE_NAME="" DOCKER_TARGET="" +DOCKERFILE="" case "${TARGET}" in gateway) IS_FINAL_IMAGE=1 IMAGE_NAME="openshell/gateway" DOCKER_TARGET="gateway" + DOCKERFILE="deploy/docker/Dockerfile.gateway" ;; supervisor) IS_FINAL_IMAGE=1 IMAGE_NAME="openshell/supervisor" DOCKER_TARGET="supervisor" + DOCKERFILE="deploy/docker/Dockerfile.supervisor" ;; supervisor-output) # Backward-compat alias: same as "supervisor". IS_FINAL_IMAGE=1 IMAGE_NAME="openshell/supervisor" DOCKER_TARGET="supervisor" + DOCKERFILE="deploy/docker/Dockerfile.supervisor" ;; *) echo "Error: unsupported target '${TARGET}'" >&2 @@ -125,6 +123,11 @@ case "${TARGET}" in ;; esac +if [[ ! -f "${DOCKERFILE}" ]]; then + echo "Error: Dockerfile not found: ${DOCKERFILE}" >&2 + exit 1 +fi + if [[ -n "${IMAGE_REGISTRY:-}" && "${IS_FINAL_IMAGE}" == "1" ]]; then IMAGE_NAME="${IMAGE_REGISTRY}/${IMAGE_NAME#openshell/}" fi diff --git a/tasks/scripts/stage-prebuilt-binaries.sh b/tasks/scripts/stage-prebuilt-binaries.sh index 21d97c472..d05c0cc75 100755 --- a/tasks/scripts/stage-prebuilt-binaries.sh +++ b/tasks/scripts/stage-prebuilt-binaries.sh @@ -21,9 +21,22 @@ normalize_arch() { } target_triple() { + local libc=${2:-gnu} case "$1" in - amd64) echo "x86_64-unknown-linux-gnu" ;; - arm64) echo "aarch64-unknown-linux-gnu" ;; + amd64) + if [[ "$libc" == "musl" ]]; then + echo "x86_64-unknown-linux-musl" + else + echo "x86_64-unknown-linux-gnu" + fi + ;; + arm64) + if [[ "$libc" == "musl" ]]; then + echo "aarch64-unknown-linux-musl" + else + echo "aarch64-unknown-linux-gnu" + fi + ;; *) echo "unsupported architecture: $1" >&2 exit 1 @@ -71,10 +84,10 @@ components_for_target() { echo "gateway" ;; sandbox|supervisor|supervisor-output) - echo "sandbox" + echo "supervisor" ;; all) - echo "gateway sandbox" + echo "gateway supervisor" ;; *) usage @@ -88,10 +101,12 @@ resolve_component() { gateway) 
crate=openshell-server binary=openshell-gateway + target_libc=gnu ;; - sandbox) + supervisor) crate=openshell-sandbox binary=openshell-sandbox + target_libc=musl ;; *) echo "unsupported binary component: $1" >&2 @@ -130,7 +145,7 @@ build_component_for_arch() { local current_host_arch resolve_component "$component" - target="$(target_triple "$arch")" + target="$(target_triple "$arch" "$target_libc")" stage="${ROOT}/deploy/docker/.build/prebuilt-binaries/${arch}" features="${EXTRA_CARGO_FEATURES:-openshell-core/dev-settings}" current_host_os="$(host_os)" From f76361152f81ad7c235d3518f83eba8b8397aec1 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:17:19 -0700 Subject: [PATCH 2/3] refactor(docker): use native rust builds for split gateway/supervisor images Drop the in-Docker BUILD_FROM_SOURCE path so both images consume only prebuilt binaries staged natively via tasks/scripts/stage-prebuilt-binaries.sh. This mirrors what CI does and reuses the host's cargo target cache and sccache across rebuilds. - Dockerfile.gateway: nvcr.io/nvidia/distroless/cc:v4.0.4 base (the 4.0.0 tag does not exist on nvcr.io; the registry uses a v prefix). GNU-linked binary copied to /usr/local/bin. - Dockerfile.supervisor: scratch base, static musl binary. Static linkage lets the image stay scratch while still being executable as a Kubernetes init container. - skaffold.yaml: each artifact invokes tasks/scripts/docker-build-image.sh, which stages the binary natively (cargo / cargo-zigbuild) and then builds the image. Drops the cross-build.sh dependency from the supervisor build. - seccomp.rs: add a local SYS_kexec_file_load constant for musl/aarch64. libc 0.2.185 omits the symbol from its musl/aarch64 bindings, so the supervisor's seccomp filter previously failed to compile for that target. - architecture/build.md: describe the native-first pipeline and per-image runtime choices. Local validation: gateway image 101MB (was 194MB), supervisor image 21.7MB. 
helm:skaffold:run deploys cleanly; the static musl supervisor binary runs correctly in a non-glibc agent container. --- architecture/build.md | 31 +++++++--- .../src/sandbox/linux/seccomp.rs | 14 ++++- deploy/docker/Dockerfile.gateway | 56 +++++------------ deploy/docker/Dockerfile.supervisor | 62 +++++-------------- deploy/helm/openshell/skaffold.yaml | 37 +++++------ 5 files changed, 82 insertions(+), 118 deletions(-) diff --git a/architecture/build.md b/architecture/build.md index 9480a4f73..62d21cdf4 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -22,14 +22,29 @@ Sandbox community images are built outside this repository. ## Container Builds -The Docker image pipeline stages prebuilt Rust binaries, then builds the gateway -image from `deploy/docker/Dockerfile.gateway` and the supervisor image from -`deploy/docker/Dockerfile.supervisor`. CI builds native artifacts on the target -architecture, stages them under `deploy/docker/.build/`, and then uses Buildx to -publish per-architecture images and multi-architecture tags. The gateway image -uses the NVIDIA distroless C/C++ runtime. The supervisor image remains -`scratch`, so the staged `openshell-sandbox` image binary is built as a static -musl binary. +The Docker image pipeline is a two-step flow: build the Rust binary natively +for the target architecture, then assemble the container image from the +prebuilt binary. The gateway image is built from `deploy/docker/Dockerfile.gateway` +and the supervisor image from `deploy/docker/Dockerfile.supervisor`. Neither +Dockerfile compiles Rust — both copy a staged binary out of +`deploy/docker/.build/prebuilt-binaries//` into the final image. + +Binary staging is driven by `tasks/scripts/stage-prebuilt-binaries.sh`, which +runs `cargo build` natively on a matching host or `cargo zigbuild` when +cross-compiling. 
CI invokes the same staging step via the +`rust-native-build.yml` workflow (per-architecture, per-component) and uploads +the result as an artifact that the image build job downloads back into the +staging directory before running Buildx. + +Runtime layout: + +- **Gateway**: `nvcr.io/nvidia/distroless/cc` base, GNU-linked binary at + `/usr/local/bin/openshell-gateway`, runs as UID/GID `65532:65532`. +- **Supervisor**: `scratch` base, static musl binary at `/openshell-sandbox`. + Static linkage is required because the image is mounted/extracted into + sandbox environments (Docker extraction, Podman image volumes, Kubernetes + init-container copy-self) and cannot rely on a dynamic loader. + Gateway image builds bake the corresponding supervisor image tag into the gateway binary so Docker sandboxes do not depend on `:latest` by default. Package formulas also pin Docker supervisor extraction to the matching release diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs index f61464023..1044623f5 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs @@ -25,6 +25,16 @@ use tracing::debug; /// Value of `SECCOMP_SET_MODE_FILTER` (linux/seccomp.h). const SECCOMP_SET_MODE_FILTER: u64 = 1; +// libc 0.2.185 omits `SYS_kexec_file_load` from the musl/aarch64 bindings even +// though the kernel exposes syscall 294. Fall back to the literal so the +// supervisor's seccomp filter still blocks fileless kernel-image loads when +// built statically against musl on aarch64. +#[cfg(all(target_arch = "aarch64", target_env = "musl"))] +#[allow(non_upper_case_globals)] +const SYS_kexec_file_load: libc::c_long = 294; +#[cfg(not(all(target_arch = "aarch64", target_env = "musl")))] +use libc::SYS_kexec_file_load; + /// Apply the supervisor seccomp filter across the running process. 
/// /// This runs after privileged startup helpers complete and synchronizes the @@ -81,7 +91,7 @@ fn build_supervisor_prelude_rules() -> BTreeMap> { libc::SYS_finit_module, libc::SYS_delete_module, libc::SYS_kexec_load, - libc::SYS_kexec_file_load, + SYS_kexec_file_load, ] { rules.entry(syscall).or_default(); } @@ -423,7 +433,7 @@ mod tests { libc::SYS_finit_module, libc::SYS_delete_module, libc::SYS_kexec_load, - libc::SYS_kexec_file_load, + SYS_kexec_file_load, ] { assert!( filter_rules.contains_key(&syscall), diff --git a/deploy/docker/Dockerfile.gateway b/deploy/docker/Dockerfile.gateway index f39dea409..785636313 100644 --- a/deploy/docker/Dockerfile.gateway +++ b/deploy/docker/Dockerfile.gateway @@ -5,54 +5,30 @@ # Gateway image build. # -# Rust binaries are normally built natively before the image build and staged at: +# The Rust binary is built natively before this image build runs and staged at: # deploy/docker/.build/prebuilt-binaries//openshell-gateway # -# Local dev callers such as Skaffold can pass BUILD_FROM_SOURCE=1 to compile the -# gateway inside Docker instead. 
- -ARG BUILD_FROM_SOURCE=0 -ARG GATEWAY_BASE_IMAGE=nvcr.io/nvidia/distroless/cc:4.0.0 - -FROM rust:1.95.0-slim-bookworm AS rust-builder - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - cmake \ - libssl-dev \ - pkg-config \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /build - -COPY Cargo.toml Cargo.lock ./ -COPY crates/ crates/ -COPY proto/ proto/ -COPY providers/ providers/ - -RUN --mount=type=cache,target=/usr/local/cargo/registry \ - --mount=type=cache,target=/build/target \ - cargo build --release \ - --features "openshell-core/dev-settings" \ - -p openshell-server \ - --bin openshell-gateway \ - && mkdir -p /build/out \ - && install -m 0755 target/release/openshell-gateway /build/out/openshell-gateway - -FROM scratch AS gateway-binary-0 -ARG TARGETARCH -COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-gateway /build/out/openshell-gateway - -FROM rust-builder AS gateway-binary-1 +# Use tasks/scripts/docker-build-image.sh gateway (or `mise run build:docker:gateway`) +# to stage the binary and build the image in one step. CI builds the binary +# per-architecture via the `rust-native-build.yml` workflow and uploads it as +# an artifact, which is downloaded into the same staging directory before the +# image build job runs. +# +# The runtime is `nvcr.io/nvidia/distroless/cc:v4.0.4`, which provides glibc and +# the dynamic loader needed by the GNU-linked gateway binary while keeping the +# attack surface small. -FROM gateway-binary-${BUILD_FROM_SOURCE} AS gateway-binary +ARG GATEWAY_BASE_IMAGE=nvcr.io/nvidia/distroless/cc:v4.0.4 FROM ${GATEWAY_BASE_IMAGE} AS gateway +ARG TARGETARCH + WORKDIR /app -COPY --from=gateway-binary /build/out/openshell-gateway /usr/local/bin/openshell-gateway +# --chmod=755 preserves the executable bit through actions/upload-artifact + +# download-artifact, which strip exec perms during the roundtrip.
+COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-gateway /usr/local/bin/openshell-gateway USER 65532:65532 EXPOSE 8080 diff --git a/deploy/docker/Dockerfile.supervisor b/deploy/docker/Dockerfile.supervisor index 2c63a94d3..e9b2b2684 100644 --- a/deploy/docker/Dockerfile.supervisor +++ b/deploy/docker/Dockerfile.supervisor @@ -5,60 +5,26 @@ # Supervisor image build. # -# The final image intentionally stays `scratch`: it only carries the static -# openshell-sandbox binary used by Docker extraction, Podman image volumes, and -# the Kubernetes init container copy-self path. +# The final image is `scratch`: it only carries the static `openshell-sandbox` +# binary used by Docker extraction, Podman image volumes, and the Kubernetes +# init container copy-self path. A static musl binary lets the image stay +# `scratch` while still being executable as an init container. # -# Rust binaries are normally built natively before the image build and staged at: +# The Rust binary is built natively before this image build runs and staged at: # deploy/docker/.build/prebuilt-binaries//openshell-sandbox # -# Local dev callers such as Skaffold can pass BUILD_FROM_SOURCE=1 to compile the -# supervisor inside Docker instead. +# Use tasks/scripts/docker-build-image.sh supervisor (or `mise run build:docker:supervisor`) +# to stage the binary and build the image in one step. CI builds the binary +# per-architecture via the `rust-native-build.yml` workflow (with the musl +# target) and uploads it as an artifact, which is downloaded into the same +# staging directory before the image build job runs. -ARG BUILD_FROM_SOURCE=0 - -FROM rust:1.95.0-slim-bookworm AS rust-builder -ARG TARGETARCH -ARG BUILDARCH - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - pkg-config \ - && rm -rf /var/lib/apt/lists/* - -COPY deploy/docker/cross-build.sh /usr/local/bin/cross-build.sh -RUN . 
/usr/local/bin/cross-build.sh \ - && install_musl_toolchain \ - && add_musl_target - -WORKDIR /build - -COPY Cargo.toml Cargo.lock ./ -COPY crates/ crates/ -COPY proto/ proto/ -COPY providers/ providers/ - -RUN --mount=type=cache,target=/usr/local/cargo/registry \ - --mount=type=cache,target=/build/target \ - . /usr/local/bin/cross-build.sh \ - && cargo_musl_build --release \ - --features "openshell-core/dev-settings" \ - -p openshell-sandbox \ - --bin openshell-sandbox \ - && mkdir -p /build/out \ - && install -m 0755 "$(musl_output_dir release)/openshell-sandbox" /build/out/openshell-sandbox +FROM scratch AS supervisor -FROM scratch AS supervisor-binary-0 ARG TARGETARCH -COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /build/out/openshell-sandbox - -FROM rust-builder AS supervisor-binary-1 - -FROM supervisor-binary-${BUILD_FROM_SOURCE} AS supervisor-binary - -FROM scratch AS supervisor -COPY --from=supervisor-binary /build/out/openshell-sandbox /openshell-sandbox +# --chmod=755 preserves the executable bit through actions/upload-artifact + +# download-artifact, which strip exec perms during the roundtrip. +COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /openshell-sandbox ENTRYPOINT ["/openshell-sandbox"] diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index ccb8fec54..779211877 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -1,12 +1,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Local dev: builds gateway + supervisor images using split Dockerfiles with -# BUILD_FROM_SOURCE=1, which compiles Rust binaries inside Docker without -# requiring pre-staged artifacts. 
+# Local dev: builds gateway + supervisor images via tasks/scripts/docker-build-image.sh, +# which first stages Rust binaries natively on the host (using cargo / cargo-zigbuild +# when cross-compiling) and then builds the image from the prebuilt binary. This +# mirrors CI and is faster than compiling inside Docker on every rebuild because +# the host's cargo target cache and sccache are reused across iterations. # # Run from repo root: -# skaffold dev -f deploy/helm/openshell/skaffold.yaml +# mise run helm:skaffold:dev +# mise run helm:skaffold:run # # See https://skaffold.dev/docs/deployers/helm/ (setValueTemplates, IMAGE_* fields). apiVersion: skaffold/v4beta14 @@ -23,13 +26,9 @@ build: context: ../../.. custom: buildCommand: | - docker buildx build \ - --build-arg BUILD_FROM_SOURCE=1 \ - --target gateway \ - --tag "$IMAGE" \ - --load \ - --file deploy/docker/Dockerfile.gateway \ - . + IMAGE_NAME="${IMAGE%:*}" \ + IMAGE_TAG="${IMAGE##*:}" \ + tasks/scripts/docker-build-image.sh gateway dependencies: paths: - Cargo.toml @@ -37,18 +36,15 @@ build: - crates/** - proto/** - deploy/docker/Dockerfile.gateway - - crates/openshell-server/migrations/** + - tasks/scripts/docker-build-image.sh + - tasks/scripts/stage-prebuilt-binaries.sh - image: openshell/supervisor context: ../../.. custom: buildCommand: | - docker buildx build \ - --build-arg BUILD_FROM_SOURCE=1 \ - --target supervisor \ - --tag "$IMAGE" \ - --load \ - --file deploy/docker/Dockerfile.supervisor \ - . 
+ IMAGE_NAME="${IMAGE%:*}" \ + IMAGE_TAG="${IMAGE##*:}" \ + tasks/scripts/docker-build-image.sh supervisor dependencies: paths: - Cargo.toml @@ -56,7 +52,8 @@ build: - crates/** - proto/** - deploy/docker/Dockerfile.supervisor - - deploy/docker/cross-build.sh + - tasks/scripts/docker-build-image.sh + - tasks/scripts/stage-prebuilt-binaries.sh deploy: helm: releases: From 59a089afef856b13538c330b99174864dc01d609 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:34:05 -0700 Subject: [PATCH 3/3] style(driver-kubernetes): apply cargo fmt to driver.rs Fixes formatting drift flagged by `cargo fmt --all -- --check`. --- crates/openshell-driver-kubernetes/src/driver.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index a6107f907..cde1f4b22 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -2308,10 +2308,18 @@ mod tests { #[test] fn log_level_propagates_as_env_var_to_sandbox_pod() { - let spec = SandboxSpec { log_level: "debug".to_string(), ..SandboxSpec::default() }; + let spec = SandboxSpec { + log_level: "debug".to_string(), + ..SandboxSpec::default() + }; let cr = sandbox_to_k8s_spec(Some(&spec), &SandboxPodParams::default()); - let env = cr["spec"]["podTemplate"]["spec"]["containers"][0]["env"].as_array().unwrap(); - assert!(env.iter().any(|e| e["name"] == "OPENSHELL_LOG_LEVEL" && e["value"] == "debug")); + let env = cr["spec"]["podTemplate"]["spec"]["containers"][0]["env"] + .as_array() + .unwrap(); + assert!( + env.iter() + .any(|e| e["name"] == "OPENSHELL_LOG_LEVEL" && e["value"] == "debug") + ); assert!(cr["spec"].get("logLevel").is_none()); } }