From 0ec31fe08876efbfc6aef96c9c9dddf33859e476 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 00:40:39 -0700 Subject: [PATCH 1/4] ci(canary): switch to single-command sandbox create canary Now that OPENSHELL_GATEWAY_HOST is supported in auto-bootstrap, remove the explicit `gateway start` step. The canary now tests the real zero-to-sandbox user path: a single `sandbox create` that auto-bootstraps the gateway, creates a sandbox, and runs a command. --- .github/workflows/release-canary.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 90251610..d71b9cf7 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -45,6 +45,11 @@ jobs: - /var/run/docker.sock:/var/run/docker.sock env: OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # The CI container mounts the host Docker socket, so the gateway + # container is a sibling — not reachable at 127.0.0.1 from inside + # this container. OPENSHELL_GATEWAY_HOST tells the auto-bootstrap + # to advertise a reachable address instead. + OPENSHELL_GATEWAY_HOST: host.docker.internal steps: - uses: actions/checkout@v4 @@ -90,18 +95,15 @@ jobs: echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts fi - - name: Start gateway - env: - # Use OPENSHELL_GATEWAY_HOST when supported (CLI >= next release), - # fall back to the explicit --gateway-host flag for older CLIs. - OPENSHELL_GATEWAY_HOST: host.docker.internal - run: openshell gateway start --gateway-host host.docker.internal - - name: Run canary test run: | set -euo pipefail - echo "Creating sandbox and running 'echo hello world'..." + # Single-command canary: tests the full zero-to-sandbox path. + # `sandbox create` detects no gateway, auto-bootstraps one (using + # OPENSHELL_GATEWAY_HOST for the advertised address), then creates + # a sandbox and runs the command inside it. + echo "Creating sandbox (with auto-bootstrap) and running 'echo hello world'..." OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || { EXIT_CODE=$? echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}" From 63ca7727d9b054e66766bc38da85e23614779ebe Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 10:40:19 -0700 Subject: [PATCH 2/4] ci(canary): add two-step gateway start + sandbox create canary test Add a parallel canary job that explicitly tests the gateway start followed by sandbox create flow, separate from the existing auto-bootstrap path. --- .github/workflows/release-canary.yml | 106 ++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index d71b9cf7..d792806f 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -20,8 +20,8 @@ defaults: shell: bash jobs: - acceptance: - name: Canary (${{ matrix.arch }}) + canary-auto-bootstrap: + name: Canary Auto-Bootstrap (${{ matrix.arch }}) if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} strategy: fail-fast: false @@ -119,3 +119,105 @@ jobs: echo "::error::Canary test failed: 'hello world' not found in output" exit 1 fi + + canary-two-step: + name: Canary Two-Step (${{ matrix.arch }}) + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} + strategy: + fail-fast: false + matrix: + include: + - arch: amd64 + runner: build-amd64 + target: x86_64-unknown-linux-musl + - arch: arm64 + runner: build-arm64 + target: aarch64-unknown-linux-musl + runs-on: ${{ matrix.runner }} + timeout-minutes: 30 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + volumes: + - /var/run/docker.sock:/var/run/docker.sock + env: + OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_GATEWAY_HOST: host.docker.internal + steps: + - uses: actions/checkout@v4 + + - name: Determine release tag + id: release + run: | + set -euo pipefail + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "tag=${{ inputs.tag }}" >> "$GITHUB_OUTPUT" + else + WORKFLOW_NAME="${{ github.event.workflow_run.name }}" + if [ "$WORKFLOW_NAME" = "Release Dev" ]; then + echo "tag=devel" >> "$GITHUB_OUTPUT" + elif [ "$WORKFLOW_NAME" = "Release Tag" ]; then + TAG="${{ github.event.workflow_run.head_branch }}" + if [ -z "$TAG" ]; then + echo "::error::Could not determine release tag from workflow_run" + exit 1 + fi + echo "tag=${TAG}" >> "$GITHUB_OUTPUT" + else + echo "::error::Unexpected triggering workflow: ${WORKFLOW_NAME}" + exit 1 + fi + fi + + - name: Install CLI from GitHub Release + run: ./install.sh + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_VERSION: ${{ steps.release.outputs.tag }} + + - name: Verify CLI installation + run: openshell --version + + - name: Resolve gateway host + run: | + # On Linux CI runners host.docker.internal is not set automatically + # (it's a Docker Desktop feature). Add it via the Docker bridge IP. + if ! getent hosts host.docker.internal >/dev/null 2>&1; then + BRIDGE_IP=$(docker network inspect bridge --format '{{(index .IPAM.Config 0).Gateway}}') + echo "Adding /etc/hosts entry: ${BRIDGE_IP} host.docker.internal" + echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts + fi + + - name: Start gateway + run: | + set -euo pipefail + + # Two-step canary: explicitly start the gateway first, then create + # a sandbox against it. This tests the `gateway start` + `sandbox + # create` flow separately from the auto-bootstrap path. + echo "Starting gateway..." + openshell gateway start + + - name: Run canary test + run: | + set -euo pipefail + + echo "Creating sandbox against running gateway..." + OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || { + EXIT_CODE=$? + echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}" + echo "$OUTPUT" + exit $EXIT_CODE + } + + echo "$OUTPUT" + + if echo "$OUTPUT" | grep -q "hello world"; then + echo "Two-step canary test passed: 'hello world' found in output" + else + echo "::error::Two-step canary test failed: 'hello world' not found in output" + exit 1 + fi From 4be92be59208d451679a374e293fe674314776cb Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 10:45:18 -0700 Subject: [PATCH 3/4] fix(canary): pass --gateway-host to gateway start in two-step job The gateway container is a Docker sibling, not in the CI container's network namespace. Without --gateway-host the metadata stores 127.0.0.1 which is unreachable. The auto-bootstrap path reads OPENSHELL_GATEWAY_HOST from the env, but gateway start requires it as a CLI flag. --- .github/workflows/release-canary.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index d792806f..234836a3 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -198,8 +198,13 @@ jobs: # Two-step canary: explicitly start the gateway first, then create # a sandbox against it. This tests the `gateway start` + `sandbox # create` flow separately from the auto-bootstrap path. + # + # --gateway-host is required because the gateway container is a + # Docker sibling (not in the same network namespace). Without it + # the metadata stores 127.0.0.1 which is unreachable from this + # CI container. echo "Starting gateway..." - openshell gateway start + openshell gateway start --gateway-host "$OPENSHELL_GATEWAY_HOST" - name: Run canary test run: | From 1e93e889a1e4ad5d569c580b03d9842601f4735c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 11:05:06 -0700 Subject: [PATCH 4/4] refactor(canary): consolidate jobs using matrix mode dimension Replace two separate jobs (canary-auto-bootstrap, canary-two-step) with a single job using arch x mode matrix. The only difference between modes is a conditional 'Start gateway' step for two-step mode. --- .github/workflows/release-canary.yml | 126 ++++----------------------- 1 file changed, 16 insertions(+), 110 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 234836a3..9e0108b7 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -20,12 +20,18 @@ defaults: shell: bash jobs: - canary-auto-bootstrap: - name: Canary Auto-Bootstrap (${{ matrix.arch }}) + canary: + name: Canary ${{ matrix.mode }} (${{ matrix.arch }}) if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} strategy: fail-fast: false matrix: + arch: + - amd64 + - arm64 + mode: + - auto-bootstrap + - two-step include: - arch: amd64 runner: build-amd64 @@ -95,114 +101,14 @@ jobs: echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts fi - - name: Run canary test - run: | - set -euo pipefail - - # Single-command canary: tests the full zero-to-sandbox path. - # `sandbox create` detects no gateway, auto-bootstraps one (using - # OPENSHELL_GATEWAY_HOST for the advertised address), then creates - # a sandbox and runs the command inside it. - echo "Creating sandbox (with auto-bootstrap) and running 'echo hello world'..." - OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || { - EXIT_CODE=$? - echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}" - echo "$OUTPUT" - exit $EXIT_CODE - } - - echo "$OUTPUT" - - if echo "$OUTPUT" | grep -q "hello world"; then - echo "Canary test passed: 'hello world' found in output" - else - echo "::error::Canary test failed: 'hello world' not found in output" - exit 1 - fi - - canary-two-step: - name: Canary Two-Step (${{ matrix.arch }}) - if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} - strategy: - fail-fast: false - matrix: - include: - - arch: amd64 - runner: build-amd64 - target: x86_64-unknown-linux-musl - - arch: arm64 - runner: build-arm64 - target: aarch64-unknown-linux-musl - runs-on: ${{ matrix.runner }} - timeout-minutes: 30 - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock - env: - OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_GATEWAY_HOST: host.docker.internal - steps: - - uses: actions/checkout@v4 - - - name: Determine release tag - id: release - run: | - set -euo pipefail - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "tag=${{ inputs.tag }}" >> "$GITHUB_OUTPUT" - else - WORKFLOW_NAME="${{ github.event.workflow_run.name }}" - if [ "$WORKFLOW_NAME" = "Release Dev" ]; then - echo "tag=devel" >> "$GITHUB_OUTPUT" - elif [ "$WORKFLOW_NAME" = "Release Tag" ]; then - TAG="${{ github.event.workflow_run.head_branch }}" - if [ -z "$TAG" ]; then - echo "::error::Could not determine release tag from workflow_run" - exit 1 - fi - echo "tag=${TAG}" >> "$GITHUB_OUTPUT" - else - echo "::error::Unexpected triggering workflow: ${WORKFLOW_NAME}" - exit 1 - fi - fi - - - name: Install CLI from GitHub Release - run: ./install.sh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_VERSION: ${{ steps.release.outputs.tag }} - - - name: Verify CLI installation - run: openshell --version - - - name: Resolve gateway host - run: | - # On Linux CI runners host.docker.internal is not set automatically - # (it's a Docker Desktop feature). Add it via the Docker bridge IP. - if ! getent hosts host.docker.internal >/dev/null 2>&1; then - BRIDGE_IP=$(docker network inspect bridge --format '{{(index .IPAM.Config 0).Gateway}}') - echo "Adding /etc/hosts entry: ${BRIDGE_IP} host.docker.internal" - echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts - fi - + # Two-step mode: explicitly start the gateway before creating a sandbox. + # --gateway-host is required because the gateway container is a Docker + # sibling (not in the same network namespace). Without it the metadata + # stores 127.0.0.1 which is unreachable from this CI container. - name: Start gateway + if: matrix.mode == 'two-step' run: | set -euo pipefail - - # Two-step canary: explicitly start the gateway first, then create - # a sandbox against it. This tests the `gateway start` + `sandbox - # create` flow separately from the auto-bootstrap path. - # - # --gateway-host is required because the gateway container is a - # Docker sibling (not in the same network namespace). Without it - # the metadata stores 127.0.0.1 which is unreachable from this - # CI container. echo "Starting gateway..." openshell gateway start --gateway-host "$OPENSHELL_GATEWAY_HOST" @@ -210,7 +116,7 @@ jobs: run: | set -euo pipefail - echo "Creating sandbox against running gateway..." + echo "Creating sandbox and running 'echo hello world'..." OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || { EXIT_CODE=$? echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}" @@ -221,8 +127,8 @@ jobs: echo "$OUTPUT" if echo "$OUTPUT" | grep -q "hello world"; then - echo "Two-step canary test passed: 'hello world' found in output" + echo "Canary test passed: 'hello world' found in output" else - echo "::error::Two-step canary test failed: 'hello world' not found in output" + echo "::error::Canary test failed: 'hello world' not found in output" exit 1 fi