From b69c274beceeedde0b7b355046a32cf3b745d143 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Fri, 13 Mar 2026 07:47:21 -0700
Subject: [PATCH 1/5] fix(bootstrap): detect missing sandbox supervisor binary
 during gateway health check

The cluster health check now verifies that /opt/openshell/bin/openshell-sandbox
exists in the gateway container. Without this binary, every sandbox pod crashes
with 'no such file or directory' but the gateway previously reported healthy.

- Add HEALTHCHECK_MISSING_SUPERVISOR marker to cluster-healthcheck.sh
- Add early detection in bootstrap polling loop (runtime.rs)
- Add structured error diagnosis with recovery steps (errors.rs)
- Update debug-openshell-cluster skill with new failure pattern
---
 .../skills/debug-openshell-cluster/SKILL.md   |  6 ++++
 crates/openshell-bootstrap/src/errors.rs      | 33 +++++++++++++++++++
 crates/openshell-bootstrap/src/runtime.rs     | 30 +++++++++++++++++
 deploy/docker/cluster-healthcheck.sh          | 12 +++++++
 4 files changed, 81 insertions(+)
diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md
index 2d46ea550..81b1b53e5 100644
--- a/.agents/skills/debug-openshell-cluster/SKILL.md
+++ b/.agents/skills/debug-openshell-cluster/SKILL.md
@@ -26,6 +26,7 @@ Use **only** `openshell` CLI commands (`openshell status`, `openshell doctor log
     - k3s API server readiness (`/readyz`)
     - `openshell` statefulset ready in `openshell` namespace
     - TLS secrets `openshell-server-tls` and `openshell-client-tls` exist in `openshell` namespace
+    - Sandbox supervisor binary exists at `/opt/openshell/bin/openshell-sandbox` (emits `HEALTHCHECK_MISSING_SUPERVISOR` marker if absent)
 
 For local deploys, metadata endpoint selection now depends on Docker connectivity:
 
@@ -311,6 +312,8 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w
 | `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component |
 | Stale NotReady nodes from previous deploys | Volume reused across container recreations | The deploy flow now auto-cleans stale nodes; if it still fails, manually delete NotReady nodes (see Step 2) or choose "Recreate" when prompted |
 | gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` |
+| Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the `supervisor-builder` stage. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker |
+| `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy <name> && openshell gateway start` |
 
 ## Full Diagnostic Dump
 
@@ -359,6 +362,9 @@ openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-op
 echo "=== Registry Configuration ==="
 openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml
 
+echo "=== Supervisor Binary ==="
+openshell doctor exec -- ls -la /opt/openshell/bin/openshell-sandbox
+
 echo "=== DNS Configuration ==="
 openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf
 ```
diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs
index 40aac1a74..39088ffb9 100644
--- a/crates/openshell-bootstrap/src/errors.rs
+++ b/crates/openshell-bootstrap/src/errors.rs
@@ -139,6 +139,12 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
         match_mode: MatchMode::Any,
         diagnose: diagnose_node_pressure,
     },
+    // Missing sandbox supervisor binary
+    FailurePattern {
+        matchers: &["HEALTHCHECK_MISSING_SUPERVISOR"],
+        match_mode: MatchMode::Any,
+        diagnose: diagnose_missing_supervisor,
+    },
     // TLS/certificate issues
     FailurePattern {
         matchers: &[
@@ -342,6 +348,33 @@ fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis {
     }
 }
 
+fn diagnose_missing_supervisor(gateway_name: &str) -> GatewayFailureDiagnosis {
+    GatewayFailureDiagnosis {
+        summary: "Sandbox supervisor binary missing from cluster image".to_string(),
+        explanation: "The sandbox supervisor binary (/opt/openshell/bin/openshell-sandbox) \
+            was not found in the gateway container. This binary is side-loaded into every \
+            sandbox pod via a hostPath volume mount. Without it, all sandbox pods will \
+            crash immediately with \"no such file or directory\". This typically means the \
+            cluster image was built or published without the supervisor-builder stage."
+            .to_string(),
+        recovery_steps: vec![
+            RecoveryStep::with_command(
+                "Rebuild the cluster image with the supervisor binary included",
+                "mise run docker:build:cluster",
+            ),
+            RecoveryStep::with_command(
+                "Destroy and recreate the gateway with the updated image",
+                format!("openshell gateway destroy {gateway_name} && openshell gateway start"),
+            ),
+            RecoveryStep::new(
+                "Or set OPENSHELL_CLUSTER_IMAGE to a cluster image version that includes \
+                the supervisor binary",
+            ),
+        ],
+        retryable: false,
+    }
+}
+
 fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
     GatewayFailureDiagnosis {
         summary: "TLS certificate issue".to_string(),
diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs
index 0bc893d84..271fde8d4 100644
--- a/crates/openshell-bootstrap/src/runtime.rs
+++ b/crates/openshell-bootstrap/src/runtime.rs
@@ -24,6 +24,13 @@ const DNS_FAILURE_MARKERS: &[&str] = &["DNS_PROBE_FAILED", "HEALTHCHECK_DNS_FAIL
 /// new scheduling, so the cluster will never become healthy on its own.
 const NODE_PRESSURE_MARKER: &str = "HEALTHCHECK_NODE_PRESSURE";
 
+/// Log marker emitted by the health-check script when the sandbox supervisor
+/// binary (`/opt/openshell/bin/openshell-sandbox`) is missing from the node
+/// filesystem. Without this binary, every sandbox pod will crash immediately
+/// with "no such file or directory". This is a permanent error that requires
+/// rebuilding or updating the cluster image.
+const MISSING_SUPERVISOR_MARKER: &str = "HEALTHCHECK_MISSING_SUPERVISOR";
+
 /// Number of consecutive polling iterations that must observe DNS failure
 /// markers before we treat the failure as persistent and abort. A small
 /// grace period avoids false positives on transient hiccups during startup.
@@ -116,6 +123,29 @@ where
             }
         }
 
+        // -- Missing supervisor binary detection ----------------------------
+        // The health-check script verifies that /opt/openshell/bin/openshell-sandbox
+        // exists on the node filesystem. If missing, every sandbox pod will crash.
+        // This is a permanent error — fail immediately with actionable guidance.
+        if recent_logs
+            .iter()
+            .any(|line| line.contains(MISSING_SUPERVISOR_MARKER))
+        {
+            result = Some(Err(miette::miette!(
+                "The sandbox supervisor binary is missing from the cluster image.\n\
+                 The file /opt/openshell/bin/openshell-sandbox was not found in the gateway \
+                 container. Without it, sandbox pods cannot start.\n\n\
+                 This usually means the cluster image was built or published without the \
+                 supervisor-builder stage.\n\n\
+                 To fix:\n  \
+                 1. Rebuild the cluster image: mise run docker:build:cluster\n  \
+                 2. Or update to a cluster image that includes the supervisor binary\n  \
+                 3. Then recreate the gateway: openshell gateway destroy && openshell gateway start\n\n{}",
+                format_recent_logs(&recent_logs)
+            )));
+            break;
+        }
+
         let inspect = docker
             .inspect_container(&container_name, None::<InspectContainerOptions>)
             .await
diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh
index a3b373030..68210b456 100644
--- a/deploy/docker/cluster-healthcheck.sh
+++ b/deploy/docker/cluster-healthcheck.sh
@@ -50,6 +50,18 @@ done
 kubectl -n openshell get statefulset/openshell >/dev/null 2>&1 || exit 1
 kubectl -n openshell wait --for=jsonpath='{.status.readyReplicas}'=1 statefulset/openshell --timeout=1s >/dev/null 2>&1 || exit 1
 
+# ---------------------------------------------------------------------------
+# Verify the sandbox supervisor binary exists on the node filesystem.
+# Sandbox pods mount /opt/openshell/bin as a read-only hostPath volume and
+# exec /opt/openshell/bin/openshell-sandbox as their entrypoint. If the binary
+# is missing (e.g. cluster image was built without the supervisor-builder
+# stage), every sandbox pod will crash with "no such file or directory".
+# ---------------------------------------------------------------------------
+if [ ! -x /opt/openshell/bin/openshell-sandbox ]; then
+    echo "HEALTHCHECK_MISSING_SUPERVISOR: /opt/openshell/bin/openshell-sandbox not found" >&2
+    exit 1
+fi
+
 # Verify TLS secrets exist (created by openshell-bootstrap before the StatefulSet starts)
 # Skip when TLS is disabled — secrets are not required.
 if [ "${DISABLE_TLS:-}" != "true" ]; then

From 20bcf3bf10e9dd8251d7b8684f6b5903fc6dcd42 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Fri, 13 Mar 2026 07:53:00 -0700
Subject: [PATCH 2/5] fix(sandbox): increase provisioning timeout from 120s to
 300s

Large sandbox images (e.g. 852MB base image) can take ~3 minutes to
pull, exceeding the previous 120s timeout. Bump to 300s across all
surfaces: CLI watch loop, server orphan grace period, TUI ready poll,
Python SDK wait_ready default, and E2E test harness.
---
 crates/openshell-cli/src/run.rs            | 2 +-
 crates/openshell-server/src/sandbox/mod.rs | 2 +-
 crates/openshell-tui/src/lib.rs            | 2 +-
 e2e/rust/src/harness/sandbox.rs            | 2 +-
 python/openshell/sandbox.py                | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index 12b891da1..9e62e6455 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -1993,7 +1993,7 @@ pub async fn sandbox_create(
     // Track whether we have seen a non-Ready phase during the watch.
     let mut saw_non_ready = SandboxPhase::try_from(sandbox.phase) != Ok(SandboxPhase::Ready);
     let start_time = Instant::now();
-    let provision_timeout = Duration::from_secs(120);
+    let provision_timeout = Duration::from_secs(300);
     // Track whether we saw the gateway become ready (from log messages).
     let mut saw_gateway_ready = false;
 
diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs
index 2fd9b54b6..f8ab6a6dd 100644
--- a/crates/openshell-server/src/sandbox/mod.rs
+++ b/crates/openshell-server/src/sandbox/mod.rs
@@ -381,7 +381,7 @@ const RECONCILE_INTERVAL: Duration = Duration::from_secs(60);
 /// How long a sandbox can stay in `Provisioning` in the store without a
 /// corresponding Kubernetes resource before it is considered orphaned and
 /// removed.
-const ORPHAN_GRACE_PERIOD: Duration = Duration::from_secs(120);
+const ORPHAN_GRACE_PERIOD: Duration = Duration::from_secs(300);
 
 /// Periodically reconcile the store against Kubernetes to clean up orphaned
 /// sandbox records.  A record is orphaned when it exists in the store but
diff --git a/crates/openshell-tui/src/lib.rs b/crates/openshell-tui/src/lib.rs
index 9255ae0b2..98f4ed0d6 100644
--- a/crates/openshell-tui/src/lib.rs
+++ b/crates/openshell-tui/src/lib.rs
@@ -1256,7 +1256,7 @@ fn spawn_create_sandbox(app: &mut App, tx: mpsc::UnboundedSender<Event>) {
             let mut attempts = 0;
             let sandbox_id = loop {
                 attempts += 1;
-                if attempts > 60 {
+                if attempts > 150 {
                     let _ = tx.send(Event::CreateResult(Err(
                         "timed out waiting for sandbox to be ready".to_string(),
                     )));
diff --git a/e2e/rust/src/harness/sandbox.rs b/e2e/rust/src/harness/sandbox.rs
index da48c5ed2..7a942265d 100644
--- a/e2e/rust/src/harness/sandbox.rs
+++ b/e2e/rust/src/harness/sandbox.rs
@@ -25,7 +25,7 @@ fn extract_sandbox_name(output: &str) -> Option<String> {
 }
 
 /// Default timeout for waiting for a sandbox to become ready.
-const SANDBOX_READY_TIMEOUT: Duration = Duration::from_secs(120);
+const SANDBOX_READY_TIMEOUT: Duration = Duration::from_secs(300);
 
 /// RAII guard that deletes a sandbox on drop.
 ///
diff --git a/python/openshell/sandbox.py b/python/openshell/sandbox.py
index 6507c9a45..7b48ab3b9 100644
--- a/python/openshell/sandbox.py
+++ b/python/openshell/sandbox.py
@@ -248,7 +248,7 @@ def wait_deleted(self, sandbox_name: str, *, timeout_seconds: float = 60.0) -> N
         raise SandboxError(f"sandbox {sandbox_name} was not deleted within timeout")
 
     def wait_ready(
-        self, sandbox_name: str, *, timeout_seconds: float = 120.0
+        self, sandbox_name: str, *, timeout_seconds: float = 300.0
     ) -> SandboxRef:
         deadline = time.time() + timeout_seconds
         while time.time() < deadline:

From 9c392678ebc2731a3e945e8e9460e84f8ae10215 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Fri, 13 Mar 2026 13:26:57 -0700
Subject: [PATCH 3/5] fix(bootstrap): consolidate gateway teardown and remove
 unused Docker network
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Teardown of existing gateway resources (container, volume, image) is now
performed inside deploy_gateway_with_logs() so both 'gateway start' and
the auto-bootstrap path in 'sandbox create' get identical cleanup.

Remove the dedicated 'openshell-cluster' bridge network — it was only
used by a single container and the default Docker bridge is sufficient.
This eliminates the network create/remove retry loops and stale endpoint
cleanup that added complexity without benefit.

Fix the provisioning stream timeout in run.rs: wrap stream.next() in
tokio::time::timeout() so the 300s deadline fires even when the gRPC
stream stops producing events.
---
 crates/openshell-bootstrap/src/constants.rs |   2 -
 crates/openshell-bootstrap/src/docker.rs    | 110 +-------------------
 crates/openshell-bootstrap/src/lib.rs       |  15 ++-
 crates/openshell-cli/src/main.rs            |   2 +-
 crates/openshell-cli/src/run.rs             | 108 +++++--------------
 5 files changed, 42 insertions(+), 195 deletions(-)

diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs
index 76f5d2f3f..0c6a93821 100644
--- a/crates/openshell-bootstrap/src/constants.rs
+++ b/crates/openshell-bootstrap/src/constants.rs
@@ -1,8 +1,6 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-pub const NETWORK_NAME: &str = "openshell-cluster";
-
 /// Path to the kubeconfig inside the k3s container.
 /// Used by in-container kubectl operations (node cleanup, PKI reconciliation, etc.).
 pub const KUBECONFIG_PATH: &str = "/etc/rancher/k3s/k3s.yaml";
diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs
index c44197bfb..6a2064e91 100644
--- a/crates/openshell-bootstrap/src/docker.rs
+++ b/crates/openshell-bootstrap/src/docker.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use crate::RemoteOptions;
-use crate::constants::{NETWORK_NAME, container_name, volume_name};
+use crate::constants::{container_name, volume_name};
 use crate::image::{
     self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, DEFAULT_REGISTRY_USERNAME, parse_image_ref,
 };
@@ -10,11 +10,10 @@ use bollard::API_DEFAULT_VERSION;
 use bollard::Docker;
 use bollard::errors::Error as BollardError;
 use bollard::models::{
-    ContainerCreateBody, DeviceRequest, HostConfig, NetworkCreateRequest, NetworkDisconnectRequest,
-    PortBinding, VolumeCreateRequest,
+    ContainerCreateBody, DeviceRequest, HostConfig, PortBinding, VolumeCreateRequest,
 };
 use bollard::query_parameters::{
-    CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions,
+    CreateContainerOptions, CreateImageOptions, InspectContainerOptions,
     ListContainersOptionsBuilder, RemoveContainerOptions, RemoveImageOptions, RemoveVolumeOptions,
     StartContainerOptions,
 };
@@ -186,53 +185,6 @@ pub async fn find_gateway_container(docker: &Docker, port: Option<u16>) -> Resul
     }
 }
 
-pub async fn ensure_network(docker: &Docker) -> Result<()> {
-    // Always remove and recreate the network to guarantee a clean state.
-    // Stale Docker networks (e.g., from a previous interrupted destroy or
-    // Docker Desktop restart) can leave broken routing that causes the
-    // container to fail with "no default routes found".
-    force_remove_network(docker).await?;
-
-    // Docker may return a 409 conflict if the previous network teardown has
-    // not fully completed in the daemon.  Retry a few times with back-off,
-    // re-attempting the removal before each create.
-    let mut last_err = None;
-    for attempt in 0u64..5 {
-        if attempt > 0 {
-            tokio::time::sleep(std::time::Duration::from_millis(500 * attempt)).await;
-            // Re-attempt removal in case the previous teardown has now settled.
-            force_remove_network(docker).await?;
-        }
-        match docker
-            .create_network(NetworkCreateRequest {
-                name: NETWORK_NAME.to_string(),
-                driver: Some("bridge".to_string()),
-                attachable: Some(true),
-                ..Default::default()
-            })
-            .await
-        {
-            Ok(_) => return Ok(()),
-            Err(err) if is_conflict(&err) => {
-                tracing::debug!(
-                    "Network create conflict (attempt {}/5), retrying: {}",
-                    attempt + 1,
-                    err,
-                );
-                last_err = Some(err);
-            }
-            Err(err) => {
-                return Err(err)
-                    .into_diagnostic()
-                    .wrap_err("failed to create Docker network");
-            }
-        }
-    }
-    Err(last_err.expect("at least one retry attempt"))
-        .into_diagnostic()
-        .wrap_err("failed to create Docker network after retries (network still in use)")
-}
-
 pub async fn ensure_volume(docker: &Docker, name: &str) -> Result<()> {
     match docker.inspect_volume(name).await {
         Ok(_) => return Ok(()),
@@ -376,7 +328,6 @@ pub async fn ensure_container(
         privileged: Some(true),
         port_bindings: Some(port_bindings),
         binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]),
-        network_mode: Some(NETWORK_NAME.to_string()),
         // Add host.docker.internal mapping for DNS resolution
         // This allows the entrypoint script to configure CoreDNS to use the host gateway
         extra_hosts: Some(vec!["host.docker.internal:host-gateway".to_string()]),
@@ -678,20 +629,6 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
         .ok()
         .and_then(|info| info.image);
 
-    // Explicitly disconnect the container from the cluster network before
-    // removing it.  This ensures Docker tears down the network endpoint
-    // synchronously so port bindings are released immediately and the
-    // subsequent network cleanup sees zero connected containers.
-    let _ = docker
-        .disconnect_network(
-            NETWORK_NAME,
-            NetworkDisconnectRequest {
-                container: container_name.clone(),
-                force: Some(true),
-            },
-        )
-        .await;
-
     let _ = stop_container(docker, &container_name).await;
 
     let remove_container = docker
@@ -763,50 +700,9 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
         return Err(err).into_diagnostic();
     }
 
-    // Force-remove the network during a full destroy.  First disconnect any
-    // stale endpoints that Docker may still report (race between container
-    // removal and network bookkeeping), then remove the network itself.
-    force_remove_network(docker).await?;
     Ok(())
 }
 
-/// Forcefully remove the gateway network, disconnecting any remaining
-/// containers first.  This ensures that stale Docker network endpoints
-/// cannot prevent port bindings from being released.
-async fn force_remove_network(docker: &Docker) -> Result<()> {
-    let network = match docker
-        .inspect_network(NETWORK_NAME, None::<InspectNetworkOptions>)
-        .await
-    {
-        Ok(info) => info,
-        Err(err) if is_not_found(&err) => return Ok(()),
-        Err(err) => return Err(err).into_diagnostic(),
-    };
-
-    // Disconnect any containers still attached to the network.
-    if let Some(containers) = network.containers {
-        for (id, _) in containers {
-            let _ = docker
-                .disconnect_network(
-                    NETWORK_NAME,
-                    NetworkDisconnectRequest {
-                        container: id,
-                        force: Some(true),
-                    },
-                )
-                .await;
-        }
-    }
-
-    match docker.remove_network(NETWORK_NAME).await {
-        Ok(()) => Ok(()),
-        Err(err) if is_not_found(&err) => Ok(()),
-        Err(err) => Err(err)
-            .into_diagnostic()
-            .wrap_err("failed to remove Docker network"),
-    }
-}
-
 fn is_not_found(err: &BollardError) -> bool {
     matches!(
         err,
diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs
index b8c0cdddd..4f347eb7f 100644
--- a/crates/openshell-bootstrap/src/lib.rs
+++ b/crates/openshell-bootstrap/src/lib.rs
@@ -30,7 +30,7 @@ use crate::constants::{
 };
 use crate::docker::{
     check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container,
-    ensure_image, ensure_network, ensure_volume, start_container, stop_container,
+    ensure_image, ensure_volume, start_container, stop_container,
 };
 use crate::metadata::{
     create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host,
@@ -256,6 +256,18 @@ where
         ),
     };
 
+    // Tear down any existing gateway resources (container, volume, image) so
+    // we always start from a clean state.  This is the single consolidated
+    // teardown point — both `gateway start` and the auto-bootstrap path in
+    // `sandbox create` flow through here.
+    if check_existing_gateway(&target_docker, &name)
+        .await?
+        .is_some()
+    {
+        log("[status] Removing existing gateway".to_string());
+        destroy_gateway_resources(&target_docker, &name).await?;
+    }
+
     // Ensure the image is available on the target Docker daemon
     if remote_opts.is_some() {
         log("[status] Downloading gateway".to_string());
@@ -280,7 +292,6 @@ where
 
     // All subsequent operations use the target Docker (remote or local)
     log("[status] Initializing environment".to_string());
-    ensure_network(&target_docker).await?;
     ensure_volume(&target_docker, &volume_name(&name)).await?;
 
     // Compute extra TLS SANs for remote deployments so the gateway and k3s
diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs
index 5258f18a7..84f7180de 100644
--- a/crates/openshell-cli/src/main.rs
+++ b/crates/openshell-cli/src/main.rs
@@ -1420,13 +1420,13 @@ async fn main() -> Result<()> {
                 registry_token,
                 gpu,
             } => {
+                let _ = recreate; // teardown is always performed by deploy
                 run::gateway_admin_deploy(
                     &name,
                     remote.as_deref(),
                     ssh_key.as_deref(),
                     port,
                     gateway_host.as_deref(),
-                    recreate,
                     plaintext,
                     disable_gateway_auth,
                     registry_token.as_deref(),
diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index 9e62e6455..c072f58c1 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -1205,40 +1205,6 @@ async fn http_health_check(server: &str, tls: &TlsOptions) -> Result<Option<Stat
     Ok(Some(resp.status()))
 }
 
-/// Prompt the user to choose how to handle an existing gateway deployment.
-///
-/// Returns `true` to recreate (destroy and start fresh), `false` to reuse.
-fn prompt_existing_gateway(
-    name: &str,
-    info: &openshell_bootstrap::ExistingGatewayInfo,
-) -> Result<bool> {
-    let status = if info.container_running {
-        "running"
-    } else if info.container_exists {
-        "stopped"
-    } else {
-        "volume only"
-    };
-
-    eprintln!("• Existing gateway '{name}' detected ({status})");
-    if let Some(image) = &info.container_image {
-        eprintln!("  {} {}", "Image:".dimmed(), image);
-    }
-    eprintln!();
-
-    eprint!("Destroy and recreate gateway from scratch? [y/N] ");
-    std::io::stderr().flush().ok();
-
-    let mut input = String::new();
-    std::io::stdin()
-        .read_line(&mut input)
-        .into_diagnostic()
-        .wrap_err("failed to read user input")?;
-
-    let choice = input.trim().to_lowercase();
-    Ok(choice == "y" || choice == "yes")
-}
-
 /// Deploy a gateway with the rich progress panel (interactive) or simple
 /// logging (non-interactive). Returns the [`GatewayHandle`] on success.
 ///
@@ -1346,7 +1312,6 @@ pub async fn gateway_admin_deploy(
     ssh_key: Option<&str>,
     port: u16,
     gateway_host: Option<&str>,
-    recreate: bool,
     disable_tls: bool,
     disable_gateway_auth: bool,
     registry_token: Option<&str>,
@@ -1373,41 +1338,9 @@ pub async fn gateway_admin_deploy(
         options = options.with_registry_token(token);
     }
 
-    let interactive = std::io::stderr().is_terminal();
-
-    // Check for existing gateway and prompt user if found.
-    // --recreate skips the prompt and always destroys.
-    {
-        let remote_opts = remote.map(|dest| {
-            let mut opts = RemoteOptions::new(dest);
-            if let Some(key) = ssh_key {
-                opts = opts.with_ssh_key(key);
-            }
-            opts
-        });
-        if let Some(info) =
-            openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await?
-        {
-            let should_recreate = if recreate {
-                true
-            } else if interactive {
-                prompt_existing_gateway(name, &info)?
-            } else {
-                false // non-interactive without --recreate: silently reuse
-            };
-
-            if should_recreate {
-                eprintln!("• Destroying existing gateway...");
-                let handle =
-                    openshell_bootstrap::gateway_handle(name, remote_opts.as_ref()).await?;
-                handle.destroy().await?;
-                eprintln!("{} Gateway destroyed, starting fresh.", "✓".green().bold());
-                eprintln!();
-            }
-            // If reusing, the deploy flow will handle stale node cleanup automatically
-        }
-    }
-
+    // Teardown of any existing resources is handled inside
+    // deploy_gateway_with_logs(), so both `gateway start` and the
+    // auto-bootstrap path in `sandbox create` get the same cleanup.
     let handle = deploy_gateway_with_panel(options, name, location).await?;
 
     print_deploy_summary(name, &handle);
@@ -1997,20 +1930,29 @@ pub async fn sandbox_create(
     // Track whether we saw the gateway become ready (from log messages).
     let mut saw_gateway_ready = false;
 
-    while let Some(item) = stream.next().await {
-        // Check for timeout
-        if start_time.elapsed() > provision_timeout {
-            let timeout_message = provisioning_timeout_message(
-                provision_timeout.as_secs(),
-                requested_gpu,
-                last_condition_message.as_deref(),
-            );
-            if let Some(d) = display.as_mut() {
-                d.finish_error(&timeout_message);
+    loop {
+        // Compute remaining time so the timeout fires even when the stream
+        // produces no events (e.g. server-side producer died).
+        let remaining = provision_timeout.saturating_sub(start_time.elapsed());
+        let maybe_item = tokio::time::timeout(remaining, stream.next()).await;
+
+        let item = match maybe_item {
+            Ok(Some(item)) => item,
+            Ok(None) => break, // stream ended
+            Err(_elapsed) => {
+                // Timeout fired — the stream was idle for too long.
+                let timeout_message = provisioning_timeout_message(
+                    provision_timeout.as_secs(),
+                    requested_gpu,
+                    last_condition_message.as_deref(),
+                );
+                if let Some(d) = display.as_mut() {
+                    d.finish_error(&timeout_message);
+                }
+                println!();
+                return Err(miette::miette!(timeout_message));
             }
-            println!();
-            return Err(miette::miette!(timeout_message));
-        }
+        };
 
         let evt = item.into_diagnostic()?;
         match evt.payload {

From 5162fd16e01bd86815154ad07072051c9913e10b Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Fri, 13 Mar 2026 13:46:06 -0700
Subject: [PATCH 4/5] fix(bootstrap): restore interactive prompt before
 destroying existing gateway
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of unconditionally tearing down existing gateway resources on
every `gateway start`, restore the interactive prompt that asks the
user whether to destroy and recreate. When --recreate is passed, the
prompt is skipped and resources are destroyed directly. In
non-interactive mode, the existing gateway is reused silently.

The deploy_gateway_with_logs function now respects a `recreate` field
on DeployOptions — only destroying Docker resources (container, image,
volume) when explicitly requested. The auto-bootstrap path sets
recreate=true to handle stale Docker resources without metadata.
---
 crates/openshell-bootstrap/src/lib.rs | 37 ++++++++++++-----
 crates/openshell-cli/src/bootstrap.rs |  4 +-
 crates/openshell-cli/src/main.rs      |  6 +--
 crates/openshell-cli/src/run.rs       | 60 ++++++++++++++++++++++-----
 4 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs
index 4f347eb7f..237d65f2c 100644
--- a/crates/openshell-bootstrap/src/lib.rs
+++ b/crates/openshell-bootstrap/src/lib.rs
@@ -107,6 +107,10 @@ pub struct DeployOptions {
     /// created with GPU device requests (`--gpus all`) and the NVIDIA
     /// k8s-device-plugin is deployed inside the k3s cluster.
     pub gpu: bool,
+    /// When true, destroy any existing gateway resources before deploying.
+    /// When false, an existing gateway is left as-is and deployment is
+    /// skipped (the caller is responsible for prompting the user first).
+    pub recreate: bool,
 }
 
 impl DeployOptions {
@@ -121,6 +125,7 @@ impl DeployOptions {
             disable_gateway_auth: false,
             registry_token: None,
             gpu: false,
+            recreate: false,
         }
     }
 
@@ -172,6 +177,13 @@ impl DeployOptions {
         self.gpu = gpu;
         self
     }
+
+    /// Set whether to destroy and recreate existing gateway resources.
+    #[must_use]
+    pub fn with_recreate(mut self, recreate: bool) -> Self {
+        self.recreate = recreate;
+        self
+    }
 }
 
 #[derive(Debug, Clone)]
@@ -232,6 +244,7 @@ where
     let disable_gateway_auth = options.disable_gateway_auth;
     let registry_token = options.registry_token;
     let gpu = options.gpu;
+    let recreate = options.recreate;
 
     // Wrap on_log in Arc<Mutex<>> so we can share it with pull_remote_image
     // which needs a 'static callback for the bollard streaming pull.
@@ -256,16 +269,20 @@ where
         ),
     };
 
-    // Tear down any existing gateway resources (container, volume, image) so
-    // we always start from a clean state.  This is the single consolidated
-    // teardown point — both `gateway start` and the auto-bootstrap path in
-    // `sandbox create` flow through here.
-    if check_existing_gateway(&target_docker, &name)
-        .await?
-        .is_some()
-    {
-        log("[status] Removing existing gateway".to_string());
-        destroy_gateway_resources(&target_docker, &name).await?;
+    // If an existing gateway is found, either tear it down (when recreate is
+    // requested) or bail out so the caller can prompt the user / reuse it.
+    if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
+        if recreate {
+            log("[status] Removing existing gateway".to_string());
+            destroy_gateway_resources(&target_docker, &name).await?;
+        } else {
+            return Err(miette::miette!(
+                "Gateway '{name}' already exists (container_running={}).\n\
+                 Use --recreate to destroy and redeploy, or destroy it first with:\n\n    \
+                 openshell gateway destroy {name}",
+                existing.container_running,
+            ));
+        }
     }
 
     // Ensure the image is available on the target Docker daemon
diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs
index 2caa20b96..ca81404f8 100644
--- a/crates/openshell-cli/src/bootstrap.rs
+++ b/crates/openshell-cli/src/bootstrap.rs
@@ -144,7 +144,9 @@ pub async fn run_bootstrap(
     );
     eprintln!();
 
-    let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name);
+    // Auto-bootstrap always recreates if stale Docker resources are found
+    // (e.g. metadata was deleted but container/volume still exist).
+    let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true);
     if let Some(dest) = remote {
         let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest);
         if let Some(key) = ssh_key {
diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs
index 84f7180de..f21ad7595 100644
--- a/crates/openshell-cli/src/main.rs
+++ b/crates/openshell-cli/src/main.rs
@@ -740,8 +740,8 @@ enum GatewayCommands {
 
         /// Destroy and recreate the gateway from scratch if one already exists.
         ///
-        /// Without this flag, an interactive prompt asks what to do; in
-        /// non-interactive mode the existing gateway is reused silently.
+        /// Without this flag, an interactive prompt asks whether to recreate;
+        /// in non-interactive mode the existing gateway is reused silently.
         #[arg(long)]
         recreate: bool,
 
@@ -1420,13 +1420,13 @@ async fn main() -> Result<()> {
                 registry_token,
                 gpu,
             } => {
-                let _ = recreate; // teardown is always performed by deploy
                 run::gateway_admin_deploy(
                     &name,
                     remote.as_deref(),
                     ssh_key.as_deref(),
                     port,
                     gateway_host.as_deref(),
+                    recreate,
                     plaintext,
                     disable_gateway_auth,
                     registry_token.as_deref(),
diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index c072f58c1..002596f8c 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -1312,6 +1312,7 @@ pub async fn gateway_admin_deploy(
     ssh_key: Option<&str>,
     port: u16,
     gateway_host: Option<&str>,
+    recreate: bool,
     disable_tls: bool,
     disable_gateway_auth: bool,
     registry_token: Option<&str>,
@@ -1319,17 +1320,59 @@ pub async fn gateway_admin_deploy(
 ) -> Result<()> {
     let location = if remote.is_some() { "remote" } else { "local" };
 
+    // Build remote options once so we can reuse them for the existence check
+    // and the deploy options.
+    let remote_opts = remote.map(|dest| {
+        let mut opts = RemoteOptions::new(dest);
+        if let Some(key) = ssh_key {
+            opts = opts.with_ssh_key(key);
+        }
+        opts
+    });
+
+    // Check whether a gateway already exists. If so, prompt the user (unless
+    // --recreate was passed or we're in non-interactive mode).
+    let mut should_recreate = recreate;
+    if let Some(existing) =
+        openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await?
+    {
+        if !should_recreate {
+            let interactive = std::io::stderr().is_terminal();
+            if interactive {
+                let status = if existing.container_running {
+                    "running"
+                } else {
+                    "stopped"
+                };
+                eprintln!(
+                    "{} Gateway '{name}' already exists ({status}).",
+                    "!".yellow().bold()
+                );
+                should_recreate = Confirm::new()
+                    .with_prompt("Destroy and recreate it?")
+                    .default(false)
+                    .interact()
+                    .into_diagnostic()?;
+                if !should_recreate {
+                    eprintln!("Keeping existing gateway.");
+                    return Ok(());
+                }
+            } else {
+                // Non-interactive mode: reuse existing gateway silently.
+                eprintln!("Gateway '{name}' already exists, reusing.");
+                return Ok(());
+            }
+        }
+    }
+
     let mut options = DeployOptions::new(name)
         .with_port(port)
         .with_disable_tls(disable_tls)
         .with_disable_gateway_auth(disable_gateway_auth)
-        .with_gpu(gpu);
-    if let Some(dest) = remote {
-        let mut remote_opts = RemoteOptions::new(dest);
-        if let Some(key) = ssh_key {
-            remote_opts = remote_opts.with_ssh_key(key);
-        }
-        options = options.with_remote(remote_opts);
+        .with_gpu(gpu)
+        .with_recreate(should_recreate);
+    if let Some(opts) = remote_opts {
+        options = options.with_remote(opts);
     }
     if let Some(host) = gateway_host {
         options = options.with_gateway_host(host);
@@ -1338,9 +1381,6 @@ pub async fn gateway_admin_deploy(
         options = options.with_registry_token(token);
     }
 
-    // Teardown of any existing resources is handled inside
-    // deploy_gateway_with_logs(), so both `gateway start` and the
-    // auto-bootstrap path in `sandbox create` get the same cleanup.
     let handle = deploy_gateway_with_panel(options, name, location).await?;
 
     print_deploy_summary(name, &handle);

From a27aaf4342d164afb48f36673e1815e1e47ae0da Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Fri, 13 Mar 2026 13:53:39 -0700
Subject: [PATCH 5/5] wip

---
 tasks/scripts/cluster-deploy-fast.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh
index 42c4c343f..348640d52 100755
--- a/tasks/scripts/cluster-deploy-fast.sh
+++ b/tasks/scripts/cluster-deploy-fast.sh
@@ -131,12 +131,14 @@ if [[ -f "${DEPLOY_FAST_STATE_FILE}" ]]; then
     previous_helm_fingerprint=""
   fi
 
-  # Invalidate all previous fingerprints when the cluster container has
+  # Invalidate gateway and helm fingerprints when the cluster container has
   # changed (recreated or replaced).  The new k3s instance has no pushed
-  # images so everything must be rebuilt.
+  # images so the gateway must be rebuilt and helm must be re-applied.
+  # The supervisor is NOT invalidated here because it is already built into
+  # the cluster image — a fresh cluster already has the correct supervisor
+  # binary, so rebuilding it would be redundant.
   if [[ -n "${current_container_id}" && "${current_container_id}" != "${previous_container_id:-}" ]]; then
     previous_gateway_fingerprint=""
-    previous_supervisor_fingerprint=""
     previous_helm_fingerprint=""
   fi
 fi