From b69c274beceeedde0b7b355046a32cf3b745d143 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 13 Mar 2026 07:47:21 -0700 Subject: [PATCH 1/5] fix(bootstrap): detect missing sandbox supervisor binary during gateway health check The cluster health check now verifies that /opt/openshell/bin/openshell-sandbox exists in the gateway container. Without this binary, every sandbox pod crashes with 'no such file or directory' but the gateway previously reported healthy. - Add HEALTHCHECK_MISSING_SUPERVISOR marker to cluster-healthcheck.sh - Add early detection in bootstrap polling loop (runtime.rs) - Add structured error diagnosis with recovery steps (errors.rs) - Update debug-openshell-cluster skill with new failure pattern --- .../skills/debug-openshell-cluster/SKILL.md | 6 ++++ crates/openshell-bootstrap/src/errors.rs | 33 +++++++++++++++++++ crates/openshell-bootstrap/src/runtime.rs | 30 +++++++++++++++++ deploy/docker/cluster-healthcheck.sh | 12 +++++++ 4 files changed, 81 insertions(+) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 2d46ea550..81b1b53e5 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -26,6 +26,7 @@ Use **only** `openshell` CLI commands (`openshell status`, `openshell doctor log - k3s API server readiness (`/readyz`) - `openshell` statefulset ready in `openshell` namespace - TLS secrets `openshell-server-tls` and `openshell-client-tls` exist in `openshell` namespace + - Sandbox supervisor binary exists at `/opt/openshell/bin/openshell-sandbox` (emits `HEALTHCHECK_MISSING_SUPERVISOR` marker if absent) For local deploys, metadata endpoint selection now depends on Docker connectivity: @@ -311,6 +312,8 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w | `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component | | Stale NotReady nodes from previous deploys | Volume reused across container recreations | The deploy flow now auto-cleans stale nodes; if it still fails, manually delete NotReady nodes (see Step 2) or choose "Recreate" when prompted | | gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` | +| Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the `supervisor-builder` stage. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker | +| `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy && openshell gateway start` | ## Full Diagnostic Dump @@ -359,6 +362,9 @@ openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-op echo "=== Registry Configuration ===" openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml +echo "=== Supervisor Binary ===" +openshell doctor exec -- ls -la /opt/openshell/bin/openshell-sandbox + echo "=== DNS Configuration ===" openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf ``` diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 40aac1a74..39088ffb9 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -139,6 +139,12 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[ match_mode: MatchMode::Any, diagnose: diagnose_node_pressure, }, + // Missing sandbox supervisor binary + FailurePattern { + matchers: &["HEALTHCHECK_MISSING_SUPERVISOR"], + match_mode: MatchMode::Any, + diagnose: diagnose_missing_supervisor, + }, // TLS/certificate issues FailurePattern { matchers: &[ @@ -342,6 +348,33 @@ fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis { } } +fn diagnose_missing_supervisor(gateway_name: &str) -> GatewayFailureDiagnosis { + GatewayFailureDiagnosis { + summary: "Sandbox supervisor binary missing from cluster image".to_string(), + explanation: "The sandbox supervisor binary (/opt/openshell/bin/openshell-sandbox) \ + was not found in the gateway container. This binary is side-loaded into every \ + sandbox pod via a hostPath volume mount. Without it, all sandbox pods will \ + crash immediately with \"no such file or directory\". This typically means the \ + cluster image was built or published without the supervisor-builder stage." + .to_string(), + recovery_steps: vec![ + RecoveryStep::with_command( + "Rebuild the cluster image with the supervisor binary included", + "mise run docker:build:cluster", + ), + RecoveryStep::with_command( + "Destroy and recreate the gateway with the updated image", + format!("openshell gateway destroy {gateway_name} && openshell gateway start"), + ), + RecoveryStep::new( + "Or set OPENSHELL_CLUSTER_IMAGE to a cluster image version that includes \ + the supervisor binary", + ), + ], + retryable: false, + } +} + fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "TLS certificate issue".to_string(), diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs index 0bc893d84..271fde8d4 100644 --- a/crates/openshell-bootstrap/src/runtime.rs +++ b/crates/openshell-bootstrap/src/runtime.rs @@ -24,6 +24,13 @@ const DNS_FAILURE_MARKERS: &[&str] = &["DNS_PROBE_FAILED", "HEALTHCHECK_DNS_FAIL /// new scheduling, so the cluster will never become healthy on its own. const NODE_PRESSURE_MARKER: &str = "HEALTHCHECK_NODE_PRESSURE"; +/// Log marker emitted by the health-check script when the sandbox supervisor +/// binary (`/opt/openshell/bin/openshell-sandbox`) is missing from the node +/// filesystem. Without this binary, every sandbox pod will crash immediately +/// with "no such file or directory". This is a permanent error that requires +/// rebuilding or updating the cluster image. +const MISSING_SUPERVISOR_MARKER: &str = "HEALTHCHECK_MISSING_SUPERVISOR"; + /// Number of consecutive polling iterations that must observe DNS failure /// markers before we treat the failure as persistent and abort. A small /// grace period avoids false positives on transient hiccups during startup. @@ -116,6 +123,29 @@ where } } + // -- Missing supervisor binary detection ---------------------------- + // The health-check script verifies that /opt/openshell/bin/openshell-sandbox + // exists on the node filesystem. If missing, every sandbox pod will crash. + // This is a permanent error — fail immediately with actionable guidance. + if recent_logs + .iter() + .any(|line| line.contains(MISSING_SUPERVISOR_MARKER)) + { + result = Some(Err(miette::miette!( + "The sandbox supervisor binary is missing from the cluster image.\n\ + The file /opt/openshell/bin/openshell-sandbox was not found in the gateway \ + container. Without it, sandbox pods cannot start.\n\n\ + This usually means the cluster image was built or published without the \ + supervisor-builder stage.\n\n\ + To fix:\n \ + 1. Rebuild the cluster image: mise run docker:build:cluster\n \ + 2. Or update to a cluster image that includes the supervisor binary\n \ + 3. Then recreate the gateway: openshell gateway destroy && openshell gateway start\n\n{}", + format_recent_logs(&recent_logs) + ))); + break; + } + let inspect = docker .inspect_container(&container_name, None::) .await diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh index a3b373030..68210b456 100644 --- a/deploy/docker/cluster-healthcheck.sh +++ b/deploy/docker/cluster-healthcheck.sh @@ -50,6 +50,18 @@ done kubectl -n openshell get statefulset/openshell >/dev/null 2>&1 || exit 1 kubectl -n openshell wait --for=jsonpath='{.status.readyReplicas}'=1 statefulset/openshell --timeout=1s >/dev/null 2>&1 || exit 1 +# --------------------------------------------------------------------------- +# Verify the sandbox supervisor binary exists on the node filesystem. +# Sandbox pods mount /opt/openshell/bin as a read-only hostPath volume and +# exec /opt/openshell/bin/openshell-sandbox as their entrypoint. If the binary +# is missing (e.g. cluster image was built without the supervisor-builder +# stage), every sandbox pod will crash with "no such file or directory". +# --------------------------------------------------------------------------- +if [ ! -x /opt/openshell/bin/openshell-sandbox ]; then + echo "HEALTHCHECK_MISSING_SUPERVISOR: /opt/openshell/bin/openshell-sandbox not found" >&2 + exit 1 +fi + # Verify TLS secrets exist (created by openshell-bootstrap before the StatefulSet starts) # Skip when TLS is disabled — secrets are not required. if [ "${DISABLE_TLS:-}" != "true" ]; then From 20bcf3bf10e9dd8251d7b8684f6b5903fc6dcd42 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 13 Mar 2026 07:53:00 -0700 Subject: [PATCH 2/5] fix(sandbox): increase provisioning timeout from 120s to 300s Large sandbox images (e.g. 852MB base image) can take ~3 minutes to pull, exceeding the previous 120s timeout. Bump to 300s across all surfaces: CLI watch loop, server orphan grace period, TUI ready poll, Python SDK wait_ready default, and E2E test harness. --- crates/openshell-cli/src/run.rs | 2 +- crates/openshell-server/src/sandbox/mod.rs | 2 +- crates/openshell-tui/src/lib.rs | 2 +- e2e/rust/src/harness/sandbox.rs | 2 +- python/openshell/sandbox.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 12b891da1..9e62e6455 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1993,7 +1993,7 @@ pub async fn sandbox_create( // Track whether we have seen a non-Ready phase during the watch. let mut saw_non_ready = SandboxPhase::try_from(sandbox.phase) != Ok(SandboxPhase::Ready); let start_time = Instant::now(); - let provision_timeout = Duration::from_secs(120); + let provision_timeout = Duration::from_secs(300); // Track whether we saw the gateway become ready (from log messages). let mut saw_gateway_ready = false; diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index 2fd9b54b6..f8ab6a6dd 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -381,7 +381,7 @@ const RECONCILE_INTERVAL: Duration = Duration::from_secs(60); /// How long a sandbox can stay in `Provisioning` in the store without a /// corresponding Kubernetes resource before it is considered orphaned and /// removed. -const ORPHAN_GRACE_PERIOD: Duration = Duration::from_secs(120); +const ORPHAN_GRACE_PERIOD: Duration = Duration::from_secs(300); /// Periodically reconcile the store against Kubernetes to clean up orphaned /// sandbox records. A record is orphaned when it exists in the store but diff --git a/crates/openshell-tui/src/lib.rs b/crates/openshell-tui/src/lib.rs index 9255ae0b2..98f4ed0d6 100644 --- a/crates/openshell-tui/src/lib.rs +++ b/crates/openshell-tui/src/lib.rs @@ -1256,7 +1256,7 @@ fn spawn_create_sandbox(app: &mut App, tx: mpsc::UnboundedSender) { let mut attempts = 0; let sandbox_id = loop { attempts += 1; - if attempts > 60 { + if attempts > 150 { let _ = tx.send(Event::CreateResult(Err( "timed out waiting for sandbox to be ready".to_string(), ))); diff --git a/e2e/rust/src/harness/sandbox.rs b/e2e/rust/src/harness/sandbox.rs index da48c5ed2..7a942265d 100644 --- a/e2e/rust/src/harness/sandbox.rs +++ b/e2e/rust/src/harness/sandbox.rs @@ -25,7 +25,7 @@ fn extract_sandbox_name(output: &str) -> Option { } /// Default timeout for waiting for a sandbox to become ready. -const SANDBOX_READY_TIMEOUT: Duration = Duration::from_secs(120); +const SANDBOX_READY_TIMEOUT: Duration = Duration::from_secs(300); /// RAII guard that deletes a sandbox on drop. /// diff --git a/python/openshell/sandbox.py b/python/openshell/sandbox.py index 6507c9a45..7b48ab3b9 100644 --- a/python/openshell/sandbox.py +++ b/python/openshell/sandbox.py @@ -248,7 +248,7 @@ def wait_deleted(self, sandbox_name: str, *, timeout_seconds: float = 60.0) -> N raise SandboxError(f"sandbox {sandbox_name} was not deleted within timeout") def wait_ready( - self, sandbox_name: str, *, timeout_seconds: float = 120.0 + self, sandbox_name: str, *, timeout_seconds: float = 300.0 ) -> SandboxRef: deadline = time.time() + timeout_seconds while time.time() < deadline: From 9c392678ebc2731a3e945e8e9460e84f8ae10215 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 13 Mar 2026 13:26:57 -0700 Subject: [PATCH 3/5] fix(bootstrap): consolidate gateway teardown and remove unused Docker network MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Teardown of existing gateway resources (container, volume, image) is now performed inside deploy_gateway_with_logs() so both 'gateway start' and the auto-bootstrap path in 'sandbox create' get identical cleanup. Remove the dedicated 'openshell-cluster' bridge network — it was only used by a single container and the default Docker bridge is sufficient. This eliminates the network create/remove retry loops and stale endpoint cleanup that added complexity without benefit. Fix the provisioning stream timeout in run.rs: wrap stream.next() in tokio::time::timeout() so the 300s deadline fires even when the gRPC stream stops producing events. --- crates/openshell-bootstrap/src/constants.rs | 2 - crates/openshell-bootstrap/src/docker.rs | 110 +------------------- crates/openshell-bootstrap/src/lib.rs | 15 ++- crates/openshell-cli/src/main.rs | 2 +- crates/openshell-cli/src/run.rs | 108 +++++-------------- 5 files changed, 42 insertions(+), 195 deletions(-) diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs index 76f5d2f3f..0c6a93821 100644 --- a/crates/openshell-bootstrap/src/constants.rs +++ b/crates/openshell-bootstrap/src/constants.rs @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -pub const NETWORK_NAME: &str = "openshell-cluster"; - /// Path to the kubeconfig inside the k3s container. /// Used by in-container kubectl operations (node cleanup, PKI reconciliation, etc.). pub const KUBECONFIG_PATH: &str = "/etc/rancher/k3s/k3s.yaml"; diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index c44197bfb..6a2064e91 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::RemoteOptions; -use crate::constants::{NETWORK_NAME, container_name, volume_name}; +use crate::constants::{container_name, volume_name}; use crate::image::{ self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, DEFAULT_REGISTRY_USERNAME, parse_image_ref, }; @@ -10,11 +10,10 @@ use bollard::API_DEFAULT_VERSION; use bollard::Docker; use bollard::errors::Error as BollardError; use bollard::models::{ - ContainerCreateBody, DeviceRequest, HostConfig, NetworkCreateRequest, NetworkDisconnectRequest, - PortBinding, VolumeCreateRequest, + ContainerCreateBody, DeviceRequest, HostConfig, PortBinding, VolumeCreateRequest, }; use bollard::query_parameters::{ - CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, + CreateContainerOptions, CreateImageOptions, InspectContainerOptions, ListContainersOptionsBuilder, RemoveContainerOptions, RemoveImageOptions, RemoveVolumeOptions, StartContainerOptions, }; @@ -186,53 +185,6 @@ pub async fn find_gateway_container(docker: &Docker, port: Option) -> Resul } } -pub async fn ensure_network(docker: &Docker) -> Result<()> { - // Always remove and recreate the network to guarantee a clean state. - // Stale Docker networks (e.g., from a previous interrupted destroy or - // Docker Desktop restart) can leave broken routing that causes the - // container to fail with "no default routes found". - force_remove_network(docker).await?; - - // Docker may return a 409 conflict if the previous network teardown has - // not fully completed in the daemon. Retry a few times with back-off, - // re-attempting the removal before each create. - let mut last_err = None; - for attempt in 0u64..5 { - if attempt > 0 { - tokio::time::sleep(std::time::Duration::from_millis(500 * attempt)).await; - // Re-attempt removal in case the previous teardown has now settled. - force_remove_network(docker).await?; - } - match docker - .create_network(NetworkCreateRequest { - name: NETWORK_NAME.to_string(), - driver: Some("bridge".to_string()), - attachable: Some(true), - ..Default::default() - }) - .await - { - Ok(_) => return Ok(()), - Err(err) if is_conflict(&err) => { - tracing::debug!( - "Network create conflict (attempt {}/5), retrying: {}", - attempt + 1, - err, - ); - last_err = Some(err); - } - Err(err) => { - return Err(err) - .into_diagnostic() - .wrap_err("failed to create Docker network"); - } - } - } - Err(last_err.expect("at least one retry attempt")) - .into_diagnostic() - .wrap_err("failed to create Docker network after retries (network still in use)") -} - pub async fn ensure_volume(docker: &Docker, name: &str) -> Result<()> { match docker.inspect_volume(name).await { Ok(_) => return Ok(()), @@ -376,7 +328,6 @@ pub async fn ensure_container( privileged: Some(true), port_bindings: Some(port_bindings), binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]), - network_mode: Some(NETWORK_NAME.to_string()), // Add host.docker.internal mapping for DNS resolution // This allows the entrypoint script to configure CoreDNS to use the host gateway extra_hosts: Some(vec!["host.docker.internal:host-gateway".to_string()]), @@ -678,20 +629,6 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() .ok() .and_then(|info| info.image); - // Explicitly disconnect the container from the cluster network before - // removing it. This ensures Docker tears down the network endpoint - // synchronously so port bindings are released immediately and the - // subsequent network cleanup sees zero connected containers. - let _ = docker - .disconnect_network( - NETWORK_NAME, - NetworkDisconnectRequest { - container: container_name.clone(), - force: Some(true), - }, - ) - .await; - let _ = stop_container(docker, &container_name).await; let remove_container = docker @@ -763,50 +700,9 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() return Err(err).into_diagnostic(); } - // Force-remove the network during a full destroy. First disconnect any - // stale endpoints that Docker may still report (race between container - // removal and network bookkeeping), then remove the network itself. - force_remove_network(docker).await?; Ok(()) } -/// Forcefully remove the gateway network, disconnecting any remaining -/// containers first. This ensures that stale Docker network endpoints -/// cannot prevent port bindings from being released. -async fn force_remove_network(docker: &Docker) -> Result<()> { - let network = match docker - .inspect_network(NETWORK_NAME, None::) - .await - { - Ok(info) => info, - Err(err) if is_not_found(&err) => return Ok(()), - Err(err) => return Err(err).into_diagnostic(), - }; - - // Disconnect any containers still attached to the network. - if let Some(containers) = network.containers { - for (id, _) in containers { - let _ = docker - .disconnect_network( - NETWORK_NAME, - NetworkDisconnectRequest { - container: id, - force: Some(true), - }, - ) - .await; - } - } - - match docker.remove_network(NETWORK_NAME).await { - Ok(()) => Ok(()), - Err(err) if is_not_found(&err) => Ok(()), - Err(err) => Err(err) - .into_diagnostic() - .wrap_err("failed to remove Docker network"), - } -} - fn is_not_found(err: &BollardError) -> bool { matches!( err, diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index b8c0cdddd..4f347eb7f 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -30,7 +30,7 @@ use crate::constants::{ }; use crate::docker::{ check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container, - ensure_image, ensure_network, ensure_volume, start_container, stop_container, + ensure_image, ensure_volume, start_container, stop_container, }; use crate::metadata::{ create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host, @@ -256,6 +256,18 @@ where ), }; + // Tear down any existing gateway resources (container, volume, image) so + // we always start from a clean state. This is the single consolidated + // teardown point — both `gateway start` and the auto-bootstrap path in + // `sandbox create` flow through here. + if check_existing_gateway(&target_docker, &name) + .await? + .is_some() + { + log("[status] Removing existing gateway".to_string()); + destroy_gateway_resources(&target_docker, &name).await?; + } + // Ensure the image is available on the target Docker daemon if remote_opts.is_some() { log("[status] Downloading gateway".to_string()); @@ -280,7 +292,6 @@ where // All subsequent operations use the target Docker (remote or local) log("[status] Initializing environment".to_string()); - ensure_network(&target_docker).await?; ensure_volume(&target_docker, &volume_name(&name)).await?; // Compute extra TLS SANs for remote deployments so the gateway and k3s diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 5258f18a7..84f7180de 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1420,13 +1420,13 @@ async fn main() -> Result<()> { registry_token, gpu, } => { + let _ = recreate; // teardown is always performed by deploy run::gateway_admin_deploy( &name, remote.as_deref(), ssh_key.as_deref(), port, gateway_host.as_deref(), - recreate, plaintext, disable_gateway_auth, registry_token.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 9e62e6455..c072f58c1 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1205,40 +1205,6 @@ async fn http_health_check(server: &str, tls: &TlsOptions) -> Result Result { - let status = if info.container_running { - "running" - } else if info.container_exists { - "stopped" - } else { - "volume only" - }; - - eprintln!("• Existing gateway '{name}' detected ({status})"); - if let Some(image) = &info.container_image { - eprintln!(" {} {}", "Image:".dimmed(), image); - } - eprintln!(); - - eprint!("Destroy and recreate gateway from scratch? [y/N] "); - std::io::stderr().flush().ok(); - - let mut input = String::new(); - std::io::stdin() - .read_line(&mut input) - .into_diagnostic() - .wrap_err("failed to read user input")?; - - let choice = input.trim().to_lowercase(); - Ok(choice == "y" || choice == "yes") -} - /// Deploy a gateway with the rich progress panel (interactive) or simple /// logging (non-interactive). Returns the [`GatewayHandle`] on success. /// @@ -1346,7 +1312,6 @@ pub async fn gateway_admin_deploy( ssh_key: Option<&str>, port: u16, gateway_host: Option<&str>, - recreate: bool, disable_tls: bool, disable_gateway_auth: bool, registry_token: Option<&str>, @@ -1373,41 +1338,9 @@ pub async fn gateway_admin_deploy( options = options.with_registry_token(token); } - let interactive = std::io::stderr().is_terminal(); - - // Check for existing gateway and prompt user if found. - // --recreate skips the prompt and always destroys. - { - let remote_opts = remote.map(|dest| { - let mut opts = RemoteOptions::new(dest); - if let Some(key) = ssh_key { - opts = opts.with_ssh_key(key); - } - opts - }); - if let Some(info) = - openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? - { - let should_recreate = if recreate { - true - } else if interactive { - prompt_existing_gateway(name, &info)? - } else { - false // non-interactive without --recreate: silently reuse - }; - - if should_recreate { - eprintln!("• Destroying existing gateway..."); - let handle = - openshell_bootstrap::gateway_handle(name, remote_opts.as_ref()).await?; - handle.destroy().await?; - eprintln!("{} Gateway destroyed, starting fresh.", "✓".green().bold()); - eprintln!(); - } - // If reusing, the deploy flow will handle stale node cleanup automatically - } - } - + // Teardown of any existing resources is handled inside + // deploy_gateway_with_logs(), so both `gateway start` and the + // auto-bootstrap path in `sandbox create` get the same cleanup. let handle = deploy_gateway_with_panel(options, name, location).await?; print_deploy_summary(name, &handle); @@ -1997,20 +1930,29 @@ pub async fn sandbox_create( // Track whether we saw the gateway become ready (from log messages). let mut saw_gateway_ready = false; - while let Some(item) = stream.next().await { - // Check for timeout - if start_time.elapsed() > provision_timeout { - let timeout_message = provisioning_timeout_message( - provision_timeout.as_secs(), - requested_gpu, - last_condition_message.as_deref(), - ); - if let Some(d) = display.as_mut() { - d.finish_error(&timeout_message); + loop { + // Compute remaining time so the timeout fires even when the stream + // produces no events (e.g. server-side producer died). + let remaining = provision_timeout.saturating_sub(start_time.elapsed()); + let maybe_item = tokio::time::timeout(remaining, stream.next()).await; + + let item = match maybe_item { + Ok(Some(item)) => item, + Ok(None) => break, // stream ended + Err(_elapsed) => { + // Timeout fired — the stream was idle for too long. + let timeout_message = provisioning_timeout_message( + provision_timeout.as_secs(), + requested_gpu, + last_condition_message.as_deref(), + ); + if let Some(d) = display.as_mut() { + d.finish_error(&timeout_message); + } + println!(); + return Err(miette::miette!(timeout_message)); } - println!(); - return Err(miette::miette!(timeout_message)); - } + }; let evt = item.into_diagnostic()?; match evt.payload { From 5162fd16e01bd86815154ad07072051c9913e10b Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 13 Mar 2026 13:46:06 -0700 Subject: [PATCH 4/5] fix(bootstrap): restore interactive prompt before destroying existing gateway MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of unconditionally tearing down existing gateway resources on every `gateway start`, restore the interactive prompt that asks the user whether to destroy and recreate. When --recreate is passed, the prompt is skipped and resources are destroyed directly. In non-interactive mode, the existing gateway is reused silently. The deploy_gateway_with_logs function now respects a `recreate` field on DeployOptions — only destroying Docker resources (container, image, volume) when explicitly requested. The auto-bootstrap path sets recreate=true to handle stale Docker resources without metadata. --- crates/openshell-bootstrap/src/lib.rs | 37 ++++++++++++----- crates/openshell-cli/src/bootstrap.rs | 4 +- crates/openshell-cli/src/main.rs | 6 +-- crates/openshell-cli/src/run.rs | 60 ++++++++++++++++++++++----- 4 files changed, 83 insertions(+), 24 deletions(-) diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 4f347eb7f..237d65f2c 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -107,6 +107,10 @@ pub struct DeployOptions { /// created with GPU device requests (`--gpus all`) and the NVIDIA /// k8s-device-plugin is deployed inside the k3s cluster. pub gpu: bool, + /// When true, destroy any existing gateway resources before deploying. + /// When false, an existing gateway is left as-is and deployment is + /// skipped (the caller is responsible for prompting the user first). + pub recreate: bool, } impl DeployOptions { @@ -121,6 +125,7 @@ impl DeployOptions { disable_gateway_auth: false, registry_token: None, gpu: false, + recreate: false, } } @@ -172,6 +177,13 @@ impl DeployOptions { self.gpu = gpu; self } + + /// Set whether to destroy and recreate existing gateway resources. + #[must_use] + pub fn with_recreate(mut self, recreate: bool) -> Self { + self.recreate = recreate; + self + } } #[derive(Debug, Clone)] @@ -232,6 +244,7 @@ where let disable_gateway_auth = options.disable_gateway_auth; let registry_token = options.registry_token; let gpu = options.gpu; + let recreate = options.recreate; // Wrap on_log in Arc> so we can share it with pull_remote_image // which needs a 'static callback for the bollard streaming pull. @@ -256,16 +269,20 @@ where ), }; - // Tear down any existing gateway resources (container, volume, image) so - // we always start from a clean state. This is the single consolidated - // teardown point — both `gateway start` and the auto-bootstrap path in - // `sandbox create` flow through here. - if check_existing_gateway(&target_docker, &name) - .await? - .is_some() - { - log("[status] Removing existing gateway".to_string()); - destroy_gateway_resources(&target_docker, &name).await?; + // If an existing gateway is found, either tear it down (when recreate is + // requested) or bail out so the caller can prompt the user / reuse it. + if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { + if recreate { + log("[status] Removing existing gateway".to_string()); + destroy_gateway_resources(&target_docker, &name).await?; + } else { + return Err(miette::miette!( + "Gateway '{name}' already exists (container_running={}).\n\ + Use --recreate to destroy and redeploy, or destroy it first with:\n\n \ + openshell gateway destroy {name}", + existing.container_running, + )); + } } // Ensure the image is available on the target Docker daemon diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 2caa20b96..ca81404f8 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -144,7 +144,9 @@ pub async fn run_bootstrap( ); eprintln!(); - let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name); + // Auto-bootstrap always recreates if stale Docker resources are found + // (e.g. metadata was deleted but container/volume still exist). + let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true); if let Some(dest) = remote { let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); if let Some(key) = ssh_key { diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 84f7180de..f21ad7595 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -740,8 +740,8 @@ enum GatewayCommands { /// Destroy and recreate the gateway from scratch if one already exists. /// - /// Without this flag, an interactive prompt asks what to do; in - /// non-interactive mode the existing gateway is reused silently. + /// Without this flag, an interactive prompt asks whether to recreate; + /// in non-interactive mode the existing gateway is reused silently. #[arg(long)] recreate: bool, @@ -1420,13 +1420,13 @@ async fn main() -> Result<()> { registry_token, gpu, } => { - let _ = recreate; // teardown is always performed by deploy run::gateway_admin_deploy( &name, remote.as_deref(), ssh_key.as_deref(), port, gateway_host.as_deref(), + recreate, plaintext, disable_gateway_auth, registry_token.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index c072f58c1..002596f8c 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1312,6 +1312,7 @@ pub async fn gateway_admin_deploy( ssh_key: Option<&str>, port: u16, gateway_host: Option<&str>, + recreate: bool, disable_tls: bool, disable_gateway_auth: bool, registry_token: Option<&str>, @@ -1319,17 +1320,59 @@ pub async fn gateway_admin_deploy( ) -> Result<()> { let location = if remote.is_some() { "remote" } else { "local" }; + // Build remote options once so we can reuse them for the existence check + // and the deploy options. + let remote_opts = remote.map(|dest| { + let mut opts = RemoteOptions::new(dest); + if let Some(key) = ssh_key { + opts = opts.with_ssh_key(key); + } + opts + }); + + // Check whether a gateway already exists. If so, prompt the user (unless + // --recreate was passed or we're in non-interactive mode). + let mut should_recreate = recreate; + if let Some(existing) = + openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? + { + if !should_recreate { + let interactive = std::io::stderr().is_terminal(); + if interactive { + let status = if existing.container_running { + "running" + } else { + "stopped" + }; + eprintln!( + "{} Gateway '{name}' already exists ({status}).", + "!".yellow().bold() + ); + should_recreate = Confirm::new() + .with_prompt("Destroy and recreate it?") + .default(false) + .interact() + .into_diagnostic()?; + if !should_recreate { + eprintln!("Keeping existing gateway."); + return Ok(()); + } + } else { + // Non-interactive mode: reuse existing gateway silently. + eprintln!("Gateway '{name}' already exists, reusing."); + return Ok(()); + } + } + } + let mut options = DeployOptions::new(name) .with_port(port) .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) - .with_gpu(gpu); - if let Some(dest) = remote { - let mut remote_opts = RemoteOptions::new(dest); - if let Some(key) = ssh_key { - remote_opts = remote_opts.with_ssh_key(key); - } - options = options.with_remote(remote_opts); + .with_gpu(gpu) + .with_recreate(should_recreate); + if let Some(opts) = remote_opts { + options = options.with_remote(opts); } if let Some(host) = gateway_host { options = options.with_gateway_host(host); @@ -1338,9 +1381,6 @@ pub async fn gateway_admin_deploy( options = options.with_registry_token(token); } - // Teardown of any existing resources is handled inside - // deploy_gateway_with_logs(), so both `gateway start` and the - // auto-bootstrap path in `sandbox create` get the same cleanup. let handle = deploy_gateway_with_panel(options, name, location).await?; print_deploy_summary(name, &handle); From a27aaf4342d164afb48f36673e1815e1e47ae0da Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 13 Mar 2026 13:53:39 -0700 Subject: [PATCH 5/5] wip --- tasks/scripts/cluster-deploy-fast.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 42c4c343f..348640d52 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -131,12 +131,14 @@ if [[ -f "${DEPLOY_FAST_STATE_FILE}" ]]; then previous_helm_fingerprint="" fi - # Invalidate all previous fingerprints when the cluster container has + # Invalidate gateway and helm fingerprints when the cluster container has # changed (recreated or replaced). The new k3s instance has no pushed - # images so everything must be rebuilt. + # images so the gateway must be rebuilt and helm must be re-applied. + # The supervisor is NOT invalidated here because it is already built into + # the cluster image — a fresh cluster already has the correct supervisor + # binary, so rebuilding it would be redundant. if [[ -n "${current_container_id}" && "${current_container_id}" != "${previous_container_id:-}" ]]; then previous_gateway_fingerprint="" - previous_supervisor_fingerprint="" previous_helm_fingerprint="" fi fi