Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .agents/skills/debug-openshell-cluster/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Use **only** `openshell` CLI commands (`openshell status`, `openshell doctor log
- k3s API server readiness (`/readyz`)
- `openshell` statefulset ready in `openshell` namespace
- TLS secrets `openshell-server-tls` and `openshell-client-tls` exist in `openshell` namespace
- Sandbox supervisor binary exists at `/opt/openshell/bin/openshell-sandbox` (emits `HEALTHCHECK_MISSING_SUPERVISOR` marker if absent)

For local deploys, metadata endpoint selection now depends on Docker connectivity:

Expand Down Expand Up @@ -311,6 +312,8 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w
| `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component |
| Stale NotReady nodes from previous deploys | Volume reused across container recreations | The deploy flow now auto-cleans stale nodes; if it still fails, manually delete NotReady nodes (see Step 2) or choose "Recreate" when prompted |
| gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` |
| Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the `supervisor-builder` stage. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker |
| `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy <name> && openshell gateway start` |

## Full Diagnostic Dump

Expand Down Expand Up @@ -359,6 +362,9 @@ openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-op
echo "=== Registry Configuration ==="
openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml

echo "=== Supervisor Binary ==="
openshell doctor exec -- ls -la /opt/openshell/bin/openshell-sandbox

echo "=== DNS Configuration ==="
openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf
```
2 changes: 0 additions & 2 deletions crates/openshell-bootstrap/src/constants.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

pub const NETWORK_NAME: &str = "openshell-cluster";

/// Path to the kubeconfig inside the k3s container.
/// Used by in-container kubectl operations (node cleanup, PKI reconciliation, etc.).
pub const KUBECONFIG_PATH: &str = "/etc/rancher/k3s/k3s.yaml";
Expand Down
110 changes: 3 additions & 107 deletions crates/openshell-bootstrap/src/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@
// SPDX-License-Identifier: Apache-2.0

use crate::RemoteOptions;
use crate::constants::{NETWORK_NAME, container_name, volume_name};
use crate::constants::{container_name, volume_name};
use crate::image::{
self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, DEFAULT_REGISTRY_USERNAME, parse_image_ref,
};
use bollard::API_DEFAULT_VERSION;
use bollard::Docker;
use bollard::errors::Error as BollardError;
use bollard::models::{
ContainerCreateBody, DeviceRequest, HostConfig, NetworkCreateRequest, NetworkDisconnectRequest,
PortBinding, VolumeCreateRequest,
ContainerCreateBody, DeviceRequest, HostConfig, PortBinding, VolumeCreateRequest,
};
use bollard::query_parameters::{
CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions,
CreateContainerOptions, CreateImageOptions, InspectContainerOptions,
ListContainersOptionsBuilder, RemoveContainerOptions, RemoveImageOptions, RemoveVolumeOptions,
StartContainerOptions,
};
Expand Down Expand Up @@ -186,53 +185,6 @@ pub async fn find_gateway_container(docker: &Docker, port: Option<u16>) -> Resul
}
}

/// Create the cluster Docker bridge network, tearing down any stale
/// instance first so the gateway container always attaches to a clean one.
///
/// Retries creation up to 5 times with linear back-off when the daemon
/// reports a conflict (per `is_conflict`) from an unfinished teardown,
/// re-running the forced removal before each retry.
///
/// # Errors
/// Returns an error if a non-conflict create failure occurs, or if all
/// retry attempts still hit a conflict (network still in use).
pub async fn ensure_network(docker: &Docker) -> Result<()> {
    // Always remove and recreate the network to guarantee a clean state.
    // Stale Docker networks (e.g., from a previous interrupted destroy or
    // Docker Desktop restart) can leave broken routing that causes the
    // container to fail with "no default routes found".
    force_remove_network(docker).await?;

    // Docker may return a 409 conflict if the previous network teardown has
    // not fully completed in the daemon. Retry a few times with back-off,
    // re-attempting the removal before each create.
    let mut last_err = None;
    for attempt in 0u64..5 {
        if attempt > 0 {
            // Linear back-off: 500ms, 1000ms, 1500ms, 2000ms before retries.
            tokio::time::sleep(std::time::Duration::from_millis(500 * attempt)).await;
            // Re-attempt removal in case the previous teardown has now settled.
            force_remove_network(docker).await?;
        }
        match docker
            .create_network(NetworkCreateRequest {
                name: NETWORK_NAME.to_string(),
                driver: Some("bridge".to_string()),
                // Attachable so other containers can be connected to the
                // network after creation.
                attachable: Some(true),
                ..Default::default()
            })
            .await
        {
            Ok(_) => return Ok(()),
            // Conflict: previous teardown still settling in the daemon —
            // record the error and retry.
            Err(err) if is_conflict(&err) => {
                tracing::debug!(
                    "Network create conflict (attempt {}/5), retrying: {}",
                    attempt + 1,
                    err,
                );
                last_err = Some(err);
            }
            // Any other error is treated as permanent — propagate immediately.
            Err(err) => {
                return Err(err)
                    .into_diagnostic()
                    .wrap_err("failed to create Docker network");
            }
        }
    }
    // All attempts hit a conflict; surface the last one with context.
    // `expect` is safe: the loop body runs at least once before reaching here.
    Err(last_err.expect("at least one retry attempt"))
        .into_diagnostic()
        .wrap_err("failed to create Docker network after retries (network still in use)")
}

pub async fn ensure_volume(docker: &Docker, name: &str) -> Result<()> {
match docker.inspect_volume(name).await {
Ok(_) => return Ok(()),
Expand Down Expand Up @@ -376,7 +328,6 @@ pub async fn ensure_container(
privileged: Some(true),
port_bindings: Some(port_bindings),
binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]),
network_mode: Some(NETWORK_NAME.to_string()),
// Add host.docker.internal mapping for DNS resolution
// This allows the entrypoint script to configure CoreDNS to use the host gateway
extra_hosts: Some(vec!["host.docker.internal:host-gateway".to_string()]),
Expand Down Expand Up @@ -678,20 +629,6 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
.ok()
.and_then(|info| info.image);

// Explicitly disconnect the container from the cluster network before
// removing it. This ensures Docker tears down the network endpoint
// synchronously so port bindings are released immediately and the
// subsequent network cleanup sees zero connected containers.
let _ = docker
.disconnect_network(
NETWORK_NAME,
NetworkDisconnectRequest {
container: container_name.clone(),
force: Some(true),
},
)
.await;

let _ = stop_container(docker, &container_name).await;

let remove_container = docker
Expand Down Expand Up @@ -763,50 +700,9 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
return Err(err).into_diagnostic();
}

// Force-remove the network during a full destroy. First disconnect any
// stale endpoints that Docker may still report (race between container
// removal and network bookkeeping), then remove the network itself.
force_remove_network(docker).await?;
Ok(())
}

/// Forcefully remove the gateway network, disconnecting any remaining
/// containers first. This ensures that stale Docker network endpoints
/// cannot prevent port bindings from being released.
///
/// A missing network ("not found") is treated as success at every step,
/// making this safe to call repeatedly and during partial teardowns.
async fn force_remove_network(docker: &Docker) -> Result<()> {
    // Inspect first: if the network does not exist there is nothing to do.
    let network = match docker
        .inspect_network(NETWORK_NAME, None::<InspectNetworkOptions>)
        .await
    {
        Ok(info) => info,
        Err(err) if is_not_found(&err) => return Ok(()),
        Err(err) => return Err(err).into_diagnostic(),
    };

    // Disconnect any containers still attached to the network.
    // Failures are deliberately ignored (`let _`): one stuck endpoint
    // should not abort cleanup of the others or the removal below.
    if let Some(containers) = network.containers {
        for (id, _) in containers {
            let _ = docker
                .disconnect_network(
                    NETWORK_NAME,
                    NetworkDisconnectRequest {
                        container: id,
                        force: Some(true),
                    },
                )
                .await;
        }
    }

    // Remove the network itself; "already gone" counts as success.
    match docker.remove_network(NETWORK_NAME).await {
        Ok(()) => Ok(()),
        Err(err) if is_not_found(&err) => Ok(()),
        Err(err) => Err(err)
            .into_diagnostic()
            .wrap_err("failed to remove Docker network"),
    }
}

fn is_not_found(err: &BollardError) -> bool {
matches!(
err,
Expand Down
33 changes: 33 additions & 0 deletions crates/openshell-bootstrap/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,12 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
match_mode: MatchMode::Any,
diagnose: diagnose_node_pressure,
},
// Missing sandbox supervisor binary
FailurePattern {
matchers: &["HEALTHCHECK_MISSING_SUPERVISOR"],
match_mode: MatchMode::Any,
diagnose: diagnose_missing_supervisor,
},
// TLS/certificate issues
FailurePattern {
matchers: &[
Expand Down Expand Up @@ -342,6 +348,33 @@ fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis {
}
}

/// Build the diagnosis reported when the `HEALTHCHECK_MISSING_SUPERVISOR`
/// marker is matched in gateway logs: the sandbox supervisor binary is
/// absent from the cluster image.
///
/// Marked non-retryable because the fix requires rebuilding or swapping
/// the cluster image — retrying bootstrap alone cannot succeed.
fn diagnose_missing_supervisor(gateway_name: &str) -> GatewayFailureDiagnosis {
    GatewayFailureDiagnosis {
        summary: "Sandbox supervisor binary missing from cluster image".to_string(),
        explanation: "The sandbox supervisor binary (/opt/openshell/bin/openshell-sandbox) \
            was not found in the gateway container. This binary is side-loaded into every \
            sandbox pod via a hostPath volume mount. Without it, all sandbox pods will \
            crash immediately with \"no such file or directory\". This typically means the \
            cluster image was built or published without the supervisor-builder stage."
            .to_string(),
        recovery_steps: vec![
            RecoveryStep::with_command(
                "Rebuild the cluster image with the supervisor binary included",
                "mise run docker:build:cluster",
            ),
            RecoveryStep::with_command(
                "Destroy and recreate the gateway with the updated image",
                format!("openshell gateway destroy {gateway_name} && openshell gateway start"),
            ),
            RecoveryStep::new(
                "Or set OPENSHELL_CLUSTER_IMAGE to a cluster image version that includes \
                the supervisor binary",
            ),
        ],
        // Permanent failure: no amount of retrying fixes a missing binary.
        retryable: false,
    }
}

fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
GatewayFailureDiagnosis {
summary: "TLS certificate issue".to_string(),
Expand Down
32 changes: 30 additions & 2 deletions crates/openshell-bootstrap/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use crate::constants::{
};
use crate::docker::{
check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container,
ensure_image, ensure_network, ensure_volume, start_container, stop_container,
ensure_image, ensure_volume, start_container, stop_container,
};
use crate::metadata::{
create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host,
Expand Down Expand Up @@ -107,6 +107,10 @@ pub struct DeployOptions {
/// created with GPU device requests (`--gpus all`) and the NVIDIA
/// k8s-device-plugin is deployed inside the k3s cluster.
pub gpu: bool,
/// When true, destroy any existing gateway resources before deploying.
/// When false, an existing gateway is left as-is and deployment is
/// skipped (the caller is responsible for prompting the user first).
pub recreate: bool,
}

impl DeployOptions {
Expand All @@ -121,6 +125,7 @@ impl DeployOptions {
disable_gateway_auth: false,
registry_token: None,
gpu: false,
recreate: false,
}
}

Expand Down Expand Up @@ -172,6 +177,13 @@ impl DeployOptions {
self.gpu = gpu;
self
}

/// Set whether to destroy and recreate existing gateway resources.
#[must_use]
pub fn with_recreate(mut self, recreate: bool) -> Self {
self.recreate = recreate;
self
}
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -232,6 +244,7 @@ where
let disable_gateway_auth = options.disable_gateway_auth;
let registry_token = options.registry_token;
let gpu = options.gpu;
let recreate = options.recreate;

// Wrap on_log in Arc<Mutex<>> so we can share it with pull_remote_image
// which needs a 'static callback for the bollard streaming pull.
Expand All @@ -256,6 +269,22 @@ where
),
};

// If an existing gateway is found, either tear it down (when recreate is
// requested) or bail out so the caller can prompt the user / reuse it.
if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
if recreate {
log("[status] Removing existing gateway".to_string());
destroy_gateway_resources(&target_docker, &name).await?;
} else {
return Err(miette::miette!(
"Gateway '{name}' already exists (container_running={}).\n\
Use --recreate to destroy and redeploy, or destroy it first with:\n\n \
openshell gateway destroy {name}",
existing.container_running,
));
}
}

// Ensure the image is available on the target Docker daemon
if remote_opts.is_some() {
log("[status] Downloading gateway".to_string());
Expand All @@ -280,7 +309,6 @@ where

// All subsequent operations use the target Docker (remote or local)
log("[status] Initializing environment".to_string());
ensure_network(&target_docker).await?;
ensure_volume(&target_docker, &volume_name(&name)).await?;

// Compute extra TLS SANs for remote deployments so the gateway and k3s
Expand Down
30 changes: 30 additions & 0 deletions crates/openshell-bootstrap/src/runtime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ const DNS_FAILURE_MARKERS: &[&str] = &["DNS_PROBE_FAILED", "HEALTHCHECK_DNS_FAIL
/// new scheduling, so the cluster will never become healthy on its own.
const NODE_PRESSURE_MARKER: &str = "HEALTHCHECK_NODE_PRESSURE";

/// Log marker emitted by the health-check script when the sandbox supervisor
/// binary (`/opt/openshell/bin/openshell-sandbox`) is missing from the node
/// filesystem. Without this binary, every sandbox pod will crash immediately
/// with "no such file or directory". This is a permanent error that requires
/// rebuilding or updating the cluster image.
const MISSING_SUPERVISOR_MARKER: &str = "HEALTHCHECK_MISSING_SUPERVISOR";

/// Number of consecutive polling iterations that must observe DNS failure
/// markers before we treat the failure as persistent and abort. A small
/// grace period avoids false positives on transient hiccups during startup.
Expand Down Expand Up @@ -116,6 +123,29 @@ where
}
}

// -- Missing supervisor binary detection ----------------------------
// The health-check script verifies that /opt/openshell/bin/openshell-sandbox
// exists on the node filesystem. If missing, every sandbox pod will crash.
// This is a permanent error — fail immediately with actionable guidance.
if recent_logs
.iter()
.any(|line| line.contains(MISSING_SUPERVISOR_MARKER))
{
result = Some(Err(miette::miette!(
"The sandbox supervisor binary is missing from the cluster image.\n\
The file /opt/openshell/bin/openshell-sandbox was not found in the gateway \
container. Without it, sandbox pods cannot start.\n\n\
This usually means the cluster image was built or published without the \
supervisor-builder stage.\n\n\
To fix:\n \
1. Rebuild the cluster image: mise run docker:build:cluster\n \
2. Or update to a cluster image that includes the supervisor binary\n \
3. Then recreate the gateway: openshell gateway destroy && openshell gateway start\n\n{}",
format_recent_logs(&recent_logs)
)));
break;
}

let inspect = docker
.inspect_container(&container_name, None::<InspectContainerOptions>)
.await
Expand Down
4 changes: 3 additions & 1 deletion crates/openshell-cli/src/bootstrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ pub async fn run_bootstrap(
);
eprintln!();

let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name);
// Auto-bootstrap always recreates if stale Docker resources are found
// (e.g. metadata was deleted but container/volume still exist).
let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true);
if let Some(dest) = remote {
let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest);
if let Some(key) = ssh_key {
Expand Down
4 changes: 2 additions & 2 deletions crates/openshell-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -740,8 +740,8 @@ enum GatewayCommands {

/// Destroy and recreate the gateway from scratch if one already exists.
///
/// Without this flag, an interactive prompt asks what to do; in
/// non-interactive mode the existing gateway is reused silently.
/// Without this flag, an interactive prompt asks whether to recreate;
/// in non-interactive mode the existing gateway is reused silently.
#[arg(long)]
recreate: bool,

Expand Down
Loading
Loading