From 70fc8312d58d9542a6d935585b6982743362611b Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 17 Mar 2026 15:58:34 -0700 Subject: [PATCH] fix(server): add startup probe for gateway boot --- architecture/gateway-single-node.md | 8 ++-- crates/openshell-server/src/lib.rs | 42 ++++++++++++++++++- .../helm/openshell/templates/statefulset.yaml | 6 +++ deploy/helm/openshell/values.yaml | 4 ++ 4 files changed, 55 insertions(+), 5 deletions(-) diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 679bc338..8dc270ac 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -188,9 +188,11 @@ After the container starts: 1. **Clean stale nodes**: `clean_stale_nodes()` finds `NotReady` nodes via `kubectl get nodes` and deletes them. This is needed when a container is recreated but reuses the persistent volume -- k3s registers a new node (using the container ID as hostname) while old node entries persist in etcd. Non-fatal on error; returns the count of removed nodes. 2. **Push local images** (optional, local deploy only): If `OPENSHELL_PUSH_IMAGES` is set, the comma-separated image refs are exported from the local Docker daemon as a single tar, uploaded into the container via `docker put_archive`, and imported into containerd via `ctr images import` in the `k8s.io` namespace. After import, `kubectl rollout restart deployment/openshell openshell` is run, followed by `kubectl rollout status --timeout=180s` to wait for completion. See `crates/openshell-bootstrap/src/push.rs`. 3. **Wait for gateway health**: `wait_for_gateway_ready()` polls the Docker HEALTHCHECK status up to 180 times, 2 seconds apart (6 min total). A background task streams container logs during this wait. Failure modes: - - Container exits during polling: error includes recent log lines. - - Container has no HEALTHCHECK instruction: fails immediately. - - HEALTHCHECK reports unhealthy on final attempt: error includes recent logs. + - Container exits during polling: error includes recent log lines. + - Container has no HEALTHCHECK instruction: fails immediately. + - HEALTHCHECK reports unhealthy on final attempt: error includes recent logs. + +The gateway StatefulSet also uses a Kubernetes `startupProbe` on the gRPC port before steady-state liveness and readiness checks begin. This gives single-node k3s boots extra time to absorb early networking and flannel initialization delay without restarting the gateway pod too aggressively. ### 5) mTLS bundle capture diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 31210ac6..fad238be 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -25,9 +25,10 @@ mod ws_tunnel; use openshell_core::{Config, Error, Result}; use std::collections::HashMap; +use std::io::ErrorKind; use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; -use tracing::{error, info}; +use tracing::{debug, error, info}; pub use grpc::OpenShellService; pub use http::{health_router, http_router}; @@ -67,6 +68,13 @@ pub struct ServerState { pub ssh_connections_by_sandbox: Mutex>, } +fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { + matches!( + error.kind(), + ErrorKind::UnexpectedEof | ErrorKind::ConnectionReset + ) +} + impl ServerState { /// Create new server state. #[must_use] @@ -198,7 +206,11 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul } } Err(e) => { - error!(error = %e, client = %addr, "TLS handshake failed"); + if is_benign_tls_handshake_failure(&e) { + debug!(error = %e, client = %addr, "TLS handshake closed early"); + } else { + error!(error = %e, client = %addr, "TLS handshake failed"); + } } } }); @@ -211,3 +223,29 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul } } } + +#[cfg(test)] +mod tests { + use super::is_benign_tls_handshake_failure; + use std::io::{Error, ErrorKind}; + + #[test] + fn classifies_probe_style_tls_disconnects_as_benign() { + for kind in [ErrorKind::UnexpectedEof, ErrorKind::ConnectionReset] { + let error = Error::new(kind, "probe disconnected"); + assert!(is_benign_tls_handshake_failure(&error)); + } + } + + #[test] + fn preserves_real_tls_failures_as_errors() { + for kind in [ + ErrorKind::InvalidData, + ErrorKind::PermissionDenied, + ErrorKind::Other, + ] { + let error = Error::new(kind, "real tls failure"); + assert!(!is_benign_tls_handshake_failure(&error)); + } + } +} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 83ece499..1be8f14a 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -110,6 +110,12 @@ spec: - name: grpc containerPort: {{ .Values.service.port }} protocol: TCP + startupProbe: + tcpSocket: + port: grpc + periodSeconds: {{ .Values.probes.startup.periodSeconds }} + timeoutSeconds: {{ .Values.probes.startup.timeoutSeconds }} + failureThreshold: {{ .Values.probes.startup.failureThreshold }} livenessProbe: tcpSocket: port: grpc diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 2691fc48..ccc8d1ff 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -43,6 +43,10 @@ podLifecycle: terminationGracePeriodSeconds: 5 probes: + startup: + periodSeconds: 2 + timeoutSeconds: 1 + failureThreshold: 30 liveness: initialDelaySeconds: 2 periodSeconds: 5