diff --git a/architecture/sandbox-connect.md b/architecture/sandbox-connect.md index a8201e61..56681672 100644 --- a/architecture/sandbox-connect.md +++ b/architecture/sandbox-connect.md @@ -345,7 +345,13 @@ NSSH1 \n 3. Reject if skew exceeds `ssh_handshake_skew_secs` (default: 300 seconds) 4. Recompute HMAC-SHA256 over `token|timestamp|nonce` with the shared secret 5. Compare computed signature against the received signature (constant-time via `hmac` crate) -6. Respond with `OK\n` on success or `ERR\n` on failure +6. Check nonce against the replay cache; reject if the nonce has been seen before within the skew window +7. Insert the nonce into the replay cache on success +8. Respond with `OK\n` on success or `ERR\n` on failure + +### Nonce replay detection + +The SSH server maintains a per-process `NonceCache` (`HashMap<String, Instant>` behind `Arc<Mutex<...>>`) that tracks nonces seen within the handshake skew window. A background tokio task reaps expired entries every 60 seconds. If a valid preface is presented with a previously-seen nonce, the handshake is rejected. This prevents replay attacks within the timestamp validity window. ### HMAC computation @@ -517,7 +523,12 @@ This function is shared between the CLI and TUI via the `navigator-core::forward 1. **mTLS (transport layer)** -- when TLS is configured, the CLI authenticates to the gateway using client certificates. The `ssh-proxy` subprocess inherits TLS options from the parent CLI process. 2. **Session token (application layer)** -- the gateway validates the session token against the persistence layer. Tokens are scoped to a specific sandbox and can be revoked. -3. **NSSH1 handshake (gateway-to-sandbox)** -- the shared handshake secret proves the connection originated from an authorized gateway. The timestamp + nonce prevent replay attacks within the skew window. +3. **NSSH1 handshake (gateway-to-sandbox)** -- the shared handshake secret proves the connection originated from an authorized gateway. 
The timestamp + nonce prevent replay attacks within the skew window. The nonce replay cache rejects duplicates. +4. **Kubernetes NetworkPolicy** -- a Helm-managed `NetworkPolicy` restricts ingress to sandbox pods on port 2222 to only the gateway pod, preventing lateral movement from other in-cluster workloads. Controlled by `networkPolicy.enabled` in the Helm values (default: `true`). + +### Mandatory handshake secret + +The NSSH1 handshake secret (`NEMOCLAW_SSH_HANDSHAKE_SECRET`) is required. Both the server and sandbox will refuse to start if the secret is empty or unset. For cluster deployments the secret is auto-generated by the entrypoint script (`deploy/docker/cluster-entrypoint.sh`) via `openssl rand -hex 32` and injected into the Helm values. ### What SSH auth does NOT enforce @@ -542,7 +553,7 @@ The sandbox generates a fresh Ed25519 host key on every startup. The CLI disable | `ssh_gateway_port` | `8080` | Public port for gateway connections (0 = use bind port) | | `ssh_connect_path` | `/connect/ssh` | HTTP path for CONNECT requests | | `sandbox_ssh_port` | `2222` | SSH listen port inside sandbox pods | -| `ssh_handshake_secret` | (empty) | Shared HMAC key for NSSH1 handshake | +| `ssh_handshake_secret` | (required) | Shared HMAC key for NSSH1 handshake (server fails to start if empty) | | `ssh_handshake_skew_secs` | `300` | Maximum allowed clock skew (seconds) | ### Sandbox environment variables diff --git a/crates/navigator-sandbox/src/lib.rs b/crates/navigator-sandbox/src/lib.rs index 1d7471b9..b0682ef0 100644 --- a/crates/navigator-sandbox/src/lib.rs +++ b/crates/navigator-sandbox/src/lib.rs @@ -404,7 +404,14 @@ pub async fn run_sandbox( let addr: SocketAddr = listen_addr.parse().into_diagnostic()?; let policy_clone = policy.clone(); let workdir_clone = workdir.clone(); - let secret = ssh_handshake_secret.unwrap_or_default(); + let secret = ssh_handshake_secret + .filter(|s| !s.is_empty()) + .ok_or_else(|| { + miette::miette!( + 
"NEMOCLAW_SSH_HANDSHAKE_SECRET is required when SSH is enabled.\n\ + Set --ssh-handshake-secret or the NEMOCLAW_SSH_HANDSHAKE_SECRET env var." + ) + })?; let proxy_url = ssh_proxy_url; let netns_fd = ssh_netns_fd; let ca_paths = ca_file_paths.clone(); diff --git a/crates/navigator-sandbox/src/ssh.rs b/crates/navigator-sandbox/src/ssh.rs index 03192df1..99d143d7 100644 --- a/crates/navigator-sandbox/src/ssh.rs +++ b/crates/navigator-sandbox/src/ssh.rs @@ -21,14 +21,19 @@ use std::net::SocketAddr; use std::os::fd::{AsRawFd, RawFd}; use std::path::PathBuf; use std::process::Command; -use std::sync::{Arc, mpsc}; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use std::sync::{Arc, Mutex, mpsc}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::net::TcpListener; use tracing::{info, warn}; const PREFACE_MAGIC: &str = "NSSH1"; +/// A time-bounded set of nonces used to detect replayed NSSH1 handshakes. +/// Each entry records the `Instant` it was inserted; a background reaper task +/// periodically evicts entries older than the handshake skew window. +type NonceCache = Arc>>; + /// Perform SSH server initialization: generate a host key, build the config, /// and bind the TCP listener. Extracted so that startup errors can be forwarded /// through the readiness channel rather than being silently logged. @@ -85,6 +90,23 @@ pub async fn run_ssh_server( } }; + // Nonce cache for replay detection. Entries are evicted by a background + // reaper once they exceed the handshake skew window. + let nonce_cache: NonceCache = Arc::new(Mutex::new(HashMap::new())); + + // Background task that periodically purges expired nonces. 
+ let reaper_cache = nonce_cache.clone(); + let ttl = Duration::from_secs(handshake_skew_secs); + tokio::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(60)); + loop { + interval.tick().await; + if let Ok(mut cache) = reaper_cache.lock() { + cache.retain(|_, inserted| inserted.elapsed() < ttl); + } + } + }); + loop { let (stream, peer) = listener.accept().await.into_diagnostic()?; stream.set_nodelay(true).into_diagnostic()?; @@ -95,6 +117,7 @@ pub async fn run_ssh_server( let proxy_url = proxy_url.clone(); let ca_paths = ca_paths.clone(); let provider_env = provider_env.clone(); + let nonce_cache = nonce_cache.clone(); tokio::spawn(async move { if let Err(err) = handle_connection( @@ -109,6 +132,7 @@ pub async fn run_ssh_server( proxy_url, ca_paths, provider_env, + &nonce_cache, ) .await { @@ -131,12 +155,13 @@ async fn handle_connection( proxy_url: Option, ca_file_paths: Option>, provider_env: HashMap, + nonce_cache: &NonceCache, ) -> Result<()> { info!(peer = %peer, "SSH connection: reading handshake preface"); let mut line = String::new(); read_line(&mut stream, &mut line).await?; info!(peer = %peer, preface_len = line.len(), "SSH connection: preface received, verifying"); - if !verify_preface(&line, secret, handshake_skew_secs)? { + if !verify_preface(&line, secret, handshake_skew_secs, nonce_cache)? 
{ warn!(peer = %peer, "SSH connection: handshake verification failed"); let _ = stream.write_all(b"ERR\n").await; return Ok(()); @@ -178,7 +203,12 @@ async fn read_line(stream: &mut tokio::net::TcpStream, buf: &mut String) -> Resu Ok(()) } -fn verify_preface(line: &str, secret: &str, handshake_skew_secs: u64) -> Result { +fn verify_preface( + line: &str, + secret: &str, + handshake_skew_secs: u64, + nonce_cache: &NonceCache, +) -> Result { let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() != 5 || parts[0] != PREFACE_MAGIC { return Ok(false); @@ -202,7 +232,22 @@ fn verify_preface(line: &str, secret: &str, handshake_skew_secs: u64) -> Result< let payload = format!("{token}|{timestamp}|{nonce}"); let expected = hmac_sha256(secret.as_bytes(), payload.as_bytes()); - Ok(signature == expected) + if signature != expected { + return Ok(false); + } + + // Reject replayed nonces. The cache is bounded by the reaper task which + // evicts entries older than `handshake_skew_secs`. + let mut cache = nonce_cache + .lock() + .map_err(|_| miette::miette!("nonce cache lock poisoned"))?; + if cache.contains_key(nonce) { + warn!(nonce = nonce, "NSSH1 nonce replay detected"); + return Ok(false); + } + cache.insert(nonce.to_string(), Instant::now()); + + Ok(true) } fn hmac_sha256(key: &[u8], data: &[u8]) -> String { @@ -1085,4 +1130,103 @@ mod tests { "expected all 100 KiB delivered before EOF" ); } + + // ----------------------------------------------------------------------- + // verify_preface tests + // ----------------------------------------------------------------------- + + /// Build a valid NSSH1 preface line with the given parameters. 
+ fn build_preface(token: &str, secret: &str, nonce: &str, timestamp: i64) -> String { + let payload = format!("{token}|{timestamp}|{nonce}"); + let signature = hmac_sha256(secret.as_bytes(), payload.as_bytes()); + format!("{PREFACE_MAGIC} {token} {timestamp} {nonce} {signature}") + } + + fn fresh_nonce_cache() -> NonceCache { + Arc::new(Mutex::new(HashMap::new())) + } + + fn current_timestamp() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(), + ) + .unwrap() + } + + #[test] + fn verify_preface_accepts_valid_preface() { + let secret = "test-secret-key"; + let nonce = "unique-nonce-1"; + let ts = current_timestamp(); + let line = build_preface("tok1", secret, nonce, ts); + let cache = fresh_nonce_cache(); + + assert!(verify_preface(&line, secret, 300, &cache).unwrap()); + } + + #[test] + fn verify_preface_rejects_replayed_nonce() { + let secret = "test-secret-key"; + let nonce = "replay-nonce"; + let ts = current_timestamp(); + let line = build_preface("tok1", secret, nonce, ts); + let cache = fresh_nonce_cache(); + + // First attempt should succeed. + assert!(verify_preface(&line, secret, 300, &cache).unwrap()); + // Second attempt with the same nonce should be rejected. + assert!(!verify_preface(&line, secret, 300, &cache).unwrap()); + } + + #[test] + fn verify_preface_rejects_expired_timestamp() { + let secret = "test-secret-key"; + let nonce = "expired-nonce"; + // Timestamp 600 seconds in the past, with a 300-second skew window. + let ts = current_timestamp() - 600; + let line = build_preface("tok1", secret, nonce, ts); + let cache = fresh_nonce_cache(); + + assert!(!verify_preface(&line, secret, 300, &cache).unwrap()); + } + + #[test] + fn verify_preface_rejects_invalid_hmac() { + let secret = "test-secret-key"; + let nonce = "hmac-nonce"; + let ts = current_timestamp(); + // Build with the correct secret, then verify with the wrong one. 
+ let line = build_preface("tok1", secret, nonce, ts); + let cache = fresh_nonce_cache(); + + assert!(!verify_preface(&line, "wrong-secret", 300, &cache).unwrap()); + } + + #[test] + fn verify_preface_rejects_malformed_input() { + let cache = fresh_nonce_cache(); + + // Too few parts. + assert!(!verify_preface("NSSH1 tok1 123", "s", 300, &cache).unwrap()); + // Wrong magic. + assert!(!verify_preface("NSSH2 tok1 123 nonce sig", "s", 300, &cache).unwrap()); + // Empty string. + assert!(!verify_preface("", "s", 300, &cache).unwrap()); + } + + #[test] + fn verify_preface_distinct_nonces_both_accepted() { + let secret = "test-secret-key"; + let ts = current_timestamp(); + let cache = fresh_nonce_cache(); + + let line1 = build_preface("tok1", secret, "nonce-a", ts); + let line2 = build_preface("tok1", secret, "nonce-b", ts); + + assert!(verify_preface(&line1, secret, 300, &cache).unwrap()); + assert!(verify_preface(&line2, secret, 300, &cache).unwrap()); + } } diff --git a/crates/navigator-server/src/lib.rs b/crates/navigator-server/src/lib.rs index 1f953d4a..262aabdb 100644 --- a/crates/navigator-server/src/lib.rs +++ b/crates/navigator-server/src/lib.rs @@ -92,6 +92,11 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul if database_url.is_empty() { return Err(Error::config("database_url is required")); } + if config.ssh_handshake_secret.is_empty() { + return Err(Error::config( + "ssh_handshake_secret is required. 
Set --ssh-handshake-secret or NEMOCLAW_SSH_HANDSHAKE_SECRET", + )); + } let store = Store::connect(database_url).await?; let sandbox_client = SandboxClient::new( diff --git a/crates/navigator-server/src/sandbox/mod.rs b/crates/navigator-server/src/sandbox/mod.rs index 63881900..bee41f68 100644 --- a/crates/navigator-server/src/sandbox/mod.rs +++ b/crates/navigator-server/src/sandbox/mod.rs @@ -915,9 +915,7 @@ fn apply_required_env( if !ssh_listen_addr.is_empty() { upsert_env(env, "NEMOCLAW_SSH_LISTEN_ADDR", ssh_listen_addr); } - if !ssh_handshake_secret.is_empty() { - upsert_env(env, "NEMOCLAW_SSH_HANDSHAKE_SECRET", ssh_handshake_secret); - } + upsert_env(env, "NEMOCLAW_SSH_HANDSHAKE_SECRET", ssh_handshake_secret); upsert_env( env, "NEMOCLAW_SSH_HANDSHAKE_SKEW_SECS", @@ -1219,4 +1217,29 @@ mod tests { assert_eq!(derive_phase(&status, false), SandboxPhase::Ready); } + + #[test] + fn apply_required_env_always_injects_ssh_handshake_secret() { + let mut env = Vec::new(); + apply_required_env( + &mut env, + "sandbox-1", + "my-sandbox", + "https://endpoint:8080", + "0.0.0.0:2222", + "my-secret-value", + 300, + ); + + let secret_entry = env + .iter() + .find(|e| { + e.get("name").and_then(|v| v.as_str()) == Some("NEMOCLAW_SSH_HANDSHAKE_SECRET") + }) + .expect("NEMOCLAW_SSH_HANDSHAKE_SECRET must be present in env"); + assert_eq!( + secret_entry.get("value").and_then(|v| v.as_str()), + Some("my-secret-value") + ); + } } diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 1e0db386..8083c89c 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -249,6 +249,11 @@ if [ -n "${IMAGE_PULL_POLICY:-}" ] && [ -f "$HELMCHART" ]; then sed -i "s|pullPolicy: Always|pullPolicy: ${IMAGE_PULL_POLICY}|" "$HELMCHART" fi +# Generate a random SSH handshake secret for the NSSH1 HMAC handshake between +# the gateway and sandbox SSH servers. This is required — the server will refuse +# to start without it. 
+SSH_HANDSHAKE_SECRET="${SSH_HANDSHAKE_SECRET:-$(openssl rand -hex 32)}" + # Inject SSH gateway host/port into the HelmChart manifest so the navigator # server returns the correct address to CLI clients for SSH proxy CONNECT. if [ -f "$HELMCHART" ]; then @@ -266,6 +271,8 @@ if [ -f "$HELMCHART" ]; then # Clear the placeholder so the default (8080) is used sed -i "s|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g" "$HELMCHART" fi + echo "Setting SSH handshake secret" + sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" fi # Inject chart checksum into the HelmChart manifest so that a changed chart diff --git a/deploy/helm/navigator/templates/networkpolicy.yaml b/deploy/helm/navigator/templates/networkpolicy.yaml new file mode 100644 index 00000000..aea332da --- /dev/null +++ b/deploy/helm/navigator/templates/networkpolicy.yaml @@ -0,0 +1,32 @@ +{{- if .Values.networkPolicy.enabled }} +# NetworkPolicy restricting SSH ingress on sandbox pods to the gateway pod. +# Sandbox pods are dynamically created by the server and labelled with +# navigator.ai/managed-by=navigator. This policy ensures only the gateway +# (navigator server) pod can reach the sandbox SSH port (2222), blocking +# lateral movement from other in-cluster workloads. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "navigator.fullname" . }}-sandbox-ssh + namespace: {{ .Values.server.sandboxNamespace }} + labels: + {{- include "navigator.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + navigator.ai/managed-by: navigator + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: {{ .Release.Namespace }} + podSelector: + matchLabels: + app.kubernetes.io/name: {{ include "navigator.name" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + ports: + - protocol: TCP + port: 2222 +{{- end }} diff --git a/deploy/helm/navigator/templates/statefulset.yaml b/deploy/helm/navigator/templates/statefulset.yaml index 7c6a5c09..67caedd8 100644 --- a/deploy/helm/navigator/templates/statefulset.yaml +++ b/deploy/helm/navigator/templates/statefulset.yaml @@ -58,6 +58,8 @@ spec: - name: NEMOCLAW_SSH_GATEWAY_PORT value: {{ .Values.server.sshGatewayPort | quote }} {{- end }} + - name: NEMOCLAW_SSH_HANDSHAKE_SECRET + value: {{ required "server.sshHandshakeSecret is required" .Values.server.sshHandshakeSecret | quote }} - name: NEMOCLAW_TLS_CERT value: /etc/navigator-tls/server/tls.crt - name: NEMOCLAW_TLS_KEY diff --git a/deploy/helm/navigator/values.yaml b/deploy/helm/navigator/values.yaml index 17b31d8e..d6e8b2f9 100644 --- a/deploy/helm/navigator/values.yaml +++ b/deploy/helm/navigator/values.yaml @@ -77,6 +77,10 @@ server: sshGatewayPort: 0 # TLS configuration for the server. The server always terminates mTLS # directly and requires client certificates. + # HMAC secret used for the NSSH1 handshake between gateway and sandbox SSH. + # Required — the server will refuse to start if empty. For cluster deployments + # this is auto-generated by the entrypoint script. + sshHandshakeSecret: "" tls: # K8s secret (type kubernetes.io/tls) with tls.crt and tls.key for the server certSecretName: navigator-server-tls @@ -84,3 +88,7 @@ server: clientCaSecretName: navigator-server-client-ca # K8s secret mounted into sandbox pods for mTLS to the server clientTlsSecretName: navigator-client-tls + +# NetworkPolicy restricting SSH ingress on sandbox pods to the gateway only. 
+networkPolicy: + enabled: true diff --git a/deploy/kube/manifests/navigator-helmchart.yaml b/deploy/kube/manifests/navigator-helmchart.yaml index f07b3726..d99eeccd 100644 --- a/deploy/kube/manifests/navigator-helmchart.yaml +++ b/deploy/kube/manifests/navigator-helmchart.yaml @@ -31,6 +31,7 @@ spec: sandboxImage: d1i0nduu2f6qxk.cloudfront.net/navigator/sandbox:latest sshGatewayHost: __SSH_GATEWAY_HOST__ sshGatewayPort: __SSH_GATEWAY_PORT__ + sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__ grpcEndpoint: "https://navigator.navigator.svc.cluster.local:8080" tls: certSecretName: navigator-server-tls diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index d29e0102..4f31fc49 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -379,6 +379,13 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then # terminates mTLS (there is no server.tls.enabled toggle). Without this, # a prior Helm override or chart default change could silently regress # sandbox callbacks to plaintext. + # Retrieve the existing handshake secret from the running release, or generate + # a new one if this is the first deploy with the mandatory secret. + EXISTING_SECRET=$(helm get values navigator -n navigator -o json 2>/dev/null \ + | grep -o '"sshHandshakeSecret":"[^"]*"' \ + | cut -d'"' -f4) || true + SSH_HANDSHAKE_SECRET="${EXISTING_SECRET:-$(openssl rand -hex 32)}" + helm upgrade navigator deploy/helm/navigator \ --namespace navigator \ --set image.repository=${IMAGE_REPO_BASE}/server \ @@ -389,6 +396,7 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then --set server.tls.certSecretName=navigator-server-tls \ --set server.tls.clientCaSecretName=navigator-server-client-ca \ --set server.tls.clientTlsSecretName=navigator-client-tls \ + --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \ "${helm_wait_args[@]}" helm_end=$(date +%s) log_duration "Helm upgrade" "${helm_start}" "${helm_end}"