diff --git a/crates/openshell-bootstrap/src/build.rs b/crates/openshell-bootstrap/src/build.rs index eaa221311..fb9b4a63d 100644 --- a/crates/openshell-bootstrap/src/build.rs +++ b/crates/openshell-bootstrap/src/build.rs @@ -46,7 +46,10 @@ pub async fn build_and_push_image( on_log(format!( "Pushing image {tag} into gateway \"{gateway_name}\"" )); - let local_docker = Docker::connect_with_local_defaults() + // Use the long-timeout Docker client so `docker save` of multi-GB images + // doesn't trip the 120s bollard default mid-stream. Override with + // OPENSHELL_DOCKER_TIMEOUT_SECS=<seconds>. + let local_docker = crate::docker::connect_local_for_large_transfers() .into_diagnostic() .wrap_err("failed to connect to local Docker daemon")?; let container = container_name(gateway_name); diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index be086e534..65482739f 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -23,6 +23,24 @@ use std::collections::HashMap; const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell"; +/// Default total HTTP timeout for Docker API calls that stream large payloads +/// (e.g. `docker save` used by `sandbox create --from`). Bollard's own +/// `connect_with_local_defaults()` ceiling is 120s, which is far too short for +/// multi-GB image exports — a 7 GB image on a laptop SSD takes ~4–5 minutes. +/// One hour is a safe upper bound; override with `OPENSHELL_DOCKER_TIMEOUT_SECS`. +pub(crate) const DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS: u64 = 3600; + +/// Build a local-Docker client suitable for large streaming transfers. +/// Respects `OPENSHELL_DOCKER_TIMEOUT_SECS` (in seconds); falls back to +/// [`DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS`] when unset or unparseable. 
+pub fn connect_local_for_large_transfers() -> std::result::Result<Docker, bollard::errors::Error> { + let secs: u64 = std::env::var("OPENSHELL_DOCKER_TIMEOUT_SECS") + .ok() + .and_then(|s| s.trim().parse().ok()) + .unwrap_or(DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS); + Ok(Docker::connect_with_local_defaults()?.with_timeout(std::time::Duration::from_secs(secs))) +} + /// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a /// concrete device ID based on whether CDI is enabled on the daemon. /// diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 71d223d66..53f659fc6 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -521,7 +521,10 @@ where .collect(); if !images.is_empty() { log("[status] Deploying components".to_string()); - let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?; + // Long-timeout client: `docker save` of multi-GB component + // images streams past bollard's 120s default. See + // docker::connect_local_for_large_transfers(). + let local_docker = docker::connect_local_for_large_transfers().into_diagnostic()?; let container = container_name(&name); let on_log_ref = Arc::clone(&on_log); let mut push_log = move |msg: String| {