diff --git a/.github/workflows/release-vm-dev.yml b/.github/workflows/release-vm-dev.yml index 06ba474a0..b32bfd8de 100644 --- a/.github/workflows/release-vm-dev.yml +++ b/.github/workflows/release-vm-dev.yml @@ -416,12 +416,253 @@ jobs: path: artifacts/*.tar.gz retention-days: 5 + # --------------------------------------------------------------------------- + # Build openshell-driver-vm binary (Linux — native on each arch) + # --------------------------------------------------------------------------- + build-driver-vm-linux: + name: Build Driver VM (Linux ${{ matrix.arch }}) + needs: [compute-versions, download-kernel-runtime, build-rootfs] + strategy: + matrix: + include: + - arch: arm64 + runner: build-arm64 + target: aarch64-unknown-linux-gnu + platform: linux-aarch64 + guest_arch: aarch64 + - arch: amd64 + runner: build-amd64 + target: x86_64-unknown-linux-gnu + platform: linux-x86_64 + guest_arch: x86_64 + runs-on: ${{ matrix.runner }} + timeout-minutes: 30 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SCCACHE_MEMCACHED_ENDPOINT: ${{ vars.SCCACHE_MEMCACHED_ENDPOINT }} + OPENSHELL_IMAGE_TAG: dev + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Mark workspace safe for git + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Fetch tags + run: git fetch --tags --force + + - name: Install tools + run: mise install + + - name: Cache Rust target and registry + uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 + with: + shared-key: driver-vm-linux-${{ matrix.arch }} + cache-directories: .cache/sccache + cache-targets: "true" + + - name: Install zstd + run: apt-get update && apt-get install -y --no-install-recommends zstd && rm -rf /var/lib/apt/lists/* + + - name: Download kernel runtime tarball + uses: 
actions/download-artifact@v4 + with: + name: kernel-runtime-tarballs + path: runtime-download/ + + - name: Download rootfs tarball + uses: actions/download-artifact@v4 + with: + name: rootfs-${{ matrix.arch }} + path: rootfs-download/ + + - name: Stage compressed runtime for embedding + run: | + set -euo pipefail + COMPRESSED_DIR="${PWD}/target/vm-runtime-compressed" + mkdir -p "$COMPRESSED_DIR" + + # Extract kernel runtime tarball and re-compress individual files + EXTRACT_DIR=$(mktemp -d) + zstd -d "runtime-download/vm-runtime-${{ matrix.platform }}.tar.zst" --stdout \ + | tar -xf - -C "$EXTRACT_DIR" + + echo "Extracted runtime files:" + ls -lah "$EXTRACT_DIR" + + for file in "$EXTRACT_DIR"/*; do + [ -f "$file" ] || continue + name=$(basename "$file") + [ "$name" = "provenance.json" ] && continue + zstd -19 -f -q -T0 -o "${COMPRESSED_DIR}/${name}.zst" "$file" + done + + # Copy rootfs tarball (already zstd-compressed) + cp rootfs-download/rootfs.tar.zst "${COMPRESSED_DIR}/rootfs.tar.zst" + + echo "Staged compressed artifacts:" + ls -lah "$COMPRESSED_DIR" + + - name: Scope workspace to driver-vm crates + run: | + set -euo pipefail + sed -i 's|members = \["crates/\*"\]|members = ["crates/openshell-driver-vm", "crates/openshell-core"]|' Cargo.toml + + - name: Patch workspace version + if: needs.compute-versions.outputs.cargo_version != '' + run: | + set -euo pipefail + sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ needs.compute-versions.outputs.cargo_version }}"'"/}' Cargo.toml + + - name: Build openshell-driver-vm + run: | + set -euo pipefail + OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${PWD}/target/vm-runtime-compressed" \ + mise x -- cargo build --release -p openshell-driver-vm + + - name: sccache stats + if: always() + run: mise x -- sccache --show-stats + + - name: Package binary + run: | + set -euo pipefail + mkdir -p artifacts + tar -czf "artifacts/openshell-driver-vm-${{ matrix.target }}.tar.gz" \ + -C 
target/release openshell-driver-vm + ls -lh artifacts/ + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: driver-vm-linux-${{ matrix.arch }} + path: artifacts/*.tar.gz + retention-days: 5 + + # --------------------------------------------------------------------------- + # Build openshell-driver-vm binary (macOS ARM64 via osxcross) + # --------------------------------------------------------------------------- + build-driver-vm-macos: + name: Build Driver VM (macOS) + needs: [compute-versions, download-kernel-runtime, build-rootfs] + runs-on: build-amd64 + timeout-minutes: 60 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + volumes: + - /var/run/docker.sock:/var/run/docker.sock + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SCCACHE_MEMCACHED_ENDPOINT: ${{ vars.SCCACHE_MEMCACHED_ENDPOINT }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Mark workspace safe for git + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Fetch tags + run: git fetch --tags --force + + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Set up Docker Buildx + uses: ./.github/actions/setup-buildx + + - name: Install zstd + run: apt-get update && apt-get install -y --no-install-recommends zstd && rm -rf /var/lib/apt/lists/* + + - name: Download kernel runtime tarball + uses: actions/download-artifact@v4 + with: + name: kernel-runtime-tarballs + path: runtime-download/ + + - name: Download rootfs tarball (arm64) + uses: actions/download-artifact@v4 + with: + name: rootfs-arm64 + path: rootfs-download/ + + - name: Prepare compressed runtime directory + run: | + set -euo pipefail + COMPRESSED_DIR="${PWD}/target/vm-runtime-compressed-macos" + mkdir -p "$COMPRESSED_DIR" + + # Extract the darwin 
runtime tarball and re-compress for embedding. + # The macOS embedded.rs expects: libkrun.dylib.zst, libkrunfw.5.dylib.zst, gvproxy.zst + EXTRACT_DIR=$(mktemp -d) + zstd -d "runtime-download/vm-runtime-darwin-aarch64.tar.zst" --stdout \ + | tar -xf - -C "$EXTRACT_DIR" + + echo "Extracted darwin runtime files:" + ls -lah "$EXTRACT_DIR" + + for file in "$EXTRACT_DIR"/*; do + [ -f "$file" ] || continue + name=$(basename "$file") + [ "$name" = "provenance.json" ] && continue + zstd -19 -f -q -T0 -o "${COMPRESSED_DIR}/${name}.zst" "$file" + done + + # The macOS VM guest is always Linux ARM64, so use the arm64 rootfs + cp rootfs-download/rootfs.tar.zst "${COMPRESSED_DIR}/rootfs.tar.zst" + + echo "Staged macOS compressed artifacts:" + ls -lah "$COMPRESSED_DIR" + + - name: Build macOS binary via Docker (osxcross) + run: | + set -euo pipefail + docker buildx build \ + --file deploy/docker/Dockerfile.driver-vm-macos \ + --build-arg OPENSHELL_CARGO_VERSION="${{ needs.compute-versions.outputs.cargo_version }}" \ + --build-arg OPENSHELL_IMAGE_TAG=dev \ + --build-arg CARGO_TARGET_CACHE_SCOPE="${{ github.sha }}" \ + --build-context vm-runtime-compressed="${PWD}/target/vm-runtime-compressed-macos" \ + --target binary \ + --output type=local,dest=out/ \ + . 
+ + - name: Package binary + run: | + set -euo pipefail + mkdir -p artifacts + tar -czf artifacts/openshell-driver-vm-aarch64-apple-darwin.tar.gz \ + -C out openshell-driver-vm + ls -lh artifacts/ + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: driver-vm-macos + path: artifacts/*.tar.gz + retention-days: 5 + # --------------------------------------------------------------------------- # Upload all VM binaries to the vm-dev rolling release # --------------------------------------------------------------------------- release-vm-dev: name: Release VM Dev - needs: [build-vm-linux, build-vm-macos] + needs: + - build-vm-linux + - build-vm-macos + - build-driver-vm-linux + - build-driver-vm-macos runs-on: build-amd64 timeout-minutes: 10 steps: @@ -430,7 +671,7 @@ jobs: - name: Download all VM binary artifacts uses: actions/download-artifact@v4 with: - pattern: vm-* + pattern: "{vm-*,driver-vm-*}" path: release/ merge-multiple: true @@ -438,21 +679,30 @@ jobs: run: | set -euo pipefail mkdir -p release-final - # Only include the openshell-vm binary tarballs, not kernel runtime - cp release/openshell-vm-*.tar.gz release-final/ - count=$(ls release-final/openshell-vm-*.tar.gz 2>/dev/null | wc -l) - if [ "$count" -eq 0 ]; then - echo "ERROR: No VM binary tarballs found in release/" >&2 + # Include both openshell-vm and openshell-driver-vm binary tarballs. + # Exclude kernel runtime tarballs (they come from release-vm-kernel.yml). 
+ for pattern in 'openshell-vm-*.tar.gz' 'openshell-driver-vm-*.tar.gz'; do + for file in release/${pattern}; do + [ -f "$file" ] || continue + cp "$file" release-final/ + done + done + vm_count=$(ls release-final/openshell-vm-*.tar.gz 2>/dev/null | wc -l) + driver_count=$(ls release-final/openshell-driver-vm-*.tar.gz 2>/dev/null | wc -l) + if [ "$vm_count" -eq 0 ] || [ "$driver_count" -eq 0 ]; then + echo "ERROR: Missing binary tarballs (openshell-vm=${vm_count}, openshell-driver-vm=${driver_count})" >&2 + ls -la release/ || true exit 1 fi - echo "Release artifacts (${count} binaries):" + echo "Release artifacts (openshell-vm=${vm_count}, openshell-driver-vm=${driver_count}):" ls -lh release-final/ - name: Generate checksums run: | set -euo pipefail cd release-final - sha256sum openshell-vm-*.tar.gz > vm-binary-checksums-sha256.txt + sha256sum openshell-vm-*.tar.gz openshell-driver-vm-*.tar.gz \ + > vm-binary-checksums-sha256.txt cat vm-binary-checksums-sha256.txt - name: Ensure vm-dev tag exists @@ -479,7 +729,11 @@ jobs: } // Delete old VM binary assets (keep kernel runtime assets) for (const asset of release.data.assets) { - if (asset.name.startsWith('openshell-vm-') || asset.name === 'vm-binary-checksums-sha256.txt') { + if ( + asset.name.startsWith('openshell-vm-') || + asset.name.startsWith('openshell-driver-vm-') || + asset.name === 'vm-binary-checksums-sha256.txt' + ) { core.info(`Deleting stale asset: ${asset.name}`); await github.rest.repos.deleteReleaseAsset({ owner, repo, asset_id: asset.id }); } @@ -520,6 +774,18 @@ jobs: | Linux x86_64 | `openshell-vm-x86_64-unknown-linux-gnu.tar.gz` | | macOS ARM64 | `openshell-vm-aarch64-apple-darwin.tar.gz` | + ### VM Compute Driver Binaries + + `openshell-driver-vm` binaries with embedded kernel runtime and sandbox rootfs. + Launched by the gateway when `--drivers=vm` is configured. Rebuilt on every + push to main alongside the openshell-vm binaries. 
+ + | Platform | Artifact | + |----------|----------| + | Linux ARM64 | `openshell-driver-vm-aarch64-unknown-linux-gnu.tar.gz` | + | Linux x86_64 | `openshell-driver-vm-x86_64-unknown-linux-gnu.tar.gz` | + | macOS ARM64 | `openshell-driver-vm-aarch64-apple-darwin.tar.gz` | + ### Quick install ``` @@ -532,4 +798,7 @@ jobs: release-final/openshell-vm-aarch64-unknown-linux-gnu.tar.gz release-final/openshell-vm-x86_64-unknown-linux-gnu.tar.gz release-final/openshell-vm-aarch64-apple-darwin.tar.gz + release-final/openshell-driver-vm-aarch64-unknown-linux-gnu.tar.gz + release-final/openshell-driver-vm-x86_64-unknown-linux-gnu.tar.gz + release-final/openshell-driver-vm-aarch64-apple-darwin.tar.gz release-final/vm-binary-checksums-sha256.txt diff --git a/AGENTS.md b/AGENTS.md index a6cbd29ce..8968c5296 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -40,6 +40,8 @@ These pipelines connect skills into end-to-end workflows. Individual skill files | `crates/openshell-providers/` | Provider management | Credential provider backends | | `crates/openshell-tui/` | Terminal UI | Ratatui-based dashboard for monitoring | | `crates/openshell-vm/` | MicroVM runtime | Experimental, work-in-progress libkrun-based VM execution | +| `crates/openshell-driver-kubernetes/` | Kubernetes compute driver | In-process `ComputeDriver` backend for K8s sandbox pods | +| `crates/openshell-driver-vm/` | VM compute driver | Standalone libkrun-backed `ComputeDriver` subprocess (embeds its own rootfs + runtime) | | `python/openshell/` | Python SDK | Python bindings and CLI packaging | | `proto/` | Protobuf definitions | gRPC service contracts | | `deploy/` | Docker, Helm, K8s | Dockerfiles, Helm chart, manifests | diff --git a/Cargo.lock b/Cargo.lock index e4057f75c..4b29a0c7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3090,6 +3090,28 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "openshell-driver-vm" +version = "0.0.0" +dependencies = [ + "clap", + "futures", + "libc", + "libloading", + 
"miette", + "nix", + "openshell-core", + "prost-types", + "tar", + "tokio", + "tokio-stream", + "tonic", + "tracing", + "tracing-subscriber", + "url", + "zstd", +] + [[package]] name = "openshell-ocsf" version = "0.0.0" @@ -3245,6 +3267,7 @@ dependencies = [ "tower-http 0.6.8", "tracing", "tracing-subscriber", + "url", "uuid", "wiremock", ] diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index ce4d0bf39..548b86d17 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -80,6 +80,23 @@ The embedded rootfs uses a "minimal" configuration: Container images are pulled on demand when sandboxes are created. First boot takes ~30-60s as k3s initializes; subsequent boots use cached state for ~3-5s startup. +For the VM compute driver, the same embedded rootfs is rewritten into a +supervisor-only sandbox guest before boot: + +- removes k3s state and Kubernetes manifests from the extracted rootfs +- installs `/srv/openshell-vm-sandbox-init.sh` +- boots directly into `openshell-sandbox` instead of `openshell-vm-init.sh` +- keeps the same embedded libkrun/libkrunfw kernel/runtime bundle + +`openshell-driver-vm` now embeds the sandbox rootfs tarball independently so it can +prepare sandbox guests without linking against the `openshell-vm` Rust crate. +It now also embeds the minimal libkrun/libkrunfw bundle it needs for sandbox +boots and launches sandbox guests via a hidden helper mode in the +`openshell-driver-vm` binary itself, without depending on the `openshell-vm` +binary. The helper still starts its own embedded `gvproxy` instance to provide +virtio-net guest egress plus the single inbound SSH port forward used by the +compute driver. 
+ For fully air-gapped environments requiring pre-loaded images, build with: ```bash mise run vm:rootfs # Full rootfs (~2GB, includes images) diff --git a/architecture/gateway.md b/architecture/gateway.md index 9677dfc47..02f487050 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -72,6 +72,7 @@ graph TD | Persistence: Postgres | `crates/openshell-server/src/persistence/postgres.rs` | `PostgresStore` with sqlx | | Compute runtime | `crates/openshell-server/src/compute/mod.rs` | `ComputeRuntime`, gateway-owned sandbox lifecycle orchestration over a compute backend | | Compute driver: Kubernetes | `crates/openshell-driver-kubernetes/src/driver.rs` | Kubernetes CRD create/delete, endpoint resolution, watch stream, pod template translation | +| Compute driver: VM | `crates/openshell-driver-vm/src/driver.rs` | Per-sandbox microVM create/delete, localhost endpoint resolution, watch stream, supervisor-only guest boot | | Sandbox index | `crates/openshell-server/src/sandbox_index.rs` | `SandboxIndex` -- in-memory name/pod-to-id correlation | | Watch bus | `crates/openshell-server/src/sandbox_watch.rs` | `SandboxWatchBus` -- in-memory broadcast for persisted sandbox updates | | Tracing bus | `crates/openshell-server/src/tracing_bus.rs` | `TracingLogBus` -- captures tracing events keyed by `sandbox_id` | @@ -96,7 +97,9 @@ The gateway boots in `main()` (`crates/openshell-server/src/main.rs`) and procee 4. **Build `Config`** -- Assembles a `openshell_core::Config` from the parsed arguments. 5. **Call `run_server()`** (`crates/openshell-server/src/lib.rs`): 1. Connect to the persistence store (`Store::connect`), which auto-detects SQLite vs Postgres from the URL prefix and runs migrations. - 2. Create `ComputeRuntime` with an in-process `ComputeDriverService` backed by `KubernetesComputeDriver`, so the gateway calls the `openshell.compute.v1.ComputeDriver` RPC surface even without transport. + 2. 
Create `ComputeRuntime` with a `ComputeDriver` implementation selected by `OPENSHELL_DRIVERS`: + - `kubernetes` wraps `KubernetesComputeDriver` in `ComputeDriverService`, so the gateway uses the `openshell.compute.v1.ComputeDriver` RPC surface even without transport. + - `vm` spawns the sibling `openshell-driver-vm` binary as a local compute-driver process, connects to it over a Unix domain socket, and keeps the libkrun/rootfs runtime out of the gateway binary. 3. Build `ServerState` (shared via `Arc` across all handlers). 4. **Spawn background tasks**: - `ComputeRuntime::spawn_watchers` -- consumes the compute-driver watch stream, republishes platform events, and runs a periodic `ListSandboxes` snapshot reconcile so the store-backed public sandbox reads stay aligned with the compute driver. @@ -123,6 +126,15 @@ All configuration is via CLI flags with environment variable fallbacks. The `--d | `--sandbox-namespace` | `OPENSHELL_SANDBOX_NAMESPACE` | `default` | Kubernetes namespace for sandbox CRDs | | `--sandbox-image` | `OPENSHELL_SANDBOX_IMAGE` | None | Default container image for sandbox pods | | `--grpc-endpoint` | `OPENSHELL_GRPC_ENDPOINT` | None | gRPC endpoint reachable from within the cluster (for sandbox callbacks) | +| `--drivers` | `OPENSHELL_DRIVERS` | `kubernetes` | Compute backend to use. Current options are `kubernetes` and `vm`. 
| +| `--vm-driver-state-dir` | `OPENSHELL_VM_DRIVER_STATE_DIR` | `target/openshell-vm-driver` | Host directory for VM sandbox rootfs, console logs, and runtime state | +| `--vm-compute-driver-bin` | `OPENSHELL_VM_COMPUTE_DRIVER_BIN` | sibling `openshell-driver-vm` binary | Local VM compute-driver process spawned by the gateway | +| `--vm-krun-log-level` | `OPENSHELL_VM_KRUN_LOG_LEVEL` | `1` | libkrun log level for VM helper processes | +| `--vm-driver-vcpus` | `OPENSHELL_VM_DRIVER_VCPUS` | `2` | Default vCPU count for VM sandboxes | +| `--vm-driver-mem-mib` | `OPENSHELL_VM_DRIVER_MEM_MIB` | `2048` | Default memory allocation for VM sandboxes in MiB | +| `--vm-tls-ca` | `OPENSHELL_VM_TLS_CA` | None | CA cert copied into VM guests for gateway mTLS | +| `--vm-tls-cert` | `OPENSHELL_VM_TLS_CERT` | None | Client cert copied into VM guests for gateway mTLS | +| `--vm-tls-key` | `OPENSHELL_VM_TLS_KEY` | None | Client private key copied into VM guests for gateway mTLS | | `--ssh-gateway-host` | `OPENSHELL_SSH_GATEWAY_HOST` | `127.0.0.1` | Public hostname returned in SSH session responses | | `--ssh-gateway-port` | `OPENSHELL_SSH_GATEWAY_PORT` | `8080` | Public port returned in SSH session responses | | `--ssh-connect-path` | `OPENSHELL_SSH_CONNECT_PATH` | `/connect/ssh` | HTTP path for SSH CONNECT/upgrade | @@ -544,6 +556,17 @@ The Kubernetes driver also watches namespace-scoped Kubernetes `Event` objects a Matched events are published to the `PlatformEventBus` as `SandboxStreamEvent::Event` payloads. +## VM Driver + +`VmDriver` (`crates/openshell-driver-vm/src/driver.rs`) is served by the standalone `openshell-driver-vm` process. The gateway spawns that binary on demand, talks to it over the internal `openshell.compute.v1.ComputeDriver` gRPC contract via a Unix domain socket, and keeps VM runtime dependencies out of `openshell-server`. 
+ +- **Create**: The VM driver process allocates a localhost SSH port, prepares a sandbox-specific rootfs from its own embedded `rootfs.tar.zst`, injects an explicitly configured guest mTLS bundle when the gateway callback endpoint is `https://`, then re-execs itself in a hidden helper mode that loads libkrun directly and boots `/srv/openshell-vm-sandbox-init.sh`. +- **Networking**: The helper starts an embedded `gvproxy`, wires it into libkrun as virtio-net, and exposes the single inbound SSH port (`host_port:2222`) through gvproxy’s forwarder API. This keeps VM launch inside `openshell-driver-vm` without depending on the `openshell-vm` binary. +- **Gateway callback**: The guest init script configures `eth0` for gvproxy networking, prefers the configured `OPENSHELL_GRPC_ENDPOINT`, and falls back to host aliases or the gvproxy gateway IP (`192.168.127.1`) when local hostname resolution is unavailable on macOS. +- **Guest boot**: The sandbox guest runs a minimal init script that skips k3s and starts `openshell-sandbox` directly as PID 1 inside the VM. +- **Endpoint resolution**: Returns `127.0.0.1:<port>` for SSH/exec transport. +- **Watch stream**: Emits provisioning, ready, error, deleting, deleted, and platform-event updates so the gateway store remains the durable source of truth. 
+ ## Sandbox Index `SandboxIndex` (`crates/openshell-server/src/sandbox_index.rs`) maintains two in-memory maps protected by an `RwLock`: diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index 279752b4f..01b5c2372 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -14,6 +14,7 @@ use std::str::FromStr; #[serde(rename_all = "snake_case")] pub enum ComputeDriverKind { Kubernetes, + Vm, Podman, } @@ -22,6 +23,7 @@ impl ComputeDriverKind { pub const fn as_str(self) -> &'static str { match self { Self::Kubernetes => "kubernetes", + Self::Vm => "vm", Self::Podman => "podman", } } @@ -39,9 +41,10 @@ impl FromStr for ComputeDriverKind { fn from_str(value: &str) -> Result { match value.trim().to_ascii_lowercase().as_str() { "kubernetes" => Ok(Self::Kubernetes), + "vm" => Ok(Self::Vm), "podman" => Ok(Self::Podman), other => Err(format!( - "unsupported compute driver '{other}'. expected one of: kubernetes, podman" + "unsupported compute driver '{other}'. expected one of: kubernetes, vm, podman" )), } } @@ -358,6 +361,10 @@ mod tests { "kubernetes".parse::().unwrap(), ComputeDriverKind::Kubernetes ); + assert_eq!( + "vm".parse::().unwrap(), + ComputeDriverKind::Vm + ); assert_eq!( "podman".parse::().unwrap(), ComputeDriverKind::Podman diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml new file mode 100644 index 000000000..368716ef9 --- /dev/null +++ b/crates/openshell-driver-vm/Cargo.toml @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-driver-vm" +description = "MicroVM compute driver for OpenShell" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true + +[lib] +name = "openshell_driver_vm" +path = "src/lib.rs" + +[[bin]] +name = "openshell-driver-vm" +path = "src/main.rs" + +[dependencies] +openshell-core = { path = "../openshell-core" } + +tokio = { workspace = true } +tonic = { workspace = true, features = ["transport"] } +prost-types = { workspace = true } +futures = { workspace = true } +tokio-stream = { workspace = true, features = ["net"] } +nix = { workspace = true } +clap = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } +miette = { workspace = true } +url = { workspace = true } +libc = "0.2" +libloading = "0.8" +tar = "0.4" +zstd = "0.13" + +[lints] +workspace = true diff --git a/crates/openshell-driver-vm/Makefile b/crates/openshell-driver-vm/Makefile new file mode 100644 index 000000000..e1c360f3d --- /dev/null +++ b/crates/openshell-driver-vm/Makefile @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +.PHONY: start + +start: + ./start.sh diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md new file mode 100644 index 000000000..6b874eb10 --- /dev/null +++ b/crates/openshell-driver-vm/README.md @@ -0,0 +1,151 @@ +# openshell-driver-vm + +> Status: Experimental. The VM compute driver is under active development and the interface still has VM-specific plumbing that will be generalized. + +Standalone libkrun-backed [`ComputeDriver`](../../proto/compute_driver.proto) for OpenShell. 
The gateway spawns this binary as a subprocess, talks to it over a Unix domain socket with the `openshell.compute.v1.ComputeDriver` gRPC surface, and lets it manage per-sandbox microVMs. The runtime (libkrun + libkrunfw + gvproxy) and sandbox rootfs are embedded directly in the binary — no sibling files required at runtime. + +## How it fits together + +```mermaid +flowchart LR + subgraph host["Host process"] + gateway["openshell-server
(compute::vm::spawn)"] + driver["openshell-driver-vm
├── libkrun (VM)
├── gvproxy (net)
└── rootfs.tar.zst"] + gateway <-->|"gRPC over UDS
compute-driver.sock"| driver + end + + subgraph guest["Per-sandbox microVM"] + init["/srv/openshell-vm-
sandbox-init.sh"] + supervisor["/opt/openshell/bin/
openshell-sandbox
(PID 1)"] + init --> supervisor + end + + driver -->|"CreateSandbox
boots via libkrun"| guest + supervisor -.->|"gRPC callback
--grpc-endpoint"| gateway + + client["openshell-cli"] -->|"SSH proxy
127.0.0.1:<port>"| supervisor + client -->|"CreateSandbox / Watch"| gateway +``` + +Sandbox guests execute `/opt/openshell/bin/openshell-sandbox` as PID 1 inside the VM. gvproxy exposes a single inbound SSH port (`host:<port>` → `guest:2222`) and provides virtio-net egress. + +## Quick start (recommended) + +`start.sh` handles runtime setup, builds, codesigning, and environment wiring. From the repo root: + +```shell +crates/openshell-driver-vm/start.sh +``` + +or equivalently: + +```shell +make -C crates/openshell-driver-vm start +``` + +First run takes a few minutes while `mise run vm:setup` stages libkrun/libkrunfw/gvproxy and `mise run vm:rootfs -- --base` builds the embedded rootfs. Subsequent runs are cached. State lives under `target/openshell-vm-driver-dev/` (SQLite DB + per-sandbox rootfs + `compute-driver.sock`). + +Override via environment: + +```shell +OPENSHELL_SERVER_PORT=9090 \ +OPENSHELL_SSH_HANDSHAKE_SECRET=$(openssl rand -hex 32) \ +crates/openshell-driver-vm/start.sh +``` + +Teardown: + +```shell +rm -rf target/openshell-vm-driver-dev +``` + +## Manual equivalent + +If you want to drive the launch yourself instead of using `start.sh`: + +```shell +# 1. Stage runtime artifacts + base rootfs into target/vm-runtime-compressed/ +mise run vm:setup +mise run vm:rootfs -- --base # if rootfs.tar.zst is not already present + +# 2. Build both binaries with the staged artifacts embedded +OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed \ + cargo build -p openshell-server -p openshell-driver-vm + +# 3. macOS only: codesign the driver for Hypervisor.framework +codesign \ + --entitlements crates/openshell-driver-vm/entitlements.plist \ + --force -s - target/debug/openshell-driver-vm + +# 4. 
Start the gateway with the VM driver +mkdir -p target/openshell-vm-driver-dev +target/debug/openshell-gateway \ + --drivers vm \ + --disable-tls \ + --database-url sqlite:target/openshell-vm-driver-dev/openshell.db \ + --grpc-endpoint http://host.containers.internal:8080 \ + --ssh-handshake-secret dev-vm-driver-secret \ + --ssh-gateway-host 127.0.0.1 \ + --ssh-gateway-port 8080 \ + --vm-driver-state-dir $PWD/target/openshell-vm-driver-dev +``` + +The gateway discovers `openshell-driver-vm` as a sibling of its own binary. Pass `--vm-compute-driver-bin /path/to/openshell-driver-vm` (or set `OPENSHELL_VM_COMPUTE_DRIVER_BIN`) to override. + +## Flags + +| Flag | Env var | Default | Purpose | +|---|---|---|---| +| `--drivers vm` | `OPENSHELL_DRIVERS` | `kubernetes` | Select the VM compute driver. | +| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest calls back to. Use a host alias that resolves to the gateway's host from inside the VM (gvproxy answers `host.containers.internal` and `host.openshell.internal` to `192.168.127.1`). | +| `--vm-driver-state-dir DIR` | `OPENSHELL_VM_DRIVER_STATE_DIR` | `target/openshell-vm-driver` | Per-sandbox rootfs, console logs, and the `compute-driver.sock` UDS. | +| `--vm-compute-driver-bin PATH` | `OPENSHELL_VM_COMPUTE_DRIVER_BIN` | sibling of gateway binary | Override the driver binary path. | +| `--vm-driver-vcpus N` | `OPENSHELL_VM_DRIVER_VCPUS` | `2` | vCPUs per sandbox. | +| `--vm-driver-mem-mib N` | `OPENSHELL_VM_DRIVER_MEM_MIB` | `2048` | Memory per sandbox, in MiB. | +| `--vm-krun-log-level N` | `OPENSHELL_VM_KRUN_LOG_LEVEL` | `1` | libkrun verbosity (0–5). | +| `--vm-tls-ca PATH` | `OPENSHELL_VM_TLS_CA` | — | CA cert for the guest's mTLS client bundle. Required when `--grpc-endpoint` uses `https://`. | +| `--vm-tls-cert PATH` | `OPENSHELL_VM_TLS_CERT` | — | Guest client certificate. | +| `--vm-tls-key PATH` | `OPENSHELL_VM_TLS_KEY` | — | Guest client private key. 
| + +See [`openshell-gateway --help`](../openshell-server/src/cli.rs) for the full flag surface shared with the Kubernetes driver. + +## Verifying the gateway + +In another terminal: + +```shell +export OPENSHELL_GATEWAY_URL=http://127.0.0.1:8080 +cargo run -p openshell-cli -- gateway register local --url $OPENSHELL_GATEWAY_URL --no-tls +cargo run -p openshell-cli -- sandbox create --name demo +cargo run -p openshell-cli -- sandbox connect demo +``` + +First sandbox takes 10–30 seconds to boot (rootfs extraction + libkrun + guest init). Subsequent creates reuse the prepared sandbox rootfs. + +## Logs and debugging + +Raise log verbosity for both processes: + +```shell +RUST_LOG=openshell_server=debug,openshell_driver_vm=debug \ + crates/openshell-driver-vm/start.sh +``` + +The VM guest's serial console is appended to `<state-dir>/<sandbox-id>/console.log`. The `compute-driver.sock` lives at `<state-dir>/compute-driver.sock`; the gateway removes it on clean shutdown via `ManagedDriverProcess::drop`. + +## Prerequisites + +- macOS on Apple Silicon, or Linux on aarch64/x86_64 with KVM +- Rust toolchain +- [mise](https://mise.jdx.dev/) task runner +- Docker (needed by `mise run vm:rootfs` to build the base rootfs) +- `gh` CLI (used by `mise run vm:setup` to download pre-built runtime artifacts) + +## Relationship to `openshell-vm` + +`openshell-vm` is a separate, legacy crate that runs the **whole OpenShell gateway inside a single VM**. `openshell-driver-vm` is the compute driver called by a host-resident gateway to spawn **per-sandbox VMs**. Both embed libkrun but share no Rust code — the driver vendors its own rootfs handling and runtime loader so `openshell-server` never has to link libkrun. + +## TODOs + +- The gateway still configures the driver via CLI args; this will move to a gRPC bootstrap call so the driver interface is uniform across backends. See the `TODO(driver-abstraction)` notes in `crates/openshell-server/src/lib.rs` and `crates/openshell-server/src/compute/vm.rs`. 
+- macOS codesigning is handled by `start.sh`; a packaged release would need signing in CI.
diff --git a/crates/openshell-driver-vm/build.rs b/crates/openshell-driver-vm/build.rs
new file mode 100644
index 000000000..174a90fc8
--- /dev/null
+++ b/crates/openshell-driver-vm/build.rs
@@ -0,0 +1,140 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Build script for openshell-driver-vm.
+//!
+//! This crate embeds the sandbox rootfs plus the minimal libkrun runtime
+//! artifacts it needs to boot base VMs without depending on the openshell-vm
+//! binary or crate.
+
+use std::path::{Path, PathBuf};
+use std::{env, fs};
+
+/// Stages the compressed VM runtime artifacts into `OUT_DIR`.
+///
+/// Artifacts are copied from `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR`. When the
+/// variable is unset, the directory is missing, or an artifact is absent, an
+/// empty stub file is written for each artifact that is not already present
+/// in `OUT_DIR`, and a cargo warning points at `mise run vm:setup`.
+fn main() {
+    println!("cargo:rerun-if-env-changed=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR");
+
+    // Read the variable once; it drives both change tracking and staging.
+    let compressed_dir_var = env::var("OPENSHELL_VM_RUNTIME_COMPRESSED_DIR").ok();
+
+    if let Some(dir) = compressed_dir_var.as_deref() {
+        println!("cargo:rerun-if-changed={dir}");
+        // Both the Linux and macOS artifact names are tracked here because
+        // the target OS has not been resolved yet at this point.
+        for name in [
+            "libkrun.so.zst",
+            "libkrunfw.so.5.zst",
+            "libkrun.dylib.zst",
+            "libkrunfw.5.dylib.zst",
+            "gvproxy.zst",
+            "rootfs.tar.zst",
+        ] {
+            println!("cargo:rerun-if-changed={dir}/{name}");
+        }
+    }
+
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set"));
+    let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
+    let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
+
+    let (libkrun_name, libkrunfw_name) = match target_os.as_str() {
+        "macos" => ("libkrun.dylib", "libkrunfw.5.dylib"),
+        "linux" => ("libkrun.so", "libkrunfw.so.5"),
+        _ => {
+            println!("cargo:warning=VM runtime not available for {target_os}-{target_arch}");
+            // NOTE(review): these stub names carry no `.zst` suffix and omit
+            // gvproxy, unlike every other path below; confirm they match what
+            // the crate embeds on unsupported targets.
+            generate_stub_resources(&out_dir, &["libkrun", "libkrunfw", "rootfs.tar.zst"]);
+            return;
+        }
+    };
+
+    // The artifacts embedded for this target. Every fallback path below stubs
+    // exactly this set, so it is built once.
+    let artifacts = [
+        format!("{libkrun_name}.zst"),
+        format!("{libkrunfw_name}.zst"),
+        "gvproxy.zst".to_string(),
+        "rootfs.tar.zst".to_string(),
+    ];
+    let artifact_names: Vec<&str> = artifacts.iter().map(String::as_str).collect();
+
+    let Some(dir) = compressed_dir_var else {
+        println!("cargo:warning=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR not set");
+        println!("cargo:warning=Run: mise run vm:setup");
+        generate_stub_resources(&out_dir, &artifact_names);
+        return;
+    };
+    let compressed_dir = PathBuf::from(dir);
+
+    if !compressed_dir.is_dir() {
+        println!(
+            "cargo:warning=Compressed runtime dir not found: {}",
+            compressed_dir.display()
+        );
+        println!("cargo:warning=Run: mise run vm:setup");
+        generate_stub_resources(&out_dir, &artifact_names);
+        return;
+    }
+
+    let mut all_found = true;
+    for name in &artifacts {
+        let src_path = compressed_dir.join(name);
+        let dst_path = out_dir.join(name);
+
+        if !src_path.exists() {
+            println!(
+                "cargo:warning=Missing compressed artifact: {}",
+                src_path.display()
+            );
+            all_found = false;
+            continue;
+        }
+
+        // Remove any previous copy first; a failure here surfaces through
+        // the `fs::copy` below.
+        if dst_path.exists() {
+            let _ = fs::remove_file(&dst_path);
+        }
+        fs::copy(&src_path, &dst_path).unwrap_or_else(|e| {
+            panic!(
+                "Failed to copy {} to {}: {}",
+                src_path.display(),
+                dst_path.display(),
+                e
+            )
+        });
+        let size = fs::metadata(&dst_path).map(|m| m.len()).unwrap_or(0);
+        println!("cargo:warning=Embedded {name}: {size} bytes");
+    }
+
+    if !all_found {
+        println!("cargo:warning=Some artifacts missing. Run: mise run vm:setup");
+        generate_stub_resources(&out_dir, &artifact_names);
+    }
+}
+
+/// Writes an empty placeholder for each name in `names` under `out_dir`.
+///
+/// Existing files are left untouched, so artifacts copied earlier in the same
+/// run are preserved.
+/// NOTE(review): this also keeps any file left in `OUT_DIR` by a previous
+/// build; confirm that is intended.
+///
+/// # Panics
+///
+/// Panics if a stub file cannot be written.
+fn generate_stub_resources(out_dir: &Path, names: &[&str]) {
+    for name in names {
+        let path = out_dir.join(name);
+        if !path.exists() {
+            fs::write(&path, b"")
+                .unwrap_or_else(|e| panic!("Failed to write stub {}: {}", path.display(), e));
+        }
+    }
+}
diff --git a/crates/openshell-driver-vm/entitlements.plist b/crates/openshell-driver-vm/entitlements.plist
new file mode 100644
index 000000000..154f3308e
--- /dev/null
+++ b/crates/openshell-driver-vm/entitlements.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>com.apple.security.hypervisor</key>
+    <true/>
+</dict>
+</plist>
diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
new file mode 100644
index 000000000..70dda5acb
--- /dev/null
+++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
@@ -0,0 +1,188 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Minimal init for sandbox VMs. Runs as PID 1 inside the guest, mounts the
+# essential filesystems, configures gvproxy networking when present, then
+# execs the OpenShell sandbox supervisor.
+
+set -euo pipefail
+
+# Print the current time in milliseconds since the epoch.
+#
+# `date +%s%3N` needs sub-second support in `date`. When it is missing the
+# command may fail or print non-numeric text, either of which would break the
+# arithmetic in ts() and, under `set -e`, abort PID 1. Fall back to whole
+# seconds scaled to milliseconds so callers always get one unit.
+now_ms() {
+    local stamp
+    stamp=$(date +%s%3N 2>/dev/null) || stamp=""
+    case "$stamp" in
+        '' | *[!0-9]*)
+            stamp=$(($(date +%s) * 1000))
+            ;;
+    esac
+    printf '%s\n' "$stamp"
+}
+
+BOOT_START=$(now_ms)
+
+# Log a message prefixed with the time elapsed since boot as [S.mmm s].
+ts() {
+    local now
+    now=$(now_ms)
+    local elapsed=$((now - BOOT_START))
+    printf "[%d.%03ds] %s\n" $((elapsed / 1000)) $((elapsed % 1000)) "$*"
+}
+
+# Split a scheme://authority[/path] endpoint into four output lines:
+# scheme, host, port, path. Returns 1 when the endpoint has no "://".
+# Bracketed IPv6 authorities are unwrapped; a missing port defaults to 443
+# for https and 80 for every other scheme.
+parse_endpoint() {
+    local endpoint="$1"
+    local scheme rest authority path host port
+
+    case "$endpoint" in
+        *://*)
+            scheme="${endpoint%%://*}"
+            rest="${endpoint#*://}"
+            ;;
+        *)
+            return 1
+            ;;
+    esac
+
+    authority="${rest%%/*}"
+    path="${rest#"$authority"}"
+    if [ "$path" = "$rest" ]; then
+        path=""
+    fi
+
+    if [[ "$authority" =~ ^\[([^]]+)\]:(.+)$ ]]; then
+        host="${BASH_REMATCH[1]}"
+        port="${BASH_REMATCH[2]}"
+    elif [[ "$authority" =~ ^\[([^]]+)\]$ ]]; then
+        host="${BASH_REMATCH[1]}"
+        port=""
+    elif [[ "$authority" == *:* ]]; then
+        host="${authority%%:*}"
+        port="${authority##*:}"
+    else
+        host="$authority"
+        port=""
+    fi
+
+    if [ -z "$port" ]; then
+        case "$scheme" in
+            https) port="443" ;;
+            *) port="80" ;;
+        esac
+    fi
+
+    printf '%s\n%s\n%s\n%s\n' "$scheme" "$host" "$port" "$path"
+}
+
+# Return 0 when a TCP connection to host:port can be opened.
+#
+# Host and port are passed to the child shell as positional parameters rather
+# than interpolated into the command string, so a crafted OPENSHELL_ENDPOINT
+# cannot inject shell syntax into the probe.
+tcp_probe() {
+    local host="$1"
+    local port="$2"
+
+    if command -v timeout >/dev/null 2>&1; then
+        timeout 2 bash -c 'exec 3<>"/dev/tcp/$1/$2"' _ "$host" "$port" >/dev/null 2>&1
+    else
+        bash -c 'exec 3<>"/dev/tcp/$1/$2"' _ "$host" "$port" >/dev/null 2>&1
+    fi
+}
+
+# If OPENSHELL_ENDPOINT is set but unreachable, try well-known host aliases on
+# the same port and re-export the endpoint using the first one that answers.
+# Never fails: an unparsable or unreachable endpoint only logs a warning.
+rewrite_openshell_endpoint_if_needed() {
+    local endpoint="${OPENSHELL_ENDPOINT:-}"
+    [ -n "$endpoint" ] || return 0
+
+    local parsed
+    if ! parsed="$(parse_endpoint "$endpoint")"; then
+        ts "WARNING: could not parse OPENSHELL_ENDPOINT=$endpoint"
+        return 0
+    fi
+
+    local scheme host port path
+    scheme="$(printf '%s\n' "$parsed" | sed -n '1p')"
+    host="$(printf '%s\n' "$parsed" | sed -n '2p')"
+    port="$(printf '%s\n' "$parsed" | sed -n '3p')"
+    path="$(printf '%s\n' "$parsed" | sed -n '4p')"
+
+    if tcp_probe "$host" "$port"; then
+        return 0
+    fi
+
+    for candidate in host.containers.internal host.docker.internal 192.168.127.1; do
+        if [ "$candidate" = "$host" ]; then
+            continue
+        fi
+        if tcp_probe "$candidate" "$port"; then
+            local authority="$candidate"
+            # Omit the port only when it is the scheme's default.
+            if ! { [ "$scheme" = "http" ] && [ "$port" = "80" ]; } \
+                && ! { [ "$scheme" = "https" ] && [ "$port" = "443" ]; }; then
+                authority="${authority}:${port}"
+            fi
+            export OPENSHELL_ENDPOINT="${scheme}://${authority}${path}"
+            ts "rewrote OPENSHELL_ENDPOINT to ${OPENSHELL_ENDPOINT}"
+            return 0
+        fi
+    done
+
+    ts "WARNING: could not reach OpenShell endpoint ${host}:${port}"
+}
+
+# First wave of mounts runs in parallel; /dev must exist before the second wave.
+mount -t proc proc /proc 2>/dev/null &
+mount -t sysfs sysfs /sys 2>/dev/null &
+mount -t tmpfs tmpfs /tmp 2>/dev/null &
+mount -t tmpfs tmpfs /run 2>/dev/null &
+mount -t devtmpfs devtmpfs /dev 2>/dev/null &
+wait
+
+mkdir -p /dev/pts /dev/shm /sys/fs/cgroup /sandbox
+mount -t devpts devpts /dev/pts 2>/dev/null &
+mount -t tmpfs tmpfs /dev/shm 2>/dev/null &
+mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null &
+wait
+
+mount -t tmpfs tmpfs /sandbox 2>/dev/null || true
+mkdir -p /sandbox
+chown sandbox:sandbox /sandbox 2>/dev/null || true
+
+hostname openshell-sandbox-vm 2>/dev/null || true
+ip link set lo up 2>/dev/null || true
+
+if ip link show eth0 >/dev/null 2>&1; then
+    ts "detected eth0 (gvproxy networking)"
+    ip link set eth0 up 2>/dev/null || true
+
+    if command -v udhcpc >/dev/null 2>&1; then
+        UDHCPC_SCRIPT="/usr/share/udhcpc/default.script"
+        # Install a minimal lease handler when the rootfs ships none.
+        if [ ! -f "$UDHCPC_SCRIPT" ]; then
+            mkdir -p /usr/share/udhcpc
+            cat > "$UDHCPC_SCRIPT" <<'DHCP_SCRIPT'
+#!/bin/sh
+case "$1" in
+    bound|renew)
+        ip addr flush dev "$interface"
+        ip addr add "$ip/$mask" dev "$interface"
+        if [ -n "$router" ]; then
+            ip route add default via "$router" dev "$interface"
+        fi
+        if [ -n "$dns" ]; then
+            : > /etc/resolv.conf
+            for d in $dns; do
+                echo "nameserver $d" >> /etc/resolv.conf
+            done
+        fi
+        ;;
+esac
+DHCP_SCRIPT
+            chmod +x "$UDHCPC_SCRIPT"
+        fi
+
+        if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then
+            ts "WARNING: DHCP failed, falling back to static config"
+            ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true
+            ip route add default via 192.168.127.1 2>/dev/null || true
+        fi
+    else
+        ts "no DHCP client, using static config"
+        ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true
+        ip route add default via 192.168.127.1 2>/dev/null || true
+    fi
+
+    # Fall back to public resolvers when neither DHCP nor the image set any.
+    if [ ! -s /etc/resolv.conf ]; then
+        echo "nameserver 8.8.8.8" > /etc/resolv.conf
+        echo "nameserver 8.8.4.4" >> /etc/resolv.conf
+    fi
+else
+    ts "WARNING: eth0 not found; supervisor will start without guest egress"
+fi
+
+export HOME=/sandbox
+export USER=sandbox
+
+rewrite_openshell_endpoint_if_needed
+
+ts "starting openshell-sandbox supervisor"
+exec /opt/openshell/bin/openshell-sandbox --workdir /sandbox
diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs
new file mode 100644
index 000000000..3d3fbf4b6
--- /dev/null
+++ b/crates/openshell-driver-vm/src/driver.rs
@@ -0,0 +1,1363 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use crate::{
+    GUEST_SSH_PORT,
+    rootfs::{extract_sandbox_rootfs_to, sandbox_guest_init_path},
+};
+use futures::Stream;
+use nix::errno::Errno;
+use nix::sys::signal::{Signal, kill};
+use nix::unistd::Pid;
+use openshell_core::proto::compute::v1::{
+    CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse,
+    DriverCondition as SandboxCondition, DriverPlatformEvent as PlatformEvent,
+    DriverSandbox as Sandbox, DriverSandboxStatus as SandboxStatus, GetCapabilitiesRequest,
+    GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, ListSandboxesRequest,
+    ListSandboxesResponse, ResolveSandboxEndpointRequest, ResolveSandboxEndpointResponse,
+    SandboxEndpoint, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateRequest,
+    ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, WatchSandboxesEvent,
+    WatchSandboxesPlatformEvent, WatchSandboxesRequest, WatchSandboxesSandboxEvent,
+    compute_driver_server::ComputeDriver, sandbox_endpoint, watch_sandboxes_event,
+};
+use std::collections::{HashMap, HashSet};
+use std::net::{Ipv4Addr, SocketAddr, TcpListener};
+use std::os::unix::fs::PermissionsExt;
+use std::path::{Path, PathBuf};
+use std::pin::Pin;
+use std::process::Stdio;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::net::TcpStream;
+use tokio::process::{Child, Command};
+use tokio::sync::{Mutex, broadcast, mpsc};
+use tokio_stream::wrappers::ReceiverStream;
+use tonic::{Request, Response, Status};
+use url::{Host, Url};
+
+const DRIVER_NAME: &str = "openshell-driver-vm";
+/// Capacity of both the broadcast event channel and each watcher's queue.
+const WATCH_BUFFER: usize = 256;
+const DEFAULT_VCPUS: u8 = 2;
+const DEFAULT_MEM_MIB: u32 = 2048;
+const GUEST_TLS_DIR: &str = "/opt/openshell/tls";
+const GUEST_TLS_CA_PATH: &str = "/opt/openshell/tls/ca.crt";
+const GUEST_TLS_CERT_PATH: &str = "/opt/openshell/tls/tls.crt";
+const GUEST_TLS_KEY_PATH: &str = "/opt/openshell/tls/tls.key";
+
+/// Host-side paths of the TLS materials copied into each guest rootfs.
+#[derive(Debug, Clone)]
+struct VmDriverTlsPaths {
+    ca: PathBuf,
+    cert: PathBuf,
+    key: PathBuf,
+}
+
+/// Configuration for [`VmDriver`].
+#[derive(Debug, Clone)]
+pub struct VmDriverConfig {
+    /// Gateway endpoint handed to sandbox guests; must not be empty.
+    pub openshell_endpoint: String,
+    /// Root directory; per-sandbox state lives under `<state_dir>/sandboxes`.
+    pub state_dir: PathBuf,
+    /// Binary launched for each VM; defaults to the current executable.
+    pub launcher_bin: Option<PathBuf>,
+    pub ssh_handshake_secret: String,
+    pub ssh_handshake_skew_secs: u64,
+    /// Default guest log level, used when the sandbox spec sets none.
+    pub log_level: String,
+    pub krun_log_level: u32,
+    pub vcpus: u8,
+    pub mem_mib: u32,
+    pub guest_tls_ca: Option<PathBuf>,
+    pub guest_tls_cert: Option<PathBuf>,
+    pub guest_tls_key: Option<PathBuf>,
+}
+
+impl Default for VmDriverConfig {
+    fn default() -> Self {
+        Self {
+            openshell_endpoint: String::new(),
+            state_dir: PathBuf::from("target/openshell-vm-driver"),
+            launcher_bin: None,
+            ssh_handshake_secret: String::new(),
+            ssh_handshake_skew_secs: 300,
+            log_level: "info".to_string(),
+            krun_log_level: 1,
+            vcpus: DEFAULT_VCPUS,
+            mem_mib: DEFAULT_MEM_MIB,
+            guest_tls_ca: None,
+            guest_tls_cert: None,
+            guest_tls_key: None,
+        }
+    }
+}
+
+impl VmDriverConfig {
+    /// Returns `true` when the endpoint starts with the literal `https://`.
+    fn requires_tls_materials(&self) -> bool {
+        self.openshell_endpoint.starts_with("https://")
+    }
+
+    /// Resolves the configured TLS material paths.
+    ///
+    /// Returns `Ok(None)` when no material is configured and the endpoint is
+    /// not `https://`. Returns an error when an `https://` endpoint has no
+    /// materials, when only some of the three paths are set, or when a
+    /// configured path is not an existing regular file.
+    fn tls_paths(&self) -> Result<Option<VmDriverTlsPaths>, String> {
+        let provided = [
+            self.guest_tls_ca.as_ref(),
+            self.guest_tls_cert.as_ref(),
+            self.guest_tls_key.as_ref(),
+        ];
+        if provided.iter().all(Option::is_none) {
+            return if self.requires_tls_materials() {
+                Err(
+                    "https:// openshell endpoint requires OPENSHELL_VM_TLS_CA, OPENSHELL_VM_TLS_CERT, and OPENSHELL_VM_TLS_KEY so sandbox VMs can authenticate to the gateway"
+                        .to_string(),
+                )
+            } else {
+                Ok(None)
+            };
+        }
+
+        let Some(ca) = self.guest_tls_ca.clone() else {
+            return Err(
+                "OPENSHELL_VM_TLS_CA is required when TLS materials are configured".to_string(),
+            );
+        };
+        let Some(cert) = self.guest_tls_cert.clone() else {
+            return Err(
+                "OPENSHELL_VM_TLS_CERT is required when TLS materials are configured".to_string(),
+            );
+        };
+        let Some(key) = self.guest_tls_key.clone() else {
+            return Err(
+                "OPENSHELL_VM_TLS_KEY is required when TLS materials are configured".to_string(),
+            );
+        };
+
+        for path in [&ca, &cert, &key] {
+            if !path.is_file() {
+                return Err(format!(
+                    "TLS material '{}' does not exist or is not a file",
+                    path.display()
+                ));
+            }
+        }
+
+        Ok(Some(VmDriverTlsPaths { ca, cert, key }))
+    }
+}
+
+/// Rejects endpoints that cannot be parsed, have no host, or use an
+/// unspecified address (`0.0.0.0` / `::`), which a guest cannot connect to.
+fn validate_openshell_endpoint(endpoint: &str) -> Result<(), String> {
+    let url = Url::parse(endpoint)
+        .map_err(|err| format!("invalid openshell endpoint '{endpoint}': {err}"))?;
+    let Some(host) = url.host() else {
+        return Err(format!("openshell endpoint '{endpoint}' is missing a host"));
+    };
+
+    let invalid_from_vm = match host {
+        Host::Domain(_) => false,
+        Host::Ipv4(ip) => ip.is_unspecified(),
+        Host::Ipv6(ip) => ip.is_unspecified(),
+    };
+
+    if invalid_from_vm {
+        return Err(format!(
+            "openshell endpoint '{endpoint}' is not reachable from sandbox VMs; use a concrete host such as 127.0.0.1, host.containers.internal, or another routable address"
+        ));
+    }
+
+    Ok(())
+}
+
+/// A launched VM helper process plus the flag that tells the monitor task a
+/// delete is in progress.
+#[derive(Debug)]
+struct VmProcess {
+    child: Child,
+    deleting: bool,
+}
+
+/// Registry entry for one sandbox.
+#[derive(Debug)]
+struct SandboxRecord {
+    snapshot: Sandbox,
+    ssh_port: u16,
+    state_dir: PathBuf,
+    process: Arc<Mutex<VmProcess>>,
+}
+
+/// Compute driver that runs each sandbox in its own VM helper process.
+#[derive(Debug, Clone)]
+pub struct VmDriver {
+    config: VmDriverConfig,
+    launcher_bin: PathBuf,
+    /// Sandboxes keyed by sandbox id.
+    registry: Arc<Mutex<HashMap<String, SandboxRecord>>>,
+    events: broadcast::Sender<WatchSandboxesEvent>,
+}
+
+impl VmDriver {
+    /// Validates `config`, creates `<state_dir>/sandboxes`, and resolves the
+    /// launcher binary.
+    pub async fn new(config: VmDriverConfig) -> Result<Self, String> {
+        if config.openshell_endpoint.trim().is_empty() {
+            return Err("openshell endpoint is required".to_string());
+        }
+        validate_openshell_endpoint(&config.openshell_endpoint)?;
+        // Called only for validation; the paths are resolved again per sandbox.
+        let _ = config.tls_paths()?;
+
+        let state_root = config.state_dir.join("sandboxes");
+        tokio::fs::create_dir_all(&state_root)
+            .await
+            .map_err(|err| {
+                format!(
+                    "failed to create state dir '{}': {err}",
+                    state_root.display()
+                )
+            })?;
+
+        let launcher_bin = if let Some(path) = config.launcher_bin.clone() {
+            path
+        } else {
+            std::env::current_exe()
+                .map_err(|err| format!("failed to resolve vm driver executable: {err}"))?
+        };
+
+        let (events, _) = broadcast::channel(WATCH_BUFFER);
+        Ok(Self {
+            config,
+            launcher_bin,
+            registry: Arc::new(Mutex::new(HashMap::new())),
+            events,
+        })
+    }
+
+    /// Static capabilities of this driver: no default image, no GPU support.
+    #[must_use]
+    pub fn capabilities(&self) -> GetCapabilitiesResponse {
+        GetCapabilitiesResponse {
+            driver_name: DRIVER_NAME.to_string(),
+            driver_version: openshell_core::VERSION.to_string(),
+            default_image: String::new(),
+            supports_gpu: false,
+        }
+    }
+
+    /// Checks that `sandbox` only uses features the VM driver supports.
+    pub async fn validate_sandbox(&self, sandbox: &Sandbox) -> Result<(), Status> {
+        validate_vm_sandbox(sandbox)
+    }
+
+    /// Prepares a rootfs for `sandbox`, launches its VM helper process,
+    /// registers it, and starts a background monitor task.
+    ///
+    /// Returns `already_exists` when the id is registered. If preparing the
+    /// rootfs or spawning the helper fails, the per-sandbox state directory
+    /// is removed again so a failed create leaves nothing on disk.
+    pub async fn create_sandbox(&self, sandbox: &Sandbox) -> Result<CreateSandboxResponse, Status> {
+        validate_vm_sandbox(sandbox)?;
+
+        // NOTE(review): the registry lock is released between this check and
+        // the insert below, so two concurrent creates for the same id can
+        // both pass it.
+        if self.registry.lock().await.contains_key(&sandbox.id) {
+            return Err(Status::already_exists("sandbox already exists"));
+        }
+
+        let ssh_port = allocate_local_port()?;
+        let state_dir = sandbox_state_dir(&self.config.state_dir, &sandbox.id);
+        let rootfs = state_dir.join("rootfs");
+
+        tokio::fs::create_dir_all(&state_dir)
+            .await
+            .map_err(|err| Status::internal(format!("create state dir failed: {err}")))?;
+
+        if let Err(status) = self.prepare_sandbox_rootfs(&rootfs).await {
+            // Best-effort cleanup; the original error is what the caller needs.
+            let _ = tokio::fs::remove_dir_all(&state_dir).await;
+            return Err(status);
+        }
+
+        let console_output = state_dir.join("rootfs-console.log");
+        let mut command = Command::new(&self.launcher_bin);
+        command.kill_on_drop(true);
+        command.stdin(Stdio::null());
+        command.stdout(Stdio::inherit());
+        command.stderr(Stdio::inherit());
+        command.arg("--internal-run-vm");
+        command.arg("--vm-rootfs").arg(&rootfs);
+        command.arg("--vm-exec").arg(sandbox_guest_init_path());
+        command.arg("--vm-workdir").arg("/");
+        command.arg("--vm-vcpus").arg(self.config.vcpus.to_string());
+        command
+            .arg("--vm-mem-mib")
+            .arg(self.config.mem_mib.to_string());
+        command
+            .arg("--vm-krun-log-level")
+            .arg(self.config.krun_log_level.to_string());
+        command.arg("--vm-console-output").arg(&console_output);
+        command
+            .arg("--vm-port")
+            .arg(format!("{ssh_port}:{GUEST_SSH_PORT}"));
+        for env in build_guest_environment(sandbox, &self.config) {
+            command.arg("--vm-env").arg(env);
+        }
+
+        let child = match command.spawn() {
+            Ok(child) => child,
+            Err(err) => {
+                let _ = tokio::fs::remove_dir_all(&state_dir).await;
+                return Err(Status::internal(format!(
+                    "failed to launch vm helper '{}': {err}",
+                    self.launcher_bin.display()
+                )));
+            }
+        };
+        let snapshot = sandbox_snapshot(sandbox, provisioning_condition(), false);
+        let process = Arc::new(Mutex::new(VmProcess {
+            child,
+            deleting: false,
+        }));
+
+        {
+            let mut registry = self.registry.lock().await;
+            registry.insert(
+                sandbox.id.clone(),
+                SandboxRecord {
+                    snapshot: snapshot.clone(),
+                    ssh_port,
+                    state_dir: state_dir.clone(),
+                    process: process.clone(),
+                },
+            );
+        }
+
+        self.publish_snapshot(snapshot.clone());
+        tokio::spawn({
+            let driver = self.clone();
+            let sandbox_id = sandbox.id.clone();
+            async move {
+                driver.monitor_sandbox(sandbox_id).await;
+            }
+        });
+
+        Ok(CreateSandboxResponse {})
+    }
+
+    /// Extracts the embedded sandbox rootfs into `rootfs` and, when TLS
+    /// materials are configured, copies them into the guest tree.
+    async fn prepare_sandbox_rootfs(&self, rootfs: &Path) -> Result<(), Status> {
+        let tls_paths = self
+            .config
+            .tls_paths()
+            .map_err(Status::failed_precondition)?;
+        let rootfs_for_extract = rootfs.to_path_buf();
+        // Extraction is blocking work; keep it off the async executor.
+        tokio::task::spawn_blocking(move || extract_sandbox_rootfs_to(&rootfs_for_extract))
+            .await
+            .map_err(|err| Status::internal(format!("sandbox rootfs extraction panicked: {err}")))?
+            .map_err(|err| Status::internal(format!("extract sandbox rootfs failed: {err}")))?;
+        if let Some(tls_paths) = tls_paths.as_ref() {
+            prepare_guest_tls_materials(rootfs, tls_paths)
+                .await
+                .map_err(|err| {
+                    Status::internal(format!("prepare guest TLS materials failed: {err}"))
+                })?;
+        }
+        Ok(())
+    }
+
+    /// Stops the sandbox's VM, removes its state directory, and drops it
+    /// from the registry.
+    ///
+    /// The sandbox is looked up by id first and by name as a fallback.
+    /// Returns `deleted: false` when neither matches.
+    pub async fn delete_sandbox(
+        &self,
+        sandbox_id: &str,
+        sandbox_name: &str,
+    ) -> Result<DeleteSandboxResponse, Status> {
+        let record = {
+            let registry = self.registry.lock().await;
+            registry
+                .get_key_value(sandbox_id)
+                .or_else(|| {
+                    registry
+                        .iter()
+                        .find(|(_, record)| record.snapshot.name == sandbox_name)
+                })
+                .map(|(id, record)| (id.clone(), record.state_dir.clone(), record.process.clone()))
+        };
+
+        let Some((record_id, state_dir, process)) = record else {
+            return Ok(DeleteSandboxResponse { deleted: false });
+        };
+
+        if let Some(snapshot) = self
+            .set_snapshot_condition(&record_id, deleting_condition(), true)
+            .await
+        {
+            self.publish_snapshot(snapshot);
+        }
+
+        {
+            let mut process = process.lock().await;
+            // Set before terminating so the monitor task does not report the
+            // exit as a failure.
+            process.deleting = true;
+            terminate_vm_process(&mut process.child)
+                .await
+                .map_err(|err| Status::internal(format!("failed to stop vm: {err}")))?;
+        }
+
+        if let Err(err) = tokio::fs::remove_dir_all(&state_dir).await
+            && err.kind() != std::io::ErrorKind::NotFound
+        {
+            return Err(Status::internal(format!(
+                "failed to remove state dir: {err}"
+            )));
+        }
+
+        {
+            let mut registry = self.registry.lock().await;
+            registry.remove(&record_id);
+        }
+
+        self.publish_deleted(record_id);
+        Ok(DeleteSandboxResponse { deleted: true })
+    }
+
+    /// Returns the loopback host/port that forwards to the sandbox's guest
+    /// SSH port, looking the sandbox up by id and then by name.
+    pub async fn resolve_endpoint(
+        &self,
+        sandbox: &Sandbox,
+    ) -> Result<ResolveSandboxEndpointResponse, Status> {
+        let registry = self.registry.lock().await;
+        let record = registry.get(&sandbox.id).or_else(|| {
+            registry
+                .values()
+                .find(|record| record.snapshot.name == sandbox.name)
+        });
+        let record = record.ok_or_else(|| Status::not_found("sandbox not found"))?;
+        Ok(ResolveSandboxEndpointResponse {
+            endpoint: Some(SandboxEndpoint {
+                target: Some(sandbox_endpoint::Target::Host("127.0.0.1".to_string())),
+                port: u32::from(record.ssh_port),
+            }),
+        })
+    }
+
+    /// Returns the current snapshot of a sandbox.
+    ///
+    /// A non-empty `sandbox_id` is looked up by id only; the name is used
+    /// only when the id is empty.
+    pub async fn get_sandbox(
+        &self,
+        sandbox_id: &str,
+        sandbox_name: &str,
+    ) -> Result<Option<Sandbox>, Status> {
+        let registry = self.registry.lock().await;
+        let sandbox = if !sandbox_id.is_empty() {
+            registry
+                .get(sandbox_id)
+                .map(|record| record.snapshot.clone())
+        } else {
+            registry
+                .values()
+                .find(|record| record.snapshot.name == sandbox_name)
+                .map(|record| record.snapshot.clone())
+        };
+        Ok(sandbox)
+    }
+
+    /// Returns snapshots of all registered sandboxes, sorted by name.
+    pub async fn current_snapshots(&self) -> Vec<Sandbox> {
+        let registry = self.registry.lock().await;
+        let mut snapshots = registry
+            .values()
+            .map(|record| record.snapshot.clone())
+            .collect::<Vec<_>>();
+        snapshots.sort_by(|left, right| left.name.cmp(&right.name));
+        snapshots
+    }
+
+    /// Polls one sandbox every 250 ms until it is deleted, its process
+    /// exits, or polling fails.
+    ///
+    /// Publishes the Ready condition once, when the forwarded SSH port
+    /// accepts connections and the console log reports the SSH server up.
+    async fn monitor_sandbox(&self, sandbox_id: String) {
+        let mut ready_emitted = false;
+
+        loop {
+            let (process, ssh_port, state_dir) = {
+                let registry = self.registry.lock().await;
+                let Some(record) = registry.get(&sandbox_id) else {
+                    return;
+                };
+                (
+                    record.process.clone(),
+                    record.ssh_port,
+                    record.state_dir.clone(),
+                )
+            };
+
+            let exit_status = {
+                let mut process = process.lock().await;
+                if process.deleting {
+                    return;
+                }
+                match process.child.try_wait() {
+                    Ok(status) => status,
+                    Err(err) => {
+                        if let Some(snapshot) = self
+                            .set_snapshot_condition(
+                                &sandbox_id,
+                                error_condition("ProcessPollFailed", &err.to_string()),
+                                false,
+                            )
+                            .await
+                        {
+                            self.publish_snapshot(snapshot);
+                        }
+                        self.publish_platform_event(
+                            sandbox_id.clone(),
+                            platform_event(
+                                "vm",
+                                "Warning",
+                                "ProcessPollFailed",
+                                format!("Failed to poll VM helper process: {err}"),
+                            ),
+                        );
+                        return;
+                    }
+                }
+            };
+
+            if let Some(status) = exit_status {
+                let message = match status.code() {
+                    Some(code) => format!("VM process exited with status {code}"),
+                    None => "VM process exited".to_string(),
+                };
+                if let Some(snapshot) = self
+                    .set_snapshot_condition(
+                        &sandbox_id,
+                        error_condition("ProcessExited", &message),
+                        false,
+                    )
+                    .await
+                {
+                    self.publish_snapshot(snapshot);
+                }
+                self.publish_platform_event(
+                    sandbox_id.clone(),
+                    platform_event("vm", "Warning", "ProcessExited", message),
+                );
+                return;
+            }
+
+            if !ready_emitted && port_is_ready(ssh_port).await && guest_ssh_ready(&state_dir).await
+            {
+                if let Some(snapshot) = self
+                    .set_snapshot_condition(&sandbox_id, ready_condition(), false)
+                    .await
+                {
+                    self.publish_snapshot(snapshot);
+                }
+                ready_emitted = true;
+            }
+
+            tokio::time::sleep(Duration::from_millis(250)).await;
+        }
+    }
+
+    /// Replaces the sandbox's status with a single `condition` and returns
+    /// the updated snapshot, or `None` when the sandbox is not registered.
+    async fn set_snapshot_condition(
+        &self,
+        sandbox_id: &str,
+        condition: SandboxCondition,
+        deleting: bool,
+    ) -> Option<Sandbox> {
+        let mut registry = self.registry.lock().await;
+        let record = registry.get_mut(sandbox_id)?;
+        record.snapshot.status = Some(status_with_condition(&record.snapshot, condition, deleting));
+        Some(record.snapshot.clone())
+    }
+
+    /// Broadcasts a sandbox snapshot; a send error (no subscribers) is ignored.
+    fn publish_snapshot(&self, sandbox: Sandbox) {
+        let _ = self.events.send(WatchSandboxesEvent {
+            payload: Some(watch_sandboxes_event::Payload::Sandbox(
+                WatchSandboxesSandboxEvent {
+                    sandbox: Some(sandbox),
+                },
+            )),
+        });
+    }
+
+    /// Broadcasts a deletion event; a send error (no subscribers) is ignored.
+    fn publish_deleted(&self, sandbox_id: String) {
+        let _ = self.events.send(WatchSandboxesEvent {
+            payload: Some(watch_sandboxes_event::Payload::Deleted(
+                WatchSandboxesDeletedEvent { sandbox_id },
+            )),
+        });
+    }
+
+    /// Broadcasts a platform event; a send error (no subscribers) is ignored.
+    fn publish_platform_event(&self, sandbox_id: String, event: PlatformEvent) {
+        let _ = self.events.send(WatchSandboxesEvent {
+            payload: Some(watch_sandboxes_event::Payload::PlatformEvent(
+                WatchSandboxesPlatformEvent {
+                    sandbox_id,
+                    event: Some(event),
+                },
+            )),
+        });
+    }
+}
+
+#[tonic::async_trait]
+impl ComputeDriver for VmDriver {
+    async fn get_capabilities(
+        &self,
+        _request: Request<GetCapabilitiesRequest>,
+    ) -> Result<Response<GetCapabilitiesResponse>, Status> {
+        Ok(Response::new(self.capabilities()))
+    }
+
+    async fn validate_sandbox_create(
+        &self,
+        request: Request<ValidateSandboxCreateRequest>,
+    ) -> Result<Response<ValidateSandboxCreateResponse>, Status> {
+        let sandbox = request
+            .into_inner()
+            .sandbox
+            .ok_or_else(|| Status::invalid_argument("sandbox is required"))?;
+        self.validate_sandbox(&sandbox).await?;
+        Ok(Response::new(ValidateSandboxCreateResponse {}))
+    }
+
+    async fn create_sandbox(
+        &self,
+        request: Request<CreateSandboxRequest>,
+    ) -> Result<Response<CreateSandboxResponse>, Status> {
+        let sandbox = request
+            .into_inner()
+            .sandbox
+            .ok_or_else(|| Status::invalid_argument("sandbox is required"))?;
+        let response = self.create_sandbox(&sandbox).await?;
+        Ok(Response::new(response))
+    }
+
+    async fn get_sandbox(
+        &self,
+        request: Request<GetSandboxRequest>,
+    ) -> Result<Response<GetSandboxResponse>, Status> {
+        let request = request.into_inner();
+        if request.sandbox_id.is_empty() && request.sandbox_name.is_empty() {
+            return Err(Status::invalid_argument(
+                "sandbox_id or sandbox_name is required",
+            ));
+        }
+
+        let sandbox = self
+            .get_sandbox(&request.sandbox_id, &request.sandbox_name)
+            .await?
+            .ok_or_else(|| Status::not_found("sandbox not found"))?;
+
+        if !request.sandbox_id.is_empty() && request.sandbox_id != sandbox.id {
+            return Err(Status::failed_precondition(
+                "sandbox_id did not match the fetched sandbox",
+            ));
+        }
+
+        Ok(Response::new(GetSandboxResponse {
+            sandbox: Some(sandbox),
+        }))
+    }
+
+    async fn list_sandboxes(
+        &self,
+        _request: Request<ListSandboxesRequest>,
+    ) -> Result<Response<ListSandboxesResponse>, Status> {
+        Ok(Response::new(ListSandboxesResponse {
+            sandboxes: self.current_snapshots().await,
+        }))
+    }
+
+    async fn stop_sandbox(
+        &self,
+        _request: Request<StopSandboxRequest>,
+    ) -> Result<Response<StopSandboxResponse>, Status> {
+        Err(Status::unimplemented(
+            "stop sandbox is not implemented by the vm compute driver",
+        ))
+    }
+
+    async fn delete_sandbox(
+        &self,
+        request: Request<DeleteSandboxRequest>,
+    ) -> Result<Response<DeleteSandboxResponse>, Status> {
+        let request = request.into_inner();
+        let response = self
+            .delete_sandbox(&request.sandbox_id, &request.sandbox_name)
+            .await?;
+        Ok(Response::new(response))
+    }
+
+    async fn resolve_sandbox_endpoint(
+        &self,
+        request: Request<ResolveSandboxEndpointRequest>,
+    ) -> Result<Response<ResolveSandboxEndpointResponse>, Status> {
+        let sandbox = request
+            .into_inner()
+            .sandbox
+            .ok_or_else(|| Status::invalid_argument("sandbox is required"))?;
+        Ok(Response::new(self.resolve_endpoint(&sandbox).await?))
+    }
+
+    type WatchSandboxesStream =
+        Pin<Box<dyn Stream<Item = Result<WatchSandboxesEvent, Status>> + Send + 'static>>;
+
+    /// Streams the current sandbox snapshots followed by live events.
+    ///
+    /// The broadcast subscription is taken before the initial snapshot is
+    /// read, so an event published in between is delivered (possibly as a
+    /// duplicate of the snapshot) instead of being lost. When the watcher
+    /// falls behind the broadcast buffer, it resynchronises from the
+    /// registry: a deletion is sent for every sandbox it had announced that
+    /// no longer exists, then a fresh snapshot for every live sandbox.
+    async fn watch_sandboxes(
+        &self,
+        _request: Request<WatchSandboxesRequest>,
+    ) -> Result<Response<Self::WatchSandboxesStream>, Status> {
+        fn snapshot_event(sandbox: Sandbox) -> WatchSandboxesEvent {
+            WatchSandboxesEvent {
+                payload: Some(watch_sandboxes_event::Payload::Sandbox(
+                    WatchSandboxesSandboxEvent {
+                        sandbox: Some(sandbox),
+                    },
+                )),
+            }
+        }
+
+        let mut rx = self.events.subscribe();
+        let initial = self.current_snapshots().await;
+        // Share only the registry with the task. Cloning the driver would
+        // also clone the broadcast sender and keep the channel open, so
+        // `Closed` would never be observed.
+        let registry = Arc::clone(&self.registry);
+        let (tx, out_rx) = mpsc::channel(WATCH_BUFFER);
+        tokio::spawn(async move {
+            // Ids this watcher has announced and not yet seen deleted.
+            let mut known = HashSet::new();
+            for sandbox in initial {
+                known.insert(sandbox.id.clone());
+                if tx.send(Ok(snapshot_event(sandbox))).await.is_err() {
+                    return;
+                }
+            }
+
+            loop {
+                match rx.recv().await {
+                    Ok(event) => {
+                        match &event.payload {
+                            Some(watch_sandboxes_event::Payload::Sandbox(sandbox_event)) => {
+                                if let Some(sandbox) = &sandbox_event.sandbox {
+                                    known.insert(sandbox.id.clone());
+                                }
+                            }
+                            Some(watch_sandboxes_event::Payload::Deleted(deleted)) => {
+                                known.remove(&deleted.sandbox_id);
+                            }
+                            _ => {}
+                        }
+                        if tx.send(Ok(event)).await.is_err() {
+                            return;
+                        }
+                    }
+                    Err(broadcast::error::RecvError::Lagged(_)) => {
+                        // Skip the stale backlog first, then read the
+                        // registry, so nothing published after this point is
+                        // missed.
+                        rx = rx.resubscribe();
+                        let current = {
+                            let registry = registry.lock().await;
+                            registry
+                                .values()
+                                .map(|record| record.snapshot.clone())
+                                .collect::<Vec<_>>()
+                        };
+                        let live = current
+                            .iter()
+                            .map(|sandbox| sandbox.id.clone())
+                            .collect::<HashSet<_>>();
+                        let removed = known.difference(&live).cloned().collect::<Vec<_>>();
+                        for sandbox_id in removed {
+                            known.remove(&sandbox_id);
+                            let event = WatchSandboxesEvent {
+                                payload: Some(watch_sandboxes_event::Payload::Deleted(
+                                    WatchSandboxesDeletedEvent { sandbox_id },
+                                )),
+                            };
+                            if tx.send(Ok(event)).await.is_err() {
+                                return;
+                            }
+                        }
+                        for sandbox in current {
+                            known.insert(sandbox.id.clone());
+                            if tx.send(Ok(snapshot_event(sandbox))).await.is_err() {
+                                return;
+                            }
+                        }
+                    }
+                    Err(broadcast::error::RecvError::Closed) => return,
+                }
+            }
+        });
+
+        Ok(Response::new(Box::pin(ReceiverStream::new(out_rx))))
+    }
+}
+
+/// Rejects sandbox specs that use features the VM driver does not support:
+/// GPUs, and the template fields `image`, `agent_socket_path`,
+/// `platform_config`, and `resources`.
+fn validate_vm_sandbox(sandbox: &Sandbox) -> Result<(), Status> {
+    let spec = sandbox
+        .spec
+        .as_ref()
+        .ok_or_else(|| Status::invalid_argument("sandbox spec is required"))?;
+    if spec.gpu {
+        return Err(Status::failed_precondition(
+            "vm sandboxes do not support gpu=true",
+        ));
+    }
+    if let Some(template) = spec.template.as_ref() {
+        if !template.image.is_empty() {
+            return Err(Status::failed_precondition(
+                "vm sandboxes do not support template.image",
+            ));
+        }
+        if !template.agent_socket_path.is_empty() {
+            return Err(Status::failed_precondition(
+                "vm sandboxes do not support template.agent_socket_path",
+            ));
+        }
+        if template.platform_config.is_some() {
+            return Err(Status::failed_precondition(
+                "vm sandboxes do not support template.platform_config",
)); + } + if template.resources.is_some() { + return Err(Status::failed_precondition( + "vm sandboxes do not support template.resources", + )); + } + } + Ok(()) +} + +fn merged_environment(sandbox: &Sandbox) -> HashMap { + let mut environment = sandbox + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .map_or_else(HashMap::new, |template| template.environment.clone()); + if let Some(spec) = sandbox.spec.as_ref() { + environment.extend(spec.environment.clone()); + } + environment +} + +fn guest_visible_openshell_endpoint(endpoint: &str) -> String { + let Ok(mut url) = Url::parse(endpoint) else { + return endpoint.to_string(); + }; + + let should_rewrite = match url.host() { + Some(Host::Ipv4(ip)) => ip.is_loopback(), + Some(Host::Ipv6(ip)) => ip.is_loopback(), + Some(Host::Domain(host)) => host.eq_ignore_ascii_case("localhost"), + None => false, + }; + + if should_rewrite && url.set_host(Some("192.168.127.1")).is_ok() { + return url.to_string(); + } + + endpoint.to_string() +} + +fn build_guest_environment(sandbox: &Sandbox, config: &VmDriverConfig) -> Vec { + let mut environment = HashMap::from([ + ("HOME".to_string(), "/root".to_string()), + ( + "PATH".to_string(), + "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + ), + ("TERM".to_string(), "xterm".to_string()), + ( + "OPENSHELL_ENDPOINT".to_string(), + guest_visible_openshell_endpoint(&config.openshell_endpoint), + ), + ("OPENSHELL_SANDBOX_ID".to_string(), sandbox.id.clone()), + ("OPENSHELL_SANDBOX".to_string(), sandbox.name.clone()), + ( + "OPENSHELL_SSH_LISTEN_ADDR".to_string(), + format!("0.0.0.0:{GUEST_SSH_PORT}"), + ), + ( + "OPENSHELL_SSH_HANDSHAKE_SECRET".to_string(), + config.ssh_handshake_secret.clone(), + ), + ( + "OPENSHELL_SSH_HANDSHAKE_SKEW_SECS".to_string(), + config.ssh_handshake_skew_secs.to_string(), + ), + ( + "OPENSHELL_SANDBOX_COMMAND".to_string(), + "tail -f /dev/null".to_string(), + ), + ( + "OPENSHELL_LOG_LEVEL".to_string(), + 
sandbox_log_level(sandbox, &config.log_level), + ), + ]); + if config.requires_tls_materials() { + environment.extend(HashMap::from([ + ( + "OPENSHELL_TLS_CA".to_string(), + GUEST_TLS_CA_PATH.to_string(), + ), + ( + "OPENSHELL_TLS_CERT".to_string(), + GUEST_TLS_CERT_PATH.to_string(), + ), + ( + "OPENSHELL_TLS_KEY".to_string(), + GUEST_TLS_KEY_PATH.to_string(), + ), + ])); + } + environment.extend(merged_environment(sandbox)); + + let mut pairs = environment.into_iter().collect::>(); + pairs.sort_by(|left, right| left.0.cmp(&right.0)); + pairs + .into_iter() + .map(|(key, value)| format!("{key}={value}")) + .collect() +} + +fn sandbox_log_level(sandbox: &Sandbox, default_level: &str) -> String { + sandbox + .spec + .as_ref() + .map(|spec| spec.log_level.as_str()) + .filter(|level| !level.is_empty()) + .unwrap_or(default_level) + .to_string() +} + +fn sandbox_state_dir(root: &Path, sandbox_id: &str) -> PathBuf { + root.join("sandboxes").join(sandbox_id) +} + +async fn prepare_guest_tls_materials( + rootfs: &Path, + paths: &VmDriverTlsPaths, +) -> Result<(), std::io::Error> { + let guest_tls_dir = rootfs.join(GUEST_TLS_DIR.trim_start_matches('/')); + tokio::fs::create_dir_all(&guest_tls_dir).await?; + + copy_guest_tls_material(&paths.ca, &guest_tls_dir.join("ca.crt"), 0o644).await?; + copy_guest_tls_material(&paths.cert, &guest_tls_dir.join("tls.crt"), 0o644).await?; + copy_guest_tls_material(&paths.key, &guest_tls_dir.join("tls.key"), 0o600).await?; + Ok(()) +} + +async fn copy_guest_tls_material( + source: &Path, + dest: &Path, + mode: u32, +) -> Result<(), std::io::Error> { + tokio::fs::copy(source, dest).await?; + tokio::fs::set_permissions(dest, std::fs::Permissions::from_mode(mode)).await?; + Ok(()) +} + +async fn terminate_vm_process(child: &mut Child) -> Result<(), std::io::Error> { + if let Some(pid) = child.id() + && let Err(err) = kill(Pid::from_raw(pid as i32), Signal::SIGTERM) + && err != Errno::ESRCH + { + return Err(std::io::Error::other(format!( + 
"send SIGTERM to vm process {pid}: {err}" + ))); + } + + match tokio::time::timeout(Duration::from_secs(5), child.wait()).await { + Ok(Ok(_)) => Ok(()), + Ok(Err(err)) => Err(err), + Err(_) => { + child.kill().await?; + child.wait().await.map(|_| ()) + } + } +} + +fn allocate_local_port() -> Result { + let listener = TcpListener::bind((Ipv4Addr::LOCALHOST, 0)) + .map_err(|err| Status::internal(format!("failed to allocate local ssh port: {err}")))?; + listener + .local_addr() + .map(|addr| addr.port()) + .map_err(|err| Status::internal(format!("failed to inspect local ssh port: {err}"))) +} + +async fn port_is_ready(port: u16) -> bool { + TcpStream::connect(SocketAddr::new(Ipv4Addr::LOCALHOST.into(), port)) + .await + .is_ok() +} + +async fn guest_ssh_ready(state_dir: &Path) -> bool { + let console_log = state_dir.join("rootfs-console.log"); + let Ok(contents) = tokio::fs::read_to_string(console_log).await else { + return false; + }; + + contents.contains("SSH server is ready to accept connections") + || contents.contains("SSH server listening") +} + +fn sandbox_snapshot(sandbox: &Sandbox, condition: SandboxCondition, deleting: bool) -> Sandbox { + Sandbox { + id: sandbox.id.clone(), + name: sandbox.name.clone(), + namespace: sandbox.namespace.clone(), + status: Some(SandboxStatus { + sandbox_name: sandbox.name.clone(), + instance_id: String::new(), + agent_fd: String::new(), + sandbox_fd: String::new(), + conditions: vec![condition], + deleting, + }), + ..Default::default() + } +} + +fn status_with_condition( + snapshot: &Sandbox, + condition: SandboxCondition, + deleting: bool, +) -> SandboxStatus { + SandboxStatus { + sandbox_name: snapshot.name.clone(), + instance_id: String::new(), + agent_fd: String::new(), + sandbox_fd: String::new(), + conditions: vec![condition], + deleting, + } +} + +fn provisioning_condition() -> SandboxCondition { + SandboxCondition { + r#type: "Ready".to_string(), + status: "False".to_string(), + reason: "Starting".to_string(), + 
message: "VM is starting".to_string(), + last_transition_time: String::new(), + } +} + +fn ready_condition() -> SandboxCondition { + SandboxCondition { + r#type: "Ready".to_string(), + status: "True".to_string(), + reason: "Listening".to_string(), + message: "Supervisor is listening for SSH connections".to_string(), + last_transition_time: String::new(), + } +} + +fn deleting_condition() -> SandboxCondition { + SandboxCondition { + r#type: "Ready".to_string(), + status: "False".to_string(), + reason: "Deleting".to_string(), + message: "Sandbox is being deleted".to_string(), + last_transition_time: String::new(), + } +} + +fn error_condition(reason: &str, message: &str) -> SandboxCondition { + SandboxCondition { + r#type: "Ready".to_string(), + status: "False".to_string(), + reason: reason.to_string(), + message: message.to_string(), + last_transition_time: String::new(), + } +} + +fn platform_event(source: &str, event_type: &str, reason: &str, message: String) -> PlatformEvent { + PlatformEvent { + timestamp_ms: current_time_ms(), + source: source.to_string(), + r#type: event_type.to_string(), + reason: reason.to_string(), + message, + metadata: HashMap::new(), + } +} + +fn current_time_ms() -> i64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_or(0, |duration| duration.as_millis() as i64) +} + +#[cfg(test)] +mod tests { + use super::*; + use openshell_core::proto::compute::v1::{ + DriverSandboxSpec as SandboxSpec, DriverSandboxTemplate as SandboxTemplate, + }; + use prost_types::{Struct, Value, value::Kind}; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::time::{SystemTime, UNIX_EPOCH}; + use tonic::Code; + + #[test] + fn validate_vm_sandbox_rejects_gpu() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + gpu: true, + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox).expect_err("gpu should be rejected"); + assert_eq!(err.code(), Code::FailedPrecondition); + 
assert!(err.message().contains("gpu")); + } + + #[test] + fn validate_vm_sandbox_rejects_platform_config() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate { + platform_config: Some(Struct { + fields: [( + "runtime_class_name".to_string(), + Value { + kind: Some(Kind::StringValue("kata".to_string())), + }, + )] + .into_iter() + .collect(), + }), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox).expect_err("platform config should be rejected"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("platform_config")); + } + + #[test] + fn merged_environment_prefers_spec_values() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + environment: HashMap::from([("A".to_string(), "spec".to_string())]), + template: Some(SandboxTemplate { + environment: HashMap::from([ + ("A".to_string(), "template".to_string()), + ("B".to_string(), "template".to_string()), + ]), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + let merged = merged_environment(&sandbox); + assert_eq!(merged.get("A"), Some(&"spec".to_string())); + assert_eq!(merged.get("B"), Some(&"template".to_string())); + } + + #[test] + fn build_guest_environment_sets_supervisor_defaults() { + let config = VmDriverConfig { + openshell_endpoint: "http://127.0.0.1:8080".to_string(), + ssh_handshake_secret: "secret".to_string(), + ..Default::default() + }; + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + name: "sandbox-123".to_string(), + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let env = build_guest_environment(&sandbox, &config); + assert!(env.contains(&"HOME=/root".to_string())); + assert!(env.contains(&"OPENSHELL_ENDPOINT=http://192.168.127.1:8080/".to_string())); + assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-123".to_string())); + assert!(env.contains(&format!( + 
"OPENSHELL_SSH_LISTEN_ADDR=0.0.0.0:{GUEST_SSH_PORT}" + ))); + } + + #[test] + fn guest_visible_openshell_endpoint_preserves_non_loopback_hosts() { + assert_eq!( + guest_visible_openshell_endpoint("http://host.containers.internal:8080"), + "http://host.containers.internal:8080" + ); + assert_eq!( + guest_visible_openshell_endpoint("https://gateway.internal:8443"), + "https://gateway.internal:8443" + ); + } + + #[test] + fn build_guest_environment_includes_tls_paths_for_https_endpoint() { + let config = VmDriverConfig { + openshell_endpoint: "https://127.0.0.1:8443".to_string(), + ssh_handshake_secret: "secret".to_string(), + guest_tls_ca: Some(PathBuf::from("/host/ca.crt")), + guest_tls_cert: Some(PathBuf::from("/host/tls.crt")), + guest_tls_key: Some(PathBuf::from("/host/tls.key")), + ..Default::default() + }; + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + name: "sandbox-123".to_string(), + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let env = build_guest_environment(&sandbox, &config); + assert!(env.contains(&format!("OPENSHELL_TLS_CA={GUEST_TLS_CA_PATH}"))); + assert!(env.contains(&format!("OPENSHELL_TLS_CERT={GUEST_TLS_CERT_PATH}"))); + assert!(env.contains(&format!("OPENSHELL_TLS_KEY={GUEST_TLS_KEY_PATH}"))); + } + + #[test] + fn vm_driver_config_requires_tls_materials_for_https_endpoint() { + let config = VmDriverConfig { + openshell_endpoint: "https://127.0.0.1:8443".to_string(), + ..Default::default() + }; + let err = config + .tls_paths() + .expect_err("https endpoint should require TLS materials"); + assert!(err.contains("OPENSHELL_VM_TLS_CA")); + } + + #[tokio::test] + async fn delete_sandbox_keeps_registry_entry_when_cleanup_fails() { + let (events, _) = broadcast::channel(WATCH_BUFFER); + let driver = VmDriver { + config: VmDriverConfig::default(), + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events, + }; + + let base = unique_temp_dir(); + 
std::fs::create_dir_all(&base).unwrap(); + let state_file = base.join("state-file"); + std::fs::write(&state_file, "not a directory").unwrap(); + + insert_test_record( + &driver, + "sandbox-123", + state_file.clone(), + spawn_exited_child(), + ) + .await; + + let err = driver + .delete_sandbox("sandbox-123", "sandbox-123") + .await + .expect_err("state dir cleanup should fail for a file path"); + assert!(err.message().contains("failed to remove state dir")); + assert!(driver.registry.lock().await.contains_key("sandbox-123")); + + let retry_state_dir = base.join("state-dir"); + std::fs::create_dir_all(&retry_state_dir).unwrap(); + { + let mut registry = driver.registry.lock().await; + let record = registry.get_mut("sandbox-123").unwrap(); + record.state_dir = retry_state_dir; + record.process = Arc::new(Mutex::new(VmProcess { + child: spawn_exited_child(), + deleting: false, + })); + } + + let response = driver + .delete_sandbox("sandbox-123", "sandbox-123") + .await + .expect("delete retry should succeed once cleanup works"); + assert!(response.deleted); + assert!(!driver.registry.lock().await.contains_key("sandbox-123")); + + let _ = std::fs::remove_dir_all(base); + } + + #[test] + fn validate_openshell_endpoint_accepts_loopback_hosts() { + validate_openshell_endpoint("http://127.0.0.1:8080") + .expect("ipv4 loopback should be allowed for TSI"); + validate_openshell_endpoint("http://localhost:8080") + .expect("localhost should be allowed for TSI"); + validate_openshell_endpoint("http://[::1]:8080") + .expect("ipv6 loopback should be allowed for TSI"); + } + + #[test] + fn validate_openshell_endpoint_rejects_unspecified_hosts() { + let err = validate_openshell_endpoint("http://0.0.0.0:8080") + .expect_err("unspecified endpoint should fail"); + assert!(err.contains("not reachable from sandbox VMs")); + } + + #[test] + fn validate_openshell_endpoint_accepts_host_gateway() { + validate_openshell_endpoint("http://host.containers.internal:8080") + 
.expect("guest-reachable host alias should be accepted"); + validate_openshell_endpoint("http://192.168.127.1:8080") + .expect("gateway IP should be accepted"); + validate_openshell_endpoint("http://host.openshell.internal:8080") + .expect("openshell host alias should be accepted"); + validate_openshell_endpoint("https://gateway.internal:8443") + .expect("dns endpoint should be accepted"); + } + + #[tokio::test] + async fn prepare_guest_tls_materials_copies_bundle_into_rootfs() { + let base = unique_temp_dir(); + let source_dir = base.join("source"); + let rootfs = base.join("rootfs"); + std::fs::create_dir_all(&source_dir).unwrap(); + std::fs::create_dir_all(&rootfs).unwrap(); + + let ca = source_dir.join("ca.crt"); + let cert = source_dir.join("tls.crt"); + let key = source_dir.join("tls.key"); + std::fs::write(&ca, "ca").unwrap(); + std::fs::write(&cert, "cert").unwrap(); + std::fs::write(&key, "key").unwrap(); + + prepare_guest_tls_materials( + &rootfs, + &VmDriverTlsPaths { + ca: ca.clone(), + cert: cert.clone(), + key: key.clone(), + }, + ) + .await + .unwrap(); + + let guest_dir = rootfs.join(GUEST_TLS_DIR.trim_start_matches('/')); + assert_eq!( + std::fs::read_to_string(guest_dir.join("ca.crt")).unwrap(), + "ca" + ); + assert_eq!( + std::fs::read_to_string(guest_dir.join("tls.crt")).unwrap(), + "cert" + ); + assert_eq!( + std::fs::read_to_string(guest_dir.join("tls.key")).unwrap(), + "key" + ); + let key_mode = std::fs::metadata(guest_dir.join("tls.key")) + .unwrap() + .permissions() + .mode() + & 0o777; + assert_eq!(key_mode, 0o600); + + let _ = std::fs::remove_dir_all(base); + } + + #[tokio::test] + async fn guest_ssh_ready_detects_guest_console_marker() { + let base = unique_temp_dir(); + std::fs::create_dir_all(&base).unwrap(); + std::fs::write( + base.join("rootfs-console.log"), + "...\nINFO openshell_sandbox: SSH server is ready to accept connections\n", + ) + .unwrap(); + + assert!(guest_ssh_ready(&base).await); + + let _ = 
std::fs::remove_dir_all(base); + } + + #[tokio::test] + async fn guest_ssh_ready_is_false_without_marker() { + let base = unique_temp_dir(); + std::fs::create_dir_all(&base).unwrap(); + std::fs::write(base.join("rootfs-console.log"), "sandbox booting\n").unwrap(); + + assert!(!guest_ssh_ready(&base).await); + + let _ = std::fs::remove_dir_all(base); + } + + fn unique_temp_dir() -> PathBuf { + static COUNTER: AtomicU64 = AtomicU64::new(0); + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + let suffix = COUNTER.fetch_add(1, Ordering::Relaxed); + std::env::temp_dir().join(format!( + "openshell-vm-driver-test-{}-{nanos}-{suffix}", + std::process::id() + )) + } + + fn spawn_exited_child() -> Child { + Command::new("sh") + .arg("-c") + .arg("exit 0") + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .unwrap() + } + + async fn insert_test_record( + driver: &VmDriver, + sandbox_id: &str, + state_dir: PathBuf, + child: Child, + ) { + let sandbox = Sandbox { + id: sandbox_id.to_string(), + name: sandbox_id.to_string(), + ..Default::default() + }; + let process = Arc::new(Mutex::new(VmProcess { + child, + deleting: false, + })); + + let mut registry = driver.registry.lock().await; + registry.insert( + sandbox_id.to_string(), + SandboxRecord { + snapshot: sandbox, + ssh_port: 2222, + state_dir, + process, + }, + ); + } +} diff --git a/crates/openshell-driver-vm/src/embedded_runtime.rs b/crates/openshell-driver-vm/src/embedded_runtime.rs new file mode 100644 index 000000000..63f83b874 --- /dev/null +++ b/crates/openshell-driver-vm/src/embedded_runtime.rs @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Embedded libkrun runtime resources for the VM driver. 
+
+use std::fs;
+use std::path::{Path, PathBuf};
+
+/// Zstd-compressed runtime artifacts embedded at build time (macOS, aarch64).
+#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+mod resources {
+    pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.dylib.zst"));
+    pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.5.dylib.zst"));
+    pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst"));
+    pub const LIBKRUN_NAME: &str = "libkrun.dylib";
+    pub const LIBKRUNFW_NAME: &str = "libkrunfw.5.dylib";
+}
+
+/// Zstd-compressed runtime artifacts embedded at build time (Linux).
+///
+/// aarch64 and x86_64 use the same file names, so one module serves both.
+#[cfg(all(
+    target_os = "linux",
+    any(target_arch = "aarch64", target_arch = "x86_64"),
+))]
+mod resources {
+    pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.so.zst"));
+    pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst"));
+    pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst"));
+    pub const LIBKRUN_NAME: &str = "libkrun.so";
+    pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5";
+}
+
+/// Empty placeholders for platforms without an embedded runtime.
+///
+/// `ensure_runtime_extracted` detects the empty `LIBKRUN` and returns an error.
+#[cfg(not(any(
+    all(target_os = "macos", target_arch = "aarch64"),
+    all(target_os = "linux", target_arch = "aarch64"),
+    all(target_os = "linux", target_arch = "x86_64"),
+)))]
+mod resources {
+    pub const LIBKRUN: &[u8] = &[];
+    pub const LIBKRUNFW: &[u8] = &[];
+    pub const GVPROXY: &[u8] = &[];
+    pub const LIBKRUN_NAME: &str = "libkrun";
+    pub const LIBKRUNFW_NAME: &str = "libkrunfw";
+}
+
+const VERSION: &str = env!("CARGO_PKG_VERSION");
+
+/// Extracts the embedded runtime into the per-version cache directory.
+///
+/// Returns the cache directory. An existing cache is reused when its `.version`
+/// marker equals the current cache key and `validate_runtime_dir` accepts it;
+/// otherwise the directory is removed and rebuilt from the embedded resources.
+///
+/// # Errors
+///
+/// Returns a message when no runtime is embedded for this platform, or when a
+/// filesystem or decompression step fails.
+pub fn ensure_runtime_extracted() -> Result<PathBuf, String> {
+    if resources::LIBKRUN.is_empty() {
+        return Err(
+            "VM runtime not embedded for this platform. Supported: macOS ARM64, Linux ARM64, Linux x86_64"
+                .to_string(),
+        );
+    }
+
+    let cache_dir = runtime_cache_dir()?;
+    let version_marker = cache_dir.join(".version");
+    let cache_key = runtime_cache_key();
+
+    if version_marker.exists()
+        && let Ok(cached_key) = fs::read_to_string(&version_marker)
+        && cached_key.trim() == cache_key
+        && validate_runtime_dir(&cache_dir).is_ok()
+    {
+        return Ok(cache_dir);
+    }
+
+    cleanup_old_versions(&cache_dir)?;
+
+    if cache_dir.exists() {
+        fs::remove_dir_all(&cache_dir)
+            .map_err(|e| format!("remove old runtime cache {}: {e}", cache_dir.display()))?;
+    }
+    fs::create_dir_all(&cache_dir)
+        .map_err(|e| format!("create runtime cache {}: {e}", cache_dir.display()))?;
+
+    extract_resource(resources::LIBKRUN, &cache_dir.join(resources::LIBKRUN_NAME))?;
+    extract_resource(
+        resources::LIBKRUNFW,
+        &cache_dir.join(resources::LIBKRUNFW_NAME),
+    )?;
+    extract_resource(resources::GVPROXY, &cache_dir.join("gvproxy"))?;
+
+    #[cfg(target_os = "macos")]
+    {
+        // Provide an unversioned name next to the versioned firmware library.
+        let unversioned = cache_dir.join("libkrunfw.dylib");
+        if !unversioned.exists() {
+            std::os::unix::fs::symlink(resources::LIBKRUNFW_NAME, &unversioned)
+                .map_err(|e| format!("symlink {}: {e}", unversioned.display()))?;
+        }
+    }
+
+    #[cfg(unix)]
+    {
+        use std::os::unix::fs::PermissionsExt as _;
+        fs::set_permissions(cache_dir.join("gvproxy"), fs::Permissions::from_mode(0o755))
+            .map_err(|e| format!("chmod gvproxy: {e}"))?;
+    }
+
+    // Write the marker last: it is what declares the cache complete, so it must
+    // not exist if any earlier step (including the chmod above) failed.
+    fs::write(&version_marker, cache_key)
+        .map_err(|e| format!("write runtime marker {}: {e}", version_marker.display()))?;
+
+    Ok(cache_dir)
+}
+
+/// Checks that `dir` holds non-empty libkrun, libkrunfw and gvproxy files.
+///
+/// # Errors
+///
+/// Returns a message naming the first file that is missing or zero-length.
+pub fn validate_runtime_dir(dir: &Path) -> Result<(), String> {
+    let libkrun = dir.join(resources::LIBKRUN_NAME);
+    let libkrunfw = dir.join(resources::LIBKRUNFW_NAME);
+    let gvproxy = dir.join("gvproxy");
+
+    for path in [&libkrun, &libkrunfw, &gvproxy] {
+        if !path.is_file() {
+            return Err(format!("missing runtime file: {}", path.display()));
+        }
+        // A metadata failure is treated the same as an empty file.
+        let size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
+        if size == 0 {
+            return Err(format!("runtime file is empty (stub): {}", path.display()));
+        }
+    }
+
+    Ok(())
+}
+
+/// Builds the cache key `"{VERSION}-{fingerprint}"`.
+///
+/// The fingerprint folds the first 64 bytes and the length of each embedded
+/// resource into a `u64`. It is a cheap change detector, not a content hash.
+fn runtime_cache_key() -> String {
+    let mut fp: u64 = 0;
+    for (index, chunk) in [resources::LIBKRUN, resources::LIBKRUNFW, resources::GVPROXY]
+        .into_iter()
+        .enumerate()
+    {
+        let sample = &chunk[..chunk.len().min(64)];
+        let mut word: u64 = 0;
+        for (offset, byte) in sample.iter().enumerate() {
+            word ^= (*byte as u64) << ((offset % 8) * 8);
+        }
+        // Rotate by an index-dependent amount so the three resources do not
+        // cancel each other out when XORed together.
+        fp ^= word.rotate_left((index as u32) * 13 + 7);
+        fp ^= (chunk.len() as u64).rotate_left((index as u32) * 17 + 3);
+    }
+    format!("{VERSION}-{fp:016x}")
+}
+
+/// Cache directory for this crate version: `<base>/<VERSION>`.
+fn runtime_cache_dir() -> Result<PathBuf, String> {
+    Ok(runtime_cache_base()?.join(VERSION))
+}
+
+/// Parent directory of all versioned caches: `<xdg data dir>/openshell/vm-runtime`.
+fn runtime_cache_base() -> Result<PathBuf, String> {
+    let base =
+        openshell_core::paths::xdg_data_dir().map_err(|e| format!("resolve XDG data dir: {e}"))?;
+    Ok(base.join("openshell").join("vm-runtime"))
+}
+
+/// Removes every directory under the cache base except `current_dir`.
+///
+/// Removal failures are ignored; only failing to list the base is an error.
+fn cleanup_old_versions(current_dir: &Path) -> Result<(), String> {
+    let base = runtime_cache_base()?;
+    if !base.exists() {
+        return Ok(());
+    }
+
+    let entries = fs::read_dir(&base).map_err(|e| format!("read {}: {e}", base.display()))?;
+    for entry in entries.filter_map(Result::ok) {
+        let path = entry.path();
+        // `starts_with` is also true when the two paths are equal, so this one
+        // test spares both the current directory and any ancestor of it.
+        if path.is_dir() && !current_dir.starts_with(&path) {
+            let _ = fs::remove_dir_all(&path);
+        }
+    }
+    Ok(())
+}
+
+/// Decompresses one zstd-compressed embedded resource and writes it to `dest`.
+///
+/// # Errors
+///
+/// Returns a message when `compressed` is empty, or when decoding or writing fails.
+fn extract_resource(compressed: &[u8], dest: &Path) -> Result<(), String> {
+    if compressed.is_empty() {
+        return Err(format!("embedded resource is empty: {}", dest.display()));
+    }
+
+    let decompressed =
+        zstd::decode_all(compressed).map_err(|e| format!("decompress {}: {e}", dest.display()))?;
+    fs::write(dest, decompressed).map_err(|e| format!("write {}: {e}", dest.display()))
+}
diff --git a/crates/openshell-driver-vm/src/ffi.rs
b/crates/openshell-driver-vm/src/ffi.rs
new file mode 100644
index 000000000..750788ac1
--- /dev/null
+++ b/crates/openshell-driver-vm/src/ffi.rs
@@ -0,0 +1,206 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Minimal runtime-loaded bindings for the libkrun C API used by the VM driver.
+
+#![allow(unsafe_code)]
+
+use std::ffi::{CStr, CString};
+use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
+
+use libc::c_char;
+use libloading::Library;
+
+use crate::runtime::validate_runtime_dir;
+
+pub const KRUN_LOG_TARGET_DEFAULT: i32 = -1;
+pub const KRUN_LOG_LEVEL_OFF: u32 = 0;
+pub const KRUN_LOG_LEVEL_ERROR: u32 = 1;
+pub const KRUN_LOG_LEVEL_WARN: u32 = 2;
+pub const KRUN_LOG_LEVEL_INFO: u32 = 3;
+pub const KRUN_LOG_LEVEL_DEBUG: u32 = 4;
+pub const KRUN_LOG_LEVEL_TRACE: u32 = 5;
+pub const KRUN_LOG_STYLE_AUTO: u32 = 0;
+pub const KRUN_LOG_OPTION_NO_ENV: u32 = 1;
+
+// Function-pointer types for the libkrun symbols resolved in `LibKrun::load`.
+type KrunInitLog =
+    unsafe extern "C" fn(target_fd: i32, level: u32, style: u32, options: u32) -> i32;
+type KrunCreateCtx = unsafe extern "C" fn() -> i32;
+type KrunFreeCtx = unsafe extern "C" fn(ctx_id: u32) -> i32;
+type KrunSetVmConfig = unsafe extern "C" fn(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32;
+type KrunSetRoot = unsafe extern "C" fn(ctx_id: u32, root_path: *const c_char) -> i32;
+type KrunSetWorkdir = unsafe extern "C" fn(ctx_id: u32, workdir_path: *const c_char) -> i32;
+type KrunSetExec = unsafe extern "C" fn(
+    ctx_id: u32,
+    exec_path: *const c_char,
+    argv: *const *const c_char,
+    envp: *const *const c_char,
+) -> i32;
+type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32;
+type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32;
+type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32;
+type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32;
+type KrunAddVsock = unsafe extern "C" fn(ctx_id: u32, tsi_features: u32) -> i32;
+#[cfg(target_os = "macos")]
+type KrunAddNetUnixgram = unsafe extern "C" fn(
+    ctx_id: u32,
+    c_path: *const c_char,
+    fd: i32,
+    c_mac: *const u8,
+    features: u32,
+    flags: u32,
+) -> i32;
+type KrunAddNetUnixstream = unsafe extern "C" fn(
+    ctx_id: u32,
+    c_path: *const c_char,
+    fd: i32,
+    c_mac: *const u8,
+    features: u32,
+    flags: u32,
+) -> i32;
+
+/// Table of libkrun entry points resolved from the dynamically loaded library.
+pub struct LibKrun {
+    pub krun_init_log: KrunInitLog,
+    pub krun_create_ctx: KrunCreateCtx,
+    pub krun_free_ctx: KrunFreeCtx,
+    pub krun_set_vm_config: KrunSetVmConfig,
+    pub krun_set_root: KrunSetRoot,
+    pub krun_set_workdir: KrunSetWorkdir,
+    pub krun_set_exec: KrunSetExec,
+    pub krun_set_port_map: KrunSetPortMap,
+    pub krun_set_console_output: KrunSetConsoleOutput,
+    pub krun_start_enter: KrunStartEnter,
+    pub krun_disable_implicit_vsock: KrunDisableImplicitVsock,
+    pub krun_add_vsock: KrunAddVsock,
+    #[cfg(target_os = "macos")]
+    pub krun_add_net_unixgram: KrunAddNetUnixgram,
+    #[allow(dead_code)] // Used on Linux when gvproxy runs in qemu/unixstream mode.
+    pub krun_add_net_unixstream: KrunAddNetUnixstream,
+}
+
+/// Process-wide bindings, filled in by the first successful `libkrun` call.
+static LIBKRUN: OnceLock<LibKrun> = OnceLock::new();
+
+/// Returns the process-wide libkrun bindings, loading them on first use.
+///
+/// Once the bindings are loaded, later calls return them without looking at
+/// `runtime_dir` again.
+///
+/// # Errors
+///
+/// Returns a message when `runtime_dir` fails validation or the library or one
+/// of its symbols cannot be loaded.
+pub fn libkrun(runtime_dir: &Path) -> Result<&'static LibKrun, String> {
+    if let Some(lib) = LIBKRUN.get() {
+        return Ok(lib);
+    }
+
+    validate_runtime_dir(runtime_dir)?;
+    let loaded = LibKrun::load(runtime_dir)?;
+    // If another thread stored its bindings first, those are returned and
+    // `loaded` is dropped.
+    Ok(LIBKRUN.get_or_init(|| loaded))
+}
+
+/// File name of the libkrun shared library on the current platform.
+pub fn required_runtime_lib_name() -> &'static str {
+    #[cfg(target_os = "macos")]
+    {
+        "libkrun.dylib"
+    }
+    #[cfg(not(target_os = "macos"))]
+    {
+        "libkrun.so"
+    }
+}
+
+impl LibKrun {
+    /// Loads libkrun from `runtime_dir` and resolves every required symbol.
+    fn load(runtime_dir: &Path) -> Result<Self, String> {
+        let libkrun_path = runtime_dir.join(required_runtime_lib_name());
+        preload_runtime_support_libraries(runtime_dir)?;
+
+        // The library is leaked on purpose: the function pointers copied out of
+        // it below must stay valid for the rest of the process.
+        let library = Box::leak(Box::new(unsafe {
+            Library::new(&libkrun_path)
+                .map_err(|e| format!("load libkrun from {}: {e}", libkrun_path.display()))?
+        }));
+
+        Ok(Self {
+            krun_init_log: load_symbol(library, b"krun_init_log\0", &libkrun_path)?,
+            krun_create_ctx: load_symbol(library, b"krun_create_ctx\0", &libkrun_path)?,
+            krun_free_ctx: load_symbol(library, b"krun_free_ctx\0", &libkrun_path)?,
+            krun_set_vm_config: load_symbol(library, b"krun_set_vm_config\0", &libkrun_path)?,
+            krun_set_root: load_symbol(library, b"krun_set_root\0", &libkrun_path)?,
+            krun_set_workdir: load_symbol(library, b"krun_set_workdir\0", &libkrun_path)?,
+            krun_set_exec: load_symbol(library, b"krun_set_exec\0", &libkrun_path)?,
+            krun_set_port_map: load_symbol(library, b"krun_set_port_map\0", &libkrun_path)?,
+            krun_set_console_output: load_symbol(
+                library,
+                b"krun_set_console_output\0",
+                &libkrun_path,
+            )?,
+            krun_start_enter: load_symbol(library, b"krun_start_enter\0", &libkrun_path)?,
+            krun_disable_implicit_vsock: load_symbol(
+                library,
+                b"krun_disable_implicit_vsock\0",
+                &libkrun_path,
+            )?,
+            krun_add_vsock: load_symbol(library, b"krun_add_vsock\0", &libkrun_path)?,
+            #[cfg(target_os = "macos")]
+            krun_add_net_unixgram: load_symbol(library, b"krun_add_net_unixgram\0", &libkrun_path)?,
+            krun_add_net_unixstream: load_symbol(
+                library,
+                b"krun_add_net_unixstream\0",
+                &libkrun_path,
+            )?,
+        })
+    }
+}
+
+/// Opens every `libkrunfw*` library in `runtime_dir` with `RTLD_NOW | RTLD_GLOBAL`.
+///
+/// Libraries are opened in sorted path order and the handles are never closed.
+/// Returns the paths that were opened.
+///
+/// # Errors
+///
+/// Returns a message when the directory cannot be read, a path contains a NUL
+/// byte, or `dlopen` fails.
+fn preload_runtime_support_libraries(runtime_dir: &Path) -> Result<Vec<PathBuf>, String> {
+    let entries = std::fs::read_dir(runtime_dir)
+        .map_err(|e| format!("read {}: {e}", runtime_dir.display()))?;
+
+    let mut support_libs: Vec<PathBuf> = entries
+        .filter_map(Result::ok)
+        .map(|entry| entry.path())
+        .filter(|path| {
+            path.file_name()
+                .and_then(|name| name.to_str())
+                .is_some_and(|name| {
+                    #[cfg(target_os = "macos")]
+                    {
+                        name.starts_with("libkrunfw") && name.ends_with(".dylib")
+                    }
+                    #[cfg(not(target_os = "macos"))]
+                    {
+                        name.starts_with("libkrunfw") && name.contains(".so")
+                    }
+                })
+        })
+        .collect();
+
+    support_libs.sort();
+    for path in &support_libs {
+        // Pass the raw OS bytes to dlopen; a lossy UTF-8 conversion would alter
+        // a non-UTF-8 path and make dlopen look for the wrong file.
+        let path_cstr = CString::new(std::os::unix::ffi::OsStrExt::as_bytes(path.as_os_str()))
+            .map_err(|e| format!("invalid support library path {}: {e}", path.display()))?;
+        let handle =
+            unsafe { libc::dlopen(path_cstr.as_ptr(), libc::RTLD_NOW | libc::RTLD_GLOBAL) };
+        if handle.is_null() {
+            let error = unsafe {
+                let err = libc::dlerror();
+                if err.is_null() {
+                    "unknown dlopen error".to_string()
+                } else {
+                    CStr::from_ptr(err).to_string_lossy().into_owned()
+                }
+            };
+            return Err(format!(
+                "preload runtime support library {}: {error}",
+                path.display()
+            ));
+        }
+    }
+
+    Ok(support_libs)
+}
+
+/// Resolves the NUL-terminated symbol `name` in `library` and copies it out as `T`.
+///
+/// `path` is used only in the error message.
+fn load_symbol<T: Copy>(library: &'static Library, name: &[u8], path: &Path) -> Result<T, String> {
+    unsafe {
+        library.get::<T>(name).map(|symbol| *symbol).map_err(|e| {
+            format!(
+                "load symbol {} from {}: {e}",
+                String::from_utf8_lossy(name).trim_end_matches('\0'),
+                path.display()
+            )
+        })
+    }
+}
diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs
new file mode 100644
index 000000000..1c424deeb
--- /dev/null
+++ b/crates/openshell-driver-vm/src/lib.rs
@@ -0,0 +1,13 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026
NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod driver; +mod embedded_runtime; +mod ffi; +mod rootfs; +mod runtime; + +pub const GUEST_SSH_PORT: u16 = 2222; + +pub use driver::{VmDriver, VmDriverConfig}; +pub use runtime::{VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm}; diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs new file mode 100644 index 000000000..3a7976273 --- /dev/null +++ b/crates/openshell-driver-vm/src/main.rs @@ -0,0 +1,229 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use clap::Parser; +use miette::{IntoDiagnostic, Result}; +use openshell_core::VERSION; +use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; +use openshell_driver_vm::{ + VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, run_vm, +}; +use std::net::SocketAddr; +use std::path::PathBuf; +use tokio::net::UnixListener; +use tokio_stream::wrappers::UnixListenerStream; +use tracing::info; +use tracing_subscriber::EnvFilter; + +#[derive(Parser, Debug)] +#[command(name = "openshell-driver-vm")] +#[command(version = VERSION)] +struct Args { + #[arg(long, hide = true, default_value_t = false)] + internal_run_vm: bool, + + #[arg(long, hide = true)] + vm_rootfs: Option, + + #[arg(long, hide = true)] + vm_exec: Option, + + #[arg(long, hide = true, default_value = "/")] + vm_workdir: String, + + #[arg(long, hide = true)] + vm_env: Vec, + + #[arg(long, hide = true)] + vm_port: Vec, + + #[arg(long, hide = true)] + vm_console_output: Option, + + #[arg(long, hide = true, default_value_t = 2)] + vm_vcpus: u8, + + #[arg(long, hide = true, default_value_t = 2048)] + vm_mem_mib: u32, + + #[arg(long, hide = true, default_value_t = 1)] + vm_krun_log_level: u32, + + #[arg( + long, + env = "OPENSHELL_COMPUTE_DRIVER_BIND", + 
default_value = "127.0.0.1:50061" + )] + bind_address: SocketAddr, + + #[arg(long, env = "OPENSHELL_COMPUTE_DRIVER_SOCKET")] + bind_socket: Option, + + #[arg(long, env = "OPENSHELL_LOG_LEVEL", default_value = "info")] + log_level: String, + + #[arg(long, env = "OPENSHELL_GRPC_ENDPOINT")] + openshell_endpoint: Option, + + #[arg( + long, + env = "OPENSHELL_VM_DRIVER_STATE_DIR", + default_value = "target/openshell-vm-driver" + )] + state_dir: PathBuf, + + #[arg(long, env = "OPENSHELL_SSH_HANDSHAKE_SECRET")] + ssh_handshake_secret: Option, + + #[arg(long, env = "OPENSHELL_SSH_HANDSHAKE_SKEW_SECS", default_value_t = 300)] + ssh_handshake_skew_secs: u64, + + #[arg(long = "guest-tls-ca", env = "OPENSHELL_VM_TLS_CA")] + guest_tls_ca: Option, + + #[arg(long = "guest-tls-cert", env = "OPENSHELL_VM_TLS_CERT")] + guest_tls_cert: Option, + + #[arg(long = "guest-tls-key", env = "OPENSHELL_VM_TLS_KEY")] + guest_tls_key: Option, + + #[arg(long, env = "OPENSHELL_VM_KRUN_LOG_LEVEL", default_value_t = 1)] + krun_log_level: u32, + + #[arg(long, env = "OPENSHELL_VM_DRIVER_VCPUS", default_value_t = 2)] + vcpus: u8, + + #[arg(long, env = "OPENSHELL_VM_DRIVER_MEM_MIB", default_value_t = 2048)] + mem_mib: u32, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + if args.internal_run_vm { + maybe_reexec_internal_vm_with_runtime_env()?; + let config = build_vm_launch_config(&args).map_err(|err| miette::miette!("{err}"))?; + run_vm(&config).map_err(|err| miette::miette!("{err}"))?; + return Ok(()); + } + + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), + ) + .init(); + + let driver = VmDriver::new(VmDriverConfig { + openshell_endpoint: args + .openshell_endpoint + .ok_or_else(|| miette::miette!("OPENSHELL_GRPC_ENDPOINT is required"))?, + state_dir: args.state_dir, + launcher_bin: None, + ssh_handshake_secret: args.ssh_handshake_secret.unwrap_or_default(), + 
ssh_handshake_skew_secs: args.ssh_handshake_skew_secs, + log_level: args.log_level, + krun_log_level: args.krun_log_level, + vcpus: args.vcpus, + mem_mib: args.mem_mib, + guest_tls_ca: args.guest_tls_ca, + guest_tls_cert: args.guest_tls_cert, + guest_tls_key: args.guest_tls_key, + }) + .await + .map_err(|err| miette::miette!("{err}"))?; + + if let Some(socket_path) = args.bind_socket { + if let Some(parent) = socket_path.parent() { + std::fs::create_dir_all(parent).into_diagnostic()?; + } + match std::fs::remove_file(&socket_path) { + Ok(()) => {} + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => return Err(err).into_diagnostic(), + } + + info!(socket = %socket_path.display(), "Starting vm compute driver"); + let listener = UnixListener::bind(&socket_path).into_diagnostic()?; + let result = tonic::transport::Server::builder() + .add_service(ComputeDriverServer::new(driver)) + .serve_with_incoming(UnixListenerStream::new(listener)) + .await + .into_diagnostic(); + let _ = std::fs::remove_file(&socket_path); + result + } else { + info!(address = %args.bind_address, "Starting vm compute driver"); + tonic::transport::Server::builder() + .add_service(ComputeDriverServer::new(driver)) + .serve(args.bind_address) + .await + .into_diagnostic() + } +} + +fn build_vm_launch_config(args: &Args) -> std::result::Result { + let rootfs = args + .vm_rootfs + .clone() + .ok_or_else(|| "--vm-rootfs is required in internal VM mode".to_string())?; + let exec_path = args + .vm_exec + .clone() + .ok_or_else(|| "--vm-exec is required in internal VM mode".to_string())?; + let console_output = args + .vm_console_output + .clone() + .ok_or_else(|| "--vm-console-output is required in internal VM mode".to_string())?; + + Ok(VmLaunchConfig { + rootfs, + vcpus: args.vm_vcpus, + mem_mib: args.vm_mem_mib, + exec_path, + args: Vec::new(), + env: args.vm_env.clone(), + workdir: args.vm_workdir.clone(), + port_map: args.vm_port.clone(), + log_level: args.vm_krun_log_level, 
+ console_output, + }) +} + +#[cfg(target_os = "macos")] +fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> { + const REEXEC_ENV: &str = "__OPENSHELL_DRIVER_VM_REEXEC"; + + if std::env::var_os(REEXEC_ENV).is_some() { + return Ok(()); + } + + let runtime_dir = configured_runtime_dir().map_err(|err| miette::miette!("{err}"))?; + let runtime_str = runtime_dir.to_string_lossy(); + let needs_reexec = std::env::var_os("DYLD_LIBRARY_PATH") + .is_none_or(|value| !value.to_string_lossy().contains(runtime_str.as_ref())); + if !needs_reexec { + return Ok(()); + } + + let mut dyld_paths = vec![runtime_dir.clone()]; + if let Some(existing) = std::env::var_os("DYLD_LIBRARY_PATH") { + dyld_paths.extend(std::env::split_paths(&existing)); + } + let joined = std::env::join_paths(&dyld_paths) + .map_err(|err| miette::miette!("join DYLD_LIBRARY_PATH: {err}"))?; + let exe = std::env::current_exe().into_diagnostic()?; + let args: Vec = std::env::args().skip(1).collect(); + let status = std::process::Command::new(exe) + .args(&args) + .env("DYLD_LIBRARY_PATH", &joined) + .env(VM_RUNTIME_DIR_ENV, runtime_dir) + .env(REEXEC_ENV, "1") + .status() + .into_diagnostic()?; + std::process::exit(status.code().unwrap_or(1)); +} + +#[cfg(not(target_os = "macos"))] +fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> { + Ok(()) +} diff --git a/crates/openshell-driver-vm/src/rootfs.rs b/crates/openshell-driver-vm/src/rootfs.rs new file mode 100644 index 000000000..b9b29b5fc --- /dev/null +++ b/crates/openshell-driver-vm/src/rootfs.rs @@ -0,0 +1,234 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+use std::fs;
+use std::io::Cursor;
+use std::path::Path;
+
+const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst"));
+const ROOTFS_VARIANT_MARKER: &str = ".openshell-rootfs-variant";
+const SANDBOX_GUEST_INIT_PATH: &str = "/srv/openshell-vm-sandbox-init.sh";
+
+/// Absolute path, inside the guest, of the sandbox init script.
+pub const fn sandbox_guest_init_path() -> &'static str {
+    SANDBOX_GUEST_INIT_PATH
+}
+
+/// Extracts the embedded rootfs into `dest` and converts it to the sandbox layout.
+///
+/// The work is skipped when `dest` already holds a marker file matching this
+/// crate version. Otherwise any existing `dest` is removed, the tarball is
+/// unpacked, the sandbox preparation runs, and the marker is written last.
+///
+/// # Errors
+///
+/// Returns a message when no rootfs was embedded at build time or when any
+/// filesystem or decompression step fails.
+pub fn extract_sandbox_rootfs_to(dest: &Path) -> Result<(), String> {
+    if ROOTFS.is_empty() {
+        return Err(
+            "sandbox rootfs not embedded. Build openshell-driver-vm with OPENSHELL_VM_RUNTIME_COMPRESSED_DIR set or run `mise run vm:setup` first"
+                .to_string(),
+        );
+    }
+
+    let expected_marker = format!("{}:sandbox", env!("CARGO_PKG_VERSION"));
+    let marker_path = dest.join(ROOTFS_VARIANT_MARKER);
+
+    // An unreadable or mismatching marker means the tree must be rebuilt.
+    if dest.is_dir()
+        && fs::read_to_string(&marker_path)
+            .map(|value| value.trim() == expected_marker)
+            .unwrap_or(false)
+    {
+        return Ok(());
+    }
+
+    if dest.exists() {
+        fs::remove_dir_all(dest)
+            .map_err(|e| format!("remove old rootfs {}: {e}", dest.display()))?;
+    }
+
+    extract_rootfs_to(dest)?;
+    prepare_sandbox_rootfs(dest)?;
+    // Written last so an interrupted extraction is never mistaken for a complete one.
+    fs::write(marker_path, format!("{expected_marker}\n"))
+        .map_err(|e| format!("write rootfs variant marker: {e}"))?;
+    Ok(())
+}
+
+/// Decompresses the embedded zstd tarball and unpacks it into `dest`.
+fn extract_rootfs_to(dest: &Path) -> Result<(), String> {
+    fs::create_dir_all(dest).map_err(|e| format!("create rootfs dir {}: {e}", dest.display()))?;
+
+    let decoder =
+        zstd::Decoder::new(Cursor::new(ROOTFS)).map_err(|e| format!("decompress rootfs: {e}"))?;
+    let mut archive = tar::Archive::new(decoder);
+    archive
+        .unpack(dest)
+        .map_err(|e| format!("extract rootfs tarball into {}: {e}", dest.display()))
+}
+
+/// Rewrites an extracted rootfs in place into the sandbox layout.
+///
+/// Removes the listed k3s/kubectl/rancher/chart paths, installs the sandbox
+/// init script with mode `0o755`, writes the `sandbox` rootfs-type marker,
+/// ensures the `sandbox` user and group entries exist, and creates `/sandbox`.
+fn prepare_sandbox_rootfs(rootfs: &Path) -> Result<(), String> {
+    for relative in [
+        "usr/local/bin/k3s",
+        "usr/local/bin/kubectl",
+        "var/lib/rancher",
+        "etc/rancher",
+        "opt/openshell/charts",
+        "opt/openshell/manifests",
+        "opt/openshell/.initialized",
+        "opt/openshell/.rootfs-type",
+    ] {
+        remove_rootfs_path(rootfs, relative)?;
+    }
+
+    // Derive the host-side location from the guest path so the two cannot drift.
+    let init_path = rootfs.join(SANDBOX_GUEST_INIT_PATH.trim_start_matches('/'));
+    if let Some(parent) = init_path.parent() {
+        fs::create_dir_all(parent).map_err(|e| format!("create {}: {e}", parent.display()))?;
+    }
+    fs::write(
+        &init_path,
+        include_str!("../scripts/openshell-vm-sandbox-init.sh"),
+    )
+    .map_err(|e| format!("write {}: {e}", init_path.display()))?;
+    #[cfg(unix)]
+    {
+        use std::os::unix::fs::PermissionsExt as _;
+
+        fs::set_permissions(&init_path, fs::Permissions::from_mode(0o755))
+            .map_err(|e| format!("chmod {}: {e}", init_path.display()))?;
+    }
+
+    let opt_dir = rootfs.join("opt/openshell");
+    fs::create_dir_all(&opt_dir).map_err(|e| format!("create {}: {e}", opt_dir.display()))?;
+    fs::write(opt_dir.join(".rootfs-type"), "sandbox\n")
+        .map_err(|e| format!("write sandbox rootfs marker: {e}"))?;
+    ensure_sandbox_guest_user(rootfs)?;
+    fs::create_dir_all(rootfs.join("sandbox"))
+        .map_err(|e| format!("create sandbox workdir: {e}"))?;
+
+    Ok(())
+}
+
+/// Ensures `sandbox` entries exist in the rootfs `group`, `gshadow`, `passwd`
+/// and `shadow` files, appending them only when no `sandbox:` line is present.
+fn ensure_sandbox_guest_user(rootfs: &Path) -> Result<(), String> {
+    const SANDBOX_UID: u32 = 10001;
+    const SANDBOX_GID: u32 = 10001;
+
+    let etc_dir = rootfs.join("etc");
+    fs::create_dir_all(&etc_dir).map_err(|e| format!("create {}: {e}", etc_dir.display()))?;
+
+    ensure_line_in_file(
+        &etc_dir.join("group"),
+        &format!("sandbox:x:{SANDBOX_GID}:"),
+        |line| line.starts_with("sandbox:"),
+    )?;
+    ensure_line_in_file(&etc_dir.join("gshadow"), "sandbox:!::", |line| {
+        line.starts_with("sandbox:")
+    })?;
+    ensure_line_in_file(
+        &etc_dir.join("passwd"),
+        &format!("sandbox:x:{SANDBOX_UID}:{SANDBOX_GID}:OpenShell Sandbox:/sandbox:/bin/bash"),
+        |line| line.starts_with("sandbox:"),
+    )?;
+    ensure_line_in_file(
+        &etc_dir.join("shadow"),
+        "sandbox:!:20123:0:99999:7:::",
+        |line| line.starts_with("sandbox:"),
+    )?;
+
+    Ok(())
+}
+
+/// Appends `line` to the file at `path` unless some existing line satisfies `exists`.
+///
+/// A missing file is treated as empty and created. A trailing newline is added
+/// to the existing contents first when one is missing.
+fn ensure_line_in_file(
+    path: &Path,
+    line: &str,
+    exists: impl Fn(&str) -> bool,
+) -> Result<(), String> {
+    let mut contents = if path.exists() {
+        fs::read_to_string(path).map_err(|e| format!("read {}: {e}", path.display()))?
+    } else {
+        String::new()
+    };
+
+    if contents.lines().any(exists) {
+        return Ok(());
+    }
+
+    if !contents.is_empty() && !contents.ends_with('\n') {
+        contents.push('\n');
+    }
+    contents.push_str(line);
+    contents.push('\n');
+
+    fs::write(path, contents).map_err(|e| format!("write {}: {e}", path.display()))
+}
+
+/// Removes `relative` from under `rootfs` if it is present.
+///
+/// The entry itself is inspected without following symlinks, so a dangling
+/// symlink (for example a `kubectl` link whose `k3s` target was just removed)
+/// is deleted rather than skipped. A missing path is not an error.
+fn remove_rootfs_path(rootfs: &Path, relative: &str) -> Result<(), String> {
+    let path = rootfs.join(relative);
+    let metadata = match fs::symlink_metadata(&path) {
+        Ok(metadata) => metadata,
+        // Nothing to remove: the entry, or one of its parent directories, is absent.
+        Err(err)
+            if matches!(
+                err.kind(),
+                std::io::ErrorKind::NotFound | std::io::ErrorKind::NotADirectory
+            ) =>
+        {
+            return Ok(());
+        }
+        Err(err) => return Err(format!("stat {}: {err}", path.display())),
+    };
+
+    // Symlinks report `is_dir() == false` here, so only the link itself is unlinked.
+    let result = if metadata.is_dir() {
+        fs::remove_dir_all(&path)
+    } else {
+        fs::remove_file(&path)
+    };
+    result.map_err(|e| format!("remove {}: {e}", path.display()))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::path::PathBuf;
+    use std::sync::atomic::{AtomicU64, Ordering};
+    use std::time::{SystemTime, UNIX_EPOCH};
+
+    #[test]
+    fn prepare_sandbox_rootfs_rewrites_guest_layout() {
+        let dir = unique_temp_dir();
+        let rootfs = dir.join("rootfs");
+
+        fs::create_dir_all(rootfs.join("usr/local/bin")).expect("create usr/local/bin");
+        fs::create_dir_all(rootfs.join("etc")).expect("create etc");
+        fs::create_dir_all(rootfs.join("var/lib/rancher")).expect("create var/lib/rancher");
+        fs::create_dir_all(rootfs.join("opt/openshell/charts")).expect("create charts");
+        fs::create_dir_all(rootfs.join("opt/openshell/manifests")).expect("create manifests");
+        fs::write(rootfs.join("usr/local/bin/k3s"), b"k3s").expect("write k3s");
+        fs::write(rootfs.join("usr/local/bin/kubectl"), b"kubectl").expect("write kubectl");
+        fs::write(rootfs.join("opt/openshell/.initialized"), b"yes").expect("write initialized");
+        fs::write(
+            rootfs.join("etc/passwd"),
+            "root:x:0:0:root:/root:/bin/bash\n",
+        )
+        .expect("write passwd");
+        fs::write(rootfs.join("etc/group"), "root:x:0:\n").expect("write group");
+        fs::write(rootfs.join("etc/hosts"), "127.0.0.1 localhost\n").expect("write hosts");
+
+        prepare_sandbox_rootfs(&rootfs).expect("prepare sandbox rootfs");
+
+        assert!(!rootfs.join("usr/local/bin/k3s").exists());
+        assert!(!rootfs.join("usr/local/bin/kubectl").exists());
+        assert!(!rootfs.join("var/lib/rancher").exists());
+        assert!(!rootfs.join("opt/openshell/charts").exists());
+        assert!(!rootfs.join("opt/openshell/manifests").exists());
+        assert!(rootfs.join("srv/openshell-vm-sandbox-init.sh").is_file());
+        assert!(rootfs.join("sandbox").is_dir());
+        assert!(
+            fs::read_to_string(rootfs.join("etc/passwd"))
+                .expect("read passwd")
+                .contains("sandbox:x:10001:10001:OpenShell Sandbox:/sandbox:/bin/bash")
+        );
+        assert!(
+            fs::read_to_string(rootfs.join("etc/group"))
+                .expect("read group")
+                .contains("sandbox:x:10001:")
+        );
+        assert_eq!(
+            fs::read_to_string(rootfs.join("etc/hosts")).expect("read hosts"),
+            "127.0.0.1 localhost\n"
+        );
+
+        let _ = fs::remove_dir_all(&dir);
+    }
+
+    #[cfg(unix)]
+    #[test]
+    fn remove_rootfs_path_removes_dangling_symlinks() {
+        let dir = unique_temp_dir();
+        let rootfs = dir.join("rootfs");
+        let link = rootfs.join("usr/local/bin/kubectl");
+
+        fs::create_dir_all(rootfs.join("usr/local/bin")).expect("create usr/local/bin");
+        // The link target `k3s` does not exist, so the symlink is dangling.
+        std::os::unix::fs::symlink("k3s", &link).expect("symlink kubectl");
+
+        remove_rootfs_path(&rootfs, "usr/local/bin/kubectl").expect("remove dangling symlink");
+        remove_rootfs_path(&rootfs, "usr/local/bin/missing").expect("missing path is ok");
+
+        assert!(fs::symlink_metadata(&link).is_err());
+
+        let _ = fs::remove_dir_all(&dir);
+    }
+
+    fn unique_temp_dir() -> PathBuf {
+        static COUNTER: AtomicU64 = AtomicU64::new(0);
+        let nanos = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("time went backwards")
+            .as_nanos();
+        let suffix = COUNTER.fetch_add(1, Ordering::Relaxed);
+        std::env::temp_dir().join(format!(
+            "openshell-driver-vm-rootfs-test-{}-{nanos}-{suffix}",
+            std::process::id()
+        ))
+    }
+}
diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs
new file mode 100644
index 000000000..9888feb18
--- /dev/null
+++ b/crates/openshell-driver-vm/src/runtime.rs
@@ -0,0 +1,877 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#![allow(unsafe_code)]
+
+use std::ffi::CString;
+use std::io::{Read, Write};
+use std::os::unix::net::UnixStream;
+use std::path::{Path, PathBuf};
+use std::process::{Child as StdChild, Command as StdCommand, Stdio};
+use std::ptr;
+use std::sync::atomic::{AtomicI32, Ordering};
+use std::time::{Duration, Instant};
+
+use crate::{GUEST_SSH_PORT, embedded_runtime, ffi};
+
+pub const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR";
+
+// Pid of the forked VM process, read by the signal handler; 0 means "none".
+static CHILD_PID: AtomicI32 = AtomicI32::new(0);
+
+/// Everything `run_vm` needs to configure and start one VM.
+pub struct VmLaunchConfig {
+    pub rootfs: PathBuf,
+    pub vcpus: u8,
+    pub mem_mib: u32,
+    pub exec_path: String,
+    pub args: Vec<String>,
+    pub env: Vec<String>,
+    pub workdir: String,
+    // Entries are `host:guest` or a single port used for both sides.
+    pub port_map: Vec<String>,
+    pub log_level: u32,
+    pub console_output: PathBuf,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+struct PortMapping {
+    host_port: u16,
+    guest_port: u16,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct GvproxyPortPlan {
+    ssh_port: u16,
+    forwarded_ports: Vec<String>,
+}
+
+/// Configures a libkrun VM from `config`, forks, and runs the VM in the child.
+///
+/// When `config.port_map` is non-empty a gvproxy process is started for
+/// networking and the mapped ports are exposed through its API socket. The
+/// parent forwards SIGINT/SIGTERM to the VM process, waits for it to exit, and
+/// then stops gvproxy.
+///
+/// # Errors
+///
+/// Returns a message when the rootfs or runtime is missing, any libkrun call
+/// fails, gvproxy cannot be started or configured, or the VM process exits
+/// with a non-zero status or is killed by a signal.
+pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
+    if !config.rootfs.is_dir() {
+        return Err(format!(
+            "rootfs directory not found: {}",
+            config.rootfs.display()
+        ));
+    }
+
+    #[cfg(target_os = "linux")]
+    check_kvm_access()?;
+
+    let runtime_dir = configured_runtime_dir()?;
+    validate_runtime_dir(&runtime_dir)?;
+    configure_runtime_loader_env(&runtime_dir)?;
+    raise_nofile_limit();
+
+    let vm = VmContext::create(&runtime_dir, config.log_level)?;
+    vm.set_vm_config(config.vcpus, config.mem_mib)?;
+    vm.set_root(&config.rootfs)?;
+    vm.set_workdir(&config.workdir)?;
+
+    let mut forwarded_port_map = config.port_map.clone();
+    let mut gvproxy_guard = None;
+    let mut gvproxy_api_sock = None;
+    if !config.port_map.is_empty() {
+        let gvproxy_binary = runtime_dir.join("gvproxy");
+        if !gvproxy_binary.is_file() {
+            return Err(format!(
+                "missing runtime file: {}",
+                gvproxy_binary.display()
+            ));
+        }
+
+        kill_stale_gvproxy_by_port_map(&config.port_map);
+
+        let sock_base = gvproxy_socket_base(&config.rootfs)?;
+        let net_sock = sock_base.with_extension("v");
+        let api_sock = sock_base.with_extension("a");
+        // Best-effort removal of sockets left behind by a previous run.
+        let _ = std::fs::remove_file(&net_sock);
+        let _ = std::fs::remove_file(&api_sock);
+        let _ = std::fs::remove_file(sock_base.with_extension("v-krun.sock"));
+
+        let run_dir = config.rootfs.parent().unwrap_or(&config.rootfs);
+        let gvproxy_log = run_dir.join("gvproxy.log");
+        let gvproxy_log_file = std::fs::File::create(&gvproxy_log)
+            .map_err(|e| format!("create gvproxy log {}: {e}", gvproxy_log.display()))?;
+
+        let gvproxy_ports = plan_gvproxy_ports(&config.port_map)?;
+        forwarded_port_map = gvproxy_ports.forwarded_ports;
+
+        #[cfg(target_os = "linux")]
+        let (gvproxy_net_flag, gvproxy_net_url) =
+            ("-listen-qemu", format!("unix://{}", net_sock.display()));
+        #[cfg(target_os = "macos")]
+        let (gvproxy_net_flag, gvproxy_net_url) = (
+            "-listen-vfkit",
+            format!("unixgram://{}", net_sock.display()),
+        );
+
+        let child = StdCommand::new(&gvproxy_binary)
+            .arg(gvproxy_net_flag)
+            .arg(&gvproxy_net_url)
+            .arg("-listen")
+            .arg(format!("unix://{}", api_sock.display()))
+            .arg("-ssh-port")
+            .arg(gvproxy_ports.ssh_port.to_string())
+            .stdin(Stdio::null())
+            .stdout(Stdio::null())
+            .stderr(gvproxy_log_file)
+            .spawn()
+            .map_err(|e| format!("failed to start gvproxy {}: {e}", gvproxy_binary.display()))?;
+
+        // Arm the guard immediately so gvproxy is killed and reaped on every
+        // early return below, instead of being leaked as an orphan.
+        gvproxy_guard = Some(GvproxyGuard::new(child));
+
+        wait_for_path(&net_sock, Duration::from_secs(5), "gvproxy data socket")?;
+
+        vm.disable_implicit_vsock()?;
+        vm.add_vsock(0)?;
+
+        let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee];
+        const NET_FEATURE_CSUM: u32 = 1 << 0;
+        const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1;
+        const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7;
+        const NET_FEATURE_GUEST_UFO: u32 = 1 << 10;
+        const NET_FEATURE_HOST_TSO4: u32 = 1 << 11;
+        const NET_FEATURE_HOST_UFO: u32 = 1 << 14;
+        const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM
+            | NET_FEATURE_GUEST_CSUM
+            | NET_FEATURE_GUEST_TSO4
+            | NET_FEATURE_GUEST_UFO
+            | NET_FEATURE_HOST_TSO4
+            | NET_FEATURE_HOST_UFO;
+
+        #[cfg(target_os = "linux")]
+        vm.add_net_unixstream(&net_sock, &mac, COMPAT_NET_FEATURES)?;
+        #[cfg(target_os = "macos")]
+        {
+            const NET_FLAG_VFKIT: u32 = 1 << 0;
+            vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?;
+        }
+
+        gvproxy_api_sock = Some(api_sock);
+    }
+
+    if !config.port_map.is_empty() && gvproxy_api_sock.is_none() {
+        vm.set_port_map(&config.port_map)?;
+    }
+    vm.set_console_output(&config.console_output)?;
+
+    // Fall back to a minimal environment when the caller supplied none.
+    let env = if config.env.is_empty() {
+        vec![
+            "HOME=/root".to_string(),
+            "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
+            "TERM=xterm".to_string(),
+        ]
+    } else {
+        config.env.clone()
+    };
+    vm.set_exec(&config.exec_path, &config.args, &env)?;
+
+    let pid = unsafe { libc::fork() };
+    match pid {
+        -1 => Err(format!("fork failed: {}", std::io::Error::last_os_error())),
+        0 => {
+            // Child: reaching the lines after `start_enter` means it failed.
+            let ret = vm.start_enter();
+            eprintln!("krun_start_enter failed: {ret}");
+            std::process::exit(1);
+        }
+        _ => {
+            install_signal_forwarding(pid);
+
+            let port_forward_result = if let Some(api_sock) = gvproxy_api_sock.as_ref() {
+                expose_port_map(api_sock, &forwarded_port_map)
+            } else {
+                Ok(())
+            };
+
+            if let Err(err) = port_forward_result {
+                unsafe {
+                    libc::kill(pid, libc::SIGTERM);
+                }
+                let _ = wait_for_child(pid);
+                // The child is gone; stop forwarding signals to its (reusable) pid.
+                CHILD_PID.store(0, Ordering::Relaxed);
+                cleanup_gvproxy(gvproxy_guard);
+                return Err(err);
+            }
+
+            let status = wait_for_child(pid)?;
+            CHILD_PID.store(0, Ordering::Relaxed);
+            cleanup_gvproxy(gvproxy_guard);
+
+            if libc::WIFEXITED(status) {
+                match libc::WEXITSTATUS(status) {
+                    0 => Ok(()),
+                    code => Err(format!("VM exited with status {code}")),
+                }
+            } else if libc::WIFSIGNALED(status) {
+                let sig = libc::WTERMSIG(status);
+                Err(format!("VM killed by signal {sig}"))
+            } else {
+                Err(format!("VM exited with unexpected wait status {status}"))
+            }
+        }
+    }
+}
+
+/// Checks that `dir` exists and then delegates to the embedded-runtime validation.
+pub fn validate_runtime_dir(dir: &Path) -> Result<(), String> {
+    if !dir.is_dir() {
+        return Err(format!(
+            "VM runtime not found at {}. Run `mise run vm:setup` or set {VM_RUNTIME_DIR_ENV}",
+            dir.display()
+        ));
+    }
+
+    embedded_runtime::validate_runtime_dir(dir)
+}
+
+/// Returns the runtime directory: the `OPENSHELL_VM_RUNTIME_DIR` override when
+/// set, otherwise the location of the extracted embedded runtime.
+pub fn configured_runtime_dir() -> Result<PathBuf, String> {
+    if let Some(path) = std::env::var_os(VM_RUNTIME_DIR_ENV) {
+        return Ok(PathBuf::from(path));
+    }
+    embedded_runtime::ensure_runtime_extracted()
+}
+
+/// Prepends `runtime_dir` to `DYLD_FALLBACK_LIBRARY_PATH` for this process.
+#[cfg(target_os = "macos")]
+fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), String> {
+    let existing = std::env::var_os("DYLD_FALLBACK_LIBRARY_PATH");
+    let mut paths = vec![runtime_dir.to_path_buf()];
+    if let Some(existing) = existing {
+        paths.extend(std::env::split_paths(&existing));
+    }
+    let joined =
+        std::env::join_paths(paths).map_err(|e| format!("join DYLD_FALLBACK_LIBRARY_PATH: {e}"))?;
+    unsafe {
+        std::env::set_var("DYLD_FALLBACK_LIBRARY_PATH", joined);
+    }
+    Ok(())
+}
+
+/// Prepends `runtime_dir` to `LD_LIBRARY_PATH` for this process.
+#[cfg(target_os = "linux")]
+fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), String> {
+    let existing = std::env::var_os("LD_LIBRARY_PATH");
+    let mut paths = vec![runtime_dir.to_path_buf()];
+    if let Some(existing) = existing {
+        paths.extend(std::env::split_paths(&existing));
+    }
+    let joined = std::env::join_paths(paths).map_err(|e| format!("join LD_LIBRARY_PATH: {e}"))?;
+    unsafe {
+        std::env::set_var("LD_LIBRARY_PATH", joined);
+    }
+    Ok(())
+}
+
+/// No loader environment is configured on other targets.
+#[cfg(not(any(target_os = "macos", target_os = "linux")))]
+fn configure_runtime_loader_env(_runtime_dir: &Path) -> Result<(), String> {
+    Ok(())
+}
+
+/// Best-effort: raises the soft `RLIMIT_NOFILE` limit to the hard limit.
+/// Failures of either call are ignored.
+fn raise_nofile_limit() {
+    #[cfg(unix)]
+    unsafe {
+        let mut rlim = libc::rlimit {
+            rlim_cur: 0,
+            rlim_max: 0,
+        };
+        if libc::getrlimit(libc::RLIMIT_NOFILE, &raw mut rlim) == 0 {
+            rlim.rlim_cur = rlim.rlim_max;
+            let _ = libc::setrlimit(libc::RLIMIT_NOFILE, &raw const rlim);
+        }
+    }
+}
+
+/// Maps a numeric level (0-4) to the matching libkrun log-level constant;
+/// anything above 4 maps to TRACE.
+fn clamp_log_level(level: u32) -> u32 {
+    match level {
+        0 => ffi::KRUN_LOG_LEVEL_OFF,
+        1 => ffi::KRUN_LOG_LEVEL_ERROR,
+        2 => ffi::KRUN_LOG_LEVEL_WARN,
+        3 => ffi::KRUN_LOG_LEVEL_INFO,
+        4 => ffi::KRUN_LOG_LEVEL_DEBUG,
+        _ => ffi::KRUN_LOG_LEVEL_TRACE,
+    }
+}
+
+/// Owns one libkrun context id; the context is freed on drop.
+struct VmContext {
+    krun: &'static ffi::LibKrun,
+    ctx_id: u32,
+}
+
+impl VmContext {
+    /// Loads libkrun from `runtime_dir`, initialises its logging, and creates a context.
+    fn create(runtime_dir: &Path, log_level: u32) -> Result<Self, String> {
+        let krun = ffi::libkrun(runtime_dir)?;
+        check(
+            unsafe {
+                (krun.krun_init_log)(
+                    ffi::KRUN_LOG_TARGET_DEFAULT,
+                    clamp_log_level(log_level),
+                    ffi::KRUN_LOG_STYLE_AUTO,
+                    ffi::KRUN_LOG_OPTION_NO_ENV,
+                )
+            },
+            "krun_init_log",
+        )?;
+
+        // A negative return value is an error code, not a context id.
+        let ctx_id = unsafe { (krun.krun_create_ctx)() };
+        if ctx_id < 0 {
+            return Err(format!("krun_create_ctx failed with error code {ctx_id}"));
+        }
+
+        Ok(Self {
+            krun,
+            ctx_id: ctx_id as u32,
+        })
+    }
+
+    /// Sets the vCPU count and memory size (MiB) via `krun_set_vm_config`.
+    fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), String> {
+        check(
+            unsafe { (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib) },
+            "krun_set_vm_config",
+        )
+    }
+
+    /// Sets the root filesystem directory via `krun_set_root`.
+    fn set_root(&self, rootfs: &Path) -> Result<(), String> {
+        let rootfs_c = path_to_cstring(rootfs)?;
+        check(
+            unsafe { (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()) },
+            "krun_set_root",
+        )
+    }
+
+    /// Sets the working directory via `krun_set_workdir`.
+    fn set_workdir(&self, workdir: &str) -> Result<(), String> {
+        let workdir_c = CString::new(workdir).map_err(|e| format!("invalid workdir: {e}"))?;
+        check(
+            unsafe { (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()) },
+            "krun_set_workdir",
+        )
+    }
+
+    /// Calls `krun_disable_implicit_vsock` for this context.
+    fn disable_implicit_vsock(&self) -> Result<(), String> {
+        check(
+            unsafe { (self.krun.krun_disable_implicit_vsock)(self.ctx_id) },
+            "krun_disable_implicit_vsock",
+        )
+    }
+
+    /// Calls `krun_add_vsock` with the given TSI feature bits.
+    fn add_vsock(&self, tsi_features: u32) -> Result<(), String> {
+        check(
+            unsafe { (self.krun.krun_add_vsock)(self.ctx_id, tsi_features) },
+            "krun_add_vsock",
+        )
+    }
+
+    /// Attaches a network device backed by the unixgram socket at `socket_path`.
+    #[cfg(target_os = "macos")]
+    fn add_net_unixgram(
+        &self,
+        socket_path: &Path,
+        mac: &[u8; 6],
+        features: u32,
+        flags: u32,
+    ) -> Result<(), String> {
+        let sock_c = path_to_cstring(socket_path)?;
+        check(
+            unsafe {
+                (self.krun.krun_add_net_unixgram)(
+                    self.ctx_id,
+                    sock_c.as_ptr(),
+                    -1,
+                    mac.as_ptr(),
+                    features,
+                    flags,
+                )
+            },
+            "krun_add_net_unixgram",
+        )
+    }
+
+    /// Attaches a network device backed by the unix stream socket at `socket_path`.
+    #[allow(dead_code)] // Used on Linux when gvproxy runs in qemu/unixstream mode.
+    fn add_net_unixstream(
+        &self,
+        socket_path: &Path,
+        mac: &[u8; 6],
+        features: u32,
+    ) -> Result<(), String> {
+        let sock_c = path_to_cstring(socket_path)?;
+        check(
+            unsafe {
+                (self.krun.krun_add_net_unixstream)(
+                    self.ctx_id,
+                    sock_c.as_ptr(),
+                    -1,
+                    mac.as_ptr(),
+                    features,
+                    0,
+                )
+            },
+            "krun_add_net_unixstream",
+        )
+    }
+
+    /// Passes the port map to `krun_set_port_map` as a NULL-terminated C string array.
+    fn set_port_map(&self, port_map: &[String]) -> Result<(), String> {
+        let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect();
+        // `_owners` keeps the C strings alive while `ptrs` is in use.
+        let (_owners, ptrs) = c_string_array(&port_strs)?;
+        check(
+            unsafe { (self.krun.krun_set_port_map)(self.ctx_id, ptrs.as_ptr()) },
+            "krun_set_port_map",
+        )
+    }
+
+    /// Sets the console output file via `krun_set_console_output`.
+    fn set_console_output(&self, path: &Path) -> Result<(), String> {
+        let console_c = path_to_cstring(path)?;
+        check(
+            unsafe { (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()) },
+            "krun_set_console_output",
+        )
+    }
+
+    /// Sets the guest executable, its arguments, and its environment via `krun_set_exec`.
+    fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), String> {
+        let exec_c = CString::new(exec_path).map_err(|e| format!("invalid exec path: {e}"))?;
+        let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect();
+        let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?;
+        let env_strs: Vec<&str> = env.iter().map(String::as_str).collect();
+        let (_env_owners, env_ptrs) = c_string_array(&env_strs)?;
+
+        check(
+            unsafe {
+                (self.krun.krun_set_exec)(
+                    self.ctx_id,
+                    exec_c.as_ptr(),
+                    argv_ptrs.as_ptr(),
+                    env_ptrs.as_ptr(),
+                )
+            },
+            "krun_set_exec",
+        )
+    }
+
+    /// Calls `krun_start_enter` and returns its result code.
+    fn start_enter(&self) -> i32 {
+        unsafe { (self.krun.krun_start_enter)(self.ctx_id) }
+    }
+}
+
+impl Drop for VmContext {
+    fn drop(&mut self) {
+        let ret = unsafe { (self.krun.krun_free_ctx)(self.ctx_id) };
+        if ret < 0 {
+            eprintln!(
+                "warning: krun_free_ctx({}) failed with code {ret}",
+                self.ctx_id
+            );
+        }
+    }
+}
+
+/// Kills and reaps the wrapped gvproxy process on drop unless disarmed.
+struct GvproxyGuard {
+    child: Option<StdChild>,
+}
+
+impl GvproxyGuard {
+    fn new(child: StdChild) -> Self {
+        Self { child: Some(child) }
+    }
+
+    /// Takes the child out of the guard so that drop no longer touches it.
+    fn disarm(&mut self) -> Option<StdChild> {
+        self.child.take()
+    }
+}
+
+impl Drop for GvproxyGuard {
+    fn drop(&mut self) {
+        if let Some(mut child) = self.child.take() {
+            let _ = child.kill();
+            let _ = child.wait();
+        }
+    }
+}
+
+/// Asks gvproxy, through its API socket, to forward every entry of `port_map`
+/// from the host to the guest address.
+///
+/// Each expose request is retried with a growing delay for up to ten seconds
+/// before the failure is reported.
+fn expose_port_map(api_sock: &Path, port_map: &[String]) -> Result<(), String> {
+    wait_for_path(api_sock, Duration::from_secs(2), "gvproxy API socket")?;
+    let guest_ip = "192.168.127.2";
+
+    for pm in port_map {
+        let mapping = parse_port_mapping(pm)?;
+
+        let expose_body = format!(
+            r#"{{"local":":{}","remote":"{guest_ip}:{}","protocol":"tcp"}}"#,
+            mapping.host_port, mapping.guest_port
+        );
+
+        let deadline = Instant::now() + Duration::from_secs(10);
+        let mut retry_interval = Duration::from_millis(100);
+        loop {
+            match gvproxy_expose(api_sock, &expose_body) {
+                Ok(()) => break,
+                Err(err) if Instant::now() < deadline => {
+                    std::thread::sleep(retry_interval);
+                    retry_interval = (retry_interval * 2).min(Duration::from_secs(1));
+                    // Only log once the back-off has reached its one-second cap.
+                    if retry_interval == Duration::from_secs(1) {
+                        eprintln!("retrying gvproxy port expose {pm}: {err}");
+                    }
+                }
+                Err(err) => {
+                    return Err(format!(
+                        "failed to forward port {} via gvproxy: {err}",
+                        mapping.host_port
+                    ));
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Sends one `POST /services/forwarder/expose` request with `body` over the
+/// gvproxy API socket and succeeds on an HTTP 200 or 204 status.
+fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> {
+    let mut stream =
+        UnixStream::connect(api_sock).map_err(|e| format!("connect to gvproxy API socket: {e}"))?;
+
+    let request = format!(
+        "POST /services/forwarder/expose HTTP/1.1\r\n\
+         Host: localhost\r\n\
+         Content-Type: application/json\r\n\
+         Content-Length: {}\r\n\
+         Connection: close\r\n\
+         \r\n\
+         {}",
+        body.len(),
+        body,
+    );
+
+    stream
+        .write_all(request.as_bytes())
+        .map_err(|e| format!("write to gvproxy API: {e}"))?;
+
+    // Only the status line is inspected, so a single bounded read is enough.
+    let mut buf = [0u8; 1024];
+    let n = stream
+        .read(&mut buf)
+        .map_err(|e| format!("read from gvproxy API: {e}"))?;
+    let response = String::from_utf8_lossy(&buf[..n]);
+    let status = response
+        .lines()
+        .next()
+        .and_then(|line| line.split_whitespace().nth(1))
+        .unwrap_or("0");
+
+    match status {
+        "200" | "204" => Ok(()),
+        _ => Err(format!(
+            "gvproxy API: {}",
+            response.lines().next().unwrap_or("")
+        )),
+    }
+}
+
+/// Splits `port_map` into the port handed to gvproxy's `-ssh-port` flag and
+/// the mappings that must be exposed through the forwarder API.
+///
+/// The first mapping whose guest port is `GUEST_SSH_PORT` and whose host port
+/// is at least 1024 supplies the ssh port and is not forwarded separately. If
+/// there is none, a free localhost port is picked instead.
+fn plan_gvproxy_ports(port_map: &[String]) -> Result<GvproxyPortPlan, String> {
+    let mut ssh_port = None;
+    let mut forwarded_ports = Vec::with_capacity(port_map.len());
+
+    for pm in port_map {
+        let mapping = parse_port_mapping(pm)?;
+        if ssh_port.is_none() && mapping.guest_port == GUEST_SSH_PORT && mapping.host_port >= 1024 {
+            ssh_port = Some(mapping.host_port);
+            continue;
+        }
+        forwarded_ports.push(pm.clone());
+    }
+
+    Ok(GvproxyPortPlan {
+        ssh_port: match ssh_port {
+            Some(port) => port,
+            None => pick_gvproxy_ssh_port()?,
+        },
+        forwarded_ports,
+    })
+}
+
+/// Parses `host:guest` (or a single port used for both sides) into a [`PortMapping`].
+fn parse_port_mapping(pm: &str) -> Result<PortMapping, String> {
+    let parts: Vec<&str> = pm.split(':').collect();
+    let (host, guest) = match parts.as_slice() {
+        [host, guest] => (*host, *guest),
+        [port] => (*port, *port),
+        _ => return Err(format!("invalid port mapping '{pm}'")),
+    };
+
+    let host_port = host
+        .parse::<u16>()
+        .map_err(|_| format!("invalid port mapping '{pm}'"))?;
+    let guest_port = guest
+        .parse::<u16>()
+        .map_err(|_| format!("invalid port mapping '{pm}'"))?;
+
+    Ok(PortMapping {
+        host_port,
+        guest_port,
+    })
+}
+
+/// Polls until `path` exists, backing off from 5 ms up to 200 ms between
+/// checks, and fails with a message naming `label` once `timeout` has elapsed.
+fn wait_for_path(path: &Path, timeout: Duration, label: &str) -> Result<(), String> {
+    let deadline = Instant::now() + timeout;
+    let mut interval = Duration::from_millis(5);
+    while !path.exists() {
+        if Instant::now() >= deadline {
+            return Err(format!(
+                "{label} did not appear within {:.1}s: {}",
+                timeout.as_secs_f64(),
+                path.display()
+            ));
+        }
+        std::thread::sleep(interval);
+        interval = (interval * 2).min(Duration::from_millis(200));
+    }
+    Ok(())
+}
+
+/// Returns a 12-hex-digit id for `path`: the low 48 bits of a 64-bit FNV-1a
+/// hash over the (lossily converted) path string.
+fn hash_path_id(path: &Path) -> String {
+    let mut hash: u64 = 0xcbf29ce484222325;
+    for byte in path.to_string_lossy().as_bytes() {
+        hash ^= u64::from(*byte);
+        hash = hash.wrapping_mul(0x100000001b3);
+    }
+    format!("{:012x}", hash & 0x0000_ffff_ffff_ffff)
+}
+
+/// Returns `<base>/<subdir>`, where `<base>` is `XDG_RUNTIME_DIR` when set,
+/// else `/tmp`, else the platform temp directory.
+///
+/// An existing path must be a real directory (not a symlink) owned by the
+/// current user. A missing one is created and restricted to mode `0o700`.
+fn secure_socket_base(subdir: &str) -> Result<PathBuf, String> {
+    let base = if let Some(xdg) = std::env::var_os("XDG_RUNTIME_DIR") {
+        PathBuf::from(xdg)
+    } else {
+        let mut base = PathBuf::from("/tmp");
+        if !base.is_dir() {
+            base = std::env::temp_dir();
+        }
+        base
+    };
+    let dir = base.join(subdir);
+
+    if dir.exists() {
+        let meta = dir
+            .symlink_metadata()
+            .map_err(|e| format!("lstat {}: {e}", dir.display()))?;
+        if meta.file_type().is_symlink() {
+            return Err(format!(
+                "socket directory {} is a symlink; refusing to use it",
+                dir.display()
+            ));
+        }
+        if !meta.is_dir() {
+            return Err(format!(
+                "socket directory {} exists but is not a directory",
+                dir.display()
+            ));
+        }
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::MetadataExt as _;
+            let uid = unsafe { libc::getuid() };
+            if meta.uid() != uid {
+                return Err(format!(
+                    "socket directory {} is owned by uid {} but we are uid {}",
+                    dir.display(),
+                    meta.uid(),
+                    uid
+                ));
+            }
+        }
+    } else {
+        std::fs::create_dir_all(&dir)
+            .map_err(|e| format!("create socket dir {}: {e}", dir.display()))?;
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt as _;
+            // A directory that cannot be restricted must not be reported as secure.
+            std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700))
+                .map_err(|e| format!("chmod socket dir {}: {e}", dir.display()))?;
+        }
+    }
+
+    Ok(dir)
+}
+
+/// Socket path stem for the gvproxy instance serving `rootfs`.
+fn gvproxy_socket_base(rootfs: &Path) -> Result<PathBuf, String> {
+    Ok(secure_socket_base("osd-gv")?.join(hash_path_id(rootfs)))
+}
+
+/// Picks a currently free localhost TCP port by binding port 0 and releasing it.
+/// The port is not reserved after this function returns.
+fn pick_gvproxy_ssh_port() -> Result<u16, String> {
+    let listener = std::net::TcpListener::bind(("127.0.0.1", 0))
+        .map_err(|e| format!("allocate gvproxy ssh port on localhost: {e}"))?;
+    let port = listener
+        .local_addr()
+        .map_err(|e| format!("read gvproxy ssh port: {e}"))?
+        .port();
+    drop(listener);
+    Ok(port)
+}
+
+/// Signals gvproxy processes found on the host port of each mapping; entries
+/// whose host port does not parse are skipped.
+fn kill_stale_gvproxy_by_port_map(port_map: &[String]) {
+    for pm in port_map {
+        if let Some(host_port) = pm
+            .split(':')
+            .next()
+            .and_then(|port| port.parse::<u16>().ok())
+        {
+            kill_stale_gvproxy_by_port(host_port);
+        }
+    }
+}
+
+/// Uses `lsof -ti :<port>` to list pids on `port` and signals those named gvproxy.
+/// Does nothing when `lsof` is unavailable or exits unsuccessfully.
+fn kill_stale_gvproxy_by_port(port: u16) {
+    let output = StdCommand::new("lsof")
+        .args(["-ti", &format!(":{port}")])
+        .output();
+
+    let pids = match output {
+        Ok(output) if output.status.success() => {
+            String::from_utf8_lossy(&output.stdout).to_string()
+        }
+        _ => return,
+    };
+
+    for line in pids.lines() {
+        if let Ok(pid) = line.trim().parse::<u32>()
+            && is_process_named(pid as libc::pid_t, "gvproxy")
+        {
+            kill_gvproxy_pid(pid);
+        }
+    }
+}
+
+/// Sends SIGTERM to `pid` if it is signalable and still named gvproxy, then
+/// sleeps for 200 ms.
+fn kill_gvproxy_pid(pid: u32) {
+    let pid = pid as libc::pid_t;
+    // Signal 0 only probes whether the process can be signalled.
+    if unsafe { libc::kill(pid, 0) } != 0 {
+        return;
+    }
+    if !is_process_named(pid, "gvproxy") {
+        return;
+    }
+    unsafe {
+        libc::kill(pid, libc::SIGTERM);
+    }
+    std::thread::sleep(Duration::from_millis(200));
+}
+
+/// True when the `comm` reported by `ps` for `pid` contains `expected`.
+#[cfg(target_os = "macos")]
+fn is_process_named(pid: libc::pid_t, expected: &str) -> bool {
+    StdCommand::new("ps")
+        .args(["-p", &pid.to_string(), "-o", "comm="])
+        .output()
+        .ok()
+        .and_then(|output| {
+            if output.status.success() {
+                String::from_utf8(output.stdout).ok()
+            } else {
+                None
+            }
+        })
+        .is_some_and(|name| name.trim().contains(expected))
+}
+
+/// True when `/proc/<pid>/comm` contains `expected`.
+#[cfg(target_os = "linux")]
+fn is_process_named(pid: libc::pid_t, expected: &str) -> bool {
+    std::fs::read_to_string(format!("/proc/{pid}/comm"))
+        .map(|name| name.trim().contains(expected))
+        .unwrap_or(false)
+}
+
+/// Process names cannot be checked on other targets; always false.
+#[cfg(not(any(target_os = "macos", target_os = "linux")))]
+fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool {
+    false
+}
+
+/// Records `pid` as the VM process and installs SIGINT/SIGTERM handlers that
+/// forward a SIGTERM to it.
+fn install_signal_forwarding(pid: i32) {
+    // Publish the pid first so a signal that arrives right after the handlers
+    // are installed is forwarded instead of being dropped.
+    CHILD_PID.store(pid, Ordering::Relaxed);
+    unsafe {
+        libc::signal(
+            libc::SIGINT,
+            forward_signal as *const () as libc::sighandler_t,
+        );
+        libc::signal(
+            libc::SIGTERM,
+            forward_signal as *const () as libc::sighandler_t,
+        );
+    }
+}
+
+/// Signal handler: forwards SIGTERM to the recorded VM pid, if any.
+extern "C" fn forward_signal(_sig: libc::c_int) {
+    let pid = CHILD_PID.load(Ordering::Relaxed);
+    if pid > 0 {
+        unsafe {
+            libc::kill(pid, libc::SIGTERM);
+        }
+    }
+}
+
+/// Blocks until `pid` changes state and returns the raw wait status.
+///
+/// The call is retried when it is interrupted by a signal, so forwarding a
+/// signal to the VM does not abort the wait.
+fn wait_for_child(pid: i32) -> Result<libc::c_int, String> {
+    let mut status: libc::c_int = 0;
+    loop {
+        let rc = unsafe { libc::waitpid(pid, &raw mut status, 0) };
+        if rc >= 0 {
+            return Ok(status);
+        }
+        let err = std::io::Error::last_os_error();
+        if err.kind() == std::io::ErrorKind::Interrupted {
+            continue;
+        }
+        return Err(format!("waitpid({pid}) failed: {err}"));
+    }
+}
+
+/// Kills and reaps gvproxy if a guard holding a live child was passed.
+fn cleanup_gvproxy(mut guard: Option<GvproxyGuard>) {
+    if let Some(mut guard) = guard.take()
+        && let Some(mut child) = guard.disarm()
+    {
+        let _ = child.kill();
+        let _ = child.wait();
+    }
+}
+
+/// Converts a negative libkrun return code into an error naming `func`.
+fn check(ret: i32, func: &'static str) -> Result<(), String> {
+    if ret < 0 {
+        Err(format!("{func} failed with error code {ret}"))
+    } else {
+        Ok(())
+    }
+}
+
+/// Builds a NULL-terminated array of C string pointers from `strings`.
+///
+/// The returned `Vec<CString>` owns the storage the pointers refer to and must
+/// outlive every use of the pointer vector.
+fn c_string_array(strings: &[&str]) -> Result<(Vec<CString>, Vec<*const libc::c_char>), String> {
+    let owned: Vec<CString> = strings
+        .iter()
+        .map(|s| CString::new(*s))
+        .collect::<Result<Vec<CString>, _>>()
+        .map_err(|e| format!("invalid string array entry: {e}"))?;
+    let mut ptrs: Vec<*const libc::c_char> = owned.iter().map(|c| c.as_ptr()).collect();
+    ptrs.push(ptr::null());
+    Ok((owned, ptrs))
+}
+
+/// Converts a UTF-8 path without interior NUL bytes into a `CString`.
+fn path_to_cstring(path: &Path) -> Result<CString, String> {
+    let path = path
+        .to_str()
+        .ok_or_else(|| format!("path is not valid UTF-8: {}", path.display()))?;
+    CString::new(path).map_err(|e| format!("invalid path string {}: {e}", path))
+}
+
+/// Verifies that `/dev/kvm` can be opened for reading.
+#[cfg(target_os = "linux")]
+fn check_kvm_access() -> Result<(), String> {
+    std::fs::OpenOptions::new()
+        .read(true)
+        .open("/dev/kvm")
+        .map(|_| ())
+        .map_err(|e| {
+            format!("cannot open /dev/kvm: {e}\nKVM access is required to run microVMs on Linux.")
+        })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn plan_gvproxy_ports_reuses_sandbox_ssh_mapping() {
+        let plan = plan_gvproxy_ports(&["64739:2222".to_string()]).expect("plan should succeed");
+
+        assert_eq!(plan.ssh_port, 64739);
+        assert!(plan.forwarded_ports.is_empty());
+    }
+
+    #[test]
+    fn plan_gvproxy_ports_keeps_non_ssh_mappings_for_forwarder() {
+        let plan = plan_gvproxy_ports(&["64739:8080".to_string()]).expect("plan should succeed");
+
+        assert_ne!(plan.ssh_port, 64739);
+        assert_eq!(plan.forwarded_ports, vec!["64739:8080".to_string()]);
+    }
+
+    #[test]
+    fn plan_gvproxy_ports_ignores_privileged_host_ports_for_direct_ssh() {
+        let plan = plan_gvproxy_ports(&["22:2222".to_string()]).expect("plan should succeed");
+
+        assert_ne!(plan.ssh_port, 22);
+        assert_eq!(plan.forwarded_ports, vec!["22:2222".to_string()]);
+    }
+
+    #[test]
+    fn parse_port_mapping_rejects_invalid_entries() {
+        let err = parse_port_mapping("bad:mapping").expect_err("invalid mapping should fail");
+        assert!(err.contains("invalid port mapping"));
+    }
+}
diff --git a/crates/openshell-driver-vm/start.sh b/crates/openshell-driver-vm/start.sh
new file mode 100755
index 000000000..aa9887460
--- /dev/null
+++ b/crates/openshell-driver-vm/start.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Builds the gateway and the VM compute driver, then starts the gateway with
+# development defaults for the VM driver.
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed"
+STATE_DIR_DEFAULT="${ROOT}/target/openshell-vm-driver-dev"
+STATE_DIR="${OPENSHELL_VM_DRIVER_STATE_DIR:-${STATE_DIR_DEFAULT}}"
+DB_PATH_DEFAULT="${STATE_DIR}/openshell.db"
+SERVER_PORT="${OPENSHELL_SERVER_PORT:-8080}"
+VM_HOST_GATEWAY_DEFAULT="${OPENSHELL_VM_HOST_GATEWAY:-host.containers.internal}"
+
+export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${COMPRESSED_DIR}}"
+
+mkdir -p "${STATE_DIR}"
+
+# Prints "true" or "false" for the accepted spellings; exits 1 on anything else.
+normalize_bool() {
+  local value
+  # `${1,,}` needs bash >= 4, but macOS ships bash 3.2; lowercase with tr instead.
+  value="$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')"
+  case "${value}" in
+    1|true|yes|on) echo "true" ;;
+    0|false|no|off) echo "false" ;;
+    *)
+      echo "invalid boolean value '$1' (expected true/false, 1/0, yes/no, on/off)" >&2
+      exit 1
+      ;;
+  esac
+}
+
+if [ ! -f "${COMPRESSED_DIR}/rootfs.tar.zst" ]; then
+  echo "==> Building base VM rootfs tarball"
+  mise run vm:rootfs -- --base
+fi
+
+if [ ! -f "${COMPRESSED_DIR}/rootfs.tar.zst" ] || ! find "${COMPRESSED_DIR}" -maxdepth 1 -name 'libkrun*.zst' | grep -q .; then
+  echo "==> Preparing embedded VM runtime"
+  mise run vm:setup
+fi
+
+echo "==> Building gateway and VM compute driver"
+cargo build -p openshell-server -p openshell-driver-vm
+
+if [ "$(uname -s)" = "Darwin" ]; then
+  echo "==> Codesigning VM compute driver"
+  codesign \
+    --entitlements "${ROOT}/crates/openshell-driver-vm/entitlements.plist" \
+    --force \
+    -s - \
+    "${ROOT}/target/debug/openshell-driver-vm"
+fi
+
+# Assign before exporting: `export VAR="$(cmd)"` hides the exit status of cmd,
+# so an invalid value would otherwise be replaced by an empty string silently.
+OPENSHELL_DISABLE_TLS="$(normalize_bool "${OPENSHELL_DISABLE_TLS:-true}")"
+export OPENSHELL_DISABLE_TLS
+export OPENSHELL_DB_URL="${OPENSHELL_DB_URL:-sqlite:${DB_PATH_DEFAULT}}"
+export OPENSHELL_DRIVERS="${OPENSHELL_DRIVERS:-vm}"
+export OPENSHELL_GRPC_ENDPOINT="${OPENSHELL_GRPC_ENDPOINT:-http://${VM_HOST_GATEWAY_DEFAULT}:${SERVER_PORT}}"
+export OPENSHELL_SSH_GATEWAY_HOST="${OPENSHELL_SSH_GATEWAY_HOST:-127.0.0.1}"
+export OPENSHELL_SSH_GATEWAY_PORT="${OPENSHELL_SSH_GATEWAY_PORT:-${SERVER_PORT}}"
+export OPENSHELL_SSH_HANDSHAKE_SECRET="${OPENSHELL_SSH_HANDSHAKE_SECRET:-dev-vm-driver-secret}"
+export OPENSHELL_VM_DRIVER_STATE_DIR="${STATE_DIR}"
+export OPENSHELL_VM_COMPUTE_DRIVER_BIN="${OPENSHELL_VM_COMPUTE_DRIVER_BIN:-${ROOT}/target/debug/openshell-driver-vm}"
+
+echo "==> Starting OpenShell server with VM compute driver"
+exec "${ROOT}/target/debug/openshell-gateway"
diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml
index 33e354247..29f99d009 100644
--- a/crates/openshell-server/Cargo.toml
+++ b/crates/openshell-server/Cargo.toml
@@ -65,6 +65,7 @@ tokio-stream = { workspace = true }
 sqlx = { workspace = true }
 reqwest = { workspace = true }
 uuid = { workspace = true }
+url = { workspace = true }
 hmac = "0.12"
 sha2 = "0.10"
 hex = "0.4"
diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs
index 9509fe84b..ba9425036 100644
--- a/crates/openshell-server/src/cli.rs
+++ b/crates/openshell-server/src/cli.rs
@@ -11,6 +11,7 @@ use std::path::PathBuf;
 use tracing::info;
 use tracing_subscriber::EnvFilter;
 
+use crate::compute::VmComputeConfig;
 use crate::{run_server, tracing_bus::TracingLogBus};
 
 /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing.
@@ -112,6 +113,54 @@ struct Args {
     #[arg(long, env = "OPENSHELL_HOST_GATEWAY_IP")]
     host_gateway_ip: Option,
 
+    /// Working directory for VM driver sandbox state.
+    #[arg(
+        long,
+        env = "OPENSHELL_VM_DRIVER_STATE_DIR",
+        default_value_os_t = VmComputeConfig::default_state_dir()
+    )]
+    vm_driver_state_dir: PathBuf,
+
+    /// VM compute-driver binary spawned by the gateway.
+    #[arg(long, env = "OPENSHELL_VM_COMPUTE_DRIVER_BIN")]
+    vm_compute_driver_bin: Option,
+
+    /// libkrun log level used by the VM helper.
+    #[arg(
+        long,
+        env = "OPENSHELL_VM_KRUN_LOG_LEVEL",
+        default_value_t = VmComputeConfig::default_krun_log_level()
+    )]
+    vm_krun_log_level: u32,
+
+    /// Default vCPU count for VM sandboxes.
+    #[arg(
+        long,
+        env = "OPENSHELL_VM_DRIVER_VCPUS",
+        default_value_t = VmComputeConfig::default_vcpus()
+    )]
+    vm_vcpus: u8,
+
+    /// Default memory allocation for VM sandboxes, in MiB.
+    #[arg(
+        long,
+        env = "OPENSHELL_VM_DRIVER_MEM_MIB",
+        default_value_t = VmComputeConfig::default_mem_mib()
+    )]
+    vm_mem_mib: u32,
+
+    /// CA certificate installed into VM sandboxes for gateway mTLS.
+    #[arg(long, env = "OPENSHELL_VM_TLS_CA")]
+    vm_tls_ca: Option,
+
+    /// Client certificate installed into VM sandboxes for gateway mTLS.
+    #[arg(long, env = "OPENSHELL_VM_TLS_CERT")]
+    vm_tls_cert: Option,
+
+    /// Client private key installed into VM sandboxes for gateway mTLS.
+    #[arg(long, env = "OPENSHELL_VM_TLS_KEY")]
+    vm_tls_key: Option,
+
     /// Disable TLS entirely — listen on plaintext HTTP.
     /// Use this when the gateway sits behind a reverse proxy or tunnel
     /// (e.g.
Cloudflare Tunnel) that terminates TLS at the edge. @@ -211,6 +260,17 @@ async fn run_from_args(args: Args) -> Result<()> { config = config.with_host_gateway_ip(ip); } + let vm_config = VmComputeConfig { + state_dir: args.vm_driver_state_dir, + compute_driver_bin: args.vm_compute_driver_bin, + krun_log_level: args.vm_krun_log_level, + vcpus: args.vm_vcpus, + mem_mib: args.vm_mem_mib, + guest_tls_ca: args.vm_tls_ca, + guest_tls_cert: args.vm_tls_cert, + guest_tls_key: args.vm_tls_key, + }; + if args.disable_tls { info!("TLS disabled — listening on plaintext HTTP"); } else if args.disable_gateway_auth { @@ -219,7 +279,9 @@ async fn run_from_args(args: Args) -> Result<()> { info!(bind = %config.bind_address, "Starting OpenShell server"); - run_server(config, tracing_log_bus).await.into_diagnostic() + run_server(config, vm_config, tracing_log_bus) + .await + .into_diagnostic() } fn parse_compute_driver(value: &str) -> std::result::Result { diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 846782c65..6b5574016 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -3,6 +3,10 @@ //! Gateway-owned compute orchestration over a pluggable compute backend. 
+pub mod vm; + +pub use vm::VmComputeConfig; + use crate::grpc::policy::{SANDBOX_SETTINGS_OBJECT_TYPE, sandbox_settings_id}; use crate::persistence::{ObjectId, ObjectName, ObjectRecord, ObjectType, Store}; use crate::sandbox_index::SandboxIndex; @@ -14,8 +18,8 @@ use openshell_core::proto::compute::v1::{ DriverResourceRequirements, DriverSandbox, DriverSandboxSpec, DriverSandboxStatus, DriverSandboxTemplate, GetCapabilitiesRequest, GetSandboxRequest, ListSandboxesRequest, ResolveSandboxEndpointRequest, ResolveSandboxEndpointResponse, ValidateSandboxCreateRequest, - WatchSandboxesEvent, WatchSandboxesRequest, compute_driver_server::ComputeDriver, - sandbox_endpoint, watch_sandboxes_event, + WatchSandboxesEvent, WatchSandboxesRequest, compute_driver_client::ComputeDriverClient, + compute_driver_server::ComputeDriver, sandbox_endpoint, watch_sandboxes_event, }; use openshell_core::proto::{ PlatformEvent, Sandbox, SandboxCondition, SandboxPhase, SandboxSpec, SandboxStatus, @@ -31,6 +35,7 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; use tokio::sync::Mutex; +use tonic::transport::Channel; use tonic::{Code, Request, Status}; use tracing::{info, warn}; @@ -54,16 +59,145 @@ pub enum ComputeError { #[error("{0}")] Message(String), } - #[derive(Debug)] pub enum ResolvedEndpoint { Ip(IpAddr, u16), Host(String, u16), } +#[derive(Debug)] +pub(crate) struct ManagedDriverProcess { + child: std::sync::Mutex>, + socket_path: std::path::PathBuf, +} + +impl ManagedDriverProcess { + pub(crate) fn new(child: tokio::process::Child, socket_path: std::path::PathBuf) -> Self { + Self { + child: std::sync::Mutex::new(Some(child)), + socket_path, + } + } +} + +impl Drop for ManagedDriverProcess { + fn drop(&mut self) { + if let Ok(mut child) = self.child.lock() { + let _ = child.take(); + } + let _ = std::fs::remove_file(&self.socket_path); + } +} + +#[derive(Debug, Clone)] +struct RemoteComputeDriver { + channel: Channel, +} + +impl RemoteComputeDriver { + fn 
new(channel: Channel) -> Self { + Self { channel } + } + + fn client(&self) -> ComputeDriverClient { + ComputeDriverClient::new(self.channel.clone()) + } +} + +#[tonic::async_trait] +impl ComputeDriver for RemoteComputeDriver { + type WatchSandboxesStream = DriverWatchStream; + + async fn get_capabilities( + &self, + request: Request, + ) -> Result, Status> + { + let mut client = self.client(); + client.get_capabilities(request).await + } + + async fn validate_sandbox_create( + &self, + request: Request, + ) -> Result< + tonic::Response, + Status, + > { + let mut client = self.client(); + client.validate_sandbox_create(request).await + } + + async fn get_sandbox( + &self, + request: Request, + ) -> Result, Status> + { + let mut client = self.client(); + client.get_sandbox(request).await + } + + async fn list_sandboxes( + &self, + request: Request, + ) -> Result, Status> + { + let mut client = self.client(); + client.list_sandboxes(request).await + } + + async fn create_sandbox( + &self, + request: Request, + ) -> Result, Status> + { + let mut client = self.client(); + client.create_sandbox(request).await + } + + async fn stop_sandbox( + &self, + request: Request, + ) -> Result, Status> + { + let mut client = self.client(); + client.stop_sandbox(request).await + } + + async fn delete_sandbox( + &self, + request: Request, + ) -> Result, Status> + { + let mut client = self.client(); + client.delete_sandbox(request).await + } + + async fn resolve_sandbox_endpoint( + &self, + request: Request, + ) -> Result, Status> { + let mut client = self.client(); + client.resolve_sandbox_endpoint(request).await + } + + async fn watch_sandboxes( + &self, + request: Request, + ) -> Result, Status> { + let mut client = self.client(); + let response = client.watch_sandboxes(request).await?; + let stream = response + .into_inner() + .map(|item| item.map_err(|status| status)); + Ok(tonic::Response::new(Box::pin(stream))) + } +} + #[derive(Clone)] pub struct ComputeRuntime { driver: 
SharedComputeDriver, + _driver_process: Option>, default_image: String, store: Arc, sandbox_index: SandboxIndex, @@ -79,17 +213,14 @@ impl fmt::Debug for ComputeRuntime { } impl ComputeRuntime { - pub async fn new_kubernetes( - config: KubernetesComputeConfig, + async fn from_driver( + driver: SharedComputeDriver, + driver_process: Option>, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, ) -> Result { - let driver = KubernetesComputeDriver::new(config) - .await - .map_err(|err| ComputeError::Message(err.to_string()))?; - let driver: SharedComputeDriver = Arc::new(ComputeDriverService::new(driver)); let default_image = driver .get_capabilities(Request::new(GetCapabilitiesRequest {})) .await @@ -98,6 +229,7 @@ impl ComputeRuntime { .default_image; Ok(Self { driver, + _driver_process: driver_process, default_image, store, sandbox_index, @@ -107,6 +239,48 @@ impl ComputeRuntime { }) } + pub async fn new_kubernetes( + config: KubernetesComputeConfig, + store: Arc, + sandbox_index: SandboxIndex, + sandbox_watch_bus: SandboxWatchBus, + tracing_log_bus: TracingLogBus, + ) -> Result { + let driver = KubernetesComputeDriver::new(config) + .await + .map_err(|err| ComputeError::Message(err.to_string()))?; + let driver: SharedComputeDriver = Arc::new(ComputeDriverService::new(driver)); + Self::from_driver( + driver, + None, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + ) + .await + } + + pub(crate) async fn new_remote_vm( + channel: Channel, + driver_process: Option>, + store: Arc, + sandbox_index: SandboxIndex, + sandbox_watch_bus: SandboxWatchBus, + tracing_log_bus: TracingLogBus, + ) -> Result { + let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); + Self::from_driver( + driver, + driver_process, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + ) + .await + } + #[must_use] pub fn default_image(&self) -> &str { &self.default_image @@ -919,7 +1093,7 
/// Reports whether a failure `reason` should be treated as terminal.
///
/// A reason is transient (non-terminal) when it matches one of the known
/// transient reason names, compared without regard to ASCII letter case.
/// Every other reason, including the empty string, is terminal.
fn is_terminal_failure_reason(reason: &str) -> bool {
    const TRANSIENT_REASONS: [&str; 3] = ["reconcilererror", "dependenciesnotready", "starting"];

    let is_transient = TRANSIENT_REASONS
        .iter()
        .any(|candidate| reason.eq_ignore_ascii_case(candidate));

    !is_transient
}
that will reap the subprocess and clean up the socket on drop. +//! - Helpers to resolve the driver binary, compute the socket path, and +//! validate guest TLS material when the gateway runs an `https://` control +//! plane. +//! +//! The VM-driver fields deliberately live here rather than in +//! [`openshell_core::Config`] so the shared core stays free of driver-specific +//! plumbing. +//! +//! TODO(driver-abstraction): this module still assumes the concrete VM driver +//! (argv shape, guest-TLS flags, libkrun-specific settings). Once we land the +//! generalized compute-driver interface, the CLI-arg plumbing below should +//! be replaced with a driver-agnostic launcher that speaks gRPC to +//! configure the driver — and this file should collapse to the types that +//! are genuinely VM-specific (libkrun log level, vCPU / memory shape) plus a +//! trait implementation registering the VM driver against the generic +//! interface. + +#[cfg(unix)] +use super::ManagedDriverProcess; +#[cfg(unix)] +use hyper_util::rt::TokioIo; +#[cfg(unix)] +use openshell_core::proto::compute::v1::{ + GetCapabilitiesRequest, compute_driver_client::ComputeDriverClient, +}; +use openshell_core::{Config, Error, Result}; +use std::path::PathBuf; +#[cfg(unix)] +use std::{io::ErrorKind, process::Stdio, sync::Arc, time::Duration}; +#[cfg(unix)] +use tokio::net::UnixStream; +#[cfg(unix)] +use tokio::process::Command; +use tonic::transport::Channel; +#[cfg(unix)] +use tonic::transport::Endpoint; +#[cfg(unix)] +use tower::service_fn; + +/// Configuration for launching and talking to the VM compute driver. +#[derive(Debug, Clone)] +pub struct VmComputeConfig { + /// Working directory for VM driver sandbox state. + pub state_dir: PathBuf, + + /// Optional override for the `openshell-driver-vm` binary path. + /// When `None`, the gateway resolves a sibling of its own executable. + pub compute_driver_bin: Option, + + /// libkrun log level used by the VM driver helper. 
+ pub krun_log_level: u32, + + /// Default vCPU count for VM sandboxes. + pub vcpus: u8, + + /// Default memory allocation for VM sandboxes, in MiB. + pub mem_mib: u32, + + /// Host-side CA certificate for the guest's mTLS client bundle. + pub guest_tls_ca: Option, + + /// Host-side client certificate for the guest's mTLS client bundle. + pub guest_tls_cert: Option, + + /// Host-side private key for the guest's mTLS client bundle. + pub guest_tls_key: Option, +} + +impl VmComputeConfig { + /// Default working directory for VM driver state. + #[must_use] + pub fn default_state_dir() -> PathBuf { + PathBuf::from("target/openshell-vm-driver") + } + + /// Default libkrun log level. + #[must_use] + pub const fn default_krun_log_level() -> u32 { + 1 + } + + /// Default vCPU count. + #[must_use] + pub const fn default_vcpus() -> u8 { + 2 + } + + /// Default memory allocation, in MiB. + #[must_use] + pub const fn default_mem_mib() -> u32 { + 2048 + } +} + +impl Default for VmComputeConfig { + fn default() -> Self { + Self { + state_dir: Self::default_state_dir(), + compute_driver_bin: None, + krun_log_level: Self::default_krun_log_level(), + vcpus: Self::default_vcpus(), + mem_mib: Self::default_mem_mib(), + guest_tls_ca: None, + guest_tls_cert: None, + guest_tls_key: None, + } + } +} + +#[cfg(unix)] +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct VmGuestTlsPaths { + pub(crate) ca: PathBuf, + pub(crate) cert: PathBuf, + pub(crate) key: PathBuf, +} + +/// Resolve the `openshell-driver-vm` binary path, falling back to a sibling +/// of the gateway's own executable when an override is not supplied. 
+pub(crate) fn resolve_compute_driver_bin(vm_config: &VmComputeConfig) -> Result { + let path = if let Some(path) = vm_config.compute_driver_bin.clone() { + path + } else { + let current_exe = std::env::current_exe() + .map_err(|e| Error::config(format!("failed to resolve current executable: {e}")))?; + let Some(parent) = current_exe.parent() else { + return Err(Error::config(format!( + "current executable '{}' has no parent directory", + current_exe.display() + ))); + }; + parent.join("openshell-driver-vm") + }; + + if !path.is_file() { + return Err(Error::config(format!( + "vm compute driver binary '{}' does not exist; set --vm-compute-driver-bin or OPENSHELL_VM_COMPUTE_DRIVER_BIN", + path.display() + ))); + } + + Ok(path) +} + +/// Path of the Unix domain socket the driver will listen on. +pub(crate) fn compute_driver_socket_path(vm_config: &VmComputeConfig) -> PathBuf { + vm_config.state_dir.join("compute-driver.sock") +} + +#[cfg(unix)] +pub(crate) fn compute_driver_guest_tls_paths( + config: &Config, + vm_config: &VmComputeConfig, +) -> Result> { + if !config.grpc_endpoint.starts_with("https://") { + return Ok(None); + } + + let provided = [ + vm_config.guest_tls_ca.as_ref(), + vm_config.guest_tls_cert.as_ref(), + vm_config.guest_tls_key.as_ref(), + ]; + if provided.iter().all(Option::is_none) { + return Err(Error::config( + "vm compute driver requires --vm-tls-ca, --vm-tls-cert, and --vm-tls-key when OPENSHELL_GRPC_ENDPOINT uses https://", + )); + } + + let Some(ca) = vm_config.guest_tls_ca.clone() else { + return Err(Error::config( + "--vm-tls-ca is required when VM guest TLS materials are configured", + )); + }; + let Some(cert) = vm_config.guest_tls_cert.clone() else { + return Err(Error::config( + "--vm-tls-cert is required when VM guest TLS materials are configured", + )); + }; + let Some(key) = vm_config.guest_tls_key.clone() else { + return Err(Error::config( + "--vm-tls-key is required when VM guest TLS materials are configured", + )); + }; + + for 
path in [&ca, &cert, &key] { + if !path.is_file() { + return Err(Error::config(format!( + "vm guest TLS material '{}' does not exist or is not a file", + path.display() + ))); + } + } + + Ok(Some(VmGuestTlsPaths { ca, cert, key })) +} + +/// Launch the VM compute-driver subprocess, wait for its UDS to come up, +/// and return a gRPC `Channel` connected to it plus a process handle that +/// kills the subprocess and removes the socket on drop. +#[cfg(unix)] +pub(crate) async fn spawn( + config: &Config, + vm_config: &VmComputeConfig, +) -> Result<(Channel, Arc)> { + if config.grpc_endpoint.trim().is_empty() { + return Err(Error::config( + "grpc_endpoint is required when using the vm compute driver", + )); + } + + let driver_bin = resolve_compute_driver_bin(vm_config)?; + let socket_path = compute_driver_socket_path(vm_config); + let guest_tls_paths = compute_driver_guest_tls_paths(config, vm_config)?; + if let Some(parent) = socket_path.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + Error::execution(format!( + "failed to create vm compute driver socket dir '{}': {e}", + parent.display() + )) + })?; + } + match std::fs::remove_file(&socket_path) { + Ok(()) => {} + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => { + return Err(Error::execution(format!( + "failed to remove stale vm compute driver socket '{}': {err}", + socket_path.display() + ))); + } + } + + let mut command = Command::new(&driver_bin); + command.kill_on_drop(true); + command.stdin(Stdio::null()); + command.stdout(Stdio::inherit()); + command.stderr(Stdio::inherit()); + command.arg("--bind-socket").arg(&socket_path); + command.arg("--log-level").arg(&config.log_level); + command + .arg("--openshell-endpoint") + .arg(&config.grpc_endpoint); + command.arg("--state-dir").arg(&vm_config.state_dir); + command + .arg("--ssh-handshake-secret") + .arg(&config.ssh_handshake_secret); + command + .arg("--ssh-handshake-skew-secs") + .arg(config.ssh_handshake_skew_secs.to_string()); + 
command + .arg("--krun-log-level") + .arg(vm_config.krun_log_level.to_string()); + command.arg("--vcpus").arg(vm_config.vcpus.to_string()); + command.arg("--mem-mib").arg(vm_config.mem_mib.to_string()); + if let Some(tls) = guest_tls_paths { + command.arg("--guest-tls-ca").arg(tls.ca); + command.arg("--guest-tls-cert").arg(tls.cert); + command.arg("--guest-tls-key").arg(tls.key); + } + + let mut child = command.spawn().map_err(|e| { + Error::execution(format!( + "failed to launch vm compute driver '{}': {e}", + driver_bin.display() + )) + })?; + let channel = wait_for_compute_driver(&socket_path, &mut child).await?; + let process = Arc::new(ManagedDriverProcess::new(child, socket_path)); + Ok((channel, process)) +} + +#[cfg(not(unix))] +pub(crate) async fn spawn( + _config: &Config, + _vm_config: &VmComputeConfig, +) -> Result<(Channel, std::sync::Arc)> { + Err(Error::config( + "the vm compute driver requires unix domain socket support", + )) +} + +#[cfg(unix)] +async fn wait_for_compute_driver( + socket_path: &std::path::Path, + child: &mut tokio::process::Child, +) -> Result { + let mut last_error: Option = None; + for _ in 0..100 { + if let Some(status) = child.try_wait().map_err(|e| { + Error::execution(format!("failed to poll vm compute driver process: {e}")) + })? 
{ + return Err(Error::execution(format!( + "vm compute driver exited before becoming ready with status {status}" + ))); + } + + match connect_compute_driver(socket_path).await { + Ok(channel) => { + let mut client = ComputeDriverClient::new(channel.clone()); + match client + .get_capabilities(tonic::Request::new(GetCapabilitiesRequest {})) + .await + { + Ok(_) => return Ok(channel), + Err(status) => last_error = Some(status.to_string()), + } + } + Err(err) => last_error = Some(err.to_string()), + } + + tokio::time::sleep(Duration::from_millis(100)).await; + } + + Err(Error::execution(format!( + "timed out waiting for vm compute driver socket '{}': {}", + socket_path.display(), + last_error.unwrap_or_else(|| "unknown error".to_string()) + ))) +} + +#[cfg(unix)] +async fn connect_compute_driver(socket_path: &std::path::Path) -> Result { + let socket_path = socket_path.to_path_buf(); + let display_path = socket_path.clone(); + Endpoint::from_static("http://[::]:50051") + .connect_with_connector(service_fn(move |_: tonic::transport::Uri| { + let socket_path = socket_path.clone(); + async move { UnixStream::connect(socket_path).await.map(TokioIo::new) } + })) + .await + .map_err(|e| { + Error::execution(format!( + "failed to connect to vm compute driver socket '{}': {e}", + display_path.display() + )) + }) +} + +#[cfg(all(test, unix))] +mod tests { + use super::{VmComputeConfig, compute_driver_guest_tls_paths}; + use openshell_core::{Config, TlsConfig}; + use tempfile::tempdir; + + #[test] + fn vm_compute_driver_tls_requires_explicit_guest_bundle() { + let dir = tempdir().unwrap(); + let server_cert = dir.path().join("server.crt"); + let server_key = dir.path().join("server.key"); + let server_ca = dir.path().join("client-ca.crt"); + std::fs::write(&server_cert, "server-cert").unwrap(); + std::fs::write(&server_key, "server-key").unwrap(); + std::fs::write(&server_ca, "client-ca").unwrap(); + + let config = Config::new(Some(TlsConfig { + cert_path: server_cert, + 
key_path: server_key, + client_ca_path: server_ca, + allow_unauthenticated: false, + })) + .with_grpc_endpoint("https://gateway.internal:8443"); + + let err = compute_driver_guest_tls_paths(&config, &VmComputeConfig::default()) + .expect_err("https vm endpoints should require an explicit guest client bundle"); + assert!( + err.to_string() + .contains("--vm-tls-ca, --vm-tls-cert, and --vm-tls-key") + ); + } + + #[test] + fn vm_compute_driver_tls_uses_guest_bundle_not_gateway_server_identity() { + let dir = tempdir().unwrap(); + let server_cert = dir.path().join("server.crt"); + let server_key = dir.path().join("server.key"); + let server_ca = dir.path().join("client-ca.crt"); + let guest_ca = dir.path().join("guest-ca.crt"); + let guest_cert = dir.path().join("guest.crt"); + let guest_key = dir.path().join("guest.key"); + for path in [ + &server_cert, + &server_key, + &server_ca, + &guest_ca, + &guest_cert, + &guest_key, + ] { + std::fs::write(path, path.display().to_string()).unwrap(); + } + + let config = Config::new(Some(TlsConfig { + cert_path: server_cert.clone(), + key_path: server_key.clone(), + client_ca_path: server_ca, + allow_unauthenticated: false, + })) + .with_grpc_endpoint("https://gateway.internal:8443"); + let vm_config = VmComputeConfig { + guest_tls_ca: Some(guest_ca.clone()), + guest_tls_cert: Some(guest_cert.clone()), + guest_tls_key: Some(guest_key.clone()), + ..Default::default() + }; + + let guest_paths = compute_driver_guest_tls_paths(&config, &vm_config) + .unwrap() + .expect("https vm endpoints should pass an explicit guest client bundle"); + assert_eq!(guest_paths.ca, guest_ca); + assert_eq!(guest_paths.cert, guest_cert); + assert_eq!(guest_paths.key, guest_key); + assert_ne!(guest_paths.cert, server_cert); + assert_ne!(guest_paths.key, server_key); + } +} diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 8e5930826..0a729c099 100644 --- 
/// Reports whether `host` explicitly names the local machine.
///
/// Returns `true` when `host` is an IP literal in a loopback range
/// (for example `127.0.0.1` or `::1`) or the name `localhost` in any ASCII
/// letter case. Any other host name or address yields `false`; no DNS
/// resolution is performed here.
fn is_explicit_loopback_exec_target(host: &str) -> bool {
    match host.parse::<std::net::IpAddr>() {
        // An IP literal is judged purely by its address range.
        Ok(addr) => addr.is_loopback(),
        // Not an IP literal: only the well-known loopback name qualifies.
        Err(_) => host.eq_ignore_ascii_case("localhost"),
    }
}
TODO(driver-abstraction): `build_compute_runtime` still switches on +//! [`ComputeDriverKind`] and calls driver-specific constructors +//! ([`ComputeRuntime::new_kubernetes`], [`compute::vm::spawn`] + +//! [`ComputeRuntime::new_remote_vm`]). Once we have a generalized compute +//! driver interface, the per-arm wiring here should collapse to a single +//! driver-agnostic path that asks each registered driver to produce a +//! [`Channel`](tonic::transport::Channel) and hands the rest of the gateway a +//! uniform [`ComputeRuntime`]. The remaining VM plumbing now lives in +//! [`compute::vm`]; keep this file driver-agnostic going forward. mod auth; pub mod cli; @@ -28,10 +38,11 @@ use openshell_core::{ComputeDriverKind, Config, Error, Result}; use std::collections::HashMap; use std::io::ErrorKind; use std::sync::{Arc, Mutex}; +use std::time::Duration; use tokio::net::TcpListener; use tracing::{debug, error, info}; -use compute::ComputeRuntime; +use compute::{ComputeRuntime, VmComputeConfig}; pub use grpc::OpenShellService; pub use http::{health_router, http_router}; pub use multiplex::{MultiplexService, MultiplexedService}; @@ -115,7 +126,11 @@ impl ServerState { /// # Errors /// /// Returns an error if the server fails to start or encounters a fatal error. 
-pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Result<()> { +pub async fn run_server( + config: Config, + vm_config: VmComputeConfig, + tracing_log_bus: TracingLogBus, +) -> Result<()> { let database_url = config.database_url.trim(); if database_url.is_empty() { return Err(Error::config("database_url is required")); @@ -132,6 +147,7 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul let sandbox_watch_bus = SandboxWatchBus::new(); let compute = build_compute_runtime( &config, + &vm_config, store.clone(), sandbox_index.clone(), sandbox_watch_bus.clone(), @@ -148,7 +164,7 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul )); state.compute.spawn_watchers(); - ssh_tunnel::spawn_session_reaper(store.clone(), std::time::Duration::from_secs(3600)); + ssh_tunnel::spawn_session_reaper(store.clone(), Duration::from_secs(3600)); // Create the multiplexed service let service = MultiplexService::new(state.clone()); @@ -215,6 +231,7 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul async fn build_compute_runtime( config: &Config, + vm_config: &VmComputeConfig, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, @@ -244,6 +261,19 @@ async fn build_compute_runtime( ) .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))), + ComputeDriverKind::Vm => { + let (channel, driver_process) = compute::vm::spawn(config, vm_config).await?; + ComputeRuntime::new_remote_vm( + channel, + Some(driver_process), + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + ) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) + } ComputeDriverKind::Podman => Err(Error::config( "compute driver 'podman' is not implemented yet", )), @@ -255,7 +285,7 @@ fn configured_compute_driver(config: &Config) -> Result { [] => Err(Error::config( "at least one compute driver must be 
configured", )), - [driver @ ComputeDriverKind::Kubernetes] => Ok(*driver), + [driver @ ComputeDriverKind::Kubernetes] | [driver @ ComputeDriverKind::Vm] => Ok(*driver), [ComputeDriverKind::Podman] => Err(Error::config( "compute driver 'podman' is not implemented yet", )), @@ -332,4 +362,13 @@ mod tests { .contains("compute driver 'podman' is not implemented yet") ); } + + #[test] + fn configured_compute_driver_accepts_vm() { + let config = Config::new(None).with_compute_drivers([ComputeDriverKind::Vm]); + assert_eq!( + configured_compute_driver(&config).unwrap(), + ComputeDriverKind::Vm + ); + } } diff --git a/deploy/docker/Dockerfile.driver-vm-macos b/deploy/docker/Dockerfile.driver-vm-macos new file mode 100644 index 000000000..ac0aec952 --- /dev/null +++ b/deploy/docker/Dockerfile.driver-vm-macos @@ -0,0 +1,118 @@ +# syntax=docker/dockerfile:1.6 + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Cross-compile the openshell-driver-vm binary for macOS aarch64 (Apple +# Silicon) using the osxcross toolchain. +# +# openshell-driver-vm loads libkrun/libkrunfw at runtime via dlopen, so it +# does NOT need Hypervisor.framework headers at build time. Pre-compressed +# runtime artifacts (libkrun, libkrunfw, gvproxy, rootfs) are injected via +# the vm-runtime-compressed build context and embedded into the binary via +# include_bytes!(). +# +# Usage: +# docker buildx build -f deploy/docker/Dockerfile.driver-vm-macos \ +# --build-arg OPENSHELL_CARGO_VERSION=0.6.0 \ +# --build-context vm-runtime-compressed=/path/to/compressed-dir \ +# --output type=local,dest=out/ . 
+ +ARG OSXCROSS_IMAGE=crazymax/osxcross:latest + +FROM ${OSXCROSS_IMAGE} AS osxcross + +FROM python:3.12-slim AS builder + +ARG CARGO_TARGET_CACHE_SCOPE=default + +ENV PATH="/root/.cargo/bin:/usr/local/bin:/osxcross/bin:${PATH}" +ENV LD_LIBRARY_PATH="/osxcross/lib" + +COPY --from=osxcross /osxcross /osxcross + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + clang \ + cmake \ + curl \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 + +RUN rustup target add aarch64-apple-darwin + +WORKDIR /build + +ENV CC_aarch64_apple_darwin=oa64-clang +ENV CXX_aarch64_apple_darwin=oa64-clang++ +ENV AR_aarch64_apple_darwin=aarch64-apple-darwin25.1-ar +ENV CARGO_TARGET_AARCH64_APPLE_DARWIN_LINKER=oa64-clang +ENV CARGO_TARGET_AARCH64_APPLE_DARWIN_AR=aarch64-apple-darwin25.1-ar + +# aws-lc-sys workaround (in case it ends up in the dep tree via feature unification) +RUN ln -sf /osxcross/bin/arm64-apple-darwin25.1-ld /usr/local/bin/arm64-apple-macosx-ld + +# --------------------------------------------------------------------------- +# Stage 1: dependency caching — copy only manifests, create dummy sources, +# build dependencies. This layer is cached unless Cargo.toml/lock changes. +# --------------------------------------------------------------------------- +COPY Cargo.toml Cargo.lock ./ +COPY crates/openshell-driver-vm/Cargo.toml crates/openshell-driver-vm/Cargo.toml +COPY crates/openshell-driver-vm/build.rs crates/openshell-driver-vm/build.rs +COPY crates/openshell-core/Cargo.toml crates/openshell-core/Cargo.toml +COPY crates/openshell-core/build.rs crates/openshell-core/build.rs +COPY proto/ proto/ + +# Scope workspace to the driver + its only internal dep. 
+RUN sed -i 's|members = \["crates/\*"\]|members = ["crates/openshell-driver-vm", "crates/openshell-core"]|' Cargo.toml
+
+# Create placeholder sources (a trivial main.rs plus empty lib.rs files) for the
+# two workspace members; only their manifests and build scripts exist so far.
+RUN mkdir -p crates/openshell-driver-vm/src \
+    crates/openshell-core/src && \
+    echo "fn main() {}" > crates/openshell-driver-vm/src/main.rs && \
+    touch crates/openshell-driver-vm/src/lib.rs && \
+    touch crates/openshell-core/src/lib.rs
+
+# Build deps only (cached layer). The 2>/dev/null || true is a warm-cache
+# technique; real source is copied in stage 2.
+# The cargo registry, git checkouts and /build/target are BuildKit cache mounts
+# with the same ids as the real build step, so whatever this step compiles is
+# available to that step. Errors are discarded and the exit status is forced to
+# success, so this step never fails the image build.
+RUN --mount=type=cache,id=cargo-registry-driver-vm-macos,sharing=locked,target=/root/.cargo/registry \
+    --mount=type=cache,id=cargo-git-driver-vm-macos,sharing=locked,target=/root/.cargo/git \
+    --mount=type=cache,id=cargo-target-driver-vm-macos-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \
+    cargo build --release --target aarch64-apple-darwin -p openshell-driver-vm 2>/dev/null || true
+
+# ---------------------------------------------------------------------------
+# Stage 2: real build with compressed runtime artifacts
+# ---------------------------------------------------------------------------
+# Overwrites the placeholder sources created above with the real crate sources.
+COPY crates/ crates/
+
+# Copy compressed VM runtime artifacts for embedding.
+# These are passed in via --build-context vm-runtime-compressed=...
+COPY --from=vm-runtime-compressed / /build/vm-runtime-compressed/
+
+# Touch source files to ensure they're rebuilt (not the cached dummy).
+RUN touch crates/openshell-driver-vm/src/main.rs \
+    crates/openshell-driver-vm/src/lib.rs \
+    crates/openshell-driver-vm/build.rs \
+    crates/openshell-core/src/lib.rs \
+    crates/openshell-core/build.rs \
+    proto/*.proto
+
+# Declare version ARGs here (not earlier) so the git-hash-bearing values do not
+# invalidate the expensive dependency-build layers above on every commit.
+ARG OPENSHELL_CARGO_VERSION
+ARG OPENSHELL_IMAGE_TAG
+# Real release build of openshell-driver-vm for aarch64-apple-darwin.
+#   1. If OPENSHELL_CARGO_VERSION is non-empty, rewrite the `version = "..."`
+#      entry inside the [workspace.package] section of Cargo.toml to that value.
+#   2. Build with OPENSHELL_VM_RUNTIME_COMPRESSED_DIR pointing at the staged
+#      runtime artifacts and OPENSHELL_IMAGE_TAG defaulting to "dev".
+#   3. Copy the binary to /openshell-driver-vm: /build/target is a cache mount,
+#      so its contents are not part of the resulting image layer.
+RUN --mount=type=cache,id=cargo-registry-driver-vm-macos,sharing=locked,target=/root/.cargo/registry \
+    --mount=type=cache,id=cargo-git-driver-vm-macos,sharing=locked,target=/root/.cargo/git \
+    --mount=type=cache,id=cargo-target-driver-vm-macos-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \
+    if [ -n "${OPENSHELL_CARGO_VERSION:-}" ]; then \
+        sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' Cargo.toml; \
+    fi && \
+    OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=/build/vm-runtime-compressed \
+    OPENSHELL_IMAGE_TAG="${OPENSHELL_IMAGE_TAG:-dev}" \
+    cargo build --release --target aarch64-apple-darwin -p openshell-driver-vm && \
+    cp target/aarch64-apple-darwin/release/openshell-driver-vm /openshell-driver-vm
+
+# Export stage: an otherwise empty image that contains only the built binary.
+FROM scratch AS binary
+COPY --from=builder /openshell-driver-vm /openshell-driver-vm