diff --git a/internal/config/config.go b/internal/config/config.go index 13fa3e96..45c3f8c0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -197,6 +197,15 @@ type Config struct { // Off → /deploy/new rejects source=git with 501; tarball/image unaffected. DeploySourceGitEnabled bool + // DeployScaleToZeroEnabled gates scale-to-zero (idle descheduling, Task #54). + // Default FALSE: the worker idle-scaler patches idle Deployments to + // replicas=0 and the api wake path (POST /deploy/:id/wake) brings them back. + // Off → the wake endpoint returns 501 and nothing in the api scales an app; + // the worker idle-scaler is independently gated by its own + // DEPLOY_SCALE_TO_ZERO_ENABLED env so the two services share the flag name. + // Enabling it is an operator action (see infra runbook) after a canary. + DeployScaleToZeroEnabled bool + // ResourceCountCapsEnabled gates per-service resource-count enforcement // (Task #55). Default FALSE: when off, the count-check block in every // provision handler (db/vector/cache/nosql/storage) is skipped entirely — @@ -512,6 +521,16 @@ func Load() *Config { cfg.DeploySourceGitEnabled = false } + // DEPLOY_SCALE_TO_ZERO_ENABLED: default FALSE (off until operator canary). + // Shared flag name with the worker idle-scaler; the api half gates the wake + // endpoint + any api-initiated scale, the worker half gates the idle sweep. + switch strings.ToLower(strings.TrimSpace(os.Getenv("DEPLOY_SCALE_TO_ZERO_ENABLED"))) { + case "true", "1", "yes": + cfg.DeployScaleToZeroEnabled = true + default: + cfg.DeployScaleToZeroEnabled = false + } + // RESOURCE_COUNT_CAPS_ENABLED: default FALSE (Task #55). Off → the per-service // count-check block in every provision handler is skipped (zero behavior // change). On → over-cap provisions get 402. 
Operator action after a usage diff --git a/internal/config/config_test.go b/internal/config/config_test.go index d8554abb..be2d60b5 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -63,6 +63,7 @@ func allKeys() []string { "METRICS_TOKEN", "DASHBOARD_BASE_URL", "API_PUBLIC_URL", "DELETION_CONFIRMATION_TTL_MINUTES", "FAMILY_BINDINGS_ENABLED", "DEPLOY_SOURCE_IMAGE_ENABLED", "DEPLOY_SOURCE_GIT_ENABLED", + "DEPLOY_SCALE_TO_ZERO_ENABLED", "RESOURCE_COUNT_CAPS_ENABLED", "GITHUB_APP_ENABLED", "GITHUB_APP_ID", "GITHUB_APP_SLUG", "GITHUB_APP_PRIVATE_KEY", "GITHUB_APP_WEBHOOK_SECRET", "GITHUB_APP_CLIENT_ID", "GITHUB_APP_CLIENT_SECRET", @@ -388,6 +389,21 @@ func TestLoad_DeploySourceGitEnabled(t *testing.T) { } } +func TestLoad_DeployScaleToZeroEnabled(t *testing.T) { + for _, val := range []string{"true", "1", "yes", "TRUE", " Yes "} { + applyBaselineEnv(t, map[string]string{"DEPLOY_SCALE_TO_ZERO_ENABLED": val}) + if !Load().DeployScaleToZeroEnabled { + t.Errorf("DEPLOY_SCALE_TO_ZERO_ENABLED=%q should enable", val) + } + } + for _, val := range []string{"false", "0", "no", "maybe", ""} { + applyBaselineEnv(t, map[string]string{"DEPLOY_SCALE_TO_ZERO_ENABLED": val}) + if Load().DeployScaleToZeroEnabled { + t.Errorf("DEPLOY_SCALE_TO_ZERO_ENABLED=%q should stay disabled", val) + } + } +} + func TestLoad_ResourceCountCapsEnabled(t *testing.T) { for _, val := range []string{"true", "1", "yes", "TRUE", " Yes "} { applyBaselineEnv(t, map[string]string{"RESOURCE_COUNT_CAPS_ENABLED": val}) diff --git a/internal/db/migrations/068_deploy_scale_to_zero.sql b/internal/db/migrations/068_deploy_scale_to_zero.sql new file mode 100644 index 00000000..c8794bd9 --- /dev/null +++ b/internal/db/migrations/068_deploy_scale_to_zero.sql @@ -0,0 +1,68 @@ +-- 068_deploy_scale_to_zero.sql — scale-to-zero (idle descheduling) state columns. +-- +-- WHY: a deployed-but-idle app costs a full pod's worth of compute even when it +-- serves zero requests. 
Scale-to-zero (Task #54) lets the worker patch an idle +-- Deployment to replicas=0 (~$0 compute) and wake it back to replicas=1 on +-- demand. This migration adds the per-deployment state the idle-scaler and the +-- wake path read/write. The whole feature is gated behind the +-- DEPLOY_SCALE_TO_ZERO_ENABLED worker env flag (default OFF), so these columns +-- are inert — populated at create-time but acted upon only when an operator +-- enables the flag. +-- +-- Columns: +-- last_activity_at TIMESTAMPTZ — floor "last known activity" marker. Set to +-- now() at create-time, bumped on every wake +-- and on redeploy. The idle-scaler deschedules +-- a Deployment only when +-- now() - last_activity_at > idle_threshold. +-- +-- v1 NOTE: the api is NOT in the request path +-- (apps are served by k8s Ingress straight to +-- the per-app Service), and no nginx-ingress +-- request-total scrape is wired yet, so the +-- honest "activity" signal v1 captures is +-- deploy / redeploy / explicit-wake events — +-- NOT per-HTTP-request traffic. A follow-up +-- (documented in the worker job header) will +-- wire an ingress request-counter to bump this +-- column on real traffic for true +-- traffic-based idle detection. +-- +-- scaled_to_zero BOOLEAN — true while the app is currently descheduled +-- (replicas=0). The wake path reads this to +-- decide whether a scale-up is needed; the +-- dashboard/agent reads it to show "sleeping". +-- The idle-scaler sets it true on scale-down, +-- the wake path sets it false on scale-up. +-- +-- always_on BOOLEAN — per-app opt-out. A pinned app (an operator +-- or Pro+ user who wants zero cold-starts) is +-- never descheduled by the idle-scaler. Default +-- false → eligible for scale-to-zero. +-- +-- Idempotent + forward-only. 
Existing rows get last_activity_at backfilled from +-- updated_at (their most recent known activity) so the idle-scaler does not +-- immediately deschedule every pre-existing deploy the first time the flag is +-- turned on; scaled_to_zero / always_on default to false. + +ALTER TABLE deployments + ADD COLUMN IF NOT EXISTS last_activity_at TIMESTAMPTZ, + ADD COLUMN IF NOT EXISTS scaled_to_zero BOOLEAN NOT NULL DEFAULT false, + ADD COLUMN IF NOT EXISTS always_on BOOLEAN NOT NULL DEFAULT false; + +-- Backfill: seed last_activity_at from updated_at for every pre-existing row so +-- the very first idle-scaler tick after the flag is enabled treats existing +-- deploys as "recently active" rather than immediately idle. New rows set +-- last_activity_at = now() at INSERT time (see CreateDeployment). +UPDATE deployments +SET last_activity_at = COALESCE(updated_at, created_at, now()) +WHERE last_activity_at IS NULL; + +-- Partial index: the idle-scaler scans for healthy, eligible, not-yet-zeroed +-- deployments ordered by activity. Excluding always_on + already-zeroed + +-- terminal rows keeps the index narrow and the scan cheap. +CREATE INDEX IF NOT EXISTS idx_deployments_idle_candidates + ON deployments (last_activity_at) + WHERE status = 'healthy' + AND scaled_to_zero = false + AND always_on = false; diff --git a/internal/handlers/deploy.go b/internal/handlers/deploy.go index 71ac311a..b164a62e 100644 --- a/internal/handlers/deploy.go +++ b/internal/handlers/deploy.go @@ -572,6 +572,11 @@ func deploymentToMapWithDB(d *models.Deployment, db *sql.DB) fiber.Map { // image_ref is echoed (caller-supplied, no secret); registry_creds is // NEVER returned — only registry_creds_set lifecycle metadata. "source": deploymentSourceOrDefault(d.Source), + // Scale-to-zero state (migration 068). scaled_to_zero=true → the app is + // asleep (replicas=0); the dashboard/agent surfaces "sleeping — wake" + // and POSTs /deploy/:id/wake. always_on=true → pinned (never descheduled). 
+ "scaled_to_zero": d.ScaledToZero, + "always_on": d.AlwaysOn, } if d.Source == "image" { m["image_ref"] = d.ImageRef diff --git a/internal/handlers/deploy_buildfailed_autopsy_test.go b/internal/handlers/deploy_buildfailed_autopsy_test.go index f85a9871..3d20f6d3 100644 --- a/internal/handlers/deploy_buildfailed_autopsy_test.go +++ b/internal/handlers/deploy_buildfailed_autopsy_test.go @@ -56,6 +56,9 @@ func (m *mockProvider) Redeploy(_ context.Context, _ string, _ []byte, _ map[str func (m *mockProvider) UpdateAccessControl(_ context.Context, _ string, _ bool, _ []string) error { panic("mockProvider.UpdateAccessControl: not expected in this test") } +func (m *mockProvider) Scale(_ context.Context, _ string, _ int32) error { + panic("mockProvider.Scale: not expected in this test") +} // mockBuildLogFetcher wraps mockProvider and adds FetchBuildLogs so the handler // code can type-assert to compute.BuildLogFetcher. diff --git a/internal/handlers/deploy_redeploy_inplace_mock_test.go b/internal/handlers/deploy_redeploy_inplace_mock_test.go index c7e1c20c..488e7f3c 100644 --- a/internal/handlers/deploy_redeploy_inplace_mock_test.go +++ b/internal/handlers/deploy_redeploy_inplace_mock_test.go @@ -77,6 +77,7 @@ var deploymentColumnsList = []string{ "expires_at", "ttl_policy", "reminders_sent", "last_reminder_at", "source", "image_ref", "registry_creds_enc", "git_url", "git_ref", "git_token_enc", + "last_activity_at", "scaled_to_zero", "always_on", } // redeployMockApp wires a minimal Fiber app that drives DeployHandler.New @@ -256,6 +257,7 @@ func TestDeployNew_Redeploy_WrongTeam_DefenceInDepth(t *testing.T) { sql.NullTime{}, "permanent", 0, sql.NullTime{}, // ttl_* "tarball", "", "", // source, image_ref, registry_creds_enc (mig 064) "", "", "", // git_url, git_ref, git_token_enc (mig 065) + sql.NullTime{}, false, false, // last_activity_at, scaled_to_zero, always_on (mig 068) )) body, ct := multipartRedeployMockBody(t, map[string]string{ @@ -328,6 +330,7 @@ func 
TestDeployNew_Redeploy_UpdateStatusError_StillAccepts(t *testing.T) { sql.NullTime{}, "permanent", 0, sql.NullTime{}, "tarball", "", "", // source, image_ref, registry_creds_enc (mig 064) "", "", "", // git_url, git_ref, git_token_enc (mig 065) + sql.NullTime{}, false, false, // last_activity_at, scaled_to_zero, always_on (mig 068) )) // MarkDeploymentBuilding (guarded CAS) → driver error. The handler must @@ -335,7 +338,7 @@ func TestDeployNew_Redeploy_UpdateStatusError_StillAccepts(t *testing.T) { // non-determinate (we can't tell whether the flip landed), and // runRedeployAsync will reconcile the row later. Only an explicit 0-row // CAS miss means "reaped concurrently, return 409". - mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL,\s+scaled_to_zero = false, last_activity_at = now\(\),\s+updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). WithArgs(rowID). WillReturnError(errMockRedeployDriver) @@ -420,6 +423,7 @@ func TestDeployNew_Redeploy_EmptyProviderID_Returns409(t *testing.T) { sql.NullTime{}, "permanent", 0, sql.NullTime{}, "tarball", "", "", // source, image_ref, registry_creds_enc (mig 064) "", "", "", // git_url, git_ref, git_token_enc (mig 065) + sql.NullTime{}, false, false, // last_activity_at, scaled_to_zero, always_on (mig 068) )) body, ct := multipartRedeployMockBody(t, map[string]string{ @@ -486,10 +490,11 @@ func TestDeployNew_Redeploy_CASMiss_Returns409(t *testing.T) { sql.NullTime{}, "permanent", 0, sql.NullTime{}, "tarball", "", "", // source, image_ref, registry_creds_enc (mig 064) "", "", "", // git_url, git_ref, git_token_enc (mig 065) + sql.NullTime{}, false, false, // last_activity_at, scaled_to_zero, always_on (mig 068) )) // Guarded CAS matches 0 rows — the reaper won the race. Handler 409s. 
- mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL,\s+scaled_to_zero = false, last_activity_at = now\(\),\s+updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). WithArgs(rowID). WillReturnResult(sqlmock.NewResult(0, 0)) @@ -597,10 +602,11 @@ func TestDeployRedeploy_ByID_CASMiss_Returns409(t *testing.T) { sql.NullTime{}, "permanent", 0, sql.NullTime{}, "tarball", "", "", "", "", "", + sql.NullTime{}, false, false, )) // Guarded CAS matches 0 rows — the reaper won the race after the read. - mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL,\s+scaled_to_zero = false, last_activity_at = now\(\),\s+updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). WithArgs(rowID). WillReturnResult(sqlmock.NewResult(0, 0)) @@ -652,9 +658,10 @@ func TestDeployRedeploy_ByID_CASSuccess_Returns202(t *testing.T) { sql.NullString{}, sql.NullString{}, "unset", 0, sql.NullTime{}, "permanent", 0, sql.NullTime{}, "tarball", "", "", "", "", "", + sql.NullTime{}, false, false, )) - mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL,\s+scaled_to_zero = false, last_activity_at = now\(\),\s+updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). WithArgs(rowID). 
WillReturnResult(sqlmock.NewResult(0, 1)) @@ -702,10 +709,11 @@ func TestDeployRedeploy_ByID_CASDriverError_StillAccepts(t *testing.T) { sql.NullString{}, sql.NullString{}, "unset", 0, sql.NullTime{}, "permanent", 0, sql.NullTime{}, "tarball", "", "", "", "", "", + sql.NullTime{}, false, false, )) // Guarded CAS → driver error (non-determinate). Handler logs + continues. - mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL,\s+scaled_to_zero = false, last_activity_at = now\(\),\s+updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). WithArgs(rowID). WillReturnError(errMockRedeployDriver) diff --git a/internal/handlers/deploy_stack_internal_coverage_test.go b/internal/handlers/deploy_stack_internal_coverage_test.go index 0ef42cc8..6f64f71a 100644 --- a/internal/handlers/deploy_stack_internal_coverage_test.go +++ b/internal/handlers/deploy_stack_internal_coverage_test.go @@ -64,6 +64,9 @@ func (covPanicProvider) Redeploy(context.Context, string, []byte, map[string]str func (covPanicProvider) UpdateAccessControl(context.Context, string, bool, []string) error { panic("covPanicProvider.UpdateAccessControl: not expected") } +func (covPanicProvider) Scale(context.Context, string, int32) error { + panic("covPanicProvider.Scale: not expected") +} // covFailProvider's Deploy/Redeploy return a configurable error. 
It does NOT // implement BuildLogFetcher, so fetchBuildLogsForAutopsy returns nil diff --git a/internal/handlers/deploy_teardown_reconciler_test.go b/internal/handlers/deploy_teardown_reconciler_test.go index 9ba768c7..86638076 100644 --- a/internal/handlers/deploy_teardown_reconciler_test.go +++ b/internal/handlers/deploy_teardown_reconciler_test.go @@ -75,6 +75,9 @@ func (f *fakeTeardownProvider) Redeploy(context.Context, string, []byte, map[str func (f *fakeTeardownProvider) UpdateAccessControl(context.Context, string, bool, []string) error { return nil } +func (f *fakeTeardownProvider) Scale(context.Context, string, int32) error { + return nil +} func reconcilerRequireDB(t *testing.T) { t.Helper() diff --git a/internal/handlers/deploy_wake.go b/internal/handlers/deploy_wake.go new file mode 100644 index 00000000..71575625 --- /dev/null +++ b/internal/handlers/deploy_wake.go @@ -0,0 +1,116 @@ +package handlers + +// deploy_wake.go — explicit wake path for scale-to-zero (Task #54). +// +// WHY AN EXPLICIT WAKE (v1 design decision) +// +// instanode.dev serves a deployed app via a k8s Ingress on +// *.deployment.instanode.dev that routes straight to the per-app Service in +// the instant-deploy- namespace. The api process is NOT in the request +// path. Transparent wake-on-request (a request to a sleeping app +// auto-scales it and holds the connection until ready) therefore requires an +// ACTIVATOR proxy in front of every app — KEDA http-add-on or a Knative-style +// activator. That is a significant new dependency and is explicitly out of +// scope for the scale-to-zero v1. +// +// v1 ships scale-DOWN (worker idle-scaler) + this fast EXPLICIT wake: +// +// POST /deploy/:id/wake → scales the app back to replicas=1 and returns once +// the scale patch is accepted by k8s. 
The pod still needs its normal startup +// time before it serves traffic, so a request that races the wake gets the +// app's own cold-start latency (a brief 502/503 from the ingress until the +// pod is Ready), exactly as a fresh rollout would. Callers/dashboard/agents +// surface "sleeping — wake" and retry the app URL after waking. +// +// COLD-START CONTRACT (documented v1 limitation) +// +// - While scaled_to_zero, the app URL returns the ingress's upstream-down +// response (502/503) because there is no pod. This is the documented v1 +// trade-off of explicit wake vs a transparent activator. +// - POST /deploy/:id/wake is idempotent: waking an already-awake app just +// refreshes last_activity_at (so it won't be re-descheduled immediately). +// - The endpoint is gated by DEPLOY_SCALE_TO_ZERO_ENABLED. With the flag OFF +// it returns 501 and performs NO scaling and NO DB writes (flag-off inert). + +import ( + "errors" + "log/slog" + + "github.com/gofiber/fiber/v2" + + "instant.dev/internal/middleware" + "instant.dev/internal/models" +) + +// Wake handles POST /deploy/:id/wake. It scales a (possibly scaled-to-zero) +// deployment back to replicas=1 and clears the scaled_to_zero flag, returning +// the refreshed deployment. See the file header for the cold-start contract. +func (h *DeployHandler) Wake(c *fiber.Ctx) error { + if !h.cfg.DeployScaleToZeroEnabled { + // Flag OFF → fully inert: no scale call, no DB write. 
+ return respondError(c, fiber.StatusNotImplemented, "scale_to_zero_disabled", + "Scale-to-zero is not enabled on this platform") + } + + team, err := h.requireTeam(c) + if err != nil { + return err + } + + appID := c.Params("id") + d, err := models.GetDeploymentByAppID(c.Context(), h.db, appID) + if err != nil { + var notFound *models.ErrDeploymentNotFound + if errors.As(err, &notFound) { + return respondError(c, fiber.StatusNotFound, "not_found", "Deployment not found") + } + return respondError(c, fiber.StatusServiceUnavailable, "fetch_failed", "Failed to fetch deployment") + } + + if d.TeamID != team.ID { + // 404 not 403: never confirm the existence of another team's deployment. + return respondError(c, fiber.StatusNotFound, "not_found", "Deployment not found") + } + + // Scale the k8s Deployment back to 1 replica. A NotFound Deployment is a + // no-op inside compute.Scale (the row may have been torn down), so this only + // errors on a real k8s transport failure — surface it so the caller retries. + if d.ProviderID != "" { + if scaleErr := h.compute.Scale(c.Context(), appID, 1); scaleErr != nil { + slog.Warn("deploy.wake.scale_failed", + "app_id", appID, "provider_id", d.ProviderID, "error", scaleErr, + "request_id", middleware.GetRequestID(c)) + return respondError(c, fiber.StatusServiceUnavailable, "wake_failed", + "Failed to wake deployment; please retry") + } + } + + // DB half: clear scaled_to_zero + bump last_activity_at so the idle-scaler + // doesn't immediately re-deschedule the just-woken app. + if _, dbErr := models.WakeDeployment(c.Context(), h.db, d.ID); dbErr != nil { + slog.Error("deploy.wake.db_failed", + "app_id", appID, "error", dbErr, + "request_id", middleware.GetRequestID(c)) + return respondError(c, fiber.StatusServiceUnavailable, "wake_failed", + "Failed to record wake; please retry") + } + + // Re-read so the response reflects the cleared flag + new activity stamp. 
+ fresh, err := models.GetDeploymentByID(c.Context(), h.db, d.ID) + if err != nil { + // The scale + DB write already succeeded; a re-read failure shouldn't + // fail the wake. Fall back to the pre-read row with the fields we just set. + d.ScaledToZero = false + fresh = d + } + + slog.Info("deploy.woke", + "app_id", appID, "team_id", team.ID, + "request_id", middleware.GetRequestID(c)) + + return c.JSON(fiber.Map{ + "ok": true, + "message": "Deployment woken — the app will be reachable once its pod is Ready (cold start).", + "deployment": deploymentToMapWithDB(fresh, h.db), + }) +} diff --git a/internal/handlers/deploy_wake_mock_test.go b/internal/handlers/deploy_wake_mock_test.go new file mode 100644 index 00000000..0164e74d --- /dev/null +++ b/internal/handlers/deploy_wake_mock_test.go @@ -0,0 +1,363 @@ +package handlers + +// deploy_wake_mock_test.go — sqlmock-driven happy-path + error-branch coverage +// for POST /deploy/:id/wake (Wake handler, Task #54). The flag-off 501 path is +// covered in deploy_wake_test.go; this file covers the flag-ON branches: +// happy path (scale + DB flip + re-read), not-found, cross-team 404, scale +// failure (503), and the WakeDeployment DB-error (503). +// +// In-package test so the unexported DeployHandler fields are reachable and a +// recording compute provider can be injected without import indirection. + +import ( + "context" + "database/sql" + "errors" + "io" + "net/http" + "net/http/httptest" + "testing" + "time" + + sqlmock "github.com/DATA-DOG/go-sqlmock" + "github.com/alicebob/miniredis/v2" + "github.com/gofiber/fiber/v2" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/require" + + "instant.dev/internal/config" + "instant.dev/internal/middleware" + "instant.dev/internal/plans" + "instant.dev/internal/providers/compute" +) + +// wakeRecordingProvider records Scale calls and can be told to fail. 
+type wakeRecordingProvider struct { + scaleCalls []int32 + scaleErr error +} + +func (p *wakeRecordingProvider) Deploy(context.Context, compute.DeployOptions) (*compute.AppDeployment, error) { + return nil, nil +} +func (p *wakeRecordingProvider) Status(context.Context, string) (*compute.AppDeployment, error) { + return nil, nil +} +func (p *wakeRecordingProvider) Logs(context.Context, string, bool) (io.ReadCloser, error) { + return nil, nil +} +func (p *wakeRecordingProvider) Teardown(context.Context, string) error { return nil } +func (p *wakeRecordingProvider) Redeploy(context.Context, string, []byte, map[string]string) (*compute.AppDeployment, error) { + return nil, nil +} +func (p *wakeRecordingProvider) UpdateAccessControl(context.Context, string, bool, []string) error { + return nil +} +func (p *wakeRecordingProvider) Scale(_ context.Context, _ string, replicas int32) error { + p.scaleCalls = append(p.scaleCalls, replicas) + return p.scaleErr +} + +// wakeMockApp builds a flag-ON wake app with faked auth Locals + the recording +// provider. Returns app + teamID; tests assert Scale calls on the prov they passed in. 
+func wakeMockApp(t *testing.T, db *sql.DB, prov compute.Provider) (*fiber.App, uuid.UUID) { + t.Helper() + teamID := uuid.New() + + mr, err := miniredis.Run() + require.NoError(t, err) + t.Cleanup(mr.Close) + rdb := redis.NewClient(&redis.Options{Addr: mr.Addr()}) + t.Cleanup(func() { _ = rdb.Close() }) + + h := &DeployHandler{ + db: db, + rdb: rdb, + cfg: &config.Config{DeployScaleToZeroEnabled: true, Environment: "test"}, + compute: prov, + planRegistry: plans.Default(), + } + app := fiber.New(fiber.Config{ + ErrorHandler: func(c *fiber.Ctx, err error) error { + if errors.Is(err, ErrResponseWritten) { + return nil + } + code := fiber.StatusInternalServerError + if e, ok := err.(*fiber.Error); ok { + code = e.Code + } + return c.Status(code).JSON(fiber.Map{"ok": false, "error": "internal_error"}) + }, + }) + app.Use(func(c *fiber.Ctx) error { + c.Locals(middleware.LocalKeyTeamID, teamID.String()) + return c.Next() + }) + app.Post("/deploy/:id/wake", h.Wake) + return app, teamID +} + +// wakeDeploymentRow builds the full deploymentColumns row for sqlmock. Column +// order MUST match the deploymentColumns constant in models/deployment.go. 
+func wakeDeploymentRow(id, teamID uuid.UUID, appID, providerID string, scaledToZero bool) *sqlmock.Rows { + cols := []string{ + "id", "team_id", "resource_id", "app_id", "provider_id", "status", "app_url", + "env_vars", "port", "tier", "env", "private", "allowed_ips", "error_message", + "created_at", "updated_at", + "notify_webhook", "notify_webhook_secret", "notify_state", "notify_attempts", + "expires_at", "ttl_policy", "reminders_sent", "last_reminder_at", + "source", "image_ref", "registry_creds_enc", + "git_url", "git_ref", "git_token_enc", + "last_activity_at", "scaled_to_zero", "always_on", + } + now := time.Now() + return sqlmock.NewRows(cols).AddRow( + id, teamID, uuid.NullUUID{}, appID, providerID, "healthy", "https://x.deployment.instanode.dev", + []byte(`{}`), 8080, "hobby", "production", false, "", "", + now, now, + sql.NullString{}, sql.NullString{}, "unset", 0, + sql.NullTime{}, "permanent", 0, sql.NullTime{}, + "tarball", "", "", + "", "", "", + sql.NullTime{Time: now, Valid: true}, scaledToZero, false, + ) +} + +// wakeMockAppNoAuth is wakeMockApp without the team-injecting middleware, so +// requireTeam sees an empty team_id and the handler returns 401. Used to cover +// the `team, err := h.requireTeam(c); if err != nil { return err }` arm. 
+func wakeMockAppNoAuth(t *testing.T, db *sql.DB) *fiber.App { + t.Helper() + mr, err := miniredis.Run() + require.NoError(t, err) + t.Cleanup(mr.Close) + rdb := redis.NewClient(&redis.Options{Addr: mr.Addr()}) + t.Cleanup(func() { _ = rdb.Close() }) + + h := &DeployHandler{ + db: db, + rdb: rdb, + cfg: &config.Config{DeployScaleToZeroEnabled: true, Environment: "test"}, + compute: &wakeRecordingProvider{}, + planRegistry: plans.Default(), + } + app := fiber.New(fiber.Config{ + ErrorHandler: func(c *fiber.Ctx, err error) error { + if errors.Is(err, ErrResponseWritten) { + return nil + } + code := fiber.StatusInternalServerError + if e, ok := err.(*fiber.Error); ok { + code = e.Code + } + return c.Status(code).JSON(fiber.Map{"ok": false, "error": "internal_error"}) + }, + }) + app.Post("/deploy/:id/wake", h.Wake) + return app +} + +// TestWake_RequireTeamFails covers the requireTeam error arm: no team_id in +// Locals → 401 before any scale or DB work. +func TestWake_RequireTeamFails(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app := wakeMockAppNoAuth(t, db) + req := httptest.NewRequest(http.MethodPost, "/deploy/app-noauth/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("no-auth wake = %d, want 401", resp.StatusCode) + } +} + +// TestWake_FetchDriverError503 covers the generic GetDeploymentByAppID driver +// error arm (NOT sql.ErrNoRows) → 503 fetch_failed. +func TestWake_FetchDriverError503(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID := wakeMockApp(t, db, &wakeRecordingProvider{}) + expectTeamLookupOK(mock, teamID, "hobby") + mock.ExpectQuery(`FROM deployments WHERE app_id = \$1`). + WithArgs("app-drv"). 
+ WillReturnError(errors.New("deployments table exploded")) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-drv/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + if resp.StatusCode != http.StatusServiceUnavailable { + t.Fatalf("fetch-driver-error wake = %d, want 503", resp.StatusCode) + } +} + +// TestWake_ReReadFailureFallsBack covers the post-write re-read failure arm: +// scale + WakeDeployment already succeeded, so a failing GetDeploymentByID must +// NOT fail the wake — the handler falls back to the pre-read row with +// ScaledToZero cleared and still returns 200. +func TestWake_ReReadFailureFallsBack(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + prov := &wakeRecordingProvider{} + app, teamID := wakeMockApp(t, db, prov) + id := uuid.New() + + expectTeamLookupOK(mock, teamID, "hobby") + mock.ExpectQuery(`FROM deployments WHERE app_id = \$1`). + WithArgs("app-reread"). + WillReturnRows(wakeDeploymentRow(id, teamID, "app-reread", "app-reread", true)) + mock.ExpectExec(`UPDATE deployments`). + WithArgs(id). + WillReturnResult(sqlmock.NewResult(0, 1)) + // Re-read fails → handler must fall back, NOT 5xx. + mock.ExpectQuery(`FROM deployments WHERE id = \$1`). + WithArgs(id). 
+ WillReturnError(errors.New("re-read exploded")) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-reread/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + t.Fatalf("re-read-failure wake = %d, want 200 (fallback); body: %s", resp.StatusCode, string(body)) + } + if len(prov.scaleCalls) != 1 { + t.Errorf("expected one Scale call before re-read, got %v", prov.scaleCalls) + } + require.NoError(t, mock.ExpectationsWereMet()) +} + +func TestWake_HappyPath(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + prov := &wakeRecordingProvider{} + app, teamID := wakeMockApp(t, db, prov) + id := uuid.New() + + expectTeamLookupOK(mock, teamID, "hobby") + // GetDeploymentByAppID — asleep row owned by the team. + mock.ExpectQuery(`FROM deployments WHERE app_id = \$1`). + WithArgs("app-abc"). + WillReturnRows(wakeDeploymentRow(id, teamID, "app-abc", "app-abc", true)) + // WakeDeployment UPDATE. + mock.ExpectExec(`UPDATE deployments`). + WithArgs(id). + WillReturnResult(sqlmock.NewResult(0, 1)) + // Re-read after wake. + mock.ExpectQuery(`FROM deployments WHERE id = \$1`). + WithArgs(id). 
+ WillReturnRows(wakeDeploymentRow(id, teamID, "app-abc", "app-abc", false)) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-abc/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + t.Fatalf("wake status = %d, want 200", resp.StatusCode) + } + if len(prov.scaleCalls) != 1 || prov.scaleCalls[0] != 1 { + t.Errorf("expected one Scale(1) call, got %v", prov.scaleCalls) + } + require.NoError(t, mock.ExpectationsWereMet()) +} + +func TestWake_NotFound(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID := wakeMockApp(t, db, &wakeRecordingProvider{}) + expectTeamLookupOK(mock, teamID, "hobby") + mock.ExpectQuery(`FROM deployments WHERE app_id = \$1`). + WithArgs("app-missing"). + WillReturnError(sql.ErrNoRows) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-missing/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + if resp.StatusCode != http.StatusNotFound { + t.Fatalf("wake on missing deploy = %d, want 404", resp.StatusCode) + } +} + +func TestWake_CrossTeam404(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID := wakeMockApp(t, db, &wakeRecordingProvider{}) + otherTeam := uuid.New() + id := uuid.New() + expectTeamLookupOK(mock, teamID, "hobby") + // Row owned by a DIFFERENT team → handler must 404 (not 403). + mock.ExpectQuery(`FROM deployments WHERE app_id = \$1`). + WithArgs("app-other"). 
+ WillReturnRows(wakeDeploymentRow(id, otherTeam, "app-other", "app-other", true)) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-other/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + if resp.StatusCode != http.StatusNotFound { + t.Fatalf("cross-team wake = %d, want 404", resp.StatusCode) + } +} + +func TestWake_ScaleFailure503(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + prov := &wakeRecordingProvider{scaleErr: errors.New("k8s boom")} + app, teamID := wakeMockApp(t, db, prov) + id := uuid.New() + expectTeamLookupOK(mock, teamID, "hobby") + mock.ExpectQuery(`FROM deployments WHERE app_id = \$1`). + WithArgs("app-boom"). + WillReturnRows(wakeDeploymentRow(id, teamID, "app-boom", "app-boom", true)) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-boom/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + if resp.StatusCode != http.StatusServiceUnavailable { + t.Fatalf("scale-failure wake = %d, want 503", resp.StatusCode) + } +} + +func TestWake_DBFlipFailure503(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID := wakeMockApp(t, db, &wakeRecordingProvider{}) + id := uuid.New() + expectTeamLookupOK(mock, teamID, "hobby") + mock.ExpectQuery(`FROM deployments WHERE app_id = \$1`). + WithArgs("app-dbfail"). + WillReturnRows(wakeDeploymentRow(id, teamID, "app-dbfail", "app-dbfail", true)) + mock.ExpectExec(`UPDATE deployments`). + WithArgs(id). 
+ WillReturnError(errors.New("db exploded")) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-dbfail/wake", nil) + resp, err := app.Test(req, 2000) + require.NoError(t, err) + defer resp.Body.Close() + if resp.StatusCode != http.StatusServiceUnavailable { + t.Fatalf("db-flip-failure wake = %d, want 503", resp.StatusCode) + } +} diff --git a/internal/handlers/deploy_wake_test.go b/internal/handlers/deploy_wake_test.go new file mode 100644 index 00000000..df8bb1b6 --- /dev/null +++ b/internal/handlers/deploy_wake_test.go @@ -0,0 +1,83 @@ +package handlers + +// deploy_wake_test.go — scale-to-zero wake endpoint coverage (Task #54). +// +// The flag-off path is the load-bearing safety property (rule: default OFF, +// inert when off). It must short-circuit with 501 BEFORE any auth lookup, scale +// call, or DB write — so this test constructs the handler with the flag off and +// asserts a 501 with no compute interaction. A panicking compute provider proves +// the handler never reaches the scale layer when the flag is off. + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/gofiber/fiber/v2" + + "instant.dev/internal/config" + "instant.dev/internal/providers/compute" +) + +// wakePanicProvider satisfies compute.Provider; Scale panics so a flag-off wake +// that incorrectly reaches the compute layer fails loudly. 
+type wakePanicProvider struct{} + +func (wakePanicProvider) Deploy(context.Context, compute.DeployOptions) (*compute.AppDeployment, error) { + panic("Deploy: not expected") +} +func (wakePanicProvider) Status(context.Context, string) (*compute.AppDeployment, error) { + panic("Status: not expected") +} +func (wakePanicProvider) Logs(context.Context, string, bool) (io.ReadCloser, error) { + panic("Logs: not expected") +} +func (wakePanicProvider) Teardown(context.Context, string) error { panic("Teardown: not expected") } +func (wakePanicProvider) Redeploy(context.Context, string, []byte, map[string]string) (*compute.AppDeployment, error) { + panic("Redeploy: not expected") +} +func (wakePanicProvider) UpdateAccessControl(context.Context, string, bool, []string) error { + panic("UpdateAccessControl: not expected") +} +func (wakePanicProvider) Scale(context.Context, string, int32) error { + panic("Scale: not expected when scale-to-zero flag is OFF") +} + +// TestWake_FlagOff_Returns501Inert proves the wake endpoint is fully inert when +// DEPLOY_SCALE_TO_ZERO_ENABLED is off: 501 response, and the (panicking) +// compute provider is never touched. +func TestWake_FlagOff_Returns501Inert(t *testing.T) { + h := &DeployHandler{ + cfg: &config.Config{DeployScaleToZeroEnabled: false}, + compute: wakePanicProvider{}, + } + // Mirror the production fiber ErrorHandler so respondError's + // ErrResponseWritten sentinel isn't turned into a 500 by the default handler. 
+ app := fiber.New(fiber.Config{ + ErrorHandler: func(_ *fiber.Ctx, err error) error { + if err == ErrResponseWritten { + return nil + } + return err + }, + }) + app.Post("/deploy/:id/wake", h.Wake) + + req := httptest.NewRequest(http.MethodPost, "/deploy/app-123/wake", nil) + resp, err := app.Test(req, 1000) + if err != nil { + t.Fatalf("app.Test: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusNotImplemented { + t.Fatalf("flag-off wake status = %d, want 501", resp.StatusCode) + } + body, _ := io.ReadAll(resp.Body) + if !strings.Contains(string(body), "scale_to_zero_disabled") { + t.Errorf("flag-off body = %q; want scale_to_zero_disabled error code", string(body)) + } +} diff --git a/internal/handlers/helpers.go b/internal/handlers/helpers.go index 7abea9b9..ba905bb2 100644 --- a/internal/handlers/helpers.go +++ b/internal/handlers/helpers.go @@ -953,6 +953,16 @@ var codeToAgentAction = map[string]errorCodeMeta{ "fetch_failed": { AgentAction: "Tell the user the fetch hit a transient backend error. Retry in 30 seconds — see https://instanode.dev/status.", }, + // Scale-to-zero wake path (deploy_wake.go). `scale_to_zero_disabled` is a + // 501 returned when the DEPLOY_SCALE_TO_ZERO_ENABLED flag is off — the wake + // route is inert, so there is no agent retry; just inform the user. `wake_failed` + // is a transient 503 (k8s scale-up or the post-wake DB write failed) — retry. + "scale_to_zero_disabled": { + AgentAction: "Tell the user scale-to-zero wake isn't enabled on this deployment. No action needed; the app stays as-is. See https://instanode.dev/status.", + }, + "wake_failed": { + AgentAction: "Tell the user waking the sleeping deployment hit a transient error. Retry in 30 seconds; if it persists check https://instanode.dev/status.", + }, "create_failed": { AgentAction: "Tell the user the resource could not be created right now. 
Retry in 30 seconds; if it persists check https://instanode.dev/status.", }, diff --git a/internal/handlers/openapi.go b/internal/handlers/openapi.go index e6b31c12..a1abe2b4 100644 --- a/internal/handlers/openapi.go +++ b/internal/handlers/openapi.go @@ -586,6 +586,21 @@ const openAPISpec = `{ } } }, + "/deploy/{id}/wake": { + "post": { + "summary": "Wake a scaled-to-zero (sleeping) deployment", + "description": "Scale-to-zero (Task #54). Scales an idle, descheduled app back to one replica and clears its sleeping state. The app becomes reachable once its pod is Ready (a one-time cold start — a request that races the wake gets the ingress's upstream-down response until the pod is up). Idempotent: waking an already-awake app just refreshes its last-activity marker so the idle-scaler won't immediately re-deschedule it. Returns 501 when scale-to-zero is not enabled on the platform (the default). Cross-tenant requests return 404.", + "security": [{ "bearerAuth": [] }], + "parameters": [{ "name": "id", "in": "path", "required": true, "schema": { "type": "string" }, "description": "Deployment id (UUID or short app_id slug)." }], + "responses": { + "200": { "description": "Deployment woken", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/DeployResponse" } } } }, + "401": { "description": "Unauthorized" }, + "404": { "description": "Not found (or owned by another team)" }, + "501": { "description": "scale_to_zero_disabled — scale-to-zero is not enabled on this platform (default)." }, + "503": { "description": "wake_failed — transient failure scaling the app; retry." 
} + } + } + }, "/api/v1/deployments/{id}/make-permanent": { "post": { "summary": "Opt a deployment out of the auto-24h TTL", diff --git a/internal/models/coverage_deployment_test.go b/internal/models/coverage_deployment_test.go index 0bcfb1e6..82d59bb9 100644 --- a/internal/models/coverage_deployment_test.go +++ b/internal/models/coverage_deployment_test.go @@ -351,7 +351,7 @@ func TestMarkDeploymentBuilding_Branches(t *testing.T) { // 1 row matched — redeployable status flipped to building. db, mock := newMock(t) - mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL,\s+scaled_to_zero = false, last_activity_at = now\(\),\s+updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). WillReturnResult(sqlmock.NewResult(0, 1)) n, err := MarkDeploymentBuilding(ctx, db, uuid.New()) require.NoError(t, err) @@ -359,7 +359,7 @@ func TestMarkDeploymentBuilding_Branches(t *testing.T) { // 0 rows matched — the row was reaped to a terminal status; CAS no-op. db2, mock2 := newMock(t) - mock2.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + mock2.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL,\s+scaled_to_zero = false, last_activity_at = now\(\),\s+updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). 
WillReturnResult(sqlmock.NewResult(0, 0)) n, err = MarkDeploymentBuilding(ctx, db2, uuid.New()) require.NoError(t, err) diff --git a/internal/models/coverage_provision_gate_test.go b/internal/models/coverage_provision_gate_test.go index ee4f72b6..ae34e8c4 100644 --- a/internal/models/coverage_provision_gate_test.go +++ b/internal/models/coverage_provision_gate_test.go @@ -19,6 +19,7 @@ func deploymentMockCols() []string { "expires_at", "ttl_policy", "reminders_sent", "last_reminder_at", "source", "image_ref", "registry_creds_enc", "git_url", "git_ref", "git_token_enc", + "last_activity_at", "scaled_to_zero", "always_on", } } @@ -30,6 +31,7 @@ func deploymentMockRow() *sqlmock.Rows { nil, "auto_24h", 0, nil, "tarball", "", "", // source, image_ref, registry_creds_enc (mig 064) "", "", "", // git_url, git_ref, git_token_enc (mig 065) + nil, false, false, // last_activity_at, scaled_to_zero, always_on (mig 068) ) } diff --git a/internal/models/deployment.go b/internal/models/deployment.go index 843ec50a..7f1ff56f 100644 --- a/internal/models/deployment.go +++ b/internal/models/deployment.go @@ -77,6 +77,20 @@ type Deployment struct { TTLPolicy string RemindersSent int LastReminderAt sql.NullTime + // Scale-to-zero state (migration 068). Inert unless the worker + // DEPLOY_SCALE_TO_ZERO_ENABLED flag is on. + // + // LastActivityAt is the floor "last known activity" marker — set at + // create-time, bumped on every wake + redeploy. The idle-scaler + // deschedules a Deployment only when now()-LastActivityAt exceeds the + // idle threshold. v1 captures deploy/redeploy/wake events, not per-HTTP + // traffic (see migration 068 + the worker job header). + // + // ScaledToZero is true while the app is currently descheduled (replicas=0). + // AlwaysOn opts an app out of scale-to-zero entirely (pinned, no cold start).
+ LastActivityAt sql.NullTime + ScaledToZero bool + AlwaysOn bool CreatedAt time.Time UpdatedAt time.Time } @@ -153,7 +167,8 @@ const deploymentColumns = `id, team_id, resource_id, app_id, provider_id, status notify_webhook, notify_webhook_secret, notify_state, notify_attempts, expires_at, ttl_policy, reminders_sent, last_reminder_at, source, image_ref, registry_creds_enc, - git_url, git_ref, git_token_enc` + git_url, git_ref, git_token_enc, + last_activity_at, scaled_to_zero, always_on` // scanDeployment reads a single deployments row into a Deployment struct. // env_vars is stored as JSONB; error_message, provider_id, and app_url are nullable. @@ -189,6 +204,10 @@ func scanDeployment(row interface { // migration 065: git source deploys. NOT NULL DEFAULT '' — plain string // scan targets are safe. &d.GitURL, &d.GitRef, &d.GitTokenEnc, + // migration 068: scale-to-zero state. last_activity_at is nullable + // (legacy rows backfilled from updated_at); scaled_to_zero / always_on + // are NOT NULL DEFAULT false → plain bool scan targets are safe. + &d.LastActivityAt, &d.ScaledToZero, &d.AlwaysOn, ); err != nil { return nil, err } @@ -326,8 +345,9 @@ func CreateDeployment(ctx context.Context, db dbExecutor, p CreateDeploymentPara notify_webhook, notify_webhook_secret, notify_state, expires_at, ttl_policy, source, image_ref, registry_creds_enc, - git_url, git_ref, git_token_enc) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20) + git_url, git_ref, git_token_enc, + last_activity_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, now()) RETURNING `+deploymentColumns, p.TeamID, resourceID, p.AppID, port, p.Tier, env, envVarsJSON, p.Private, allowedIPs, @@ -528,9 +548,15 @@ const redeployableStatusesSQL = `('building', 'deploying', 'healthy', 'failed')` // updated_at is set to now() by the database. 
error_message is cleared (a // fresh build supersedes any prior failure message). func MarkDeploymentBuilding(ctx context.Context, db *sql.DB, id uuid.UUID) (int64, error) { + // A redeploy is genuine activity AND brings replicas back to 1 (the new + // rollout starts the app), so it also clears any scaled_to_zero state and + // bumps last_activity_at — otherwise the idle-scaler could immediately + // re-deschedule a freshly-redeployed app (migration 068). res, err := db.ExecContext(ctx, ` UPDATE deployments - SET status = 'building', error_message = NULL, updated_at = now() + SET status = 'building', error_message = NULL, + scaled_to_zero = false, last_activity_at = now(), + updated_at = now() WHERE id = $1 AND status IN `+redeployableStatusesSQL+` `, id) if err != nil { @@ -540,6 +566,69 @@ func MarkDeploymentBuilding(ctx context.Context, db *sql.DB, id uuid.UUID) (int6 return n, nil } +// ── Scale-to-zero state mutators (migration 068, Task #54) ────────────────── +// +// These back the idle-scaler (worker) and the explicit-wake endpoint (api). +// All are inert unless the worker DEPLOY_SCALE_TO_ZERO_ENABLED flag is on — +// nothing calls them otherwise. + +// MarkDeploymentScaledToZero flips scaled_to_zero=true on a row. Called by the +// api after a successful compute.Scale(appID, 0). The CAS guard keeps the +// write narrow: only a currently-healthy, not-already-zeroed, not-always-on +// row is descheduled, so a concurrent wake / redeploy / teardown that changed +// the row between the scaler's SELECT and this UPDATE reports 0 rows and the +// scaler treats it as "raced — skip". Returns rows affected. 
+func MarkDeploymentScaledToZero(ctx context.Context, db dbExecutor, id uuid.UUID) (int64, error) { + res, err := db.ExecContext(ctx, ` + UPDATE deployments + SET scaled_to_zero = true, updated_at = now() + WHERE id = $1 + AND status = 'healthy' + AND scaled_to_zero = false + AND always_on = false + `, id) + if err != nil { + return 0, fmt.Errorf("models.MarkDeploymentScaledToZero: %w", err) + } + n, _ := res.RowsAffected() + return n, nil +} + +// WakeDeployment clears scaled_to_zero and bumps last_activity_at — the DB half +// of a wake. Called by the api wake endpoint after a successful +// compute.Scale(appID, 1). Idempotent: a row that is already awake (or never +// slept) simply has its last_activity_at refreshed, which is the correct +// "this app was just touched" semantics. Returns rows affected (0 only when +// the id does not exist). The UPDATE filters on id only, so it applies to a row in any status. +func WakeDeployment(ctx context.Context, db dbExecutor, id uuid.UUID) (int64, error) { + res, err := db.ExecContext(ctx, ` + UPDATE deployments + SET scaled_to_zero = false, last_activity_at = now(), updated_at = now() + WHERE id = $1 + `, id) + if err != nil { + return 0, fmt.Errorf("models.WakeDeployment: %w", err) + } + n, _ := res.RowsAffected() + return n, nil +} + +// SetDeploymentAlwaysOn toggles the per-app scale-to-zero opt-out. always_on=true +// pins the app (the idle-scaler never deschedules it); always_on=false re-enables +// scale-to-zero eligibility. Returns rows affected (0 when the id does not exist).
+func SetDeploymentAlwaysOn(ctx context.Context, db dbExecutor, id uuid.UUID, alwaysOn bool) (int64, error) { + res, err := db.ExecContext(ctx, ` + UPDATE deployments + SET always_on = $2, updated_at = now() + WHERE id = $1 + `, id, alwaysOn) + if err != nil { + return 0, fmt.Errorf("models.SetDeploymentAlwaysOn: %w", err) + } + n, _ := res.RowsAffected() + return n, nil +} + // UpdateDeploymentProviderID records the k8s Deployment name and the resolved app URL // after the k8s Deployment object has been successfully created. // updated_at is set to now() by the database. diff --git a/internal/models/deployment_scale_test.go b/internal/models/deployment_scale_test.go new file mode 100644 index 00000000..adbaa8b7 --- /dev/null +++ b/internal/models/deployment_scale_test.go @@ -0,0 +1,279 @@ +package models_test + +// deployment_scale_test.go — scale-to-zero model coverage (migration 068, +// Task #54). Covers: CreateDeployment seeds last_activity_at + defaults +// scaled_to_zero/always_on=false; MarkDeploymentScaledToZero CAS (only a +// healthy, not-already-zeroed, not-always-on row is descheduled); +// WakeDeployment clears the flag + bumps activity; SetDeploymentAlwaysOn +// toggles the opt-out; MarkDeploymentBuilding (redeploy) clears scaled_to_zero. +// +// Skips when TEST_DATABASE_URL is unset (see requireDB). + +import ( + "context" + "errors" + "testing" + "time" + + sqlmock "github.com/DATA-DOG/go-sqlmock" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "instant.dev/internal/models" + "instant.dev/internal/testhelpers" +) + +// errScaleDriver is the sentinel sqlmock returns for the driver-error arms of +// the scale-to-zero model writes. Named so the wrapped %w error is searchable. 
+var errScaleDriver = errors.New("mock: deployments UPDATE exploded") + +// TestMarkDeploymentScaledToZero_DriverError pins the error return (the +// fmt.Errorf wrap) when the UPDATE itself fails — distinct from a 0-row CAS +// miss, which is not an error. +func TestMarkDeploymentScaledToZero_DriverError(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + id := uuid.New() + mock.ExpectExec(`UPDATE deployments`).WithArgs(id).WillReturnError(errScaleDriver) + + n, err := models.MarkDeploymentScaledToZero(context.Background(), db, id) + require.Error(t, err) + assert.ErrorIs(t, err, errScaleDriver) + assert.Equal(t, int64(0), n) + require.NoError(t, mock.ExpectationsWereMet()) +} + +// TestWakeDeployment_DriverError pins WakeDeployment's error return. +func TestWakeDeployment_DriverError(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + id := uuid.New() + mock.ExpectExec(`UPDATE deployments`).WithArgs(id).WillReturnError(errScaleDriver) + + n, err := models.WakeDeployment(context.Background(), db, id) + require.Error(t, err) + assert.ErrorIs(t, err, errScaleDriver) + assert.Equal(t, int64(0), n) + require.NoError(t, mock.ExpectationsWereMet()) +} + +// TestSetDeploymentAlwaysOn_DriverError pins SetDeploymentAlwaysOn's error return. 
+func TestSetDeploymentAlwaysOn_DriverError(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + id := uuid.New() + mock.ExpectExec(`UPDATE deployments`).WithArgs(id, true).WillReturnError(errScaleDriver) + + n, err := models.SetDeploymentAlwaysOn(context.Background(), db, id, true) + require.Error(t, err) + assert.ErrorIs(t, err, errScaleDriver) + assert.Equal(t, int64(0), n) + require.NoError(t, mock.ExpectationsWereMet()) +} + +func TestCreateDeployment_SeedsScaleToZeroDefaults(t *testing.T) { + requireDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + + teamID := uuid.MustParse(testhelpers.MustCreateTeamDB(t, db, "hobby")) + defer db.Exec(`DELETE FROM teams WHERE id = $1`, teamID) + + ctx := context.Background() + d, err := models.CreateDeployment(ctx, db, models.CreateDeploymentParams{ + TeamID: teamID, + AppID: "app-s2z-defaults-" + uuid.NewString()[:8], + Tier: "hobby", + TTLPolicy: models.DeployTTLPolicyPermanent, + }) + require.NoError(t, err) + defer db.Exec(`DELETE FROM deployments WHERE id = $1`, d.ID) + + assert.False(t, d.ScaledToZero, "new deploy must not be scaled_to_zero") + assert.False(t, d.AlwaysOn, "new deploy must default always_on=false") + require.True(t, d.LastActivityAt.Valid, "new deploy must seed last_activity_at") + assert.WithinDuration(t, time.Now(), d.LastActivityAt.Time, 60*time.Second, + "last_activity_at must be ≈ now() at create time") +} + +func TestMarkDeploymentScaledToZero_CAS(t *testing.T) { + requireDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + + teamID := uuid.MustParse(testhelpers.MustCreateTeamDB(t, db, "hobby")) + defer db.Exec(`DELETE FROM teams WHERE id = $1`, teamID) + ctx := context.Background() + + d, err := models.CreateDeployment(ctx, db, models.CreateDeploymentParams{ + TeamID: teamID, AppID: "app-s2z-cas-" + uuid.NewString()[:8], + Tier: "hobby", TTLPolicy: models.DeployTTLPolicyPermanent, + }) + require.NoError(t, err) + 
defer db.Exec(`DELETE FROM deployments WHERE id = $1`, d.ID) + + // Force healthy (the only descheduable status). + _, err = db.Exec(`UPDATE deployments SET status='healthy' WHERE id=$1`, d.ID) + require.NoError(t, err) + + // Healthy + not-zeroed + not-always-on → descheduled (1 row). + n, err := models.MarkDeploymentScaledToZero(ctx, db, d.ID) + require.NoError(t, err) + assert.Equal(t, int64(1), n, "healthy row must be descheduled") + + got, err := models.GetDeploymentByID(ctx, db, d.ID) + require.NoError(t, err) + assert.True(t, got.ScaledToZero, "row must now be scaled_to_zero") + + // Second call → already zeroed → CAS skips (0 rows). + n, err = models.MarkDeploymentScaledToZero(ctx, db, d.ID) + require.NoError(t, err) + assert.Equal(t, int64(0), n, "already-zeroed row must be skipped") +} + +func TestMarkDeploymentScaledToZero_SkipsAlwaysOn(t *testing.T) { + requireDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + + teamID := uuid.MustParse(testhelpers.MustCreateTeamDB(t, db, "hobby")) + defer db.Exec(`DELETE FROM teams WHERE id = $1`, teamID) + ctx := context.Background() + + d, err := models.CreateDeployment(ctx, db, models.CreateDeploymentParams{ + TeamID: teamID, AppID: "app-s2z-pin-" + uuid.NewString()[:8], + Tier: "hobby", TTLPolicy: models.DeployTTLPolicyPermanent, + }) + require.NoError(t, err) + defer db.Exec(`DELETE FROM deployments WHERE id = $1`, d.ID) + + _, err = db.Exec(`UPDATE deployments SET status='healthy', always_on=true WHERE id=$1`, d.ID) + require.NoError(t, err) + + n, err := models.MarkDeploymentScaledToZero(ctx, db, d.ID) + require.NoError(t, err) + assert.Equal(t, int64(0), n, "always_on (pinned) row must NOT be descheduled") +} + +func TestMarkDeploymentScaledToZero_SkipsNonHealthy(t *testing.T) { + requireDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + + teamID := uuid.MustParse(testhelpers.MustCreateTeamDB(t, db, "hobby")) + defer db.Exec(`DELETE FROM teams WHERE id = $1`, teamID) + ctx 
:= context.Background() + + d, err := models.CreateDeployment(ctx, db, models.CreateDeploymentParams{ + TeamID: teamID, AppID: "app-s2z-building-" + uuid.NewString()[:8], + Tier: "hobby", TTLPolicy: models.DeployTTLPolicyPermanent, + }) + require.NoError(t, err) + defer db.Exec(`DELETE FROM deployments WHERE id = $1`, d.ID) + + // Leaves status at the create default (building) — not descheduable. + n, err := models.MarkDeploymentScaledToZero(ctx, db, d.ID) + require.NoError(t, err) + assert.Equal(t, int64(0), n, "a non-healthy (building) row must NOT be descheduled") +} + +func TestWakeDeployment_ClearsFlagAndBumpsActivity(t *testing.T) { + requireDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + + teamID := uuid.MustParse(testhelpers.MustCreateTeamDB(t, db, "hobby")) + defer db.Exec(`DELETE FROM teams WHERE id = $1`, teamID) + ctx := context.Background() + + d, err := models.CreateDeployment(ctx, db, models.CreateDeploymentParams{ + TeamID: teamID, AppID: "app-s2z-wake-" + uuid.NewString()[:8], + Tier: "hobby", TTLPolicy: models.DeployTTLPolicyPermanent, + }) + require.NoError(t, err) + defer db.Exec(`DELETE FROM deployments WHERE id = $1`, d.ID) + + // Put it to sleep with a stale last_activity_at. 
+ stale := time.Now().Add(-90 * time.Minute) + _, err = db.Exec(`UPDATE deployments SET status='healthy', scaled_to_zero=true, last_activity_at=$2 WHERE id=$1`, + d.ID, stale) + require.NoError(t, err) + + n, err := models.WakeDeployment(ctx, db, d.ID) + require.NoError(t, err) + assert.Equal(t, int64(1), n) + + got, err := models.GetDeploymentByID(ctx, db, d.ID) + require.NoError(t, err) + assert.False(t, got.ScaledToZero, "wake must clear scaled_to_zero") + require.True(t, got.LastActivityAt.Valid) + assert.WithinDuration(t, time.Now(), got.LastActivityAt.Time, 60*time.Second, + "wake must bump last_activity_at to ≈ now()") +} + +func TestSetDeploymentAlwaysOn_Toggle(t *testing.T) { + requireDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + + teamID := uuid.MustParse(testhelpers.MustCreateTeamDB(t, db, "hobby")) + defer db.Exec(`DELETE FROM teams WHERE id = $1`, teamID) + ctx := context.Background() + + d, err := models.CreateDeployment(ctx, db, models.CreateDeploymentParams{ + TeamID: teamID, AppID: "app-s2z-pin-toggle-" + uuid.NewString()[:8], + Tier: "hobby", TTLPolicy: models.DeployTTLPolicyPermanent, + }) + require.NoError(t, err) + defer db.Exec(`DELETE FROM deployments WHERE id = $1`, d.ID) + + n, err := models.SetDeploymentAlwaysOn(ctx, db, d.ID, true) + require.NoError(t, err) + assert.Equal(t, int64(1), n) + got, err := models.GetDeploymentByID(ctx, db, d.ID) + require.NoError(t, err) + assert.True(t, got.AlwaysOn, "always_on must be set true") + + _, err = models.SetDeploymentAlwaysOn(ctx, db, d.ID, false) + require.NoError(t, err) + got, err = models.GetDeploymentByID(ctx, db, d.ID) + require.NoError(t, err) + assert.False(t, got.AlwaysOn, "always_on must toggle back false") +} + +func TestMarkDeploymentBuilding_ClearsScaledToZero(t *testing.T) { + requireDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + + teamID := uuid.MustParse(testhelpers.MustCreateTeamDB(t, db, "hobby")) + defer db.Exec(`DELETE FROM teams 
WHERE id = $1`, teamID) + ctx := context.Background() + + d, err := models.CreateDeployment(ctx, db, models.CreateDeploymentParams{ + TeamID: teamID, AppID: "app-s2z-redeploy-" + uuid.NewString()[:8], + Tier: "hobby", TTLPolicy: models.DeployTTLPolicyPermanent, + }) + require.NoError(t, err) + defer db.Exec(`DELETE FROM deployments WHERE id = $1`, d.ID) + + // Asleep + healthy → a redeploy (MarkDeploymentBuilding) must wake it. + _, err = db.Exec(`UPDATE deployments SET status='healthy', scaled_to_zero=true WHERE id=$1`, d.ID) + require.NoError(t, err) + + n, err := models.MarkDeploymentBuilding(ctx, db, d.ID) + require.NoError(t, err) + assert.Equal(t, int64(1), n) + + got, err := models.GetDeploymentByID(ctx, db, d.ID) + require.NoError(t, err) + assert.False(t, got.ScaledToZero, "redeploy must clear scaled_to_zero") + assert.Equal(t, "building", got.Status) +} diff --git a/internal/providers/compute/k8s/client.go b/internal/providers/compute/k8s/client.go index 72f936b5..4a312a78 100644 --- a/internal/providers/compute/k8s/client.go +++ b/internal/providers/compute/k8s/client.go @@ -2395,6 +2395,47 @@ func (p *K8sProvider) UpdateAccessControl(ctx context.Context, appID string, pri return nil } +// Scale sets the app's Deployment replica count in place via Get + Update (scale-to-zero, +// Task #54). replicas=0 deschedules an idle app to ~$0 compute; replicas=1 +// wakes it. No image rebuild, no namespace change — only the Deployment's +// spec.replicas is mutated, so the existing image, env, ingress and TLS cert +// are preserved across a sleep/wake cycle. +// +// A NotFound Deployment (namespace already torn down, or the row was never +// fully deployed) is treated as a no-op success: the idle-scaler must not +// wedge on a stale row, and a wake on a torn-down app is harmless. Any other +// k8s error — including a write conflict on the Get-then-Update, which is not retried here — is surfaced so the caller can retry / record a wake_failed metric.
+func (p *K8sProvider) Scale(ctx context.Context, appID string, replicas int32) error { + ns := deployNamespace(appID) + name := deploymentName(appID) + + deploy, err := p.clientset.AppsV1().Deployments(ns).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + slog.Warn("k8s.Scale: deployment not found — no-op", + "app_id", appID, "namespace", ns, "replicas", replicas) + return nil + } + return fmt.Errorf("k8s.Scale: get deployment %q in %q: %w", name, ns, err) + } + + // Idempotent: if already at the target replica count, skip the write so a + // repeated scaler tick / concurrent wake doesn't churn the API server. + if deploy.Spec.Replicas != nil && *deploy.Spec.Replicas == replicas { + slog.Debug("k8s.Scale: already at target replicas — no-op", + "app_id", appID, "namespace", ns, "replicas", replicas) + return nil + } + + deploy.Spec.Replicas = &replicas + if _, err := p.clientset.AppsV1().Deployments(ns).Update(ctx, deploy, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("k8s.Scale: update deployment %q in %q to replicas=%d: %w", name, ns, replicas, err) + } + slog.Info("k8s.Scale: deployment replicas patched", + "app_id", appID, "namespace", ns, "replicas", replicas) + return nil +} + // deployIngressURL returns the public Ingress URL for an appID if DEPLOY_DOMAIN // is configured. Caller uses this to compute the AppURL during Status/Redeploy // without re-querying the k8s API (the value is deterministic from env + appID). diff --git a/internal/providers/compute/k8s/scale_test.go b/internal/providers/compute/k8s/scale_test.go new file mode 100644 index 00000000..1c5d7d7e --- /dev/null +++ b/internal/providers/compute/k8s/scale_test.go @@ -0,0 +1,119 @@ +package k8s + +// scale_test.go — unit tests for K8sProvider.Scale (scale-to-zero, Task #54). 
+// +// Covers: scale-down patches replicas to 0, wake patches back to 1, an +// already-at-target call is a no-op (no Update), a NotFound Deployment is a +// no-op success (the idle-scaler must not wedge on a stale row), and a +// transport-level Get/Update error is surfaced. + +import ( + "context" + "errors" + "testing" + + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientfake "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" +) + +// seedScalableDeployment creates the Deployment named deploymentName(appID) in namespace deployNamespace(appID) +// with the given replica count so Scale has something to patch. +func seedScalableDeployment(cs *clientfake.Clientset, appID string, replicas int32) error { + ns := deployNamespace(appID) + r := replicas + _, err := cs.AppsV1().Deployments(ns).Create(context.Background(), &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: deploymentName(appID), Namespace: ns}, + Spec: appsv1.DeploymentSpec{Replicas: &r}, + }, metav1.CreateOptions{}) + return err +} + +func currentReplicas(t *testing.T, cs *clientfake.Clientset, appID string) int32 { + t.Helper() + d, err := cs.AppsV1().Deployments(deployNamespace(appID)).Get( + context.Background(), deploymentName(appID), metav1.GetOptions{}) + if err != nil { + t.Fatalf("get deployment: %v", err) + } + if d.Spec.Replicas == nil { + return -1 + } + return *d.Spec.Replicas +} + +func TestScale_DownThenWake(t *testing.T) { + cs := clientfake.NewSimpleClientset() + if err := seedScalableDeployment(cs, "abc", 1); err != nil { + t.Fatalf("seed: %v", err) + } + p := &K8sProvider{clientset: cs} + + // Scale down to 0 (idle descheduling). + if err := p.Scale(context.Background(), "abc", 0); err != nil { + t.Fatalf("Scale(0): %v", err) + } + if got := currentReplicas(t, cs, "abc"); got != 0 { + t.Errorf("after Scale(0): replicas = %d; want 0", got) + } + + // Wake back to 1.
+ if err := p.Scale(context.Background(), "abc", 1); err != nil { + t.Fatalf("Scale(1): %v", err) + } + if got := currentReplicas(t, cs, "abc"); got != 1 { + t.Errorf("after Scale(1): replicas = %d; want 1", got) + } +} + +func TestScale_AlreadyAtTargetNoUpdate(t *testing.T) { + cs := clientfake.NewSimpleClientset() + if err := seedScalableDeployment(cs, "abc", 0); err != nil { + t.Fatalf("seed: %v", err) + } + // Fail any Update so the test proves the idempotent branch skips it. + cs.PrependReactor("update", "deployments", func(_ clienttesting.Action) (bool, runtime.Object, error) { + return true, nil, errors.New("update should not be called when already at target") + }) + p := &K8sProvider{clientset: cs} + + if err := p.Scale(context.Background(), "abc", 0); err != nil { + t.Fatalf("Scale(0) on already-zero deployment should be a no-op, got: %v", err) + } +} + +func TestScale_NotFoundIsNoOp(t *testing.T) { + cs := clientfake.NewSimpleClientset() + p := &K8sProvider{clientset: cs} + // No seeded deployment → Get returns NotFound → Scale must succeed (no-op). 
+ if err := p.Scale(context.Background(), "missing", 0); err != nil { + t.Errorf("Scale on missing deployment should be no-op success, got: %v", err) + } +} + +func TestScale_GetErrorSurfaced(t *testing.T) { + cs := clientfake.NewSimpleClientset() + cs.PrependReactor("get", "deployments", func(_ clienttesting.Action) (bool, runtime.Object, error) { + return true, nil, errors.New("boom") + }) + p := &K8sProvider{clientset: cs} + if err := p.Scale(context.Background(), "abc", 0); err == nil { + t.Error("Scale should surface a transport-level Get error") + } +} + +func TestScale_UpdateErrorSurfaced(t *testing.T) { + cs := clientfake.NewSimpleClientset() + if err := seedScalableDeployment(cs, "abc", 1); err != nil { + t.Fatalf("seed: %v", err) + } + cs.PrependReactor("update", "deployments", func(_ clienttesting.Action) (bool, runtime.Object, error) { + return true, nil, errors.New("boom") + }) + p := &K8sProvider{clientset: cs} + if err := p.Scale(context.Background(), "abc", 0); err == nil { + t.Error("Scale should surface a transport-level Update error") + } +} diff --git a/internal/providers/compute/noop/noop.go b/internal/providers/compute/noop/noop.go index 86f3afc1..27b17736 100644 --- a/internal/providers/compute/noop/noop.go +++ b/internal/providers/compute/noop/noop.go @@ -77,6 +77,16 @@ func (n *NoopProvider) Redeploy(_ context.Context, providerID string, _ []byte, }, nil } +// Scale logs a warning and returns nil. The DB-only scaled_to_zero flip is the +// user-visible change on a backend with no real Deployment. +func (n *NoopProvider) Scale(_ context.Context, appID string, replicas int32) error { + slog.Warn("compute.noop: Scale called but compute is disabled", + "app_id", appID, + "replicas", replicas, + ) + return nil +} + // UpdateAccessControl logs a warning and returns nil. Tests use this — the // DB-only update is the user-visible change. 
func (n *NoopProvider) UpdateAccessControl(_ context.Context, appID string, private bool, allowedIPs []string) error { diff --git a/internal/providers/compute/noop/noop_test.go b/internal/providers/compute/noop/noop_test.go index 4a6f81e1..ab775ae6 100644 --- a/internal/providers/compute/noop/noop_test.go +++ b/internal/providers/compute/noop/noop_test.go @@ -122,6 +122,17 @@ func TestNoop_UpdateAccessControl(t *testing.T) { } } +// TestNoop_Scale verifies Scale is a no-op for both scale-down and wake. +func TestNoop_Scale(t *testing.T) { + p := New() + if err := p.Scale(context.Background(), "appid", 0); err != nil { + t.Errorf("Scale(0): %v", err) + } + if err := p.Scale(context.Background(), "appid", 1); err != nil { + t.Errorf("Scale(1): %v", err) + } +} + // ── Stack provider ────────────────────────────────────────────────────────── // TestNoop_NewStack verifies the stack constructor. diff --git a/internal/providers/compute/provider.go b/internal/providers/compute/provider.go index 77c56a84..3e9da684 100644 --- a/internal/providers/compute/provider.go +++ b/internal/providers/compute/provider.go @@ -76,6 +76,18 @@ type Provider interface { // Redeploy rebuilds the image from a new tarball and rolls out. Redeploy(ctx context.Context, providerID string, tarball []byte, envVars map[string]string) (*AppDeployment, error) + // Scale patches the app's k8s Deployment replica count in place — no + // image rebuild, no namespace change. Backs scale-to-zero (Task #54): + // the worker idle-scaler calls Scale(appID, 0) to deschedule an idle app + // (~$0 compute), and the wake path calls Scale(appID, 1) to bring it back. + // Implementations on backends without a real Deployment concept (noop, + // local-dev without a cluster) should return nil after a slog.Warn — the + // DB-only state flip (deployments.scaled_to_zero) is the user-visible + // change. 
A NotFound Deployment is NOT an error: an app whose namespace + // was already torn down is treated as a no-op so a stale idle row can't + // wedge the scaler. + Scale(ctx context.Context, appID string, replicas int32) error + // UpdateAccessControl patches the access-control annotations on an // existing deploy's Ingress in place — no image rebuild, no pod restart. // Backs PATCH /api/v1/deployments/:id for the private + allowed_ips diff --git a/internal/router/route_donebar_guard_test.go b/internal/router/route_donebar_guard_test.go index be832995..830ed014 100644 --- a/internal/router/route_donebar_guard_test.go +++ b/internal/router/route_donebar_guard_test.go @@ -304,8 +304,14 @@ var routeTestMap = map[string]string{ // 402 + redeploy CAS guard. Heavy Kaniko-build legs assert the // accepted/contract surface (noop compute), not a live build — deferred to // the W4 e2e specs. Moved here from routeCoverageExemptions. - "PATCH /deploy/:id/env": "TestDeployLifecycle_UpdateEnv_MergesAndRedacts", - "POST /deploy/:id/redeploy": "TestDeployLifecycle_Redeploy_HealthyRow_Accepts202", + "PATCH /deploy/:id/env": "TestDeployLifecycle_UpdateEnv_MergesAndRedacts", + "POST /deploy/:id/redeploy": "TestDeployLifecycle_Redeploy_HealthyRow_Accepts202", + // Scale-to-zero explicit wake (mig 068, Task #54). Covered by the flag-ON + // sqlmock handler suite (deploy_wake_mock_test.go): happy-path scale+DB + // flip+re-read, not-found, cross-team 404, scale-failure 503, DB-error 503, + // requireTeam 401, fetch driver-error 503, re-read fallback. Flag-OFF 501 is + // in deploy_wake_test.go. 
+ "POST /deploy/:id/wake": "TestWake_HappyPath", "PATCH /api/v1/deployments/:id": "TestDeployLifecycle_Patch_Pro_SetsPrivate", "POST /api/v1/deployments/:id/make-permanent": "TestDeployLifecycle_MakePermanent_HappyPath", "POST /api/v1/deployments/:id/ttl": "TestDeployLifecycle_SetTTL_HappyPath", diff --git a/internal/router/router.go b/internal/router/router.go index a117d54c..c32ce757 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -759,6 +759,9 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid deployGroup.Patch("/:id/env", deployH.UpdateEnv) deployGroup.Delete("/:id", deployH.Delete) deployGroup.Post("/:id/redeploy", deployH.Redeploy) + // Scale-to-zero explicit wake (Task #54). Gated by + // DEPLOY_SCALE_TO_ZERO_ENABLED inside the handler → 501 when off. + deployGroup.Post("/:id/wake", deployH.Wake) // Stacks — Phase 6 multi-service. // New/Get/Logs/Delete are anonymous-capable (same model as /db/new etc.). diff --git a/internal/testhelpers/testhelpers.go b/internal/testhelpers/testhelpers.go index f3ecd783..5682ca45 100644 --- a/internal/testhelpers/testhelpers.go +++ b/internal/testhelpers/testhelpers.go @@ -1144,6 +1144,7 @@ func NewTestAppWithServices(t *testing.T, db *sql.DB, rdb *redis.Client, service deployGroup.Patch("/:id/env", deployH.UpdateEnv) deployGroup.Delete("/:id", deployH.Delete) deployGroup.Post("/:id/redeploy", deployH.Redeploy) + deployGroup.Post("/:id/wake", deployH.Wake) // Register role lookup so RequireRole can resolve the caller's role // against the test DB (mirror of the production wiring in router.go). diff --git a/openapi.snapshot.json b/openapi.snapshot.json index 502e7375..a3627d9e 100644 --- a/openapi.snapshot.json +++ b/openapi.snapshot.json @@ -8887,6 +8887,52 @@ "summary": "Redeploy with the latest stored env vars" } }, + "/deploy/{id}/wake": { + "post": { + "description": "Scale-to-zero (Task #54). 
Scales an idle, descheduled app back to one replica and clears its sleeping state. The app becomes reachable once its pod is Ready (a one-time cold start — a request that races the wake gets the ingress's upstream-down response until the pod is up). Idempotent: waking an already-awake app just refreshes its last-activity marker so the idle-scaler won't immediately re-deschedule it. Returns 501 when scale-to-zero is not enabled on the platform (the default). Cross-tenant requests return 404.", + "parameters": [ + { + "description": "Deployment id (UUID or short app_id slug).", + "in": "path", + "name": "id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeployResponse" + } + } + }, + "description": "Deployment woken" + }, + "401": { + "description": "Unauthorized" + }, + "404": { + "description": "Not found (or owned by another team)" + }, + "501": { + "description": "scale_to_zero_disabled — scale-to-zero is not enabled on this platform (default)." + }, + "503": { + "description": "wake_failed — transient failure scaling the app; retry." + } + }, + "security": [ + { + "bearerAuth": [] + } + ], + "summary": "Wake a scaled-to-zero (sleeping) deployment" + } + }, "/healthz": { "get": { "description": "Process-level liveness — returns 200 if the api binary is up and can ping its primary platform DB. Wired to Kubernetes livenessProbe. Use /readyz for deep upstream-reachability checks.",