Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 125 additions & 2 deletions graceful_shutdown_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ import (
"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"instant.dev/internal/handlers"
"instant.dev/internal/router"
)

// pickFreePort returns a TCP port number currently free on localhost. Lets
Expand Down Expand Up @@ -66,7 +69,7 @@ func TestRunServerWithGracefulShutdown_DrainsInflight(t *testing.T) {
// Run the helper in a goroutine — same shape main() uses.
srvErr := make(chan error, 1)
go func() {
srvErr <- runServerWithGracefulShutdown(app, fmt.Sprintf("127.0.0.1:%d", port), 5*time.Second)
srvErr <- runServerWithGracefulShutdown(app, fmt.Sprintf("127.0.0.1:%d", port), 5*time.Second, router.ShutdownHooks{})
}()

// Wait for the listener to bind. Tight retry loop with a generous cap so
Expand Down Expand Up @@ -138,10 +141,130 @@ func TestRunServerWithGracefulShutdown_DrainsInflight(t *testing.T) {
}
}


// TestRunServerWithGracefulShutdown_MarksReadinessDraining — the
// MR-P0-7 readiness contract: on SIGTERM the helper MUST flip
// hooks.Readyz.MarkDraining BEFORE Fiber's ShutdownWithTimeout closes
// the listener. Without this, the kubelet's readinessProbe keeps
// returning 200 right up to SIGKILL and the Service keeps routing new
// traffic to a pod that is about to stop accepting connections.
func TestRunServerWithGracefulShutdown_MarksReadinessDraining(t *testing.T) {
port := pickFreePort(t)
app := fiber.New(fiber.Config{DisableStartupMessage: true})
app.Get("/ping", func(c *fiber.Ctx) error { return c.SendString("ok") })

readyzH := &handlers.ReadyzHandler{}
require.False(t, readyzH.IsDraining(), "fresh ReadyzHandler must not start in draining state")

srvErr := make(chan error, 1)
go func() {
srvErr <- runServerWithGracefulShutdown(
app,
fmt.Sprintf("127.0.0.1:%d", port),
3*time.Second,
router.ShutdownHooks{Readyz: readyzH},
)
}()

require.Eventually(t, func() bool {
conn, err := net.DialTimeout("tcp", fmt.Sprintf("127.0.0.1:%d", port), 100*time.Millisecond)
if err != nil {
return false
}
_ = conn.Close()
return true
}, 3*time.Second, 25*time.Millisecond, "server never bound to :%d", port)

require.NoError(t, syscall.Kill(os.Getpid(), syscall.SIGTERM))

require.Eventually(t, readyzH.IsDraining,
2*time.Second, 10*time.Millisecond,
"hooks.Readyz.MarkDraining was never called — readinessProbe will keep returning 200 (MR-P0-7 regression)")

select {
case sErr := <-srvErr:
assert.NoError(t, sErr, "clean SIGTERM drain must return nil")
case <-time.After(8 * time.Second):
t.Fatal("runServerWithGracefulShutdown never returned after the drain")
}
assert.True(t, readyzH.IsDraining(), "drain flag is single-shot, never un-flipped")
}

// TestRunServerWithGracefulShutdown_TimeoutKillsStuckRequest — the
// MR-P0-7 timeout contract: a request that never returns MUST NOT
// block the helper past shutdownTimeout. ShutdownWithTimeout returns a
// non-nil error which the helper surfaces, so the process exits in
// bounded time instead of being SIGKILLed by the kubelet.
func TestRunServerWithGracefulShutdown_TimeoutKillsStuckRequest(t *testing.T) {
port := pickFreePort(t)
app := fiber.New(fiber.Config{DisableStartupMessage: true})

stuck := make(chan struct{})
defer close(stuck)
requestStarted := make(chan struct{}, 1)
app.Get("/stuck", func(c *fiber.Ctx) error {
select {
case requestStarted <- struct{}{}:
default:
}
<-stuck
return nil
})

const tinyTimeout = 500 * time.Millisecond
srvErr := make(chan error, 1)
go func() {
srvErr <- runServerWithGracefulShutdown(
app,
fmt.Sprintf("127.0.0.1:%d", port),
tinyTimeout,
router.ShutdownHooks{},
)
}()

require.Eventually(t, func() bool {
conn, err := net.DialTimeout("tcp", fmt.Sprintf("127.0.0.1:%d", port), 100*time.Millisecond)
if err != nil {
return false
}
_ = conn.Close()
return true
}, 3*time.Second, 25*time.Millisecond, "server never bound to :%d", port)

clientCtx, cancelClient := context.WithCancel(context.Background())
defer cancelClient()
go func() {
req, _ := http.NewRequestWithContext(clientCtx, http.MethodGet,
fmt.Sprintf("http://127.0.0.1:%d/stuck", port), nil)
_, _ = http.DefaultClient.Do(req)
}()

select {
case <-requestStarted:
case <-time.After(2 * time.Second):
t.Fatal("stuck handler never started — setup is broken, not the SUT")
}

start := time.Now()
require.NoError(t, syscall.Kill(os.Getpid(), syscall.SIGTERM))

select {
case sErr := <-srvErr:
elapsed := time.Since(start)
assert.Error(t, sErr,
"stuck request must surface ShutdownWithTimeout's non-nil return so operators can grep server.graceful_shutdown_failed")
// readinessDrainGrace (3s) + tinyTimeout (0.5s) + slack ≤ ~6s.
assert.Less(t, elapsed, 6*time.Second,
"helper took %s — timeout path is broken; a real pod would be SIGKILLed", elapsed)
case <-time.After(10 * time.Second):
t.Fatal("runServerWithGracefulShutdown blocked past shutdownTimeout — the kubelet would SIGKILL this pod")
}
}

// Compile-time guard against a regression that removes the helper or changes
// its signature in a way that would silently bypass the MR-P0-7 fix.
var _ = func(app *fiber.App) error {
return runServerWithGracefulShutdown(app, ":0", time.Second)
return runServerWithGracefulShutdown(app, ":0", time.Second, router.ShutdownHooks{})
}

// sync.WaitGroup import-guard so a future test that adds goroutines can rely
Expand Down
31 changes: 31 additions & 0 deletions internal/handlers/readyz.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ import (
"database/sql"
"encoding/base64"
"net/http"
"sync/atomic"
"time"

"github.com/gofiber/fiber/v2"
"github.com/redis/go-redis/v9"

"instant.dev/common/buildinfo"
"instant.dev/common/readiness"
"instant.dev/internal/config"
"instant.dev/internal/metrics"
Expand All @@ -53,6 +55,8 @@ type ReadyzHandler struct {
// probes. Sharing it avoids spinning up a new transport per probe
// (each transport leaks an idle connection pool until GC).
http *http.Client
// draining flips to true on graceful shutdown (MR-P0-7).
draining atomic.Bool
}

// NewReadyzHandler wires the runner. Pass the same db/rdb/cfg/prov the
Expand Down Expand Up @@ -184,13 +188,40 @@ func (h *ReadyzHandler) buildChecks() []readiness.Check {
// hood but adapts the net/http body to Fiber's response writer.
//
// Mounted at GET /readyz in router.go.
//
// When draining (MarkDraining called during graceful shutdown), Get
// short-circuits to 503 + overall=failed so the kubelet's readiness
// probe pulls the pod from Service endpoints before the listener
// stops accepting new connections (MR-P0-7).
func (h *ReadyzHandler) Get(c *fiber.Ctx) error {
if h.draining.Load() {
c.Set("Cache-Control", "no-store")
c.Status(http.StatusServiceUnavailable)
return c.JSON(readiness.Response{
Overall: readiness.StatusFailed,
Service: "instant-api",
CommitID: buildinfo.GitSHA,
Checks: []readiness.CheckResult{{
Name: "shutting_down",
Status: readiness.StatusFailed,
LastError: "draining",
LastCheckAt: time.Now(),
}},
})
}
resp, code := h.runner.Run(c.UserContext())
c.Set("Cache-Control", "no-store")
c.Status(code)
return c.JSON(resp)
}

// MarkDraining flips the handler into drain mode. Subsequent /readyz
// probes return 503 + overall=failed. Idempotent.
func (h *ReadyzHandler) MarkDraining() { h.draining.Store(true) }

// IsDraining reports whether MarkDraining has been called.
func (h *ReadyzHandler) IsDraining() bool { return h.draining.Load() }

// customerDBCheck builds a CheckFunc that opens a one-shot pool against
// the customer DB. The pool is closed on every call — the cache window
// keeps the open-rate low (one dial per 10s under default config).
Expand Down
72 changes: 72 additions & 0 deletions internal/handlers/readyz_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/alicebob/miniredis/v2"
"github.com/gofiber/fiber/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"instant.dev/common/readiness"
Expand Down Expand Up @@ -214,3 +215,74 @@ func TestReadyz_ResponseShape(t *testing.T) {
require.Equal(t, "no-store", resp.Header.Get("Cache-Control"),
"/readyz responses MUST be no-store to prevent probe staleness")
}


// TestReadyz_DrainingReturns503 — MR-P0-7 contract. Once MarkDraining
// is called by the graceful-shutdown signal handler, GET /readyz MUST
// short-circuit to 503 + overall=failed + a single shutting_down
// check. The kubelet sees 503, pulls the pod from Service endpoints,
// and new traffic stops landing on a pod about to close its listener.
//
// We pre-record NO sqlmock expectations: when draining is set the
// runner must NOT be consulted. If a future refactor reorders the
// drain check, the un-met sqlmock pings (or a real upstream call)
// would surface here.
func TestReadyz_DrainingReturns503(t *testing.T) {
db, _, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
require.NoError(t, err)
defer db.Close()
// Intentionally no mock.ExpectPing() — runner must NOT run.

mr := miniredis.RunT(t)
rdb := redis.NewClient(&redis.Options{Addr: mr.Addr()})
defer rdb.Close()

h := handlers.NewReadyzHandler(&config.Config{Environment: "test"}, db, rdb, nil)

require.False(t, h.IsDraining(), "fresh handler must not start in draining state")
h.MarkDraining()
require.True(t, h.IsDraining(), "MarkDraining must flip the flag immediately")

app := fiber.New()
app.Get("/readyz", h.Get)

resp, err := app.Test(httptest.NewRequest("GET", "/readyz", nil))
require.NoError(t, err)
defer resp.Body.Close()

assert.Equal(t, http.StatusServiceUnavailable, resp.StatusCode,
"draining /readyz MUST return 503 so the kubelet pulls the pod from the Service")
assert.Equal(t, "no-store", resp.Header.Get("Cache-Control"),
"draining response stays no-store — a cached 503 would persist past the next deploy")

body, _ := io.ReadAll(resp.Body)
var got readiness.Response
require.NoError(t, json.Unmarshal(body, &got))

assert.Equal(t, readiness.StatusFailed, got.Overall, "draining overall must be 'failed'")
assert.Equal(t, "instant-api", got.Service)

require.Len(t, got.Checks, 1, "draining must surface a single shutting_down check, not the full registry")
assert.Equal(t, "shutting_down", got.Checks[0].Name)
assert.Equal(t, readiness.StatusFailed, got.Checks[0].Status)
assert.Equal(t, "draining", got.Checks[0].LastError)
}

// TestReadyz_DrainingIsIdempotent — MarkDraining is single-shot but
// safe to call multiple times. The graceful-shutdown sequence may, in
// theory, re-enter (a sibling SIGTERM, a panic-recovered shutdown
// handler); the second call must no-op rather than panic or double-log.
func TestReadyz_DrainingIsIdempotent(t *testing.T) {
db, _, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
require.NoError(t, err)
defer db.Close()

mr := miniredis.RunT(t)
rdb := redis.NewClient(&redis.Options{Addr: mr.Addr()})
defer rdb.Close()

h := handlers.NewReadyzHandler(&config.Config{Environment: "test"}, db, rdb, nil)
h.MarkDraining()
h.MarkDraining()
assert.True(t, h.IsDraining())
}
20 changes: 19 additions & 1 deletion internal/router/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,30 @@ import (
"instant.dev/internal/razorpaybilling"
)

// ShutdownHooks bundles handlers that participate in graceful shutdown
// (MR-P0-7). Today: Readyz.MarkDraining — flips /readyz to 503 so the
// kubelet pulls the pod from the Service endpoint list before the
// listener stops accepting connections.
type ShutdownHooks struct {
Readyz *handlers.ReadyzHandler
}

// New creates and configures the Fiber application with all middleware and routes registered.
//
// nrApp may be nil — the New Relic Go agent fails open when no license
// key is set (local dev, CI). The NewRelic Fiber middleware degrades to
// a no-op in that case, so the rest of the chain is unaffected.
//
// Legacy entrypoint — existing tests use it. Production main.go uses
// NewWithHooks so the graceful-shutdown wiring has the ReadyzHandler.
func New(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *middleware.GeoDBs, emailClient *email.Client, planRegistry *plans.Registry, provClient *provisioner.Client, nrApp *newrelic.Application) *fiber.App {
app, _ := NewWithHooks(cfg, db, rdb, geoDbs, emailClient, planRegistry, provClient, nrApp)
return app
}

// NewWithHooks is the production entrypoint — returns both the Fiber
// app and the ShutdownHooks needed for graceful shutdown.
func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *middleware.GeoDBs, emailClient *email.Client, planRegistry *plans.Registry, provClient *provisioner.Client, nrApp *newrelic.Application) (*fiber.App, ShutdownHooks) {
app := fiber.New(fiber.Config{
// Disable default error handler — we write our own JSON errors.
// Routes that go through respondError write their own body and
Expand Down Expand Up @@ -1127,7 +1145,7 @@ func New(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *middleware.G
internal.Post("/set-tier", handlers.NewSetTierHandler(db))
}

return app
return app, ShutdownHooks{Readyz: readyzH}
}

// isolationLabel maps the storage backend to a human-readable isolation
Expand Down
Loading
Loading