From df771e6f5f7dbb6b8fe1823cd6d9f396b4c8abcb Mon Sep 17 00:00:00 2001 From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com> Date: Tue, 12 May 2026 08:40:52 +1000 Subject: [PATCH 1/3] Retry Redis ping on startup --- cmd/analysis/main.go | 2 +- cmd/app/main.go | 2 +- cmd/worker/main.go | 2 +- internal/broker/redis.go | 45 +++++++++++++++++++++++++++++++++++ internal/broker/redis_test.go | 45 +++++++++++++++++++++++++++++++++++ 5 files changed, 93 insertions(+), 3 deletions(-) diff --git a/cmd/analysis/main.go b/cmd/analysis/main.go index edb96626..102c98fc 100644 --- a/cmd/analysis/main.go +++ b/cmd/analysis/main.go @@ -95,7 +95,7 @@ func main() { analysisLog.Fatal("failed to create Redis client", "error", err) } defer redisClient.Close() - if err := redisClient.Ping(context.Background()); err != nil { + if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil { analysisLog.Fatal("failed to ping Redis", "error", err) } analysisLog.Info("connected to Redis") diff --git a/cmd/app/main.go b/cmd/app/main.go index fcd9105c..355a554c 100644 --- a/cmd/app/main.go +++ b/cmd/app/main.go @@ -547,7 +547,7 @@ func main() { redisClient = client defer redisClient.Close() - if err := redisClient.Ping(context.Background()); err != nil { + if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil { startupLog.Fatal("failed to ping Redis", "error", err) } startupLog.Info("connected to Redis") diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 310fb24f..549ba421 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -115,7 +115,7 @@ func main() { } defer redisClient.Close() - if err := redisClient.Ping(context.Background()); err != nil { + if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil { workerLog.Fatal("failed to ping Redis", "error", err) } workerLog.Info("connected to Redis") diff --git a/internal/broker/redis.go b/internal/broker/redis.go index 053a9dad..8619f072 100644 --- a/internal/broker/redis.go +++ b/internal/broker/redis.go @@ -77,6 +77,51 @@ func (c *Client) Ping(ctx context.Context) error { return c.rdb.Ping(ctx).Err() } +// PingWithRetry retries Ping with capped exponential backoff until the +// total budget elapses. Each attempt uses its own perAttempt timeout so +// a hung server can't stall the loop. Returns nil on first success or +// the last error if the budget is exhausted. Exists because Upstash-on-Fly +// review apps occasionally close the first PING with EOF during cold +// start, which used to Fatal the binary on boot (see HOVER-JX/MD/JZ). +func (c *Client) PingWithRetry(ctx context.Context, total, perAttempt time.Duration) error { + return pingWithRetry(ctx, total, perAttempt, c.Ping) +} + +func pingWithRetry(ctx context.Context, total, perAttempt time.Duration, ping func(context.Context) error) error { + deadline := time.Now().Add(total) + backoff := 100 * time.Millisecond + const maxBackoff = 2 * time.Second + + var lastErr error + for { + attemptCtx, cancel := context.WithTimeout(ctx, perAttempt) + err := ping(attemptCtx) + cancel() + if err == nil { + return nil + } + lastErr = err + + if ctx.Err() != nil { + return ctx.Err() + } + if !time.Now().Add(backoff).Before(deadline) { + return lastErr + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(backoff): + } + + backoff *= 2 + if backoff > maxBackoff { + backoff = maxBackoff + } + } +} + func (c *Client) Close() error { return c.rdb.Close() } diff --git a/internal/broker/redis_test.go b/internal/broker/redis_test.go index 82c8bb75..de8d7853 100644 --- a/internal/broker/redis_test.go +++ b/internal/broker/redis_test.go @@ -2,14 +2,59 @@ package broker import ( "context" + "errors" "strings" "testing" + "time" "github.com/redis/go-redis/v9" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) +// TestPingWithRetry covers the helper that wraps Ping with a bounded +// retry loop. It uses a stub ping function so the test stays +// hermetic and fast — no miniredis, no sleeps longer than the +// helper's first backoff (100 ms). +func TestPingWithRetry(t *testing.T) { + t.Run("immediate success", func(t *testing.T) { + var calls int + err := pingWithRetry(context.Background(), time.Second, 100*time.Millisecond, + func(context.Context) error { calls++; return nil }) + require.NoError(t, err) + assert.Equal(t, 1, calls) + }) + + t.Run("succeeds after transient errors", func(t *testing.T) { + var calls int + err := pingWithRetry(context.Background(), 2*time.Second, 100*time.Millisecond, + func(context.Context) error { + calls++ + if calls < 3 { + return errors.New("EOF") + } + return nil + }) + require.NoError(t, err) + assert.Equal(t, 3, calls) + }) + + t.Run("exhausts budget returning last error", func(t *testing.T) { + want := errors.New("still EOF") + err := pingWithRetry(context.Background(), 250*time.Millisecond, 50*time.Millisecond, + func(context.Context) error { return want }) + require.ErrorIs(t, err, want) + }) + + t.Run("aborts on context cancellation", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + err := pingWithRetry(ctx, time.Second, 100*time.Millisecond, + func(context.Context) error { return errors.New("boom") }) + require.ErrorIs(t, err, context.Canceled) + }) +} + // TestClient_ClearAll seeds every prefix the broker owns plus an // unrelated key, then asserts ClearAll wipes only the hover:* keys. func TestClient_ClearAll(t *testing.T) { From 4338f2eedadf23d7b7e676f3bfe85d45708c9e25 Mon Sep 17 00:00:00 2001 From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com> Date: Tue, 12 May 2026 08:45:22 +1000 Subject: [PATCH 2/3] Add changelog entry for Redis retry --- CHANGELOG.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa274966..37dddae2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,16 @@ On merge, CI will: ## [Unreleased] -_Add unreleased changes here._ +### Fixed + +- App, worker, and analysis binaries no longer `Fatal` on the first Redis `PING` + failure at startup. The ping is now wrapped in a bounded retry loop (30 s + total, 3 s per attempt, capped exponential backoff) so the binary rides out + the Upstash-on-Fly cold-start window that briefly closes connections with EOF + on freshly-provisioned review apps. Production behaviour is unchanged — a + healthy Redis still succeeds on the first attempt and persistent + misconfiguration still fails fast. Resolves the recurring EOF burst on every + PR preview deploy (Sentry: HOVER-JX, HOVER-MD, HOVER-JZ). ## Full changelog history From b79c830f52562d21e3692ca16227e019909515d4 Mon Sep 17 00:00:00 2001 From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com> Date: Tue, 12 May 2026 08:53:05 +1000 Subject: [PATCH 3/3] Clamp Redis retry to total budget --- internal/broker/redis.go | 23 ++++++++++++++++++++--- internal/broker/redis_test.go | 15 +++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/internal/broker/redis.go b/internal/broker/redis.go index 8619f072..f14d92d2 100644 --- a/internal/broker/redis.go +++ b/internal/broker/redis.go @@ -94,7 +94,19 @@ func pingWithRetry(ctx context.Context, total, perAttempt time.Duration, ping fu var lastErr error for { - attemptCtx, cancel := context.WithTimeout(ctx, perAttempt) + remaining := time.Until(deadline) + if remaining <= 0 { + if lastErr != nil { + return lastErr + } + return context.DeadlineExceeded + } + + attemptTimeout := perAttempt + if attemptTimeout > remaining { + attemptTimeout = remaining + } + attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout) err := ping(attemptCtx) cancel() if err == nil { @@ -105,14 +117,19 @@ func pingWithRetry(ctx context.Context, total, perAttempt time.Duration, ping fu if ctx.Err() != nil { return ctx.Err() } - if !time.Now().Add(backoff).Before(deadline) { + + sleep := backoff + if remaining := time.Until(deadline); sleep > remaining { + sleep = remaining + } + if sleep <= 0 { return lastErr } select { case <-ctx.Done(): return ctx.Err() - case <-time.After(backoff): + case <-time.After(sleep): } backoff *= 2 diff --git a/internal/broker/redis_test.go b/internal/broker/redis_test.go index de8d7853..cd72bfcc 100644 --- a/internal/broker/redis_test.go +++ b/internal/broker/redis_test.go @@ -53,6 +53,21 @@ func TestPingWithRetry(t *testing.T) { func(context.Context) error { return errors.New("boom") }) require.ErrorIs(t, err, context.Canceled) }) + + // Guards against a regression where perAttempt > total let the + // final attempt run past the overall budget. The ping function + // blocks until its per-attempt context expires, so without the + // clamp the call would take perAttempt rather than total. + t.Run("clamps attempt timeout to remaining budget", func(t *testing.T) { + start := time.Now() + err := pingWithRetry(context.Background(), 80*time.Millisecond, time.Second, + func(ctx context.Context) error { + <-ctx.Done() + return ctx.Err() + }) + require.Error(t, err) + assert.LessOrEqual(t, time.Since(start), 250*time.Millisecond) + }) } // TestClient_ClearAll seeds every prefix the broker owns plus an