Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@ On merge, CI will:

## [Unreleased]

_Add unreleased changes here._
### Fixed

- App, worker, and analysis binaries no longer `Fatal` on the first Redis `PING`
failure at startup. The ping is now wrapped in a bounded retry loop (30 s
total, 3 s per attempt, capped exponential backoff) so the binary rides out
the Upstash-on-Fly cold-start window that briefly closes connections with EOF
on freshly-provisioned review apps. Production behaviour is unchanged — a
healthy Redis still succeeds on the first attempt and persistent
misconfiguration still fails fast. Resolves the recurring EOF burst on every
PR preview deploy (Sentry: HOVER-JX, HOVER-MD, HOVER-JZ).

## Full changelog history

Expand Down
2 changes: 1 addition & 1 deletion cmd/analysis/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func main() {
analysisLog.Fatal("failed to create Redis client", "error", err)
}
defer redisClient.Close()
if err := redisClient.Ping(context.Background()); err != nil {
if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil {
analysisLog.Fatal("failed to ping Redis", "error", err)
}
analysisLog.Info("connected to Redis")
Expand Down
2 changes: 1 addition & 1 deletion cmd/app/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@ func main() {
redisClient = client
defer redisClient.Close()

if err := redisClient.Ping(context.Background()); err != nil {
if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil {
startupLog.Fatal("failed to ping Redis", "error", err)
}
startupLog.Info("connected to Redis")
Expand Down
2 changes: 1 addition & 1 deletion cmd/worker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ func main() {
}
defer redisClient.Close()

if err := redisClient.Ping(context.Background()); err != nil {
if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil {
workerLog.Fatal("failed to ping Redis", "error", err)
}
workerLog.Info("connected to Redis")
Expand Down
62 changes: 62 additions & 0 deletions internal/broker/redis.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,68 @@ func (c *Client) Ping(ctx context.Context) error {
return c.rdb.Ping(ctx).Err()
}

// PingWithRetry retries Ping with capped exponential backoff until the
// total budget elapses. Each attempt uses its own perAttempt timeout so
// a hung server can't stall the loop. Returns nil on first success or
// the last error if the budget is exhausted. Exists because Upstash-on-Fly
// review apps occasionally close the first PING with EOF during cold
// start, which used to Fatal the binary on boot (see HOVER-JX/MD/JZ).
func (c *Client) PingWithRetry(ctx context.Context, total, perAttempt time.Duration) error {
return pingWithRetry(ctx, total, perAttempt, c.Ping)
}

func pingWithRetry(ctx context.Context, total, perAttempt time.Duration, ping func(context.Context) error) error {
deadline := time.Now().Add(total)
backoff := 100 * time.Millisecond
const maxBackoff = 2 * time.Second

var lastErr error
for {
remaining := time.Until(deadline)
if remaining <= 0 {
if lastErr != nil {
return lastErr
}
return context.DeadlineExceeded
}

attemptTimeout := perAttempt
if attemptTimeout > remaining {
attemptTimeout = remaining
}
attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout)
err := ping(attemptCtx)
cancel()
if err == nil {
return nil
}
lastErr = err

if ctx.Err() != nil {
return ctx.Err()
}

sleep := backoff
if remaining := time.Until(deadline); sleep > remaining {
sleep = remaining
}
if sleep <= 0 {
return lastErr
}

select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(sleep):
}

backoff *= 2
if backoff > maxBackoff {
backoff = maxBackoff
}
}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

func (c *Client) Close() error {
return c.rdb.Close()
}
Expand Down
60 changes: 60 additions & 0 deletions internal/broker/redis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,74 @@ package broker

import (
"context"
"errors"
"strings"
"testing"
"time"

"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestPingWithRetry covers the helper that wraps Ping with a bounded
// retry loop. It uses a stub ping function so the test stays
// hermetic and fast — no miniredis, no sleeps longer than the
// helper's first backoff (100 ms).
func TestPingWithRetry(t *testing.T) {
t.Run("immediate success", func(t *testing.T) {
var calls int
err := pingWithRetry(context.Background(), time.Second, 100*time.Millisecond,
func(context.Context) error { calls++; return nil })
require.NoError(t, err)
assert.Equal(t, 1, calls)
})

t.Run("succeeds after transient errors", func(t *testing.T) {
var calls int
err := pingWithRetry(context.Background(), 2*time.Second, 100*time.Millisecond,
func(context.Context) error {
calls++
if calls < 3 {
return errors.New("EOF")
}
return nil
})
require.NoError(t, err)
assert.Equal(t, 3, calls)
})

t.Run("exhausts budget returning last error", func(t *testing.T) {
want := errors.New("still EOF")
err := pingWithRetry(context.Background(), 250*time.Millisecond, 50*time.Millisecond,
func(context.Context) error { return want })
require.ErrorIs(t, err, want)
})

t.Run("aborts on context cancellation", func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
cancel()
err := pingWithRetry(ctx, time.Second, 100*time.Millisecond,
func(context.Context) error { return errors.New("boom") })
require.ErrorIs(t, err, context.Canceled)
})

// Guards against a regression where perAttempt > total let the
// final attempt run past the overall budget. The ping function
// blocks until its per-attempt context expires, so without the
// clamp the call would take perAttempt rather than total.
t.Run("clamps attempt timeout to remaining budget", func(t *testing.T) {
start := time.Now()
err := pingWithRetry(context.Background(), 80*time.Millisecond, time.Second,
func(ctx context.Context) error {
<-ctx.Done()
return ctx.Err()
})
require.Error(t, err)
assert.LessOrEqual(t, time.Since(start), 250*time.Millisecond)
})
}

// TestClient_ClearAll seeds every prefix the broker owns plus an
// unrelated key, then asserts ClearAll wipes only the hover:* keys.
func TestClient_ClearAll(t *testing.T) {
Expand Down
Loading