HA: Give up retrying after 5 minutes
Since we are now retrying every database error,
we also need to set a retry timeout.
lippserd committed Apr 10, 2024
1 parent 9393333 commit 81085c0
Showing 2 changed files with 56 additions and 0 deletions.
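A minimal, standalone sketch of the pattern the commit introduces in HA.controller() below: a retry-timeout timer bounds how long consecutive failing iterations may go on, it is reset whenever an iteration succeeds, and the loop gives up once it fires. All names here (controlLoop, events, process) are hypothetical stand-ins, not the actual Icinga DB code.

package main

import (
	"errors"
	"fmt"
	"time"
)

const retryTimeoutDuration = 5 * time.Minute

// controlLoop is a hypothetical stand-in for HA.controller(): events delivers
// work items and process may fail transiently. Failing iterations leave the
// retry timer running; successful ones reset it to the full window.
func controlLoop(events <-chan int, process func(int) error) error {
	retryTimeout := time.NewTimer(retryTimeoutDuration)
	defer retryTimeout.Stop()

	for {
		select {
		case <-retryTimeout.C:
			// No successful iteration within the retry window: give up.
			return errors.New("retry deadline exceeded")
		case ev, ok := <-events:
			if !ok {
				return nil
			}

			if err := process(ev); err != nil {
				// Keep the timer running so that repeated failures
				// eventually hit the deadline above.
				fmt.Println("retrying after error:", err)
				continue
			}

			// Success: give the next iterations the full window again.
			// The commit factors this stop/drain/reset into retry.ResetTimeout.
			if !retryTimeout.Stop() {
				<-retryTimeout.C
			}
			retryTimeout.Reset(retryTimeoutDuration)
		}
	}
}

func main() {
	events := make(chan int, 3)
	for i := 1; i <= 3; i++ {
		events <- i
	}
	close(events)

	err := controlLoop(events, func(ev int) error {
		fmt.Println("processed event", ev)
		return nil
	})
	fmt.Println("loop finished, error:", err)
}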
44 changes: 44 additions & 0 deletions pkg/icingadb/ha.go
@@ -148,8 +148,19 @@ func (h *HA) controller() {
defer routineLogTicker.Stop()
shouldLogRoutineEvents := true

// The retry logic in HA is twofold:
//
// 1) Updating or inserting the instance row based on the current heartbeat must be done within the heartbeat's
// expiration time. Therefore, we pass a deadline ctx to retry.WithBackoff() in realize() that expires earlier
// than our default timeout.
// 2) Since we do not want to exit before our default timeout expires, we have to repeat step 1 until it does.
retryTimeout := time.NewTimer(5 * time.Minute)
defer retryTimeout.Stop()

for {
select {
case <-retryTimeout.C:
h.abort(errors.New("retry deadline exceeded"))
case m := <-h.heartbeat.Events():
if m != nil {
now := time.Now()
@@ -163,8 +174,13 @@ func (h *HA) controller() {
}
if tt.Before(now.Add(-1 * peerTimeout)) {
h.logger.Errorw("Received heartbeat from the past", zap.Time("time", tt))

h.signalHandover("received heartbeat from the past")
h.realizeLostHeartbeat()

// Reset retry timeout so that the next iterations have the full amount of time available again.
retry.ResetTimeout(retryTimeout, 5*time.Minute)

continue
}
s, err := m.Stats().IcingaStatus()
@@ -207,6 +223,10 @@ func (h *HA) controller() {
if errors.Is(err, context.DeadlineExceeded) {
h.signalHandover("instance update/insert deadline exceeded heartbeat expiry time")

// Instance insert/update was not completed by the expiration time of the current heartbeat.
// Pass control back to the loop to try again with the next heartbeat,
// or exit the loop once the retry timeout has expired. Therefore, the retry timeout
// is **not** reset here, so that retries only continue until it expires.
continue
}
if err != nil {
@@ -224,6 +244,14 @@ func (h *HA) controller() {
h.signalHandover("lost heartbeat")
h.realizeLostHeartbeat()
}

// Reset retry timeout so that the next iterations have the full amount of time available again.
// Don't be surprised by the location of this code: yes, the timer is also reset
// after errors that end the loop anyway, where resetting it no longer matters.
// But this is the best place to catch all scenarios in which the timeout needs to be reset.
// And since HA needs quite a bit of refactoring anyway, e.g. to return immediately after calling h.abort(),
// it's fine to have it here for now.
retry.ResetTimeout(retryTimeout, 5*time.Minute)
case <-h.heartbeat.Done():
if err := h.heartbeat.Err(); err != nil {
h.abort(err)
@@ -369,6 +397,22 @@ func (h *HA) realize(
log("Can't update or insert instance. Retrying", zap.Error(err), zap.Uint64("retry count", attempt))
}
},
OnSuccess: func(elapsed time.Duration, attempt uint64, lastErr error) {
if attempt > 0 {
log := h.logger.Debugw

if attempt > 3 {
// We log errors with severity info starting from the fourth attempt (see above),
// so we need to log success with severity info from the fifth attempt.
log = h.logger.Infow
}

log("Instance updated/inserted successfully after error",
zap.Duration("after", elapsed),
zap.Uint64("attempts", attempt+1),
zap.NamedError("recovered_error", lastErr))
}
},
},
)
if err != nil {
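The first half of the twofold retry logic commented above, bounding the instance update/insert by the current heartbeat's expiry, boils down to a deadline context that runs out earlier than the outer 5-minute timer. The following is a hedged standard-library sketch, not the actual realize() implementation; updateInstance, heartbeatTime and peerTimeout are assumed placeholders.

package ha

import (
	"context"
	"time"
)

// realizeOnce is a hypothetical stand-in for a single realize() pass:
// it retries updateInstance only until the current heartbeat expires,
// then reports the context error so the outer loop can decide whether
// to try again with the next heartbeat or give up entirely.
func realizeOnce(
	ctx context.Context,
	updateInstance func(context.Context) error,
	heartbeatTime time.Time,
	peerTimeout time.Duration,
) error {
	// The deadline tracks the heartbeat expiry and therefore runs out
	// earlier than the outer 5-minute retry timeout.
	ctx, cancel := context.WithDeadline(ctx, heartbeatTime.Add(peerTimeout))
	defer cancel()

	for {
		if err := updateInstance(ctx); err == nil {
			return nil
		}

		// Fixed one-second pause instead of retry.WithBackoff's backoff strategy.
		select {
		case <-time.After(time.Second):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}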
12 changes: 12 additions & 0 deletions pkg/retry/retry.go
@@ -90,6 +90,18 @@ func WithBackoff(
}
}

// ResetTimeout changes the possibly expired timer t to expire after duration d.
//
// If the timer has already expired and nothing has been received from its channel yet,
// the channel is drained first so that the reset behaves as if the timer had never expired.
func ResetTimeout(t *time.Timer, d time.Duration) {
if !t.Stop() {
<-t.C
}

t.Reset(d)
}

// Retryable returns true for common errors that are considered retryable,
// i.e. temporary, timeout, DNS, connection refused and reset, host down and unreachable and
// network down and unreachable errors. In addition, any database error is considered retryable.
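For illustration, here is a small self-contained program showing why ResetTimeout drains the channel: a bare t.Reset() on a timer that has already fired would leave the stale expiration in t.C, and the next receive would return immediately instead of after the new duration. The helper is copied locally so the sketch runs on its own; in the repository it lives in pkg/retry.

package main

import (
	"fmt"
	"time"
)

// ResetTimeout mirrors the helper added to pkg/retry/retry.go in this commit.
func ResetTimeout(t *time.Timer, d time.Duration) {
	if !t.Stop() {
		<-t.C
	}

	t.Reset(d)
}

func main() {
	t := time.NewTimer(10 * time.Millisecond)
	time.Sleep(20 * time.Millisecond) // let the timer expire without reading t.C

	// A plain t.Reset(50 * time.Millisecond) here would keep the stale value
	// in t.C, so the receive below would fire immediately.
	ResetTimeout(t, 50*time.Millisecond)

	start := time.Now()
	<-t.C
	fmt.Printf("fired after %v (expected ~50ms)\n", time.Since(start).Round(time.Millisecond))
}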
