forked from ava-labs/avalanchego
-
Notifications
You must be signed in to change notification settings - Fork 4
/
adaptive_timeout_manager.go
325 lines (281 loc) · 9.79 KB
/
adaptive_timeout_manager.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
// Copyright (C) 2019-2022, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.
package timer
import (
"container/heap"
"errors"
"fmt"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/MetalBlockchain/metalgo/ids"
"github.com/MetalBlockchain/metalgo/utils/math"
"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
"github.com/MetalBlockchain/metalgo/utils/wrappers"
)
var (
errNonPositiveHalflife = errors.New("timeout halflife must be positive")
_ heap.Interface = (*timeoutQueue)(nil)
_ AdaptiveTimeoutManager = (*adaptiveTimeoutManager)(nil)
)
type adaptiveTimeout struct {
index int // Index in the wait queue
id ids.RequestID // Unique ID of this timeout
handler func() // Function to execute if timed out
duration time.Duration // How long this timeout was set for
deadline time.Time // When this timeout should be fired
measureLatency bool // Whether this request should impact latency
}
type timeoutQueue []*adaptiveTimeout
func (tq timeoutQueue) Len() int {
return len(tq)
}
func (tq timeoutQueue) Less(i, j int) bool {
return tq[i].deadline.Before(tq[j].deadline)
}
func (tq timeoutQueue) Swap(i, j int) {
tq[i], tq[j] = tq[j], tq[i]
tq[i].index = i
tq[j].index = j
}
// Push adds an item to this priority queue. x must have type *adaptiveTimeout
func (tq *timeoutQueue) Push(x interface{}) {
item := x.(*adaptiveTimeout)
item.index = len(*tq)
*tq = append(*tq, item)
}
// Pop returns the next item in this queue
func (tq *timeoutQueue) Pop() interface{} {
n := len(*tq)
item := (*tq)[n-1]
(*tq)[n-1] = nil // make sure the item is freed from memory
*tq = (*tq)[:n-1]
return item
}
// AdaptiveTimeoutConfig contains the parameters provided to the
// adaptive timeout manager.
type AdaptiveTimeoutConfig struct {
InitialTimeout time.Duration `json:"initialTimeout"`
MinimumTimeout time.Duration `json:"minimumTimeout"`
MaximumTimeout time.Duration `json:"maximumTimeout"`
// Timeout is [timeoutCoefficient] * average response time
// [timeoutCoefficient] must be > 1
TimeoutCoefficient float64 `json:"timeoutCoefficient"`
// Larger halflife --> less volatile timeout
// [timeoutHalfLife] must be positive
TimeoutHalflife time.Duration `json:"timeoutHalflife"`
}
type AdaptiveTimeoutManager interface {
// Start the timeout manager.
// Must be called before any other method.
// Must only be called once.
Dispatch()
// Stop the timeout manager.
// Must only be called once.
Stop()
// Returns the current network timeout duration.
TimeoutDuration() time.Duration
// Registers a timeout for the item with the given [id].
// If the timeout occurs before the item is Removed, [timeoutHandler] is called.
Put(id ids.RequestID, measureLatency bool, timeoutHandler func())
// Remove the timeout associated with [id].
// Its timeout handler will not be called.
Remove(id ids.RequestID)
// ObserveLatency manually registers a response latency.
// We use this to pretend that it a query to a benched validator
// timed out when actually, we never even sent them a request.
ObserveLatency(latency time.Duration)
}
type adaptiveTimeoutManager struct {
lock sync.Mutex
// Tells the time. Can be faked for testing.
clock mockable.Clock
networkTimeoutMetric, avgLatency prometheus.Gauge
numTimeouts prometheus.Counter
numPendingTimeouts prometheus.Gauge
// Averages the response time from all peers
averager math.Averager
// Timeout is [timeoutCoefficient] * average response time
// [timeoutCoefficient] must be > 1
timeoutCoefficient float64
minimumTimeout time.Duration
maximumTimeout time.Duration
currentTimeout time.Duration // Amount of time before a timeout
timeoutMap map[ids.RequestID]*adaptiveTimeout
timeoutQueue timeoutQueue
timer *Timer // Timer that will fire to clear the timeouts
}
func NewAdaptiveTimeoutManager(
config *AdaptiveTimeoutConfig,
metricsNamespace string,
metricsRegister prometheus.Registerer,
) (AdaptiveTimeoutManager, error) {
switch {
case config.InitialTimeout > config.MaximumTimeout:
return nil, fmt.Errorf("initial timeout (%s) > maximum timeout (%s)", config.InitialTimeout, config.MaximumTimeout)
case config.InitialTimeout < config.MinimumTimeout:
return nil, fmt.Errorf("initial timeout (%s) < minimum timeout (%s)", config.InitialTimeout, config.MinimumTimeout)
case config.TimeoutCoefficient < 1:
return nil, fmt.Errorf("timeout coefficient must be >= 1 but got %f", config.TimeoutCoefficient)
case config.TimeoutHalflife <= 0:
return nil, errNonPositiveHalflife
}
tm := &adaptiveTimeoutManager{
networkTimeoutMetric: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "current_timeout",
Help: "Duration of current network timeout in nanoseconds",
}),
avgLatency: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "average_latency",
Help: "Average network latency in nanoseconds",
}),
numTimeouts: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: metricsNamespace,
Name: "timeouts",
Help: "Number of timed out requests",
}),
numPendingTimeouts: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "pending_timeouts",
Help: "Number of pending timeouts",
}),
minimumTimeout: config.MinimumTimeout,
maximumTimeout: config.MaximumTimeout,
currentTimeout: config.InitialTimeout,
timeoutCoefficient: config.TimeoutCoefficient,
timeoutMap: make(map[ids.RequestID]*adaptiveTimeout),
}
tm.timer = NewTimer(tm.timeout)
tm.averager = math.NewAverager(float64(config.InitialTimeout), config.TimeoutHalflife, tm.clock.Time())
errs := &wrappers.Errs{}
errs.Add(
metricsRegister.Register(tm.networkTimeoutMetric),
metricsRegister.Register(tm.avgLatency),
metricsRegister.Register(tm.numTimeouts),
metricsRegister.Register(tm.numPendingTimeouts),
)
return tm, errs.Err
}
func (tm *adaptiveTimeoutManager) TimeoutDuration() time.Duration {
tm.lock.Lock()
defer tm.lock.Unlock()
return tm.currentTimeout
}
func (tm *adaptiveTimeoutManager) Dispatch() {
tm.timer.Dispatch()
}
func (tm *adaptiveTimeoutManager) Stop() {
tm.timer.Stop()
}
func (tm *adaptiveTimeoutManager) Put(id ids.RequestID, measureLatency bool, timeoutHandler func()) {
tm.lock.Lock()
defer tm.lock.Unlock()
tm.put(id, measureLatency, timeoutHandler)
}
// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) put(id ids.RequestID, measureLatency bool, handler func()) {
now := tm.clock.Time()
tm.remove(id, now)
timeout := &adaptiveTimeout{
id: id,
handler: handler,
duration: tm.currentTimeout,
deadline: now.Add(tm.currentTimeout),
measureLatency: measureLatency,
}
tm.timeoutMap[id] = timeout
tm.numPendingTimeouts.Set(float64(len(tm.timeoutMap)))
heap.Push(&tm.timeoutQueue, timeout)
tm.setNextTimeoutTime()
}
func (tm *adaptiveTimeoutManager) Remove(id ids.RequestID) {
tm.lock.Lock()
defer tm.lock.Unlock()
tm.remove(id, tm.clock.Time())
}
// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) remove(id ids.RequestID, now time.Time) {
timeout, exists := tm.timeoutMap[id]
if !exists {
return
}
// Observe the response time to update average network response time.
if timeout.measureLatency {
timeoutRegisteredAt := timeout.deadline.Add(-1 * timeout.duration)
latency := now.Sub(timeoutRegisteredAt)
tm.observeLatencyAndUpdateTimeout(latency, now)
}
// Remove the timeout from the map
delete(tm.timeoutMap, id)
tm.numPendingTimeouts.Set(float64(len(tm.timeoutMap)))
// Remove the timeout from the queue
heap.Remove(&tm.timeoutQueue, timeout.index)
}
// Assumes [tm.lock] is not held.
func (tm *adaptiveTimeoutManager) timeout() {
tm.lock.Lock()
defer tm.lock.Unlock()
now := tm.clock.Time()
for {
// getNextTimeoutHandler returns nil once there is nothing left to remove
timeoutHandler := tm.getNextTimeoutHandler(now)
if timeoutHandler == nil {
break
}
tm.numTimeouts.Inc()
// Don't execute a callback with a lock held
tm.lock.Unlock()
timeoutHandler()
tm.lock.Lock()
}
tm.setNextTimeoutTime()
}
func (tm *adaptiveTimeoutManager) ObserveLatency(latency time.Duration) {
tm.lock.Lock()
defer tm.lock.Unlock()
tm.observeLatencyAndUpdateTimeout(latency, tm.clock.Time())
}
// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) observeLatencyAndUpdateTimeout(latency time.Duration, now time.Time) {
tm.averager.Observe(float64(latency), now)
avgLatency := tm.averager.Read()
tm.currentTimeout = time.Duration(tm.timeoutCoefficient * avgLatency)
if tm.currentTimeout > tm.maximumTimeout {
tm.currentTimeout = tm.maximumTimeout
} else if tm.currentTimeout < tm.minimumTimeout {
tm.currentTimeout = tm.minimumTimeout
}
// Update the metrics
tm.networkTimeoutMetric.Set(float64(tm.currentTimeout))
tm.avgLatency.Set(avgLatency)
}
// Returns the handler function associated with the next timeout.
// If there are no timeouts, or if the next timeout is after [now],
// returns nil.
// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) getNextTimeoutHandler(now time.Time) func() {
if tm.timeoutQueue.Len() == 0 {
return nil
}
nextTimeout := tm.timeoutQueue[0]
if nextTimeout.deadline.After(now) {
return nil
}
tm.remove(nextTimeout.id, now)
return nextTimeout.handler
}
// Calculate the time of the next timeout and set
// the timer to fire at that time.
func (tm *adaptiveTimeoutManager) setNextTimeoutTime() {
if tm.timeoutQueue.Len() == 0 {
// There are no pending timeouts
tm.timer.Cancel()
return
}
now := tm.clock.Time()
nextTimeout := tm.timeoutQueue[0]
timeToNextTimeout := nextTimeout.deadline.Sub(now)
tm.timer.SetTimeoutIn(timeToNextTimeout)
}