Commit d093b5c

get-batch: add Prometheus metrics
* x-moss:
  - start tracking plain objects vs archived files, and
  - throughput (counts + sizes), and
  - operational health (throttling, soft/hard errors)
* remove internal stats but keep CLI observability via CtlMsg/xact.Snap
* tests: fix to override max-soft-errors default

-------

* stats package:
  - group target metrics into 4 categories
  - add get-batch (num-requests, object and file counts/sizes, and more)

Signed-off-by: Alex Aizman <alex.aizman@gmail.com>
1 parent be60bc5 commit d093b5c

File tree: 4 files changed (+258, -172 lines)

ais/test/moss_test.go

Lines changed: 16 additions & 3 deletions
@@ -11,6 +11,7 @@ import (
     "io"
     "os"
     "path"
+    "strconv"
     "strings"
     "sync"
     "testing"
@@ -39,6 +40,7 @@ import (
 const (
     mossMissingPrefix = "missing-"
     mossMissingSuffix = ".nonexistent"
+    mossMissingRatio  = 3 // as in: every 3rd
 )

 type mossConfig struct {
@@ -190,6 +192,8 @@ func TestMoss(t *testing.T) {

     t.Cleanup(stopMossJobs) // in re: ErrLimitedCoexistence

+    oconfig := tools.GetClusterConfig(t)
+
     for _, test := range tests {
         t.Run(test.name(), func(t *testing.T) {
             m := ioContext{
@@ -209,6 +213,15 @@ func TestMoss(t *testing.T) {
             tools.CreateBucket(t, proxyURL, m.bck, nil, true /*cleanup*/)
             m.init(true /*cleanup*/)

+            if test.withMissing {
+                s := strconv.Itoa(numPlainObjs)
+                tools.SetClusterConfig(t, cos.StrKVs{"get_batch.max_soft_errs": s})
+                t.Cleanup(func() {
+                    s := strconv.Itoa(oconfig.GetBatch.MaxSoftErrs)
+                    tools.SetClusterConfig(t, cos.StrKVs{"get_batch.max_soft_errs": s})
+                })
+            }
+
             if test.inputFormat == "" {
                 testMossPlainObjects(t, &m, &test, numPlainObjs)
             } else {
@@ -250,11 +263,11 @@ func testMossPlainObjects(t *testing.T, m *ioContext, test *mossConfig, numObjs
     // Inject missing objects if requested
     if test.withMissing {
         originalEntries := mossIn
-        mossIn = make([]apc.MossIn, 0, len(originalEntries)+len(originalEntries)/3)
+        mossIn = make([]apc.MossIn, 0, len(originalEntries)+len(originalEntries)/mossMissingRatio)

         for i, entry := range originalEntries {
             mossIn = append(mossIn, entry)
-            if i%3 == 0 {
+            if i%mossMissingRatio == 0 {
                 missingEntry := apc.MossIn{
                     ObjName: mossMissingPrefix + trand.String(8),
                 }
@@ -342,7 +355,7 @@ func testMossArchives(t *testing.T, m *ioContext, test *mossConfig, numArchives,
     // Inject missing archive paths if requested
     if test.withMissing {
         for i := range mossIn {
-            if i%3 == 0 {
+            if i%mossMissingRatio == 0 {
                 mossIn[i].ArchPath = trand.String(8) + mossMissingSuffix
             }
         }
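
Aside: the override-and-restore pattern in the TestMoss hunk above generalizes. A minimal sketch, assuming the same tools/cos test helpers and import paths used by the surrounding test code; the package name and the helper name are assumptions, not part of this commit:

package integration // package name assumed (ais/test)

import (
    "testing"

    "github.com/NVIDIA/aistore/cmn/cos"
    "github.com/NVIDIA/aistore/tools"
)

// setClusterConfigForTest applies a single cluster-config override and
// registers a cleanup that restores the previous value when the test ends.
// (Hypothetical helper; TestMoss above inlines the same pattern.)
func setClusterConfigForTest(t *testing.T, key, newVal, prevVal string) {
    tools.SetClusterConfig(t, cos.StrKVs{key: newVal})
    t.Cleanup(func() {
        tools.SetClusterConfig(t, cos.StrKVs{key: prevVal})
    })
}

In TestMoss the override raises get_batch.max_soft_errs to numPlainObjs when withMissing is set, presumably so the injected missing entries stay under the soft-error limit, and the value saved in oconfig is restored on cleanup.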

cmn/config.go

Lines changed: 2 additions & 2 deletions
@@ -2547,7 +2547,7 @@ const (
     numWarmupWorkersDisabled = -1
     numWarmupWorkersDflt     = 2

-    getBatchSoftErrs = 6
+    GetBatchSoftErrsDflt = 6
 )

 func (c *GetBatchConf) WarmupWorkers() int {
@@ -2571,7 +2571,7 @@ func (c *GetBatchConf) Validate() error {
         }
     }
     if c.MaxSoftErrs == 0 {
-        c.MaxSoftErrs = getBatchSoftErrs
+        c.MaxSoftErrs = GetBatchSoftErrsDflt
     } else if c.MaxSoftErrs < 0 {
         return fmt.Errorf("invalid get_batch.max_soft_errs=%d (expecting non-negative integer)", c.MaxSoftErrs)
     }
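
The rename exports the default (GetBatchSoftErrsDflt) so that tests can restore it after overriding; the validation semantics are unchanged: zero selects the default, negative values are rejected. A standalone sketch of that behavior, with illustrative names rather than the actual cmn package code:

package main

import "fmt"

const getBatchSoftErrsDflt = 6 // mirrors cmn.GetBatchSoftErrsDflt

// normalizeMaxSoftErrs models the get_batch.max_soft_errs rules shown in the
// diff above: 0 means "use the default", negative values are invalid.
func normalizeMaxSoftErrs(v int) (int, error) {
    switch {
    case v == 0:
        return getBatchSoftErrsDflt, nil
    case v < 0:
        return 0, fmt.Errorf("invalid get_batch.max_soft_errs=%d (expecting non-negative integer)", v)
    default:
        return v, nil
    }
}

func main() {
    for _, v := range []int{0, 12, -1} {
        n, err := normalizeMaxSoftErrs(v)
        fmt.Println(v, "->", n, err)
    }
}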

stats/target_stats.go

Lines changed: 119 additions & 35 deletions
@@ -31,24 +31,33 @@ import (
 //
 // all error counters must have "err_" prefix (see `errPrefix`)

+//
+// ais target metrics: groups 1 through 4 =====================
+//
+
+// 1. datapath (counters, sizes, latencies) and common errors
 const (
-    // KindCounter & KindSize - always incremented
+    // KindThroughput
+    GetThroughput = "get.bps" // bytes per second
+    PutThroughput = "put.bps" // ditto

-    LruEvictCount = "lru.evict.n"
-    LruEvictSize  = "lru.evict.size"
+    // same as above via `.cumulative`
+    GetSize = "get.size"
+    PutSize = "put.size"

-    CleanupStoreCount = "cleanup.store.n"
-    CleanupStoreSize  = "cleanup.store.size"
+    // common latencies
+    AppendLatency    = "append.ns"
+    GetRedirLatency  = "get.redir.ns"
+    PutRedirLatency  = "put.redir.ns"
+    HeadLatencyTotal = "head.ns.total"

+    // out-of-band
     VerChangeCount = "ver.change.n"
     VerChangeSize  = "ver.change.size"

-    // errors
+    // errors (note common prefix convention)
     ErrPutCksumCount = errPrefix + "put.cksum.n"
-
-    ErrFSHCCount = errPrefix + "fshc.n"
-
-    ErrDloadCount = errPrefix + "dl.n"
+    ErrFSHCCount     = errPrefix + "fshc.n"

     // IO errors (must have ioErrPrefix)
     IOErrGetCount = ioErrPrefix + "get.n"
@@ -69,50 +78,77 @@ const (
     RatelimPutRetryCount        = "ratelim.retry.put.n"
     RatelimPutRetryLatencyTotal = "ratelim.retry.put.ns.total"

-    AppendLatency     = "append.ns"
-    GetRedirLatency   = "get.redir.ns"
-    PutRedirLatency   = "put.redir.ns"
-    DloadLatencyTotal = "dl.ns.total"
-    HeadLatencyTotal  = "head.ns.total"
+    // compare w/ common `DeleteCount`
+    RemoteDeletedDelCount = core.RemoteDeletedDelCount
+)
+
+// 2. object metadata in memory
+const (
+    LcacheCollisionCount = core.LcacheCollisionCount
+    LcacheEvictedCount   = core.LcacheEvictedCount
+    LcacheErrCount       = core.LcacheErrCount
+    LcacheFlushColdCount = core.LcacheFlushColdCount
+)
+
+// 3. xactions (jobs)
+const (
+    // blob downloader
+    GetBlobSize = "getblob.size"
+
+    // LRU eviction
+    LruEvictCount = "lru.evict.n"
+    LruEvictSize  = "lru.evict.size"

-    // Dsort
+    // space cleanup
+    CleanupStoreCount = "cleanup.store.n"
+    CleanupStoreSize  = "cleanup.store.size"
+
+    // distributed sort (ext/dsort)
     DsortCreationReqCount    = "dsort.creation.req.n"
     DsortCreationRespCount   = "dsort.creation.resp.n"
     DsortCreationRespLatency = "dsort.creation.resp.ns"
     DsortExtractShardDskCnt  = "dsort.extract.shard.dsk.n"
     DsortExtractShardMemCnt  = "dsort.extract.shard.mem.n"
     DsortExtractShardSize    = "dsort.extract.shard.size" // uncompressed

-    // ETL
+    // ETL (ext/etl)
     ETLInlineCount         = "etl.inline.n"
     ETLInlineLatencyTotal  = "etl.inline.ns.total"
     ETLInlineSize          = "etl.inline.size"
     ETLOfflineCount        = "etl.offline.n"
     ETLOfflineLatencyTotal = "etl.offline.ns.total"
     ETLOfflineSize         = "etl.offline.size"

-    // Downloader
-    DloadSize = "dl.size"
-
-    // KindThroughput
-    GetThroughput = "get.bps" // bytes per second
-    PutThroughput = "put.bps" // ditto
+    // downloader (ext/dload)
+    // (not to confuse with blob downloader)
+    DloadSize         = "dl.size"
+    DloadLatencyTotal = "dl.ns.total"
+    ErrDloadCount     = errPrefix + "dl.n"

-    // same as above via `.cumulative`
-    GetSize = "get.size"
-    PutSize = "put.size"
+    // get-batch (x-moss)
+    GetBatchCount     = "getbatch.n"
+    GetBatchObjCount  = "getbatch.obj.n"
+    GetBatchFileCount = "getbatch.file.n"
+    GetBatchObjSize   = "getbatch.obj.size"
+    GetBatchFileSize  = "getbatch.file.size"

-    GetBlobSize = "getblob.size"
+    GetBatchRxWaitTotal   = "getbatch.rxwait.ns"
+    GetBatchThrottleTotal = "getbatch.throttle.ns"

-    // core
-    RemoteDeletedDelCount = core.RemoteDeletedDelCount // compare w/ common `DeleteCount`
+    ErrGetBatchCount     = errPrefix + "getbatch.n"
+    GetBatchSoftErrCount = errPrefix + "soft.getbatch.n"
+)

-    LcacheCollisionCount = core.LcacheCollisionCount
-    LcacheEvictedCount   = core.LcacheEvictedCount
-    LcacheErrCount       = core.LcacheErrCount
-    LcacheFlushColdCount = core.LcacheFlushColdCount
+// 4. streams (peer-to-peer long-lived connections)
+const (
+    _ = cos.StreamsOutObjCount
+    _ = cos.StreamsOutObjSize
+    _ = cos.StreamsInObjCount
+    _ = cos.StreamsInObjSize
+)

-    // variable label used for prometheus disk metrics
+// variable label used for prometheus disk metrics
+const (
     diskMetricLabel = "disk"
 )

@@ -420,7 +456,7 @@ func (r *Trunner) RegMetrics(snode *meta.Snode) {
         },
     )

-    // streams
+    // streams: peer-to-peer long-lived connections
     r.reg(snode, cos.StreamsOutObjCount, KindCounter,
         &Extra{
             Help: "intra-cluster streaming communications: number of sent objects",
@@ -442,6 +478,7 @@ func (r *Trunner) RegMetrics(snode *meta.Snode) {
         },
     )

+    // downloader (ext/dload)
     r.reg(snode, DloadSize, KindSize,
         &Extra{
             Help: "total downloaded size (bytes)",
@@ -569,6 +606,53 @@ func (r *Trunner) RegMetrics(snode *meta.Snode) {
             Help: "number of times a LOM from cache was written to stable storage (core, internal)",
         },
     )
+
+    // get-batch (x-moss)
+    r.reg(snode, GetBatchCount, KindCounter,
+        &Extra{
+            Help: "total number of get-batch requests (work items)",
+        },
+    )
+    r.reg(snode, GetBatchObjCount, KindCounter,
+        &Extra{
+            Help: "get-batch: total number of whole objects retrieved and delivered via output archive",
+        },
+    )
+    r.reg(snode, GetBatchFileCount, KindCounter,
+        &Extra{
+            Help: "get-batch: total number of files extracted from shards and delivered via output archive",
+        },
+    )
+    r.reg(snode, GetBatchObjSize, KindSize,
+        &Extra{
+            Help: "get-batch: total cumulative size (bytes) of whole objects",
+        },
+    )
+    r.reg(snode, GetBatchFileSize, KindSize,
+        &Extra{
+            Help: "get-batch: total cumulative size (bytes) of archived files extracted from shards",
+        },
+    )
+    r.reg(snode, GetBatchRxWaitTotal, KindTotal,
+        &Extra{
+            Help: "get-batch: total cumulative time (nanoseconds) spent waiting to receive entries from peer targets",
+        },
+    )
+    r.reg(snode, GetBatchThrottleTotal, KindTotal,
+        &Extra{
+            Help: "get-batch: total cumulative time (nanoseconds) slept due to resource pressure",
+        },
+    )
+    r.reg(snode, GetBatchSoftErrCount, KindCounter,
+        &Extra{
+            Help: "get-batch: number of transient errors (retryable failures under configured limit)",
+        },
+    )
+    r.reg(snode, ErrGetBatchCount, KindCounter,
+        &Extra{
+            Help: "get-batch: number of hard errors including request failures and 429 rejections",
+        },
+    )
 }

 func (r *Trunner) RegDiskMetrics(snode *meta.Snode, disk string) {
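
For orientation, a sketch of the per-request accounting these metrics imply on the target side. The metricAdder interface and accountBatch function below are stand-ins (assumptions), not the tracker or x-moss code from this commit; the constant names and the "err_" prefix mirror what the diff above registers:

package mosssketch

// error-counter prefix convention noted at the top of target_stats.go
const errPrefix = "err_"

// names mirror the get-batch constants registered above
const (
    GetBatchCount        = "getbatch.n"
    GetBatchObjCount     = "getbatch.obj.n"
    GetBatchFileCount    = "getbatch.file.n"
    GetBatchObjSize      = "getbatch.obj.size"
    GetBatchFileSize     = "getbatch.file.size"
    ErrGetBatchCount     = errPrefix + "getbatch.n"
    GetBatchSoftErrCount = errPrefix + "soft.getbatch.n"
)

// metricAdder is a minimal stand-in for the target's stats tracker.
type metricAdder interface {
    Inc(name string)
    Add(name string, val int64)
}

// accountBatch: one bump per get-batch request, plus per-object and per-file
// counts and cumulative sizes; soft (retryable) and hard errors go to
// separate counters.
func accountBatch(m metricAdder, numObjs, numFiles int, objBytes, fileBytes int64, softErrs, hardErrs int) {
    m.Inc(GetBatchCount)
    m.Add(GetBatchObjCount, int64(numObjs))
    m.Add(GetBatchFileCount, int64(numFiles))
    m.Add(GetBatchObjSize, objBytes)
    m.Add(GetBatchFileSize, fileBytes)
    m.Add(GetBatchSoftErrCount, int64(softErrs))
    m.Add(ErrGetBatchCount, int64(hardErrs))
}

The rx-wait and throttle totals (KindTotal, nanoseconds) would accumulate the same way via Add.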
