@@ -31,24 +31,33 @@ import (
3131//
3232// all error counters must have "err_" prefix (see `errPrefix`)
3333
34+ //
35+ // ais target metrics: groups 1 through 4 =====================
36+ //
37+
38+ // 1. datapath (counters, sizes, latencies) and common errors
3439const (
35- // KindCounter & KindSize - always incremented
40+ // KindThroughput
41+ GetThroughput = "get.bps" // bytes per second
42+ PutThroughput = "put.bps" // ditto
3643
37- LruEvictCount = "lru.evict.n"
38- LruEvictSize = "lru.evict.size"
44+ // same as above via `.cumulative`
45+ GetSize = "get.size"
46+ PutSize = "put.size"
3947
40- CleanupStoreCount = "cleanup.store.n"
41- CleanupStoreSize = "cleanup.store.size"
48+ // common latencies
49+ AppendLatency = "append.ns"
50+ GetRedirLatency = "get.redir.ns"
51+ PutRedirLatency = "put.redir.ns"
52+ HeadLatencyTotal = "head.ns.total"
4253
54+ // out-of-band
4355 VerChangeCount = "ver.change.n"
4456 VerChangeSize = "ver.change.size"
4557
46- // errors
58+ // errors (note common prefix convention)
4759 ErrPutCksumCount = errPrefix + "put.cksum.n"
48-
49- ErrFSHCCount = errPrefix + "fshc.n"
50-
51- ErrDloadCount = errPrefix + "dl.n"
60+ ErrFSHCCount = errPrefix + "fshc.n"
5261
5362 // IO errors (must have ioErrPrefix)
5463 IOErrGetCount = ioErrPrefix + "get.n"
@@ -69,50 +78,77 @@ const (
6978 RatelimPutRetryCount = "ratelim.retry.put.n"
7079 RatelimPutRetryLatencyTotal = "ratelim.retry.put.ns.total"
7180
72- AppendLatency = "append.ns"
73- GetRedirLatency = "get.redir.ns"
74- PutRedirLatency = "put.redir.ns"
75- DloadLatencyTotal = "dl.ns.total"
76- HeadLatencyTotal = "head.ns.total"
81+ // compare w/ common `DeleteCount`
82+ RemoteDeletedDelCount = core .RemoteDeletedDelCount
83+ )
84+
85+ // 2. object metadata in memory
86+ const (
87+ LcacheCollisionCount = core .LcacheCollisionCount
88+ LcacheEvictedCount = core .LcacheEvictedCount
89+ LcacheErrCount = core .LcacheErrCount
90+ LcacheFlushColdCount = core .LcacheFlushColdCount
91+ )
92+
93+ // 3. xactions (jobs)
94+ const (
95+ // blob downloader
96+ GetBlobSize = "getblob.size"
97+
98+ // LRU eviction
99+ LruEvictCount = "lru.evict.n"
100+ LruEvictSize = "lru.evict.size"
77101
78- // Dsort
102+ // space cleanup
103+ CleanupStoreCount = "cleanup.store.n"
104+ CleanupStoreSize = "cleanup.store.size"
105+
106+ // distributed sort (ext/dsort)
79107 DsortCreationReqCount = "dsort.creation.req.n"
80108 DsortCreationRespCount = "dsort.creation.resp.n"
81109 DsortCreationRespLatency = "dsort.creation.resp.ns"
82110 DsortExtractShardDskCnt = "dsort.extract.shard.dsk.n"
83111 DsortExtractShardMemCnt = "dsort.extract.shard.mem.n"
84112 DsortExtractShardSize = "dsort.extract.shard.size" // uncompressed
85113
86- // ETL
114+ // ETL (ext/etl)
87115 ETLInlineCount = "etl.inline.n"
88116 ETLInlineLatencyTotal = "etl.inline.ns.total"
89117 ETLInlineSize = "etl.inline.size"
90118 ETLOfflineCount = "etl.offline.n"
91119 ETLOfflineLatencyTotal = "etl.offline.ns.total"
92120 ETLOfflineSize = "etl.offline.size"
93121
94- // Downloader
95- DloadSize = "dl.size"
96-
97- // KindThroughput
98- GetThroughput = "get.bps" // bytes per second
99- PutThroughput = "put.bps" // ditto
122+ // downloader (ext/dload)
123+ // (not to be confused with blob downloader)
124+ DloadSize = "dl.size"
125+ DloadLatencyTotal = "dl.ns.total"
126+ ErrDloadCount = errPrefix + "dl.n"
100127
101- // same as above via `.cumulative`
102- GetSize = "get.size"
103- PutSize = "put.size"
128+ // get-batch (x-moss)
129+ GetBatchCount = "getbatch.n"
130+ GetBatchObjCount = "getbatch.obj.n"
131+ GetBatchFileCount = "getbatch.file.n"
132+ GetBatchObjSize = "getbatch.obj.size"
133+ GetBatchFileSize = "getbatch.file.size"
104134
105- GetBlobSize = "getblob.size"
135+ GetBatchRxWaitTotal = "getbatch.rxwait.ns"
136+ GetBatchThrottleTotal = "getbatch.throttle.ns"
106137
107- // core
108- RemoteDeletedDelCount = core .RemoteDeletedDelCount // compare w/ common `DeleteCount`
138+ ErrGetBatchCount = errPrefix + "getbatch.n"
139+ GetBatchSoftErrCount = errPrefix + "soft.getbatch.n"
140+ )
109141
110- LcacheCollisionCount = core .LcacheCollisionCount
111- LcacheEvictedCount = core .LcacheEvictedCount
112- LcacheErrCount = core .LcacheErrCount
113- LcacheFlushColdCount = core .LcacheFlushColdCount
142+ // 4. streams (peer-to-peer long-lived connections)
143+ const (
144+ _ = cos .StreamsOutObjCount
145+ _ = cos .StreamsOutObjSize
146+ _ = cos .StreamsInObjCount
147+ _ = cos .StreamsInObjSize
148+ )
114149
115- // variable label used for prometheus disk metrics
150+ // variable label used for prometheus disk metrics
151+ const (
116152 diskMetricLabel = "disk"
117153)
118154
@@ -420,7 +456,7 @@ func (r *Trunner) RegMetrics(snode *meta.Snode) {
420456 },
421457 )
422458
423- // streams
459+ // streams: peer-to-peer long-lived connections
424460 r .reg (snode , cos .StreamsOutObjCount , KindCounter ,
425461 & Extra {
426462 Help : "intra-cluster streaming communications: number of sent objects" ,
@@ -442,6 +478,7 @@ func (r *Trunner) RegMetrics(snode *meta.Snode) {
442478 },
443479 )
444480
481+ // downloader (ext/dload)
445482 r .reg (snode , DloadSize , KindSize ,
446483 & Extra {
447484 Help : "total downloaded size (bytes)" ,
@@ -569,6 +606,53 @@ func (r *Trunner) RegMetrics(snode *meta.Snode) {
569606 Help : "number of times a LOM from cache was written to stable storage (core, internal)" ,
570607 },
571608 )
609+
610+ // get-batch (x-moss)
611+ r .reg (snode , GetBatchCount , KindCounter ,
612+ & Extra {
613+ Help : "total number of get-batch requests (work items)" ,
614+ },
615+ )
616+ r .reg (snode , GetBatchObjCount , KindCounter ,
617+ & Extra {
618+ Help : "get-batch: total number of whole objects retrieved and delivered via output archive" ,
619+ },
620+ )
621+ r .reg (snode , GetBatchFileCount , KindCounter ,
622+ & Extra {
623+ Help : "get-batch: total number of files extracted from shards and delivered via output archive" ,
624+ },
625+ )
626+ r .reg (snode , GetBatchObjSize , KindSize ,
627+ & Extra {
628+ Help : "get-batch: total cumulative size (bytes) of whole objects" ,
629+ },
630+ )
631+ r .reg (snode , GetBatchFileSize , KindSize ,
632+ & Extra {
633+ Help : "get-batch: total cumulative size (bytes) of archived files extracted from shards" ,
634+ },
635+ )
636+ r .reg (snode , GetBatchRxWaitTotal , KindTotal ,
637+ & Extra {
638+ Help : "get-batch: total cumulative time (nanoseconds) spent waiting to receive entries from peer targets" ,
639+ },
640+ )
641+ r .reg (snode , GetBatchThrottleTotal , KindTotal ,
642+ & Extra {
643+ Help : "get-batch: total cumulative time (nanoseconds) slept due to resource pressure" ,
644+ },
645+ )
646+ r .reg (snode , GetBatchSoftErrCount , KindCounter ,
647+ & Extra {
648+ Help : "get-batch: number of transient errors (retryable failures under configured limit)" ,
649+ },
650+ )
651+ r .reg (snode , ErrGetBatchCount , KindCounter ,
652+ & Extra {
653+ Help : "get-batch: number of hard errors including request failures and 429 rejections" ,
654+ },
655+ )
572656}
573657
574658func (r * Trunner ) RegDiskMetrics (snode * meta.Snode , disk string ) {
0 commit comments