From 3dd13f5cbd74245f9d3cdeda8201b3b992b91e91 Mon Sep 17 00:00:00 2001 From: Andrey Chursin Date: Mon, 12 Sep 2022 14:24:10 -0700 Subject: [PATCH] Add consensus time metrics on SUI side This diff adds two metrics (a) `sequencing_certificate_latency`: histogram for consensus adapter that measures latency of transaction sequencing. This should be a key metric that shows how much consensus contributes to overall transaction latency. Normally in the stress test tool we should see `shared_object_latency = owned_object_latency + sequencing_certificate_latency`. Metric that existed before, `sequencing_certificate_control_delay` is a proxy metric(it indicates consensus timeout derived from the current latency) and is hard to use directly. (b) `handle_consensus_duration_mcs` is a metric that can be used to derive the utilization of the `handle_consensus_transaction` task. The metrics is simply a sum of execution times. Based on this metric we can build a graph `rate(handle_consensus_duration_mcs) / 1_000_000`, this graph will show 'utilization' of the consensus task on sui side. If value here gets close to 1.0 it means that consensus is generating tasks faster than sui can assign shared object locks. We will need to add something similar to (b) on the narwhal side to include batch waiter etc time. --- crates/sui-core/src/authority.rs | 23 ++++++++++++++++++ crates/sui-core/src/consensus_adapter.rs | 14 +++++++++-- crates/sui-core/src/metrics.rs | 30 +++++++++++++++++++++++- 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/crates/sui-core/src/authority.rs b/crates/sui-core/src/authority.rs index 3170ca91f0e6d..00691ae91d99b 100644 --- a/crates/sui-core/src/authority.rs +++ b/crates/sui-core/src/authority.rs @@ -94,6 +94,7 @@ pub mod authority_store_tables; mod authority_store; use crate::epoch::epoch_store::EpochStore; +use crate::metrics::TaskUtilizationExt; pub use authority_store::{ AuthorityStore, GatewayStore, ResolverWrapper, SuiDataStore, UpdateType, }; @@ -141,6 +142,8 @@ pub struct AuthorityMetrics { handle_node_sync_certificate_latency: Histogram, total_consensus_txns: IntCounter, + handle_consensus_duration_mcs: IntCounter, + verify_narwhal_transaction_duration_mcs: IntCounter, pub follower_items_streamed: IntCounter, pub follower_items_loaded: IntCounter, @@ -280,6 +283,18 @@ impl AuthorityMetrics { registry, ) .unwrap(), + handle_consensus_duration_mcs: register_int_counter_with_registry!( + "handle_consensus_duration_mcs", + "Total duration of handle_consensus_transaction", + registry, + ) + .unwrap(), + verify_narwhal_transaction_duration_mcs: register_int_counter_with_registry!( + "verify_narwhal_transaction_duration_mcs", + "Total duration of verify_narwhal_transaction", + registry, + ) + .unwrap(), follower_items_streamed: register_int_counter_with_registry!( "follower_items_streamed", "Number of transactions/signed batches streamed to followers", @@ -1876,6 +1891,10 @@ impl AuthorityState { } fn verify_narwhal_transaction(&self, certificate: &CertifiedTransaction) -> SuiResult { + let _timer = self + .metrics + .verify_narwhal_transaction_duration_mcs + .utilization_timer(); // Ensure the input is a shared object certificate. Remember that Byzantine authorities // may input anything into consensus. fp_ensure!( @@ -1905,6 +1924,10 @@ impl ExecutionState for AuthorityState { transaction: Self::Transaction, ) -> Result { self.metrics.total_consensus_txns.inc(); + let _timer = self + .metrics + .handle_consensus_duration_mcs + .utilization_timer(); let tracking_id = transaction.get_tracking_id(); match transaction.kind { ConsensusTransactionKind::UserTransaction(certificate) => { diff --git a/crates/sui-core/src/consensus_adapter.rs b/crates/sui-core/src/consensus_adapter.rs index fe71941e605ee..73df8875af02b 100644 --- a/crates/sui-core/src/consensus_adapter.rs +++ b/crates/sui-core/src/consensus_adapter.rs @@ -11,11 +11,11 @@ use narwhal_executor::SubscriberResult; use narwhal_types::TransactionProto; use narwhal_types::TransactionsClient; use parking_lot::Mutex; -use prometheus::register_int_counter_with_registry; use prometheus::register_int_gauge_with_registry; use prometheus::IntCounter; use prometheus::IntGauge; use prometheus::Registry; +use prometheus::{register_histogram_with_registry, register_int_counter_with_registry, Histogram}; use std::collections::VecDeque; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; @@ -71,8 +71,9 @@ pub struct ConsensusAdapterMetrics { pub sequencing_certificate_success: IntCounter, pub sequencing_certificate_timeouts: IntCounter, pub sequencing_certificate_control_delay: IntGauge, + pub sequencing_certificate_latency: Histogram, - // Certificate sequencing metrics + // Fragment sequencing metrics pub sequencing_fragment_attempt: IntCounter, pub sequencing_fragment_success: IntCounter, pub sequencing_fragment_timeouts: IntCounter, @@ -113,6 +114,12 @@ impl ConsensusAdapterMetrics { registry, ) .unwrap(), + sequencing_certificate_latency: register_histogram_with_registry!( + "sequencing_certificate_latency", + "The latency for certificate sequencing.", + registry, + ) + .unwrap(), sequencing_fragment_attempt: register_int_counter_with_registry!( "sequencing_fragment_attempt", "Counts the number of sequenced fragments submitted.", @@ -351,6 +358,9 @@ impl ConsensusAdapter { metrics .sequencing_certificate_control_delay .set(new_delay as i64); + metrics + .sequencing_certificate_latency + .observe(past_ms as f64); }); self.delay_ms.store(new_delay, Ordering::Relaxed); diff --git a/crates/sui-core/src/metrics.rs b/crates/sui-core/src/metrics.rs index 460bbcc689ed0..5b106c7b6a9d0 100644 --- a/crates/sui-core/src/metrics.rs +++ b/crates/sui-core/src/metrics.rs @@ -1,7 +1,7 @@ // Copyright (c) 2022, Mysten Labs, Inc. // SPDX-License-Identifier: Apache-2.0 -use prometheus::Histogram; +use prometheus::{Histogram, IntCounter}; use tokio::time::Instant; pub fn start_timer(metrics: Histogram) -> impl Drop { @@ -10,3 +10,31 @@ pub fn start_timer(metrics: Histogram) -> impl Drop { metrics.observe(start_ts.elapsed().as_secs_f64()); }) } + +pub struct TaskUtilizationGuard<'a> { + metric: &'a IntCounter, + start: Instant, +} + +pub trait TaskUtilizationExt { + /// Measures amount of time spent until guard is dropped and increments the counter by duration in mcs + /// Primary usage for this counter is to measure 'utilization' of the single task + /// E.g. having rate(metric) / 1_000_000 can tell what portion of time this task is busy + /// For the tasks that are run in single thread this indicates how close is this task to a complete saturation + fn utilization_timer(&self) -> TaskUtilizationGuard; +} + +impl TaskUtilizationExt for IntCounter { + fn utilization_timer(&self) -> TaskUtilizationGuard { + TaskUtilizationGuard { + start: Instant::now(), + metric: self, + } + } +} + +impl<'a> Drop for TaskUtilizationGuard<'a> { + fn drop(&mut self) { + self.metric.inc_by(self.start.elapsed().as_micros() as u64); + } +}