Skip to content

Commit

Permalink
Add consensus time metrics on SUI side
Browse files Browse the repository at this point in the history
This diff adds two metrics
(a) `sequencing_certificate_latency`: histogram for consensus adapter that measures latency of transaction sequencing. This should be a key metric that shows how much consensus contributes to overall transaction latency. Normally in the stress test tool we should see `shared_object_latency = owned_object_latency + sequencing_certificate_latency`.
Metric that existed before, `sequencing_certificate_control_delay` is a proxy metric(it indicates consensus timeout derived from the current latency) and is hard to use directly.

(b) `handle_consensus_duration_mcs` is a metric that can be used to derive the utilization of the `handle_consensus_transaction` task. The metrics is simply a sum of execution times. Based on this metric we can build a graph `rate(handle_consensus_duration_mcs) / 1_000_000`, this graph will show 'utilization' of the consensus task on sui side. If value here gets close to 1.0 it means that consensus is generating tasks faster than sui can assign shared object locks.

We will need to add something similar to (b) on the narwhal side to include batch waiter etc time.
  • Loading branch information
andll committed Sep 12, 2022
1 parent e612b58 commit 3dd13f5
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 3 deletions.
23 changes: 23 additions & 0 deletions crates/sui-core/src/authority.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ pub mod authority_store_tables;

mod authority_store;
use crate::epoch::epoch_store::EpochStore;
use crate::metrics::TaskUtilizationExt;
pub use authority_store::{
AuthorityStore, GatewayStore, ResolverWrapper, SuiDataStore, UpdateType,
};
Expand Down Expand Up @@ -141,6 +142,8 @@ pub struct AuthorityMetrics {
handle_node_sync_certificate_latency: Histogram,

total_consensus_txns: IntCounter,
handle_consensus_duration_mcs: IntCounter,
verify_narwhal_transaction_duration_mcs: IntCounter,

pub follower_items_streamed: IntCounter,
pub follower_items_loaded: IntCounter,
Expand Down Expand Up @@ -280,6 +283,18 @@ impl AuthorityMetrics {
registry,
)
.unwrap(),
handle_consensus_duration_mcs: register_int_counter_with_registry!(
"handle_consensus_duration_mcs",
"Total duration of handle_consensus_transaction",
registry,
)
.unwrap(),
verify_narwhal_transaction_duration_mcs: register_int_counter_with_registry!(
"verify_narwhal_transaction_duration_mcs",
"Total duration of verify_narwhal_transaction",
registry,
)
.unwrap(),
follower_items_streamed: register_int_counter_with_registry!(
"follower_items_streamed",
"Number of transactions/signed batches streamed to followers",
Expand Down Expand Up @@ -1876,6 +1891,10 @@ impl AuthorityState {
}

fn verify_narwhal_transaction(&self, certificate: &CertifiedTransaction) -> SuiResult {
let _timer = self
.metrics
.verify_narwhal_transaction_duration_mcs
.utilization_timer();
// Ensure the input is a shared object certificate. Remember that Byzantine authorities
// may input anything into consensus.
fp_ensure!(
Expand Down Expand Up @@ -1905,6 +1924,10 @@ impl ExecutionState for AuthorityState {
transaction: Self::Transaction,
) -> Result<Self::Outcome, Self::Error> {
self.metrics.total_consensus_txns.inc();
let _timer = self
.metrics
.handle_consensus_duration_mcs
.utilization_timer();
let tracking_id = transaction.get_tracking_id();
match transaction.kind {
ConsensusTransactionKind::UserTransaction(certificate) => {
Expand Down
14 changes: 12 additions & 2 deletions crates/sui-core/src/consensus_adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ use narwhal_executor::SubscriberResult;
use narwhal_types::TransactionProto;
use narwhal_types::TransactionsClient;
use parking_lot::Mutex;
use prometheus::register_int_counter_with_registry;
use prometheus::register_int_gauge_with_registry;
use prometheus::IntCounter;
use prometheus::IntGauge;
use prometheus::Registry;
use prometheus::{register_histogram_with_registry, register_int_counter_with_registry, Histogram};
use std::collections::VecDeque;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
Expand Down Expand Up @@ -71,8 +71,9 @@ pub struct ConsensusAdapterMetrics {
pub sequencing_certificate_success: IntCounter,
pub sequencing_certificate_timeouts: IntCounter,
pub sequencing_certificate_control_delay: IntGauge,
pub sequencing_certificate_latency: Histogram,

// Certificate sequencing metrics
// Fragment sequencing metrics
pub sequencing_fragment_attempt: IntCounter,
pub sequencing_fragment_success: IntCounter,
pub sequencing_fragment_timeouts: IntCounter,
Expand Down Expand Up @@ -113,6 +114,12 @@ impl ConsensusAdapterMetrics {
registry,
)
.unwrap(),
sequencing_certificate_latency: register_histogram_with_registry!(
"sequencing_certificate_latency",
"The latency for certificate sequencing.",
registry,
)
.unwrap(),
sequencing_fragment_attempt: register_int_counter_with_registry!(
"sequencing_fragment_attempt",
"Counts the number of sequenced fragments submitted.",
Expand Down Expand Up @@ -351,6 +358,9 @@ impl ConsensusAdapter {
metrics
.sequencing_certificate_control_delay
.set(new_delay as i64);
metrics
.sequencing_certificate_latency
.observe(past_ms as f64);
});

self.delay_ms.store(new_delay, Ordering::Relaxed);
Expand Down
30 changes: 29 additions & 1 deletion crates/sui-core/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright (c) 2022, Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use prometheus::Histogram;
use prometheus::{Histogram, IntCounter};
use tokio::time::Instant;

pub fn start_timer(metrics: Histogram) -> impl Drop {
Expand All @@ -10,3 +10,31 @@ pub fn start_timer(metrics: Histogram) -> impl Drop {
metrics.observe(start_ts.elapsed().as_secs_f64());
})
}

pub struct TaskUtilizationGuard<'a> {
metric: &'a IntCounter,
start: Instant,
}

pub trait TaskUtilizationExt {
/// Measures amount of time spent until guard is dropped and increments the counter by duration in mcs
/// Primary usage for this counter is to measure 'utilization' of the single task
/// E.g. having rate(metric) / 1_000_000 can tell what portion of time this task is busy
/// For the tasks that are run in single thread this indicates how close is this task to a complete saturation
fn utilization_timer(&self) -> TaskUtilizationGuard;
}

impl TaskUtilizationExt for IntCounter {
fn utilization_timer(&self) -> TaskUtilizationGuard {
TaskUtilizationGuard {
start: Instant::now(),
metric: self,
}
}
}

impl<'a> Drop for TaskUtilizationGuard<'a> {
fn drop(&mut self) {
self.metric.inc_by(self.start.elapsed().as_micros() as u64);
}
}

0 comments on commit 3dd13f5

Please sign in to comment.