From 263096d1f6e1f21a71b0ffeeb408e1448ed51e6c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Fri, 21 Apr 2023 09:23:21 -0700 Subject: [PATCH] stats: add query max wait time metric Add new statistic value to know the size of the queue in microseconds. Signed-off-by: Vadim Fedorenko --- daemon/remote.c | 2 ++ daemon/stats.c | 2 ++ daemon/worker.c | 12 +++++++++--- libunbound/unbound.h | 3 ++- smallapp/unbound-control.c | 1 + 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/daemon/remote.c b/daemon/remote.c index d8a3a6caf..03daa935e 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -681,6 +681,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s) (unsigned long)s->svr.num_queries_prefetch)) return 0; if(!ssl_printf(ssl, "%s.num.queries_timed_out"SQ"%lu\n", nm, (unsigned long)s->svr.num_queries_timed_out)) return 0; + if(!ssl_printf(ssl, "%s.query.queue_time_us.max"SQ"%lu\n", nm, + (unsigned long)s->svr.max_query_time_us)) return 0; if(!ssl_printf(ssl, "%s.num.expired"SQ"%lu\n", nm, (unsigned long)s->svr.ans_expired)) return 0; if(!ssl_printf(ssl, "%s.num.recursivereplies"SQ"%lu\n", nm, diff --git a/daemon/stats.c b/daemon/stats.c index 38f4597b3..cef7de827 100644 --- a/daemon/stats.c +++ b/daemon/stats.c @@ -433,6 +433,8 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a) total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache; total->svr.num_queries_prefetch += a->svr.num_queries_prefetch; total->svr.num_queries_timed_out += a->svr.num_queries_timed_out; + if (total->svr.max_query_time_us < a->svr.max_query_time_us) + total->svr.max_query_time_us = a->svr.max_query_time_us; total->svr.sum_query_list_size += a->svr.sum_query_list_size; total->svr.ans_expired += a->svr.ans_expired; #ifdef USE_DNSCRYPT diff --git a/daemon/worker.c b/daemon/worker.c index 63b8678e3..5c373b79b 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -1281,6 +1281,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error, int is_expired_answer = 0; int is_secure_answer = 0; int rpz_passthru = 0; + long long wait_queue_time = 0; /* We might have to chase a CNAME chain internally, in which case * we'll have up to two replies and combine them to build a complete * answer. These variables control this case. */ @@ -1289,6 +1290,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error, struct query_info* lookup_qinfo = &qinfo; struct query_info qinfo_tmp; /* placeholder for lookup_qinfo */ struct respip_client_info* cinfo = NULL, cinfo_tmp; + struct timeval wait_time; memset(&qinfo, 0, sizeof(qinfo)); if((error != NETEVENT_NOERROR && error != NETEVENT_DONE)|| !repinfo) { @@ -1297,9 +1299,13 @@ worker_handle_request(struct comm_point* c, void* arg, int error, return 0; } - if (worker->env.cfg->sock_queue_timeout && timeval_isset(c->recv_tv)) { - c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout; - if (timeval_smaller(c->recv_tv, worker->env.now_tv)) { + if (worker->env.cfg->sock_queue_timeout && timeval_isset(&c->recv_tv)) { + timeval_subtract(&wait_time, worker->env.now_tv, &c->recv_tv); + wait_queue_time = wait_time.tv_sec * 1000000 + wait_time.tv_usec; + if (worker->stats.max_query_time_us < wait_queue_time) + worker->stats.max_query_time_us = wait_queue_time; + c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout; + if (timeval_smaller(&c->recv_tv, worker->env.now_tv)) { /* count and drop queries that were sitting in the socket queue too long */ worker->stats.num_queries_timed_out++; return 0; diff --git a/libunbound/unbound.h b/libunbound/unbound.h index f83a38f8f..8a97b16fe 100644 --- a/libunbound/unbound.h +++ b/libunbound/unbound.h @@ -701,7 +701,8 @@ struct ub_server_stats { long long num_queries_prefetch; /** number of queries which are too late to process */ long long num_queries_timed_out; - + /** the longest wait time in the queue */ + long long max_query_time_us; /** * Sum of the querylistsize of the worker for * every query that missed cache. To calculate average. diff --git a/smallapp/unbound-control.c b/smallapp/unbound-control.c index d5f124b49..bbc09659f 100644 --- a/smallapp/unbound-control.c +++ b/smallapp/unbound-control.c @@ -209,6 +209,7 @@ static void pr_stats(const char* nm, struct ub_stats_info* s) PR_UL_NM("num.cachemiss", s->svr.num_queries_missed_cache); PR_UL_NM("num.prefetch", s->svr.num_queries_prefetch); PR_UL_NM("num.queries_timed_out", s->svr.num_queries_timed_out); + PR_UL_NM("query.queue_time_us.max", s->svr.max_query_time_us); PR_UL_NM("num.expired", s->svr.ans_expired); PR_UL_NM("num.recursivereplies", s->mesh_replies_sent); #ifdef USE_DNSCRYPT