From ca7cc5443831c77778c35191fb0aa26f1944673f Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 29 Feb 2024 10:47:55 +0100 Subject: [PATCH] Checkable: Don't recalculate `next_check` while processing remotely genrated check Currently, when processing a `CheckResult`, it will first trigger an `OnNextCheckChanged` event, which is sent to all connected endpoints. Then, when `Checkable::ProcessCheckResult()` returns, an `OnCheckResult` event is fired, which is of course also sent to all connected endpoints. Next, the other endpoints receive the `event::SetNextCheck` cluster event followed by `event::CheckResult`and invoke `checkable#SetNextCheck()` and `Checkable#CheckResult()` with the newly received check. So they also try to recalculate the next check themselves and invalidate the previously received next check timestamp from the source endpoint. Since each endpoint randomly initialises its own scheduling offset, the recalculated next check will always differ by a split second/millisecond on each of them. As a consequence, two Icinga DB HA instances will generate two different checksums for the same state and causes the state histories to be fully resynchronised after a takeover/Icinga 2 reload. --- lib/icinga/checkable-check.cpp | 35 ++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index efa9477a202..f0af81bdd96 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -360,21 +360,28 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr bool is_flapping = IsFlapping(); - if (cr->GetActive()) { - UpdateNextCheck(origin); - } else { - /* Reschedule the next check for external passive check results. The side effect of - * this is that for as long as we receive results for a service we - * won't execute any active checks. */ - double offset; - double ttl = cr->GetTtl(); - - if (ttl > 0) - offset = ttl; - else - offset = GetCheckInterval(); + // Don't recompute the next check when the current check isn't generated by this endpoint. When the check is + // remotely generated we should've already received the "SetNextCheck" event before the "event::CheckResult" + // cluster event. Otherwise, the next check received before this check will be invalidated and cause the Checkable + // "next_check/next_update" in a HA setup to always be different from the other endpoint as the "m_SchedulingOffset" + // is randomly initialised on each node. + if (!origin) { + if (cr->GetActive()) { + UpdateNextCheck(); + } else { + /* Reschedule the next check for external passive check results. The side effect of + * this is that for as long as we receive results for a service we + * won't execute any active checks. */ + double offset; + double ttl = cr->GetTtl(); + + if (ttl > 0) + offset = ttl; + else + offset = GetCheckInterval(); - SetNextCheck(Utility::GetTime() + offset, false, origin); + SetNextCheck(Utility::GetTime() + offset); + } } olock.Unlock();