Skip to content

Commit

Permalink
Merge pull request #4867 from IntersectMBO/karknu/retry_fixes
Browse files Browse the repository at this point in the history
Retry time fixes
  • Loading branch information
karknu committed May 4, 2024
2 parents dbf4d4f + 6e5fbcd commit 3f545be
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 17 deletions.
5 changes: 5 additions & 0 deletions ouroboros-network/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@
of relying on `threadDelay`.
* Added `TraceChurnAction` and `TraceChurnTimeout` trace points of `TracePeerSelection`.
* Added `HasCallStack` to functions which call `pickPeers`.
* Update the bigledger retry state in case of an exception
* Reset public root retry state when transition between `LedgerStateJudgements`.
* Reduce public root retry timer.
* Don't classify a config file with publicRoot/bootstrapPeers IP addresss only
as a DNS error.

## 0.14.0.0 -- 2024-04-04

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -686,11 +686,21 @@ tooBusyForTooLong trace0 =
-- For normal governor events we check if the length of the busy time span
-- is now too big (adjusted for any perturbation credits). If so we've
-- found a violation.
busy !busyStartTime !credits ((busyEndTime, _dt, GovernorEvent{}) : _trace')
| busySpanLength > credits = Left (busyEndTime, credits)
busy !busyStartTime !credits ((busyEndTime, _dt, GovernorEvent{}) : trace')
| busySpanLength > endCredits credits trace'=
Left (busyEndTime, endCredits credits trace')
where
busySpanLength = diffTime busyEndTime busyStartTime

-- If the governor wakes up due to an action that gives us new credits
-- we take those credits into account before failing.
endCredits !c [] = c
endCredits !c ((t, _, MockEnvEvent e) : tr) | t == busyEndTime =
endCredits (c + fromIntegral (envEventCredits e)) tr
endCredits !c ((t, _, _) : tr) | t == busyEndTime =
endCredits c tr
endCredits !c _ = c

-- We also look at how long it is to the next event to see if this is the
-- last event in the busy span, and if so we return to idle.
busy !_busyStartTime !_credits ((_t, dt, _event) : trace')
Expand Down Expand Up @@ -3301,6 +3311,10 @@ prop_governor_only_bootstrap_peers_in_clean_state env =
govHasOnlyBootstrapPeers =
selectGovState Governor.hasOnlyBootstrapPeers events

govTargets :: Signal PeerSelectionTargets
govTargets =
selectGovState Governor.targets events

isInCleanState :: Signal Bool
isInCleanState =
fmap (not . Set.null)
Expand All @@ -3323,9 +3337,11 @@ prop_governor_only_bootstrap_peers_in_clean_state env =
)

in signalProperty 20 show
(\(b, (kp, tp)) -> (b && Set.null (Set.difference kp tp)) || not b)
((,) <$> isInCleanState
<*> govKnownAndTrustedPeers
(\(b, (kp, tp), t) -> (b && (Set.null (Set.difference kp tp) || targetNumberOfKnownPeers t <= 0))
|| not b)
((,,) <$> isInCleanState
<*> govKnownAndTrustedPeers
<*> govTargets
)

-- | This test checks that if the node is not in a sensitive state it will not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ jobReqBigLedgerPeers PeerSelectionActions{ requestPublicRootPeers }
bigLedgerPeerBackoffs'
bigLedgerPeerRetryDiffTime'],
decisionState = st {
inProgressPublicRootsReq = False,
publicRootBackoffs = bigLedgerPeerBackoffs',
publicRootRetryTime = bigLedgerPeerRetryTime'
inProgressBigLedgerPeersReq = False,
bigLedgerPeerBackoffs = bigLedgerPeerBackoffs',
bigLedgerPeerRetryTime = bigLedgerPeerRetryTime'
},
decisionJobs = []
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,8 @@ monitorLedgerStateJudgement PeerSelectionActions{ readLedgerStateJudgement }
, localRootPeers = LocalRootPeers.empty
, hasOnlyBootstrapPeers = False
, bootstrapPeersTimeout = Just (addTime governor_BOOTSTRAP_PEERS_TIMEOUT now)
, publicRootBackoffs = 0
, publicRootRetryTime = now
})
YoungEnough -> do
let nonEstablishedBootstrapPeers =
Expand All @@ -664,7 +666,7 @@ monitorLedgerStateJudgement PeerSelectionActions{ readLedgerStateJudgement }
EstablishedPeers.toSet establishedPeers
Set.\\
(inProgressPromoteCold <> inProgressPromoteWarm)
return (\_ -> st
return (\now -> st
{ ledgerStateJudgement = lsj
, hasOnlyBootstrapPeers = False
, bootstrapPeersTimeout = Nothing
Expand All @@ -676,6 +678,8 @@ monitorLedgerStateJudgement PeerSelectionActions{ readLedgerStateJudgement }
PublicRootPeers.difference
publicRootPeers
nonEstablishedBootstrapPeers
, publicRootBackoffs = 0
, publicRootRetryTime = now
})
return $ \now ->
Decision {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ jobReqPublicRootPeers PeerSelectionActions{ requestPublicRootPeers
Completion $ \st now ->
-- This is a failure, so move the backoff counter one in the failure
-- direction (negative) and schedule the next retry time accordingly.
-- We use an exponential backoff strategy. The max retry time of 2^12
-- seconds is just over an hour.
-- We use an exponential backoff strategy. The max retry time of 2^8
-- seconds is about 4 minutes.
let publicRootBackoffs' :: Int
publicRootBackoffs' = (publicRootBackoffs st `min` 0) - 1

publicRootRetryDiffTime' :: DiffTime
publicRootRetryDiffTime' = 2 ^ (abs publicRootBackoffs' `min` 12)
publicRootRetryDiffTime' = 2 ^ (abs publicRootBackoffs' `min` 8)

publicRootRetryTime' :: Time
publicRootRetryTime' = addTime publicRootRetryDiffTime' now
Expand Down Expand Up @@ -148,8 +148,8 @@ jobReqPublicRootPeers PeerSelectionActions{ requestPublicRootPeers
-- below target we're going to want to try again at some point.
-- If we made progress towards our target then we will retry at the
-- suggested ttl. But if we did not make progress then we want to
-- follow an exponential backoff strategy. The max retry time of 2^12
-- seconds is just over an hour.
-- follow an exponential backoff strategy. The max retry time of 2^8
-- seconds is about 4 minutes.
publicRootBackoffs' :: Int
publicRootBackoffs'
| PublicRootPeers.null newPeers = (publicRootBackoffs st `max` 0) + 1
Expand All @@ -158,8 +158,8 @@ jobReqPublicRootPeers PeerSelectionActions{ requestPublicRootPeers
publicRootRetryDiffTime :: DiffTime
publicRootRetryDiffTime
| publicRootBackoffs' == 0
= ttl
| otherwise = 2^(publicRootBackoffs' `min` 12)
= min 60 ttl -- don't let days long dns timeout kreep in here.
| otherwise = 2^(publicRootBackoffs' `min` 8)

publicRootRetryTime :: Time
publicRootRetryTime = addTime publicRootRetryDiffTime now
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,11 @@ publicRootPeersProvider tracer
!domainsIps = [(toPeerAddr ip port, pa)
| (RelayAccessAddress ip port, pa) <- Map.assocs domains ]
!ips = Map.fromList (map fst successes) `Map.union` Map.fromList domainsIps
!ttl = ttlForResults (map snd successes)
!ttl = if null lookups
then -- Not having any peers with domains configured is not
-- a DNS error.
ttlForResults [60]
else ttlForResults (map snd successes)
-- If all the lookups failed we'll return an empty set with a minimum
-- TTL, and the governor will invoke its exponential backoff.
return (ips, ttl)
Expand Down

0 comments on commit 3f545be

Please sign in to comment.