Skip to content

Commit

Permalink
Merge #1554
Browse files Browse the repository at this point in the history
1554: Detect clock changes r=edsko a=edsko

Closes #759.

Code is tested using mock time; result of the test labelling:

```
# cabal run test-consensus -- -p delayClockShift --quickcheck-replay=680184 --quickcheck-tests 10000
Up to date
ouroboros-consensus
  WallClock
    delayClockShift: OK (18.51s)
      +++ OK, passed 10000 tests.
      
      schedule goes back (10000 in total):
      63.07% False
      36.93% True
      
      schedule length (10000 in total):
      81.03% R_Gt 20
       9.26% R_Btwn (10,20)
       4.08% R_Btwn (5,10)
       1.04% R_Btwn (4,5)
       1.03% R_Eq 1
       0.96% R_Eq 2
       0.94% R_Eq 4
       0.87% R_Eq 0
       0.79% R_Eq 3
      
      schedule skips (10000 in total):
      38.66% R_Btwn (10,20)
      24.96% R_Btwn (5,10)
       5.59% R_Eq 0
       5.43% R_Eq 2
       5.27% R_Btwn (4,5)
       5.12% R_Eq 1
       5.08% R_Eq 3
       4.99% R_Eq 4
       4.90% R_Gt 20

All 1 tests passed (18.52s)
```

I also verified that the exception bubbles up to the node. Ran the latest node with this PR, set my clock back an hour, and got

```
cardano-node: ExceptionInLinkedThread "ThreadId 20" (SystemClockMovedBack 2020-01-31 11:15:31.00184141 UTC (SlotNo {unSlotNo = 3713312}) (SlotNo {unSlotNo = 3713132}))
```

There is no need to define a custom error policy for this due to #1553 , nor a custom exit failure due to #1551 (comment) .

Co-authored-by: Edsko de Vries <edsko@well-typed.com>
  • Loading branch information
iohk-bors[bot] and edsko committed Feb 7, 2020
2 parents 4cbf893 + 2e2575f commit 901b746
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 15 deletions.
@@ -1,21 +1,26 @@
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Ouroboros.Consensus.BlockchainTime.WallClock (
realBlockchainTime
, TraceBlockchainTimeEvent(..)
, SystemClockMovedBackException(..)
-- * Low-level API (exported primarily for testing)
, getWallClockSlot
, waitUntilNextSlot
, nominalDelay
) where

import Control.Exception (Exception)
import Control.Monad
import Data.Time (NominalDiffTime, diffUTCTime)
import Data.Void

import Control.Tracer (Tracer, traceWith)

import Control.Monad.Class.MonadThrow

import Ouroboros.Network.Block (SlotNo)

import Ouroboros.Consensus.BlockchainTime.API
Expand Down Expand Up @@ -52,28 +57,36 @@ realBlockchainTime registry tracer start ls = do
threadDelay (nominalDelay delay)

-- Fork thread that continuously updates the current slot
first <- fst <$> getWallClockSlot start lsVar
slot <- newTVarM first
void $ forkLinkedThread registry $ loop lsVar slot
first <- fst <$> getWallClockSlot start lsVar
slotVar <- newTVarM first
void $ forkLinkedThread registry $ loop lsVar slotVar first

-- The API is now a simple STM one
return BlockchainTime {
getCurrentSlot = readTVar slot
getCurrentSlot = readTVar slotVar
, onSlotChange_ = fmap cancelThread .
onEachChange registry id (Just first) (readTVar slot)
onEachChange registry id (Just first) (readTVar slotVar)
}
where
-- In each iteration of the loop, we recompute how long to wait until
-- the next slot. This minimizes clock skew.
loop :: StrictTVar m FocusedSlotLengths -> StrictTVar m SlotNo -> m Void
loop lsVar slot = forever $ do
next <- waitUntilNextSlot start lsVar
atomically $ writeTVar slot next
loop :: StrictTVar m FocusedSlotLengths
-> StrictTVar m SlotNo
-> SlotNo
-> m Void
loop lsVar slotVar = go
where
go :: SlotNo -> m Void
go current = do
next <- waitUntilNextSlot start lsVar current
atomically $ writeTVar slotVar next
go next

{-------------------------------------------------------------------------------
Stateful wrappers around Ouroboros.Consensus.BlockchainTime.SlotLengths
-------------------------------------------------------------------------------}

-- | Get current slot and time spent in that slot
getWallClockSlot :: IOLike m
=> SystemStart
-> StrictTVar m FocusedSlotLengths
Expand All @@ -82,16 +95,57 @@ getWallClockSlot start lsVar = do
now <- getCurrentTime
atomically $ updateTVar lsVar $ slotFromUTCTime start now

-- | Wait until the next slot
--
-- Takes the current slot number to guard against system clock changes. Any
-- clock changes that would result in the slot number to /decrease/ will result
-- in a fatal 'SystemClockMovedBackException'. When this exception is thrown,
-- the node will shut down, and should be restarted with (full?) validation
-- enabled: it is conceivable that blocks got moved to the immutable DB that,
-- due to the clock change, should not be considered immutable anymore.
waitUntilNextSlot :: IOLike m
=> SystemStart
-> StrictTVar m FocusedSlotLengths
-> SlotNo -- ^ Current slot number
-> m SlotNo
waitUntilNextSlot start lsVar = do
now <- getCurrentTime
(delay, nextSlot) <- atomically $ updateTVar lsVar $
waitUntilNextSlot start lsVar oldCurrent = do
now <- getCurrentTime
(delay, _nextSlot) <- atomically $ updateTVar lsVar $
delayUntilNextSlot start now
threadDelay (nominalDelay delay)
return nextSlot

-- At this point we expect to be in 'nextSlot', but the actual now-current
-- slot might be different:
--
-- o If the system is under heavy load, we might have missed some slots. If
-- this is the case, that's okay, and we just report the actual
-- now-current slot as the next slot.
-- o If the system clock is adjusted back a tiny bit (maybe due to an NTP
-- client running on the system), it's possible that we are still in the
-- /old/ current slot. If this happens, we just wait again; nothing bad
-- has happened, we just stay in one slot for longer.
-- o If the system clock is adjusted back more than that, we might be in
-- a slot number /before/ the old current slot. In that case, we throw
-- an exception (see discussion above).

(newCurrent, _timeInNewCurrent) <- getWallClockSlot start lsVar

if | newCurrent > oldCurrent ->
return newCurrent
| newCurrent == oldCurrent ->
waitUntilNextSlot start lsVar oldCurrent
| otherwise ->
throwM $ SystemClockMovedBack now oldCurrent newCurrent

data SystemClockMovedBackException =
-- | The system clock got moved back so far that the slot number decreased
--
-- We record the time at which we discovered the clock change, the slot
-- number before the clock change, and the slot number after the change.
SystemClockMovedBack UTCTime SlotNo SlotNo
deriving (Show)

instance Exception SystemClockMovedBackException

{-------------------------------------------------------------------------------
Auxiliary: conversions
Expand Down
Expand Up @@ -46,7 +46,7 @@ instance Arbitrary TestDelayIO where

-- | Just as a sanity check, also run the tests in IO
--
-- We override the maxinum number of tests since there are slow.
-- We override the maximum number of tests since there are slow.
--
-- NOTE: If the system is under very heavy load, this test /could/ fail:
-- the slot number after the delay could be later than the one we expect.
Expand All @@ -61,7 +61,7 @@ prop_delayNextSlot TestDelayIO{..} =
tdioStart <- pickSystemStart
lsVar <- mkLsVar
atStart <- fst <$> getWallClockSlot tdioStart lsVar
nextSlot <- waitUntilNextSlot tdioStart lsVar
nextSlot <- waitUntilNextSlot tdioStart lsVar atStart
afterDelay <- fst <$> getWallClockSlot tdioStart lsVar
assertEqual "atStart + 1" (atStart + 1) afterDelay
assertEqual "nextSlot" nextSlot afterDelay
Expand Down

0 comments on commit 901b746

Please sign in to comment.