Skip to content

Commit

Permalink
Validate the ChainDB when the clean shutdown marker is missing
Browse files Browse the repository at this point in the history
Fixes #1551.
  • Loading branch information
mrBliss committed Feb 12, 2020
1 parent 1430bca commit cc70032
Show file tree
Hide file tree
Showing 5 changed files with 331 additions and 178 deletions.
3 changes: 2 additions & 1 deletion ouroboros-consensus/ouroboros-consensus.cabal
Expand Up @@ -91,14 +91,15 @@ library
Ouroboros.Consensus.Node
Ouroboros.Consensus.Node.DbMarker
Ouroboros.Consensus.Node.ErrorPolicy
Ouroboros.Consensus.Node.ExitFailure
Ouroboros.Consensus.Node.Exit
Ouroboros.Consensus.Node.ProtocolInfo
Ouroboros.Consensus.Node.ProtocolInfo.Abstract
Ouroboros.Consensus.Node.ProtocolInfo.Byron
Ouroboros.Consensus.Node.ProtocolInfo.Mock.BFT
Ouroboros.Consensus.Node.ProtocolInfo.Mock.PBFT
Ouroboros.Consensus.Node.ProtocolInfo.Mock.Praos
Ouroboros.Consensus.Node.ProtocolInfo.Mock.PraosRule
Ouroboros.Consensus.Node.Recovery
Ouroboros.Consensus.Node.Run
Ouroboros.Consensus.Node.Run.Abstract
Ouroboros.Consensus.Node.Run.Byron
Expand Down
135 changes: 85 additions & 50 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node.hs
Expand Up @@ -33,6 +33,7 @@ module Ouroboros.Consensus.Node
) where

import Codec.Serialise (DeserialiseFailure)
import Control.Monad (when)
import Control.Tracer (Tracer)
import Crypto.Random
import Data.ByteString.Lazy (ByteString)
Expand All @@ -57,6 +58,7 @@ import Ouroboros.Consensus.Ledger.Extended (ExtLedgerState (..))
import Ouroboros.Consensus.Node.DbMarker
import Ouroboros.Consensus.Node.ErrorPolicy
import Ouroboros.Consensus.Node.ProtocolInfo
import Ouroboros.Consensus.Node.Recovery
import Ouroboros.Consensus.Node.Run
import Ouroboros.Consensus.Node.Tracers
import Ouroboros.Consensus.NodeKernel
Expand Down Expand Up @@ -111,9 +113,8 @@ run
run tracers protocolTracers chainDbTracer diffusionTracers diffusionArguments
networkMagic dbPath pInfo isProducer customiseChainDbArgs
customiseNodeArgs onNodeKernel = do
let mountPoint = MountPoint dbPath
either throwM return =<< checkDbMarker
(ioHasFS mountPoint)
hasFS
mountPoint
(nodeProtocolMagicId (Proxy @blk) cfg)
withRegistry $ \registry -> do
Expand All @@ -125,59 +126,58 @@ run tracers protocolTracers chainDbTracer diffusionTracers diffusionArguments
(nodeStartTime (Proxy @blk) cfg)
(focusSlotLengths slotLengths)

(_, chainDB) <- allocate registry
(\_ -> openChainDB
chainDbTracer registry btime dbPath cfg initLedger
customiseChainDbArgs)
ChainDB.closeDB
-- When we shut down cleanly, we create a marker file so that the next
-- time we start, we know we don't have to validate the contents of the
-- whole ChainDB. When we shut down with an exception indicating
-- corruption or something going wrong with the file system, we don't
-- create this marker file so that the next time we start, we do a full
-- validation.
lastShutDownWasClean <- hasCleanShutdownMarker hasFS
when lastShutDownWasClean $ removeCleanShutdownMarker hasFS
let customiseChainDbArgs' args
| lastShutDownWasClean
= customiseChainDbArgs args
| otherwise
-- When the last shutdown was not clean, validate the complete
-- ChainDB to detect and recover from any corruptions. This will
-- override the default value /and/ the user-customised value of
-- the 'ChainDB.cdbValidation' field.
= (customiseChainDbArgs args)
{ ChainDB.cdbValidation = ValidateAllEpochs }

let nodeArgs = customiseNodeArgs $ mkNodeArgs
registry
cfg
initState
tracers
btime
chainDB
isProducer
-- On a clean shutdown, create a marker in the database folder so that
-- next time we start up, we know we don't have to validate the whole
-- database.
createMarkerOnCleanShutdown hasFS $ do

nodeKernel <- initNodeKernel nodeArgs
onNodeKernel registry nodeKernel
let networkApps :: NetworkApplication
IO ConnectionId
ByteString ByteString ByteString ByteString ByteString ByteString
()
networkApps = consensusNetworkApps
nodeKernel
protocolTracers
(protocolCodecs (getNodeConfig nodeKernel))
(protocolHandlers nodeArgs nodeKernel)
(_, chainDB) <- allocate registry
(\_ -> openChainDB
chainDbTracer registry btime dbPath cfg initLedger
customiseChainDbArgs')
ChainDB.closeDB

diffusionApplications = DiffusionApplications
{ daResponderApplication =
simpleSingletonVersions
NodeToNodeV_1
nodeToNodeVersionData
(DictVersion nodeToNodeCodecCBORTerm)
(responderNetworkApplication networkApps)
, daInitiatorApplication =
simpleSingletonVersions
NodeToNodeV_1
nodeToNodeVersionData
(DictVersion nodeToNodeCodecCBORTerm)
(initiatorNetworkApplication networkApps)
, daLocalResponderApplication =
simpleSingletonVersions
NodeToClientV_1
nodeToClientVersionData
(DictVersion nodeToClientCodecCBORTerm)
(localResponderNetworkApplication networkApps)
, daErrorPolicies = consensusErrorPolicy
}
let nodeArgs = customiseNodeArgs $ mkNodeArgs
registry
cfg
initState
tracers
btime
chainDB
isProducer

runDataDiffusion diffusionTracers
diffusionArguments
diffusionApplications
nodeKernel <- initNodeKernel nodeArgs
onNodeKernel registry nodeKernel

let networkApps = mkNetworkApps nodeArgs nodeKernel
diffusionApplications = mkDiffusionApplications networkApps

runDataDiffusion diffusionTracers
diffusionArguments
diffusionApplications
where
mountPoint = MountPoint dbPath
hasFS = ioHasFS mountPoint

ProtocolInfo
{ pInfoConfig = cfg
, pInfoInitLedger = initLedger
Expand All @@ -189,6 +189,41 @@ run tracers protocolTracers chainDbTracer diffusionTracers diffusionArguments
nodeToNodeVersionData = NodeToNodeVersionData { networkMagic = networkMagic }
nodeToClientVersionData = NodeToClientVersionData { networkMagic = networkMagic }

mkNetworkApps
:: NodeArgs IO ConnectionId blk
-> NodeKernel IO ConnectionId blk
-> NetworkApplication
IO ConnectionId
ByteString ByteString ByteString ByteString ByteString ByteString
()
mkNetworkApps nodeArgs nodeKernel = consensusNetworkApps
nodeKernel
protocolTracers
(protocolCodecs (getNodeConfig nodeKernel))
(protocolHandlers nodeArgs nodeKernel)

mkDiffusionApplications networkApps = DiffusionApplications
{ daResponderApplication =
simpleSingletonVersions
NodeToNodeV_1
nodeToNodeVersionData
(DictVersion nodeToNodeCodecCBORTerm)
(responderNetworkApplication networkApps)
, daInitiatorApplication =
simpleSingletonVersions
NodeToNodeV_1
nodeToNodeVersionData
(DictVersion nodeToNodeCodecCBORTerm)
(initiatorNetworkApplication networkApps)
, daLocalResponderApplication =
simpleSingletonVersions
NodeToClientV_1
nodeToClientVersionData
(DictVersion nodeToClientCodecCBORTerm)
(localResponderNetworkApplication networkApps)
, daErrorPolicies = consensusErrorPolicy
}

openChainDB
:: forall blk. RunNode blk
=> Tracer IO (ChainDB.TraceEvent blk)
Expand Down
152 changes: 152 additions & 0 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node/Exit.hs
@@ -0,0 +1,152 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Ouroboros.Consensus.Node.Exit
( -- * ExitFailure
ExitFailure
, exitReasontoExitFailure
-- * ExitReason
, ExitReason (..)
, toExitReason
) where

import Control.Exception (AsyncException (..), SomeException,
fromException)

import Control.Monad.Class.MonadAsync (ExceptionInLinkedThread (..))

import Ouroboros.Storage.ChainDB.API (ChainDbFailure (..))
import Ouroboros.Storage.FS.API.Types (FsError (..), FsErrorType (..))
import Ouroboros.Storage.ImmutableDB.Types (ImmutableDBError)
import qualified Ouroboros.Storage.ImmutableDB.Types as ImmDB
import Ouroboros.Storage.VolatileDB.Types (VolatileDBError)
import qualified Ouroboros.Storage.VolatileDB.Types as VolDB

import Ouroboros.Consensus.Node.DbMarker (DbMarkerError)
import Ouroboros.Consensus.Node.ProtocolInfo.Byron
(PBftLeaderCredentialsError)

{-------------------------------------------------------------------------------
ExitFailure
-------------------------------------------------------------------------------}

-- | The exit code to return when terminating with an exception.
--
-- To be used in the @ExitFailure@ constructor of 'System.Exit.ExitCode'.
--
-- Note that a node will never turn shut down itself, it is meant to run
-- forever, so it will always terminate with an 'ExitFailure'.
type ExitFailure = Int

-- | Convert an 'ExitReason' to an 'ExitFailure'.
exitReasontoExitFailure :: ExitReason -> ExitFailure
exitReasontoExitFailure = \case
-- Some action should be taken before restarting in the cases below.
ConfigurationError -> 3
WrongDatabase -> 4
DiskFull -> 5
InsufficientPermissions -> 6
NoNetwork -> 7

-- The node can simply be restarted in the cases below.
--
-- NOTE: Database corruption is handled automically: when the node is
-- restarted, it will do a full validation pass.
Killed -> 1
DatabaseCorruption -> 2
Other -> 2

{-------------------------------------------------------------------------------
ExitReason
-------------------------------------------------------------------------------}

-- | The reason of shutting down
data ExitReason =
-- | The node process was killed, by the @kill@ command, @CTRL-C@ or some
-- other means. This is normal way for a user to terminate the node
-- process. The node can simply be restarted.
Killed

-- | Something is wrong with the node configuration, the user should check it.
--
-- For example, for PBFT, it could be that the block signing key and the
-- delegation certificate do not match.
| ConfigurationError

-- | We were unable to open the database, probably the user is using the
-- wrong directory. See 'DbMarkerError' for details.
| WrongDatabase

-- | The disk is full, make some space before restarting the node.
| DiskFull

-- | The database folder doesn't have the right permissions.
| InsufficientPermissions

-- | There is a problem with the network connection, the user should
-- investigate.
--
-- TODO We're not yet returning this.
| NoNetwork

-- | Something went wrong with the database, restart the node with
-- recovery enabled.
| DatabaseCorruption

-- | Some exception was thrown. The node should just be restarted.
| Other

-- | Return the 'ExitReason' for the given 'SomeException'. Defaults to
-- 'Other'.
toExitReason :: SomeException -> ExitReason
toExitReason e
| Just (e' :: AsyncException) <- fromException e
= case e' of
ThreadKilled -> Killed
UserInterrupt -> Killed
_ -> Other

| Just (ExceptionInLinkedThread _ e') <- fromException e
= toExitReason e'
| Just (_ :: DbMarkerError) <- fromException e
= WrongDatabase
| Just (e' :: ChainDbFailure) <- fromException e
= case e' of
ImmDbFailure ue -> immDbUnexpectedError ue
VolDbFailure ue -> volDbUnexpectedError ue
LgrDbFailure fe -> fsError fe
_ -> DatabaseCorruption
| Just (_ :: PBftLeaderCredentialsError) <- fromException e
= ConfigurationError

-- The three exceptions below will always be wrapped in a
-- 'ChainDbFailure', but we include them just in case.
| Just (e' :: ImmutableDBError) <- fromException e
= case e' of
ImmDB.UnexpectedError ue -> immDbUnexpectedError ue
_ -> Other
| Just (e' :: VolatileDBError) <- fromException e
= case e' of
VolDB.UnexpectedError ue -> volDbUnexpectedError ue
_ -> Other
| Just (e' :: FsError) <- fromException e
= fsError e'

| otherwise
= Other
where
immDbUnexpectedError :: ImmDB.UnexpectedError -> ExitReason
immDbUnexpectedError = \case
ImmDB.FileSystemError fe -> fsError fe
_ -> DatabaseCorruption

volDbUnexpectedError :: VolDB.UnexpectedError -> ExitReason
volDbUnexpectedError = \case
VolDB.FileSystemError fe -> fsError fe
_ -> DatabaseCorruption

fsError :: FsError -> ExitReason
fsError FsError { fsErrorType } = case fsErrorType of
FsDeviceFull -> DiskFull
FsInsufficientPermissions -> InsufficientPermissions
_ -> DatabaseCorruption

0 comments on commit cc70032

Please sign in to comment.