diff --git a/ouroboros-consensus/ouroboros-consensus.cabal b/ouroboros-consensus/ouroboros-consensus.cabal index 60888177b77..5f039710bf7 100644 --- a/ouroboros-consensus/ouroboros-consensus.cabal +++ b/ouroboros-consensus/ouroboros-consensus.cabal @@ -91,7 +91,7 @@ library Ouroboros.Consensus.Node Ouroboros.Consensus.Node.DbMarker Ouroboros.Consensus.Node.ErrorPolicy - Ouroboros.Consensus.Node.ExitFailure + Ouroboros.Consensus.Node.Exit Ouroboros.Consensus.Node.ProtocolInfo Ouroboros.Consensus.Node.ProtocolInfo.Abstract Ouroboros.Consensus.Node.ProtocolInfo.Byron @@ -99,6 +99,7 @@ library Ouroboros.Consensus.Node.ProtocolInfo.Mock.PBFT Ouroboros.Consensus.Node.ProtocolInfo.Mock.Praos Ouroboros.Consensus.Node.ProtocolInfo.Mock.PraosRule + Ouroboros.Consensus.Node.Recovery Ouroboros.Consensus.Node.Run Ouroboros.Consensus.Node.Run.Abstract Ouroboros.Consensus.Node.Run.Byron diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs index b58d4bd63ba..5e67a9f4055 100644 --- a/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs +++ b/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs @@ -33,6 +33,7 @@ module Ouroboros.Consensus.Node ) where import Codec.Serialise (DeserialiseFailure) +import Control.Monad (when) import Control.Tracer (Tracer) import Crypto.Random import Data.ByteString.Lazy (ByteString) @@ -57,6 +58,7 @@ import Ouroboros.Consensus.Ledger.Extended (ExtLedgerState (..)) import Ouroboros.Consensus.Node.DbMarker import Ouroboros.Consensus.Node.ErrorPolicy import Ouroboros.Consensus.Node.ProtocolInfo +import Ouroboros.Consensus.Node.Recovery import Ouroboros.Consensus.Node.Run import Ouroboros.Consensus.Node.Tracers import Ouroboros.Consensus.NodeKernel @@ -111,9 +113,8 @@ run run tracers protocolTracers chainDbTracer diffusionTracers diffusionArguments networkMagic dbPath pInfo isProducer customiseChainDbArgs customiseNodeArgs onNodeKernel = do - let mountPoint = MountPoint dbPath either throwM return =<< checkDbMarker - (ioHasFS mountPoint) + hasFS mountPoint (nodeProtocolMagicId (Proxy @blk) cfg) withRegistry $ \registry -> do @@ -125,59 +126,58 @@ run tracers protocolTracers chainDbTracer diffusionTracers diffusionArguments (nodeStartTime (Proxy @blk) cfg) (focusSlotLengths slotLengths) - (_, chainDB) <- allocate registry - (\_ -> openChainDB - chainDbTracer registry btime dbPath cfg initLedger - customiseChainDbArgs) - ChainDB.closeDB + -- When we shut down cleanly, we create a marker file so that the next + -- time we start, we know we don't have to validate the contents of the + -- whole ChainDB. When we shut down with an exception indicating + -- corruption or something going wrong with the file system, we don't + -- create this marker file so that the next time we start, we do a full + -- validation. + lastShutDownWasClean <- hasCleanShutdownMarker hasFS + when lastShutDownWasClean $ removeCleanShutdownMarker hasFS + let customiseChainDbArgs' args + | lastShutDownWasClean + = customiseChainDbArgs args + | otherwise + -- When the last shutdown was not clean, validate the complete + -- ChainDB to detect and recover from any corruptions. This will + -- override the default value /and/ the user-customised value of + -- the 'ChainDB.cdbValidation' field. + = (customiseChainDbArgs args) + { ChainDB.cdbValidation = ValidateAllEpochs } - let nodeArgs = customiseNodeArgs $ mkNodeArgs - registry - cfg - initState - tracers - btime - chainDB - isProducer + -- On a clean shutdown, create a marker in the database folder so that + -- next time we start up, we know we don't have to validate the whole + -- database. + createMarkerOnCleanShutdown hasFS $ do - nodeKernel <- initNodeKernel nodeArgs - onNodeKernel registry nodeKernel - let networkApps :: NetworkApplication - IO ConnectionId - ByteString ByteString ByteString ByteString ByteString ByteString - () - networkApps = consensusNetworkApps - nodeKernel - protocolTracers - (protocolCodecs (getNodeConfig nodeKernel)) - (protocolHandlers nodeArgs nodeKernel) + (_, chainDB) <- allocate registry + (\_ -> openChainDB + chainDbTracer registry btime dbPath cfg initLedger + customiseChainDbArgs') + ChainDB.closeDB - diffusionApplications = DiffusionApplications - { daResponderApplication = - simpleSingletonVersions - NodeToNodeV_1 - nodeToNodeVersionData - (DictVersion nodeToNodeCodecCBORTerm) - (responderNetworkApplication networkApps) - , daInitiatorApplication = - simpleSingletonVersions - NodeToNodeV_1 - nodeToNodeVersionData - (DictVersion nodeToNodeCodecCBORTerm) - (initiatorNetworkApplication networkApps) - , daLocalResponderApplication = - simpleSingletonVersions - NodeToClientV_1 - nodeToClientVersionData - (DictVersion nodeToClientCodecCBORTerm) - (localResponderNetworkApplication networkApps) - , daErrorPolicies = consensusErrorPolicy - } + let nodeArgs = customiseNodeArgs $ mkNodeArgs + registry + cfg + initState + tracers + btime + chainDB + isProducer - runDataDiffusion diffusionTracers - diffusionArguments - diffusionApplications + nodeKernel <- initNodeKernel nodeArgs + onNodeKernel registry nodeKernel + + let networkApps = mkNetworkApps nodeArgs nodeKernel + diffusionApplications = mkDiffusionApplications networkApps + + runDataDiffusion diffusionTracers + diffusionArguments + diffusionApplications where + mountPoint = MountPoint dbPath + hasFS = ioHasFS mountPoint + ProtocolInfo { pInfoConfig = cfg , pInfoInitLedger = initLedger @@ -189,6 +189,41 @@ run tracers protocolTracers chainDbTracer diffusionTracers diffusionArguments nodeToNodeVersionData = NodeToNodeVersionData { networkMagic = networkMagic } nodeToClientVersionData = NodeToClientVersionData { networkMagic = networkMagic } + mkNetworkApps + :: NodeArgs IO ConnectionId blk + -> NodeKernel IO ConnectionId blk + -> NetworkApplication + IO ConnectionId + ByteString ByteString ByteString ByteString ByteString ByteString + () + mkNetworkApps nodeArgs nodeKernel = consensusNetworkApps + nodeKernel + protocolTracers + (protocolCodecs (getNodeConfig nodeKernel)) + (protocolHandlers nodeArgs nodeKernel) + + mkDiffusionApplications networkApps = DiffusionApplications + { daResponderApplication = + simpleSingletonVersions + NodeToNodeV_1 + nodeToNodeVersionData + (DictVersion nodeToNodeCodecCBORTerm) + (responderNetworkApplication networkApps) + , daInitiatorApplication = + simpleSingletonVersions + NodeToNodeV_1 + nodeToNodeVersionData + (DictVersion nodeToNodeCodecCBORTerm) + (initiatorNetworkApplication networkApps) + , daLocalResponderApplication = + simpleSingletonVersions + NodeToClientV_1 + nodeToClientVersionData + (DictVersion nodeToClientCodecCBORTerm) + (localResponderNetworkApplication networkApps) + , daErrorPolicies = consensusErrorPolicy + } + openChainDB :: forall blk. RunNode blk => Tracer IO (ChainDB.TraceEvent blk) diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node/Exit.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Exit.hs new file mode 100644 index 00000000000..c0d617c8e59 --- /dev/null +++ b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Exit.hs @@ -0,0 +1,152 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE ScopedTypeVariables #-} +module Ouroboros.Consensus.Node.Exit + ( -- * ExitFailure + ExitFailure + , exitReasontoExitFailure + -- * ExitReason + , ExitReason (..) + , toExitReason + ) where + +import Control.Exception (AsyncException (..), SomeException, + fromException) + +import Control.Monad.Class.MonadAsync (ExceptionInLinkedThread (..)) + +import Ouroboros.Storage.ChainDB.API (ChainDbFailure (..)) +import Ouroboros.Storage.FS.API.Types (FsError (..), FsErrorType (..)) +import Ouroboros.Storage.ImmutableDB.Types (ImmutableDBError) +import qualified Ouroboros.Storage.ImmutableDB.Types as ImmDB +import Ouroboros.Storage.VolatileDB.Types (VolatileDBError) +import qualified Ouroboros.Storage.VolatileDB.Types as VolDB + +import Ouroboros.Consensus.Node.DbMarker (DbMarkerError) +import Ouroboros.Consensus.Node.ProtocolInfo.Byron + (PBftLeaderCredentialsError) + +{------------------------------------------------------------------------------- + ExitFailure +-------------------------------------------------------------------------------} + +-- | The exit code to return when terminating with an exception. +-- +-- To be used in the @ExitFailure@ constructor of 'System.Exit.ExitCode'. +-- +-- Note that a node will never turn shut down itself, it is meant to run +-- forever, so it will always terminate with an 'ExitFailure'. +type ExitFailure = Int + +-- | Convert an 'ExitReason' to an 'ExitFailure'. +exitReasontoExitFailure :: ExitReason -> ExitFailure +exitReasontoExitFailure = \case + -- Some action should be taken before restarting in the cases below. + ConfigurationError -> 3 + WrongDatabase -> 4 + DiskFull -> 5 + InsufficientPermissions -> 6 + NoNetwork -> 7 + + -- The node can simply be restarted in the cases below. + -- + -- NOTE: Database corruption is handled automically: when the node is + -- restarted, it will do a full validation pass. + Killed -> 1 + DatabaseCorruption -> 2 + Other -> 2 + +{------------------------------------------------------------------------------- + ExitReason +-------------------------------------------------------------------------------} + +-- | The reason of shutting down +data ExitReason = + -- | The node process was killed, by the @kill@ command, @CTRL-C@ or some + -- other means. This is normal way for a user to terminate the node + -- process. The node can simply be restarted. + Killed + + -- | Something is wrong with the node configuration, the user should check it. + -- + -- For example, for PBFT, it could be that the block signing key and the + -- delegation certificate do not match. + | ConfigurationError + + -- | We were unable to open the database, probably the user is using the + -- wrong directory. See 'DbMarkerError' for details. + | WrongDatabase + + -- | The disk is full, make some space before restarting the node. + | DiskFull + + -- | The database folder doesn't have the right permissions. + | InsufficientPermissions + + -- | There is a problem with the network connection, the user should + -- investigate. + -- + -- TODO We're not yet returning this. + | NoNetwork + + -- | Something went wrong with the database, restart the node with + -- recovery enabled. + | DatabaseCorruption + + -- | Some exception was thrown. The node should just be restarted. + | Other + +-- | Return the 'ExitReason' for the given 'SomeException'. Defaults to +-- 'Other'. +toExitReason :: SomeException -> ExitReason +toExitReason e + | Just (e' :: AsyncException) <- fromException e + = case e' of + ThreadKilled -> Killed + UserInterrupt -> Killed + _ -> Other + + | Just (ExceptionInLinkedThread _ e') <- fromException e + = toExitReason e' + | Just (_ :: DbMarkerError) <- fromException e + = WrongDatabase + | Just (e' :: ChainDbFailure) <- fromException e + = case e' of + ImmDbFailure ue -> immDbUnexpectedError ue + VolDbFailure ue -> volDbUnexpectedError ue + LgrDbFailure fe -> fsError fe + _ -> DatabaseCorruption + | Just (_ :: PBftLeaderCredentialsError) <- fromException e + = ConfigurationError + + -- The three exceptions below will always be wrapped in a + -- 'ChainDbFailure', but we include them just in case. + | Just (e' :: ImmutableDBError) <- fromException e + = case e' of + ImmDB.UnexpectedError ue -> immDbUnexpectedError ue + _ -> Other + | Just (e' :: VolatileDBError) <- fromException e + = case e' of + VolDB.UnexpectedError ue -> volDbUnexpectedError ue + _ -> Other + | Just (e' :: FsError) <- fromException e + = fsError e' + + | otherwise + = Other + where + immDbUnexpectedError :: ImmDB.UnexpectedError -> ExitReason + immDbUnexpectedError = \case + ImmDB.FileSystemError fe -> fsError fe + _ -> DatabaseCorruption + + volDbUnexpectedError :: VolDB.UnexpectedError -> ExitReason + volDbUnexpectedError = \case + VolDB.FileSystemError fe -> fsError fe + _ -> DatabaseCorruption + + fsError :: FsError -> ExitReason + fsError FsError { fsErrorType } = case fsErrorType of + FsDeviceFull -> DiskFull + FsInsufficientPermissions -> InsufficientPermissions + _ -> DatabaseCorruption diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node/ExitFailure.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node/ExitFailure.hs deleted file mode 100644 index 2bff67bd475..00000000000 --- a/ouroboros-consensus/src/Ouroboros/Consensus/Node/ExitFailure.hs +++ /dev/null @@ -1,127 +0,0 @@ -{-# LANGUAGE LambdaCase #-} -{-# LANGUAGE NamedFieldPuns #-} -{-# LANGUAGE ScopedTypeVariables #-} -module Ouroboros.Consensus.Node.ExitFailure - ( -- * Pre-defined ExitFailures - ExitFailure - , defaultExitFailure - , configurationError - , noNetwork - , restartWithRecovery - , wrongDatabase - , diskFull - , insufficientPermissions - -- * Get the ExitFailure of an Exception - , toExitFailure - ) where - -import Control.Exception (SomeException, fromException) - -import Control.Monad.Class.MonadAsync (ExceptionInLinkedThread (..)) - -import Ouroboros.Storage.ChainDB.API (ChainDbFailure (..)) -import Ouroboros.Storage.FS.API.Types (FsError (..), FsErrorType (..)) -import Ouroboros.Storage.ImmutableDB.Types (ImmutableDBError) -import qualified Ouroboros.Storage.ImmutableDB.Types as ImmDB -import Ouroboros.Storage.VolatileDB.Types (VolatileDBError) -import qualified Ouroboros.Storage.VolatileDB.Types as VolDB - -import Ouroboros.Consensus.Node.DbMarker (DbMarkerError) -import Ouroboros.Consensus.Node.ProtocolInfo.Byron - (PBftLeaderCredentialsError) - -{------------------------------------------------------------------------------- - Pre-defined ExitFailures --------------------------------------------------------------------------------} - -type ExitFailure = Int - --- | Something went wrong, just restart the node. -defaultExitFailure :: ExitFailure -defaultExitFailure = 1 - --- | Something is wrong with the node configuration, the user should check it. --- --- For example, for PBFT, it could be that the block signing key and the --- delegation certificate do not match. -configurationError :: ExitFailure -configurationError = 2 - --- | There is a problem with the network connection, the user should --- investigate. --- --- TODO We're not yet returning this. -noNetwork :: ExitFailure -noNetwork = 3 - --- | Something went wrong with the database, restart the node with recovery --- enabled. -restartWithRecovery :: ExitFailure -restartWithRecovery = 4 - --- | We were unable to open the database, probably the user is using the wrong --- directory. See 'DbMarkerError' for details. -wrongDatabase :: ExitFailure -wrongDatabase = 5 - --- | The disk is full, make some space before restarting the node. -diskFull :: ExitFailure -diskFull = 6 - --- | The database folder doesn't have the right permissions. -insufficientPermissions :: ExitFailure -insufficientPermissions = 7 - -{------------------------------------------------------------------------------- - Get the ExitFailure of an Exception --------------------------------------------------------------------------------} - --- | Return the 'ExitFailure' (to be used in the @ExitFailure@ constructor of --- 'System.Exit.ExitCode') for the given 'SomeException'. Defaults to --- 'defaultExitFailure'. -toExitFailure :: SomeException -> ExitFailure -toExitFailure e - | Just (ExceptionInLinkedThread _ e') <- fromException e - = toExitFailure e' - | Just (_ :: DbMarkerError) <- fromException e - = wrongDatabase - | Just (e' :: ChainDbFailure) <- fromException e - = case e' of - ImmDbFailure ue -> immDbUnexpectedError ue - VolDbFailure ue -> volDbUnexpectedError ue - LgrDbFailure fe -> fsError fe - _ -> restartWithRecovery - | Just (_ :: PBftLeaderCredentialsError) <- fromException e - = configurationError - - -- The three exceptions below will always be wrapped in a - -- 'ChainDbFailure', but we include them just in case. - | Just (e' :: ImmutableDBError) <- fromException e - = case e' of - ImmDB.UnexpectedError ue -> immDbUnexpectedError ue - _ -> defaultExitFailure - | Just (e' :: VolatileDBError) <- fromException e - = case e' of - VolDB.UnexpectedError ue -> volDbUnexpectedError ue - _ -> defaultExitFailure - | Just (e' :: FsError) <- fromException e - = fsError e' - - | otherwise - = defaultExitFailure - where - immDbUnexpectedError :: ImmDB.UnexpectedError -> ExitFailure - immDbUnexpectedError = \case - ImmDB.FileSystemError fe -> fsError fe - _ -> restartWithRecovery - - volDbUnexpectedError :: VolDB.UnexpectedError -> ExitFailure - volDbUnexpectedError = \case - VolDB.FileSystemError fe -> fsError fe - _ -> restartWithRecovery - - fsError :: FsError -> ExitFailure - fsError FsError { fsErrorType } = case fsErrorType of - FsDeviceFull -> diskFull - FsInsufficientPermissions -> insufficientPermissions - _ -> restartWithRecovery diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs new file mode 100644 index 00000000000..e22ad29525f --- /dev/null +++ b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs @@ -0,0 +1,92 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +module Ouroboros.Consensus.Node.Recovery + ( createMarkerOnCleanShutdown + , hasCleanShutdownMarker + , createCleanShutdownMarker + , removeCleanShutdownMarker + ) where + +import Control.Exception (SomeException) +import Control.Monad (unless, when) + +import Ouroboros.Consensus.Node.Exit (ExitReason (..), toExitReason) +import Ouroboros.Consensus.Util.IOLike + +import Ouroboros.Storage.FS.API (HasFS, doesFileExist, removeFile, + withFile) +import Ouroboros.Storage.FS.API.Types (AllowExisting (..), FsPath, + OpenMode (..), mkFsPath) + +-- | The path to the /clean shutdown marker file/. +cleanShutdownMarkerFile :: FsPath +cleanShutdownMarkerFile = mkFsPath ["clean"] + +-- | When the given action terminates with a /clean/ exception, create the +-- /clean shutdown marker file/. +-- +-- NOTE: we assume the action (i.e., the node itself) never terminates without +-- an exception. +-- +-- A /clean/ exception is an exception for 'exceptionRequiresRecovery' returns +-- 'False'. +createMarkerOnCleanShutdown + :: IOLike m + => HasFS m h + -> m a -- ^ Action to run + -> m a +createMarkerOnCleanShutdown mp = onExceptionIf + (not . exceptionRequiresRecovery) + (createCleanShutdownMarker mp) + +-- | Return 'True' when 'cleanShutdownMarkerFile' exists. +hasCleanShutdownMarker + :: IOLike m + => HasFS m h + -> m Bool +hasCleanShutdownMarker hasFS = + doesFileExist hasFS cleanShutdownMarkerFile + +-- | Create the 'cleanShutdownMarkerFile'. +-- +-- Idempotent. +createCleanShutdownMarker + :: IOLike m + => HasFS m h + -> m () +createCleanShutdownMarker hasFS = do + alreadyExists <- hasCleanShutdownMarker hasFS + unless alreadyExists $ + withFile hasFS cleanShutdownMarkerFile (WriteMode MustBeNew) $ \_h -> + return () + +-- | Remove 'cleanShutdownMarkerFile'. +-- +-- Will throw an 'FsResourceDoesNotExist' error when it does not exist. +removeCleanShutdownMarker + :: IOLike m + => HasFS m h + -> m () +removeCleanShutdownMarker hasFS = + removeFile hasFS cleanShutdownMarkerFile + +-- | Return 'True' if the given exception indicates that recovery of the +-- database is required on the next startup. +exceptionRequiresRecovery :: SomeException -> Bool +exceptionRequiresRecovery e = case toExitReason e of + DatabaseCorruption -> True + _ -> False + +{------------------------------------------------------------------------------- + Auxiliary +-------------------------------------------------------------------------------} + +onExceptionIf + :: (IOLike m, Exception e) + => (e -> Bool) -- ^ Predicate to selection exceptions + -> m () -- ^ Exception handler + -> m a + -> m a +onExceptionIf p h m = m `catch` \e -> do + when (p e) h + throwM e