From 728a5298a4ea2c6d9a924ed42532a682c1af0652 Mon Sep 17 00:00:00 2001
From: Federico Mastellone
Date: Thu, 18 May 2023 11:36:39 +0000
Subject: [PATCH] workbench: WIP: add a backend-agnostic healthcheck service

---
 nix/workbench/backend/nomad-job.nix       |  38 +-
 nix/workbench/backend/nomad.nix           |  23 +-
 nix/workbench/backend/nomad.sh            | 138 +++++-
 nix/workbench/backend/supervisor-conf.nix |  18 +
 nix/workbench/backend/supervisor.sh       |   6 +-
 nix/workbench/profile/profile.nix         |  20 +-
 nix/workbench/run.sh                      |   5 +
 nix/workbench/service/healthcheck.nix     | 526 +++++++++++++++++++++-
 8 files changed, 754 insertions(+), 20 deletions(-)

diff --git a/nix/workbench/backend/nomad-job.nix b/nix/workbench/backend/nomad-job.nix
index b15a2996ad2..2c0092a09dc 100644
--- a/nix/workbench/backend/nomad-job.nix
+++ b/nix/workbench/backend/nomad-job.nix
@@ -370,9 +370,11 @@ let
           # - host: Advertise the host port for this service. port must match
           #   a port label specified in the network block.
           port = portName;
-          # TODO: Use it to heartbeat with cardano-ping!!!
+          # Checks of type "script" need "consul" instead of "nomad" as the
+          # service provider, so as a healthcheck we use a supervisord
+          # "program" instead.
           # https://developer.hashicorp.com/nomad/docs/job-specification/check
-          # check = {};
+          check = null;
         };

         # Specifies the set of templates to render for the task. Templates can
@@ -405,6 +407,24 @@ let
             change_mode = "noop";
             error_on_missing_key = true;
           }
+          ## Make the profile.json file available (mainly for healthchecks)
+          {
+            env = false;
+            destination = "${task_statedir}/profile.json";
+            data = escapeTemplate (__readFile
+              profileData.JSON.outPath);
+            change_mode = "noop";
+            error_on_missing_key = true;
+          }
+          ## Make the node-specs.json file available (mainly for healthchecks)
+          {
+            env = false;
+            destination = "${task_statedir}/node-specs.json";
+            data = escapeTemplate (__readFile
+              profileData.node-specs.JSON.outPath);
+            change_mode = "noop";
+            error_on_missing_key = true;
+          }
           # entrypoint
           {
             env = false;
@@ -587,6 +607,20 @@ let
               error_on_missing_key = true;
             }
           ])
+          ++
+          # healthcheck
+          [
+            ## healthcheck start.sh script.
+            {
+              env = false;
+              destination = "${task_statedir}/healthcheck/start.sh";
+              data = escapeTemplate
+                profileData.healthcheck-service.startupScript.value;
+              change_mode = "noop";
+              error_on_missing_key = true;
+              perms = "744"; # Only for "start.sh" scripts.
Default: "644" + } + ] ; # Specifies logging configuration for the stdout and stderr of the diff --git a/nix/workbench/backend/nomad.nix b/nix/workbench/backend/nomad.nix index d2bf9b7b775..e9c2ce24159 100644 --- a/nix/workbench/backend/nomad.nix +++ b/nix/workbench/backend/nomad.nix @@ -69,7 +69,22 @@ let flake-output = "legacyPackages.x86_64-linux.python3Packages.supervisor"; installable = "${flake-reference}/${gitrev}#${flake-output}"; }; - # TODO: profileData.node-services."node-0".serviceConfig.value.eventlog + gnugrep = rec { + nix-store-path = pkgs.gnugrep; + flake-reference = "github:input-output-hk/cardano-node"; + flake-output = "legacyPackages.x86_64-linux.gnugrep"; + installable = "${flake-reference}/${gitrev}#${flake-output}"; + }; + jq = rec { + nix-store-path = pkgs.jq; + flake-reference = "github:input-output-hk/cardano-node"; + flake-output = "legacyPackages.x86_64-linux.jq"; + installable = "${flake-reference}/${gitrev}#${flake-output}"; + }; + # TODO: - cardano-node.passthru.profiled + # - cardano-node.passthru.eventlogged + # - cardano-node.passthru.asserted + # profileData.node-services."node-0".serviceConfig.value.eventlog # builtins.trace (builtins.attrNames profileData.node-services."node-0".serviceConfig.value.eventlog) XXXX cardano-node = rec { nix-store-path = with pkgs; @@ -85,6 +100,12 @@ let ; installable = "${flake-reference}/${gitrev}#${flake-output}"; }; + cardano-cli = rec { + nix-store-path = pkgs.cardanoNodePackages.cardano-cli; + flake-reference = "github:input-output-hk/cardano-cli"; + flake-output = "cardanoNodePackages.cardano-cli"; + installable = "${flake-reference}/${gitrev}#${flake-output}"; + }; cardano-tracer = rec { nix-store-path = pkgs.cardanoNodePackages.cardano-tracer; flake-reference = "github:input-output-hk/cardano-node"; diff --git a/nix/workbench/backend/nomad.sh b/nix/workbench/backend/nomad.sh index 6d97b2776f9..323241cf147 100644 --- a/nix/workbench/backend/nomad.sh +++ b/nix/workbench/backend/nomad.sh @@ -69,11 +69,12 @@ backend_nomad() { * ) break;; esac; shift; done # Create the dispatcher's local directories hierarchy. - backend_nomad allocate-run-directory-nomad "${dir}" - backend_nomad allocate-run-directory-supervisor "${dir}" - backend_nomad allocate-run-directory-nodes "${dir}" - backend_nomad allocate-run-directory-generator "${dir}" - backend_nomad allocate-run-directory-tracers "${dir}" + backend_nomad allocate-run-directory-nomad "${dir}" + backend_nomad allocate-run-directory-supervisor "${dir}" + backend_nomad allocate-run-directory-nodes "${dir}" + backend_nomad allocate-run-directory-generator "${dir}" + backend_nomad allocate-run-directory-tracers "${dir}" + backend_nomad allocate-run-directory-healthcheck "${dir}" # These ones are decided at "setenv-defaults" of each sub-backend. local nomad_environment=$(envjqr 'nomad_environment') @@ -187,12 +188,28 @@ backend_nomad() { # FIXME: Looks like I'm not using these ones!!! 
#cp $(jq '."tracer-config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/tracer-config.json #cp $(jq '."service-config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/service-config.json - cp $(jq '."config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/config.json - cp $(jq '."start"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/start.sh + cp $(jq '."config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/config.json + cp $(jq '."start"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/start.sh fi fi ;; + allocate-run-directory-healthcheck ) + local usage="USAGE: wb backend $op RUN-DIR" + local dir=${1:?$usage}; shift + mkdir "${dir}"/healthcheck + # For every node ... + local nodes=($(jq_tolist keys "${dir}"/node-specs.json)) + for node in ${nodes[*]} + do + # Files "start.sh" and "topology.sh" that usually go in here are copied + # from the Task/container once it's started because the contents are + # created or patched using Nomad's "template" stanza in the job spec + # and we want to hold a copy of what was actually run. + mkdir "${dir}"/healthcheck/"${node}" + done + ;; + # Change the Nomad job name to the current run tag. This allows to run # multiple clusters simulatenously (as long as the network isolation mode # and/or topology.json allows no port clashing) @@ -447,7 +464,14 @@ backend_nomad() { backend_nomad download-config-tracer "${dir}" "tracer" & jobs_array+=("$!") fi - + # For every node ... + local nodes=($(jq_tolist keys "$dir"/node-specs.json)) + for node in ${nodes[*]} + do + # Only used for debugging! + backend_nomad download-config-healthcheck "${dir}" "${node}" & + jobs_array+=("$!") + done # Wait and check! if test -n "${jobs_array}" then @@ -803,6 +827,19 @@ backend_nomad() { # TODO: Make it in parallel ? msg "Fetch logs ..." + # Download healthcheck(s) logs. + ############################### + # Remove "live" symlinks before downloading the "originals" + if test "${nomad_environment}" != "cloud" + then + rm -f "${dir}"/healthcheck/{stdout,stderr,exit_code} + fi + # Download retry "infinite" loop. + while ! backend_nomad download-logs-healthcheck "${dir}" "node-0" + do + msg "Retrying \"healthcheck\" logs download" + done + msg "$(green "Finished downloading \"healthcheck\" logs")" # Download generator logs. ########################## # Remove "live" symlinks before downloading the "originals" @@ -1198,6 +1235,12 @@ backend_nomad() { # It was "intentionally started and should not automagically stop" flag! touch "${dir}"/generator/started fi + + # TODO ######################################## + # Still need to be decided who starts the healthchecks + # My best are on `scenario.sh` + backend_nomad start-healthcheck "${dir}" node-0 + ############################################### ;; # Called by "start" that has no exit trap, don't use fatal here! @@ -1329,6 +1372,34 @@ backend_nomad() { fi ;; + # TODO ################################################# + # Called by ... ??? #################################### + ######################################################## + start-healthcheck ) # Nomad backend specific subcommands + local usage="USAGE: wb backend $op RUN-DIR TASK" + local dir=${1:?$usage}; shift + local task=${1:?$usage}; shift + + if ! 
+      if ! backend_nomad task-program-start "${dir}" "${task}" healthcheck
+      then
+        msg "$(yellow "healthcheck of \"${task}\" startup failed!")"
+      else
+        local nomad_environment=$(envjqr 'nomad_environment')
+        if test "${nomad_environment}" != "cloud"
+        then
+          ln -s \
+            ../../nomad/alloc/"${task}"/local/run/current/healthcheck/stdout \
+            "${dir}"/healthcheck/"${task}"/stdout
+          ln -s \
+            ../../nomad/alloc/"${task}"/local/run/current/healthcheck/stderr \
+            "${dir}"/healthcheck/"${task}"/stderr
+          ln -s \
+            ../../nomad/alloc/"${task}"/local/run/current/healthcheck/exit_code \
+            "${dir}"/healthcheck/"${task}"/exit_code
+        fi
+      fi
+      ;;
+
    # Called by "start-node" that has no exit trap, don't use fatal here!
    wait-node )
      local usage="USAGE: wb backend $op RUN-DIR [NODE-NAME]"
@@ -1708,6 +1779,34 @@ backend_nomad() {
      echo "${array[@]}"
      ;;

+    # For debugging when something fails, downloads and prints details!
+    download-logs-healthcheck )
+      local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME"
+      local dir=${1:?$usage};  shift
+      local task=${1:?$usage}; shift
+      local download_ok="true"
+      # Should show the output/log of `supervisord` (runs as "entrypoint").
+      msg "$(blue Fetching) $(yellow "entrypoint's stdout and stderr") of Nomad $(yellow "Task \"${task}\"") ..."
+      backend_nomad task-entrypoint-stdout "${dir}" "${task}" \
+      > "${dir}"/nomad/"${task}"/stdout \
+      || download_ok="false"
+      backend_nomad task-entrypoint-stderr "${dir}" "${task}" \
+      > "${dir}"/nomad/"${task}"/stderr \
+      || download_ok="false"
+      # Downloads the "exit_code", "stdout" and "stderr" files.
+      # Depending on when the start command failed, logs may not be available!
+      backend_nomad download-zstd-healthcheck "${dir}" "${task}" \
+      || download_ok="false"
+      # Return
+      if test "${download_ok}" = "false"
+      then
+        msg "$(red "Failed to download \"healthcheck\" run files from \"${task}\"")"
+        return 1
+      else
+        return 0
+      fi
+      ;;
+
    # For debugging when something fails, downloads and prints details!
    download-logs-generator )
      local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME"
@@ -1861,6 +1960,20 @@ backend_nomad() {
      fi
      ;;

+    download-zstd-healthcheck )
+      local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME"
+      local dir=${1:?$usage};  shift
+      local task=${1:?$usage}; shift
+
+      msg "$(blue Fetching) $(yellow "\"healthcheck\"") run files from Nomad $(yellow "Task \"${task}\"") ..."
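+      # Illustrative sketch only (the helper name below is real, its body is
+      # not part of this patch): it is expected to write an uncompressed tar
+      # stream of the program's run files to stdout, roughly what running
+      # this inside the Task would produce:
+      #   tar --create run/current/healthcheck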
+ # TODO: Add compression, either "--zstd" or "--xz" + backend_nomad task-exec-program-run-files-tar-zstd \ + "${dir}" "${task}" "healthcheck" \ + | tar --extract \ + --directory="${dir}"/healthcheck/ --file=- \ + --no-same-owner --no-same-permissions + ;; + download-zstd-generator ) local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME" local dir=${1:?$usage}; shift @@ -2056,6 +2169,15 @@ backend_nomad() { fi ;; + download-config-healthcheck ) + local usage="USAGE: wb backend pass $op RUN-DIR NODE-NAME" + local dir=${1:?$usage}; shift + local node=${1:?$usage}; shift + backend_nomad task-file-contents "${dir}" "${node}" \ + /local/run/current/healthcheck/start.sh \ + > "${dir}"/healthcheck/"${node}"/start.sh + ;; + ## Nomad job tasks supervisord queries ###################################### diff --git a/nix/workbench/backend/supervisor-conf.nix b/nix/workbench/backend/supervisor-conf.nix index e5e23f6f0fd..e34595cce1f 100644 --- a/nix/workbench/backend/supervisor-conf.nix +++ b/nix/workbench/backend/supervisor-conf.nix @@ -136,6 +136,24 @@ let startsecs = 5; }; } + // + { + "program:healthcheck" = { + # "command" below assumes "directory" is set accordingly. + directory = "${stateDir}/healthcheck"; + command = "${command}"; + stdout_logfile = "${stateDir}/healthcheck/stdout"; + stderr_logfile = "${stateDir}/healthcheck/stderr"; + stopasgroup = false; + killasgroup = false; + autostart = false; + autorestart = false; + # Don't attempt any restart! + startretries = 0; + # Seconds it needs to stay running to consider the start successful + startsecs = 5; + }; + } ; in { diff --git a/nix/workbench/backend/supervisor.sh b/nix/workbench/backend/supervisor.sh index 8d4df926f7b..ab7db28e84f 100755 --- a/nix/workbench/backend/supervisor.sh +++ b/nix/workbench/backend/supervisor.sh @@ -205,7 +205,11 @@ EOF echo "$(white -------------------------------------------------)" >&2 fatal "could not start $(white supervisord)" fi - backend_supervisor save-child-pids "$dir";; + backend_supervisor save-child-pids "$dir" + if ! supervisorctl start healthcheck + then + msg "$(red "supervisorctl start healthcheck failed")" + fi;; wait-node-stopped ) local usage="USAGE: wb backend $op RUN-DIR NODE" diff --git a/nix/workbench/profile/profile.nix b/nix/workbench/profile/profile.nix index 62beb6591f6..61359402bf6 100644 --- a/nix/workbench/profile/profile.nix +++ b/nix/workbench/profile/profile.nix @@ -51,6 +51,15 @@ rec { inherit runJq nodeSpecs; }) tracer-service; + + inherit + (pkgs.callPackage + ../service/healthcheck.nix + { + inherit backend profile; + inherit runJq nodeSpecs; + }) + healthcheck-service; }; ## WARNING: IFD !! 
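A minimal sketch (assuming the run-directory layout created by run.sh below)
of what the serialized healthcheck-service.json ends up containing; the
"start" value is whatever store path writeScript produced:

    $ jq . run/current/profile/healthcheck-service.json
    {
      "name": "healthcheck",
      "start": "/nix/store/...-startup-healthcheck.sh"
    }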
@@ -96,7 +105,8 @@ rec { }) node-services generator-service - tracer-service; + tracer-service + healthcheck-service; }; profileData = { profile }: @@ -136,11 +146,18 @@ rec { config = config.JSON; start = startupScript.JSON; }; + healthcheckService = + with profile.healthcheck-service; + __toJSON + { name = "healthcheck"; + start = startupScript.JSON; + }; passAsFile = [ "nodeServices" "generatorService" "tracerService" + "healthcheckService" "topologyJson" "topologyDot" ]; @@ -154,6 +171,7 @@ rec { cp $nodeServicesPath $out/node-services.json cp $generatorServicePath $out/generator-service.json cp $tracerServicePath $out/tracer-service.json + cp $healthcheckServicePath $out/healthcheck-service.json '' // profile; diff --git a/nix/workbench/run.sh b/nix/workbench/run.sh index 70d0e87ebd5..7cb1e0c67e8 100644 --- a/nix/workbench/run.sh +++ b/nix/workbench/run.sh @@ -1045,6 +1045,7 @@ run_instantiate_rundir_profile_services() { local svcs=$dir/profile/node-services.json local gtor=$dir/profile/generator-service.json local trac=$dir/profile/tracer-service.json + local hche=$dir/profile/healthcheck-service.json for node in $(jq_tolist 'keys' "$dir"/node-specs.json) do local node_dir="$dir"/$node @@ -1068,4 +1069,8 @@ run_instantiate_rundir_profile_services() { cp $(jq '."service-config"' -r $trac) "$trac_dir"/service-config.json cp $(jq '."config"' -r $trac) "$trac_dir"/config.json cp $(jq '."start"' -r $trac) "$trac_dir"/start.sh + + local hche_dir="$dir"/healthcheck + mkdir -p "$hche_dir" + cp $(jq '."start"' -r $hche) "$hche_dir"/start.sh } diff --git a/nix/workbench/service/healthcheck.nix b/nix/workbench/service/healthcheck.nix index 049289c8cb6..d065b2b5965 100644 --- a/nix/workbench/service/healthcheck.nix +++ b/nix/workbench/service/healthcheck.nix @@ -17,23 +17,535 @@ let bashInteractive = pkgs.bashInteractive; coreutils = pkgs.coreutils; supervisor = pkgs.supervisor; + grep = pkgs.gnugrep; + jq = pkgs.jq; + cardano-cli = pkgs.cardanoNodePackages.cardano-cli; in { startupScript = rec { JSON = pkgs.writeScript "startup-healthcheck.sh" value; + jqFilter = ""; + # Assumptions: + # - Command `date` and node's log use the same timezone! value = '' #!${bashInteractive}/bin/sh - # Store the entrypoint env vars for debugging purposes - ${coreutils}/bin/env > /local/healthcheck.env + NOMAD_DEBUG=1 - # Only needed for "exec" ? - if test "''${TASK_DRIVER}" = "exec" + # e: Immediately exit if any command has a non-zero exit status + # u: Reference to non previously defined variables is an error + # pipefail: Any failed command in a pipeline is used as return code + set -euo pipefail + + echo "$(${coreutils}/bin/date --rfc-3339=seconds): started" + + # Fetch every node name (Including "explorer" nodes) + nodes=$(${jq}/bin/jq --raw-output "keys | join (\" \")" ../node-specs.json) + pools=$(${jq}/bin/jq 'map(select(.kind == "pool")) | length' ../node-specs.json) + echo "Nodes: [''${nodes}]" + echo "Pools: ''${pools}" + + # Set the defaults! 
+      now=$(${coreutils}/bin/date +%s)
+      for node in ''${nodes[*]}
+      do
+        # Create a healthcheck directory inside every node directory
+        mkdir "../''${node}/healthcheck"
+        # TODO: Store the `+RTS --info`
+        # Save the starting time
+        echo "''${now}" > "../''${node}/healthcheck/start_time"
+      done
+
+      # Fetch profile parameters
+      network_magic=$(${jq}/bin/jq .genesis.network_magic ../profile.json)
+      slot_duration=$(${jq}/bin/jq .genesis.slot_duration ../profile.json)
+      epoch_length=$(${jq}/bin/jq .genesis.epoch_length ../profile.json)
+      active_slots_coeff=$(${jq}/bin/jq .genesis.active_slots_coeff ../profile.json)
+      #active_slots="$((epoch_length * active_slots_coeff))"
+      echo "network_magic: ''${network_magic}"
+      echo "slot_duration: ''${slot_duration}"
+      echo "epoch_length: ''${epoch_length}"
+      echo "active_slots_coeff: ''${active_slots_coeff}"
+      #echo "active_slots: ''${active_slots}"
+
+      # The main function, called at the end of the file/script.
+      function healthcheck() {
+        # Start the healthcheck infinite loop
+        trap "" PIPE
+        while true
+        do
+          for node in ''${nodes[*]}
+          do
+            node_healthcheck "''${node}"
+          done
+          sleep 5 # Enough?
+        done
+        trap - PIPE
+      }
+
+      function node_healthcheck() {
+        local node=$1
+        local exit_code_path="../''${node}/exit_code"
+        # File exists and is a regular file?
+        if test -f "''${exit_code_path}"
+        then
+          # The node was started!
+          # File exists and has a size greater than zero?
+          if test -s "''${exit_code_path}"
+          then
+            # The node exited!
+            local exit_code
+            exit_code=$(cat "''${exit_code_path}")
+            # Node failed with non-zero exit code?
+            if test "''${exit_code}" != "0"
+            then
+              # The node exited with a non-zero exit code!
+              exit_healthcheck "Node \"''${node}\" exited with a non-zero exit code!"
+            else
+              # The node exited with a zero exit code!
+              echo "Node \"''${node}\" exited with a zero exit code!"
+            fi
+          else
+            # The node is running!
+            if is_node_synced_and_past_slot_zero "''${node}"
+            then
+              node_healthcheck_block "''${node}"
+              node_healthcheck_txs   "''${node}"
+            else
+              local now=$(${coreutils}/bin/date +%s)
+              local start_time=$(cat "../''${node}/healthcheck/start_time")
+              if test $((now - start_time)) -ge 300
+              then
+                exit_healthcheck "''${node}: More than 5m waiting for slot 0"
+              fi
+            fi
+          fi
+        fi
+      }
+
+      # {
+      #   "epoch": 0,
+      #   "era": "Babbage",
+      #   "slotInEpoch": 0,
+      #   "slotsToEpochEnd": 600,
+      #   "syncProgress": "100.00"
+      # }
+      # {
+      #   "block": 1,
+      #   "epoch": 0,
+      #   "era": "Babbage",
+      #   "hash": "9777....8833",
+      #   "slot": 22,
+      #   "slotInEpoch": 22,
+      #   "slotsToEpochEnd": 578,
+      #   "syncProgress": "100.00"
+      # }
+      function is_node_synced_and_past_slot_zero() {
+        local node=$1
+        local socket_path="../''${node}/node.socket"
+        local db_path="../''${node}/run/current/''${node}/db-testnet/"
+        # Socket and db folder exist!
+        if ! test -S "''${socket_path}" || ! test -d "''${db_path}"
+        then
+          false
+        else
+          local tip block slot
+          tip=$(${cardano-cli}/bin/cardano-cli query tip \
+            --testnet-magic "''${network_magic}" \
+            --socket-path "''${socket_path}" \
+          )
+          block=$(echo "''${tip}" | ${jq}/bin/jq .block)
+          slot=$(echo "''${tip}" | ${jq}/bin/jq .slot)
+          # Block greater than or equal to zero!
+          if test -n "''${block}" && test "''${block}" != "null" && test "''${block}" -ge 0
+          then
+            # Slot greater than zero!
+            if test -n "''${slot}" && test "''${slot}" != "null" && test "''${slot}" -gt 0
+            then
+              true
+            else
+              false
+            fi
+          else
+            false
+          fi
+        fi
+      }
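+
+      # The functions below share a watchdog pattern (illustrative sketch
+      # only; "node-0" stands for any node name): compare "now" against a
+      # stored or extracted Unix timestamp and fail after a threshold:
+      #   start_time=$(cat "../node-0/healthcheck/start_time")
+      #   test $(( $(date +%s) - start_time )) -ge 300 && exit 1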
+
+      function node_healthcheck_forge() {
+        local node=$1
+        local start_time
+        local now=$(${coreutils}/bin/date +%s)
+        local last_forged
+        last_forged=$(last_block_forged "''${node}" 2>&1)
+        if test -z "''${last_forged}"
+        then
+          start_time=$(cat "../''${node}/healthcheck/start_time")
+          if test $((now - start_time)) -ge 300
+          then
+            exit_healthcheck "''${node}: More than 5m with no blocks forged at all"
+          fi
+        else
+          echo "''${last_forged}" > "../''${node}/healthcheck/last_forged"
+          start_time=$(echo "''${last_forged}" | ${jq}/bin/jq .at)
+          if test $((now - start_time)) -ge 120
+          then
+            exit_healthcheck "''${node}: More than 2m with no newer blocks forged"
+          fi
+        fi
+      }
+
+      function node_healthcheck_block() {
+        local node=$1
+        local start_time
+        local now=$(${coreutils}/bin/date +%s)
+        local last_block
+        last_block=$(last_block_transmitted "''${node}" 2>&1)
+        if test -z "''${last_block}"
+        then
+          start_time=$(cat "../''${node}/healthcheck/start_time")
+          if test $((now - start_time)) -ge 300
+          then
+            exit_healthcheck "''${node}: More than 5m without a first block sent or received"
+          fi
+        else
+          echo "''${last_block}" > "../''${node}/healthcheck/last_block"
+          start_time=$(echo "''${last_block}" | ${jq}/bin/jq .at)
+          if test $((now - start_time)) -ge 60
+          then
+            exit_healthcheck "''${node}: More than 60s with no newer blocks sent or received\n''${last_block}"
+          fi
+        fi
+      }
+
+      function node_healthcheck_txs() {
+        local node=$1
+        local start_time
+        local now=$(${coreutils}/bin/date +%s)
+        local last_txs
+        last_txs=$(last_block_with_txs "''${node}" 2>&1)
+        if test -z "''${last_txs}"
+        then
+          start_time=$(cat "../''${node}/healthcheck/start_time")
+          if test $((now - start_time)) -ge 300
+          then
+            exit_healthcheck "''${node}: More than 5m without a first block with transactions"
+          fi
+        else
+          echo "''${last_txs}" > "../''${node}/healthcheck/last_txs"
+          start_time=$(echo "''${last_txs}" | ${jq}/bin/jq .at)
+          if test $((now - start_time)) -ge 60
+          then
+            exit_healthcheck "''${node}: More than 60s with no newer blocks with transactions\n''${last_txs}"
+          fi
+        fi
+      }
+
+      # We have txIds:
+      # 20 seconds for a new one, 3 minutes for the first one!
+      # Also check the block number to see if slots are going forward!
+      # A specific number of slots before looking for txIds?
+
+      # Helper/auxiliary functions!
+      ######################################################################
+
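+      # Usage sketch for the helpers below (illustrative only; "node-0"
+      # stands for any node name from node-specs.json):
+      #   last=$(last_block_forged "node-0" 2>&1)
+      #   at=$(echo "''${last}" | jq .at)  # Unix time of the newest match
+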
+      # Gets the last "matching" JSON message from a Node's stdout file.
+      #
+      # To do this the stdout file is reversed using `tac`, `jq` is applied
+      # and only the first answer is kept. To avoid reading the whole file
+      # (and assuming `tac` is efficient) `jq` is called with `--unbuffered`.
+      #
+      # Note that the log file is composed of one JSON object per line and
+      # we are using `--compact-output` to make sure every output log
+      # message matched by `jq` is contained within a line.
+      #
+      # Also, the node's stdout starts with some text echoed by the node's
+      # start.sh script that is not valid JSON, so we `grep` for
+      # "{"at": ... }". We still need to check the exit code of these
+      # functions just in case, because if one fails a query may have failed
+      # and the healthcheck should fail.
+      #
+      # Finally, the "at" time has format "2023-05-16 19:57:19.0231Z" and I
+      # can't parse it using any standard `date` format so I'm stripping the
+      # milliseconds part and converting to Unix time (Integer).
+      #
+      # To avoid "Error: writing output failed: Broken pipe!" we take the
+      # first match with `jq`'s `nth(0; inputs | ...)` instead of piping
+      # through `head`; `tail -n +1` between `jq` and `head` would also
+      # work, as it is smart enough to check standard output and close the
+      # pipe down cleanly.
+      #
+      # $1: node name
+      # $2: jq's query string
+      function jq_node_stdout_last() {
+        local node=$1
+        local select=$2
+        local stdout_path="../''${node}/stdout"
+        if ! ${coreutils}/bin/tac "''${stdout_path}" \
+             | \
+             ${grep}/bin/grep -E "^{\"at\":.*}$" \
+             | \
+             ${jq}/bin/jq \
+               --compact-output \
+               --unbuffered \
+               --null-input \
+               "nth(0; inputs | select(''${select}))" \
+             | \
+             ${jq}/bin/jq \
+               '
+                 .at
+                 |=
+                 (.[:20] | strptime("%Y-%m-%d %H:%M:%S.") | mktime)
+               '
+        then
+          exit_22 "stdout error: jq_node_stdout_last: ''${node}"
+        fi
+      }
+
+      # This one exits with "write error: Broken pipe"
+      function jq_node_stdout_last_v0() {
+        local node=$1
+        local select=$2
+        local stdout_path="../''${node}/stdout"
+        if ! ${coreutils}/bin/tac "''${stdout_path}" \
+             | \
+             ${grep}/bin/grep -E "^{\"at\":.*}$" \
+             | \
+             ${jq}/bin/jq --compact-output --unbuffered "''${select}" \
+             | \
+             ${coreutils}/bin/head --lines=1 \
+             | \
+             ${jq}/bin/jq \
+               '
+                 .at
+                 |=
+                 (.[:20] | strptime("%Y-%m-%d %H:%M:%S.") | mktime)
+               '
+        then
+          exit_22 "stdout error: jq_node_stdout_last_v0: ''${node}"
+        fi
+      }
+
+      # {
+      #   "at": "2023-05-16 15:02:46.0051Z",
+      #   "ns": "Forge.Loop.AdoptedBlock",
+      #   "data": {
+      #     "blockHash": "26ab7db1ca55e69885665f625262118ef7db99172be6d3591ad2a8e9bfeabffa",
+      #     "blockSize": 864,
+      #     "kind": "TraceAdoptedBlock",
+      #     "slot": 627
+      #   },
+      #   "sev": "Info",
+      #   "thread": "34",
+      #   "host": "ip-10-24-123-88.eu-central-1.compute.internal"
+      # }
+      function last_block_forged() {
+        local node=$1
+        if ! jq_node_stdout_last "''${node}" \
+          '
+            (.ns == "Forge.Loop.AdoptedBlock")
+            and
+            (.data.blockHash? != null)
+            and
+            (.data.kind? == "TraceAdoptedBlock")
+          '
+        then
+          exit_22 "jq error: last_block_forged: ''${node}"
+        fi
+      }
+
+      # Block sent:
+      # {
+      #   "at": "2023-05-17 16:00:57.0222Z",
+      #   "ns": "BlockFetch.Server.SendBlock",
+      #   "data": {
+      #     "block": "dde....414",
+      #     "kind": "BlockFetchServer"
+      #   },
+      #   "sev": "Info",
+      #   "thread": "67",
+      #   "host": "rapide"
+      # }
+      # Block received:
+      # {
+      #   "at": "2023-05-17 16:00:57.0246Z",
+      #   "ns": "BlockFetch.Remote.Receive.Block",
+      #   "data": {
+      #     "kind": "Recv",
+      #     "msg": {
+      #       "agency": "ServerAgency TokStreaming",
+      #       "blockHash": "dde....414",
+      #       "blockSize": 64334,
+      #       "kind": "MsgBlock",
+      #       "txIds": [
+      #         "60e....6ec",
+      #         ...
+      #         "95d....165"
+      #       ]
+      #     },
+      #     "peer": {
+      #       "connectionId": "127.0.0.1:37117 127.0.0.1:30000"
+      #     }
+      #   },
+      #   "sev": "Info",
+      #   "thread": "77",
+      #   "host": "rapide"
+      # }
+      function last_block_transmitted() {
+        local node=$1
+        if ! jq_node_stdout_last "''${node}" \
+          '
+            (
+              (.ns == "BlockFetch.Server.SendBlock")
+              and
+              (.data.block? != null)
+              and
+              (.data.kind? == "BlockFetchServer")
+            ) or (
+              (.ns == "BlockFetch.Remote.Receive.Block")
+              and
+              (.data.kind? == "Recv")
+              and
+              (.data.msg?.blockHash != null)
+              and
+              (.data.msg?.kind == "MsgBlock")
+            )
+          '
+        then
+          exit_22 "jq error: last_block_transmitted: ''${node}"
+        fi
+      }
+
+      function last_block_sent() {
+        local node=$1
+        if ! jq_node_stdout_last "''${node}" \
+          '
+            (.ns == "BlockFetch.Server.SendBlock")
+            and
+            (.data.block? != null)
+            and
+            (.data.kind? == "BlockFetchServer")
+          '
+        then
+          exit_22 "jq error: last_block_sent: ''${node}"
+        fi
+      }
== "BlockFetchServer") + ' + then + exit_22 "jq error: last_block_sent: ''${node}" + fi + } + + function last_block_received() { + local node=$1 + if ! jq_node_stdout_last "''${node}" \ + ' + (.ns == "BlockFetch.Remote.Receive.Block") + and + (.data.kind? == "Recv") + and + (.data.msg?.blockHash != null) + and + (.data.msg?.kind == "MsgBlock") + ' + then + exit_22 "jq error: last_block_received: ''${node}" + fi + } + + # { + # "at": "2023-05-17 16:00:57.0246Z", + # "ns": "BlockFetch.Remote.Receive.Block", + # "data": { + # "kind": "Recv", + # "msg": { + # "agency": "ServerAgency TokStreaming", + # "blockHash": "dde....414", + # "blockSize": 64334, + # "kind": "MsgBlock", + # "txIds": [ + # "60e....6ec", + # ... + # "95d....165" + # ] + # }, + # "peer": { + # "connectionId": "127.0.0.1:37117 127.0.0.1:30000" + # } + # }, + # "sev": "Info", + # "thread": "77", + # "host": "rapide" + # } + function last_block_with_txs() { + local node=$1 + if ! jq_node_stdout_last "''${node}" \ + ' + (.ns == "BlockFetch.Remote.Receive.Block") + and + (.data.kind? == "Recv") + and + (.data.msg?.blockHash != null) + and + (.data.msg?.kind == "MsgBlock") + and + (.data.msg?.txIds != null) + and + (.data.msg?.txIds != []) + ' + then + exit_22 "jq error: last_block_with_txs: ''${node}" + fi + } + + # { + # "at": "2023-05-17 16:00:08.3351Z", + # "ns": "Mempool.AddedTx", + # "data": { + # "kind": "TraceMempoolAddedTx", + # "mempoolSize": { + # "bytes": 1328, + # "numTxs": 1 + # }, + # "tx": { + # "txid": "3d724466" + # } + # }, + # "sev": "Info", + # "thread": "66", + # "host": "rapide" + # } + function last_mempool_txs_added() { + local node=$1 + if ! jq_node_stdout_last "''${node}" \ + ' + (.ns == "Mempool.AddedTx") + and + (.data.kind? == "TraceMempoolAddedTx") + and + (.data.mempoolSize?.numTxs >= 1) + ' + then + exit_22 "jq error: last_mempool_txs_added: ''${node}" + fi + } + + function exit_healthcheck { + # Unbuffered stderr, if not the error message may be lost! + stdbuf -o0 -e0 echo -e "$(${coreutils}/bin/date --rfc-3339=seconds): $1" >&2 + exit 1 + } + + function exit_22 { + # Unbuffered stderr, if not the error message may be lost! + stdbuf -o0 -e0 echo -e "$(${coreutils}/bin/date --rfc-3339=seconds): $1" >&2 + exit 22 + } + + if test -n "''${NOMAD_DEBUG:-}" then - cd "''${TASK_WORKDIR}" + exec 5> "$(dirname "$(readlink -f "$0")")"/start.sh.debug + BASH_XTRACEFD="5" + PS4='$LINENO: ' + set -x fi - sleep 100000 - ''; + healthcheck $@ + ''; }; }) nodeSpecs.value;