Skip to content

Commit

Permalink
fix for recovery from failed nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
jreadey committed Apr 6, 2020
1 parent b6d6787 commit 91901e7
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions hsds/headnode.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ async def healthCheck(app):
await asyncio.sleep(sleep_secs)

now = int(time.time())
log.info("health check {}, cluster_state: {}".format(unixTimeToUTC(now), app["cluster_state"]))
log.info("health check {}, cluster_state: {}, node_count: {}".format(unixTimeToUTC(now), app["cluster_state"], len(nodes)))

fail_count = 0
HEALTH_CHECK_RETRY_COUNT = 1 # times to try before calling a node dead
Expand Down Expand Up @@ -107,6 +107,7 @@ async def healthCheck(app):
node["failcount"] += 1
if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
log.warn("removing {}:{} from active list".format(node["host"], node["port"]))
node["host"] = None # make slot available for new registrations
fail_count += 1

log.info("node health check fail_count: {}".format(fail_count))
Expand Down Expand Up @@ -202,7 +203,7 @@ async def register(request):
log.info("inactive_node_count: {}".format(inactive_node_count))
if inactive_node_count == 0:
# all the nodes have checked in
log.info(f"setting cluster state to ready - was: {app['cluster_state']}")
log.info(f"setting cluster state to READY - was: {app['cluster_state']}")
app['cluster_state'] = "READY"

resp = StreamResponse()
Expand Down

0 comments on commit 91901e7

Please sign in to comment.