From f55370de16dcbc97bd8a7407fe39cda89be9b4fc Mon Sep 17 00:00:00 2001 From: Florian Dambrine Date: Wed, 7 Oct 2020 04:53:55 -0700 Subject: [PATCH] Fix prometheus gauges not decrementing when worker dies --- Dockerfile | 2 +- Makefile | 5 ++++- docker/entrypoint.sh | 2 +- karrot/burrow/controllers.py | 3 ++- karrot/reporters/cloudwatch/models.py | 12 ++++++++--- karrot/reporters/prometheus/models.py | 2 +- karrot/wsgi.py | 30 ++++++++++++++++++++++----- tox.ini | 8 +++---- 8 files changed, 47 insertions(+), 17 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7e1b99a..f24aba2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,4 +36,4 @@ WORKDIR /app EXPOSE 5000 ENTRYPOINT ["/entrypoint.sh"] -CMD ["karrot:create_app()", "--bind :5000", "-w 4"] +CMD ["karrot:create_app()", "--config", "karrot/wsgi.py"] diff --git a/Makefile b/Makefile index 8088faa..f91ada1 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,7 @@ docker: docker-run: docker run -it --rm \ --name karrot \ + -v ~/.aws:/root/.aws \ -p 5000:5000 \ lowess/karrot @@ -37,6 +38,7 @@ docs: run: @echo 🚀 Run development server... + rm -rf .prom; mkdir -p .prom; \ export prometheus_multiproc_dir=.prom; \ export FLASK_APP=karrot; \ export FLASK_ENV=dev; \ @@ -44,7 +46,8 @@ run: gunicorn: @echo 🚀 Run production gunicorn... + rm -rf .prom; mkdir -p .prom; \ export prometheus_multiproc_dir=.prom; \ export FLASK_APP=karrot.wsgi; \ export FLASK_ENV=prod; \ - flask "karrot:create_app()" --bind 127.0.0.1:5000 -w 1 + gunicorn 'karrot:create_app()' --config karrot/wsgi.py diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 22faf7f..924cf40 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -3,4 +3,4 @@ rm -rf /src/.prom; mkdir -p /src/.prom export prometheus_multiproc_dir=/src/.prom -flask $@ +gunicorn $@ diff --git a/karrot/burrow/controllers.py b/karrot/burrow/controllers.py index 817eb28..1bf2e52 100644 --- a/karrot/burrow/controllers.py +++ b/karrot/burrow/controllers.py @@ -28,8 +28,9 @@ def webhook_handler(): :param str event: A valid Burrow Json event POSTed to this endpoint """ - logger.debug("Hit on /burrow endpoint") data = request.get_json() + logger.debug(f"Hit on /burrow endpoint with {data}") + # TODO Deal with partial notifier failures with different return codes event_handler(data) return jsonify(data) diff --git a/karrot/reporters/cloudwatch/models.py b/karrot/reporters/cloudwatch/models.py index 5053754..ae728e0 100644 --- a/karrot/reporters/cloudwatch/models.py +++ b/karrot/reporters/cloudwatch/models.py @@ -4,7 +4,7 @@ import sys import datetime import boto3 -from botocore.exceptions import ClientError, ParamValidationError +from botocore.exceptions import ClientError, ParamValidationError, NoCredentialsError from structlog import get_logger from flask import current_app as app @@ -38,9 +38,8 @@ def __init__(self, name): else: self._client = boto3.client("cloudwatch") logger.info("Initialized boto client successfully") - except (ClientError, ParamValidationError) as e: + except (ClientError, ParamValidationError, NoCredentialsError) as e: logger.exception("Cloudn't initialize boto client", error=str(e)) - sys.exit(1) self._metrics = [] self._namespace = app.config["KARROT_CLOUDWATCH_NAMESPACE"] @@ -119,6 +118,13 @@ def _flush_lag_metrics(self, force=False): self._last_flush_ts = datetime.datetime.now() self._metrics = [] + except NoCredentialsError: + logger.exception( + "Could not find AWS credentials. " + "Karrot will exit as it cannot report metrics to Cloudwatch" + ) + sys.exit(1) + except Exception: logger.exception("Lag could not be reported to cloudwatch") CLOUDWATCH_API_CALLS_COUNT.labels( diff --git a/karrot/reporters/prometheus/models.py b/karrot/reporters/prometheus/models.py index 083483d..b7af0a0 100644 --- a/karrot/reporters/prometheus/models.py +++ b/karrot/reporters/prometheus/models.py @@ -11,7 +11,7 @@ "karrot_consumer_lag", "Total lag accumulated by the consumer", labelnames=["cluster", "consumer"], - multiprocess_mode="max", + multiprocess_mode="liveall", ) REPORTER_EVENTS_COUNT = Counter( diff --git a/karrot/wsgi.py b/karrot/wsgi.py index 1a03d0a..10a22c3 100644 --- a/karrot/wsgi.py +++ b/karrot/wsgi.py @@ -4,15 +4,35 @@ """WSGI callable.""" from karrot import create_app # noqa: F401 -import gunicorn.app.wsgiapp as wsgi +from prometheus_client import multiprocess +import logging +import shutil +import os +logger = logging.getLogger(__name__) -def worker_exit(server, worker): - from prometheus_client import multiprocess +def on_reload(server): + prom_dir = os.getenv("prometheus_multiproc_dir", None) + if prom_dir is not None: + logger.info(f"Flushing out Prometheus multiproc directory: {prom_dir}/*") + + for root, dirs, files in os.walk(prom_dir): + for f in files: + os.unlink(os.path.join(root, f)) + for d in dirs: + shutil.rmtree(os.path.join(root, d)) + + else: + raise RuntimeError("You must set `prometheus_multiproc_dir=`") + + +def child_exit(server, worker): + logger.warning(f"{worker} died. marking process as dead in Prometheus registry") multiprocess.mark_process_dead(worker.pid) preload_app = True - -wsgi.run() +bind = "0.0.0.0:5000" +workers = 4 +loglevel = "info" diff --git a/tox.ini b/tox.ini index 4873201..5e8e009 100644 --- a/tox.ini +++ b/tox.ini @@ -16,10 +16,10 @@ setenv = PYTHONIOENCODING = utf-8 PYTHONDONTWRITEBYTECODE = 1 AWS_DEFAULT_REGION = us-east-1 - # AWS_ACCESS_KEY_ID='testing' - # AWS_SECRET_ACCESS_KEY='testing' - # AWS_SECURITY_TOKEN='' - # AWS_SESSION_TOKEN='' + AWS_ACCESS_KEY_ID='testing' + AWS_SECRET_ACCESS_KEY='testing' + AWS_SECURITY_TOKEN='' + AWS_SESSION_TOKEN='' deps = -r{toxinidir}/tests/requirements.txt