Skip to content

Commit

Permalink
Merge pull request #4 from Lowess/bugfix-stale-prometheus-data
Browse files Browse the repository at this point in the history
Fix prometheus gauges not decrementing when worker dies
  • Loading branch information
Lowess committed Oct 7, 2020
2 parents 76918ac + f55370d commit 07569a0
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ WORKDIR /app
EXPOSE 5000

ENTRYPOINT ["/entrypoint.sh"]
CMD ["karrot:create_app()", "--bind :5000", "-w 4"]
CMD ["karrot:create_app()", "--config", "karrot/wsgi.py"]
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ docker:
docker-run:
docker run -it --rm \
--name karrot \
-v ~/.aws:/root/.aws \
-p 5000:5000 \
lowess/karrot

Expand Down Expand Up @@ -37,14 +38,16 @@ docs:

run:
@echo 🚀 Run development server...
rm -rf .prom; mkdir -p .prom; \
export prometheus_multiproc_dir=.prom; \
export FLASK_APP=karrot; \
export FLASK_ENV=dev; \
flask run

gunicorn:
@echo 🚀 Run production gunicorn...
rm -rf .prom; mkdir -p .prom; \
export prometheus_multiproc_dir=.prom; \
export FLASK_APP=karrot.wsgi; \
export FLASK_ENV=prod; \
flask "karrot:create_app()" --bind 127.0.0.1:5000 -w 1
gunicorn 'karrot:create_app()' --config karrot/wsgi.py
2 changes: 1 addition & 1 deletion docker/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
rm -rf /src/.prom; mkdir -p /src/.prom
export prometheus_multiproc_dir=/src/.prom

flask $@
gunicorn $@
3 changes: 2 additions & 1 deletion karrot/burrow/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ def webhook_handler():
:param str event: A valid Burrow Json event POSTed to this endpoint
"""
logger.debug("Hit on /burrow endpoint")
data = request.get_json()
logger.debug(f"Hit on /burrow endpoint with {data}")

# TODO Deal with partial notifier failures with different return codes
event_handler(data)
return jsonify(data)
Expand Down
12 changes: 9 additions & 3 deletions karrot/reporters/cloudwatch/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
import datetime
import boto3
from botocore.exceptions import ClientError, ParamValidationError
from botocore.exceptions import ClientError, ParamValidationError, NoCredentialsError

from structlog import get_logger
from flask import current_app as app
Expand Down Expand Up @@ -38,9 +38,8 @@ def __init__(self, name):
else:
self._client = boto3.client("cloudwatch")
logger.info("Initialized boto client successfully")
except (ClientError, ParamValidationError) as e:
except (ClientError, ParamValidationError, NoCredentialsError) as e:
logger.exception("Cloudn't initialize boto client", error=str(e))
sys.exit(1)

self._metrics = []
self._namespace = app.config["KARROT_CLOUDWATCH_NAMESPACE"]
Expand Down Expand Up @@ -119,6 +118,13 @@ def _flush_lag_metrics(self, force=False):

self._last_flush_ts = datetime.datetime.now()
self._metrics = []
except NoCredentialsError:
logger.exception(
"Could not find AWS credentials. "
"Karrot will exit as it cannot report metrics to Cloudwatch"
)
sys.exit(1)

except Exception:
logger.exception("Lag could not be reported to cloudwatch")
CLOUDWATCH_API_CALLS_COUNT.labels(
Expand Down
2 changes: 1 addition & 1 deletion karrot/reporters/prometheus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"karrot_consumer_lag",
"Total lag accumulated by the consumer",
labelnames=["cluster", "consumer"],
multiprocess_mode="max",
multiprocess_mode="liveall",
)

REPORTER_EVENTS_COUNT = Counter(
Expand Down
30 changes: 25 additions & 5 deletions karrot/wsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,35 @@
"""WSGI callable."""

from karrot import create_app # noqa: F401
import gunicorn.app.wsgiapp as wsgi
from prometheus_client import multiprocess
import logging
import shutil
import os

logger = logging.getLogger(__name__)

def worker_exit(server, worker):
from prometheus_client import multiprocess

def on_reload(server):
    """Empty the Prometheus multiprocess directory.

    Intended for use as Gunicorn's ``on_reload`` server hook (this module is
    handed to gunicorn through ``--config``). Every entry directly inside the
    directory named by the ``prometheus_multiproc_dir`` environment variable
    is removed; the directory itself is kept.

    :param server: object supplied by the caller of the hook; unused here.
    :raises RuntimeError: if ``prometheus_multiproc_dir`` is not set.
    """
    prom_dir = os.getenv("prometheus_multiproc_dir", None)
    if prom_dir is None:
        raise RuntimeError("You must set `prometheus_multiproc_dir=<directory>`")

    logger.info(f"Flushing out Prometheus multiproc directory: {prom_dir}/*")

    # Snapshot the top-level entries first so nothing is deleted while the
    # directory is still being iterated.
    try:
        with os.scandir(prom_dir) as listing:
            entries = list(listing)
    except OSError as err:
        # A missing or unreadable directory means there is nothing to flush;
        # report it instead of failing the reload.
        logger.warning(f"Could not list Prometheus multiproc directory: {err}")
        return

    for entry in entries:
        # Only real directories go through rmtree: it refuses symlinks, and a
        # symlink must be unlinked rather than followed into its target.
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.unlink(entry.path)


def child_exit(server, worker):
    """Log a worker's death and mark its pid dead for Prometheus.

    Intended for use as Gunicorn's ``child_exit`` server hook (this module is
    handed to gunicorn through ``--config``).

    :param server: object supplied by the caller of the hook; unused here.
    :param worker: the exited worker; its ``pid`` attribute is passed to
        ``multiprocess.mark_process_dead``.
    """
    notice = f"{worker} died. marking process as dead in Prometheus registry"
    logger.warning(notice)

    dead_pid = worker.pid
    multiprocess.mark_process_dead(dead_pid)


preload_app = True

wsgi.run()
bind = "0.0.0.0:5000"
workers = 4
loglevel = "info"
8 changes: 4 additions & 4 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ setenv =
PYTHONIOENCODING = utf-8
PYTHONDONTWRITEBYTECODE = 1
AWS_DEFAULT_REGION = us-east-1
# AWS_ACCESS_KEY_ID='testing'
# AWS_SECRET_ACCESS_KEY='testing'
# AWS_SECURITY_TOKEN=''
# AWS_SESSION_TOKEN=''
AWS_ACCESS_KEY_ID='testing'
AWS_SECRET_ACCESS_KEY='testing'
AWS_SECURITY_TOKEN=''
AWS_SESSION_TOKEN=''

deps = -r{toxinidir}/tests/requirements.txt

Expand Down

0 comments on commit 07569a0

Please sign in to comment.