From 34d400f7b47eaa6989879252d67c78768f0154e0 Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Wed, 15 Feb 2023 22:31:07 +0000 Subject: [PATCH] Observability tools (#1563) - add [Prometheus](https://github.com/prometheus/prometheus) & [Grafana](https://github.com/grafana/grafana) for custom metrics and visualization (/metrics endpoints and anything else we might want to add). - add [netdata](https://github.com/netdata/netdata) for infrastructure monitoring and alerts (redis, postgres, containers, and prometheus metrics too, etc.) - configure netdata to collect postgres, redis, and container metrics. - configure Prometheus to scrape itself, backend, and inference-server. - optional env var of `NETDATA_CLAIM_TOKEN` to claim to [netdata cloud](https://www.netdata.cloud/) - makes it easier to work with infra and alerts to discord etc. I work there so am pretty sure I can get us a free sponsored space that might be useful. Not trying to sell here or anything, just that it's a potentially useful overlap given I work there :) . - add initial sort of dummy fastapi custom dashboard in `docker/grafana/dashboards`. Idea is we can save dashboards as code in there (**NOTE**: needs much more work - anyone can add/improve dashboards as follow-on PRs, my promql skills are not great). - add observability tools to `observability` docker compose profile (**NOTE**: not sure what the best approach is here, would need some input from others more familiar with the docker setup). - add Grafana on port 2000 instead of 3000 since the app itself is on 3000. - add some README.md under each `/docker` folder. 
--- docker-compose.yaml | 68 +++ docker/grafana/README.md | 14 + docker/grafana/dashboards/dashboard.yaml | 12 + .../grafana/dashboards/fastapi-backend.json | 516 ++++++++++++++++++ docker/grafana/datasources/datasource.yml | 9 + docker/netdata/README.md | 13 + docker/netdata/go.d/postgres.conf | 5 + docker/netdata/go.d/prometheus.conf | 6 + docker/netdata/go.d/redis.conf | 3 + docker/prometheus/README.md | 9 + docker/prometheus/prometheus.yml | 39 ++ 11 files changed, 694 insertions(+) create mode 100644 docker/grafana/README.md create mode 100644 docker/grafana/dashboards/dashboard.yaml create mode 100644 docker/grafana/dashboards/fastapi-backend.json create mode 100644 docker/grafana/datasources/datasource.yml create mode 100644 docker/netdata/README.md create mode 100644 docker/netdata/go.d/postgres.conf create mode 100644 docker/netdata/go.d/prometheus.conf create mode 100644 docker/netdata/go.d/redis.conf create mode 100644 docker/prometheus/README.md create mode 100644 docker/prometheus/prometheus.yml diff --git a/docker-compose.yaml b/docker-compose.yaml index 82a4ff8d8c..91725ce5f2 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -199,3 +199,71 @@ services: deploy: replicas: 1 profiles: ["inference"] + + prometheus: + image: prom/prometheus + container_name: prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + ports: + - 9090:9090 + restart: unless-stopped + volumes: + - ${PWD}/docker/prometheus:/etc/prometheus + - prom_data:/prometheus + profiles: ["observability"] + + grafana: + image: grafana/grafana + container_name: grafana + ports: + - 2000:2000 + restart: unless-stopped + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-grafana} + - GF_SERVER_HTTP_PORT=2000 + volumes: + - ${PWD}/docker/grafana/datasources:/etc/grafana/provisioning/datasources + - ${PWD}/docker/grafana/dashboards/dashboard.yaml:/etc/grafana/provisioning/dashboards/main.yaml + - 
${PWD}/docker/grafana/dashboards:/var/lib/grafana/dashboards + profiles: ["observability"] + + netdata: + image: netdata/netdata + container_name: netdata + pid: host + hostname: oasst-netdata + ports: + - 19999:19999 + restart: unless-stopped + cap_add: + - SYS_PTRACE + - SYS_ADMIN + security_opt: + - apparmor:unconfined + volumes: + - netdataconfig:/etc/netdata + - netdatalib:/var/lib/netdata + - netdatacache:/var/cache/netdata + - /etc/passwd:/host/etc/passwd:ro + - /etc/group:/host/etc/group:ro + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /etc/os-release:/host/etc/os-release:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - ${PWD}/docker/netdata/go.d/redis.conf:/etc/netdata/go.d/redis.conf + - ${PWD}/docker/netdata/go.d/postgres.conf:/etc/netdata/go.d/postgres.conf + - ${PWD}/docker/netdata/go.d/prometheus.conf:/etc/netdata/go.d/prometheus.conf + environment: + # useful if you want to claim monitoring agents into https://www.netdata.cloud/ + # else ignore or leave blank to just use local netdata dashboards at localhost:19999 + - NETDATA_CLAIM_TOKEN=${NETDATA_CLAIM_TOKEN:-} + - NETDATA_CLAIM_URL=https://app.netdata.cloud + profiles: ["observability"] + +volumes: + prom_data: + netdataconfig: + netdatalib: + netdatacache: diff --git a/docker/grafana/README.md b/docker/grafana/README.md new file mode 100644 index 0000000000..ce791dc94d --- /dev/null +++ b/docker/grafana/README.md @@ -0,0 +1,14 @@ +# Grafana + +[Grafana](https://github.com/grafana/grafana) is used to visualize custom +observability metrics and much more. + +This folder contains various configuration files for Grafana. + +- [`./dashboards/dashboard.yaml`](./dashboards/dashboard.yaml) - Used to tell + Grafana where some pre-configured dashboards live. +- [`./dashboards/fastapi-backend.json`](./dashboards/fastapi-backend.json) - A + json representation of a saved Grafana dashboard focusing on some high level + api endpoint metrics etc. 
+- [`./datasources/datasource.yml`](./datasources/datasource.yml) - A config file + to set up Grafana to read from the local Prometheus source. diff --git a/docker/grafana/dashboards/dashboard.yaml b/docker/grafana/dashboards/dashboard.yaml new file mode 100644 index 0000000000..fd66a47911 --- /dev/null +++ b/docker/grafana/dashboards/dashboard.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: "Dashboard provider" + orgId: 1 + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true diff --git a/docker/grafana/dashboards/fastapi-backend.json b/docker/grafana/dashboards/fastapi-backend.json new file mode 100644 index 0000000000..756402b0a6 --- /dev/null +++ b/docker/grafana/dashboards/fastapi-backend.json @@ -0,0 +1,516 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + 
}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum by (status) (rate(http_requests_total[1m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Requests Per Minute By Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": 
[ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "http_request_duration_seconds_sum{job=\"backend\",handler!=\"none\"} / http_request_duration_seconds_count", + "legendFormat": "{{handler}}", + "range": true, + "refId": "A" + } + ], + "title": "Average Response Time By Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "increase(http_requests_total{job=\"backend\", status=\"2xx\"}[5m])", + "instant": true, + "key": "Q-638a78b9-fc11-4f92-973c-60c8f0bc7ed2-0", + "legendFormat": "{{handler}}", + "range": true, + "refId": "A" + } + ], + "title": "Successful Requests By Endpoint [5m]", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", 
+ "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "increase(http_requests_total{job=\"backend\", status!=\"2xx\"}[5m])", + "instant": true, + "key": "Q-638a78b9-fc11-4f92-973c-60c8f0bc7ed2-0", + "legendFormat": "{{handler}}", + "range": true, + "refId": "A" + } + ], + "title": "Unsuccessful Requests By Endpoint [5m]", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "100 * (\r\n sum(increase(http_requests_total{status=\"2xx\", job=\"backend\"}[5m])) by (handler)\r\n /\r\n sum(increase(http_requests_total{job=\"backend\"}[5m])) by (handler)\r\n)", + "instant": true, + "key": "Q-638a78b9-fc11-4f92-973c-60c8f0bc7ed2-0", + "legendFormat": "{{handler}}", + "range": true, + "refId": "A" + } + ], + "title": "Success Rate By Endpoint [5m]", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "FastAPI Backend", + "uid": "H6r1GtJ4z", + "version": 15, + "weekStart": "" +} diff --git a/docker/grafana/datasources/datasource.yml b/docker/grafana/datasources/datasource.yml new file mode 100644 index 0000000000..44999d4695 --- /dev/null +++ b/docker/grafana/datasources/datasource.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + url: http://prometheus:9090 + isDefault: true + access: proxy + editable: true diff --git a/docker/netdata/README.md b/docker/netdata/README.md new file mode 100644 index 
0000000000..2dfe4748e8 --- /dev/null +++ b/docker/netdata/README.md @@ -0,0 +1,13 @@ +# Netdata + +[Netdata](https://github.com/netdata/netdata) is an open source monitoring tool. + +This folder contains some configuration files used to set up various netdata +collectors we want to use like Redis, Postgres, etc. + +- [`./go.d/postgres.conf`](./go.d/postgres.conf) - Config for Netdata + [Postgres Collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/postgres). +- [`./go.d/prometheus.conf`](./go.d/prometheus.conf) - Config for Netdata + [Prometheus Collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/prometheus). +- [`./go.d/redis.conf`](./go.d/redis.conf) - Config for Netdata + [Redis Collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/redis). diff --git a/docker/netdata/go.d/postgres.conf b/docker/netdata/go.d/postgres.conf new file mode 100644 index 0000000000..73788afec0 --- /dev/null +++ b/docker/netdata/go.d/postgres.conf @@ -0,0 +1,5 @@ +jobs: + - name: db + dsn: 'postgres://postgres:postgres@db:5432/postgres' + - name: webdb + dsn: 'postgres://postgres:postgres@webdb:5432/oasst_web' diff --git a/docker/netdata/go.d/prometheus.conf b/docker/netdata/go.d/prometheus.conf new file mode 100644 index 0000000000..21f259627d --- /dev/null +++ b/docker/netdata/go.d/prometheus.conf @@ -0,0 +1,6 @@ +jobs: + - name: backend + url: http://backend:8080/metrics + + - name: inference-server + url: http://inference-server:8080/metrics diff --git a/docker/netdata/go.d/redis.conf b/docker/netdata/go.d/redis.conf new file mode 100644 index 0000000000..d6772dbcfc --- /dev/null +++ b/docker/netdata/go.d/redis.conf @@ -0,0 +1,3 @@ +jobs: + - name: redis + address: 'redis://@redis:6379' diff --git a/docker/prometheus/README.md b/docker/prometheus/README.md new file mode 100644 index 0000000000..5c758c2c58 --- /dev/null +++ b/docker/prometheus/README.md @@ -0,0 +1,9 @@ +# Prometheus + 
+[Prometheus](https://github.com/prometheus/prometheus) is an open source +monitoring system. + +This folder contains some configuration files used to set up Prometheus. + +- [`./prometheus.yml`](./prometheus.yml) - Config for Prometheus, including what + `/metrics` endpoints to scrape. diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml new file mode 100644 index 0000000000..878e5fe0dc --- /dev/null +++ b/docker/prometheus/prometheus.yml @@ -0,0 +1,39 @@ +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s +alerting: + alertmanagers: + - static_configs: + - targets: [] + scheme: http + timeout: 10s + api_version: v2 +scrape_configs: + - job_name: prometheus + honor_timestamps: true + scrape_interval: 15s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - localhost:9090 + - job_name: backend + honor_timestamps: true + scrape_interval: 15s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - backend:8080 + - job_name: inference-server + honor_timestamps: true + scrape_interval: 15s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - inference-server:8080