diff --git a/dev/grafana/dashboards/kiloclaw-controller-telemetry.json b/dev/grafana/dashboards/kiloclaw-controller-telemetry.json index 180e088684..6db11ec63d 100644 --- a/dev/grafana/dashboards/kiloclaw-controller-telemetry.json +++ b/dev/grafana/dashboards/kiloclaw-controller-telemetry.json @@ -56,6 +56,7 @@ }, "overrides": [] }, + "description": "Unique sandbox IDs (blob1) seen in the selected window. AE preserves true unique values across sampling, so this is exact — not extrapolated.", "gridPos": { "h": 4, "w": 6, @@ -88,7 +89,7 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "table", "interval": "", "intervalFactor": 1, @@ -130,6 +131,7 @@ }, "overrides": [] }, + "description": "Unique Fly machine IDs (blob8) seen in the selected window. Excludes empty machine IDs (early-boot rows).", "gridPos": { "h": 4, "w": 6, @@ -162,7 +164,7 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "table", "interval": "", "intervalFactor": 1, @@ -204,6 +206,7 @@ }, "overrides": [] }, + "description": "Linux 5-min load average across selected check-ins. 
Compare against host CPU count; values approaching the CPU count indicate saturation.", "gridPos": { "h": 4, "w": 6, @@ -236,7 +239,7 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "table", "interval": "", "intervalFactor": 1, @@ -279,6 +282,7 @@ }, "overrides": [] }, + "description": "Average machine uptime (seconds) across selected check-ins.", "gridPos": { "h": 4, "w": 6, @@ -311,7 +315,7 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "table", "interval": "", "intervalFactor": 1, @@ -406,6 +410,7 @@ "x": 0, "y": 6 }, + "description": "Check-in volume (sample-corrected) split by supervisor state (blob6). Useful to see if a state is gaining or losing membership.", "id": 25, "options": { "legend": { @@ -512,6 +517,7 @@ "x": 12, "y": 6 }, + "description": "Mean 5-min Linux load average per supervisor state. Compare against the typical CPU count of the host class.", "id": 26, "options": { "legend": { @@ -536,7 +542,7 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "time_series", "interval": "", "intervalFactor": 1, @@ -619,6 +625,7 @@ "x": 0, "y": 14 }, + "description": "Mean machine uptime per supervisor state. A drop means many machines were recently (re)started.", "id": 27, "options": { "legend": { @@ -643,7 +650,7 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "time_series", "interval": "", "intervalFactor": 1, @@ -725,6 +732,7 @@ "x": 12, "y": 14 }, + "description": "Check-in volume by Fly region (blob7). 
Highlights regional traffic skew or outages.", "id": 28, "options": { "legend": { @@ -801,6 +809,7 @@ }, "overrides": [] }, + "description": "Distribution of supervisor states across check-ins, sample-corrected.", "gridPos": { "h": 8, "w": 8, @@ -864,8 +873,18 @@ ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { "id": "byName", "options": "total_bandwidth_in_bytes" }, + "properties": [{ "id": "unit", "value": "bytes" }] + }, + { + "matcher": { "id": "byName", "options": "total_bandwidth_out_bytes" }, + "properties": [{ "id": "unit", "value": "bytes" }] + } + ] }, + "description": "Top hosts by total bandwidth (sample-corrected via _sample_interval). Bandwidth columns render in IEC bytes (KiB/MiB/GiB).", "gridPos": { "h": 8, "w": 8, @@ -931,6 +950,7 @@ }, "overrides": [] }, + "description": "100 most recent raw check-ins with full dimensional context. Useful for ad-hoc debugging of a specific sandbox.", "gridPos": { "h": 8, "w": 8, @@ -975,6 +995,354 @@ ], "title": "Latest Check-ins", "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 203, + "panels": [], + "title": "Disk Usage", + "type": "row" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "description": "Average disk fill (double7 / double8) per Fly region (blob7) across selected hosts. A climb toward 100% indicates hosts at risk of running out of space. 
Extrapolation is disabled because this is an AVG of a ratio — extrapolating the last partial bucket would distort the value rather than estimate a missing tail.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 12, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "yellow", "value": 0.8 }, + { "color": "red", "value": 0.9 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 32, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": false, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, blob7 AS label, AVG(double7 / double8) AS disk_fill FROM kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, 
$sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND double8 > 0 GROUP BY t, label ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, blob7 AS label, AVG(double7 / double8) AS disk_fill FROM kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND double8 > 0 GROUP BY t, label ORDER BY t", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "kiloclaw_controller_telemetry", + "useWindowFuncForMacros": true + } + ], + "title": "Disk Fill % by Region", + "type": "timeseries" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "description": "Hosts ranked by peak disk fill percentage in the selected window. 
Investigate any row > 90%.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": 0 }] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "peak_disk_fill" }, + "properties": [ + { "id": "unit", "value": "percentunit" }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "yellow", "value": 0.8 }, + { "color": "red", "value": 0.9 } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { "type": "color-background", "mode": "gradient" } + } + ] + }, + { + "matcher": { "id": "byName", "options": "peak_disk_used_bytes" }, + "properties": [{ "id": "unit", "value": "bytes" }] + }, + { + "matcher": { "id": "byName", "options": "disk_total_bytes" }, + "properties": [{ "id": "unit", "value": "bytes" }] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 33, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": false, + "format": "table", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT blob8 AS host, blob1 AS sandbox_id, MAX(double7) AS peak_disk_used_bytes, MAX(double8) AS disk_total_bytes, MAX(double7) / MAX(double8) AS peak_disk_fill, MAX(timestamp) AS last_checkin FROM kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND blob8 != 
'' AND double8 > 0 GROUP BY host, sandbox_id ORDER BY peak_disk_fill DESC LIMIT 20", + "rawSql": "SELECT blob8 AS host, blob1 AS sandbox_id, MAX(double7) AS peak_disk_used_bytes, MAX(double8) AS disk_total_bytes, MAX(double7) / MAX(double8) AS peak_disk_fill, MAX(timestamp) AS last_checkin FROM kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND blob8 != '' AND double8 > 0 GROUP BY host, sandbox_id ORDER BY peak_disk_fill DESC LIMIT 20", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "kiloclaw_controller_telemetry", + "useWindowFuncForMacros": true + } + ], + "title": "Top Hosts by Disk Fill", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 204, + "panels": [], + "title": "Reliability", + "type": "row" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "description": "Top reasons reported in blob9 (last_exit_reason). 
Spikes in 'oom_killed' / 'signal_killed' / non-zero exit codes are the first thing to look at after a regression.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": 0 }] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 34, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": false, + "format": "table", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT blob9 AS last_exit_reason, SUM(_sample_interval) AS count, COUNT(DISTINCT blob1) AS distinct_sandboxes FROM kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND blob9 != '' GROUP BY last_exit_reason ORDER BY count DESC LIMIT 20", + "rawSql": "SELECT blob9 AS last_exit_reason, SUM(_sample_interval) AS count, COUNT(DISTINCT blob1) AS distinct_sandboxes FROM kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND blob9 != '' GROUP BY last_exit_reason ORDER BY count DESC LIMIT 20", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, 
+ "table": "kiloclaw_controller_telemetry", + "useWindowFuncForMacros": true + } + ], + "title": "Top Exit Reasons", + "type": "table" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "description": "Restart delta (MAX(double2) - MIN(double2)) per host over the selected window — how many restarts occurred during the window, inferred from the change in the cumulative counter. Use to identify flapping machines.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": 0 }] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 35, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": false, + "format": "table", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT blob8 AS host, blob1 AS sandbox_id, MAX(double2) AS max_total_restarts, MAX(double2) - MIN(double2) AS restart_delta, MAX(timestamp) AS last_checkin FROM kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND blob8 != '' GROUP BY host, sandbox_id HAVING restart_delta > 0 ORDER BY restart_delta DESC LIMIT 20", + "rawSql": "SELECT blob8 AS host, blob1 AS sandbox_id, MAX(double2) AS max_total_restarts, MAX(double2) - MIN(double2) AS restart_delta, MAX(timestamp) AS last_checkin FROM 
kiloclaw_controller_telemetry WHERE $timeFilter\n $conditionalTest(AND blob1 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob6 IN ($supervisorState),, $supervisorState)\n $conditionalTest(AND blob7 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob2 IN ($controllerVersion),, $controllerVersion) AND blob8 != '' GROUP BY host, sandbox_id HAVING restart_delta > 0 ORDER BY restart_delta DESC LIMIT 20", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "kiloclaw_controller_telemetry", + "useWindowFuncForMacros": true + } + ], + "title": "Top Hosts by Restart Delta", + "type": "table" } ], "preload": false, @@ -1025,13 +1393,13 @@ "type": "vertamedia-clickhouse-datasource", "uid": "${DS_KILOCLAW}" }, - "definition": "SELECT DISTINCT blob6 FROM kiloclaw_controller_telemetry WHERE blob6 != '' ORDER BY blob6", + "definition": "SELECT blob6 FROM kiloclaw_controller_telemetry WHERE blob6 != '' GROUP BY blob6 ORDER BY blob6", "includeAll": true, "label": "Supervisor State", "multi": true, "name": "supervisorState", "options": [], - "query": "SELECT DISTINCT blob6 FROM kiloclaw_controller_telemetry WHERE blob6 != '' ORDER BY blob6", + "query": "SELECT blob6 FROM kiloclaw_controller_telemetry WHERE blob6 != '' GROUP BY blob6 ORDER BY blob6", "refresh": 2, "sort": 1, "type": "query" @@ -1047,13 +1415,13 @@ "type": "vertamedia-clickhouse-datasource", "uid": "${DS_KILOCLAW}" }, - "definition": "SELECT DISTINCT blob7 FROM kiloclaw_controller_telemetry WHERE blob7 != '' ORDER BY blob7", + "definition": "SELECT blob7 FROM kiloclaw_controller_telemetry WHERE blob7 != '' GROUP BY blob7 ORDER BY blob7", "includeAll": true, "label": "Fly Region", "multi": true, "name": "flyRegion", "options": [], - "query": "SELECT DISTINCT blob7 FROM kiloclaw_controller_telemetry WHERE blob7 != '' ORDER BY blob7", + "query": "SELECT blob7 FROM kiloclaw_controller_telemetry WHERE blob7 != '' GROUP BY blob7 ORDER 
BY blob7", "refresh": 2, "sort": 1, "type": "query" @@ -1069,13 +1437,13 @@ "type": "vertamedia-clickhouse-datasource", "uid": "${DS_KILOCLAW}" }, - "definition": "SELECT DISTINCT blob2 FROM kiloclaw_controller_telemetry WHERE blob2 != '' ORDER BY blob2", + "definition": "SELECT blob2 FROM kiloclaw_controller_telemetry WHERE blob2 != '' GROUP BY blob2 ORDER BY blob2", "includeAll": true, "label": "Controller Version", "multi": true, "name": "controllerVersion", "options": [], - "query": "SELECT DISTINCT blob2 FROM kiloclaw_controller_telemetry WHERE blob2 != '' ORDER BY blob2", + "query": "SELECT blob2 FROM kiloclaw_controller_telemetry WHERE blob2 != '' GROUP BY blob2 ORDER BY blob2", "refresh": 2, "sort": 1, "type": "query" diff --git a/dev/grafana/dashboards/kiloclaw-events.json b/dev/grafana/dashboards/kiloclaw-events.json index e9a1144404..c2df7f2140 100644 --- a/dev/grafana/dashboards/kiloclaw-events.json +++ b/dev/grafana/dashboards/kiloclaw-events.json @@ -62,6 +62,7 @@ "x": 0, "y": 1 }, + "description": "Sample-corrected total event count across all event names.", "id": 1, "options": { "colorMode": "value", @@ -93,8 +94,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT SUM(_sample_interval) AS total_events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)", - "rawSql": "SELECT SUM(_sample_interval) AS total_events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, 
$event)", + "query": "SELECT SUM(_sample_interval) AS total_events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId)", + "rawSql": "SELECT SUM(_sample_interval) AS total_events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId)", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -136,6 +137,7 @@ "x": 6, "y": 1 }, + "description": "Distinct sandbox IDs (blob8) seen in the window. 
Excludes empty sandbox IDs (early HTTP events without context).", "id": 2, "options": { "colorMode": "value", @@ -162,13 +164,13 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "table", "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT COUNT(DISTINCT blob8) AS sandboxes FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob8 != ''", - "rawSql": "SELECT COUNT(DISTINCT blob8) AS sandboxes FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob8 != ''", + "query": "SELECT COUNT(DISTINCT blob8) AS sandboxes FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob8 != ''", + "rawSql": "SELECT COUNT(DISTINCT blob8) AS sandboxes FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n 
$conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob8 != ''", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -219,6 +221,7 @@ "x": 12, "y": 1 }, + "description": "Fraction of events with a non-empty error string (blob5). Sample-corrected via _sample_interval. Yellow at 1%, red at 5%.", "id": 3, "options": { "colorMode": "value", @@ -245,13 +248,13 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "table", "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT SUM(IF(blob5 != '', _sample_interval, 0)) / NULLIF(SUM(_sample_interval), 0) AS error_rate FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)", - "rawSql": "SELECT SUM(IF(blob5 != '', _sample_interval, 0)) / NULLIF(SUM(_sample_interval), 0) AS error_rate FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)", + "query": "SELECT IF(SUM(_sample_interval) > 0, SUM(IF(blob5 != '', 
_sample_interval, 0)) / SUM(_sample_interval), 0) AS error_rate FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId)", + "rawSql": "SELECT IF(SUM(_sample_interval) > 0, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval), 0) AS error_rate FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId)", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -294,6 +297,7 @@ "x": 18, "y": 1 }, + "description": "Sample-weighted P95 of double1 (operation duration in ms) across events that report a non-zero duration.", "id": 4, "options": { "colorMode": "value", @@ -320,13 +324,13 @@ "dateTimeColDataType": "timestamp", "dateTimeType": "DATETIME", "editorMode": "sql", - "extrapolate": true, + "extrapolate": false, "format": "table", "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT SUM(IF(double1 > 0, _sample_interval * double1, 0)) / NULLIF(SUM(IF(double1 > 0, 
_sample_interval, 0)), 0) AS avg_duration_ms FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)", - "rawSql": "SELECT SUM(IF(double1 > 0, _sample_interval * double1, 0)) / NULLIF(SUM(IF(double1 > 0, _sample_interval, 0)), 0) AS avg_duration_ms FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)", + "query": "SELECT quantileWeighted(0.95)(double1, _sample_interval) AS p95_duration_ms FROM kiloclaw_events WHERE $timeFilter AND double1 > 0\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId)", + "rawSql": "SELECT quantileWeighted(0.95)(double1, _sample_interval) AS p95_duration_ms FROM kiloclaw_events WHERE $timeFilter AND double1 > 0\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN 
($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId)", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -336,7 +340,7 @@ "useWindowFuncForMacros": true } ], - "title": "Avg Duration", + "title": "P95 Duration", "type": "stat" }, { @@ -416,6 +420,7 @@ "x": 0, "y": 6 }, + "description": "Per-second event rate (sum of _sample_interval / bucket size) by event name.", "id": 5, "options": { "legend": { @@ -445,8 +450,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) / $interval_s AS rps FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) GROUP BY t, label ORDER BY t", - "rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) / $interval_s AS rps FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) GROUP BY t, label ORDER BY t", + "query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) / $interval_s AS rps FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 
IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY t, label ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) / $interval_s AS rps FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY t, label ORDER BY t", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -522,6 +527,7 @@ "x": 12, "y": 6 }, + "description": "Event volume split by delivery layer (blob3): http, do, reconcile, queue.", "id": 6, "options": { "legend": { @@ -551,8 +557,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT $timeSeries AS t, blob3 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) GROUP BY t, label ORDER BY t", - "rawSql": "SELECT $timeSeries AS t, blob3 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n 
$conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) GROUP BY t, label ORDER BY t", + "query": "SELECT $timeSeries AS t, blob3 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY t, label ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, blob3 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY t, label ORDER BY t", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -641,6 +647,7 @@ "x": 0, "y": 15 }, + "description": "Reconcile actions emitted by the periodic reconciler (events matching reconcile.*). 
Spikes typically indicate a corrective sweep across many sandboxes.", "id": 7, "options": { "legend": { @@ -670,8 +677,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob1 LIKE 'reconcile.%' GROUP BY t, label ORDER BY t", - "rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob1 LIKE 'reconcile.%' GROUP BY t, label ORDER BY t", + "query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob1 LIKE 'reconcile.%' GROUP BY t, label ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = 
${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob1 LIKE 'reconcile.%' GROUP BY t, label ORDER BY t", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -747,6 +754,7 @@ "x": 12, "y": 15 }, + "description": "Snapshot restore lifecycle (instance.restore_*). Watch for restore_failed / restore_retry_scheduled growth.", "id": 8, "options": { "legend": { @@ -776,8 +784,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob1 LIKE 'instance.restore_%' GROUP BY t, label ORDER BY t", - "rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob1 LIKE 'instance.restore_%' GROUP BY t, label ORDER BY t", + "query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n 
$conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob1 LIKE 'instance.restore_%' GROUP BY t, label ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS events FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob1 LIKE 'instance.restore_%' GROUP BY t, label ORDER BY t", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -816,6 +824,7 @@ "x": 0, "y": 23 }, + "description": "Top 20 event names by sample-corrected count.", "id": 9, "options": { "cellHeight": "sm", @@ -841,8 +850,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) GROUP BY event ORDER BY count DESC 
LIMIT 20", - "rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) GROUP BY event ORDER BY count DESC LIMIT 20", + "query": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY event ORDER BY count DESC LIMIT 20", + "rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY event ORDER BY count DESC LIMIT 20", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -881,6 +890,7 @@ "x": 8, "y": 23 }, + "description": 
"Top 20 event names emitted with a non-empty error string.", "id": 10, "options": { "cellHeight": "sm", @@ -906,8 +916,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT blob1 AS event, SUM(_sample_interval) AS errors FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob5 != '' GROUP BY event ORDER BY errors DESC LIMIT 20", - "rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS errors FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob5 != '' GROUP BY event ORDER BY errors DESC LIMIT 20", + "query": "SELECT blob1 AS event, SUM(_sample_interval) AS errors FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob5 != '' GROUP BY event ORDER BY errors DESC LIMIT 20", + "rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS errors FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = 
${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob5 != '' GROUP BY event ORDER BY errors DESC LIMIT 20", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -946,6 +956,7 @@ "x": 16, "y": 23 }, + "description": "Recent region.capacity_eviction events. Spikes indicate Fly capacity pressure forcing evictions.", "id": 11, "options": { "cellHeight": "sm", @@ -971,8 +982,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT timestamp, blob12 AS fly_region, blob13 AS action FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob1 = 'region.capacity_eviction' ORDER BY timestamp DESC LIMIT 50", - "rawSql": "SELECT timestamp, blob12 AS fly_region, blob13 AS action FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob1 = 'region.capacity_eviction' ORDER BY timestamp DESC LIMIT 50", + "query": "SELECT timestamp, blob12 AS fly_region, blob13 AS action FROM kiloclaw_events WHERE $timeFilter\n 
$conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob1 = 'region.capacity_eviction' ORDER BY timestamp DESC LIMIT 50", + "rawSql": "SELECT timestamp, blob12 AS fly_region, blob13 AS action FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob1 = 'region.capacity_eviction' ORDER BY timestamp DESC LIMIT 50", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -985,13 +996,79 @@ "title": "Capacity Evictions", "type": "table" }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "description": "Top error messages from blob5, sample-corrected. 
Surfaces dominant error classes that the per-event 'Top Erroring Events' table hides by aggregating only on event name.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": 0 }] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "error" }, + "properties": [{ "id": "custom.width", "value": 600 }] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 15, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": false, + "format": "table", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT blob5 AS error, SUM(_sample_interval) AS count, COUNT(DISTINCT blob1) AS distinct_events, COUNT(DISTINCT blob8) AS distinct_sandboxes FROM kiloclaw_events WHERE $timeFilter AND blob5 != ''\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY error ORDER BY count DESC LIMIT 25", + "rawSql": "SELECT blob5 AS error, SUM(_sample_interval) AS count, COUNT(DISTINCT blob1) AS distinct_events, COUNT(DISTINCT blob8) AS distinct_sandboxes FROM kiloclaw_events WHERE $timeFilter AND blob5 != ''\n 
$conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY error ORDER BY count DESC LIMIT 25", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "kiloclaw_events", + "useWindowFuncForMacros": true + } + ], + "title": "Top Error Messages", + "type": "table" + }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 31 + "y": 39 }, "id": 103, "panels": [], @@ -1022,8 +1099,9 @@ "h": 10, "w": 12, "x": 0, - "y": 32 + "y": 40 }, + "description": "100 most recent events with a non-empty error string. 
Best entry point for triage.", "id": 12, "options": { "cellHeight": "sm", @@ -1049,8 +1127,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT timestamp, blob1 AS event, blob5 AS error, blob8 AS sandbox_id, blob7 AS fly_machine_id, blob12 AS fly_region FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob5 != '' ORDER BY timestamp DESC LIMIT 100", - "rawSql": "SELECT timestamp, blob1 AS event, blob5 AS error, blob8 AS sandbox_id, blob7 AS fly_machine_id, blob12 AS fly_region FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND blob5 != '' ORDER BY timestamp DESC LIMIT 100", + "query": "SELECT timestamp, blob1 AS event, blob5 AS error, blob8 AS sandbox_id, blob7 AS fly_machine_id, blob12 AS fly_region FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob5 != '' ORDER BY timestamp DESC LIMIT 100", + "rawSql": "SELECT 
timestamp, blob1 AS event, blob5 AS error, blob8 AS sandbox_id, blob7 AS fly_machine_id, blob12 AS fly_region FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND blob5 != '' ORDER BY timestamp DESC LIMIT 100", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -1087,8 +1165,9 @@ "h": 10, "w": 12, "x": 12, - "y": 32 + "y": 40 }, + "description": "100 most recent instance.* and reconcile.* events with status and label context.", "id": 13, "options": { "cellHeight": "sm", @@ -1114,8 +1193,8 @@ "interval": "", "intervalFactor": 1, "nullifySparse": false, - "query": "SELECT timestamp, blob1 AS event, blob9 AS status, blob6 AS fly_app_name, blob7 AS fly_machine_id, blob8 AS sandbox_id, blob13 AS label FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND (blob1 LIKE 'instance.%' OR blob1 LIKE 'reconcile.%') ORDER BY timestamp DESC LIMIT 100", - "rawSql": "SELECT timestamp, blob1 AS event, blob9 AS status, blob6 AS fly_app_name, blob7 AS fly_machine_id, blob8 AS sandbox_id, blob13 AS label FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = 
${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event) AND (blob1 LIKE 'instance.%' OR blob1 LIKE 'reconcile.%') ORDER BY timestamp DESC LIMIT 100", + "query": "SELECT timestamp, blob1 AS event, blob9 AS status, blob6 AS fly_app_name, blob7 AS fly_machine_id, blob8 AS sandbox_id, blob13 AS label FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND (blob1 LIKE 'instance.%' OR blob1 LIKE 'reconcile.%') ORDER BY timestamp DESC LIMIT 100", + "rawSql": "SELECT timestamp, blob1 AS event, blob9 AS status, blob6 AS fly_app_name, blob7 AS fly_machine_id, blob8 AS sandbox_id, blob13 AS label FROM kiloclaw_events WHERE $timeFilter\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) AND (blob1 LIKE 'instance.%' OR blob1 LIKE 'reconcile.%') ORDER BY timestamp DESC LIMIT 100", "refId": "A", "round": "0s", "showFormattedSQL": false, @@ -1127,6 +1206,122 @@ ], "title": "Recent Lifecycle Events", 
"type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 104, + "panels": [], + "title": "Latency", + "type": "row" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "description": "Sample-weighted P50, P95, and P99 of double1 (operation duration) for events that report a non-zero duration.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": 0 }] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": false, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, quantileWeighted(0.5)(double1, 
_sample_interval) AS p50, quantileWeighted(0.95)(double1, _sample_interval) AS p95, quantileWeighted(0.99)(double1, _sample_interval) AS p99 FROM kiloclaw_events WHERE $timeFilter AND double1 > 0\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, quantileWeighted(0.5)(double1, _sample_interval) AS p50, quantileWeighted(0.95)(double1, _sample_interval) AS p95, quantileWeighted(0.99)(double1, _sample_interval) AS p99 FROM kiloclaw_events WHERE $timeFilter AND double1 > 0\n $conditionalTest(AND blob8 = ${sandboxId:sqlstring},, $sandboxId)\n $conditionalTest(AND blob2 = ${userId:sqlstring},, $userId)\n $conditionalTest(AND blob3 IN ($delivery),, $delivery)\n $conditionalTest(AND blob12 IN ($flyRegion),, $flyRegion)\n $conditionalTest(AND blob1 IN ($event),, $event)\n $conditionalTest(AND blob10 IN ($openclawVersion),, $openclawVersion)\n $conditionalTest(AND blob11 IN ($imageTag),, $imageTag)\n $conditionalTest(AND blob14 = ${orgId:sqlstring},, $orgId)\n $conditionalTest(AND blob15 = ${instanceId:sqlstring},, $instanceId) GROUP BY t ORDER BY t", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "kiloclaw_events", + "useWindowFuncForMacros": true + } + ], + "title": "Duration Percentiles (P50 / P95 / P99)", + "type": "timeseries" } ], "preload": false, @@ -1197,13 +1392,13 @@ "type": "vertamedia-clickhouse-datasource", "uid": "${DS_KILOCLAW}" 
}, - "definition": "SELECT DISTINCT blob3 FROM kiloclaw_events WHERE blob3 != '' ORDER BY blob3", + "definition": "SELECT blob3 FROM kiloclaw_events WHERE blob3 != '' GROUP BY blob3 ORDER BY blob3", "includeAll": true, "label": "Delivery", "multi": true, "name": "delivery", "options": [], - "query": "SELECT DISTINCT blob3 FROM kiloclaw_events WHERE blob3 != '' ORDER BY blob3", + "query": "SELECT blob3 FROM kiloclaw_events WHERE blob3 != '' GROUP BY blob3 ORDER BY blob3", "refresh": 2, "sort": 1, "type": "query" @@ -1219,13 +1414,13 @@ "type": "vertamedia-clickhouse-datasource", "uid": "${DS_KILOCLAW}" }, - "definition": "SELECT DISTINCT blob12 FROM kiloclaw_events WHERE blob12 != '' ORDER BY blob12", + "definition": "SELECT blob12 FROM kiloclaw_events WHERE blob12 != '' GROUP BY blob12 ORDER BY blob12", "includeAll": true, "label": "Fly Region", "multi": true, "name": "flyRegion", "options": [], - "query": "SELECT DISTINCT blob12 FROM kiloclaw_events WHERE blob12 != '' ORDER BY blob12", + "query": "SELECT blob12 FROM kiloclaw_events WHERE blob12 != '' GROUP BY blob12 ORDER BY blob12", "refresh": 2, "sort": 1, "type": "query" @@ -1241,16 +1436,100 @@ "type": "vertamedia-clickhouse-datasource", "uid": "${DS_KILOCLAW}" }, - "definition": "SELECT DISTINCT blob1 FROM kiloclaw_events ORDER BY blob1", + "definition": "SELECT blob1 FROM kiloclaw_events GROUP BY blob1 ORDER BY blob1", "includeAll": true, "label": "Event", "multi": true, "name": "event", "options": [], - "query": "SELECT DISTINCT blob1 FROM kiloclaw_events ORDER BY blob1", + "query": "SELECT blob1 FROM kiloclaw_events GROUP BY blob1 ORDER BY blob1", "refresh": 2, "sort": 1, "type": "query" + }, + { + "allValue": "", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "definition": "SELECT blob10 FROM kiloclaw_events WHERE blob10 != '' GROUP BY blob10 ORDER BY blob10", + "includeAll": 
true, + "label": "OpenClaw Version", + "multi": true, + "name": "openclawVersion", + "options": [], + "query": "SELECT blob10 FROM kiloclaw_events WHERE blob10 != '' GROUP BY blob10 ORDER BY blob10", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": "", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "${DS_KILOCLAW}" + }, + "definition": "SELECT blob11 FROM kiloclaw_events WHERE blob11 != '' GROUP BY blob11 ORDER BY blob11", + "includeAll": true, + "label": "Image Tag", + "multi": true, + "name": "imageTag", + "options": [], + "query": "SELECT blob11 FROM kiloclaw_events WHERE blob11 != '' GROUP BY blob11 ORDER BY blob11", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "", + "value": "" + }, + "hide": 0, + "label": "Org ID", + "name": "orgId", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "skipUrlSync": false, + "type": "textbox" + }, + { + "current": { + "selected": true, + "text": "", + "value": "" + }, + "hide": 0, + "label": "Instance ID", + "name": "instanceId", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "skipUrlSync": false, + "type": "textbox" } ] }, diff --git a/dev/grafana/provisioning/datasources/kiloclaw-clickhouse.yml b/dev/grafana/provisioning/datasources/kiloclaw-clickhouse.yml index e233eb8d4e..b75dacfc4f 100644 --- a/dev/grafana/provisioning/datasources/kiloclaw-clickhouse.yml +++ b/dev/grafana/provisioning/datasources/kiloclaw-clickhouse.yml @@ -19,6 +19,9 @@ datasources: secure: ${GRAFANA_CLICKHOUSE_SECURE} username: '' defaultDatabase: default + # tlsSkipVerify must be false in any non-local environment — we point at + # api.cloudflare.com over public TLS. The env var exists only so local + # dev can override against a self-signed proxy if ever needed. 
tlsSkipVerify: ${GRAFANA_CLICKHOUSE_SKIP_TLS_VERIFY} dialTimeout: 10 httpHeaderName1: Authorization @@ -26,4 +29,3 @@ datasources: validateSql: false secureJsonData: httpHeaderValue1: Bearer ${CF_AE_TOKEN} - xHeaderKey: Bearer ${CF_AE_TOKEN}