Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
707 lines (707 sloc) 21.1 KB
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "An example 'Getting Started' dashboard for monitoring IBM Event Streams",
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"id": 2,
"links": [],
"refresh": "5s",
"rows": [
{
"collapse": false,
"height": 317,
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"description": "BytesInPerSec/BytesOutPerSec: Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.",
"fill": 1,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "kafka_server_brokertopicmetrics_bytesinpersec_count{kubernetes_namespace=~\"$ns\", release=~\"$rn\"}",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"legendFormat": "{{kubernetes_pod_name}}",
"metric": "kafka_server_brokertopicmetrics_bytesinpersec_count",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": "30m",
"timeShift": null,
"title": "Bytes in per second",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": false,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "Bytes",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"description": "BytesInPerSec/BytesOutPerSec: Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.",
"fill": 1,
"id": 7,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "kafka_server_brokertopicmetrics_bytesoutpersec_count{kubernetes_namespace=~\"$ns\", release=~\"$rn\"}",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"legendFormat": "{{kubernetes_pod_name}}",
"metric": "kafka_server_brokertopicmetrics_bytesoutpersec_count",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": "30m",
"timeShift": null,
"title": "Bytes out per second",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "Bytes",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Throughput",
"titleSize": "h6"
},
{
"collapse": false,
"height": 133,
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "prometheus",
"description": "ActiveControllerCount: The first node to boot in a Kafka cluster automatically becomes the controller, and there can be only one. The controller in a Kafka cluster is responsible for maintaining the list of partition leaders, and coordinating leadership transitions (in the event a partition leader becomes unavailable). If it becomes necessary to replace the controller, a new controller is randomly chosen by ZooKeeper from the pool of brokers. In general, it is not possible for this value to be greater than one, but you should definitely alert on a value of zero that lasts for more than a short period (< 1s) of time.",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 5,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 6,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(kafka_controller_kafkacontroller_activecontrollercount_value{kubernetes_namespace=~\"$ns\", release=~\"$rn\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
"metric": "kafka_controller_kafkacontroller_activecontrollercount_value",
"refId": "A",
"step": 4
}
],
"thresholds": "1,1",
"title": "Active Controller Count",
"type": "singlestat",
"valueFontSize": "200%",
"valueMaps": [
{
"op": "=",
"text": "1",
"value": "1"
},
{
"op": "=",
"text": "Error - No controller",
"value": "0"
},
{
"op": "=",
"text": "Error - Multiple Controllers",
"value": "2"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "prometheus",
"description": "UnderReplicatedPartitions: In a healthy cluster, the number of in sync replicas (ISRs) should be exactly equal to the total number of replicas. If partition replicas fall too far behind their leaders, the follower partition is removed from the ISR pool, and you should see a corresponding increase in IsrShrinksPerSec. Since Kafka’s high-availability guarantees cannot be met without replication, investigation is certainly warranted should this metric value exceed zero for extended time periods.",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 8,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 6,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(kafka_server_replicamanager_underreplicatedpartitions_value{kubernetes_namespace=~\"$ns\", release=~\"$rn\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
"metric": "kafka_server_replicamanager_underreplicatedpartitions_value",
"refId": "A",
"step": 4
}
],
"thresholds": "1,1",
"title": "Under Replicated Partitions",
"type": "singlestat",
"valueFontSize": "200%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Cluster Health",
"titleSize": "h6"
},
{
"collapse": false,
"height": 115,
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "prometheus",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 3,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(kafka_controller_kafkacontroller_globalpartitioncount_value{kubernetes_namespace=~\"$ns\", release=~\"$rn\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
"metric": "kafka_controller_kafkacontroller_globalpartitioncount_value",
"refId": "A",
"step": 4
}
],
"thresholds": "",
"title": "Global Partitions Count",
"type": "singlestat",
"valueFontSize": "200%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "prometheus",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 2,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(kafka_controller_kafkacontroller_globaltopiccount_value{kubernetes_namespace=~\"$ns\", release=~\"$rn\"})",
"format": "time_series",
"instant": true,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"metric": "kafka_controller_kafkacontroller_globaltopiccount_value",
"refId": "A",
"step": 4
}
],
"thresholds": "",
"title": "Global Topic Count",
"type": "singlestat",
"valueFontSize": "200%",
"valueMaps": [],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "prometheus",
"description": "OfflinePartitionsCount (controller only): This metric reports the number of partitions without an active leader. Because all read and write operations are only performed on partition leaders, a non-zero value for this metric should be alerted on to prevent service interruptions. Any partition without an active leader will be completely inaccessible, and both consumers and producers of that partition will be blocked until a leader becomes available.",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 6,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(kafka_controller_kafkacontroller_offlinepartitionscount_value{kubernetes_namespace=~\"$ns\", release=~\"$rn\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
"metric": "kafka_controller_kafkacontroller_offlinepartitionscount_value",
"refId": "A",
"step": 4
}
],
"thresholds": "1,1",
"title": "Offline Partitions Count",
"type": "singlestat",
"valueFontSize": "200%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Topics & Partitions",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": null,
"current": {
"selected": true,
"text": "dev",
"value": "dev"
},
"datasource": "prometheus",
"hide": 0,
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "ns",
"options": [],
"query": "label_values(kubernetes_namespace)",
"refresh": 1,
"regex": "^(?!kube-system).*$",
"sort": 2,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"selected": true,
"tags": [],
"text": "v100-0522",
"value": "v100-0522"
},
"datasource": "prometheus",
"hide": 0,
"includeAll": false,
"label": "Release",
"multi": false,
"name": "rn",
"options": [],
"query": "label_values(release)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "IBM Event Streams",
"version": 8
}