From 41c602f41e89994c929cc1011ef53a6df56fa41e Mon Sep 17 00:00:00 2001 From: Rahul Date: Mon, 1 Jul 2024 23:10:12 +0530 Subject: [PATCH] feat: Harvest should remove Service Center metrics (#3019) * feat: Harvest should remove Service Center metrics --- cmd/collectors/restperf/restperf.go | 6 +- cmd/collectors/zapiperf/zapiperf.go | 17 +- conf/restperf/9.12.0/workload_detail.yaml | 1 + .../9.12.0/workload_detail_volume.yaml | 1 + conf/zapiperf/cdot/9.8.0/workload_detail.yaml | 1 + .../cdot/9.8.0/workload_detail_volume.yaml | 1 + grafana/dashboards/cmode/workload.json | 1572 +---------------- 7 files changed, 28 insertions(+), 1571 deletions(-) diff --git a/cmd/collectors/restperf/restperf.go b/cmd/collectors/restperf/restperf.go index 76f728347..b552921fe 100644 --- a/cmd/collectors/restperf/restperf.go +++ b/cmd/collectors/restperf/restperf.go @@ -50,7 +50,7 @@ var qosDetailQuery = "api/cluster/counter/tables/qos_detail" var qosDetailVolumeQuery = "api/cluster/counter/tables/qos_detail_volume" var qosWorkloadQuery = "api/storage/qos/workloads" -var workloadDetailMetrics = []string{"resource_latency", "service_time_latency"} +var workloadDetailMetrics = []string{"resource_latency"} var qosQueries = map[string]string{ qosQuery: qosQuery, @@ -171,6 +171,10 @@ func (r *RestPerf) InitQOS() error { if withConstituents == "false" { r.perfProp.disableConstituents = true } + withServiceLatency := refine.GetChildContentS("with_service_latency") + if withServiceLatency != "false" { + workloadDetailMetrics = append(workloadDetailMetrics, "service_time_latency") + } } } return nil diff --git a/cmd/collectors/zapiperf/zapiperf.go b/cmd/collectors/zapiperf/zapiperf.go index 6c5d0e35c..1a57fba81 100644 --- a/cmd/collectors/zapiperf/zapiperf.go +++ b/cmd/collectors/zapiperf/zapiperf.go @@ -69,7 +69,7 @@ const ( BILLION = 1_000_000_000 ) -var workloadDetailMetrics = []string{"resource_latency", "service_time_latency"} +var workloadDetailMetrics = []string{"resource_latency"} type ZapiPerf struct { *zapi.Zapi // provides: AbstractCollector, Client, Object, Query, TemplateFn, TemplateType @@ -119,10 +119,25 @@ func (z *ZapiPerf) Init(a *collector.AbstractCollector) error { return err } + z.InitQOS() + z.Logger.Debug().Msg("initialized") return nil } +func (z *ZapiPerf) InitQOS() { + counters := z.Params.GetChildS("counters") + if counters != nil { + refine := counters.GetChildS("refine") + if refine != nil { + withServiceLatency := refine.GetChildContentS("with_service_latency") + if withServiceLatency != "false" { + workloadDetailMetrics = append(workloadDetailMetrics, "service_time_latency") + } + } + } +} + func (z *ZapiPerf) LoadPlugin(kind string, abc *plugin.AbstractPlugin) plugin.Plugin { switch kind { case "Nic": diff --git a/conf/restperf/9.12.0/workload_detail.yaml b/conf/restperf/9.12.0/workload_detail.yaml index e6564b006..9e8deefa2 100644 --- a/conf/restperf/9.12.0/workload_detail.yaml +++ b/conf/restperf/9.12.0/workload_detail.yaml @@ -17,6 +17,7 @@ counters: - wait_time - refine: - with_constituents: false # The possible values are true or false. Setting this to true will include constituents in the results, while false will exclude them. + - with_service_latency: false # The possible values are true or false. Setting this to true will generate metric qos_detail_service_time_latency which is average service time for workload within the subsystems. This latency is the processing time within the subsystem. resource_map: CPU_dblade: backend diff --git a/conf/restperf/9.12.0/workload_detail_volume.yaml b/conf/restperf/9.12.0/workload_detail_volume.yaml index 89facad48..252188795 100644 --- a/conf/restperf/9.12.0/workload_detail_volume.yaml +++ b/conf/restperf/9.12.0/workload_detail_volume.yaml @@ -16,6 +16,7 @@ counters: - wait_time - refine: - with_constituents: false # The possible values are true or false. Setting this to true will include constituents in the results, while false will exclude them. + - with_service_latency: false # The possible values are true or false. Setting this to true will generate metric qos_detail_service_time_latency which is average service time for workload within the subsystems. This latency is the processing time within the subsystem. resource_map: CPU_dblade : backend diff --git a/conf/zapiperf/cdot/9.8.0/workload_detail.yaml b/conf/zapiperf/cdot/9.8.0/workload_detail.yaml index 53b44a595..f97673bd0 100644 --- a/conf/zapiperf/cdot/9.8.0/workload_detail.yaml +++ b/conf/zapiperf/cdot/9.8.0/workload_detail.yaml @@ -20,6 +20,7 @@ counters: - wait_time - refine: - with_constituents: false # The possible values are true or false. Setting this to true will include constituents in the results, while false will exclude them. + - with_service_latency: false # The possible values are true or false. Setting this to true will generate metric qos_detail_service_time_latency which is average service time for workload within the subsystems. This latency is the processing time within the subsystem. resource_map: CPU_dblade: backend diff --git a/conf/zapiperf/cdot/9.8.0/workload_detail_volume.yaml b/conf/zapiperf/cdot/9.8.0/workload_detail_volume.yaml index 41f26baa3..bd52c2e4d 100644 --- a/conf/zapiperf/cdot/9.8.0/workload_detail_volume.yaml +++ b/conf/zapiperf/cdot/9.8.0/workload_detail_volume.yaml @@ -19,6 +19,7 @@ counters: - wait_time - refine: - with_constituents: false # The possible values are true or false. Setting this to true will include constituents in the results, while false will exclude them. + - with_service_latency: false # The possible values are true or false. Setting this to true will generate metric qos_detail_service_time_latency which is average service time for workload within the subsystems. This latency is the processing time within the subsystem. resource_map: CPU_dblade : backend diff --git a/grafana/dashboards/cmode/workload.json b/grafana/dashboards/cmode/workload.json index 094208b0a..1729761b4 100644 --- a/grafana/dashboards/cmode/workload.json +++ b/grafana/dashboards/cmode/workload.json @@ -65,7 +65,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1718893624466, + "iteration": 1719479251226, "links": [ { "asDropdown": true, @@ -4509,1572 +4509,6 @@ "title": "Read IO Type", "type": "row" }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 73 - }, - "id": 186, - "panels": [ - { - "datasource": "${DS_PROMETHEUS}", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "µs" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 61 - }, - "id": 187, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "(avg(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\"}) by (resource))", - "interval": "", - "legendFormat": "{{resource}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Service Latency by Resources", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents the delays in the network layer of ONTAP.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 71 - }, - "id": 189, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"frontend\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"frontend\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"frontend\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from frontend", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents the delays in the data/WAFL layer of ONTAP.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 71 - }, - "id": 191, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"backend\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"backend\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"backend\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from backend", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents delays caused by the cluster switches, cables, and adapters which physically connect clustered nodes. \n\nIf the cluster interconnect component is in contention, it means high wait time for I/O requests at the cluster interconnect is impacting the latency of one or more workloads.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 81 - }, - "id": 193, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"cluster\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"cluster\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"cluster\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from cluster", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents delays due to buffered write flushes, called consistency points (cp).", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 81 - }, - "id": 195, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"cp\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"cp\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"cp\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from cp", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents slowness due to attached hard drives or solid state drives.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 91 - }, - "id": 197, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"disk\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"disk\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"disk\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from disk", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "`Note:` Typically these latencies only apply to SAN not NAS.\n\nRepresents the wait time of I/O requests by the external networking protocols on the cluster. The wait time is time spent waiting for transfer ready transactions to finish before the cluster can respond to an I/O request. If the network component is in contention, it means high wait time at the protocol layer is impacting the latency of one or more workloads.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 91 - }, - "id": 199, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"network\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"network\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"network\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from network", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents delays due to mirroring writes to the NVRAM/NVLOG memory and to the HA partner NVRAM/NVLOG memory.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 101 - }, - "id": 201, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"nvlog\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"nvlog\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"nvlog\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from nvlog", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents delays due to operations suspending on a delay mechanism. Typically this is diagnosed by NetApp Support.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 101 - }, - "id": 203, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"suspend\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"suspend\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"suspend\"})\n)\n", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from WAFL suspend", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents the throughput maximum (ceiling) setting of the storage Quality of Service (QoS) policy group assigned to the workload. If the policy group component is in contention, it means all workloads in the policy group are being throttled by the set throughput limit, which is impacting the latency of one or more of those workloads.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 111 - }, - "id": 205, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"throttle\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"throttle\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"throttle\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from Qos throttle", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents the latency to a workload that is being caused by QoS throughput floor (expected) setting assigned to other workloads. If the QoS floor set on certain workloads use the majority of the bandwidth to guarantee the promised throughput, other workloads will be throttled and see more latency.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 111 - }, - "id": 207, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"qos_min\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"qos_min\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"qos_min\"})\n)\n", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from qos_min", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Represents the software component in the cluster involved with I/O processing between the cluster and the cloud tier on which user data is stored. If the cloud latency component is in contention, it means that a large amount of reads from volumes that are hosted on the cloud tier are impacting the latency of one or more workloads.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 121 - }, - "id": 209, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"cloud\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"cloud\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"cloud\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from cloud", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 121 - }, - "id": 211, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"sync_repl\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"sync_repl\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"sync_repl\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from sync_repl", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 131 - }, - "id": 213, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"flexcache_ral\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"flexcache_ral\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"flexcache_ral\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from flexcache_ral", - "type": "timeseries" - }, - { - "datasource": "${DS_PROMETHEUS}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [ - { - "options": { - "match": "null+nan", - "result": { - "index": 0, - "text": "0%" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 131 - }, - "id": 215, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.8", - "targets": [ - { - "exemplar": false, - "expr": "100 * (\n (qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"flexcache_spinhi\"}\n and\n topk($TopResources, avg_over_time(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",workload=~\"$Workload\",resource=\"flexcache_spinhi\"}[3h] @ end())))\n / on() group_left sum(qos_detail_service_time_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",resource=\"flexcache_spinhi\"})\n)", - "instant": false, - "interval": "", - "legendFormat": "{{cluster}} - {{workload}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Top $TopResources Workloads by Service Time from flexcache_spinhi", - "type": "timeseries" - } - ], - "title": "Service Center", - "type": "row" - }, { "collapsed": true, "datasource": "${DS_PROMETHEUS}", @@ -6082,7 +4516,7 @@ "h": 1, "w": 24, "x": 0, - "y": 74 + "y": 73 }, "id": 127, "panels": [ @@ -7908,5 +6342,5 @@ "timezone": "", "title": "ONTAP: Workload", "uid": "cdot-workload", - "version": 13 + "version": 14 }