From bf550aacbab958179e39255bb51f24acfe726ec5 Mon Sep 17 00:00:00 2001 From: 7840vz <122374011+7840vz@users.noreply.github.com> Date: Fri, 21 Apr 2023 14:54:44 +0300 Subject: [PATCH] feat: map ems severity to prom sev (#1973) * feat: map ems severity to prom sev Uses label 'severity' coming from EMS and maps to recommended prometheus scale of info/warning/critical: https://monitoring.mixins.dev/#guidelines-for-alert-names-labels-and-annotations * fix: revert whitespace --- container/prometheus/ems_alert_rules.yml | 920 +++++++++++++++++++++-- 1 file changed, 865 insertions(+), 55 deletions(-) diff --git a/container/prometheus/ems_alert_rules.yml b/container/prometheus/ems_alert_rules.yml index b8cb14ca5..880da26b3 100644 --- a/container/prometheus/ems_alert_rules.yml +++ b/container/prometheus/ems_alert_rules.yml @@ -8,7 +8,22 @@ groups: - alert: Volume Anti-ransomware Monitoring State Changed expr: last_over_time(ems_events{message="arw.volume.state"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Anti-ransomware state was changed to [{{ $labels.op }}] for Volume uuid [{{ $labels.volume_uuid }}]." @@ -16,7 +31,22 @@ groups: - alert: Storage VM Anti-ransomware Monitoring State Changed expr: last_over_time(ems_events{message="arw.vserver.state"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Anti-ransomware state was changed to [{{ $labels.op }}] for SVM name [{{ $labels.vserverName }}]." @@ -24,7 +54,22 @@ groups: - alert: Ransomware Activity Detected expr: last_over_time(ems_events{message="callhome.arw.activity.seen"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Ransomware activity detected for Volume uuid [{{ $labels.volume_uuid }}]." @@ -32,7 +77,22 @@ groups: - alert: NVRAM Battery Low expr: last_over_time(ems_events{message="callhome.battery.low"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVRAM battery low for Node uuid [{{ $labels.node_uuid }}]" @@ -40,7 +100,22 @@ groups: - alert: HA Interconnect Down expr: last_over_time(ems_events{message="callhome.hainterconnect.down"} [1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "HA interconnect down for Node uuid [{{ $labels.node_uuid }}]." @@ -48,7 +123,22 @@ groups: - alert: Shadow Copy Failed expr: last_over_time(ems_events{message="cifs.shadowcopy.failure"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Shadow copy failed for Object uuid [{{ $labels.object_uuid }}]." @@ -56,7 +146,22 @@ groups: - alert: AWS Credentials Not Initialized expr: last_over_time(ems_events{message="cloud.aws.iamNotInitialized"} [5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "AWS credentials not initialized on Node uuid [{{ $labels.node_uuid }}]." @@ -64,7 +169,22 @@ groups: - alert: Storage Switch Power Supplies Failed expr: last_over_time(ems_events{message="cluster.switch.pwr.fail"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Storage switch power supplies failed on Cluster uuid [{{ $labels.cluster_uuid }}]." @@ -72,7 +192,22 @@ groups: - alert: Disk Out of Service expr: last_over_time(ems_events{message="disk.outOfService"} [5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Disk out of service for Node uuid [{{ $labels.node_uuid }}]." @@ -80,7 +215,22 @@ groups: - alert: FabricPool Space Usage Limit Reached expr: last_over_time(ems_events{message="fabricpool.full"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "FabricPool space usage limit reached for Cluster uuid [{{ $labels.cluster_uuid }}]." @@ -88,7 +238,22 @@ groups: - alert: FabricPool Space Usage Limit Nearly Reached expr: last_over_time(ems_events{message="fabricpool.nearly.full"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "FabricPool space usage limit nearly reached for Cluster uuid [{{ $labels.cluster_uuid }}]." @@ -96,7 +261,22 @@ groups: - alert: Giveback of Aggregate Failed expr: last_over_time(ems_events{message="gb.netra.ca.check.failed"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Giveback of aggregate failed of Aggregate uuid [{{ $labels.aggr_uuid }}]." @@ -104,7 +284,22 @@ groups: - alert: LUN Destroyed expr: last_over_time(ems_events{message="LUN.destroy"} [5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "LUN destroyed of Object uuid [{{ $labels.object_uuid }}]." @@ -112,7 +307,22 @@ groups: - alert: LUN Offline expr: last_over_time(ems_events{message="LUN.offline"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Lun offline for Object uuid [{{ $labels.object_uuid }}]" @@ -120,7 +330,22 @@ groups: - alert: Node Root Volume Space Low expr: last_over_time(ems_events{message="mgmtgwd.rootvolrec.low.space"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Node root volume space low on Node uuid [{{ $labels.node_uuid }}]" @@ -128,7 +353,22 @@ groups: - alert: System Cannot Operate Due to Main Unit Fan Failure expr: last_over_time(ems_events{message="monitor.fan.critical"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "System cannot operate due to main unit fan failure for Node uuid [{{ $labels.node_uuid }}]" @@ -136,7 +376,22 @@ groups: - alert: Main Unit Fan Failed expr: last_over_time(ems_events{message="monitor.fan.failed"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Main unit fan failed for Node uuid [{{ $labels.node_uuid }}]" @@ -144,7 +399,22 @@ groups: - alert: Main Unit Fan in Warning State expr: last_over_time(ems_events{message="monitor.fan.warning"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Main unit fan in warning state for Node uuid [{{ $labels.node_uuid }}]" @@ -152,7 +422,22 @@ groups: - alert: Too Many CIFS Authentication expr: last_over_time(ems_events{message="Nblade.cifsManyAuths"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Too many CIFS authentication on Object uuid [{{ $labels.object_uuid }}]" @@ -160,7 +445,22 @@ groups: - alert: Max Times Open Per File Exceeded expr: last_over_time(ems_events{message="Nblade.cifsMaxOpenSameFile"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Max times open per file exceeded on Object uuid [{{ $labels.object_uuid }}]" @@ -168,7 +468,22 @@ groups: - alert: Max Sessions Per User Exceeded expr: last_over_time(ems_events{message="Nblade.cifsMaxSessPerUsrConn"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Max sessions per user exceeded for Object uuid [{{ $labels.object_uuid }}]" @@ -176,7 +491,22 @@ groups: - alert: NetBIOS Name Conflict expr: last_over_time(ems_events{message="Nblade.cifsNbNameConflict"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NetBIOS name conflict for Object uuid [{{ $labels.object_uuid }}]" @@ -184,7 +514,22 @@ groups: - alert: Nonexistent Admin Share expr: last_over_time(ems_events{message="Nblade.cifsNoPrivShare"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Nonexistent admin share for Object uuid [{{ $labels.object_uuid }}]" @@ -192,7 +537,22 @@ groups: - alert: NFSv4 Store Pool Exhausted expr: last_over_time(ems_events{message="Nblade.nfsV4PoolExhaust"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NFSv4 store pool exhausted for Object uuid [{{ $labels.object_uuid }}]" @@ -200,7 +560,22 @@ groups: - alert: Unauthorized User Access to Admin Share expr: last_over_time(ems_events{message="Nblade.vscanBadUserPrivAccess"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Unauthorized user access to admin share for Object uuid [{{ $labels.object_uuid }}]" @@ -208,7 +583,22 @@ groups: - alert: Antivirus Server Busy expr: last_over_time(ems_events{message="Nblade.vscanConnBackPressure"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Antivirus server busy for Object uuid [{{ $labels.object_uuid }}]" @@ -216,7 +606,22 @@ groups: - alert: No Registered Scan Engine expr: last_over_time(ems_events{message="Nblade.vscanNoRegdScanner"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "No registered scan engine for Object uuid [{{ $labels.object_uuid }}]" @@ -224,7 +629,22 @@ groups: - alert: No Vscan Connection expr: last_over_time(ems_events{message="Nblade.vscanNoScannerConn"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "No vscan connection for Object uuid [{{ $labels.object_uuid }}]" @@ -232,7 +652,22 @@ groups: - alert: Virus Detected expr: last_over_time(ems_events{message="Nblade.vscanVirusDetected"}[1w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Virus detected for Object uuid [{{ $labels.object_uuid }}]" @@ -240,7 +675,22 @@ groups: - alert: Non-responsive AntiVirus Server expr: last_over_time(ems_events{message="Nblade.vscanConnInactive"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Non-responsive antiVirus server for Object uuid [{{ $labels.object_uuid }}]" @@ -248,7 +698,22 @@ groups: - alert: NVMe Namespace Destroyed expr: last_over_time(ems_events{message="NVMeNS.destroy"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe namespace destroyed for Object uuid [{{ $labels.object_uuid }}]" @@ -256,7 +721,22 @@ groups: - alert: NVMe Namespace Offline expr: last_over_time(ems_events{message="NVMeNS.offline"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe namespace offline for Object uuid [{{ $labels.object_uuid }}]" @@ -264,7 +744,22 @@ groups: - alert: NVMe Namespace Online expr: last_over_time(ems_events{message="NVMeNS.online"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe namespace online for Object uuid [{{ $labels.object_uuid }}]" @@ -272,7 +767,22 @@ groups: - alert: NVMe-oF Grace Period Active expr: last_over_time(ems_events{message="nvmf.graceperiod.active"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe-oF grace period active for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -280,7 +790,22 @@ groups: - alert: NVMe-oF Grace Period Expired expr: last_over_time(ems_events{message="nvmf.graceperiod.expired"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe-oF grace period expired for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -288,7 +813,22 @@ groups: - alert: NVMe-oF Grace Period Start expr: last_over_time(ems_events{message="nvmf.graceperiod.start"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe-oF grace period start for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -296,7 +836,22 @@ groups: - alert: Cloud Tier Unreachable expr: last_over_time(ems_events{message="object.store.unavailable"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Cloud tier unreachable for Node uuid [{{ $labels.node_uuid }}]" @@ -304,7 +859,22 @@ groups: - alert: Object Store Host Unresolvable expr: last_over_time(ems_events{message="objstore.host.unresolvable"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Object store host unresolvable for Node uuid [{{ $labels.node_uuid }}]" @@ -312,7 +882,22 @@ groups: - alert: Object Store Intercluster LIF Down expr: last_over_time(ems_events{message="objstore.interclusterlifDown"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Object store intercluster LIF down for Node uuid [{{ $labels.node_uuid }}]" @@ -320,7 +905,22 @@ groups: - alert: Object Store Signature Mismatch expr: last_over_time(ems_events{message="osc.signatureMismatch"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Object store signature mismatch for Node uuid [{{ $labels.node_uuid }}]" @@ -328,7 +928,22 @@ groups: - alert: QoS Monitor Memory Maxed Out expr: last_over_time(ems_events{message="qos.monitor.memory.maxed"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "QoS monitor memory maxed out for Object uuid [{{ $labels.object_uuid }}]" @@ -336,7 +951,22 @@ groups: - alert: SAN [active-active] State Changed expr: last_over_time(ems_events{message="scsiblade.san.config.active"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "SAN [active-active] state changed for Object uuid [{{ $labels.object_uuid }}]" @@ -344,7 +974,22 @@ groups: - alert: FC Target Port Commands Exceeded expr: last_over_time(ems_events{message="scsitarget.fct.port.full"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "FC target port commands exceeded for Port Name [{{ $labels.portname }}]" @@ -352,7 +997,22 @@ groups: - alert: Shelf Fan Failed expr: last_over_time(ems_events{message="ses.status.fanError"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Shelf fan failed for Node uuid [{{ $labels.node_uuid }}]" @@ -360,7 +1020,22 @@ groups: - alert: Node Panic expr: last_over_time(ems_events{message="sk.panic"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Node panic for Node uuid [{{ $labels.node_uuid }}]" @@ -368,7 +1043,22 @@ groups: - alert: SnapMirror Relationship Out of Sync expr: last_over_time(ems_events{message="sms.status.out.of.sync"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "SnapMirror relationship out of sync for Relationship id [{{ $labels.relationship_id }}]" @@ -376,7 +1066,22 @@ groups: - alert: Service Processor Offline expr: last_over_time(ems_events{message="sp.ipmi.lost.shutdown"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Service processor offline for Node uuid [{{ $labels.node_uuid }}]" @@ -384,7 +1089,22 @@ groups: - alert: Service Processor Not Configured expr: last_over_time(ems_events{message="sp.notConfigured"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Service processor not configured for Node uuid [{{ $labels.node_uuid }}]" @@ -392,7 +1112,22 @@ groups: - alert: Unassigned Disks expr: last_over_time(ems_events{message="unowned.disk.reminder"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Unassigned disks for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -400,15 +1135,45 @@ groups: - alert: Storage VM Stop Succeeded expr: last_over_time(ems_events{message="vserver.stop.succeeded"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: - summary: "Storage VM stop succeeded for instance uuid [{{ $labels.inst_uuid }}]" + summary: "Storage VM stop succeeded for instance uuid [{{ $labels.instuuid }}]" # Alert for READDIR timeout ems - alert: READDIR Timeout expr: last_over_time(ems_events{message="wafl.readdir.expired"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "READDIR timeout for Object uuid [{{ $labels.object_uuid }}]" @@ -416,7 +1181,22 @@ groups: - alert: Volume Automatic Resizing Succeeded expr: last_over_time(ems_events{message="wafl.vol.autoSize.done"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Volume automatic resizing succeeded for Object uuid [{{ $labels.object_uuid }}]" @@ -424,7 +1204,22 @@ groups: - alert: Volume Offline expr: last_over_time(ems_events{message="wafl.vvol.offline"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Volume offline for instance uuid [{{ $labels.inst_uuid }}]" @@ -432,6 +1227,21 @@ groups: - alert: Volume Restricted expr: last_over_time(ems_events{message="wafl.vvol.restrict"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Volume restricted for instance uuid [{{ $labels.inst_uuid }}]"