Skip to content

Commit

Permalink
Add option to exclude alarms and usage elements
Browse files Browse the repository at this point in the history
  • Loading branch information
martialblog committed Aug 21, 2023
1 parent ad21861 commit fae7be6
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 14 deletions.
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Please prefer installation via system packages like `python3-requests`.

Alternatively you can install with pip:

pip3 install requests
pip3 install -r requirements.txt

Make sure to modify the shebang to your environment, one of the following should be fine.

Expand All @@ -43,12 +43,23 @@ optional arguments:
--password PASSWORD, -p PASSWORD
Password for Basic Auth
--mode MODE, -m MODE Check mode
--exclude [EXCLUDE ...]
Exclude alarms or usage from the check results. Can be used multiple times and supports regular expressions.
--max-age MAX_AGE, -M MAX_AGE
Max age in minutes for capacity usage updates. Defaults to 5
--version, -V Print version
--insecure Do not verify TLS certificate. Be careful with this option, please
```

The `--exclude` parameter is matched against alarms and capacity-usage entries. Each entry is represented by the following fields, joined by single spaces, and every exclude pattern is searched as a regular expression within that string:

* alarms: `severity` `node_display_name` `feature_display_name` `event_type_display_name`
* capacity-usage: `severity` `display_name`

## Examples

Mode: cluster-status

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode cluster-status
[OK] control_cluster_status=STABLE - mgmt_cluster_status=STABLE - control_cluster_status=STABLE - nodes_online=3
Expand All @@ -66,14 +77,25 @@ $ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password
| nodes_online=3;;;0
```

Mode: alarms

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode alarms
[WARNING] 1 alarms - 1 medium
[MEDIUM] (2021-04-26 17:25:18) (node1) Intelligence Health/Storage Latency High - Intelligence node storage latency is high.
| alarms=1;;;0 alarms.medium=1;;;0
```

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode alarms --exclude "LOW"
# Excluded alarms are still counted, but are not factored into the exit code
[OK] 1 alarms
| alarms=1;;;0
```

Mode: capacity-usage

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode capacity-usage
[OK] 28 info - no usages - last update: 2021-04-29 19:06:12
Expand Down
76 changes: 63 additions & 13 deletions check_vmware_nsxt.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import logging
import datetime
import ssl
import re
from urllib.parse import urljoin
import urllib3
import requests
Expand Down Expand Up @@ -127,26 +128,26 @@ def request(self, url, method='GET'):
except Exception as json_exc:
raise CriticalException('Could not decode API JSON: ' + str(json_exc)) # pylint: disable=raise-missing-from

def get_cluster_status(self):
def get_cluster_status(self, excludes=None):
"""
GET and build ClusterStatus
"""
return ClusterStatus(self.request('cluster/status'))
return ClusterStatus(self.request('cluster/status'), excludes)

def get_alarms(self):
def get_alarms(self, excludes=None):
"""
GET and build Alarms
"""
status = "OPEN"
# status = "RESOLVED" # for testing
result = self.request('alarms?page_size=100&status=%s&sort_ascending=false' % status)
return Alarms(result['results'])
return Alarms(data=result['results'], excludes=excludes)

def get_capacity_usage(self):
def get_capacity_usage(self, excludes=None):
"""
GET and build CapacityUsage
"""
return CapacityUsage(self.request('capacity/usage'), self.max_age)
return CapacityUsage(self.request('capacity/usage'), self.max_age, excludes)


class CheckResult:
Expand Down Expand Up @@ -203,9 +204,12 @@ class ClusterStatus(CheckResult):
https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_ReadClusterStatus.html
"""

def __init__(self, data):
def __init__(self, data, excludes):
super().__init__()
self.data = data
self.excludes = excludes
if excludes is None:
self.excludes = []

def build_output(self):
for area in ['control_cluster_status', 'mgmt_cluster_status', 'control_cluster_status']:
Expand Down Expand Up @@ -234,14 +238,33 @@ class Alarms(CheckResult):
https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_GetAlarms.html
"""

def __init__(self, data):
def __init__(self, data, excludes):
super().__init__()
self.data = data
self.excludes = excludes
if excludes is None:
self.excludes = []

def _is_excluded(self, alarm):
    """
    Tell whether an alarm is matched by one of the --exclude patterns

    The alarm's severity, node_display_name, feature_display_name and
    event_type_display_name are joined with single spaces; each entry of
    self.excludes is then searched as a regular expression in that string.
    Returns True on the first pattern that matches, False otherwise.
    """
    fields = (
        'severity',
        'node_display_name',
        'feature_display_name',
        'event_type_display_name',
    )
    haystack = " ".join(str(alarm[field]) for field in fields)
    # any() stops at the first hit, so later patterns are not evaluated
    return any(re.search(pattern, haystack) is not None for pattern in self.excludes)

def build_output(self):
states = {}

for alarm in self.data:
if self._is_excluded(alarm):
continue

severity = alarm['severity']
if severity in states:
states[severity] += 1
Expand Down Expand Up @@ -270,7 +293,11 @@ def build_status(self):
states = []

for alarm in self.data:
state = WARNING if alarm['severity'] in ['MEDIUM', 'LOW'] else CRITICAL # CRITICAL, HIGH
if self._is_excluded(alarm):
continue

# HIGH == CRITICAL
state = WARNING if alarm['severity'] in ['MEDIUM', 'LOW'] else CRITICAL
states.append(state)

if len(states) > 0:
Expand All @@ -285,15 +312,33 @@ class CapacityUsage(CheckResult):
https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_GetProtonCapacityUsage.html
"""

def __init__(self, data, max_age):
def __init__(self, data, max_age, excludes):
super().__init__()
self.data = data
self.max_age = max_age
self.excludes = excludes
if excludes is None:
self.excludes = []

def _is_excluded(self, usage):
    """
    Tell whether a capacity usage entry is matched by one of the --exclude patterns

    The entry's severity and display_name are joined with a single space;
    each entry of self.excludes is then searched as a regular expression in
    that string. Returns True on the first pattern that matches, False
    otherwise.
    """
    haystack = " ".join(str(usage[field]) for field in ('severity', 'display_name'))
    # any() stops at the first hit, so later patterns are not evaluated
    return any(re.search(pattern, haystack) is not None for pattern in self.excludes)

def build_output(self):
states = {}

for usage in self.data['capacity_usage']:
if self._is_excluded(usage):
continue

severity = usage['severity'] # INFO, WARNING, CRITICAL, ERROR

if severity in states:
Expand Down Expand Up @@ -341,6 +386,9 @@ def build_status(self):
self.summary.append("last update older than %s minutes" % (self.max_age))

for usage in self.data['capacity_usage']:
if self._is_excluded(usage):
continue

severity = usage['severity'] # INFO, WARNING, CRITICAL, ERROR

if severity == "INFO":
Expand Down Expand Up @@ -398,6 +446,8 @@ def commandline(args):
help='Password for Basic Auth', required=True)
parser.add_argument('--mode', '-m', choices=['cluster-status', 'alarms', 'capacity-usage'],
help='Check mode to exectue. Hint: alarms will only include open alarms.', required=True)
parser.add_argument('--exclude', nargs='*', action='extend', type=str,
help="Exclude alarms or usage from the check results. Can be used multiple times and supports regular expressions.")
parser.add_argument('--max-age', '-M', type=int,
help='Max age in minutes for capacity usage updates. Defaults to 5', default=5, required=False)
parser.add_argument('--insecure',
Expand All @@ -421,11 +471,11 @@ def main(args):
client = Client(args.api, args.username, args.password, verify=(not args.insecure), max_age=args.max_age)

if args.mode == 'cluster-status':
return client.get_cluster_status().print_and_return()
return client.get_cluster_status(args.exclude).print_and_return()
if args.mode == 'alarms':
return client.get_alarms().print_and_return()
return client.get_alarms(args.exclude).print_and_return()
if args.mode == 'capacity-usage':
return client.get_capacity_usage().print_and_return()
return client.get_capacity_usage(args.exclude).print_and_return()

print("[UNKNOWN] unknown mode %s" % args.mode)
return UNKNOWN
Expand Down
37 changes: 37 additions & 0 deletions test_check_vmware_nsxt.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,26 @@ def test_alarms_ok(self, mock_req, mock_print):
self.assertEqual(actual, expected)
mock_print.assert_called_with('[WARNING] 1 alarms - 1 medium\n\n[MEDIUM] (2021-04-26 15:25:18) (node1) Intelligence Health/Storage Latency High - Intelligence node storage latency is high.\n| alarms=1;;;0 alarms.medium=1;;;0')

@mock.patch('builtins.print')
@mock.patch('requests.request')
def test_alarms_exclude(self, mock_req, mock_print):

with open('testdata/fixtures/alarms.json') as f:
testdata = json.load(f)

m = mock.MagicMock()
m.status_code = 200
m.json.return_value = testdata
mock_req.return_value = m

c = Client('api', 'username', 'password', logger=None, verify=True, max_age=5)

actual = c.get_alarms(excludes=["M[A-Z]+M"]).print_and_return()
expected = 0

self.assertEqual(actual, expected)
mock_print.assert_called_with('[OK] 1 alarms\n| alarms=1;;;0')

@mock.patch('builtins.print')
@mock.patch('requests.request')
def test_capacity_usage_ok(self, mock_req, mock_print):
Expand All @@ -172,3 +192,20 @@ def test_capacity_usage_ok(self, mock_req, mock_print):

self.assertEqual(actual, expected)
mock_print.assert_called_with('[WARNING] 28 info - last update: 2021-04-30 09:17:40 - last update older than 5 minutes\n\n[OK] [INFO] System-wide NAT rules: 0 of 25000 (0%)\n[OK] [INFO] Network Introspection Rules: 1 of 10000 (0.01%)\n[OK] [INFO] System-wide Endpoint Protection Enabled Hosts: 0 of 256 (0%)\n[OK] [INFO] Hypervisor Hosts: 18 of 1024 (1.75%)\n[OK] [INFO] System-wide Firewall Rules: 81 of 100000 (0.08%)\n[OK] [INFO] System-wide DHCP Pools: 0 of 10000 (0%)\n[OK] [INFO] System-wide Edge Nodes: 10 of 320 (3.12%)\n[OK] [INFO] Active Directory Domains (Identity Firewall): 0 of 4 (0%)\n[OK] [INFO] vSphere Clusters Prepared for NSX: 4 of 128 (3.12%)\n[OK] [INFO] Prefix-lists: 20 of 500 (4%)\n[OK] [INFO] Logical Switches: 12 of 10000 (0.12%)\n[OK] [INFO] System-wide Logical Switch Ports: 145 of 25000 (0.58%)\n[OK] [INFO] Active Directory Groups (Identity Firewall): 0 of 100000 (0%)\n[OK] [INFO] Distributed Firewall Rules: 75 of 100000 (0.07%)\n[OK] [INFO] System-wide Endpoint Protection Enabled Virtual Machines: 0 of 7500 (0%)\n[OK] [INFO] Distributed Firewall Sections: 23 of 10000 (0.23%)\n[OK] [INFO] Groups Based on IP Sets: 37 of 10000 (0.37%)\n[OK] [INFO] Edge Clusters: 3 of 160 (1.87%)\n[OK] [INFO] Tier-1 Logical Routers with NAT Enabled: 0 of 4000 (0%)\n[OK] [INFO] System-wide Firewall Sections: 29 of 10000 (0.29%)\n[OK] [INFO] Network Introspection Sections: 1 of 500 (0.2%)\n[OK] [INFO] Groups: 74 of 20000 (0.37%)\n[OK] [INFO] Tier-1 Logical Routers: 4 of 4000 (0.1%)\n[OK] [INFO] IP Sets: 37 of 10000 (0.37%)\n[OK] [INFO] Network Introspection Service Chains: 0 of 24 (0%)\n[OK] [INFO] Network Introspection Service Paths: 0 of 4000 (0%)\n[OK] [INFO] Tier-0 Logical Routers: 2 of 160 (1.25%)\n[OK] [INFO] DHCP Server Instances: 0 of 10000 (0%)\n| number_of_nat_rules=0%;70;100;0;100 number_of_si_rules=0.01%;70;100;0;100 number_of_gi_protected_hosts=0%;70;100;0;100 number_of_prepared_hosts=1.75%;70;100;0;100 number_of_firewall_rules=0.08%;70;100;0;100 
number_of_dhcp_ip_pools=0%;70;100;0;100 number_of_edge_nodes=3.12%;70;100;0;100 number_of_active_directory_domains=0%;70;100;0;100 number_of_vcenter_clusters=3.12%;70;100;0;100 number_of_prefix_list=4%;70;100;0;100 number_of_logical_switches=0.12%;70;100;0;100 number_of_logical_ports=0.58%;70;100;0;100 number_of_active_directory_groups=0%;70;100;0;100 number_of_dfw_rules=0.07%;70;100;0;100 number_of_gi_protected_vms=0%;70;100;0;100 number_of_dfw_sections=0.23%;70;100;0;100 number_of_groups_based_on_ip_sets=0.37%;70;100;0;100 number_of_edge_clusters=1.87%;70;100;0;100 number_of_tier1_with_nat_rule=0%;70;100;0;100 number_of_firewall_sections=0.29%;70;100;0;100 number_of_si_sections=0.2%;70;100;0;100 number_of_nsgroup=0.37%;70;100;0;100 number_of_tier1_routers=0.1%;70;100;0;100 number_of_ipsets=0.37%;70;100;0;100 number_of_si_service_chains=0%;70;100;0;100 number_of_si_service_paths=0%;70;100;0;100 number_of_tier0_routers=1.25%;70;100;0;100 number_of_dhcp_servers=0%;70;100;0;100')

@mock.patch('builtins.print')
@mock.patch('requests.request')
def test_capacity_usage_exclude(self, mock_req, mock_print):

with open('testdata/fixtures/capacity-usage.json') as f:
testdata = json.load(f)

m = mock.MagicMock()
m.status_code = 200
m.json.return_value = testdata
mock_req.return_value = m

c = Client('api', 'username', 'password', logger=None, verify=True, max_age=5)

actual = c.get_capacity_usage(".*").print_and_return()
expected = 0

0 comments on commit fae7be6

Please sign in to comment.