Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to exclude alarms and usage elements #13

Merged
merged 1 commit into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Please prefer installation via system packages like `python3-requests`.

Alternatively you can install with pip:

pip3 install requests
pip3 install -r requirements.txt

Make sure to adjust the shebang to match your environment; one of the following should work.

Expand All @@ -43,12 +43,23 @@ optional arguments:
--password PASSWORD, -p PASSWORD
Password for Basic Auth
--mode MODE, -m MODE Check mode
--exclude [EXCLUDE ...]
Exclude alarms or usage from the check results. Can be used multiple times and supports regular expressions.
--max-age MAX_AGE, -M MAX_AGE
Max age in minutes for capacity usage updates. Defaults to 5
--version, -V Print version
--insecure Do not verify TLS certificate. Be careful with this option, please
```

The `--exclude` parameter is matched against alarms and capacity-usage entries. Each pattern is a regular expression that is searched for anywhere in the following string representation (the fields are separated by single spaces):

* alarms: `severity` `node_display_name` `feature_display_name` `event_type_display_name`
* capacity-usage: `severity` `display_name`

## Examples

Mode: cluster-status

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode cluster-status
[OK] control_cluster_status=STABLE - mgmt_cluster_status=STABLE - control_cluster_status=STABLE - nodes_online=3
Expand All @@ -66,14 +77,25 @@ $ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password
| nodes_online=3;;;0
```

Mode: alarms

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode alarms
[WARNING] 1 alarms - 1 medium

[MEDIUM] (2021-04-26 17:25:18) (node1) Intelligence Health/Storage Latency High - Intelligence node storage latency is high.
| alarms=1;;;0 alarms.medium=1;;;0
```

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode alarms --exclude "MEDIUM"
# Excluded alarms will still be counted, but are not factored into the exit code
[OK] 1 alarms
| alarms=1;;;0
```

Mode: capacity-usage

```
$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode capacity-usage
[OK] 28 info - no usages - last update: 2021-04-29 19:06:12
Expand Down
76 changes: 63 additions & 13 deletions check_vmware_nsxt.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import logging
import datetime
import ssl
import re
from urllib.parse import urljoin
import urllib3
import requests
Expand Down Expand Up @@ -127,26 +128,26 @@ def request(self, url, method='GET'):
except Exception as json_exc:
raise CriticalException('Could not decode API JSON: ' + str(json_exc)) # pylint: disable=raise-missing-from

def get_cluster_status(self):
def get_cluster_status(self, excludes=None):
"""
GET and build ClusterStatus
"""
return ClusterStatus(self.request('cluster/status'))
return ClusterStatus(self.request('cluster/status'), excludes)

def get_alarms(self):
def get_alarms(self, excludes=None):
"""
GET and build Alarms
"""
status = "OPEN"
# status = "RESOLVED" # for testing
result = self.request('alarms?page_size=100&status=%s&sort_ascending=false' % status)
return Alarms(result['results'])
return Alarms(data=result['results'], excludes=excludes)

def get_capacity_usage(self):
def get_capacity_usage(self, excludes=None):
"""
GET and build CapacityUsage
"""
return CapacityUsage(self.request('capacity/usage'), self.max_age)
return CapacityUsage(self.request('capacity/usage'), self.max_age, excludes)


class CheckResult:
Expand Down Expand Up @@ -203,9 +204,12 @@ class ClusterStatus(CheckResult):
https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_ReadClusterStatus.html
"""

def __init__(self, data):
def __init__(self, data, excludes):
super().__init__()
self.data = data
self.excludes = excludes
if excludes is None:
self.excludes = []

def build_output(self):
for area in ['control_cluster_status', 'mgmt_cluster_status', 'control_cluster_status']:
Expand Down Expand Up @@ -234,14 +238,33 @@ class Alarms(CheckResult):
https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_GetAlarms.html
"""

def __init__(self, data):
def __init__(self, data, excludes):
super().__init__()
self.data = data
self.excludes = excludes
if excludes is None:
self.excludes = []

def _is_excluded(self, alarm):
    """
    Tell whether an alarm is matched by one of the --exclude patterns

    Every entry of self.excludes is used as a regular expression and
    searched (re.search, so a hit anywhere counts) in the string
    "<severity> <node_display_name> <feature_display_name> <event_type_display_name>".

    alarm: dict describing one alarm; the four keys named above are
    read directly, so a missing key raises KeyError.

    Returns True as soon as one pattern matches, False otherwise.
    Raises re.error when a pattern is not a valid regular expression.
    """
    patterns = self.excludes
    if isinstance(patterns, str):
        # A bare string is a single pattern. Iterating over it would test
        # every character as its own regular expression.
        patterns = [patterns]

    identifier = "%s %s %s %s" % (
        alarm['severity'],
        alarm['node_display_name'],
        alarm['feature_display_name'],
        alarm['event_type_display_name'])

    # any() stops at the first matching pattern
    return any(re.search(pattern, identifier) for pattern in patterns)

def build_output(self):
states = {}

for alarm in self.data:
if self._is_excluded(alarm):
continue

severity = alarm['severity']
if severity in states:
states[severity] += 1
Expand Down Expand Up @@ -270,7 +293,11 @@ def build_status(self):
states = []

for alarm in self.data:
state = WARNING if alarm['severity'] in ['MEDIUM', 'LOW'] else CRITICAL # CRITICAL, HIGH
if self._is_excluded(alarm):
continue

# HIGH == CRITICAL
state = WARNING if alarm['severity'] in ['MEDIUM', 'LOW'] else CRITICAL
states.append(state)

if len(states) > 0:
Expand All @@ -285,15 +312,33 @@ class CapacityUsage(CheckResult):
https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_GetProtonCapacityUsage.html
"""

def __init__(self, data, max_age):
def __init__(self, data, max_age, excludes):
super().__init__()
self.data = data
self.max_age = max_age
self.excludes = excludes
if excludes is None:
self.excludes = []

def _is_excluded(self, usage):
    """
    Tell whether a capacity usage entry is matched by one of the --exclude patterns

    Every entry of self.excludes is used as a regular expression and
    searched (re.search, so a hit anywhere counts) in the string
    "<severity> <display_name>".

    usage: dict describing one capacity usage entry; both keys named
    above are read directly, so a missing key raises KeyError.

    Returns True as soon as one pattern matches, False otherwise.
    Raises re.error when a pattern is not a valid regular expression.
    """
    patterns = self.excludes
    if isinstance(patterns, str):
        # A bare string is a single pattern. Iterating over it would test
        # every character as its own regular expression.
        patterns = [patterns]

    identifier = "%s %s" % (
        usage['severity'],
        usage['display_name'])

    # any() stops at the first matching pattern
    return any(re.search(pattern, identifier) for pattern in patterns)

def build_output(self):
states = {}

for usage in self.data['capacity_usage']:
if self._is_excluded(usage):
continue

severity = usage['severity'] # INFO, WARNING, CRITICAL, ERROR

if severity in states:
Expand Down Expand Up @@ -341,6 +386,9 @@ def build_status(self):
self.summary.append("last update older than %s minutes" % (self.max_age))

for usage in self.data['capacity_usage']:
if self._is_excluded(usage):
continue

severity = usage['severity'] # INFO, WARNING, CRITICAL, ERROR

if severity == "INFO":
Expand Down Expand Up @@ -398,6 +446,8 @@ def commandline(args):
help='Password for Basic Auth', required=True)
parser.add_argument('--mode', '-m', choices=['cluster-status', 'alarms', 'capacity-usage'],
help='Check mode to exectue. Hint: alarms will only include open alarms.', required=True)
parser.add_argument('--exclude', nargs='*', action='extend', type=str,
help="Exclude alarms or usage from the check results. Can be used multiple times and supports regular expressions.")
parser.add_argument('--max-age', '-M', type=int,
help='Max age in minutes for capacity usage updates. Defaults to 5', default=5, required=False)
parser.add_argument('--insecure',
Expand All @@ -421,11 +471,11 @@ def main(args):
client = Client(args.api, args.username, args.password, verify=(not args.insecure), max_age=args.max_age)

if args.mode == 'cluster-status':
return client.get_cluster_status().print_and_return()
return client.get_cluster_status(args.exclude).print_and_return()
if args.mode == 'alarms':
return client.get_alarms().print_and_return()
return client.get_alarms(args.exclude).print_and_return()
if args.mode == 'capacity-usage':
return client.get_capacity_usage().print_and_return()
return client.get_capacity_usage(args.exclude).print_and_return()

print("[UNKNOWN] unknown mode %s" % args.mode)
return UNKNOWN
Expand Down
37 changes: 37 additions & 0 deletions test_check_vmware_nsxt.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,26 @@ def test_alarms_ok(self, mock_req, mock_print):
self.assertEqual(actual, expected)
mock_print.assert_called_with('[WARNING] 1 alarms - 1 medium\n\n[MEDIUM] (2021-04-26 15:25:18) (node1) Intelligence Health/Storage Latency High - Intelligence node storage latency is high.\n| alarms=1;;;0 alarms.medium=1;;;0')

@mock.patch('builtins.print')
@mock.patch('requests.request')
def test_alarms_exclude(self, mock_req, mock_print):
    """
    Alarms mode with an exclude pattern: expects exit code 0 and an OK summary
    """
    with open('testdata/fixtures/alarms.json') as fixture:
        payload = json.load(fixture)

    # Stand-in for a successful HTTP response that carries the fixture
    response = mock.MagicMock()
    response.status_code = 200
    response.json.return_value = payload
    mock_req.return_value = response

    client = Client('api', 'username', 'password', logger=None, verify=True, max_age=5)
    exit_code = client.get_alarms(excludes=["M[A-Z]+M"]).print_and_return()

    self.assertEqual(exit_code, 0)
    mock_print.assert_called_with('[OK] 1 alarms\n| alarms=1;;;0')

@mock.patch('builtins.print')
@mock.patch('requests.request')
def test_capacity_usage_ok(self, mock_req, mock_print):
Expand All @@ -172,3 +192,20 @@ def test_capacity_usage_ok(self, mock_req, mock_print):

self.assertEqual(actual, expected)
mock_print.assert_called_with('[WARNING] 28 info - last update: 2021-04-30 09:17:40 - last update older than 5 minutes\n\n[OK] [INFO] System-wide NAT rules: 0 of 25000 (0%)\n[OK] [INFO] Network Introspection Rules: 1 of 10000 (0.01%)\n[OK] [INFO] System-wide Endpoint Protection Enabled Hosts: 0 of 256 (0%)\n[OK] [INFO] Hypervisor Hosts: 18 of 1024 (1.75%)\n[OK] [INFO] System-wide Firewall Rules: 81 of 100000 (0.08%)\n[OK] [INFO] System-wide DHCP Pools: 0 of 10000 (0%)\n[OK] [INFO] System-wide Edge Nodes: 10 of 320 (3.12%)\n[OK] [INFO] Active Directory Domains (Identity Firewall): 0 of 4 (0%)\n[OK] [INFO] vSphere Clusters Prepared for NSX: 4 of 128 (3.12%)\n[OK] [INFO] Prefix-lists: 20 of 500 (4%)\n[OK] [INFO] Logical Switches: 12 of 10000 (0.12%)\n[OK] [INFO] System-wide Logical Switch Ports: 145 of 25000 (0.58%)\n[OK] [INFO] Active Directory Groups (Identity Firewall): 0 of 100000 (0%)\n[OK] [INFO] Distributed Firewall Rules: 75 of 100000 (0.07%)\n[OK] [INFO] System-wide Endpoint Protection Enabled Virtual Machines: 0 of 7500 (0%)\n[OK] [INFO] Distributed Firewall Sections: 23 of 10000 (0.23%)\n[OK] [INFO] Groups Based on IP Sets: 37 of 10000 (0.37%)\n[OK] [INFO] Edge Clusters: 3 of 160 (1.87%)\n[OK] [INFO] Tier-1 Logical Routers with NAT Enabled: 0 of 4000 (0%)\n[OK] [INFO] System-wide Firewall Sections: 29 of 10000 (0.29%)\n[OK] [INFO] Network Introspection Sections: 1 of 500 (0.2%)\n[OK] [INFO] Groups: 74 of 20000 (0.37%)\n[OK] [INFO] Tier-1 Logical Routers: 4 of 4000 (0.1%)\n[OK] [INFO] IP Sets: 37 of 10000 (0.37%)\n[OK] [INFO] Network Introspection Service Chains: 0 of 24 (0%)\n[OK] [INFO] Network Introspection Service Paths: 0 of 4000 (0%)\n[OK] [INFO] Tier-0 Logical Routers: 2 of 160 (1.25%)\n[OK] [INFO] DHCP Server Instances: 0 of 10000 (0%)\n| number_of_nat_rules=0%;70;100;0;100 number_of_si_rules=0.01%;70;100;0;100 number_of_gi_protected_hosts=0%;70;100;0;100 number_of_prepared_hosts=1.75%;70;100;0;100 number_of_firewall_rules=0.08%;70;100;0;100 
number_of_dhcp_ip_pools=0%;70;100;0;100 number_of_edge_nodes=3.12%;70;100;0;100 number_of_active_directory_domains=0%;70;100;0;100 number_of_vcenter_clusters=3.12%;70;100;0;100 number_of_prefix_list=4%;70;100;0;100 number_of_logical_switches=0.12%;70;100;0;100 number_of_logical_ports=0.58%;70;100;0;100 number_of_active_directory_groups=0%;70;100;0;100 number_of_dfw_rules=0.07%;70;100;0;100 number_of_gi_protected_vms=0%;70;100;0;100 number_of_dfw_sections=0.23%;70;100;0;100 number_of_groups_based_on_ip_sets=0.37%;70;100;0;100 number_of_edge_clusters=1.87%;70;100;0;100 number_of_tier1_with_nat_rule=0%;70;100;0;100 number_of_firewall_sections=0.29%;70;100;0;100 number_of_si_sections=0.2%;70;100;0;100 number_of_nsgroup=0.37%;70;100;0;100 number_of_tier1_routers=0.1%;70;100;0;100 number_of_ipsets=0.37%;70;100;0;100 number_of_si_service_chains=0%;70;100;0;100 number_of_si_service_paths=0%;70;100;0;100 number_of_tier0_routers=1.25%;70;100;0;100 number_of_dhcp_servers=0%;70;100;0;100')

@mock.patch('builtins.print')
@mock.patch('requests.request')
def test_capacity_usage_exclude(self, mock_req, mock_print):

with open('testdata/fixtures/capacity-usage.json') as f:
testdata = json.load(f)

m = mock.MagicMock()
m.status_code = 200
m.json.return_value = testdata
mock_req.return_value = m

c = Client('api', 'username', 'password', logger=None, verify=True, max_age=5)

actual = c.get_capacity_usage(".*").print_and_return()
expected = 0