Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: honza <honza@inuits.eu>
- Loading branch information
honza
committed
May 14, 2017
1 parent
f34170a
commit 21e2b67
Showing
3 changed files
with
279 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,272 @@ | ||
#!/bin/bash | ||
|
||
## | ||
## check_graphite_metric | ||
## | ||
## by kali (kali@kalimotxo.net) created for Sociomantic GmbH, 2014 | ||
## | ||
## | ||
## Checks for values provided by graphite interface. | ||
## Accepts low and high ranges for warning and critical status. | ||
## | ||
## Depends on curl to retrieve the values from graphite interface. | ||
## | ||
## Specify the metric name with -m parameter, can use wildcards, but only ONE target per call. | ||
## Retrieves current values collection with a "from" graphite parameter specifying time in the past, | ||
## which can be specified in parameters in the check. Useful for metrics with high intervals (e.g. 1 h) | ||
## Ignores values received as "None" which means graphite has no points collected for the specified metric/timeframe | ||
## | ||
## Returns UNKNOWN status if can't retreive values from graphite. | ||
## | ||
|
||
# Declare some defaults | ||
EXIT_NAGIOS_STATUS=-1 | ||
CHECK_OUTPUT="" | ||
DEFAULT_NULL_STATUS=3 | ||
# search string in metric to replace with the HOST parameter | ||
METRIC_HOST_MATCHSTRING="%HOST%" | ||
# default timeframe to check graphite for values | ||
GRAPHITE_FROM_VALUE="8minutes" | ||
# default needed curl options - allow special chars + silent + timeout 30 seconds | ||
CURL_OPTIONS="-g -s --max-time 30" | ||
# array to store metric names returned from graphite. used to keep graphite's response order | ||
declare -a RES_METRICS | ||
# assoc. array to store values for each metric. declared -A to iterate results and keep only the last not null value | ||
declare -A RES_VALUES | ||
# counter of passed threshold parameters | ||
HAS_LO_PARAMS=0 | ||
HAS_HI_PARAMS=0 | ||
|
||
# define exit functions for nagios | ||
exit_ok() { | ||
echo "OK - ${CHECK_OUTPUT}" | ||
exit 0 | ||
} | ||
exit_war() { | ||
echo "WARNING - ${CHECK_OUTPUT}" | ||
exit 1 | ||
} | ||
exit_cri() { | ||
echo "CRITICAL - ${CHECK_OUTPUT}" | ||
exit 2 | ||
} | ||
exit_unk() { | ||
echo "UNKNOWN - ${CHECK_OUTPUT}" | ||
exit 3 | ||
} | ||
exit_bad() { | ||
echo "OUT OF RANGE - ${CHECK_OUTPUT}" | ||
exit 255 | ||
} | ||
# May this also be the inline documentation :) | ||
show_usage() { | ||
cat << EOF | ||
Usage: $0 -w <warn_lvl_high> -c <crit_lvl_high> [-H host] -m <graphite_metric_substring> | ||
-g|--graphite_url [url] : Url (base) where graphite lives | ||
-G|--graphite_auth [u:p] : User / password to authenticate into graphite (http auth) | ||
-W|--warn_high [value] : Value threshold to alert as warning when graphite goes above it (hi level) | ||
-C|--crit_high [value] : Value threshold to alert as critical when graphite goes above it (hi/hi level) | ||
-w|--warn_low [value] : (Optional) Lower warning value threshold (lo level) Ignored if absent. | ||
-c|--crit_low [value] : (Optional) Lower critical value threshold (lo/lo level) Ignored if absent. | ||
-H|--host [host] : (Optional) Host string in the format dc-NNN. If present, it will be replaced in the graphite metric. | ||
-m|--metric [metric] : Graphite metric to be checked for values. If -H is present it will replace any occurence of the string "${METRIC_HOST_MATCHSTRING}" here. | ||
-n|--null_returns [code] : (Optional) Return specified status [OK|WARN|CRIT|UNK|0|1|2|3] when graphite gives NULL values only. Defaults to "UNK|3" | ||
-y|--hyst_warn [value] : (Optional) Hysteresis for the warning level - how much value should change to turn off Warning | ||
-Y|--hyst_crit [value] : (Optional) Hysteresis for the critical level - how much value should change to turn off Error | ||
-t|--timeframe [minutes] : (Optional) Set MINUTES threshold for metric inspection (e.g.: -t 5 --> checks metric values for 5 minutes in the past). Defaults to "8" | ||
-q|--quiet : Quiet output. Returns only the status, and values only if the status is critical or warning. | ||
-h|--help : Show usage. | ||
-d|--debug : enable debug. | ||
Example: | ||
$0 -W 20 -C 40 -m "eu-002.cacti_propulsor.cacti_propulsor_cpu-0*.cpu_usage" | ||
WARNING - eu-002.cacti_propulsor.cacti_propulsor_cpu-02.cpu_usage=30.3966666667; eu-002.cacti_propulsor.cacti_propulsor_cpu-01.cpu_usage=22.8726666667; eu-002.cacti_propulsor.cacti_propulsor_cpu-03.cpu_usage=29.4873333333; eu-002.cacti_propulsor.cacti_propulsor_cpu-04.cpu_usage=27.258; | ||
EOF | ||
} | ||
|
||
check_status() { | ||
local METRIC="${1}" | ||
local ADDITION_CRIT="${2:-0}" | ||
local ADDITION_WARN="${3:-0}" | ||
local STATUS="0" | ||
if [ $HAS_LO_PARAMS -eq 2 ] ; then | ||
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} < ($WARN_LVL_LOW + $ADDITION_WARN))}" && STATUS=1 | ||
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} < ($CRIT_LVL_LOW + $ADDITION_CRIT))}" && STATUS=2 | ||
fi | ||
if [ $HAS_HI_PARAMS -eq 2 ] ; then | ||
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} > ($WARN_LVL_HIGH - ${ADDITION_WARN}))}" && STATUS=1 | ||
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} > ($CRIT_LVL_HIGH - ${ADDITION_CRIT}))}" && STATUS=2 | ||
fi | ||
echo ${STATUS} | ||
} | ||
|
||
while (( "$#" )) ; do | ||
case "${1}" in | ||
-g|--graphite_url) | ||
GRAPHITE_RENDER_URL="${2}/render/" | ||
shift | ||
;; | ||
-G|--graphite_auth) | ||
CURL_OPTIONS="${CURL_OPTIONS} -k -u ${2}" | ||
shift | ||
;; | ||
-W|--warn_high) | ||
WARN_LVL_HIGH=${2} | ||
(( ++HAS_HI_PARAMS )) | ||
shift | ||
;; | ||
-C|--crit_high) | ||
CRIT_LVL_HIGH=${2} | ||
(( ++HAS_HI_PARAMS )) | ||
shift | ||
;; | ||
-w|--warn_low) | ||
WARN_LVL_LOW=${2} | ||
(( ++HAS_LO_PARAMS )) | ||
shift | ||
;; | ||
-c|--crit_low) | ||
CRIT_LVL_LOW=${2} | ||
(( ++HAS_LO_PARAMS )) | ||
shift | ||
;; | ||
-m|--host) | ||
METRIC_STRING=${2} | ||
shift | ||
;; | ||
-H|--metric) | ||
HOST=${2} | ||
shift | ||
;; | ||
-n|--null_returns) | ||
case ${2^^} in | ||
OK|0) | ||
DEFAULT_NULL_STATUS=0 | ||
;; | ||
WARN|WARNING|1) | ||
DEFAULT_NULL_STATUS=1 | ||
;; | ||
CRIT|CRITICAL|2) | ||
DEFAULT_NULL_STATUS=2 | ||
;; | ||
*) | ||
DEFAULT_NULL_STATUS=3 | ||
;; | ||
esac | ||
shift | ||
;; | ||
-y|--hyst_warn) | ||
HYSTERESIS_WARN=${2} | ||
shift | ||
;; | ||
-Y|--hyst_crit) | ||
HYSTERESIS_CRIT=${2} | ||
shift | ||
;; | ||
-t|--timeframe) | ||
GRAPHITE_FROM_VALUE="${2}minutes" | ||
shift | ||
;; | ||
-q|--quiet) | ||
QUIET_OUTPUT=1 | ||
;; | ||
-d|--debug) | ||
set -x | ||
;; | ||
-h|--help) | ||
show_usage | ||
exit 0 | ||
;; | ||
*) | ||
# silently ignore unknown parameters rather than rant about it and exit | ||
;; | ||
esac | ||
shift | ||
done | ||
|
||
# build base curl options for querying graphite. build the target metric passed as parameter over it | ||
GRAPHITE_QUERY_BASE="${GRAPHITE_RENDER_URL}?format=raw&from=-${GRAPHITE_FROM_VALUE}&target=" | ||
|
||
# make sure we have all the necessary parameters | ||
case $HAS_LO_PARAMS in | ||
1) | ||
if [ -z ${WARN_LVL_LOW} ] ; then | ||
# Default warning level to critical if not given | ||
WARN_LVL_LOW=${CRIT_LVL_LOW} | ||
(( ++HAS_LO_PARAMS )) | ||
else | ||
CHECK_OUTPUT="Missing CRITICAL (low) level parameter. Passed -w=${WARN_LVL_LOW} -c=${CRIT_LVL_LOW}" | ||
exit_bad | ||
fi | ||
;; | ||
2) | ||
if `awk "BEGIN {exit !(${WARN_LVL_LOW} < ${CRIT_LVL_LOW})}"` ; then | ||
CHECK_OUTPUT="Low Warning threshold cannot be below critical threshold. Passed -w=${WARN_LVL_LOW} -c=${CRIT_LVL_LOW}" | ||
exit_bad | ||
fi | ||
;; | ||
esac | ||
case $HAS_HI_PARAMS in | ||
1) | ||
if [ -z ${WARN_LVL_HIGH} ] ; then | ||
WARN_LVL_HIGH=${CRIT_LVL_HIGH} | ||
(( ++HAS_HI_PARAMS )) | ||
else | ||
CHECK_OUTPUT="Missing CRITICAL (high) level parameter. Passed -W=${WARN_LVL_HIGH} -C=${CRIT_LVL_HIGH}" | ||
exit_bad | ||
fi | ||
;; | ||
2) | ||
if `awk "BEGIN {exit !(${WARN_LVL_HIGH} > ${CRIT_LVL_HIGH})}"` ; then | ||
CHECK_OUTPUT="High Warning threshold cannot be over critical threshold. Passed -W=${WARN_LVL_HIGH} -C=${CRIT_LVL_HIGH}" | ||
exit_bad | ||
fi | ||
;; | ||
esac | ||
|
||
## If HOST parameter is present, replace it in the metric variable | ||
[ "$HOST" != "" ] && METRIC_STRING=$(sed "s/$METRIC_HOST_MATCHSTRING/$HOST/g" <<< $METRIC_STRING) | ||
|
||
# Run query against graphite and store last non-none values on the array | ||
CURL_URI="${GRAPHITE_QUERY_BASE}${METRIC_STRING}" | ||
for GRAPHITE_OUT in $(curl ${CURL_OPTIONS} ${CURL_URI} | sed 's/ //g') ; do | ||
if [[ "${GRAPHITE_OUT:0:1}" =~ ^\<.*|^\{.* ]] ; then | ||
GLOBAL_STATE=3 | ||
CHECK_OUTPUT="Graphite returned an error, check your metric: ${METRIC_STRING}" | ||
exit_unk | ||
fi | ||
GRAPHITE_METRIC=$(cut -f1 -d\| <<< $GRAPHITE_OUT | sed 's/,[0-9]*,[0-9]*,[0-9]*$//') | ||
RES_METRICS+=("$GRAPHITE_METRIC") | ||
for GRAPHITE_VALUE in $(cut -f2 -d\| <<< $GRAPHITE_OUT | sed 's/,/ /g') ; do | ||
[ "${GRAPHITE_VALUE,,}" != "none" ] && RES_VALUES[${GRAPHITE_METRIC}]=${GRAPHITE_VALUE} | ||
done | ||
done | ||
|
||
PREV_STATUS=${NAGIOS_LASTSERVICESTATEID:-0} | ||
if [ ${#RES_VALUES[@]} -eq 0 ] ; then | ||
EXIT_NAGIOS_STATUS=$DEFAULT_NULL_STATUS | ||
if [ ${DEFAULT_NULL_STATUS} -eq 3 ] ; then CHECK_OUTPUT="Graphite query ${METRIC_STRING} returned nothing." ; fi | ||
else | ||
for METRIC in ${RES_METRICS[*]} ; do | ||
METRIC_STATUS=0 | ||
if [[ "" == "${RES_VALUES["$METRIC"]}" ]] ; then | ||
METRIC_STATUS=$DEFAULT_NULL_STATUS | ||
else | ||
if [[ ! -z ${HYSTERESIS_CRIT} && ${PREV_STATUS} == 2 ]] || [[ ! -z ${HYSTERESIS_WARN} && ${PREV_STATUS} == 1 ]]; then | ||
METRIC_STATUS=$(check_status "${METRIC}" "${HYSTERESIS_CRIT}" "${HYSTERESIS_WARN}") | ||
else | ||
METRIC_STATUS=$(check_status "${METRIC}" 0) | ||
fi | ||
fi | ||
[ $METRIC_STATUS -gt $EXIT_NAGIOS_STATUS ] && EXIT_NAGIOS_STATUS=$METRIC_STATUS | ||
[ "${QUIET_OUTPUT}" != 1 ] || [ $METRIC_STATUS -gt 0 ] && CHECK_OUTPUT="${CHECK_OUTPUT}${METRIC}=${RES_VALUES[$METRIC]}; " | ||
done | ||
fi | ||
|
||
case $EXIT_NAGIOS_STATUS in | ||
0) exit_ok ;; | ||
1) exit_war ;; | ||
2) exit_cri ;; | ||
3) exit_unk ;; | ||
*) exit $EXIT_NAGIOS_STATUS ;; | ||
esac |