Skip to content

Commit

Permalink
add check_graphite-metric
Browse files Browse the repository at this point in the history
Signed-off-by: honza <honza@inuits.eu>
  • Loading branch information
honza committed May 14, 2017
1 parent f34170a commit 21e2b67
Show file tree
Hide file tree
Showing 3 changed files with 279 additions and 0 deletions.
6 changes: 6 additions & 0 deletions README.md
Expand Up @@ -304,6 +304,12 @@ An example Vagrant project has been included to get you started right away.
<td><a href="https://github.com/honzatlusty">Jan Tlusty</a></td>
<td><a href="https://github.com/honzatlusty/nagios-storm-topology-latency">upstream</a></td>
</tr>
<tr>
<td>check_graphite-metric</td>
<td><a href="https://github.com/kali-hernandez">kali-hernandez</a></td>
<td><a href="https://github.com/kali-hernandez/nagios_plugins/blob/master/check_graphite_metric">upstream</a></td>
</tr>




Expand Down
1 change: 1 addition & 0 deletions build.txt
Expand Up @@ -65,5 +65,6 @@ check_rabbitmq-sync.rb 1.0
check_zmstatus.pl 1.0
check_ssl-cert 1.0
check_topology-latency.rb 1.0
check_graphite-metric 1.0

# vim: set ts=2 sw=2 et : #
272 changes: 272 additions & 0 deletions check_graphite-metric
@@ -0,0 +1,272 @@
#!/bin/bash

##
## check_graphite_metric
##
## by kali (kali@kalimotxo.net) created for Sociomantic GmbH, 2014
##
##
## Checks for values provided by graphite interface.
## Accepts low and high ranges for warning and critical status.
##
## Depends on curl to retrieve the values from graphite interface.
##
## Specify the metric name with -m parameter, can use wildcards, but only ONE target per call.
## Retrieves current values collection with a "from" graphite parameter specifying time in the past,
## which can be specified in parameters in the check. Useful for metrics with high intervals (e.g. 1 h)
## Ignores values received as "None" which means graphite has no points collected for the specified metric/timeframe
##
## Returns UNKNOWN status if can't retreive values from graphite.
##

# Declare some defaults
EXIT_NAGIOS_STATUS=-1
CHECK_OUTPUT=""
DEFAULT_NULL_STATUS=3
# search string in metric to replace with the HOST parameter
METRIC_HOST_MATCHSTRING="%HOST%"
# default timeframe to check graphite for values
GRAPHITE_FROM_VALUE="8minutes"
# default needed curl options - allow special chars + silent + timeout 30 seconds
CURL_OPTIONS="-g -s --max-time 30"
# array to store metric names returned from graphite. used to keep graphite's response order
declare -a RES_METRICS
# assoc. array to store values for each metric. declared -A to iterate results and keep only the last not null value
declare -A RES_VALUES
# counter of passed threshold parameters
HAS_LO_PARAMS=0
HAS_HI_PARAMS=0

# define exit functions for nagios
exit_ok() {
echo "OK - ${CHECK_OUTPUT}"
exit 0
}
exit_war() {
echo "WARNING - ${CHECK_OUTPUT}"
exit 1
}
exit_cri() {
echo "CRITICAL - ${CHECK_OUTPUT}"
exit 2
}
exit_unk() {
echo "UNKNOWN - ${CHECK_OUTPUT}"
exit 3
}
exit_bad() {
echo "OUT OF RANGE - ${CHECK_OUTPUT}"
exit 255
}
# May this also be the inline documentation :)
show_usage() {
cat << EOF
Usage: $0 -w <warn_lvl_high> -c <crit_lvl_high> [-H host] -m <graphite_metric_substring>
-g|--graphite_url [url] : Url (base) where graphite lives
-G|--graphite_auth [u:p] : User / password to authenticate into graphite (http auth)
-W|--warn_high [value] : Value threshold to alert as warning when graphite goes above it (hi level)
-C|--crit_high [value] : Value threshold to alert as critical when graphite goes above it (hi/hi level)
-w|--warn_low [value] : (Optional) Lower warning value threshold (lo level) Ignored if absent.
-c|--crit_low [value] : (Optional) Lower critical value threshold (lo/lo level) Ignored if absent.
-H|--host [host] : (Optional) Host string in the format dc-NNN. If present, it will be replaced in the graphite metric.
-m|--metric [metric] : Graphite metric to be checked for values. If -H is present it will replace any occurence of the string "${METRIC_HOST_MATCHSTRING}" here.
-n|--null_returns [code] : (Optional) Return specified status [OK|WARN|CRIT|UNK|0|1|2|3] when graphite gives NULL values only. Defaults to "UNK|3"
-y|--hyst_warn [value] : (Optional) Hysteresis for the warning level - how much value should change to turn off Warning
-Y|--hyst_crit [value] : (Optional) Hysteresis for the critical level - how much value should change to turn off Error
-t|--timeframe [minutes] : (Optional) Set MINUTES threshold for metric inspection (e.g.: -t 5 --> checks metric values for 5 minutes in the past). Defaults to "8"
-q|--quiet : Quiet output. Returns only the status, and values only if the status is critical or warning.
-h|--help : Show usage.
-d|--debug : enable debug.
Example:
$0 -W 20 -C 40 -m "eu-002.cacti_propulsor.cacti_propulsor_cpu-0*.cpu_usage"
WARNING - eu-002.cacti_propulsor.cacti_propulsor_cpu-02.cpu_usage=30.3966666667; eu-002.cacti_propulsor.cacti_propulsor_cpu-01.cpu_usage=22.8726666667; eu-002.cacti_propulsor.cacti_propulsor_cpu-03.cpu_usage=29.4873333333; eu-002.cacti_propulsor.cacti_propulsor_cpu-04.cpu_usage=27.258;
EOF
}

check_status() {
local METRIC="${1}"
local ADDITION_CRIT="${2:-0}"
local ADDITION_WARN="${3:-0}"
local STATUS="0"
if [ $HAS_LO_PARAMS -eq 2 ] ; then
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} < ($WARN_LVL_LOW + $ADDITION_WARN))}" && STATUS=1
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} < ($CRIT_LVL_LOW + $ADDITION_CRIT))}" && STATUS=2
fi
if [ $HAS_HI_PARAMS -eq 2 ] ; then
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} > ($WARN_LVL_HIGH - ${ADDITION_WARN}))}" && STATUS=1
awk "BEGIN {exit !(${RES_VALUES["$METRIC"]} > ($CRIT_LVL_HIGH - ${ADDITION_CRIT}))}" && STATUS=2
fi
echo ${STATUS}
}

while (( "$#" )) ; do
case "${1}" in
-g|--graphite_url)
GRAPHITE_RENDER_URL="${2}/render/"
shift
;;
-G|--graphite_auth)
CURL_OPTIONS="${CURL_OPTIONS} -k -u ${2}"
shift
;;
-W|--warn_high)
WARN_LVL_HIGH=${2}
(( ++HAS_HI_PARAMS ))
shift
;;
-C|--crit_high)
CRIT_LVL_HIGH=${2}
(( ++HAS_HI_PARAMS ))
shift
;;
-w|--warn_low)
WARN_LVL_LOW=${2}
(( ++HAS_LO_PARAMS ))
shift
;;
-c|--crit_low)
CRIT_LVL_LOW=${2}
(( ++HAS_LO_PARAMS ))
shift
;;
-m|--host)
METRIC_STRING=${2}
shift
;;
-H|--metric)
HOST=${2}
shift
;;
-n|--null_returns)
case ${2^^} in
OK|0)
DEFAULT_NULL_STATUS=0
;;
WARN|WARNING|1)
DEFAULT_NULL_STATUS=1
;;
CRIT|CRITICAL|2)
DEFAULT_NULL_STATUS=2
;;
*)
DEFAULT_NULL_STATUS=3
;;
esac
shift
;;
-y|--hyst_warn)
HYSTERESIS_WARN=${2}
shift
;;
-Y|--hyst_crit)
HYSTERESIS_CRIT=${2}
shift
;;
-t|--timeframe)
GRAPHITE_FROM_VALUE="${2}minutes"
shift
;;
-q|--quiet)
QUIET_OUTPUT=1
;;
-d|--debug)
set -x
;;
-h|--help)
show_usage
exit 0
;;
*)
# silently ignore unknown parameters rather than rant about it and exit
;;
esac
shift
done

# build base curl options for querying graphite. build the target metric passed as parameter over it
GRAPHITE_QUERY_BASE="${GRAPHITE_RENDER_URL}?format=raw&from=-${GRAPHITE_FROM_VALUE}&target="

# make sure we have all the necessary parameters
case $HAS_LO_PARAMS in
1)
if [ -z ${WARN_LVL_LOW} ] ; then
# Default warning level to critical if not given
WARN_LVL_LOW=${CRIT_LVL_LOW}
(( ++HAS_LO_PARAMS ))
else
CHECK_OUTPUT="Missing CRITICAL (low) level parameter. Passed -w=${WARN_LVL_LOW} -c=${CRIT_LVL_LOW}"
exit_bad
fi
;;
2)
if `awk "BEGIN {exit !(${WARN_LVL_LOW} < ${CRIT_LVL_LOW})}"` ; then
CHECK_OUTPUT="Low Warning threshold cannot be below critical threshold. Passed -w=${WARN_LVL_LOW} -c=${CRIT_LVL_LOW}"
exit_bad
fi
;;
esac
case $HAS_HI_PARAMS in
1)
if [ -z ${WARN_LVL_HIGH} ] ; then
WARN_LVL_HIGH=${CRIT_LVL_HIGH}
(( ++HAS_HI_PARAMS ))
else
CHECK_OUTPUT="Missing CRITICAL (high) level parameter. Passed -W=${WARN_LVL_HIGH} -C=${CRIT_LVL_HIGH}"
exit_bad
fi
;;
2)
if `awk "BEGIN {exit !(${WARN_LVL_HIGH} > ${CRIT_LVL_HIGH})}"` ; then
CHECK_OUTPUT="High Warning threshold cannot be over critical threshold. Passed -W=${WARN_LVL_HIGH} -C=${CRIT_LVL_HIGH}"
exit_bad
fi
;;
esac

## If HOST parameter is present, replace it in the metric variable
[ "$HOST" != "" ] && METRIC_STRING=$(sed "s/$METRIC_HOST_MATCHSTRING/$HOST/g" <<< $METRIC_STRING)

# Run query against graphite and store last non-none values on the array
CURL_URI="${GRAPHITE_QUERY_BASE}${METRIC_STRING}"
for GRAPHITE_OUT in $(curl ${CURL_OPTIONS} ${CURL_URI} | sed 's/ //g') ; do
if [[ "${GRAPHITE_OUT:0:1}" =~ ^\<.*|^\{.* ]] ; then
GLOBAL_STATE=3
CHECK_OUTPUT="Graphite returned an error, check your metric: ${METRIC_STRING}"
exit_unk
fi
GRAPHITE_METRIC=$(cut -f1 -d\| <<< $GRAPHITE_OUT | sed 's/,[0-9]*,[0-9]*,[0-9]*$//')
RES_METRICS+=("$GRAPHITE_METRIC")
for GRAPHITE_VALUE in $(cut -f2 -d\| <<< $GRAPHITE_OUT | sed 's/,/ /g') ; do
[ "${GRAPHITE_VALUE,,}" != "none" ] && RES_VALUES[${GRAPHITE_METRIC}]=${GRAPHITE_VALUE}
done
done

PREV_STATUS=${NAGIOS_LASTSERVICESTATEID:-0}
if [ ${#RES_VALUES[@]} -eq 0 ] ; then
EXIT_NAGIOS_STATUS=$DEFAULT_NULL_STATUS
if [ ${DEFAULT_NULL_STATUS} -eq 3 ] ; then CHECK_OUTPUT="Graphite query ${METRIC_STRING} returned nothing." ; fi
else
for METRIC in ${RES_METRICS[*]} ; do
METRIC_STATUS=0
if [[ "" == "${RES_VALUES["$METRIC"]}" ]] ; then
METRIC_STATUS=$DEFAULT_NULL_STATUS
else
if [[ ! -z ${HYSTERESIS_CRIT} && ${PREV_STATUS} == 2 ]] || [[ ! -z ${HYSTERESIS_WARN} && ${PREV_STATUS} == 1 ]]; then
METRIC_STATUS=$(check_status "${METRIC}" "${HYSTERESIS_CRIT}" "${HYSTERESIS_WARN}")
else
METRIC_STATUS=$(check_status "${METRIC}" 0)
fi
fi
[ $METRIC_STATUS -gt $EXIT_NAGIOS_STATUS ] && EXIT_NAGIOS_STATUS=$METRIC_STATUS
[ "${QUIET_OUTPUT}" != 1 ] || [ $METRIC_STATUS -gt 0 ] && CHECK_OUTPUT="${CHECK_OUTPUT}${METRIC}=${RES_VALUES[$METRIC]}; "
done
fi

case $EXIT_NAGIOS_STATUS in
0) exit_ok ;;
1) exit_war ;;
2) exit_cri ;;
3) exit_unk ;;
*) exit $EXIT_NAGIOS_STATUS ;;
esac

0 comments on commit 21e2b67

Please sign in to comment.