Skip to content

Commit

Permalink
feat: Create resolution metrics for health alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulguptajss committed Jun 10, 2024
1 parent faa7a24 commit 9aae486
Show file tree
Hide file tree
Showing 4 changed files with 226 additions and 102 deletions.
97 changes: 77 additions & 20 deletions cmd/collectors/rest/plugins/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ type Health struct {
client *rest.Client
data map[string]*matrix.Matrix
lastFilterTime int64
previousData map[string]*matrix.Matrix
resolutionData map[string]*matrix.Matrix
}

func New(p *plugin.AbstractPlugin) plugin.Plugin {
Expand All @@ -58,7 +60,7 @@ func (h *Health) Init() error {
return err
}

if err := h.initAllMatrix(); err != nil {
if err := h.InitAllMatrix(); err != nil {
return err
}

Expand All @@ -70,26 +72,30 @@ func (h *Health) Init() error {
return h.client.Init(5)
}

func (h *Health) initAllMatrix() error {
func (h *Health) InitAllMatrix() error {
h.data = make(map[string]*matrix.Matrix)
h.resolutionData = make(map[string]*matrix.Matrix)
mats := []string{diskHealthMatrix, shelfHealthMatrix, supportHealthMatrix, nodeHealthMatrix,
networkEthernetPortHealthMatrix, networkFCPortHealthMatrix, lifHealthMatrix,
volumeRansomwareHealthMatrix, volumeMoveHealthMatrix, licenseHealthMatrix, haHealthMatrix}
for _, m := range mats {
if err := h.initMatrix(m); err != nil {
if err := h.initMatrix(m, "", h.data); err != nil {
return err
}
if err := h.initMatrix(m, "Resolution", h.resolutionData); err != nil {
return err
}
}
return nil
}

func (h *Health) initMatrix(name string) error {
h.data[name] = matrix.New(h.Parent+name, name, name)
func (h *Health) initMatrix(name string, prefix string, inputMat map[string]*matrix.Matrix) error {
inputMat[name] = matrix.New(h.Parent+name+prefix, name, name)
for _, v1 := range h.data {
v1.SetExportOptions(matrix.DefaultExportOptions())
}
for _, k := range metrics {
err := matrix.CreateMetric(k, h.data[name])
err := matrix.CreateMetric(k, inputMat[name])
if err != nil {
h.Logger.Warn().Err(err).Str("key", k).Msg("error while creating metric")
return err
Expand Down Expand Up @@ -124,14 +130,15 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util

// Purge and reset data
// remove all metrics as analytics label may change over time
err = h.initAllMatrix()
err = h.InitAllMatrix()
if err != nil {
h.Logger.Warn().Err(err).Msg("error while init matrix")
return nil, nil, err
}
for k := range h.data {
// Set all global labels if they do not already exist
h.data[k].SetGlobalLabels(data.GetGlobalLabels())
h.resolutionData[k].SetGlobalLabels(data.GetGlobalLabels())
}

h.collectDiskAlerts()
Expand All @@ -146,11 +153,24 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util
h.collectVolumeMoveAlerts()
h.collectLicenseAlerts()

h.generateResolutionMetrics()

result := make([]*matrix.Matrix, 0, len(h.data))

for _, value := range h.data {
result = append(result, value)
}

resolutionInstancesCount := 0
for _, value := range h.resolutionData {
result = append(result, value)
resolutionInstancesCount += len(value.GetInstances())
}

if resolutionInstancesCount > 0 {
h.Logger.Info().Int("instances", resolutionInstancesCount).
Msg("Collected Resolution metrics")
}
return result, h.client.Metadata, nil
}

Expand Down Expand Up @@ -183,7 +203,7 @@ func (h *Health) collectLicenseAlerts() {
instance.SetLabel("state", state)
instance.SetLabel(severityLabel, string(errr))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -217,7 +237,7 @@ func (h *Health) collectVolumeMoveAlerts() {
instance.SetLabel("volume", volume)
instance.SetLabel(severityLabel, string(warning))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -269,7 +289,7 @@ func (h *Health) collectVolumeRansomwareAlerts() {
instance.SetLabel("volume", volume)
instance.SetLabel(severityLabel, string(errr))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -302,7 +322,7 @@ func (h *Health) collectNetworkInterfacesAlerts() {
instance.SetLabel("lif", lif)
instance.SetLabel(severityLabel, string(warning))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -335,7 +355,7 @@ func (h *Health) collectNetworkFCPortAlerts() {
instance.SetLabel("port", port)
instance.SetLabel(severityLabel, string(errr))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -370,7 +390,7 @@ func (h *Health) collectNetworkEthernetPortAlerts() {
instance.SetLabel("type", portType)
instance.SetLabel(severityLabel, string(errr))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -400,7 +420,7 @@ func (h *Health) collectNodeAlerts() {
instance.SetLabel("healthy", "false")
instance.SetLabel(severityLabel, string(errr))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -440,7 +460,7 @@ func (h *Health) collectHAAlerts() {
instance.SetLabel("partner_state", partnerState)
instance.SetLabel(severityLabel, string(errr))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -480,7 +500,7 @@ func (h *Health) collectShelfAlerts() {
instance.SetLabel(severityLabel, string(warning))
}

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}
}
Expand Down Expand Up @@ -528,7 +548,7 @@ func (h *Health) collectSupportAlerts() {
instance.SetLabel("correctiveAction", correctiveAction)
instance.SetLabel(severityLabel, string(warning))

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
// update lastFilterTime to current cluster time
h.lastFilterTime = toTime
Expand Down Expand Up @@ -564,7 +584,7 @@ func (h *Health) collectDiskAlerts() {
instance.SetLabel(severityLabel, string(warning))
}

h.setAlertMetric(mat, instance)
h.setAlertMetric(mat, instance, 1)
}
}

Expand Down Expand Up @@ -711,7 +731,7 @@ func (h *Health) getTimeStampFilter(clusterTime time.Time) string {
return fmt.Sprintf("time=>=%d", fromTime)
}

func (h *Health) setAlertMetric(mat *matrix.Matrix, instance *matrix.Instance) {
func (h *Health) setAlertMetric(mat *matrix.Matrix, instance *matrix.Instance, value float64) {
var err error
m := mat.GetMetric("alerts")
if m == nil {
Expand All @@ -720,7 +740,44 @@ func (h *Health) setAlertMetric(mat *matrix.Matrix, instance *matrix.Instance) {
return
}
}
if err = m.SetValueFloat64(instance, 1); err != nil {
if err = m.SetValueFloat64(instance, value); err != nil {
h.Logger.Error().Err(err).Str("metric", "alerts").Msg("Unable to set value on metric")
}
}

// generateResolutionMetrics fills h.resolutionData with one instance for every
// alert instance that is present in h.previousData but absent from the matrix
// of the same key in h.data. Each such instance receives the labels of the
// previous instance and has its "alerts" metric set to 0 via setAlertMetric.
//
// Matrices that exist only in h.previousData (no matching key in h.data) are
// skipped. After the comparison, h.previousData is replaced by h.data so the
// next call compares against the data seen by this call.
func (h *Health) generateResolutionMetrics() {
	for matKey, prevMat := range h.previousData {
		curMat, ok := h.data[matKey]
		if !ok {
			continue
		}

		// Build a set of the instance keys that are still present.
		curKeys := curMat.GetInstanceKeys()
		active := make(map[string]struct{}, len(curKeys))
		for _, k := range curKeys {
			active[k] = struct{}{}
		}

		// The resolution matrix is the same for every instance of this key,
		// so look it up once.
		rMat := h.resolutionData[matKey]

		for _, instKey := range prevMat.GetInstanceKeys() {
			if _, stillActive := active[instKey]; stillActive {
				continue
			}

			if rMat == nil {
				// Nothing can be recorded for this matrix; warn once instead
				// of repeating the same message for every missing instance.
				h.Logger.Warn().Str("key", matKey).Msg("empty resolution Matrix")
				break
			}

			rInstance, err := rMat.NewInstance(instKey)
			if err != nil {
				// Attach the error so the reason for the failure is visible.
				h.Logger.Warn().Err(err).Str("key", instKey).Msg("error while creating instance")
				continue
			}

			rInstance.SetLabels(prevMat.GetInstance(instKey).GetLabels())
			h.setAlertMetric(rMat, rInstance, 0)
		}
	}
	h.previousData = h.data
}
67 changes: 67 additions & 0 deletions cmd/collectors/rest/plugins/health/health_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package health

import (
"github.com/netapp/harvest/v2/cmd/poller/plugin"
"github.com/netapp/harvest/v2/pkg/logging"
"github.com/netapp/harvest/v2/pkg/matrix"
"testing"
)

// TestEndPoll exercises generateResolutionMetrics. It seeds previousData with
// two instances ("0" and "1") that are absent from the current data and checks
// that both show up in the resolution matrix with their labels, and that
// previousData is replaced by the current data afterwards.
func TestEndPoll(t *testing.T) {
	// Create a new Health struct. InitAllMatrix allocates h.data and
	// h.resolutionData, so only previousData needs to be created here.
	h := &Health{AbstractPlugin: plugin.New("health", nil, nil, nil, "health", nil)}
	h.Logger = logging.Get()
	h.previousData = make(map[string]*matrix.Matrix)
	if err := h.InitAllMatrix(); err != nil {
		t.Fatalf("InitAllMatrix failed: %v", err)
	}

	matName := "health_lif"

	// addInstance creates an instance with a single label and fails the test
	// if the instance cannot be created.
	addInstance := func(m *matrix.Matrix, key, label, value string) {
		t.Helper()
		inst, err := m.NewInstance(key)
		if err != nil {
			t.Fatalf("creating instance %q: %v", key, err)
		}
		inst.SetLabel(label, value)
	}

	// Initialize some test data
	prevMat := matrix.New("UUID", "object", "identifier")
	addInstance(prevMat, "0", "label0", "value0")
	addInstance(prevMat, "1", "label1", "value1")
	h.previousData[matName] = prevMat

	curMat := matrix.New("UUID", "object", "identifier")
	addInstance(curMat, "2", "label2", "value2")
	h.data[matName] = curMat

	// A matrix that exists only in the current data; it has no previous
	// counterpart and must not produce resolution instances.
	curMat2 := matrix.New("UUID2", "object2", "identifier2")
	addInstance(curMat2, "2", "label2", "value2")
	h.data["testMatrix2"] = curMat2

	h.generateResolutionMetrics()

	// Check that resolutionData has the expected values
	resMat, ok := h.resolutionData[matName]
	if !ok {
		t.Fatalf("expected resolutionData to have key %s", matName)
	}

	// Check the count of instances in the resolution matrix
	if got := len(resMat.GetInstances()); got != 2 {
		t.Fatalf("expected resolutionData to have 2 instances, got %d", got)
	}

	// Check that previousData is correctly updated
	if got := len(h.previousData[matName].GetInstances()); got != 1 {
		t.Fatalf("expected previousData to have 1 instance, got %d", got)
	}

	// Check the instances in the resolution matrix
	for _, instanceKey := range []string{"0", "1"} {
		resInstance := resMat.GetInstance(instanceKey)
		if resInstance == nil {
			t.Fatalf("expected resolutionData to have instance with index %s", instanceKey)
		}

		if label := resInstance.GetLabel("label" + instanceKey); label != "value"+instanceKey {
			t.Fatalf("expected instance label 'label%s' to be 'value%s', got '%s'", instanceKey, instanceKey, label)
		}
	}
}
Loading

0 comments on commit 9aae486

Please sign in to comment.