Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cluster health dashboard #1881

Merged
merged 30 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
538f7ee
feat: cluster health dashboard
rahulguptajss Mar 31, 2023
959d6fe
feat: added some more alerts
rahulguptajss Apr 3, 2023
e6a5735
feat: added some more alerts
rahulguptajss Apr 3, 2023
f38351e
feat: added some more alerts
rahulguptajss Apr 3, 2023
7cf66f7
feat: fix tests
rahulguptajss Apr 3, 2023
61d1c37
feat: add more alerts
rahulguptajss Apr 4, 2023
01a16fa
feat: add more alerts
rahulguptajss Apr 4, 2023
3166908
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 5, 2023
2d7d17a
feat: merge main
rahulguptajss Apr 5, 2023
e8cc2b9
feat: health dashboard
rahulguptajss Apr 6, 2023
658a468
feat: health dashboard
rahulguptajss Apr 6, 2023
306797b
feat: health dashboard
rahulguptajss Apr 7, 2023
604b4ae
feat: health dashboard
rahulguptajss Apr 7, 2023
4418b72
feat: health dashboard
rahulguptajss Apr 7, 2023
b4c2eff
feat: health dashboard
rahulguptajss Apr 7, 2023
0e3986f
feat: health dashboard
rahulguptajss Apr 7, 2023
22499ca
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 7, 2023
e92fbde
feat: ignore health counters
rahulguptajss Apr 7, 2023
f88dafc
feat: ignore ems counters
rahulguptajss Apr 7, 2023
030de9a
feat: address review comments
rahulguptajss Apr 10, 2023
83193be
feat: address review comments
rahulguptajss Apr 10, 2023
25af8b5
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 10, 2023
b11dca1
feat: address review comments
rahulguptajss Apr 10, 2023
c73cef5
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 10, 2023
7015d52
feat: solve empty column problem
rahulguptajss Apr 11, 2023
aedb76b
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 11, 2023
80700b0
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 11, 2023
2cab067
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 11, 2023
13cefa1
feat: address review comments
rahulguptajss Apr 12, 2023
0a1ad62
feat: address review comments
rahulguptajss Apr 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 189 additions & 0 deletions cmd/collectors/rest/plugins/health/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
package health

import (
goversion "github.com/hashicorp/go-version"
"github.com/netapp/harvest/v2/cmd/collectors"
"github.com/netapp/harvest/v2/cmd/poller/plugin"
"github.com/netapp/harvest/v2/cmd/tools/rest"
"github.com/netapp/harvest/v2/pkg/conf"
"github.com/netapp/harvest/v2/pkg/errs"
"github.com/netapp/harvest/v2/pkg/matrix"
"github.com/tidwall/gjson"
"strings"
"time"
)

// AlertSeverity is the severity attached to a health alert instance through
// its severity label.
type AlertSeverity string

// Severity values used when labelling alert instances.
const (
	// errr marks an alert as an error; presumably spelled with three r's to
	// avoid clashing with the conventional err identifier.
	errr = AlertSeverity("error")
	// warning marks an alert as a warning.
	warning = AlertSeverity("warning")
)

const (
	// diskHealthMatrix is the key (and matrix name) under which disk health
	// alerts are stored in Health.data.
	diskHealthMatrix = "health_disk"
	// severityLabel is the instance label that carries the AlertSeverity.
	severityLabel = "severity"
)

// Health is a REST collector plugin that turns cluster health conditions
// (currently broken/unassigned disks) into alert matrices.
type Health struct {
	// AbstractPlugin supplies the shared plugin state used by the methods
	// below (logger, parent params, auth, object name).
	*plugin.AbstractPlugin

	// client is the REST client created in Init and used for all queries.
	client *rest.Client

	// data holds the alert matrices produced by the plugin, keyed by matrix name.
	data map[string]*matrix.Matrix
}

// New wraps the given abstract plugin in a Health plugin. The REST client and
// matrices are not created here; they are set up by Init.
func New(p *plugin.AbstractPlugin) plugin.Plugin {
	h := new(Health)
	h.AbstractPlugin = p
	return h
}

// metrics lists the metric names created on every health matrix.
var metrics = []string{"alerts"}

// Init prepares the plugin for use: it initialises the embedded abstract
// plugin, builds the empty alert matrices, and creates and initialises the
// REST client. The first error encountered is returned.
func (v *Health) Init() error {
	if abcErr := v.InitAbc(); abcErr != nil {
		return abcErr
	}

	if matrixErr := v.initAllMatrix(); matrixErr != nil {
		return matrixErr
	}

	// The parse error is deliberately ignored here, as in the original code.
	// NOTE(review): presumably rest.DefaultTimeout is a fixed, valid duration string - confirm.
	timeout, _ := time.ParseDuration(rest.DefaultTimeout)

	var clientErr error
	v.client, clientErr = rest.New(conf.ZapiPoller(v.ParentParams), timeout, v.Auth)
	if clientErr != nil {
		v.Logger.Error().Stack().Err(clientErr).Msg("connecting")
		return clientErr
	}

	return v.client.Init(5)
}

func (v *Health) initAllMatrix() error {
v.data = make(map[string]*matrix.Matrix)
rahulguptajss marked this conversation as resolved.
Show resolved Hide resolved
if err := v.initMatrix(diskHealthMatrix); err != nil {
return err
}
return nil
}

func (v *Health) initMatrix(name string) error {
v.data = make(map[string]*matrix.Matrix)
v.data[name] = matrix.New(v.Parent+name, name, name)
rahulguptajss marked this conversation as resolved.
Show resolved Hide resolved
for _, v1 := range v.data {
v1.SetExportOptions(matrix.DefaultExportOptions())
}
for _, k := range metrics {
err := matrix.CreateMetric(k, v.data[name])
if err != nil {
v.Logger.Warn().Err(err).Str("key", k).Msg("error while creating metric")
return err
}
}
return nil
}

// Run rebuilds the health matrices for the current poll and returns them.
//
// dataMap is indexed by v.Object to obtain the parent matrix, whose global
// labels are copied onto every health matrix. The plugin is a no-op
// (nil, nil) when the cluster version cannot be parsed, when the cluster is
// older than ONTAP 9.6, or when dataMap has no matrix for v.Object. An error
// is returned only if the matrices cannot be re-initialised.
func (v *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, error) {
	data := dataMap[v.Object]
	clusterVersion := v.client.Cluster().GetVersion()
	ontapVersion, err := goversion.NewVersion(clusterVersion)
	if err != nil {
		v.Logger.Error().Err(err).
			Str("version", clusterVersion).
			Msg("Failed to parse version")
		return nil, nil
	}
	version96 := "9.6"
	version96After, err := goversion.NewVersion(version96)
	if err != nil {
		v.Logger.Error().Err(err).
			Str("version", version96).
			Msg("Failed to parse version")
		return nil, nil
	}

	if ontapVersion.LessThan(version96After) {
		return nil, nil
	}

	// A missing map entry yields a nil matrix; bail out instead of
	// dereferencing it when copying the global labels below.
	if data == nil {
		v.Logger.Warn().Str("object", v.Object).Msg("no data matrix found for object, skipping health collection")
		return nil, nil
	}

	// Purge and reset data: the matrices are rebuilt from scratch on every
	// run, so instances reported by an earlier poll are not carried over.
	if err = v.initAllMatrix(); err != nil {
		v.Logger.Warn().Err(err).Msg("error while init matrix")
		return nil, err
	}
	for k := range v.data {
		// Set all global labels if already not exist
		v.data[k].SetGlobalLabels(data.GetGlobalLabels())
	}

	v.collectDiskAlerts()

	result := make([]*matrix.Matrix, 0, len(v.data))
	for _, value := range v.data {
		result = append(result, value)
	}
	return result, nil
}

func (v *Health) collectDiskAlerts() {
var (
instance *matrix.Instance
)
if records, err := v.getDisks(); err != nil {
if errs.IsRestErr(err, errs.APINotFound) {
v.Logger.Debug().Err(err).Msg("API not found")
} else {
v.Logger.Error().Err(err).Msg("Failed to collect analytic data")
}
rahulguptajss marked this conversation as resolved.
Show resolved Hide resolved
} else {
dMatrix := v.data[diskHealthMatrix]
for _, record := range records {
name := record.Get("name").String()
containerType := record.Get("container_type").String()
instance, err = dMatrix.NewInstance(name)
if err != nil {
v.Logger.Warn().Str("key", name).Msg("error while creating instance")
continue
}
instance.SetLabel("disk", name)
instance.SetLabel("container_type", containerType)
if containerType == "broken" {
instance.SetLabel(severityLabel, string(errr))
} else {
instance.SetLabel(severityLabel, string(warning))
}
m := dMatrix.GetMetric("alerts")
if m == nil {
if m, err = dMatrix.NewMetricFloat64("alerts"); err != nil {
v.Logger.Warn().Err(err).Str("key", "alerts").Msg("error while creating metric")
continue
}
}
if err := m.SetValueFloat64(instance, 1); err != nil {
v.Logger.Error().Err(err).Str("metric", "alerts").Msg("Unable to set value on metric")
}
}
}
}

// getDisks calls the api/storage/disks endpoint, requesting the name and
// container_type fields and filtering on container_type=broken|unassigned.
// It returns the matching records, or nil and the error if the call fails.
func (v *Health) getDisks() ([]gjson.Result, error) {
	endpoint := "api/storage/disks"
	wanted := strings.Join([]string{"name", "container_type"}, ",")
	filter := []string{"container_type=broken|unassigned"}

	href := rest.BuildHref(endpoint, wanted, filter, "", "", "", "", endpoint)

	records, err := collectors.InvokeRestCall(v.client, href, v.Logger)
	if err != nil {
		return nil, err
	}
	return records, nil
}
3 changes: 3 additions & 0 deletions cmd/collectors/rest/rest.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"github.com/netapp/harvest/v2/cmd/collectors/rest/plugins/certificate"
"github.com/netapp/harvest/v2/cmd/collectors/rest/plugins/disk"
"github.com/netapp/harvest/v2/cmd/collectors/rest/plugins/health"
"github.com/netapp/harvest/v2/cmd/collectors/rest/plugins/netroute"
"github.com/netapp/harvest/v2/cmd/collectors/rest/plugins/qospolicyadaptive"
"github.com/netapp/harvest/v2/cmd/collectors/rest/plugins/qospolicyfixed"
Expand Down Expand Up @@ -369,6 +370,8 @@ func (r *Rest) LoadPlugin(kind string, abc *plugin.AbstractPlugin) plugin.Plugin
switch kind {
case "Disk":
return disk.New(abc)
case "Health":
return health.New(abc)
case "NetRoute":
return netroute.New(abc)
case "Qtree":
Expand Down
1 change: 1 addition & 0 deletions cmd/tools/grafana/dashboard_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,7 @@ func TestOnlyHighlightsExpanded(t *testing.T) {
"cmode/security.json": 3,
"cmode/fsa.json": 2,
"cmode/workload.json": 2,
"cmode/health.json": 2,
}
// count number of expanded sections in dashboard and ensure num expanded = 1
visitDashboards(
Expand Down
12 changes: 12 additions & 0 deletions conf/rest/9.6.0/health.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Template for the Health object, queried from api/cluster.
# NOTE(review): lives under conf/rest/9.6.0, so presumably applies to ONTAP 9.6.0 and later - confirm the version-selection rules.
name: Health
query: api/cluster
object: health

# NOTE(review): presumably "^^" marks the instance key and "^" a label - confirm against the template documentation.
counters:
- ^^uuid
- ^name

# Plugin kind "Health" is resolved by the REST collector's LoadPlugin switch.
plugins:
- Health

# NOTE(review): presumably keeps the api/cluster records themselves from being exported, leaving only the plugin's matrices - confirm.
export_data: false
1 change: 1 addition & 0 deletions conf/rest/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ objects:
ClusterPeer: clusterpeer.yaml
Disk: disk.yaml
# ExportRule: exports.yaml
Health: health.yaml
Lun: lun.yaml
Namespace: namespace.yaml
# NetConnections: netConnections.yaml
Expand Down
Loading