Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cluster health dashboard #1881

Merged
merged 30 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
538f7ee
feat: cluster health dashboard
rahulguptajss Mar 31, 2023
959d6fe
feat: added some more alerts
rahulguptajss Apr 3, 2023
e6a5735
feat: added some more alerts
rahulguptajss Apr 3, 2023
f38351e
feat: added some more alerts
rahulguptajss Apr 3, 2023
7cf66f7
feat: fix tests
rahulguptajss Apr 3, 2023
61d1c37
feat: add more alerts
rahulguptajss Apr 4, 2023
01a16fa
feat: add more alerts
rahulguptajss Apr 4, 2023
3166908
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 5, 2023
2d7d17a
feat: merge main
rahulguptajss Apr 5, 2023
e8cc2b9
feat: health dashboard
rahulguptajss Apr 6, 2023
658a468
feat: health dashboard
rahulguptajss Apr 6, 2023
306797b
feat: health dashboard
rahulguptajss Apr 7, 2023
604b4ae
feat: health dashboard
rahulguptajss Apr 7, 2023
4418b72
feat: health dashboard
rahulguptajss Apr 7, 2023
b4c2eff
feat: health dashboard
rahulguptajss Apr 7, 2023
0e3986f
feat: health dashboard
rahulguptajss Apr 7, 2023
22499ca
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 7, 2023
e92fbde
feat: ignore health counters
rahulguptajss Apr 7, 2023
f88dafc
feat: ignore ems counters
rahulguptajss Apr 7, 2023
030de9a
feat: address review comments
rahulguptajss Apr 10, 2023
83193be
feat: address review comments
rahulguptajss Apr 10, 2023
25af8b5
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 10, 2023
b11dca1
feat: address review comments
rahulguptajss Apr 10, 2023
c73cef5
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 10, 2023
7015d52
feat: solve empty column problem
rahulguptajss Apr 11, 2023
aedb76b
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 11, 2023
80700b0
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 11, 2023
2cab067
Merge remote-tracking branch 'origin/main' into rg2-health-alert
rahulguptajss Apr 11, 2023
13cefa1
feat: address review comments
rahulguptajss Apr 12, 2023
0a1ad62
feat: address review comments
rahulguptajss Apr 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 70 additions & 1 deletion cmd/collectors/commonutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,19 @@ package collectors

import (
"github.com/netapp/harvest/v2/cmd/tools/rest"
"github.com/netapp/harvest/v2/pkg/errs"
"github.com/netapp/harvest/v2/pkg/logging"
"github.com/netapp/harvest/v2/pkg/matrix"
"github.com/netapp/harvest/v2/pkg/tree/node"
"github.com/tidwall/gjson"
"strings"
"time"
)

const DefaultBatchSize = "500"
// Shared defaults for the collectors package.
const (
// DefaultBatchSize is kept as a string rather than a number.
// NOTE(review): presumably the default number of records requested per REST
// call; confirm against the callers, which are not visible here.
DefaultBatchSize = "500"
// MaxAllowedTimeDrift is the threshold at or above which GetClusterTime logs a
// warning that the times reported by the cluster differ from each other.
MaxAllowedTimeDrift = 10 * time.Second
)

func InvokeRestCall(client *rest.Client, href string, logger *logging.Logger) ([]gjson.Result, error) {
result, err := rest.Fetch(client, href)
Expand All @@ -25,6 +30,70 @@ func InvokeRestCall(client *rest.Client, href string, logger *logging.Logger) ([
return result, nil
}

// GetClusterTime queries the private CLI endpoint "private/cli/cluster/date"
// and returns the cluster's current time.
//
// client issues the REST request, returnTimeOut is passed through to
// rest.BuildHref, and logger receives parse failures, the drift warning and a
// debug line with the chosen time.
//
// Every returned record that carries a "date" field is parsed as RFC 3339.
// The last date that parses successfully is returned. If any parsed date
// differs from the first parsed date by MaxAllowedTimeDrift or more, a single
// warning is logged; the drift does not cause an error.
//
// An error is returned when the request fails, when the cluster returns no
// records, or when none of the records contains a parsable date. On error the
// returned time is the zero time.Time.
func GetClusterTime(client *rest.Client, returnTimeOut string, logger *logging.Logger) (time.Time, error) {
	var (
		clusterTime time.Time
		nodeTimes   []time.Time
	)

	query := "private/cli/cluster/date"
	fields := []string{"date"}

	href := rest.BuildHref(query, strings.Join(fields, ","), nil, "", "", "1", returnTimeOut, "")

	records, err := rest.Fetch(client, href)
	if err != nil {
		return time.Time{}, err
	}
	if len(records) == 0 {
		return time.Time{}, errs.New(errs.ErrConfig, "date not found on cluster")
	}

	for _, record := range records {
		date := record.Get("date")
		if !date.Exists() {
			continue
		}
		// A distinct name keeps the request error above from being shadowed.
		t, parseErr := time.Parse(time.RFC3339, date.String())
		if parseErr != nil {
			logger.Error().Str("date", date.String()).Err(parseErr).Msg("Failed to load cluster date")
			continue
		}
		clusterTime = t
		nodeTimes = append(nodeTimes, t)
	}

	// Without this guard a response whose dates are all missing or malformed
	// would hand the caller a zero time together with a nil error.
	if len(nodeTimes) == 0 {
		return time.Time{}, errs.New(errs.ErrConfig, "no valid date found on cluster")
	}

	// Compare every parsed time against the first one; one warning is enough.
	for _, nodeTime := range nodeTimes {
		timeDrift := nodeTime.Sub(nodeTimes[0]).Abs()
		if timeDrift >= MaxAllowedTimeDrift {
			logger.Warn().Float64("timedrift(in sec)", timeDrift.Seconds()).Msg("Time drift exist among the nodes")
			break
		}
	}

	logger.Debug().Str("cluster time", clusterTime.String()).Msg("")
	return clusterTime, nil
}

// GetDataInterval fetch pollData interval
func GetDataInterval(param *node.Node, defaultInterval time.Duration) (time.Duration, error) {
rahulguptajss marked this conversation as resolved.
Show resolved Hide resolved
var dataIntervalStr string
var durationVal time.Duration
var err error
schedule := param.GetChildS("schedule")
if schedule != nil {
dataInterval := schedule.GetChildS("data")
if dataInterval != nil {
dataIntervalStr = dataInterval.GetContentS()
if durationVal, err = time.ParseDuration(dataIntervalStr); err == nil {
return durationVal, nil
}
return defaultInterval, err
}
}
return defaultInterval, nil
}

func UpdateProtectedFields(instance *matrix.Instance) {

// check for group_type
Expand Down
41 changes: 41 additions & 0 deletions cmd/collectors/commonutils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package collectors

import (
"github.com/netapp/harvest/v2/pkg/matrix"
"github.com/netapp/harvest/v2/pkg/tree/node"
"testing"
"time"
)
Expand Down Expand Up @@ -256,3 +257,43 @@ func testNewerTimestampThanDuration(t *testing.T) {
t.Errorf("timestamp= %f is newer than duration %s", timestamp, duration.String())
}
}

// TestGetDataInterval checks GetDataInterval against a valid duration, an
// unparsable duration, an empty duration, and params where the schedule or
// its data entry is missing altogether.
func TestGetDataInterval(t *testing.T) {
	defaultDataPollDuration := 3 * time.Minute

	// Params for the paths where nothing is configured: the default must be
	// returned without an error.
	noSchedule := node.NewS("root")
	scheduleWithoutData := node.NewS("root")
	scheduleWithoutData.NewChildS("schedule", "")

	type args struct {
		param           *node.Node
		defaultInterval time.Duration
	}

	type test struct {
		name    string
		args    args
		want    float64 // expected interval in seconds
		wantErr bool
	}
	tests := []test{
		{"success_return_poller_schedule", args{param: generateScheduleParam("4m"), defaultInterval: defaultDataPollDuration}, 240, false},
		{"error_return_default_schedule", args{param: generateScheduleParam("4ma"), defaultInterval: defaultDataPollDuration}, 180, true},
		{"return_default_schedule", args{param: generateScheduleParam(""), defaultInterval: defaultDataPollDuration}, 180, true},
		{"no_schedule_return_default", args{param: noSchedule, defaultInterval: defaultDataPollDuration}, 180, false},
		{"no_data_return_default", args{param: scheduleWithoutData, defaultInterval: defaultDataPollDuration}, 180, false},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := GetDataInterval(tt.args.param, tt.args.defaultInterval)
			if (err != nil) != tt.wantErr {
				t.Errorf("GetDataInterval() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if got.Seconds() != tt.want {
				t.Errorf("GetDataInterval() got = %v, want %v", got, tt.want)
			}
		})
	}
}

// generateScheduleParam builds a "root" node with a "schedule" child that in
// turn holds a "data" child whose content is duration, and returns the root.
func generateScheduleParam(duration string) *node.Node {
	tree := node.NewS("root")
	tree.NewChildS("schedule", "").NewChildS("data", duration)
	return tree
}
69 changes: 2 additions & 67 deletions cmd/collectors/ems/ems.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ const defaultSeverityFilter = "alert|emergency|error|informational|notice"
const MaxBookendInstances = 1000
const DefaultBookendResolutionDuration = 28 * 24 * time.Hour // 28 days == 672 hours
const Hyphen = "-"
const MaxAllowedTimeDrift = 10 * time.Second

type Ems struct {
*rest2.Rest // provides: AbstractCollector, Client, Object, Query, TemplateFn, TemplateType
Expand Down Expand Up @@ -248,58 +247,13 @@ func (e *Ems) InitCache() error {
return nil
}

// getClusterTime queries "private/cli/cluster/date" through e.GetRestData and
// returns the last date in the response that parses as RFC 3339.
//
// Dates that fail to parse are logged and skipped. One warning is logged if
// any parsed date differs from the first parsed date by MaxAllowedTimeDrift
// or more. An error is returned when the request fails or yields no records.
func (e *Ems) getClusterTime() (time.Time, error) {
	var (
		clusterTime time.Time
		nodeNanos   []int64
	)

	href := rest.BuildHref("private/cli/cluster/date", strings.Join([]string{"date"}, ","), nil, "", "", "1", e.ReturnTimeOut, "")

	records, err := e.GetRestData(href)
	if err != nil {
		return clusterTime, err
	}
	if len(records) == 0 {
		return clusterTime, errs.New(errs.ErrConfig, e.Object+" date not found on cluster")
	}

	for _, record := range records {
		date := record.Get("date")
		if !date.Exists() {
			continue
		}
		parsed, parseErr := time.Parse(time.RFC3339, date.String())
		if parseErr != nil {
			e.Logger.Error().Str("date", date.String()).Err(parseErr).Msg("Failed to load cluster date")
			continue
		}
		clusterTime = parsed
		nodeNanos = append(nodeNanos, parsed.UnixNano())
	}

	// Measure each parsed time against the first; stop after the first warning.
	for i := range nodeNanos {
		drift := time.Duration(nodeNanos[i] - nodeNanos[0]).Abs()
		if drift < MaxAllowedTimeDrift {
			continue
		}
		e.Logger.Warn().Float64("timedrift(in sec)", drift.Seconds()).Msg("Time drift exist among the nodes")
		break
	}

	e.Logger.Debug().Str("cluster time", clusterTime.String()).Msg("")
	return clusterTime, nil
}

// returns time filter (clustertime - polldata duration)
func (e *Ems) getTimeStampFilter(clusterTime time.Time) string {
fromTime := e.lastFilterTime
// check if this is the first request
if e.lastFilterTime == 0 {
// if first request fetch cluster time
dataDuration, err := GetDataInterval(e.GetParams(), defaultDataPollDuration)
dataDuration, err := collectors.GetDataInterval(e.GetParams(), defaultDataPollDuration)
if err != nil {
e.Logger.Warn().Err(err).
Str("defaultDataPollDuration", defaultDataPollDuration.String()).
Expand Down Expand Up @@ -421,7 +375,7 @@ func (e *Ems) PollData() (map[string]*matrix.Matrix, error) {
startTime = time.Now()

// add time filter
clusterTime, err := e.getClusterTime()
clusterTime, err := collectors.GetClusterTime(e.Client, e.ReturnTimeOut, e.Logger)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -507,25 +461,6 @@ func (e *Ems) getHref(names []string, filter []string) string {
return href
}

// GetDataInterval reads the poll interval from param's schedule -> data entry.
// It returns defaultInterval with a nil error when either child is absent, and
// defaultInterval with the parse error when the content is not a valid
// time.ParseDuration string.
func GetDataInterval(param *node.Node, defaultInterval time.Duration) (time.Duration, error) {
	if schedule := param.GetChildS("schedule"); schedule != nil {
		if data := schedule.GetChildS("data"); data != nil {
			parsed, err := time.ParseDuration(data.GetContentS())
			if err != nil {
				return defaultInterval, err
			}
			return parsed, nil
		}
	}
	return defaultInterval, nil
}

func parseProperties(instanceData gjson.Result, property string) gjson.Result {

if !strings.HasPrefix(property, "parameters.") {
Expand Down
Loading