-
Notifications
You must be signed in to change notification settings - Fork 10
health: Create package to determine health #15
Changes from all commits
487aa3d
7c6301a
4977f03
365504e
5513730
f626979
0a6c5b4
892abf3
16debaf
88c7c80
ebbfa94
d477371
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
package health | ||
|
||
import ( | ||
"context" | ||
"time" | ||
|
||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/metrics" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/util" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/pkg/config" | ||
"github.com/pkg/errors" | ||
"github.com/sirupsen/logrus" | ||
) | ||
|
||
// DiagnosisResult is a possible result after a diagnosis.
type DiagnosisResult int

// Possible diagnosis results.
const (
	// Unknown is the zero value: no diagnosis was (or could be) made.
	Unknown DiagnosisResult = iota

	// Inconclusive means there was not enough data to determine health
	// (e.g. the minimum number of requests was not met).
	Inconclusive

	// Healthy means every configured health criterion was met.
	Healthy

	// Unhealthy means at least one health criterion was not met.
	Unhealthy
)

// String returns a human-readable name for the diagnosis result, suitable
// for logs and error messages. Unrecognized values map to "unknown".
func (d DiagnosisResult) String() string {
	switch d {
	case Inconclusive:
		return "inconclusive"
	case Healthy:
		return "healthy"
	case Unhealthy:
		return "unhealthy"
	default:
		return "unknown"
	}
}

// Diagnosis is the information about the health of the revision.
type Diagnosis struct {
	OverallResult DiagnosisResult
	CheckResults  []CheckResult
}

// CheckResult is information about a metrics criteria check.
type CheckResult struct {
	Threshold     float64
	ActualValue   float64
	IsCriteriaMet bool
}
|
||
// Diagnose attempts to determine the health of a revision. | ||
// | ||
// If the minimum number of requests is not met, then health cannot be | ||
// determined and Diagnosis.EnoughRequests is set to false. | ||
// | ||
// Otherwise, all metrics criteria are checked to determine if the revision is | ||
// healthy. | ||
func Diagnose(ctx context.Context, provider metrics.Provider, query metrics.Query, | ||
offset time.Duration, minRequests int64, healthCriteria []config.Metric) (*Diagnosis, error) { | ||
|
||
logger := util.LoggerFromContext(ctx) | ||
metricsValues, err := CollectMetrics(ctx, provider, query, offset, healthCriteria) | ||
if err != nil { | ||
return nil, errors.Wrap(err, "could not collect metrics") | ||
} | ||
|
||
overallResult := Healthy | ||
var results []CheckResult | ||
for i, criteria := range healthCriteria { | ||
logger := logger.WithFields(logrus.Fields{ | ||
"metricsType": criteria.Type, | ||
"percentile": criteria.Percentile, | ||
"threshold": criteria.Threshold, | ||
"actualValue": metricsValues[i], | ||
}) | ||
|
||
result := determineResult(criteria.Type, criteria.Threshold, metricsValues[i]) | ||
results = append(results, result) | ||
if result.IsCriteriaMet { | ||
logger.Debug("met criteria") | ||
continue | ||
} | ||
|
||
overallResult = Unhealthy | ||
logger.Debug("unmet criteria") | ||
} | ||
|
||
return &Diagnosis{ | ||
OverallResult: overallResult, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While I appreciate the efficiency of the loop above, it's doing two things: collecting metrics and evaluating the aggregation of those metrics. I prefer to separate out the data collection from the data analysis, and minimize the amount of property passing and storing of values derived from mutable data. In that case, instead of a control function such as
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like this approach, what about doing this: results := health.Assess(ctx, provider, query, offset, healthCriteria) // returns []health.CheckResult
diagnosis := health.Diagnose(healthCriteria, results) The above approach works too. It just means an extra struct containing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, SGTM. Please capture this in an issue and let's do it another PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keeping track of this at #30 |
||
CheckResults: results, | ||
}, nil | ||
} | ||
|
||
// CollectMetrics returns an array of values collected for each of the specified | ||
// metrics criteria. | ||
func CollectMetrics(ctx context.Context, provider metrics.Provider, query metrics.Query, offset time.Duration, healthCriteria []config.Metric) ([]float64, error) { | ||
logger := util.LoggerFromContext(ctx) | ||
logger.Debug("start collecting metrics") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO: OpenTelemetry instrumentation? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know how OT would play into here. Are you talking about "querying OT" or "using OT to trace the operator tool itself"? We would look metrics storages (Stackdriver API, Prometheus) and those don't know anything about OT when it comes to querying. |
||
var values []float64 | ||
for _, criteria := range healthCriteria { | ||
var value float64 | ||
var err error | ||
|
||
switch criteria.Type { | ||
case config.LatencyMetricsCheck: | ||
value, err = latency(ctx, provider, query, offset, criteria.Percentile) | ||
break | ||
case config.ErrorRateMetricsCheck: | ||
value, err = errorRatePercent(ctx, provider, query, offset) | ||
break | ||
default: | ||
return nil, errors.Errorf("unimplemented metrics %q", criteria.Type) | ||
} | ||
gvso marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if err != nil { | ||
return nil, errors.Wrapf(err, "failed to obtain metrics %q", criteria.Type) | ||
} | ||
values = append(values, value) | ||
} | ||
|
||
return values, nil | ||
} | ||
|
||
// determineResult concludes whether a single metrics criteria was met by
// comparing the actual value against the threshold, and returns a CheckResult
// recording both values and the outcome.
//
// metricsType is currently unused: every supported criterion (latency and
// error rate) is met when the actual value is at or below the threshold.
// It is accepted now so the signature does not have to change once criteria
// with a minimum threshold are added.
func determineResult(metricsType config.MetricsCheck, threshold float64, actualValue float64) CheckResult {
	result := CheckResult{ActualValue: actualValue, Threshold: threshold}

	// As of now, the supported health criteria (latency and error rate) need to
	// be less than the threshold. So, this is sufficient for now but might need
	// to change to a switch statement when criteria with a minimum threshold is
	// added.
	if actualValue <= threshold {
		result.IsCriteriaMet = true
	}
	return result
}
|
||
// latency returns the latency for the given offset and percentile. | ||
func latency(ctx context.Context, provider metrics.Provider, query metrics.Query, offset time.Duration, percentile float64) (float64, error) { | ||
alignerReducer, err := metrics.PercentileToAlignReduce(percentile) | ||
if err != nil { | ||
return 0, errors.Wrap(err, "failed to parse percentile") | ||
} | ||
|
||
logger := util.LoggerFromContext(ctx).WithField("percentile", percentile) | ||
logger.Debug("querying for latency metrics") | ||
latency, err := provider.Latency(ctx, query, offset, alignerReducer) | ||
if err != nil { | ||
return 0, errors.Wrap(err, "failed to get latency metrics") | ||
} | ||
logger.WithField("value", latency).Debug("latency successfully retrieved") | ||
|
||
return latency, nil | ||
} | ||
|
||
// errorRatePercent returns the percentage of errors during the given offset. | ||
func errorRatePercent(ctx context.Context, provider metrics.Provider, query metrics.Query, offset time.Duration) (float64, error) { | ||
logger := util.LoggerFromContext(ctx) | ||
logger.Debug("querying for error rate metrics") | ||
rate, err := provider.ErrorRate(ctx, query, offset) | ||
if err != nil { | ||
return 0, errors.Wrap(err, "failed to get error rate metrics") | ||
} | ||
|
||
// Multiply rate by 100 to have a percentage. | ||
rate *= 100 | ||
logger.WithField("value", rate).Debug("error rate successfully retrieved") | ||
return rate, nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
package health_test | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/metrics" | ||
metricsMocker "github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/metrics/mock" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/pkg/config" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/pkg/health" | ||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestDiagnose(t *testing.T) { | ||
metricsMock := &metricsMocker.Metrics{} | ||
metricsMock.LatencyFn = func(ctx context.Context, query metrics.Query, offset time.Duration, alignReduceType metrics.AlignReduce) (float64, error) { | ||
return 500, nil | ||
} | ||
metricsMock.ErrorRateFn = func(ctx context.Context, query metrics.Query, offset time.Duration) (float64, error) { | ||
return 0.01, nil | ||
} | ||
|
||
tests := []struct { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might be good to test metrics and thresholds set to 0. Even if that is working fine now, that seems likely to raise unexpected behavior in any future refactoring. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keeping track at #31 |
||
name string | ||
query metrics.Query | ||
offset time.Duration | ||
minRequests int64 | ||
healthCriteria []config.Metric | ||
expected *health.Diagnosis | ||
}{ | ||
{ | ||
name: "healthy revision", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
minRequests: 1000, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.LatencyMetricsCheck, Percentile: 99, Threshold: 750}, | ||
{Type: config.ErrorRateMetricsCheck, Threshold: 5}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Healthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 750.0, | ||
ActualValue: 500.0, | ||
IsCriteriaMet: true, | ||
}, | ||
{ | ||
Threshold: 5.0, | ||
ActualValue: 1.0, | ||
IsCriteriaMet: true, | ||
}, | ||
}, | ||
}, | ||
}, | ||
{ | ||
name: "barely healthy revision", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.LatencyMetricsCheck, Percentile: 99, Threshold: 500}, | ||
{Type: config.ErrorRateMetricsCheck, Threshold: 1}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Healthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 500.0, | ||
ActualValue: 500.0, | ||
IsCriteriaMet: true, | ||
}, | ||
{ | ||
Threshold: 1.0, | ||
ActualValue: 1.0, | ||
IsCriteriaMet: true, | ||
}, | ||
}, | ||
}, | ||
}, | ||
{ | ||
name: "unhealthy revision, miss latency", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
minRequests: 1000, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.LatencyMetricsCheck, Percentile: 99, Threshold: 499}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Unhealthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 499.0, | ||
ActualValue: 500.0, | ||
IsCriteriaMet: false, | ||
}, | ||
}, | ||
}, | ||
}, | ||
{ | ||
name: "unhealthy revision, miss error rate", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
minRequests: 1000, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.ErrorRateMetricsCheck, Threshold: 0.95}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Unhealthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 0.95, | ||
ActualValue: 1.0, | ||
IsCriteriaMet: false, | ||
}, | ||
}, | ||
}, | ||
}, | ||
} | ||
|
||
for _, test := range tests { | ||
t.Run(test.name, func(t *testing.T) { | ||
ctx := context.Background() | ||
diagnosis, _ := health.Diagnose(ctx, metricsMock, test.query, test.offset, test.minRequests, test.healthCriteria) | ||
assert.Equal(t, test.expected, diagnosis) | ||
}) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is potentially a problem if no healthCriteria is specified (somehow) and you return `Healthy`. That's why we define `Inconclusive` and `Unknown`.

If you say `var overallResult DiagnosisResult`, it will default to `Unknown`.
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
to be clear my concern is: It's not defensive enough for a critical task such as rollout. user is much better off if we don't roll out or rollback, rather than blindly rolling forward due to a bug in our tool.
I need you to approach to the problem with this mindset.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! My initial thought was that `Inconclusive` is the value returned when there are not enough requests, but that's still not implemented.

I was not sure whether the best approach to empty criteria was to roll forward (maybe the user didn't want health checks) or to do nothing (`Unknown`/`Inconclusive`).
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, it's not a good approach.
Inconclusive or Unknown is better. But best might be returning error if no criteria is specified (terminal error).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's address this at #30