-
Notifications
You must be signed in to change notification settings - Fork 10
health: Create package to determine health #15
Changes from all commits
487aa3d
7c6301a
4977f03
365504e
5513730
f626979
0a6c5b4
892abf3
16debaf
88c7c80
ebbfa94
d477371
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
package health | ||
|
||
import ( | ||
"context" | ||
"time" | ||
|
||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/metrics" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/util" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/pkg/config" | ||
"github.com/pkg/errors" | ||
"github.com/sirupsen/logrus" | ||
) | ||
|
||
// DiagnosisResult is a possible result after a diagnosis.
type DiagnosisResult int

// Possible diagnosis results.
const (
	// Unknown is the zero value: no diagnosis was (or could be) made.
	Unknown DiagnosisResult = iota

	// Inconclusive means there was not enough data to determine health
	// (e.g. the minimum number of requests was not met).
	Inconclusive

	// Healthy means every configured health criterion was met.
	Healthy

	// Unhealthy means at least one health criterion was not met.
	Unhealthy
)

// String returns a human-readable name for the diagnosis result, suitable
// for logs and error messages. Unrecognized values map to "unknown".
func (d DiagnosisResult) String() string {
	switch d {
	case Inconclusive:
		return "inconclusive"
	case Healthy:
		return "healthy"
	case Unhealthy:
		return "unhealthy"
	default:
		return "unknown"
	}
}

// Diagnosis is the information about the health of the revision.
type Diagnosis struct {
	OverallResult DiagnosisResult
	CheckResults  []CheckResult
}

// CheckResult is information about a metrics criteria check.
type CheckResult struct {
	Threshold     float64
	ActualValue   float64
	IsCriteriaMet bool
}
|
||
// Diagnose attempts to determine the health of a revision. | ||
// | ||
// If the minimum number of requests is not met, then health cannot be | ||
// determined and Diagnosis.EnoughRequests is set to false. | ||
// | ||
// Otherwise, all metrics criteria are checked to determine if the revision is | ||
// healthy. | ||
func Diagnose(ctx context.Context, provider metrics.Provider, query metrics.Query, | ||
offset time.Duration, minRequests int64, healthCriteria []config.Metric) (*Diagnosis, error) { | ||
|
||
logger := util.LoggerFromContext(ctx) | ||
metricsValues, err := CollectMetrics(ctx, provider, query, offset, healthCriteria) | ||
if err != nil { | ||
return nil, errors.Wrap(err, "could not collect metrics") | ||
} | ||
|
||
overallResult := Healthy | ||
var results []CheckResult | ||
for i, criteria := range healthCriteria { | ||
logger := logger.WithFields(logrus.Fields{ | ||
"metricsType": criteria.Type, | ||
"percentile": criteria.Percentile, | ||
"threshold": criteria.Threshold, | ||
"actualValue": metricsValues[i], | ||
}) | ||
|
||
result := determineResult(criteria.Type, criteria.Threshold, metricsValues[i]) | ||
results = append(results, result) | ||
if result.IsCriteriaMet { | ||
logger.Debug("met criteria") | ||
continue | ||
} | ||
|
||
overallResult = Unhealthy | ||
logger.Debug("unmet criteria") | ||
} | ||
|
||
return &Diagnosis{ | ||
OverallResult: overallResult, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While I appreciate the efficiency of the loop above, it's doing two things: collecting metrics and evaluating the aggregation of those metrics. I prefer to separate out the data collection from the data analysis, and minimize the amount of property passing and storing of values derived from mutable data. In that case, instead of a control function such as
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like this approach, what about doing this: results := health.Assess(ctx, provider, query, offset, healthCriteria) // returns []health.CheckResult
diagnosis := health.Diagnose(healthCriteria, results) The above approach works too. It just means an extra struct containing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, SGTM. Please capture this in an issue and let's do it another PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keeping track of this at #30 |
||
CheckResults: results, | ||
}, nil | ||
} | ||
|
||
// CollectMetrics returns an array of values collected for each of the specified | ||
// metrics criteria. | ||
func CollectMetrics(ctx context.Context, provider metrics.Provider, query metrics.Query, offset time.Duration, healthCriteria []config.Metric) ([]float64, error) { | ||
logger := util.LoggerFromContext(ctx) | ||
logger.Debug("start collecting metrics") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO: OpenTelemetry instrumentation? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know how OT would play into here. Are you talking about "querying OT" or "using OT to trace the operator tool itself"? We would look metrics storages (Stackdriver API, Prometheus) and those don't know anything about OT when it comes to querying. |
||
var values []float64 | ||
for _, criteria := range healthCriteria { | ||
var value float64 | ||
var err error | ||
|
||
switch criteria.Type { | ||
case config.LatencyMetricsCheck: | ||
value, err = latency(ctx, provider, query, offset, criteria.Percentile) | ||
break | ||
case config.ErrorRateMetricsCheck: | ||
value, err = errorRatePercent(ctx, provider, query, offset) | ||
break | ||
default: | ||
return nil, errors.Errorf("unimplemented metrics %q", criteria.Type) | ||
} | ||
gvso marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if err != nil { | ||
return nil, errors.Wrapf(err, "failed to obtain metrics %q", criteria.Type) | ||
} | ||
values = append(values, value) | ||
} | ||
|
||
return values, nil | ||
} | ||
|
||
// determineResult concludes whether a single metrics criteria was met by
// comparing the actual value against the threshold, and returns a CheckResult
// recording both values and the outcome.
//
// metricsType is currently unused: every supported criterion (latency and
// error rate) is met when the actual value is at or below the threshold.
// It is accepted now so the signature does not have to change once criteria
// with a minimum threshold are added.
func determineResult(metricsType config.MetricsCheck, threshold float64, actualValue float64) CheckResult {
	result := CheckResult{ActualValue: actualValue, Threshold: threshold}

	// As of now, the supported health criteria (latency and error rate) need to
	// be less than the threshold. So, this is sufficient for now but might need
	// to change to a switch statement when criteria with a minimum threshold is
	// added.
	if actualValue <= threshold {
		result.IsCriteriaMet = true
	}
	return result
}
|
||
// latency returns the latency for the given offset and percentile. | ||
func latency(ctx context.Context, provider metrics.Provider, query metrics.Query, offset time.Duration, percentile float64) (float64, error) { | ||
alignerReducer, err := metrics.PercentileToAlignReduce(percentile) | ||
if err != nil { | ||
return 0, errors.Wrap(err, "failed to parse percentile") | ||
} | ||
|
||
logger := util.LoggerFromContext(ctx).WithField("percentile", percentile) | ||
logger.Debug("querying for latency metrics") | ||
latency, err := provider.Latency(ctx, query, offset, alignerReducer) | ||
if err != nil { | ||
return 0, errors.Wrap(err, "failed to get latency metrics") | ||
} | ||
logger.WithField("value", latency).Debug("latency successfully retrieved") | ||
|
||
return latency, nil | ||
} | ||
|
||
// errorRatePercent returns the percentage of errors during the given offset. | ||
func errorRatePercent(ctx context.Context, provider metrics.Provider, query metrics.Query, offset time.Duration) (float64, error) { | ||
logger := util.LoggerFromContext(ctx) | ||
logger.Debug("querying for error rate metrics") | ||
rate, err := provider.ErrorRate(ctx, query, offset) | ||
if err != nil { | ||
return 0, errors.Wrap(err, "failed to get error rate metrics") | ||
} | ||
|
||
// Multiply rate by 100 to have a percentage. | ||
rate *= 100 | ||
logger.WithField("value", rate).Debug("error rate successfully retrieved") | ||
return rate, nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
package health_test | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/metrics" | ||
metricsMocker "github.com/GoogleCloudPlatform/cloud-run-release-operator/internal/metrics/mock" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/pkg/config" | ||
"github.com/GoogleCloudPlatform/cloud-run-release-operator/pkg/health" | ||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestDiagnose(t *testing.T) { | ||
metricsMock := &metricsMocker.Metrics{} | ||
metricsMock.LatencyFn = func(ctx context.Context, query metrics.Query, offset time.Duration, alignReduceType metrics.AlignReduce) (float64, error) { | ||
return 500, nil | ||
} | ||
metricsMock.ErrorRateFn = func(ctx context.Context, query metrics.Query, offset time.Duration) (float64, error) { | ||
return 0.01, nil | ||
} | ||
|
||
tests := []struct { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might be good to test metrics and thresholds set to 0. Even if that is working fine now, that seems likely to raise unexpected behavior in any future refactoring. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keeping track at #31 |
||
name string | ||
query metrics.Query | ||
offset time.Duration | ||
minRequests int64 | ||
healthCriteria []config.Metric | ||
expected *health.Diagnosis | ||
}{ | ||
{ | ||
name: "healthy revision", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
minRequests: 1000, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.LatencyMetricsCheck, Percentile: 99, Threshold: 750}, | ||
{Type: config.ErrorRateMetricsCheck, Threshold: 5}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Healthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 750.0, | ||
ActualValue: 500.0, | ||
IsCriteriaMet: true, | ||
}, | ||
{ | ||
Threshold: 5.0, | ||
ActualValue: 1.0, | ||
IsCriteriaMet: true, | ||
}, | ||
}, | ||
}, | ||
}, | ||
{ | ||
name: "barely healthy revision", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.LatencyMetricsCheck, Percentile: 99, Threshold: 500}, | ||
{Type: config.ErrorRateMetricsCheck, Threshold: 1}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Healthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 500.0, | ||
ActualValue: 500.0, | ||
IsCriteriaMet: true, | ||
}, | ||
{ | ||
Threshold: 1.0, | ||
ActualValue: 1.0, | ||
IsCriteriaMet: true, | ||
}, | ||
}, | ||
}, | ||
}, | ||
{ | ||
name: "unhealthy revision, miss latency", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
minRequests: 1000, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.LatencyMetricsCheck, Percentile: 99, Threshold: 499}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Unhealthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 499.0, | ||
ActualValue: 500.0, | ||
IsCriteriaMet: false, | ||
}, | ||
}, | ||
}, | ||
}, | ||
{ | ||
name: "unhealthy revision, miss error rate", | ||
query: metricsMocker.Query{}, | ||
offset: 5 * time.Minute, | ||
minRequests: 1000, | ||
healthCriteria: []config.Metric{ | ||
{Type: config.ErrorRateMetricsCheck, Threshold: 0.95}, | ||
}, | ||
expected: &health.Diagnosis{ | ||
OverallResult: health.Unhealthy, | ||
CheckResults: []health.CheckResult{ | ||
{ | ||
Threshold: 0.95, | ||
ActualValue: 1.0, | ||
IsCriteriaMet: false, | ||
}, | ||
}, | ||
}, | ||
}, | ||
} | ||
|
||
for _, test := range tests { | ||
t.Run(test.name, func(t *testing.T) { | ||
ctx := context.Background() | ||
diagnosis, _ := health.Diagnose(ctx, metricsMock, test.query, test.offset, test.minRequests, test.healthCriteria) | ||
assert.Equal(t, test.expected, diagnosis) | ||
}) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is potentially a problem if no healthCriteria is specified (somehow) and you return `Healthy`. That's why we define `Inconclusive` and `Unknown`.

If you say `var overallResult DiagnosisResult`, it will default to `Unknown`.
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
to be clear my concern is: It's not defensive enough for a critical task such as rollout. user is much better off if we don't roll out or rollback, rather than blindly rolling forward due to a bug in our tool.
I need you to approach to the problem with this mindset.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! My initial thought was that `Inconclusive` is the value returned when there are not enough requests, but that's still not implemented.

I was not sure whether the best approach to empty criteria was to roll forward (maybe the user didn't want health checks) or to do nothing (`Unknown`/`Inconclusive`).
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, it's not a good approach.
Inconclusive or Unknown is better. But best might be returning error if no criteria is specified (terminal error).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's address this at #30