Skip to content

Commit

Permalink
fix(export): make FromFlags & metadata call more reliable and observable
Browse files Browse the repository at this point in the history
Fixes b/344740239 (edge case with GKE Metadata Server and GKE sandbox).

* Added debug logging
* Moved risky logic from FromFlags, see code comment why
* Updated metadata deps & use timeout-ed context

Signed-off-by: bwplotka <bwplotka@google.com>
  • Loading branch information
bwplotka committed Jun 11, 2024
1 parent 10cccee commit e9db076
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 33 deletions.
84 changes: 51 additions & 33 deletions pkg/export/setup/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/lease"
kingpin "github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/google/shlex"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -90,34 +91,13 @@ func Global() *export.Exporter {
// FromFlags returns a constructor for a new exporter that is configured through flags that are
// registered with the given application. The constructor must be called after the flags
// have been parsed.
//
// NOTE(bwplotka): This method should only setup flags, no extra logic should be done here
// as we don't have a logger ready, and nothing was logged for the binary yet.
// Potential risky logic can be moved to the returned function we return here.
// See b/344740239 on how hard is to debug regressions here.
func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Context, log.Logger, prometheus.Registerer) (*export.Exporter, error) {
var opts export.ExporterOpts
env := UAEnvUnspecified

// Default target fields if we can detect them in GCP.
if metadata.OnGCE() {
env = UAEnvGCE
opts.ProjectID, _ = metadata.ProjectID()
opts.Cluster, _ = metadata.InstanceAttributeValue("cluster-name")
if opts.Cluster != "" {
env = UAEnvGKE
}
// These attributes are set for GKE nodes. For the location, we first check
// the cluster location, which may be a zone or a region. We must always use that value
// to avoid collisions with other clusters, as the same cluster name may be reused
// in different locations.
// In particular, we cannot set the location to the node's zone for a regional cluster,
// even though this would provide more accuracy, as there may also be a zonal cluster
// with the same name.
// We only fallback to the node zone as the location if no cluster location exists to
// default for deployments on GCE.
if loc, _ := metadata.InstanceAttributeValue("cluster-location"); loc != "" {
opts.Location = loc
} else {
opts.Location, _ = metadata.Zone()
}
}
opts.UserAgentEnv = env
opts.UserAgentProduct = userAgentProduct

a.Flag("export.disable", "Disable exporting to GCM.").
Expand All @@ -132,20 +112,17 @@ func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Con
a.Flag("export.credentials-file", "Credentials file for authentication with the GCM API.").
Default("").StringVar(&opts.CredentialsFile)

a.Flag("export.label.project-id", fmt.Sprintf("Default project ID set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyProjectID)).
Default(opts.ProjectID).StringVar(&opts.ProjectID)
a.Flag("export.label.project-id", fmt.Sprintf("Default project ID set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyProjectID)).StringVar(&opts.ProjectID)

a.Flag("export.user-agent-mode", fmt.Sprintf("Mode for user agent used for requests against the GCM API. Valid values are %q, %q, %q, %q or %q.", UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM, UAModeUnspecified)).
Default("unspecified").EnumVar(&opts.UserAgentMode, UAModeUnspecified, UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM)
Default(UAModeUnspecified).EnumVar(&opts.UserAgentMode, UAModeUnspecified, UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM)

// The location and cluster flag should probably not be used. On the other hand, they make it easy
// to populate these important values in the monitored resource without interfering with existing
// Prometheus configuration.
a.Flag("export.label.location", fmt.Sprintf("The default location set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyLocation)).
Default(opts.Location).StringVar(&opts.Location)
a.Flag("export.label.location", fmt.Sprintf("The default location set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyLocation)).StringVar(&opts.Location)

a.Flag("export.label.cluster", fmt.Sprintf("The default cluster set for all scraped targets. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyCluster)).
Default(opts.Cluster).StringVar(&opts.Cluster)
a.Flag("export.label.cluster", fmt.Sprintf("The default cluster set for all scraped targets. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyCluster)).StringVar(&opts.Cluster)

a.Flag("export.match", `A Prometheus time series matcher. Can be repeated. Every time series must match at least one of the matchers to be exported. This flag can be used equivalently to the match[] parameter of the Prometheus federation endpoint to selectively export data. (Example: --export.match='{job="prometheus"}' --export.match='{__name__=~"job:.*"})`).
Default("").SetValue(&opts.Matchers)
Expand Down Expand Up @@ -185,6 +162,47 @@ func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Con
Default("").OverrideDefaultFromEnvar("KUBE_NAME").String()

return func(ctx context.Context, logger log.Logger, metrics prometheus.Registerer) (*export.Exporter, error) {
// Default target fields if we can detect them in GCP.
level.Debug(logger).Log("msg", "best-effort call to GCP metadata server to populate default project ID, cluster, location and user-agent")

Check failure on line 166 in pkg/export/setup/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

Error return value of `(github.com/go-kit/log.Logger).Log` is not checked (errcheck)
if metadata.OnGCE() {
env := UAEnvGCE
if opts.ProjectID == "" {
opts.ProjectID, _ = metadata.ProjectID()
}
if opts.Cluster == "" {
level.Debug(logger).Log("msg", "best-effort call to GCP metadata server for potential GKE cluster name")

Check failure on line 173 in pkg/export/setup/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

Error return value of `(github.com/go-kit/log.Logger).Log` is not checked (errcheck)
opts.Cluster, _ = metadata.InstanceAttributeValue("cluster-name")
if opts.Cluster != "" {
env = UAEnvGKE
}
}

if opts.Location == "" {
// These attributes are set for GKE nodes. For the location, we first check
// the cluster location, which may be a zone or a region. We must always use that value
// to avoid collisions with other clusters, as the same cluster name may be reused
// in different locations.
// In particular, we cannot set the location to the node's zone for a regional cluster,
// even though this would provide more accuracy, as there may also be a zonal cluster
// with the same name.
//
// We only fallback to the node zone as the location if no cluster location exists to
// default for deployments on GCE.
level.Debug(logger).Log("msg", "best-effort call to GCP metadata server for potential GKE cluster location")

Check failure on line 191 in pkg/export/setup/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

Error return value of `(github.com/go-kit/log.Logger).Log` is not checked (errcheck)
if loc, _ := metadata.InstanceAttributeValue("cluster-location"); loc != "" {
opts.Location = loc
} else {
opts.Location, _ = metadata.Zone()
}
}

if opts.UserAgentEnv == UAEnvUnspecified {
// NOTE: If user set unspecified on purpose, this will override.
opts.UserAgentEnv = env
}
}
level.Debug(logger).Log("msg", "best-effort GCP metadata server lookup finished")

switch *haBackend {
case HABackendNone:
case HABackendKubernetes:
Expand Down
50 changes: 50 additions & 0 deletions pkg/export/setup/setup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package setup

import (
"context"
"os"
"testing"
"time"

"github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
)

func TestFromFlags_NoMetadataServer(t *testing.T) {
fake := kingpin.New("test", "test")
newExport := FromFlags(fake, "product")
// Our readines is default (3 * 10s), so ensure FromFlags is never longer than 30s.
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

// Fake app invocation.
if _, err := fake.Parse(nil); err != nil {
t.Fatal(err)
}

// Make sure constructor works too.
if _, err := newExport(ctx, log.NewLogfmtLogger(os.Stderr), nil); err != nil {
t.Fatal(err)
}
}

// Regression test for b/344740239

Check failure on line 31 in pkg/export/setup/setup_test.go

View workflow job for this annotation

GitHub Actions / golangci-lint

Comment should end in a period (godot)
func TestFromFlags_NoMetadataServer_SandboxIssue(t *testing.T) {
// TODO(bwplotka): TBD

fake := kingpin.New("test", "test")
newExport := FromFlags(fake, "product")
// Our readines is default (3 * 10s), so ensure FromFlags is never longer than 30s.
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

// Fake app invocation.
if _, err := fake.Parse(nil); err != nil {
t.Fatal(err)
}

// Make sure constructor works too.
if _, err := newExport(ctx, log.NewLogfmtLogger(os.Stderr), nil); err != nil {
t.Fatal(err)
}
}

0 comments on commit e9db076

Please sign in to comment.