Skip to content

Commit

Permalink
fix(export): add support in onGCE + no conn to metadata case
Browse files Browse the repository at this point in the history
Fixes b/344740239 (edge case with GKE Metadata Server and GKE sandbox).

* Added debug logging.
* Updated metadata deps to get googleapis/google-cloud-go#9733 & use timeout-ed context
* Moved risky logic from FromFlags, see code comment why.
* Added regession test.

### Alternatives

Everything we do in FromFlags or constructor is within readines period. We
could consider moving potentially "slow" things on slow network or metadata srv
to exporter.Run. This could be questionable as for GMP to work
we at end need export functionality to work, so delaying that information or
making it surface in separation to readiness might not be helpful.

Signed-off-by: bwplotka <bwplotka@google.com>
  • Loading branch information
bwplotka committed Jun 11, 2024
1 parent 10cccee commit 668618c
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 320 deletions.
3 changes: 1 addition & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/GoogleCloudPlatform/prometheus-engine
go 1.22.3

require (
cloud.google.com/go/compute/metadata v0.2.3
cloud.google.com/go/compute/metadata v0.3.0
cloud.google.com/go/monitoring v1.18.0
github.com/ahmetb/gen-crd-api-reference-docs v0.3.0
github.com/alecthomas/kingpin/v2 v2.4.0
Expand Down Expand Up @@ -47,7 +47,6 @@ require (
)

require (
cloud.google.com/go/compute v1.23.3 // indirect
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.1 // indirect
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.5.1 // indirect
github.com/Azure/azure-sdk-for-go/sdk/internal v1.5.1 // indirect
Expand Down
5 changes: 2 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,12 @@ cloud.google.com/go/compute v1.18.0/go.mod h1:1X7yHxec2Ga+Ss6jPyjxRxpu2uu7PLgsOV
cloud.google.com/go/compute v1.19.0/go.mod h1:rikpw2y+UMidAe9tISo04EHNOIf42RLYF/q8Bs93scU=
cloud.google.com/go/compute v1.19.3/go.mod h1:qxvISKp/gYnXkSAD1ppcSOveRAmzxicEv/JlizULFrI=
cloud.google.com/go/compute v1.20.1/go.mod h1:4tCnrn48xsqlwSAiLf1HXMQk8CONslYbdiEZc9FEIbM=
cloud.google.com/go/compute v1.23.3 h1:6sVlXXBmbd7jNX0Ipq0trII3e4n1/MsADLK6a+aiVlk=
cloud.google.com/go/compute v1.23.3/go.mod h1:VCgBUoMnIVIR0CscqQiPJLAG25E3ZRZMzcFZeQ+h8CI=
cloud.google.com/go/compute/metadata v0.1.0/go.mod h1:Z1VN+bulIf6bt4P/C37K4DyZYZEXYonfTBHHFPO/4UU=
cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k=
cloud.google.com/go/compute/metadata v0.2.1/go.mod h1:jgHgmJd2RKBGzXqF5LR2EZMGxBkeanZ9wwa75XHJgOM=
cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY=
cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA=
cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc=
cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k=
cloud.google.com/go/contactcenterinsights v1.3.0/go.mod h1:Eu2oemoePuEFc/xKFPjbTuPSj0fYJcPls9TFlPNnHHY=
cloud.google.com/go/contactcenterinsights v1.4.0/go.mod h1:L2YzkGbPsv+vMQMCADxJoT9YiTTnSEd6fEvCeHTYVck=
cloud.google.com/go/contactcenterinsights v1.6.0/go.mod h1:IIDlT6CLcDoyv79kDv8iWxMSTZhLxSCofVV5W6YFM/w=
Expand Down
136 changes: 103 additions & 33 deletions pkg/export/setup/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@ import (
"fmt"
"os"
"strconv"
"strings"
"time"

"cloud.google.com/go/compute/metadata"
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/export"
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/lease"
kingpin "github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/google/shlex"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -90,34 +93,13 @@ func Global() *export.Exporter {
// FromFlags returns a constructor for a new exporter that is configured through flags that are
// registered with the given application. The constructor must be called after the flags
// have been parsed.
//
// NOTE(bwplotka): This method should only setup flags, no extra logic should be done here
// as we don't have a logger ready, and nothing was logged for the binary yet.
// Potential risky logic can be moved to the returned function we return here.
// See b/344740239 on how hard is to debug regressions here.
func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Context, log.Logger, prometheus.Registerer) (*export.Exporter, error) {
var opts export.ExporterOpts
env := UAEnvUnspecified

// Default target fields if we can detect them in GCP.
if metadata.OnGCE() {
env = UAEnvGCE
opts.ProjectID, _ = metadata.ProjectID()
opts.Cluster, _ = metadata.InstanceAttributeValue("cluster-name")
if opts.Cluster != "" {
env = UAEnvGKE
}
// These attributes are set for GKE nodes. For the location, we first check
// the cluster location, which may be a zone or a region. We must always use that value
// to avoid collisions with other clusters, as the same cluster name may be reused
// in different locations.
// In particular, we cannot set the location to the node's zone for a regional cluster,
// even though this would provide more accuracy, as there may also be a zonal cluster
// with the same name.
// We only fallback to the node zone as the location if no cluster location exists to
// default for deployments on GCE.
if loc, _ := metadata.InstanceAttributeValue("cluster-location"); loc != "" {
opts.Location = loc
} else {
opts.Location, _ = metadata.Zone()
}
}
opts.UserAgentEnv = env
opts.UserAgentProduct = userAgentProduct

a.Flag("export.disable", "Disable exporting to GCM.").
Expand All @@ -132,20 +114,17 @@ func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Con
a.Flag("export.credentials-file", "Credentials file for authentication with the GCM API.").
Default("").StringVar(&opts.CredentialsFile)

a.Flag("export.label.project-id", fmt.Sprintf("Default project ID set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyProjectID)).
Default(opts.ProjectID).StringVar(&opts.ProjectID)
a.Flag("export.label.project-id", fmt.Sprintf("Default project ID set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyProjectID)).StringVar(&opts.ProjectID)

a.Flag("export.user-agent-mode", fmt.Sprintf("Mode for user agent used for requests against the GCM API. Valid values are %q, %q, %q, %q or %q.", UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM, UAModeUnspecified)).
Default("unspecified").EnumVar(&opts.UserAgentMode, UAModeUnspecified, UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM)
Default(UAModeUnspecified).EnumVar(&opts.UserAgentMode, UAModeUnspecified, UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM)

// The location and cluster flag should probably not be used. On the other hand, they make it easy
// to populate these important values in the monitored resource without interfering with existing
// Prometheus configuration.
a.Flag("export.label.location", fmt.Sprintf("The default location set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyLocation)).
Default(opts.Location).StringVar(&opts.Location)
a.Flag("export.label.location", fmt.Sprintf("The default location set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyLocation)).StringVar(&opts.Location)

a.Flag("export.label.cluster", fmt.Sprintf("The default cluster set for all scraped targets. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyCluster)).
Default(opts.Cluster).StringVar(&opts.Cluster)
a.Flag("export.label.cluster", fmt.Sprintf("The default cluster set for all scraped targets. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyCluster)).StringVar(&opts.Cluster)

a.Flag("export.match", `A Prometheus time series matcher. Can be repeated. Every time series must match at least one of the matchers to be exported. This flag can be used equivalently to the match[] parameter of the Prometheus federation endpoint to selectively export data. (Example: --export.match='{job="prometheus"}' --export.match='{__name__=~"job:.*"})`).
Default("").SetValue(&opts.Matchers)
Expand Down Expand Up @@ -184,7 +163,28 @@ func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Con
kubeName := a.Flag("export.ha.kube.name", "Name for the HA locking resource. Must be identical across replicas. May be set through the KUBE_NAME environment variable.").
Default("").OverrideDefaultFromEnvar("KUBE_NAME").String()

// NOTE(bwplotka): This function will be likely performed within "getting ready" period, so before readiness is
// set to ready. Typical readiness can be as fast as 30s, so make sure this code timeouts faster than that.
return func(ctx context.Context, logger log.Logger, metrics prometheus.Registerer) (*export.Exporter, error) {
level.Debug(logger).Log("msg", "started constructing the GCM export logic")

Check failure on line 169 in pkg/export/setup/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

Error return value of `(github.com/go-kit/log.Logger).Log` is not checked (errcheck)

// NOTE: OnGCE() actually means "onGCP", or really any place with
// certain IP (of metadata server) or DNS entry available. For example on GKE.
if metadata.OnGCE() {
// When, potentially, on GCE we attempt to populate some, unspecified option entries
// like project ID, cluster, location, zone and user agent from GCP metadata server.
//
// This will be used to get *some* data, if not specified override by flags, to
// use if labels or external label settings does not have those set. Those will
// be used as crucial labels for export to work against GCM's Prometheus target.
//
// NOTE: Set a 10s hard limit due to readiness and liveliness probes during this stage.
mctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
tryPopulateUnspecifiedFromMetadata(mctx, logger, &opts)
cancel()
level.Debug(logger).Log("msg", "best-effort on-GCP metadata gathering finished")

Check failure on line 185 in pkg/export/setup/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

Error return value of `(github.com/go-kit/log.Logger).Log` is not checked (errcheck)
}

switch *haBackend {
case HABackendNone:
case HABackendKubernetes:
Expand Down Expand Up @@ -230,3 +230,73 @@ func loadKubeConfig(kubeconfigPath string) (*rest.Config, error) {
func ExtraArgs() ([]string, error) {
return shlex.Split(os.Getenv(ExtraArgsEnvvar))
}

// tryPopulateUnspecifiedFromMetadata assumes we are in GCP, with unknown state
// of the metadata server. This is best-effort, so when the metadata server
// is not accessible on GCP, because it was disabled (404 errors), or not accessible
// (connection refused, slow network, e.g. sandbox + metadata disabled)
// it's a noop. Make sure to pass context with a timeout.
func tryPopulateUnspecifiedFromMetadata(ctx context.Context, logger log.Logger, opts *export.ExporterOpts) {
const (
projectIDPath = "project/project-id"
clusterNamePath = "instance/attributes/cluster-name"
clusterLocationPath = "instance/attributes/cluster-location"
zonePath = "instance/zone"
)

env := UAEnvGCE // This also means GKE with metadata server disabled.

c := metadata.NewClient(nil)
// Mimick metadata.InstanceAttributeValue("cluster-name") but with context.
gkeClusterName, err := c.GetWithContext(ctx, clusterNamePath)
if err != nil {
level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; skipping", "key", clusterNamePath, "err", err)

Check failure on line 253 in pkg/export/setup/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

Error return value of `(github.com/go-kit/log.Logger).Log` is not checked (errcheck)
} else if gkeClusterName != "" {
env = UAEnvGKE
if opts.Cluster == "" {
opts.Cluster = gkeClusterName
}
}

if opts.ProjectID == "" {
// Mimick metadata.ProjectID() but with context.
projectID, err := c.GetWithContext(ctx, projectIDPath)
if err != nil {
level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; skipping", "key", projectIDPath, "err", err)
} else {
opts.ProjectID = strings.TrimSpace(projectID)
}
}
if opts.Location == "" {
// These attributes are set for GKE nodes. For the location, we first check
// the cluster location, which may be a zone or a region. We must always use that value
// to avoid collisions with other clusters, as the same cluster name may be reused
// in different locations.
// In particular, we cannot set the location to the node's zone for a regional cluster,
// even though this would provide more accuracy, as there may also be a zonal cluster
// with the same name.
//
// We only fallback to the node zone as the location if no cluster location exists to
// default for deployments on GCE.

// Mimick metadata.InstanceAttributeValue("cluster-location") but with context.
loc, err := c.GetWithContext(ctx, clusterLocationPath)
if err != nil {
level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; falling back to zone", "key", clusterLocationPath, "err", err)
zone, err := c.GetWithContext(ctx, zonePath)
if err != nil {
level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; skipping", "key", zonePath, "err", err)
} else {
zone = strings.TrimSpace(zone)
// zone is of the form "projects/<projNum>/zones/<zoneName>".
opts.Location = zone[strings.LastIndex(zone, "/")+1:]
}
} else {
opts.Location = loc
}
}
if opts.UserAgentEnv == UAEnvUnspecified {
// We acknowledge that, if user set unspecified on purpose, this will override.
opts.UserAgentEnv = env
}
}
86 changes: 86 additions & 0 deletions pkg/export/setup/setup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package setup

import (
"context"
"net/http"
"net/http/httptest"
"os"
"sync"
"testing"
"time"

"cloud.google.com/go/compute/metadata"
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/export"
"github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
"github.com/google/go-cmp/cmp"
)

func TestFromFlags_NotOnGCE(t *testing.T) {
// Asserting there is actually no GCE underneath.
if metadata.OnGCE() {
t.Fatal("This test assumes we don't run on GCP")
}

fake := kingpin.New("test", "test")
newExport := FromFlags(fake, "product")
// Our readines is default (3 * 10s), so ensure FromFlags is never longer than 30s.
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

// Fake app invocation.
if _, err := fake.Parse(nil); err != nil {
t.Fatal(err)
}

// Make sure constructor works too.
if _, err := newExport(ctx, log.NewLogfmtLogger(os.Stderr), nil); err != nil {
t.Fatal(err)
}
}

// Regression test for b/344740239. We ensure that even stuck metadata servers
// calls will timeout correctly (we propagate context properly).
func TestTryPopulateUnspecifiedFromMetadata(t *testing.T) {
// Asserting there is actually no GCE underneath.
if metadata.OnGCE() {
t.Fatal("This test assumes we don't run on GCP")
}

var wg sync.WaitGroup
wg.Add(1)

// Setup fake HTTP server taking forever.
s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

Check warning on line 54 in pkg/export/setup/setup_test.go

View workflow job for this annotation

GitHub Actions / golangci-lint

unused-parameter: parameter 'w' seems to be unused, consider removing or renaming it as _ (revive)
wg.Wait()
}))
t.Cleanup(func() {
wg.Done()
s.Close()
_ = os.Setenv("GCE_METADATA_HOST", "")
})

// Main "readiness" like timeout that we have to be faster than.
testCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

// Inject metadata URL for our server (does not matter if 404).
if err := os.Setenv("GCE_METADATA_HOST", s.Listener.Addr().String()); err != nil {

Check failure on line 68 in pkg/export/setup/setup_test.go

View workflow job for this annotation

GitHub Actions / golangci-lint

os.Setenv() can be replaced by `t.Setenv()` in TestTryPopulateUnspecifiedFromMetadata (tenv)
t.Fatal(err)
}

// We expect this to finish sooner.
ctx, cancel2 := context.WithTimeout(context.Background(), 1*time.Second)
defer cancel2()
opts := export.ExporterOpts{}
tryPopulateUnspecifiedFromMetadata(ctx, log.NewLogfmtLogger(os.Stderr), &opts)
if diff := cmp.Diff(export.ExporterOpts{}, opts); diff != "" {
t.Fatal("expected no options populated", diff)
}
// We expect to finish the test, before testCtx is cancelled.
// TODO(bwplotka): Assert we are not exiting faster because metadata cannot access our HTTP test server.
// I checked manually for inverse to fail (e.g. setting in ctx longer than textCtx).
if testCtx.Err() != nil {
t.Fatal("tryPopulateUnspecifiedFromMetadata took 30s to complete, it should timeout after 1s")
}
}

0 comments on commit 668618c

Please sign in to comment.