Skip to content

Commit

Permalink
fix(export): add support in onGCE + no conn to metadata case (#1023)
Browse files Browse the repository at this point in the history
Cherry-pick of 3 commits from
#1021
  • Loading branch information
bwplotka committed Jun 12, 2024
2 parents 8b1c46d + 69f283a commit 79b11af
Show file tree
Hide file tree
Showing 11 changed files with 288 additions and 321 deletions.
3 changes: 1 addition & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/GoogleCloudPlatform/prometheus-engine
go 1.21

require (
cloud.google.com/go/compute/metadata v0.2.3
cloud.google.com/go/compute/metadata v0.3.0
cloud.google.com/go/monitoring v1.18.0
github.com/ahmetb/gen-crd-api-reference-docs v0.3.0
github.com/alecthomas/kingpin/v2 v2.4.0
Expand Down Expand Up @@ -47,7 +47,6 @@ require (
)

require (
cloud.google.com/go/compute v1.23.3 // indirect
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.1 // indirect
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.5.1 // indirect
github.com/Azure/azure-sdk-for-go/sdk/internal v1.5.1 // indirect
Expand Down
5 changes: 2 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,12 @@ cloud.google.com/go/compute v1.18.0/go.mod h1:1X7yHxec2Ga+Ss6jPyjxRxpu2uu7PLgsOV
cloud.google.com/go/compute v1.19.0/go.mod h1:rikpw2y+UMidAe9tISo04EHNOIf42RLYF/q8Bs93scU=
cloud.google.com/go/compute v1.19.3/go.mod h1:qxvISKp/gYnXkSAD1ppcSOveRAmzxicEv/JlizULFrI=
cloud.google.com/go/compute v1.20.1/go.mod h1:4tCnrn48xsqlwSAiLf1HXMQk8CONslYbdiEZc9FEIbM=
cloud.google.com/go/compute v1.23.3 h1:6sVlXXBmbd7jNX0Ipq0trII3e4n1/MsADLK6a+aiVlk=
cloud.google.com/go/compute v1.23.3/go.mod h1:VCgBUoMnIVIR0CscqQiPJLAG25E3ZRZMzcFZeQ+h8CI=
cloud.google.com/go/compute/metadata v0.1.0/go.mod h1:Z1VN+bulIf6bt4P/C37K4DyZYZEXYonfTBHHFPO/4UU=
cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k=
cloud.google.com/go/compute/metadata v0.2.1/go.mod h1:jgHgmJd2RKBGzXqF5LR2EZMGxBkeanZ9wwa75XHJgOM=
cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY=
cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA=
cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc=
cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k=
cloud.google.com/go/contactcenterinsights v1.3.0/go.mod h1:Eu2oemoePuEFc/xKFPjbTuPSj0fYJcPls9TFlPNnHHY=
cloud.google.com/go/contactcenterinsights v1.4.0/go.mod h1:L2YzkGbPsv+vMQMCADxJoT9YiTTnSEd6fEvCeHTYVck=
cloud.google.com/go/contactcenterinsights v1.6.0/go.mod h1:IIDlT6CLcDoyv79kDv8iWxMSTZhLxSCofVV5W6YFM/w=
Expand Down
146 changes: 112 additions & 34 deletions pkg/export/setup/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@ import (
"fmt"
"os"
"strconv"
"strings"
"time"

"cloud.google.com/go/compute/metadata"
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/export"
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/lease"
kingpin "github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/google/shlex"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -90,34 +93,16 @@ func Global() *export.Exporter {
// FromFlags returns a constructor for a new exporter that is configured through flags that are
// registered with the given application. The constructor must be called after the flags
// have been parsed.
//
// NOTE(bwplotka): This method should only setup flags, no extra logic should be done here
// as we don't have a logger ready, and nothing was logged for the binary yet.
// Potential risky logic can be moved to the returned function we return here.
// See b/344740239 on how hard is to debug regressions here.
func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Context, log.Logger, prometheus.Registerer) (*export.Exporter, error) {
var opts export.ExporterOpts
env := UAEnvUnspecified

// Default target fields if we can detect them in GCP.
if metadata.OnGCE() {
env = UAEnvGCE
opts.ProjectID, _ = metadata.ProjectID()
opts.Cluster, _ = metadata.InstanceAttributeValue("cluster-name")
if opts.Cluster != "" {
env = UAEnvGKE
}
// These attributes are set for GKE nodes. For the location, we first check
// the cluster location, which may be a zone or a region. We must always use that value
// to avoid collisions with other clusters, as the same cluster name may be reused
// in different locations.
// In particular, we cannot set the location to the node's zone for a regional cluster,
// even though this would provide more accuracy, as there may also be a zonal cluster
// with the same name.
// We only fallback to the node zone as the location if no cluster location exists to
// default for deployments on GCE.
if loc, _ := metadata.InstanceAttributeValue("cluster-location"); loc != "" {
opts.Location = loc
} else {
opts.Location, _ = metadata.Zone()
}
}
opts.UserAgentEnv = env
var (
metadataFetchTimeout time.Duration
opts export.ExporterOpts
)
opts.UserAgentProduct = userAgentProduct

a.Flag("export.disable", "Disable exporting to GCM.").
Expand All @@ -132,20 +117,17 @@ func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Con
a.Flag("export.credentials-file", "Credentials file for authentication with the GCM API.").
Default("").StringVar(&opts.CredentialsFile)

a.Flag("export.label.project-id", fmt.Sprintf("Default project ID set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyProjectID)).
Default(opts.ProjectID).StringVar(&opts.ProjectID)
a.Flag("export.label.project-id", fmt.Sprintf("Default project ID set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyProjectID)).StringVar(&opts.ProjectID)

a.Flag("export.user-agent-mode", fmt.Sprintf("Mode for user agent used for requests against the GCM API. Valid values are %q, %q, %q, %q or %q.", UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM, UAModeUnspecified)).
Default("unspecified").EnumVar(&opts.UserAgentMode, UAModeUnspecified, UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM)
Default(UAModeUnspecified).EnumVar(&opts.UserAgentMode, UAModeUnspecified, UAModeGKE, UAModeKubectl, UAModeAVMW, UAModeABM)

// The location and cluster flag should probably not be used. On the other hand, they make it easy
// to populate these important values in the monitored resource without interfering with existing
// Prometheus configuration.
a.Flag("export.label.location", fmt.Sprintf("The default location set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyLocation)).
Default(opts.Location).StringVar(&opts.Location)
a.Flag("export.label.location", fmt.Sprintf("The default location set for all exported data. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyLocation)).StringVar(&opts.Location)

a.Flag("export.label.cluster", fmt.Sprintf("The default cluster set for all scraped targets. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyCluster)).
Default(opts.Cluster).StringVar(&opts.Cluster)
a.Flag("export.label.cluster", fmt.Sprintf("The default cluster set for all scraped targets. Prefer setting the external label %q in the Prometheus configuration if not using the auto-discovered default.", export.KeyCluster)).StringVar(&opts.Cluster)

a.Flag("export.match", `A Prometheus time series matcher. Can be repeated. Every time series must match at least one of the matchers to be exported. This flag can be used equivalently to the match[] parameter of the Prometheus federation endpoint to selectively export data. (Example: --export.match='{job="prometheus"}' --export.match='{__name__=~"job:.*"})`).
Default("").SetValue(&opts.Matchers)
Expand All @@ -165,6 +147,9 @@ func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Con
a.Flag("export.debug.shard-buffer-size", "The buffer size for each individual shard. Each element in buffer (queue) consists of sample and hash.").
Default(strconv.Itoa(export.DefaultShardBufferSize)).UintVar(&opts.Efficiency.ShardBufferSize)

a.Flag("export.debug.fetch-metadata-timeout", "The total timeout for the initial gathering of the best-effort GCP data from the metadata server. This data is used for special labels required by Prometheus metrics (e.g. project id, location, cluster name), as well as information for the user agent. This is done on startup, so make sure this work to be faster than your readiness and liveliness probes.").
Default("10s").DurationVar(&metadataFetchTimeout)

a.Flag("export.token-url", "The request URL to generate token that's needed to ingest metrics to the project").
StringVar(&opts.TokenURL)

Expand All @@ -184,7 +169,30 @@ func FromFlags(a *kingpin.Application, userAgentProduct string) func(context.Con
kubeName := a.Flag("export.ha.kube.name", "Name for the HA locking resource. Must be identical across replicas. May be set through the KUBE_NAME environment variable.").
Default("").OverrideDefaultFromEnvar("KUBE_NAME").String()

// NOTE(bwplotka): This function will be likely performed within "getting ready" period, so before readiness is
// set to ready. Typical readiness can be as fast as 30s, so make sure this code timeouts faster than that.
return func(ctx context.Context, logger log.Logger, metrics prometheus.Registerer) (*export.Exporter, error) {
_ = level.Debug(logger).Log("msg", "started constructing the GCM export logic")

if metadata.OnGCE() {
// NOTE: OnGCE does not guarantee we will have all metadata entries or metadata
// server is accessible.

_ = level.Debug(logger).Log("msg", "detected we might run on GCE node; attempting metadata server access", "timeout", metadataFetchTimeout.String())
// When, potentially, on GCE we attempt to populate some, unspecified option entries
// like project ID, cluster, location, zone and user agent from GCP metadata server.
//
// This will be used to get *some* data, if not specified override by flags, to
// use if labels or external label settings does not have those set. Those will
// be used as crucial labels for export to work against GCM's Prometheus target.
//
// Set a hard time limit due to readiness and liveliness probes during this stage.
mctx, cancel := context.WithTimeout(context.Background(), metadataFetchTimeout)
tryPopulateUnspecifiedFromMetadata(mctx, logger, &opts)
cancel()
_ = level.Debug(logger).Log("msg", "best-effort on-GCE metadata gathering finished")
}

switch *haBackend {
case HABackendNone:
case HABackendKubernetes:
Expand Down Expand Up @@ -230,3 +238,73 @@ func loadKubeConfig(kubeconfigPath string) (*rest.Config, error) {
func ExtraArgs() ([]string, error) {
return shlex.Split(os.Getenv(ExtraArgsEnvvar))
}

// tryPopulateUnspecifiedFromMetadata assumes we are in GCP, with unknown state
// of the metadata server. This is best-effort, so when the metadata server
// is not accessible on GCP, because it was disabled (404 errors), or not accessible
// (connection refused, slow network, e.g. sandbox + metadata disabled)
// it's a noop. Make sure to pass context with a timeout.
func tryPopulateUnspecifiedFromMetadata(ctx context.Context, logger log.Logger, opts *export.ExporterOpts) {
const (
projectIDPath = "project/project-id"
clusterNamePath = "instance/attributes/cluster-name"
clusterLocationPath = "instance/attributes/cluster-location"
zonePath = "instance/zone"
)

env := UAEnvGCE // This also means GKE with metadata server disabled.

c := metadata.NewClient(nil)
// Mimick metadata.InstanceAttributeValue("cluster-name") but with context.
gkeClusterName, err := c.GetWithContext(ctx, clusterNamePath)
if err != nil {
_ = level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; skipping", "key", clusterNamePath, "err", err)
} else if gkeClusterName != "" {
env = UAEnvGKE
if opts.Cluster == "" {
opts.Cluster = gkeClusterName
}
}

if opts.ProjectID == "" {
// Mimick metadata.ProjectID() but with context.
projectID, err := c.GetWithContext(ctx, projectIDPath)
if err != nil {
_ = level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; skipping", "key", projectIDPath, "err", err)
} else {
opts.ProjectID = strings.TrimSpace(projectID)
}
}
if opts.Location == "" {
// These attributes are set for GKE nodes. For the location, we first check
// the cluster location, which may be a zone or a region. We must always use that value
// to avoid collisions with other clusters, as the same cluster name may be reused
// in different locations.
// In particular, we cannot set the location to the node's zone for a regional cluster,
// even though this would provide more accuracy, as there may also be a zonal cluster
// with the same name.
//
// We only fallback to the node zone as the location if no cluster location exists to
// default for deployments on GCE.

// Mimick metadata.InstanceAttributeValue("cluster-location") but with context.
loc, err := c.GetWithContext(ctx, clusterLocationPath)
if err != nil {
_ = level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; falling back to zone", "key", clusterLocationPath, "err", err)
zone, err := c.GetWithContext(ctx, zonePath)
if err != nil {
_ = level.Debug(logger).Log("msg", "fetching entry from GCP metadata server failed; skipping", "key", zonePath, "err", err)
} else {
zone = strings.TrimSpace(zone)
// zone is of the form "projects/<projNum>/zones/<zoneName>".
opts.Location = zone[strings.LastIndex(zone, "/")+1:]
}
} else {
opts.Location = loc
}
}
if opts.UserAgentEnv == UAEnvUnspecified {
// We acknowledge that, if user set unspecified on purpose, this will override.
opts.UserAgentEnv = env
}
}
94 changes: 94 additions & 0 deletions pkg/export/setup/setup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package setup

import (
"context"
"net/http"
"net/http/httptest"
"os"
"sync"
"testing"
"time"

"cloud.google.com/go/compute/metadata"
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/export"
"github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
"github.com/google/go-cmp/cmp"
)

func TestFromFlags_NotOnGCE(t *testing.T) {
// Asserting there is actually no GCE underneath.
if metadata.OnGCE() {
t.Fatal("This test assumes we don't run on GCP")
}

fake := kingpin.New("test", "test")
newExport := FromFlags(fake, "product")
// Our readines is default (3 * 10s), so ensure FromFlags is never longer than 30s.
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

// Fake app invocation.
if _, err := fake.Parse(nil); err != nil {
t.Fatal(err)
}

// Make sure constructor does is not stuck forever.
_, _ = newExport(ctx, log.NewLogfmtLogger(os.Stderr), nil)
}

// Regression test for b/344740239. We ensure that even stuck metadata servers
// calls will timeout correctly (we propagate context properly).
func TestTryPopulateUnspecifiedFromMetadata(t *testing.T) {
// Asserting there is actually no GCE underneath.
if metadata.OnGCE() {
t.Fatal("This test assumes we don't run on GCP")
}

var wg sync.WaitGroup
wg.Add(1)

// Setup fake HTTP server taking forever.
s := httptest.NewServer(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
wg.Wait()
}))
t.Cleanup(func() {
wg.Done()
s.Close()
})

// Main "readiness" like timeout that we have to be faster than.
testCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

// Inject metadata URL for our server (does not matter if 404).
t.Setenv("GCE_METADATA_HOST", s.Listener.Addr().String())

// We expect this to finish sooner.
ctx, cancel2 := context.WithTimeout(context.Background(), 1*time.Second)
defer cancel2()
opts := export.ExporterOpts{}
tryPopulateUnspecifiedFromMetadata(ctx, log.NewLogfmtLogger(os.Stderr), &opts)
if diff := cmp.Diff(export.ExporterOpts{}, opts); diff != "" {
t.Fatal("expected no options populated", diff)
}
// We expect to finish the test, before testCtx is cancelled.
// TODO(bwplotka): Assert we are not exiting faster because metadata cannot access our HTTP test server.
// I checked manually for inverse to fail (e.g. setting in ctx longer than textCtx).
if testCtx.Err() != nil {
t.Fatal("tryPopulateUnspecifiedFromMetadata took 30s to complete, it should timeout after 1s")
}
}
Loading

0 comments on commit 79b11af

Please sign in to comment.