Skip to content

Commit

Permalink
Fix: Network Manager race condition (#1376)
Browse files Browse the repository at this point in the history
* Create restart VM test

* Make the ops agent run after `network-online.target`

* Added retries for network healthchecks

---------

Co-authored-by: Francisco Valente <1435136+franciscovalentecastro@users.noreply.github.com>
  • Loading branch information
avilevy18 and franciscovalentecastro committed Aug 30, 2023
1 parent 253f082 commit c06b141
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 4 deletions.
40 changes: 38 additions & 2 deletions integration_test/ops_agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3953,8 +3953,9 @@ func checkExpectedHealthCheckResult(t *testing.T, output string, name string, ex
func getRecentServiceOutputForPlatform(platform string) string {
if gce.IsWindows(platform) {
cmd := strings.Join([]string{
"$Past = (Get-Date) - (New-TimeSpan -Minute 1)",
"Get-WinEvent -MaxEvents 10 -FilterHashtable @{ Logname='Application'; ProviderName='google-cloud-ops-agent'; StartTime=$Past } | select -ExpandProperty Message",
"$ServiceStart = (Get-EventLog -LogName 'System' -Source 'Service Control Manager' -EntryType 'Information' -Message '*Google Cloud Ops Agent service entered the running state*' -Newest 1).TimeGenerated",
"$QueryStart = $ServiceStart - (New-TimeSpan -Seconds 30)",
"Get-WinEvent -MaxEvents 10 -FilterHashtable @{ Logname='Application'; ProviderName='google-cloud-ops-agent'; StartTime=$QueryStart } | select -ExpandProperty Message",
}, ";")
return cmd
}
Expand Down Expand Up @@ -4233,6 +4234,41 @@ func TestNoNvmlOtelReceiverWithoutGpu(t *testing.T) {
})
}

func TestRestartVM(t *testing.T) {
t.Parallel()
gce.RunForEachPlatform(t, func(t *testing.T, platform string) {
t.Parallel()

ctx, logger, vm := agents.CommonSetup(t, platform)
if err := agents.SetupOpsAgent(ctx, logger.ToMainLog(), vm, ""); err != nil {
t.Fatal(err)
}

cmdOut, err := gce.RunRemotely(ctx, logger.ToMainLog(), vm, "", getRecentServiceOutputForPlatform(vm.Platform))
if err != nil {
t.Fatal(err)
}

// Ensure sure all healthchecks pass before the restart
checkExpectedHealthCheckResult(t, cmdOut.Stdout, "Network", "PASS", "")
checkExpectedHealthCheckResult(t, cmdOut.Stdout, "Ports", "PASS", "")
checkExpectedHealthCheckResult(t, cmdOut.Stdout, "API", "PASS", "")

if err := gce.RestartInstance(ctx, logger, vm); err != nil {
t.Fatal(err)
}

cmdOut, err = gce.RunRemotely(ctx, logger.ToMainLog(), vm, "", getRecentServiceOutputForPlatform(vm.Platform))
if err != nil {
t.Fatal(err)
}

checkExpectedHealthCheckResult(t, cmdOut.Stdout, "Network", "PASS", "")
checkExpectedHealthCheckResult(t, cmdOut.Stdout, "Ports", "PASS", "")
checkExpectedHealthCheckResult(t, cmdOut.Stdout, "API", "PASS", "")
})
}

func TestMain(m *testing.M) {
code := m.Run()
gce.CleanupKeysOrDie()
Expand Down
16 changes: 15 additions & 1 deletion internal/healthchecks/network_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ package healthchecks
import (
"errors"
"net/http"
"time"

"github.com/GoogleCloudPlatform/ops-agent/internal/logs"
"github.com/cenkalti/backoff/v4"
)

type networkRequest struct {
Expand All @@ -29,7 +31,19 @@ type networkRequest struct {
}

func (r networkRequest) SendRequest(logger logs.StructuredLogger) error {
response, err := http.Get(r.url)
var response *http.Response
var err error
bf := backoff.NewExponentialBackOff()
bf.MaxElapsedTime = 60 * time.Second
expTicker := backoff.NewTicker(bf)

for range expTicker.C {
response, err = http.Get(r.url)
if err == nil && response.StatusCode == http.StatusOK {
expTicker.Stop()
break
}
}
if err != nil {
if isTimeoutError(err) || isConnectionRefusedError(err) {
return r.healthCheckError
Expand Down
3 changes: 2 additions & 1 deletion systemd/google-cloud-ops-agent.service
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

[Unit]
Description=Google Cloud Ops Agent
Wants=google-cloud-ops-agent-fluent-bit.service google-cloud-ops-agent-opentelemetry-collector.service google-cloud-ops-agent-diagnostics.service
Wants=google-cloud-ops-agent-fluent-bit.service google-cloud-ops-agent-opentelemetry-collector.service google-cloud-ops-agent-diagnostics.service network-online.target
After=network-online.target

[Service]
Type=oneshot
Expand Down

0 comments on commit c06b141

Please sign in to comment.