diff --git a/docs/adr/011-refresh-outputs-merge-semantics.md b/docs/adr/011-refresh-outputs-merge-semantics.md new file mode 100644 index 00000000..4c117df4 --- /dev/null +++ b/docs/adr/011-refresh-outputs-merge-semantics.md @@ -0,0 +1,59 @@ +# ADR 011: refresh-outputs merge semantics + +**Status:** Accepted +**Date:** 2026-05-06 + +## Context + +`iac/refreshoutputs/refresh.go::refreshOne` previously **replaced** `dst.Outputs` +entirely with the cloud Read response (`live.Outputs`). This was correct for fields +that cloud Read returns, but caused silent data loss for fields that a provider's +Read endpoint does not return. + +The canonical example is DigitalOcean Droplets: `user_data` (cloud-init script) is +accepted at creation time but is **never returned by the Read endpoint**. After a +refresh-outputs pass, state would contain `user_data: ""`. The planner's next +plan/apply cycle would compare `state.user_data=""` against `config.user_data=""` +and emit a REPLACE action. Apply would attempt delete+create; with a Volume already +attached the DO API returned a 422, blocking the TC2 cutover (run 25508442022). + +The previous TC2 run (25507244491) succeeded only because that Droplet was a GHOST +at refresh time — the ghost-tolerance fix (PR #572) skipped it, preserving `user_data` +in state. Once the Droplet was created the next refresh clobbered the field. + +## Decision + +`refreshOne` now **merges** `live.Outputs` into `src.Outputs` rather than replacing +it: + +1. Start from a clone of `src.Outputs` (preserves all create-time fields). +2. For each field `k` in `live.Outputs`: if it is absent or differs from `merged[k]`, + set `merged[k] = v` and mark `needsUpdate = true`. +3. If any field changed, assign `dst.Outputs = merged`. + +Semantics: +- **Cloud truth wins** for any field present in the live Read response. +- **Create-time fields are preserved** for fields absent from the live Read response. +- The existing "skip write when nothing changed" optimisation is retained. + +## Trade-offs + +If a cloud provider truly removes a field from a live resource's outputs (rare for +IaC-managed resources), refresh-outputs will keep the stale value in state. The +planner may not detect the removal unless it also re-reads outputs. + +**This is acceptable** because: +- Provider-managed removal of a previously-set output field is uncommon for + IaC-controlled resources. +- The remediation path is well-defined: `wfctl infra apply --refresh` performs a + full reconcile and will surface the discrepancy as a plan diff. +- The alternative (replace semantics) causes false REPLACE storms for write-only + fields, which is a far more disruptive failure mode. + +## References + +- TC2 run 25508442022 — failure: 422 `storage already associated with another droplet` + caused by `user_data` clobber. +- TC2 run 25507244491 — success: ghost-skip preserved `user_data`. +- PR #572 — ghost-tolerance fix (`ErrResourceNotFound` skip in `refreshOne`). +- DO Droplet API docs: `user_data` is a create-time-only attribute. diff --git a/iac/refreshoutputs/refresh.go b/iac/refreshoutputs/refresh.go index e3c2b914..162d2304 100644 --- a/iac/refreshoutputs/refresh.go +++ b/iac/refreshoutputs/refresh.go @@ -105,19 +105,29 @@ func refreshOne(ctx context.Context, p interfaces.IaCProvider, dst *interfaces.R } return fmt.Errorf("could not refresh %q: %w", src.Name, err) } - if !reflect.DeepEqual(live.Outputs, src.Outputs) { - dst.Outputs = cloneMap(live.Outputs) + // Merge: preserve fields in src.Outputs that don't appear in live.Outputs. + // Some cloud Read endpoints don't return all fields that were captured at + // create-time (e.g., DO Droplet's user_data is write-only on Read). A naive + // replace clobbers those fields, causing plan to falsely detect drift on the + // next plan/apply cycle. Merge ensures refresh-outputs is idempotent for + // fields beyond cloud's Read scope. + // + // Copy-on-write: the clone is only allocated on the first detected change, + // so resources that haven't changed incur no per-resource allocation. + var merged map[string]any + for k, v := range live.Outputs { + if existing, ok := src.Outputs[k]; ok && reflect.DeepEqual(existing, v) { + continue + } + // First change detected: clone src.Outputs (nil-safe) and start merging. + if merged == nil { + merged = make(map[string]any, len(src.Outputs)+len(live.Outputs)) + maps.Copy(merged, src.Outputs) + } + merged[k] = v } - return nil -} - -// cloneMap returns an independent shallow copy of m. Callers receive a map -// they can mutate without aliasing the live driver output. -func cloneMap(m map[string]any) map[string]any { - if m == nil { - return nil + if merged != nil { + dst.Outputs = merged } - c := make(map[string]any, len(m)) - maps.Copy(c, m) - return c + return nil } diff --git a/iac/refreshoutputs/refresh_test.go b/iac/refreshoutputs/refresh_test.go index ca1a2067..345f1d28 100644 --- a/iac/refreshoutputs/refresh_test.go +++ b/iac/refreshoutputs/refresh_test.go @@ -213,3 +213,121 @@ func TestRefresh_PropagateNonGhostError(t *testing.T) { t.Errorf("error should propagate underlying cause; got: %v", err) } } + +// TestRefresh_MergePreservesFieldsNotInRead verifies that fields present in +// src.Outputs but absent from the live Read response are preserved in +// dst.Outputs. This covers cloud providers whose Read endpoints are write-only +// for some fields (e.g., DO Droplet user_data). +func TestRefresh_MergePreservesFieldsNotInRead(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "coredump-staging-droplet", + Type: "infra.droplet", + ProviderID: "droplet-1", + // user_data was captured at create-time; Read won't return it. + Outputs: map[string]any{"id": "x", "user_data": ""}, + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + // provider Read only returns id — user_data is omitted (write-only on Read) + "droplet-1": {"id": "x"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "x" { + t.Errorf("id should be present: %v", got) + } + if got := refreshed[0].Outputs["user_data"]; got != "" { + t.Errorf("user_data should be preserved from src (not in Read response): %v", got) + } +} + +// TestRefresh_LiveOverridesExisting verifies that when the cloud Read response +// returns a field that also exists in src.Outputs with a different value, the +// live (cloud) value wins. +func TestRefresh_LiveOverridesExisting(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "coredump-staging-droplet", + Type: "infra.droplet", + ProviderID: "droplet-2", + Outputs: map[string]any{"id": "x"}, + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + // provider returns updated id — cloud truth wins + "droplet-2": {"id": "y"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "y" { + t.Errorf("id should be updated to cloud value 'y', got: %v", got) + } +} + +// TestRefresh_NewFieldsFromLiveAdded verifies that new fields returned by the +// cloud Read response (not present in src.Outputs) are added to dst.Outputs. +func TestRefresh_NewFieldsFromLiveAdded(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "coredump-staging-droplet", + Type: "infra.droplet", + ProviderID: "droplet-3", + Outputs: map[string]any{"id": "x"}, + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + // provider now also returns private_ip (newly available after provisioning) + "droplet-3": {"id": "x", "private_ip": "10.0.0.5"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "x" { + t.Errorf("id should remain: %v", got) + } + if got := refreshed[0].Outputs["private_ip"]; got != "10.0.0.5" { + t.Errorf("private_ip from live Read should be added: %v", got) + } +} + +// TestRefresh_NilSrcOutputs_LiveFieldsPopulated verifies that when a resource +// has no previously-persisted Outputs (nil map), and the provider Read returns +// non-empty Outputs, the merge correctly populates dst.Outputs without panicking. +// This covers resources that enter state before any outputs were captured. +func TestRefresh_NilSrcOutputs_LiveFieldsPopulated(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "new-droplet", + Type: "infra.droplet", + ProviderID: "droplet-nil", + Outputs: nil, // no persisted outputs yet + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + "droplet-nil": {"id": "abc123", "ip": "1.2.3.4"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "abc123" { + t.Errorf("id should be populated from live Read: %v", got) + } + if got := refreshed[0].Outputs["ip"]; got != "1.2.3.4" { + t.Errorf("ip should be populated from live Read: %v", got) + } +}