From a51a826fa671ab4bcb2d8e762df92350cecebf3b Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Sat, 2 May 2026 12:02:47 -0400 Subject: [PATCH 1/6] feat(interfaces): add DriftClass enum + Class field to DriftResult MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add DriftClass string type with 4 constants: - DriftClassUnknown (zero value, omitempty-safe for backwards compat) - DriftClassInSync - DriftClassGhost (state has resource; cloud returns ErrResourceNotFound) - DriftClassConfig (both exist; configs differ) Extend DriftResult with Class DriftClass json:"class,omitempty" field (additive, backwards-compatible — consumers without the field see no JSON change due to omitempty). 4 tests covering constant values, omitempty-on-zero, ghost JSON rendering, and round-trip marshal/unmarshal for all 3 non-zero classes. Co-Authored-By: Claude Opus 4.7 (1M context) --- interfaces/iac_provider.go | 26 ++++++++++-- interfaces/iac_provider_test.go | 74 +++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 interfaces/iac_provider_test.go diff --git a/interfaces/iac_provider.go b/interfaces/iac_provider.go index cd8a005b..65ef2c0f 100644 --- a/interfaces/iac_provider.go +++ b/interfaces/iac_provider.go @@ -119,12 +119,32 @@ type ResourceStatus struct { Outputs map[string]any `json:"outputs"` } +// DriftClass classifies the type of drift detected between IaC state and +// actual cloud state. Used by wfctl infra drift output and wfctl infra +// apply --refresh recovery semantics. +type DriftClass string + +const ( + // DriftClassUnknown is the zero value; preserved for backwards compat + // with consumers serialized before the Class field existed. + DriftClassUnknown DriftClass = "" + // DriftClassInSync — state and cloud agree. + DriftClassInSync DriftClass = "in-sync" + // DriftClassGhost — state has the resource; cloud Read returned + // ErrResourceNotFound. Caller can prune via wfctl infra apply --refresh. + DriftClassGhost DriftClass = "ghost" + // DriftClassConfig — state and cloud both have the resource but configs + // differ. Caller reconciles via wfctl infra apply (normal plan path). + DriftClassConfig DriftClass = "config" +) + // DriftResult captures detected drift between declared and actual resource state. type DriftResult struct { Name string `json:"name"` Type string `json:"type"` Drifted bool `json:"drifted"` - Expected map[string]any `json:"expected"` - Actual map[string]any `json:"actual"` - Fields []string `json:"fields"` // which fields drifted + Class DriftClass `json:"class,omitempty"` // additive; omitted when Unknown + Expected map[string]any `json:"expected,omitempty"` + Actual map[string]any `json:"actual,omitempty"` + Fields []string `json:"fields,omitempty"` // which fields drifted } diff --git a/interfaces/iac_provider_test.go b/interfaces/iac_provider_test.go new file mode 100644 index 00000000..f5a772e5 --- /dev/null +++ b/interfaces/iac_provider_test.go @@ -0,0 +1,74 @@ +package interfaces_test + +import ( + "encoding/json" + "strings" + "testing" + + "github.com/GoCodeAlone/workflow/interfaces" +) + +func TestDriftClass_Constants(t *testing.T) { + cases := []struct { + name string + c interfaces.DriftClass + expected string + }{ + {"unknown", interfaces.DriftClassUnknown, ""}, + {"in-sync", interfaces.DriftClassInSync, "in-sync"}, + {"ghost", interfaces.DriftClassGhost, "ghost"}, + {"config", interfaces.DriftClassConfig, "config"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if string(tc.c) != tc.expected { + t.Errorf("got %q, want %q", string(tc.c), tc.expected) + } + }) + } +} + +func TestDriftResult_ClassOmitEmpty(t *testing.T) { + // Class="" (DriftClassUnknown) should be omitted from JSON for backwards compat + r := interfaces.DriftResult{Name: "vpc", Type: "infra.vpc", Drifted: false} + b, err := json.Marshal(r) + if err != nil { + t.Fatal(err) + } + if strings.Contains(string(b), `"class"`) { + t.Errorf("expected no class field with empty Class, got %s", b) + } +} + +func TestDriftResult_ClassPresent(t *testing.T) { + r := interfaces.DriftResult{Name: "vpc", Type: "infra.vpc", Drifted: true, Class: interfaces.DriftClassGhost} + b, err := json.Marshal(r) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(string(b), `"class":"ghost"`) { + t.Errorf("expected class:ghost in JSON, got %s", b) + } +} + +func TestDriftResult_ClassRoundTrip(t *testing.T) { + cases := []interfaces.DriftClass{ + interfaces.DriftClassInSync, + interfaces.DriftClassGhost, + interfaces.DriftClassConfig, + } + for _, c := range cases { + orig := interfaces.DriftResult{Name: "res", Type: "infra.vpc", Drifted: true, Class: c} + b, err := json.Marshal(orig) + if err != nil { + t.Fatalf("marshal %q: %v", c, err) + } + var got interfaces.DriftResult + if err := json.Unmarshal(b, &got); err != nil { + t.Fatalf("unmarshal %q: %v", c, err) + } + if got.Class != c { + t.Errorf("round-trip Class: got %q, want %q", got.Class, c) + } + } +} From 6d8625846d980a73f4b7e6e5e9e95b1ff9d29e14 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Sat, 2 May 2026 12:04:29 -0400 Subject: [PATCH 2/6] feat(wfctl): implement runInfraApplyRefreshPhase for ghost-prune recovery New function runInfraApplyRefreshPhase calls provider.DetectDrift and prunes ghost-in-state entries (DriftClassGhost) from the state store: - Dry-run by default (no autoApprove): prints "would prune" per ghost - autoApprove=true: calls store.DeleteResource + emits audit log to stderr - Protected resources blocked unless allowProtectedPrune=true - Transient DetectDrift errors propagate immediately; no pruning happens - DriftClassConfig / DriftClassInSync entries skipped (regular plan path) 6 tests covering: dry-run no-mutate, auto-approve prune, protected-block, protected-with-flag, transient-error-propagation, in-sync-skip. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/wfctl/infra_apply_refresh.go | 98 ++++++++++ cmd/wfctl/infra_apply_refresh_test.go | 246 ++++++++++++++++++++++++++ 2 files changed, 344 insertions(+) create mode 100644 cmd/wfctl/infra_apply_refresh.go create mode 100644 cmd/wfctl/infra_apply_refresh_test.go diff --git a/cmd/wfctl/infra_apply_refresh.go b/cmd/wfctl/infra_apply_refresh.go new file mode 100644 index 00000000..36f3d40e --- /dev/null +++ b/cmd/wfctl/infra_apply_refresh.go @@ -0,0 +1,98 @@ +package main + +import ( + "context" + "fmt" + "io" + "time" + + "github.com/GoCodeAlone/workflow/interfaces" +) + +// runInfraApplyRefreshPhase detects drift against the given provider and prunes +// ghost-in-state entries (where cloud Read returned ErrResourceNotFound). It is +// called by runInfraApply when --refresh is set. +// +// Behavior: +// - For each DriftClassGhost result: if autoApprove is false, prints a dry-run +// "would prune" line. If autoApprove is true, calls store.DeleteResource and +// emits an audit log line to stderr. +// - Protected resources (protected: true in state Outputs) are blocked unless +// allowProtectedPrune is also set. Without that flag, an error is returned and +// no prunes happen. +// - DriftClassConfig and DriftClassInSync results are left for the regular plan +// - apply phase; this function does not touch them. +// - If provider.DetectDrift returns a non-nil error, the error is propagated +// immediately and no pruning happens (transient API errors must NOT cause +// state loss). +// +// All parameters must be non-nil except states (nil is valid = no state for +// protected-resource lookup). stdout receives human-readable progress; stderr +// receives audit log lines. +func runInfraApplyRefreshPhase( + ctx context.Context, + provider interfaces.IaCProvider, + refs []interfaces.ResourceRef, + store infraStateStore, + autoApprove bool, + allowProtectedPrune bool, + states []interfaces.ResourceState, + stdout io.Writer, + stderr io.Writer, +) error { + if len(refs) == 0 { + fmt.Fprintln(stdout, "Refresh: no state to check.") + return nil + } + + results, err := provider.DetectDrift(ctx, refs) + if err != nil { + // Transient or auth error — propagate; do NOT prune anything. + return fmt.Errorf("detect drift: %w", err) + } + + for _, r := range results { + if r.Class != interfaces.DriftClassGhost { + // In-sync or config-drift: leave for regular plan/apply phase. + continue + } + + isProtected := isRefProtected(states, r.Name) + if isProtected && !allowProtectedPrune { + // Hard-block: return an error immediately so the caller sees the + // problem. No prunes have happened at this point. + fmt.Fprintf(stderr, "wfctl: BLOCKED: %s is protected; cannot prune without --allow-protected-prune\n", r.Name) + return fmt.Errorf("refresh: blocked on protected resource %q (use --allow-protected-prune to override)", r.Name) + } + + if !autoApprove { + // Dry-run: report what would happen without mutating. + fmt.Fprintf(stdout, "Refresh: would prune ghost %s (%s) — cloud reports not found.\n", r.Name, r.Type) + continue + } + + // Emit audit log before mutation so the log entry is always present, + // even if DeleteResource fails. + fmt.Fprintf(stderr, "wfctl: state mutation prune %s (type=%s protected=%v) reason=ghost-in-state at %s\n", + r.Name, r.Type, isProtected, time.Now().UTC().Format(time.RFC3339)) + + if err := store.DeleteResource(ctx, r.Name); err != nil { + return fmt.Errorf("refresh: prune %s: %w", r.Name, err) + } + fmt.Fprintf(stdout, "Refresh: pruned ghost %s (%s)\n", r.Name, r.Type) + } + return nil +} + +// isRefProtected returns true if the named resource has protected: true in its +// state Outputs map. +func isRefProtected(states []interfaces.ResourceState, name string) bool { + for i := range states { + if states[i].Name == name { + if p, ok := states[i].Outputs["protected"].(bool); ok && p { + return true + } + } + } + return false +} diff --git a/cmd/wfctl/infra_apply_refresh_test.go b/cmd/wfctl/infra_apply_refresh_test.go new file mode 100644 index 00000000..1916b488 --- /dev/null +++ b/cmd/wfctl/infra_apply_refresh_test.go @@ -0,0 +1,246 @@ +package main + +import ( + "bytes" + "context" + "errors" + "strings" + "testing" + + "github.com/GoCodeAlone/workflow/interfaces" +) + +// ── fakes for refresh tests ──────────────────────────────────────────────────── + +// refreshFakeProvider stubs DetectDrift to return caller-supplied results. +// All other IaCProvider methods are no-ops — refresh tests only exercise +// the DetectDrift → state-mutation path. +type refreshFakeProvider struct { + driftResults []interfaces.DriftResult + driftErr error +} + +func (f *refreshFakeProvider) Name() string { return "fake-refresh" } +func (f *refreshFakeProvider) Version() string { return "0.0.0" } +func (f *refreshFakeProvider) Initialize(_ context.Context, _ map[string]any) error { return nil } +func (f *refreshFakeProvider) Capabilities() []interfaces.IaCCapabilityDeclaration { return nil } +func (f *refreshFakeProvider) Plan(_ context.Context, _ []interfaces.ResourceSpec, _ []interfaces.ResourceState) (*interfaces.IaCPlan, error) { + return &interfaces.IaCPlan{}, nil +} +func (f *refreshFakeProvider) Apply(_ context.Context, _ *interfaces.IaCPlan) (*interfaces.ApplyResult, error) { + return &interfaces.ApplyResult{}, nil +} +func (f *refreshFakeProvider) Destroy(_ context.Context, _ []interfaces.ResourceRef) (*interfaces.DestroyResult, error) { + return nil, nil +} +func (f *refreshFakeProvider) Status(_ context.Context, _ []interfaces.ResourceRef) ([]interfaces.ResourceStatus, error) { + return nil, nil +} +func (f *refreshFakeProvider) DetectDrift(_ context.Context, _ []interfaces.ResourceRef) ([]interfaces.DriftResult, error) { + return f.driftResults, f.driftErr +} +func (f *refreshFakeProvider) Import(_ context.Context, _ string, _ string) (*interfaces.ResourceState, error) { + return nil, nil +} +func (f *refreshFakeProvider) ResolveSizing(_ string, _ interfaces.Size, _ *interfaces.ResourceHints) (*interfaces.ProviderSizing, error) { + return nil, nil +} +func (f *refreshFakeProvider) ResourceDriver(_ string) (interfaces.ResourceDriver, error) { + return nil, nil +} +func (f *refreshFakeProvider) SupportedCanonicalKeys() []string { return nil } +func (f *refreshFakeProvider) BootstrapStateBackend(_ context.Context, _ map[string]any) (*interfaces.BootstrapResult, error) { + return nil, nil +} +func (f *refreshFakeProvider) Close() error { return nil } + +// countingStore is an infraStateStore that counts DeleteResource calls and +// records the deleted names. +type countingStore struct { + deleteCount int + deletedNames []string +} + +func (c *countingStore) ListResources(_ context.Context) ([]interfaces.ResourceState, error) { + return nil, nil +} +func (c *countingStore) SaveResource(_ context.Context, _ interfaces.ResourceState) error { + return nil +} +func (c *countingStore) DeleteResource(_ context.Context, name string) error { + c.deleteCount++ + c.deletedNames = append(c.deletedNames, name) + return nil +} + +// stateWithProtected returns []ResourceState where resourceName has protected=true in Outputs. +func stateWithProtected(resourceName string) []interfaces.ResourceState { + return []interfaces.ResourceState{ + { + ID: resourceName, + Name: resourceName, + Type: "infra.vpc", + Outputs: map[string]any{"protected": true}, + }, + } +} + +// ── tests ────────────────────────────────────────────────────────────────────── + +func TestApplyRefresh_DryRunPrintsPrunesWithoutMutating(t *testing.T) { + ghost := interfaces.DriftResult{ + Name: "test-vpc", + Type: "infra.vpc", + Drifted: true, + Class: interfaces.DriftClassGhost, + } + provider := &refreshFakeProvider{driftResults: []interfaces.DriftResult{ghost}} + store := &countingStore{} + refs := []interfaces.ResourceRef{{Name: "test-vpc", Type: "infra.vpc"}} + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + false /* autoApprove */, false /* allowProtectedPrune */, nil /* states */, &stdout, &stderr) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if store.deleteCount != 0 { + t.Errorf("dry-run: expected 0 deletes, got %d", store.deleteCount) + } + if !strings.Contains(stdout.String(), "would prune") { + t.Errorf("dry-run: expected 'would prune' in stdout, got:\n%s", stdout.String()) + } + if !strings.Contains(stdout.String(), "test-vpc") { + t.Errorf("dry-run: expected resource name in output, got:\n%s", stdout.String()) + } +} + +func TestApplyRefresh_AutoApprovePrunesAndApplies(t *testing.T) { + ghost := interfaces.DriftResult{ + Name: "test-vpc", + Type: "infra.vpc", + Drifted: true, + Class: interfaces.DriftClassGhost, + } + provider := &refreshFakeProvider{driftResults: []interfaces.DriftResult{ghost}} + store := &countingStore{} + refs := []interfaces.ResourceRef{{Name: "test-vpc", Type: "infra.vpc"}} + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, false /* allowProtectedPrune */, nil /* states */, &stdout, &stderr) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if store.deleteCount != 1 { + t.Errorf("auto-approve: expected 1 delete, got %d", store.deleteCount) + } + if len(store.deletedNames) == 0 || store.deletedNames[0] != "test-vpc" { + t.Errorf("auto-approve: expected deleted name=test-vpc, got %v", store.deletedNames) + } + // Audit log must appear on stderr + if !strings.Contains(stderr.String(), "test-vpc") { + t.Errorf("auto-approve: expected audit log on stderr mentioning test-vpc, got:\n%s", stderr.String()) + } +} + +func TestApplyRefresh_ProtectedResourceBlockedWithoutFlag(t *testing.T) { + ghost := interfaces.DriftResult{ + Name: "protected-vpc", + Type: "infra.vpc", + Drifted: true, + Class: interfaces.DriftClassGhost, + } + provider := &refreshFakeProvider{driftResults: []interfaces.DriftResult{ghost}} + store := &countingStore{} + refs := []interfaces.ResourceRef{{Name: "protected-vpc", Type: "infra.vpc"}} + states := stateWithProtected("protected-vpc") + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, false /* allowProtectedPrune — NOT set */, states, &stdout, &stderr) + + if err == nil { + t.Fatal("expected error for protected resource without flag, got nil") + } + if !strings.Contains(err.Error(), "protected") { + t.Errorf("expected error to mention 'protected', got: %v", err) + } + if store.deleteCount != 0 { + t.Errorf("protected: expected 0 deletes, got %d", store.deleteCount) + } +} + +func TestApplyRefresh_ProtectedResourcePrunedWithFlag(t *testing.T) { + ghost := interfaces.DriftResult{ + Name: "protected-vpc", + Type: "infra.vpc", + Drifted: true, + Class: interfaces.DriftClassGhost, + } + provider := &refreshFakeProvider{driftResults: []interfaces.DriftResult{ghost}} + store := &countingStore{} + refs := []interfaces.ResourceRef{{Name: "protected-vpc", Type: "infra.vpc"}} + states := stateWithProtected("protected-vpc") + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, true /* allowProtectedPrune */, states, &stdout, &stderr) + + if err != nil { + t.Fatalf("unexpected error with allow-protected-prune: %v", err) + } + if store.deleteCount != 1 { + t.Errorf("allow-protected-prune: expected 1 delete, got %d", store.deleteCount) + } + // Audit log must appear on stderr mentioning the resource + if !strings.Contains(stderr.String(), "protected-vpc") { + t.Errorf("allow-protected-prune: expected audit log on stderr, got:\n%s", stderr.String()) + } +} + +func TestApplyRefresh_TransientErrorDoesNotPrune(t *testing.T) { + transientErr := errors.New("DO API rate limit exceeded") + provider := &refreshFakeProvider{driftErr: transientErr} + store := &countingStore{} + refs := []interfaces.ResourceRef{{Name: "test-vpc", Type: "infra.vpc"}} + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, false /* allowProtectedPrune */, nil /* states */, &stdout, &stderr) + + if err == nil { + t.Fatal("expected transient error to propagate, got nil") + } + if !errors.Is(err, transientErr) { + t.Errorf("expected wrapped transient error, got: %v", err) + } + if store.deleteCount != 0 { + t.Errorf("transient error: expected 0 deletes, got %d", store.deleteCount) + } +} + +func TestApplyRefresh_InSyncResourceSkipped(t *testing.T) { + inSync := interfaces.DriftResult{ + Name: "test-vpc", + Type: "infra.vpc", + Drifted: false, + Class: interfaces.DriftClassInSync, + } + provider := &refreshFakeProvider{driftResults: []interfaces.DriftResult{inSync}} + store := &countingStore{} + refs := []interfaces.ResourceRef{{Name: "test-vpc", Type: "infra.vpc"}} + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, false /* allowProtectedPrune */, nil, &stdout, &stderr) + + if err != nil { + t.Fatalf("unexpected error for in-sync: %v", err) + } + if store.deleteCount != 0 { + t.Errorf("in-sync: expected 0 deletes, got %d", store.deleteCount) + } +} From 829667d400885b195c57f11f88dbe6c00c2f2bf4 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Sat, 2 May 2026 12:06:02 -0400 Subject: [PATCH 3/6] feat(wfctl): wire --refresh + --allow-protected-prune flags to infra apply Add two flags to runInfraApply: - --refresh: runs runInfraApplyRefreshPhase before plan+apply, iterating all state-tracked provider groups via groupStatesByProvider and pruning any DriftClassGhost entries. - --allow-protected-prune: passed to runInfraApplyRefreshPhase to permit pruning resources with protected:true in state Outputs. Refresh phase only fires when --refresh is set and the config has infra.* modules; silently skipped for legacy platform.* configs. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/wfctl/infra.go | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/cmd/wfctl/infra.go b/cmd/wfctl/infra.go index 45e85940..02e21cb7 100644 --- a/cmd/wfctl/infra.go +++ b/cmd/wfctl/infra.go @@ -944,6 +944,10 @@ func runInfraApply(args []string) error { fs.StringVar(&envName, "env", "", "Environment name (resolves per-module environments: overrides)") var planFile string fs.StringVar(&planFile, "plan", "", "Apply from a pre-emitted plan.json (skips ComputePlan)") + var refreshFlag bool + fs.BoolVar(&refreshFlag, "refresh", false, "Detect drift and prune ghost-in-state entries before applying") + var allowProtectedPruneFlag bool + fs.BoolVar(&allowProtectedPruneFlag, "allow-protected-prune", false, "Allow pruning state entries for resources marked protected: true (requires --refresh)") autoApprove := &autoApproveVal showSensitive := showSensitiveVal if err := fs.Parse(args); err != nil { @@ -1006,6 +1010,39 @@ func runInfraApply(args []string) error { } } + // --refresh: detect drift first and prune ghost-in-state entries (cloud 404s) + // before running the normal plan + apply. Only applicable for infra.* configs; + // silently skipped for legacy platform.* configs. + if refreshFlag && hasInfraModules(cfgFile) { + fmt.Println("Refreshing state (detecting drift)...") + store, storeErr := resolveStateStore(cfgFile, envName) + if storeErr != nil { + return fmt.Errorf("open state store for refresh: %w", storeErr) + } + states, statesErr := store.ListResources(ctx) + if statesErr != nil { + return fmt.Errorf("list state for refresh: %w", statesErr) + } + groups, groupOrder := groupStatesByProvider(states, cfgFile, envName) + for _, moduleRef := range groupOrder { + g := groups[moduleRef] + provider, closer, provErr := resolveIaCProvider(ctx, g.provType, g.provCfg) + if provErr != nil { + return fmt.Errorf("refresh: load provider %q: %w", moduleRef, provErr) + } + refreshErr := runInfraApplyRefreshPhase(ctx, provider, g.refs, store, + *autoApprove, allowProtectedPruneFlag, states, os.Stdout, os.Stderr) + if closer != nil { + if cerr := closer.Close(); cerr != nil { + fmt.Fprintf(os.Stderr, "warning: provider %q shutdown: %v\n", g.provType, cerr) + } + } + if refreshErr != nil { + return fmt.Errorf("refresh phase: %w", refreshErr) + } + } + } + fmt.Printf("Applying infrastructure from %s...\n", cfgFile) // --plan: dispatch actions from a pre-emitted plan file, skipping ComputePlan. From d9ec7b498811a3038a0d0a5f85f309bc6999fcd4 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Sat, 2 May 2026 12:07:00 -0400 Subject: [PATCH 4/6] feat(wfctl): extend infra drift output with Class column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit driftInfraModules now prints drift class (GHOST / CONFIG / IN-SYNC) using the DriftClass constants from interfaces: GHOST — cloud reports not found CONFIG : expected= actual= IN-SYNC Providers still returning DriftClassUnknown fall through to the legacy Drifted-bool behavior for backwards compatibility. Column-aligned format matches wfctl infra status output style. Drift-found message updated to suggest --refresh flag. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/wfctl/infra_status_drift.go | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/cmd/wfctl/infra_status_drift.go b/cmd/wfctl/infra_status_drift.go index 0c31c334..df55f3d8 100644 --- a/cmd/wfctl/infra_status_drift.go +++ b/cmd/wfctl/infra_status_drift.go @@ -108,22 +108,41 @@ func driftInfraModules(ctx context.Context, cfgFile, envName string) error { found := false for _, d := range results { - if d.Drifted { + switch d.Class { + case interfaces.DriftClassGhost: found = true - fmt.Printf(" DRIFT %s (%s)\n", d.Name, d.Type) + fmt.Printf(" GHOST %-40s %-20s — cloud reports not found\n", d.Name, d.Type) + case interfaces.DriftClassConfig: + found = true + fmt.Printf(" CONFIG %-40s %-20s\n", d.Name, d.Type) for k, v := range d.Expected { actual := d.Actual[k] if fmt.Sprintf("%v", v) != fmt.Sprintf("%v", actual) { fmt.Printf(" %s: expected=%v actual=%v\n", k, v, actual) } } - } else { - fmt.Printf(" OK %s (%s)\n", d.Name, d.Type) + case interfaces.DriftClassInSync: + fmt.Printf(" IN-SYNC %-40s %-20s\n", d.Name, d.Type) + default: + // DriftClassUnknown — fall through to legacy Drifted-bool behavior + // for providers that have not yet adopted the Class field. + if d.Drifted { + found = true + fmt.Printf(" DRIFT %-40s %-20s\n", d.Name, d.Type) + for k, v := range d.Expected { + actual := d.Actual[k] + if fmt.Sprintf("%v", v) != fmt.Sprintf("%v", actual) { + fmt.Printf(" %s: expected=%v actual=%v\n", k, v, actual) + } + } + } else { + fmt.Printf(" OK %-40s %-20s\n", d.Name, d.Type) + } } } if len(results) == 0 { for _, ref := range g.refs { - fmt.Printf(" OK %s (%s) (provider returned no drift result)\n", ref.Name, ref.Type) + fmt.Printf(" OK %-40s %-20s (provider returned no drift result)\n", ref.Name, ref.Type) } } return found @@ -135,7 +154,7 @@ func driftInfraModules(ctx context.Context, cfgFile, envName string) error { } if driftFound { - return fmt.Errorf("drift detected — run 'wfctl infra apply' to reconcile") + return fmt.Errorf("drift detected — run 'wfctl infra apply --refresh' to prune ghosts and reconcile") } return nil } From 3ed371ccecc517e95b162a19abf01cfb9bc9e96e Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Sat, 2 May 2026 12:08:12 -0400 Subject: [PATCH 5/6] docs: add drift-recovery operator guide + CHANGELOG Unreleased entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit docs/wfctl/drift-recovery.md (~100 lines) covering: - Three drift classes (ghost / config / in-sync) with recovery actions - wfctl infra drift usage + example output with Class column - Dry-run-first workflow → auto-approve prune - Protected resource two-key contract (--allow-protected-prune) - Audit log format - Production safety checklist - CI integration patterns CHANGELOG.md Unreleased section: DriftClass enum, --refresh flag, --allow-protected-prune flag, drift output Class column, docs file. Notes omitempty additions to DriftResult.Expected/Actual/Fields. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 27 +++++++ docs/wfctl/drift-recovery.md | 142 +++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 docs/wfctl/drift-recovery.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ea2be5e8..9a884c95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,33 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + +- **`interfaces.DriftClass`** enum with constants `DriftClassUnknown` (zero value, omitempty-safe), + `DriftClassInSync`, `DriftClassGhost`, `DriftClassConfig`. Additive `Class DriftClass` field on + `DriftResult` with `json:"class,omitempty"` — backwards-compatible with all existing consumers. +- **`wfctl infra apply --refresh`** flag: detect drift first, prune ghost-in-state entries (cloud + returns 404 but state has the resource) before running the normal plan+apply phase. + Default is dry-run (prints "would prune" without mutating); pass `--auto-approve` to execute. +- **`wfctl infra apply --allow-protected-prune`** flag: required two-key for pruning resources with + `protected: true` in their state Outputs. Without it, protected ghosts cause an immediate error + with a clear message. +- **`wfctl infra drift`** CLI output now prints drift class column (GHOST / CONFIG / IN-SYNC) + for actionable operator feedback. Providers returning `DriftClassUnknown` fall through to the + legacy Drifted-bool behavior. +- **`docs/wfctl/drift-recovery.md`** operator procedure covering detect→dry-run→approve flow, + protected-resource handling, audit-log format, and CI integration examples. + +### Changed + +- `driftInfraModules` uses `DriftClass` constants for output classification; drift-found message + updated to suggest `wfctl infra apply --refresh`. +- `DriftResult.Expected`, `DriftResult.Actual`, and `DriftResult.Fields` now carry `omitempty` + tags (additive — previously these fields serialised as `null` / `[]` in JSON; they are now + omitted entirely when empty, which is what most consumers expect). + ## [0.18.11.1] - 2026-04-25 ### Fixed diff --git a/docs/wfctl/drift-recovery.md b/docs/wfctl/drift-recovery.md new file mode 100644 index 00000000..bb1627d5 --- /dev/null +++ b/docs/wfctl/drift-recovery.md @@ -0,0 +1,142 @@ +# wfctl infra drift recovery + +Operator procedure for detecting and recovering from IaC state drift. + +## What is drift? + +Drift is a divergence between the IaC state store (what wfctl believes exists) +and the actual cloud state (what the provider API reports). Three classes: + +| Class | Description | Recovery | +|-------|-------------|----------| +| `ghost` | State says resource exists; cloud returns 404 | Prune state entry via `--refresh` | +| `config` | Both exist but configs differ (e.g. someone edited cloud-side) | Reconcile via normal `wfctl infra apply` | +| `in-sync` | State and cloud agree | No action needed | + +## Detect drift + +```sh +wfctl infra drift -c infra.yaml --env staging +``` + +Example output: + +``` +Detecting drift for infra.yaml... + GHOST coredump-staging-vpc infra.vpc — cloud reports not found + GHOST coredump-staging-db infra.database — cloud reports not found + IN-SYNC coredump-staging infra.container_service + IN-SYNC coredump-nats-staging infra.container_service + +drift detected — run 'wfctl infra apply --refresh' to prune ghosts and reconcile +``` + +Exit code is non-zero when any drift is found. Use in CI to gate deploys. + +## Recover ghost-in-state entries + +A ghost means the cloud resource was deleted outside of wfctl (manual delete, +billing expiry, failed apply that partially cleaned up, etc.). The state store +still has a record, so wfctl's next plan sees no action — the resource stays +deleted with no creates. + +**Step 1 — dry-run (always do this first):** + +```sh +wfctl infra apply --refresh -c infra.yaml --env staging +``` + +This runs drift detection and prints what *would* be pruned — no state +mutations occur. Confirm the listed ghosts are genuine 404s. + +**Step 2 — approve and prune:** + +```sh +wfctl infra apply --refresh -c infra.yaml --env staging --auto-approve +``` + +For each ghost, wfctl: +1. Emits an audit log line to stderr (timestamp + resource name + type) +2. Calls `store.DeleteResource` to remove the state entry +3. Prints `Refresh: pruned ghost ()` + +After pruning, the normal plan+apply phase runs. The plan will now generate +`create` actions for the pruned resources (since they are absent from both +state and cloud). + +## Protected resources + +Resources with `protected: true` in their applied state outputs are guarded +against accidental pruning. Without an explicit override, the command errors +and prints: + +``` +wfctl: BLOCKED: coredump-staging-db is protected; cannot prune without --allow-protected-prune +``` + +To prune a protected resource, pass both flags: + +```sh +wfctl infra apply --refresh --allow-protected-prune -c infra.yaml --env staging --auto-approve +``` + +This is a two-key contract: `--refresh` + `--allow-protected-prune` must both +be present to permit the prune. Use this only after manually confirming the +cloud resource is genuinely absent (e.g. `doctl databases list`, AWS console). + +## Audit log + +Every state mutation (prune) emits a line to stderr: + +``` +wfctl: state mutation prune (type= protected=) reason=ghost-in-state at +``` + +In CI, capture stderr to your run logs. These lines are machine-parseable for +audit trail requirements. + +## Production safety checklist + +Before running `--auto-approve` in a production environment: + +- [ ] Run `wfctl infra drift` and review all ghost entries +- [ ] Verify each ghost is a genuine 404 via cloud console or provider CLI +- [ ] Confirm the state store backup is current (if applicable) +- [ ] Confirm no in-flight apply is running against the same state backend +- [ ] For protected resources: double-check the resource is not in a soft-delete + / pending state (some providers briefly return 404 during deletion) + +## Config drift (class=config) + +Config drift does not require `--refresh`. The normal `wfctl infra apply` +reconciles config drift by re-applying the declared spec to the cloud resource. +The `wfctl infra drift` output shows field-level differences: + +``` + CONFIG my-cluster infra.k8s_cluster + node_count: expected=3 actual=2 +``` + +Run `wfctl infra apply -c infra.yaml --env staging --auto-approve` to converge. + +## CI integration + +Add drift detection as a blocking check before deploy: + +```yaml +- name: Detect drift + run: wfctl infra drift -c infra.yaml --env staging + # exits non-zero if any drift; fails the step +``` + +Add ghost-recovery as a separate manual workflow step to run before re-deploy +after an interrupted apply: + +```yaml +- name: Recover state drift (manual) + run: wfctl infra apply --refresh -c infra.yaml --env staging --auto-approve + if: github.event_name == 'workflow_dispatch' +``` + +See `docs/plans/2026-05-02-infra-drift-recovery.md` for the full recovery +design rationale. From a198454bcf06b402e417b3724d7798278c843d47 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Sat, 2 May 2026 12:25:53 -0400 Subject: [PATCH 6/6] fix(wfctl): address 4 Copilot review findings on apply --refresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Important #1 — pre-scan all ghosts for protected resources before any state mutation (infra_apply_refresh.go). The original loop could prune an unprotected ghost then fail on a protected one, leaving partial state. Two-pass pattern: collect all blocked names first, return error listing every blocked resource, then execute mutations only when pre-scan passes. Important #2 — validate --allow-protected-prune requires --refresh (infra.go). Without this check the flag was silently no-op'd, misleading operators. Now returns a clear pre-flight error before any work begins. Minor #3 — replace broken docs/plans/2026-05-02-infra-drift-recovery.md link in drift-recovery.md (design worktree path, never merged) with a pointer to the canonical source file. Minor #4 — markdown table was already correct standard format; no change needed (table separator rows are standard |---|---|). Tests added: - TestApplyRefresh_MultipleGhostsAllOrNothing (all-or-nothing invariant) - TestApplyRefresh_AllGhostsUnprotectedPrunesAll (pre-scan allows clean batch) - TestInfraApply_AllowProtectedPruneRequiresRefresh (flag validation) Co-Authored-By: Claude Sonnet 4.6 --- cmd/wfctl/infra.go | 7 ++ cmd/wfctl/infra_apply_refresh.go | 34 +++++-- cmd/wfctl/infra_apply_refresh_test.go | 109 +++++++++++++++++++++++ cmd/wfctl/infra_apply_validation_test.go | 13 +++ docs/wfctl/drift-recovery.md | 5 +- 5 files changed, 159 insertions(+), 9 deletions(-) diff --git a/cmd/wfctl/infra.go b/cmd/wfctl/infra.go index 02e21cb7..f4fc129f 100644 --- a/cmd/wfctl/infra.go +++ b/cmd/wfctl/infra.go @@ -955,6 +955,13 @@ func runInfraApply(args []string) error { } _ = showSensitive // used in apply progress output when provider integration is complete + // Pre-flight: --allow-protected-prune is only meaningful with --refresh. + // Without --refresh, the flag is silently ignored, which could mislead + // operators into believing they have authorized a dangerous prune operation. + if allowProtectedPruneFlag && !refreshFlag { + return fmt.Errorf("--allow-protected-prune requires --refresh") + } + cfgFile := configFlag if cfgFile == "" { var err error diff --git a/cmd/wfctl/infra_apply_refresh.go b/cmd/wfctl/infra_apply_refresh.go index 36f3d40e..4015e366 100644 --- a/cmd/wfctl/infra_apply_refresh.go +++ b/cmd/wfctl/infra_apply_refresh.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io" + "strings" "time" "github.com/GoCodeAlone/workflow/interfaces" @@ -51,6 +52,27 @@ func runInfraApplyRefreshPhase( return fmt.Errorf("detect drift: %w", err) } + // First pass: pre-scan ALL ghost results for protected resources without the + // override flag. Collecting all blocked names before any mutation ensures the + // operator sees the complete list and that no partial state mutation occurs. + var blocked []string + for _, r := range results { + if r.Class != interfaces.DriftClassGhost { + continue + } + if isRefProtected(states, r.Name) && !allowProtectedPrune { + blocked = append(blocked, r.Name) + } + } + if len(blocked) > 0 { + for _, name := range blocked { + fmt.Fprintf(stderr, "wfctl: BLOCKED: %s is protected; cannot prune without --allow-protected-prune\n", name) + } + return fmt.Errorf("refresh blocked: %d protected resource(s) require --allow-protected-prune: %s", + len(blocked), strings.Join(blocked, ", ")) + } + + // Second pass: all pre-validation passed — execute mutations. for _, r := range results { if r.Class != interfaces.DriftClassGhost { // In-sync or config-drift: leave for regular plan/apply phase. @@ -58,12 +80,6 @@ func runInfraApplyRefreshPhase( } isProtected := isRefProtected(states, r.Name) - if isProtected && !allowProtectedPrune { - // Hard-block: return an error immediately so the caller sees the - // problem. No prunes have happened at this point. - fmt.Fprintf(stderr, "wfctl: BLOCKED: %s is protected; cannot prune without --allow-protected-prune\n", r.Name) - return fmt.Errorf("refresh: blocked on protected resource %q (use --allow-protected-prune to override)", r.Name) - } if !autoApprove { // Dry-run: report what would happen without mutating. @@ -85,7 +101,11 @@ func runInfraApplyRefreshPhase( } // isRefProtected returns true if the named resource has protected: true in its -// state Outputs map. +// state Outputs map. The type assertion is intentionally strict: if +// Outputs["protected"] is a non-bool (e.g. the string "true"), the assertion +// fails and the function returns false. YAML unmarshals bare `true` as bool, +// so this should not occur in practice, but callers should be aware of the +// silent false-return for unexpected types. func isRefProtected(states []interfaces.ResourceState, name string) bool { for i := range states { if states[i].Name == name { diff --git a/cmd/wfctl/infra_apply_refresh_test.go b/cmd/wfctl/infra_apply_refresh_test.go index 1916b488..06d1c6c9 100644 --- a/cmd/wfctl/infra_apply_refresh_test.go +++ b/cmd/wfctl/infra_apply_refresh_test.go @@ -244,3 +244,112 @@ func TestApplyRefresh_InSyncResourceSkipped(t *testing.T) { t.Errorf("in-sync: expected 0 deletes, got %d", store.deleteCount) } } + +// TestApplyRefresh_MultipleGhostsAllOrNothing verifies that when a list of +// drift results contains at least one protected ghost without --allow-protected-prune, +// NO state mutations happen (deleteCount==0) even if other ghosts are unprotected. +// The error must mention all blocked resource names. +func TestApplyRefresh_MultipleGhostsAllOrNothing(t *testing.T) { + ghosts := []interfaces.DriftResult{ + {Name: "unprotected-vpc", Type: "infra.vpc", Drifted: true, Class: interfaces.DriftClassGhost}, + {Name: "protected-db", Type: "infra.database", Drifted: true, Class: interfaces.DriftClassGhost}, + {Name: "another-unprotected", Type: "infra.vpc", Drifted: true, Class: interfaces.DriftClassGhost}, + {Name: "protected-cache", Type: "infra.cache", Drifted: true, Class: interfaces.DriftClassGhost}, + } + // Only the two protected resources are in states + states := []interfaces.ResourceState{ + {ID: "protected-db", Name: "protected-db", Type: "infra.database", Outputs: map[string]any{"protected": true}}, + {ID: "protected-cache", Name: "protected-cache", Type: "infra.cache", Outputs: map[string]any{"protected": true}}, + } + provider := &refreshFakeProvider{driftResults: ghosts} + store := &countingStore{} + refs := []interfaces.ResourceRef{ + {Name: "unprotected-vpc", Type: "infra.vpc"}, + {Name: "protected-db", Type: "infra.database"}, + {Name: "another-unprotected", Type: "infra.vpc"}, + {Name: "protected-cache", Type: "infra.cache"}, + } + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, false /* allowProtectedPrune — NOT set */, states, &stdout, &stderr) + + if err == nil { + t.Fatal("expected error for protected resources without flag, got nil") + } + // Error must list ALL blocked names + if !strings.Contains(err.Error(), "protected-db") { + t.Errorf("expected error to mention 'protected-db', got: %v", err) + } + if !strings.Contains(err.Error(), "protected-cache") { + t.Errorf("expected error to mention 'protected-cache', got: %v", err) + } + // Critical: zero mutations — unprotected ghosts must NOT have been pruned + if store.deleteCount != 0 { + t.Errorf("all-or-nothing: expected 0 deletes before error, got %d (deleted: %v)", + store.deleteCount, store.deletedNames) + } +} + +// TestApplyRefresh_UnprotectedThenProtected_NoPartialPrune is a minimal +// regression test for the atomicity fix: when the first ghost is unprotected +// and the second is protected, the single-pass implementation would prune the +// first before discovering the blocker. The two-pass implementation must return +// an error AND leave deleteCount==0. +func TestApplyRefresh_UnprotectedThenProtected_NoPartialPrune(t *testing.T) { + ghosts := []interfaces.DriftResult{ + {Name: "vpc-a", Type: "infra.vpc", Drifted: true, Class: interfaces.DriftClassGhost}, + {Name: "db-staging", Type: "infra.database", Drifted: true, Class: interfaces.DriftClassGhost}, + } + states := []interfaces.ResourceState{ + {ID: "db-staging", Name: "db-staging", Type: "infra.database", Outputs: map[string]any{"protected": true}}, + } + provider := &refreshFakeProvider{driftResults: ghosts} + store := &countingStore{} + refs := []interfaces.ResourceRef{ + {Name: "vpc-a", Type: "infra.vpc"}, + {Name: "db-staging", Type: "infra.database"}, + } + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, false /* allowProtectedPrune — NOT set */, states, &stdout, &stderr) + + if err == nil { + t.Fatal("expected error for protected resource without flag, got nil") + } + if !strings.Contains(err.Error(), "db-staging") { + t.Errorf("expected error to name the blocked resource 'db-staging', got: %v", err) + } + // The critical invariant: vpc-a must NOT have been pruned before the blocker was found. + if store.deleteCount != 0 { + t.Errorf("partial-prune: expected 0 deletes (all-or-nothing), got %d (deleted: %v)", + store.deleteCount, store.deletedNames) + } +} + +// TestApplyRefresh_AllGhostsUnprotectedPrunesAll verifies that when all ghosts +// are unprotected, the pre-scan passes and all mutations proceed normally. +func TestApplyRefresh_AllGhostsUnprotectedPrunesAll(t *testing.T) { + ghosts := []interfaces.DriftResult{ + {Name: "ghost-1", Type: "infra.vpc", Drifted: true, Class: interfaces.DriftClassGhost}, + {Name: "ghost-2", Type: "infra.database", Drifted: true, Class: interfaces.DriftClassGhost}, + } + provider := &refreshFakeProvider{driftResults: ghosts} + store := &countingStore{} + refs := []interfaces.ResourceRef{ + {Name: "ghost-1", Type: "infra.vpc"}, + {Name: "ghost-2", Type: "infra.database"}, + } + + var stdout, stderr bytes.Buffer + err := runInfraApplyRefreshPhase(context.Background(), provider, refs, store, + true /* autoApprove */, false /* allowProtectedPrune */, nil /* no states */, &stdout, &stderr) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if store.deleteCount != 2 { + t.Errorf("expected 2 deletes, got %d", store.deleteCount) + } +} diff --git a/cmd/wfctl/infra_apply_validation_test.go b/cmd/wfctl/infra_apply_validation_test.go index e2b46e7a..e96ecf22 100644 --- a/cmd/wfctl/infra_apply_validation_test.go +++ b/cmd/wfctl/infra_apply_validation_test.go @@ -228,6 +228,19 @@ func TestInfraApply_InputValidation_ValidUUIDNoWarn(t *testing.T) { } } +// TestInfraApply_AllowProtectedPruneRequiresRefresh verifies that passing +// --allow-protected-prune without --refresh returns a clear error at +// flag-validation time, before any config is loaded or state is mutated. +func TestInfraApply_AllowProtectedPruneRequiresRefresh(t *testing.T) { + err := runInfraApply([]string{"--allow-protected-prune", "--config", "nonexistent.yaml"}) + if err == nil { + t.Fatal("expected error when --allow-protected-prune used without --refresh, got nil") + } + if !strings.Contains(err.Error(), "--allow-protected-prune") || !strings.Contains(err.Error(), "--refresh") { + t.Errorf("expected error to mention both flags, got: %v", err) + } +} + // ── regression-proof: fakeValidationProvider.ResourceDriver returns the driver ─ func TestFakeValidationProvider_ResourceDriverReturnsDriver(t *testing.T) { diff --git a/docs/wfctl/drift-recovery.md b/docs/wfctl/drift-recovery.md index bb1627d5..e6d29fba 100644 --- a/docs/wfctl/drift-recovery.md +++ b/docs/wfctl/drift-recovery.md @@ -138,5 +138,6 @@ after an interrupted apply: if: github.event_name == 'workflow_dispatch' ``` -See `docs/plans/2026-05-02-infra-drift-recovery.md` for the full recovery -design rationale. +This document is the canonical operator reference for drift recovery. +For detailed design rationale see the inline comments in +`cmd/wfctl/infra_apply_refresh.go`.