From 0a0d9b91edbc1f2b615c165ed2ffde9a06efabc4 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Wed, 8 Apr 2026 12:09:44 +0200 Subject: [PATCH 01/23] gather-info: fix DCGM daemon detection, InfiniBand hints, and harden collectors Bug fixes: - Detect DCGM daemon not running and prompt to enable it (exit 235 fix) - Add missing ibstatus/ibv_devinfo parser hints that caused phantom errors - Fix nvidia facts defaulting to empty string instead of "unavailable" - Fix omitempty silently dropping zero-value command state fields in manifest - Fix docker gate logic to distinguish permission/daemon/timeout failures - Fix critical_event_count reporting capped count instead of true total Reliability improvements: - Single-pass artifact checking in all triage analyzers with firstPayloadLine - Panic recovery around each triage analyzer - Rewrite archive.go with WalkDir, named returns, and errors.Join - Reuse docker info output for gate check instead of running twice - Extract saveDirConcat helper to deduplicate network/packages collectors - Add SanitizePathComponent for safe artifact paths from container names - Pointer fields in report.go for explicit zero emission New tests: - Contract tests ensuring Go vocabularies stay in sync with JSON schemas - Artifact state tests for firstPayloadLine and checkArtifact - Triage determinism tests for stable output ordering - Xid analyzer unit tests Other: - Add triage-result.schema.json for triage _data/*.json files - Narrow lspci GPU detection to 3D/VGA/Display PCI classes - Remove unused UI screenshots (non-selected.png, selected.png) - Update documentation to reflect current architecture --- .gitignore | 5 +- customers/vm-troubleshooting/AGENTS.md | 7 +- customers/vm-troubleshooting/CODEMAP.md | 6 +- .../internal/collector/additional.go | 2 +- .../internal/collector/collector.go | 69 +++- .../internal/collector/collector_test.go | 4 +- .../internal/collector/common.go | 150 +++++--- .../internal/collector/dcgm.go | 11 +- 
.../internal/collector/docker.go | 132 +++++-- .../internal/collector/docker_test.go | 34 +- .../internal/collector/infiniband.go | 2 +- .../internal/collector/journal.go | 14 +- .../internal/collector/network.go | 36 +- .../internal/collector/nvidia.go | 84 +++-- .../internal/collector/packages.go | 33 +- .../internal/collector/services.go | 11 +- .../internal/collector/system.go | 23 +- .../internal/install/dcgm.go | 34 ++ .../internal/output/archive.go | 34 +- .../output/archive_consistency_test.go | 4 +- .../internal/output/contract_test.go | 334 ++++++++++++++++++ .../internal/output/manifest.go | 136 ++++--- .../internal/output/manifest_test.go | 2 +- .../internal/output/report.go | 35 +- .../internal/output/report_test.go | 11 +- .../internal/output/summary.go | 16 +- .../internal/platform/dcgm.go | 42 ++- .../internal/platform/nvidia.go | 15 +- .../internal/runner/runner.go | 75 ++-- .../internal/transfer/commands.go | 57 +-- .../internal/triage/artifact_state_test.go | 68 ++++ .../internal/triage/critical.go | 77 +++- .../internal/triage/critical_test.go | 106 ++++-- .../internal/triage/determinism_test.go | 109 ++++++ .../internal/triage/firewall.go | 49 ++- .../internal/triage/integration_test.go | 46 ++- .../internal/triage/triage.go | 100 +++++- .../vm-troubleshooting/internal/triage/xid.go | 57 ++- .../internal/triage/xid_analyze_test.go | 69 ++++ customers/vm-troubleshooting/non-selected.png | Bin 18482 -> 0 bytes .../schemas/manifest.schema.json | 38 +- .../schemas/report-record.schema.json | 12 +- .../schemas/triage-result.schema.json | 39 ++ customers/vm-troubleshooting/selected.png | Bin 19212 -> 0 bytes docs/architecture.md | 215 +++++++++++ 45 files changed, 1939 insertions(+), 464 deletions(-) create mode 100644 customers/vm-troubleshooting/internal/output/contract_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/artifact_state_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/determinism_test.go 
create mode 100644 customers/vm-troubleshooting/internal/triage/xid_analyze_test.go delete mode 100644 customers/vm-troubleshooting/non-selected.png create mode 100644 customers/vm-troubleshooting/schemas/triage-result.schema.json delete mode 100644 customers/vm-troubleshooting/selected.png create mode 100644 docs/architecture.md diff --git a/.gitignore b/.gitignore index 3e62733..3c5a9d3 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,7 @@ Thumbs.db *.pem *.key -docs/plans/ \ No newline at end of file +docs/plans/ + +.mcp_data/ +.serena/ diff --git a/customers/vm-troubleshooting/AGENTS.md b/customers/vm-troubleshooting/AGENTS.md index 6de8762..a9473e9 100644 --- a/customers/vm-troubleshooting/AGENTS.md +++ b/customers/vm-troubleshooting/AGENTS.md @@ -53,16 +53,17 @@ The archive contains three complementary output files for different consumers: - `metadata.json` — stable backward-compatible summary. **Do not change field types or remove fields.** - `manifest.json` — rich machine-readable index with per-artifact records, SHA-256 checksums, typed facts, and tags. This is the primary file for automated parsing tools. Schema: `schemas/manifest.schema.json`. - `report.ndjson` — one JSON line per artifact/issue/fact/collector_summary, streamable and grep-friendly. Schema: `schemas/report-record.schema.json`. +- `triage/_data/*.json` — per-analyzer findings with severity, evidence, and typed facts. Schema: `schemas/triage-result.schema.json`. - `SUMMARY.txt` — human-readable report. Format should remain stable for support engineers. Rules for the structured layer: - Schema files live in `schemas/` in-repo and are included in every archive. -- `schema_version` follows semver: minor adds fields, major changes types. Current: `1.0.0`. -- Facts with integer keys (`cpu_cores`, `gpu_count`, `memory_total`, `oom_event_count`, `xid_error_count`, `container_count`, `vllm_container_count`, `failed_service_count`) are typed as integers in manifest/report. 
`"unavailable"` maps to `null`. +- `schema_version` follows semver: minor adds fields, major changes types. Current: `2.0.0`. +- Facts with integer keys (`cpu_cores`, `gpu_count`, `memory_total`, `oom_event_count`, `xid_classified_count`, `critical_event_count`, `container_count`, `vllm_container_count`, `failed_service_count`) are typed as integers in manifest/report/triage. `"unavailable"` maps to `null`. - All other facts remain strings. ## Artifact registration -- Every artifact must go through `saveCommand`, `saveFile`, or `captureToFile` in `common.go`, or use `Writer.ReservePath` + one of the `Add*Artifact` helpers for custom flows. +- Every artifact must go through `saveCommand`, `saveFile`, `saveCapturedProbe`, `saveProbeOutput`, or `saveDirConcat` in `common.go`, or use `Writer.ReservePath` + one of the `Add*Artifact` helpers for custom flows. - Every artifact requires a `parserHint` (from `ValidParserHints`) and 1-3 `tags` (from `ValidTags`). Both are validated before write — invalid values record an error and skip the write. - Artifact paths are globally unique across all collectors, enforced by `Writer.ReservePath`. Duplicate paths are rejected before write. - Framework-owned paths (`metadata.json`, `manifest.json`, `report.ndjson`, `SUMMARY.txt`, `schemas/*`) are reserved before collectors run. diff --git a/customers/vm-troubleshooting/CODEMAP.md b/customers/vm-troubleshooting/CODEMAP.md index 4b3945d..8e68ac3 100644 --- a/customers/vm-troubleshooting/CODEMAP.md +++ b/customers/vm-troubleshooting/CODEMAP.md @@ -28,7 +28,8 @@ Keep this file updated in the same change as architecture or collector changes. - creates the work directory, - registers collectors in order, - runs collectors, - - writes `metadata.json` and `SUMMARY.txt`, + - runs triage analyzers on collected artifacts, + - writes `manifest.json`, `report.ndjson`, `metadata.json`, and `SUMMARY.txt`, - creates the final `.tar.gz` archive. 4. 
Each collector writes artifacts into a deterministic subtree under the work directory. @@ -61,7 +62,7 @@ Keep this file updated in the same change as architecture or collector changes. - Verbose logging should stay here or be called through this interface. ### `internal/output/` -- Owns artifact writing, command/file metadata headers, `SUMMARY.txt`, `metadata.json`, and archive creation. +- Owns artifact writing, command/file metadata headers, `manifest.json`, `report.ndjson`, `SUMMARY.txt`, `metadata.json`, and archive creation. - If support needs a new machine-readable field or archive invariant, change this package. ### `internal/privilege/` @@ -137,6 +138,7 @@ Keep this file updated in the same change as architecture or collector changes. - Summary and metadata are always generated when an archive is created. - `manifest.json` — rich machine-readable index with per-artifact records, SHA-256 checksums, typed facts, and tags. Schema: `schemas/manifest.schema.json`. - `report.ndjson` — one JSON line per artifact/issue/fact/collector_summary, streamable. Schema: `schemas/report-record.schema.json`. +- `triage/_data/*.json` — per-analyzer findings with typed facts. Schema: `schemas/triage-result.schema.json`. - `metadata.json` — stable backward-compatible summary (do not change field types). - Schema files are included in the archive for parser self-discovery. - All artifact paths are globally reserved via `Writer.ReservePath` — no duplicates possible. 
diff --git a/customers/vm-troubleshooting/internal/collector/additional.go b/customers/vm-troubleshooting/internal/collector/additional.go index 467f072..0fc0734 100644 --- a/customers/vm-troubleshooting/internal/collector/additional.go +++ b/customers/vm-troubleshooting/internal/collector/additional.go @@ -42,7 +42,7 @@ func (c *AdditionalCollector) Collect(ctx context.Context) (*CollectorResult, er {"hardware/sensors.txt", "sensors", nil, false, config.TimeoutQuick, "text", []string{"hardware"}}, } { if !c.Exec.CommandExists(spec.name) { - r.RecordSkipped(spec.name + ": unavailable") + r.RecordSkip(SkipCommandUnavailable, spec.name+": unavailable") continue } c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, NeedsRoot: spec.root, Timeout: spec.timeout}, spec.hint, spec.tags...) diff --git a/customers/vm-troubleshooting/internal/collector/collector.go b/customers/vm-troubleshooting/internal/collector/collector.go index 763ebe7..2a3855f 100644 --- a/customers/vm-troubleshooting/internal/collector/collector.go +++ b/customers/vm-troubleshooting/internal/collector/collector.go @@ -58,7 +58,8 @@ var ValidParserHints = map[string]bool{ "ss": true, "mount": true, "lsmod": true, "pip": true, "docker": true, "nmcli": true, "networkctl": true, "resolvectl": true, "bridge": true, "netplan": true, "iptables": true, "nft": true, - "ufw": true, "firewall-cmd": true, "ibstat": true, "rdma": true, + "ufw": true, "firewall-cmd": true, "ibstat": true, "ibstatus": true, + "ibv_devinfo": true, "rdma": true, "apt-mark": true, "sh": true, "hostname": true, "date": true, "uptime": true, "uname": true, "csv": true, } @@ -99,14 +100,53 @@ type Collector interface { Collect(ctx context.Context) (*CollectorResult, error) } +// ErrorCode is the controlled vocabulary for structured error codes. 
+type ErrorCode string + +const ( + ErrCommandFailed ErrorCode = "command_failed" + ErrCommandTimedOut ErrorCode = "command_timed_out" + ErrProbeFailed ErrorCode = "probe_failed" + ErrArtifactValidation ErrorCode = "artifact_validation_failed" + ErrArtifactReserve ErrorCode = "artifact_reserve_failed" + ErrArtifactWrite ErrorCode = "artifact_write_failed" + ErrEnumerationFailed ErrorCode = "enumeration_failed" +) + +// StructuredError describes a single collection error with machine-readable code. +type StructuredError struct { + Code ErrorCode `json:"code"` + Message string `json:"message"` + ArtifactPath string `json:"artifact_path,omitempty"` +} + +// SkipCode is the controlled vocabulary for structured skip reason codes. +type SkipCode string + +const ( + SkipDisabledByFlag SkipCode = "disabled_by_flag" + SkipCommandUnavailable SkipCode = "command_unavailable" + SkipSourceUnavailable SkipCode = "source_unavailable" + SkipNotApplicable SkipCode = "not_applicable" + SkipPermissionOrAccess SkipCode = "permission_or_access" + SkipDaemonUnavailable SkipCode = "daemon_unavailable" +) + +// SkipReason describes why an artifact or collector was skipped. 
+type SkipReason struct { + Reason SkipCode `json:"reason"` + Detail string `json:"detail"` + ArtifactPath string `json:"artifact_path,omitempty"` +} + type CollectorResult struct { ID string Name string Issues []Issue Facts map[string]string Artifacts []ArtifactRecord - Skipped []string - Errors []string + Skipped []SkipReason + Errors []StructuredError Duration time.Duration } @@ -158,8 +198,21 @@ func (r *CollectorResult) AddProbeArtifact(path, hint string, tags ...string) { }) } -func (r *CollectorResult) RecordSkipped(msg string) { r.Skipped = append(r.Skipped, msg) } -func (r *CollectorResult) RecordError(msg string) { r.Errors = append(r.Errors, msg) } +func (r *CollectorResult) RecordSkip(reason SkipCode, detail string) { + r.Skipped = append(r.Skipped, SkipReason{Reason: reason, Detail: detail}) +} + +func (r *CollectorResult) RecordSkipForArtifact(reason SkipCode, detail, artifactPath string) { + r.Skipped = append(r.Skipped, SkipReason{Reason: reason, Detail: detail, ArtifactPath: artifactPath}) +} + +func (r *CollectorResult) RecordError(code ErrorCode, msg string) { + r.Errors = append(r.Errors, StructuredError{Code: code, Message: msg}) +} + +func (r *CollectorResult) RecordErrorForArtifact(code ErrorCode, msg, artifactPath string) { + r.Errors = append(r.Errors, StructuredError{Code: code, Message: msg, ArtifactPath: artifactPath}) +} func (r *CollectorResult) SetFact(key, value string) { if value != "" { r.Facts[key] = value @@ -204,7 +257,7 @@ func (r *Registry) RunAll(ctx context.Context, skip map[string]bool, u ui.UI) ([ res := NewResult() res.ID = c.ID() res.Name = c.Name() - res.RecordSkipped("collector disabled by flag") + res.RecordSkip(SkipDisabledByFlag, "collector disabled by flag") results = append(results, res) u.Skip(fmt.Sprintf("[%d/%d] %s (skipped by flag)", i+1, total, c.Name())) continue @@ -227,7 +280,7 @@ func (r *Registry) RunAll(ctx context.Context, skip map[string]bool, u ui.UI) ([ res.Name = c.Name() res.Duration = 
time.Since(start) if err != nil { - res.RecordError(err.Error()) + res.RecordError(ErrProbeFailed, err.Error()) } results = append(results, res) @@ -235,7 +288,7 @@ func (r *Registry) RunAll(ctx context.Context, skip map[string]bool, u ui.UI) ([ if len(res.Errors) > 0 { sp.Fail(fmt.Sprintf("[%d/%d] %s (%s, %d error(s))", i+1, total, c.Name(), dur, len(res.Errors))) } else if len(res.Artifacts) == 0 && len(res.Skipped) > 0 { - sp.Success(fmt.Sprintf("[%d/%d] %s (skipped: %s)", i+1, total, c.Name(), res.Skipped[0])) + sp.Success(fmt.Sprintf("[%d/%d] %s (skipped: %s)", i+1, total, c.Name(), res.Skipped[0].Detail)) } else { sp.Success(fmt.Sprintf("[%d/%d] %s (%s, %d artifact(s))", i+1, total, c.Name(), dur, len(res.Artifacts))) } diff --git a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index 734e111..cc1257f 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -309,7 +309,7 @@ func TestDCGMCollectorSkipsDiagWhenNotEnabled(t *testing.T) { // Do NOT register "dcgmi diag -r 2" — if it runs, FakeExecutor will return an error root := t.TempDir() - c := NewDCGMCollector(fake, output.NewWriter(root), ui.NoopUI{}, false) + c := NewDCGMCollector(fake, output.NewWriter(root), ui.NoopUI{}, false, true) res, err := c.Collect(context.Background()) if err != nil { t.Fatal(err) @@ -333,7 +333,7 @@ func TestDCGMCollectorRunsDiagWhenEnabled(t *testing.T) { fake.Commands["dcgmi diag -r 2"] = executor.FakeResponse{Stdout: []byte("diag output\n")} root := t.TempDir() - c := NewDCGMCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + c := NewDCGMCollector(fake, output.NewWriter(root), ui.NoopUI{}, true, true) res, err := c.Collect(context.Background()) if err != nil { t.Fatal(err) diff --git a/customers/vm-troubleshooting/internal/collector/common.go 
b/customers/vm-troubleshooting/internal/collector/common.go index 1a3a419..fa3985c 100644 --- a/customers/vm-troubleshooting/internal/collector/common.go +++ b/customers/vm-troubleshooting/internal/collector/common.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "path/filepath" "strings" "github.com/NexGenCloud/vm-diagnostics/internal/executor" @@ -20,38 +21,48 @@ type Base struct { func (b Base) saveProbeOutput(r *CollectorResult, path, content, hint string, tags ...string) string { b.UI.Verbose(fmt.Sprintf(" probe: %s -> %s", hint, path)) if err := ValidateTagsAndHint(hint, tags); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", path, err), path) return "" } if err := b.Writer.ReservePath(path); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", path, err), path) return "" } if err := b.Writer.SaveOutput(path, content); err != nil { b.Writer.ReleasePath(path) - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactWrite, fmt.Sprintf("%s: %v", path, err), path) return "" } r.AddProbeArtifact(path, hint, tags...) 
return path } -func (b Base) saveCapturedProbe(r *CollectorResult, path string, spec executor.CommandSpec, stdout, stderr, hint string, tags []string, notes ...string) string { +func (b Base) saveCapturedProbe(r *CollectorResult, path string, spec executor.CommandSpec, cmdResult executor.CommandResult, stdout, stderr, hint string, tags []string, notes ...string) string { b.UI.Verbose(fmt.Sprintf(" probe-capture: %s -> %s", spec.String(), path)) if err := ValidateTagsAndHint(hint, tags); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", path, err), path) return "" } if err := b.Writer.ReservePath(path); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", path, err), path) return "" } if err := b.Writer.SaveCapturedCommand(path, spec, stdout, stderr, notes...); err != nil { b.Writer.ReleasePath(path) - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactWrite, fmt.Sprintf("%s: %v", path, err), path) return "" } - r.AddProbeArtifact(path, hint, tags...) + // Derive artifact status from the actual command result. + status := "ok" + if cmdResult.Skipped { + status = "skipped" + } else if cmdResult.TimedOut { + status = "error" + } else if cmdResult.Err != nil && !spec.IgnoreExit { + status = "error" + } + r.AddCommandArtifact(path, spec.String(), cmdResult.ExitCode, status, + spec.IgnoreExit, cmdResult.TimedOut, cmdResult.Truncated, cmdResult.Duration, hint, tags...) 
return path } @@ -63,12 +74,12 @@ func (b Base) saveCommand(ctx context.Context, r *CollectorResult, path string, // Pre-write: validate tags/hint if err := ValidateTagsAndHint(hint, tags); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", path, err), path) return "" } // Pre-write: reserve path globally if err := b.Writer.ReservePath(path); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", path, err), path) return "" } @@ -76,7 +87,7 @@ func (b Base) saveCommand(ctx context.Context, r *CollectorResult, path string, result, err := b.Writer.SaveCommand(ctx, b.Exec, path, spec) if err != nil { b.Writer.ReleasePath(path) - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactWrite, fmt.Sprintf("%s: %v", path, err), path) return "" } @@ -85,18 +96,18 @@ func (b Base) saveCommand(ctx context.Context, r *CollectorResult, path string, if result.Skipped { status = "skipped" b.UI.Verbose(fmt.Sprintf(" skipped: %s (requires root)", spec.String())) - r.RecordSkipped(spec.String()) + r.RecordSkipForArtifact(SkipPermissionOrAccess, spec.String(), path) } else if result.TimedOut { status = "error" b.UI.Verbose(fmt.Sprintf(" timed out: %s", spec.String())) - r.RecordError(fmt.Sprintf("%s: timed out", spec.String())) + r.RecordErrorForArtifact(ErrCommandTimedOut, fmt.Sprintf("%s: timed out", spec.String()), path) } else if result.Err != nil { if spec.IgnoreExit { b.UI.Verbose(fmt.Sprintf(" note: %s exited %d (non-zero exit ignored)", spec.String(), result.ExitCode)) } else { status = "error" b.UI.Verbose(fmt.Sprintf(" error: %s: %v (exit %d)", spec.String(), result.Err, result.ExitCode)) - r.RecordError(fmt.Sprintf("%s: %v", spec.String(), result.Err)) + r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", spec.String(), result.Err), path) } } @@ -105,73 +116,96 @@ func (b 
Base) saveCommand(ctx context.Context, r *CollectorResult, path string, return path } -func (b Base) captureToFile(ctx context.Context, r *CollectorResult, path string, spec executor.CommandSpec, hint string, tags ...string) string { - if ctx.Err() != nil { - return "" - } - b.UI.Verbose(fmt.Sprintf(" capture: %s -> %s", spec.String(), path)) - - if err := ValidateTagsAndHint(hint, tags); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) - return "" - } - if err := b.Writer.ReservePath(path); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) - return "" +// saveDirConcat reads all regular files in a directory, concatenates them with +// "=== filename ===" headers, and saves the result as a single artifact. +// Subdirectories are skipped. An optional sanitize function can be applied to +// each file's content. +func (b Base) saveDirConcat(r *CollectorResult, dest, dir string, sanitize func(string) string, tags ...string) { + entries, err := os.ReadDir(dir) + if err != nil { + r.RecordSkipForArtifact(SkipSourceUnavailable, dir+": unavailable", dest) + return } - - result, stdout, stderr := b.Exec.Capture(ctx, spec, 10*1024*1024) - content := string(stdout) - if len(stderr) > 0 { - if content != "" && !strings.HasSuffix(content, "\n") { - content += "\n" + var buf strings.Builder + filesSeen := 0 + filesRead := 0 + hadReadErr := false + for _, entry := range entries { + if entry.IsDir() { + continue + } + filesSeen++ + data, err := os.ReadFile(filepath.Join(dir, entry.Name())) + if err != nil { + hadReadErr = true + continue + } + filesRead++ + content := string(data) + if sanitize != nil { + content = sanitize(content) + } + buf.WriteString("=== " + entry.Name() + " ===\n") + buf.WriteString(content) + if !strings.HasSuffix(content, "\n") { + buf.WriteByte('\n') } - content += string(stderr) } - if content == "" && result.Skipped { - content = "[SKIPPED]\n" + if buf.Len() > 0 { + b.saveProbeOutput(r, dest, buf.String(), "text", tags...) 
+ return } - if err := b.Writer.SaveCapturedCommand(path, spec, content, ""); err != nil { - b.Writer.ReleasePath(path) - r.RecordError(fmt.Sprintf("%s: %v", path, err)) - return "" + if filesSeen > 0 && filesRead == 0 && hadReadErr { + r.RecordSkipForArtifact(SkipSourceUnavailable, dir+": unavailable", dest) } +} - status := "ok" - if result.Skipped { - status = "skipped" - b.UI.Verbose(fmt.Sprintf(" skipped: %s (requires root)", spec.String())) - r.RecordSkipped(spec.String()) +// SanitizePathComponent normalizes an external identifier for use as a single +// path component in artifact paths. It replaces path separators and other +// problematic characters, truncates to a reasonable length, and ensures the +// result is never empty. +func SanitizePathComponent(name string) string { + // Replace path separators and control chars with underscores + var b strings.Builder + for _, r := range name { + switch { + case r == '/' || r == '\\' || r == '\x00' || r == ':' || r == '*' || r == '?' || r == '"' || r == '<' || r == '>' || r == '|': + b.WriteByte('_') + case r < 0x20: // control characters + b.WriteByte('_') + default: + b.WriteRune(r) + } } - if result.TimedOut { - status = "error" - r.RecordError(fmt.Sprintf("%s: timed out", spec.String())) + s := b.String() + // Strip leading dots to prevent hidden files or traversal + s = strings.TrimLeft(s, ".") + // Truncate to 128 chars + if len(s) > 128 { + s = s[:128] } - if result.Err != nil && !result.Skipped && !result.TimedOut && !spec.IgnoreExit { - status = "error" - r.RecordError(fmt.Sprintf("%s: %v", spec.String(), result.Err)) + if s == "" { + s = "_unnamed" } - - r.AddCommandArtifact(path, spec.String(), result.ExitCode, status, spec.IgnoreExit, result.TimedOut, result.Truncated, result.Duration, hint, tags...) 
- return content + return s } func (b Base) saveFile(r *CollectorResult, dest, src string, sanitize func(string) string, tags ...string) { b.UI.Verbose(fmt.Sprintf(" file: %s -> %s", src, dest)) if err := ValidateTagsAndHint("text", tags); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", dest, err)) + r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", dest, err), dest) return } if err := b.Writer.ReservePath(dest); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", dest, err)) + r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", dest, err), dest) return } data, err := os.ReadFile(src) if err != nil { b.Writer.ReleasePath(dest) - r.RecordSkipped(src + ": unavailable") + r.RecordSkipForArtifact(SkipSourceUnavailable, src+": unavailable", dest) return } content := string(data) @@ -181,7 +215,7 @@ func (b Base) saveFile(r *CollectorResult, dest, src string, sanitize func(strin } if err := b.Writer.SaveReadFile(dest, src, content, isSanitized); err != nil { b.Writer.ReleasePath(dest) - r.RecordError(fmt.Sprintf("%s: %v", dest, err)) + r.RecordErrorForArtifact(ErrArtifactWrite, fmt.Sprintf("%s: %v", dest, err), dest) return } r.AddFileArtifact(dest, src, isSanitized, tags...) 
diff --git a/customers/vm-troubleshooting/internal/collector/dcgm.go b/customers/vm-troubleshooting/internal/collector/dcgm.go index cc5527b..13b04b3 100644 --- a/customers/vm-troubleshooting/internal/collector/dcgm.go +++ b/customers/vm-troubleshooting/internal/collector/dcgm.go @@ -12,10 +12,11 @@ import ( type DcgmCollector struct { Base activeGPUDiag bool + daemonRunning bool } -func NewDCGMCollector(exec executor.Executor, writer *output.Writer, ui ui.UI, activeGPUDiag bool) *DcgmCollector { - return &DcgmCollector{Base: Base{Exec: exec, Writer: writer, UI: ui}, activeGPUDiag: activeGPUDiag} +func NewDCGMCollector(exec executor.Executor, writer *output.Writer, ui ui.UI, activeGPUDiag, daemonRunning bool) *DcgmCollector { + return &DcgmCollector{Base: Base{Exec: exec, Writer: writer, UI: ui}, activeGPUDiag: activeGPUDiag, daemonRunning: daemonRunning} } func (c *DcgmCollector) Name() string { return "DCGM" } @@ -24,7 +25,11 @@ func (c *DcgmCollector) ID() string { return "dcgm" } func (c *DcgmCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() if !c.Exec.CommandExists("dcgmi") { - r.RecordSkipped("dcgmi unavailable") + r.RecordSkip(SkipCommandUnavailable, "dcgmi unavailable") + return r, nil + } + if !c.daemonRunning { + r.RecordSkip(SkipDaemonUnavailable, "DCGM daemon not running; user declined to start") return r, nil } c.saveCommand(ctx, r, "dcgm/dcgmi_discovery.txt", executor.CommandSpec{Name: "dcgmi", Args: []string{"discovery", "-l"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, "dcgmi", "gpu-health") diff --git a/customers/vm-troubleshooting/internal/collector/docker.go b/customers/vm-troubleshooting/internal/collector/docker.go index 57efd72..eb00e99 100644 --- a/customers/vm-troubleshooting/internal/collector/docker.go +++ b/customers/vm-troubleshooting/internal/collector/docker.go @@ -27,24 +27,38 @@ func (c *DockerCollector) ID() string { return "docker" } func (c *DockerCollector) Collect(ctx context.Context) 
(*CollectorResult, error) { r := NewResult() if !c.Exec.CommandExists("docker") { - r.RecordSkipped("docker unavailable") + r.RecordSkip(SkipCommandUnavailable, "docker unavailable") return r, nil } infoSpec := executor.CommandSpec{Name: "docker", Args: []string{"info"}, NeedsRoot: true, Timeout: config.TimeoutMedium} - infoResult, _, infoErr := c.Exec.Capture(ctx, infoSpec, 256*1024) - if infoResult.Skipped || infoResult.Err != nil { - r.RecordSkipped("docker daemon not accessible") + infoResult, infoOut, infoErr := c.Exec.Capture(ctx, infoSpec, 256*1024) + if infoResult.Skipped { + r.RecordSkip(SkipPermissionOrAccess, "docker info requires elevated access") + return r, nil + } + if infoResult.TimedOut { + r.RecordError(ErrCommandTimedOut, "docker info timed out") + return r, nil + } + if infoResult.Err != nil { + stderr := strings.ToLower(string(infoErr)) + if strings.Contains(stderr, "permission denied") || strings.Contains(stderr, "connect: permission denied") { + r.RecordSkip(SkipPermissionOrAccess, "docker: permission denied") + } else { + r.RecordSkip(SkipDaemonUnavailable, "docker daemon not accessible") + } if len(infoErr) > 0 { errPath := "docker/docker_info_error.txt" - c.saveCapturedProbe(r, errPath, infoSpec, "", string(infoErr), "docker", []string{"docker"}) + c.saveCapturedProbe(r, errPath, infoSpec, infoResult, "", string(infoErr), "docker", []string{"docker"}) } return r, nil } + // Reuse gate result for docker info artifact (6C: avoid re-running the gate command) + c.saveCapturedProbe(r, "docker/docker_info.txt", infoSpec, infoResult, string(infoOut), string(infoErr), "docker", []string{"docker"}) for _, spec := range []struct { path string args []string }{ - {"docker/docker_info.txt", []string{"info"}}, {"docker/docker_version.txt", []string{"version"}}, {"docker/docker_ps_all.txt", []string{"ps", "-a", "--format", "table {{.ID}}\t{{.Image}}\t{{.Status}}\t{{.Names}}\t{{.Ports}}"}}, {"docker/docker_df.txt", []string{"system", "df", "-v"}}, @@ 
-52,50 +66,92 @@ func (c *DockerCollector) Collect(ctx context.Context) (*CollectorResult, error) } { c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: "docker", Args: spec.args, NeedsRoot: true, Timeout: config.TimeoutMedium}, "docker", "docker") } - _, versionOut, _ := c.Exec.Capture(ctx, executor.CommandSpec{Name: "docker", Args: []string{"version", "--format", "{{.Server.Version}}"}, NeedsRoot: true, Timeout: config.TimeoutQuick}, 64*1024) - r.SetFact("docker_version", strings.TrimSpace(string(versionOut))) - _, list, _ := c.Exec.Capture(ctx, executor.CommandSpec{Name: "docker", Args: []string{"ps", "-a", "--format", "{{.ID}}|{{.Names}}|{{.Image}}"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, 2*1024*1024) - lines := strings.Split(strings.TrimSpace(string(list)), "\n") - if len(lines) == 1 && lines[0] == "" { - lines = nil + versionResult, versionOut, _ := c.Exec.Capture(ctx, executor.CommandSpec{Name: "docker", Args: []string{"version", "--format", "{{.Server.Version}}"}, NeedsRoot: true, Timeout: config.TimeoutQuick}, 64*1024) + version := strings.TrimSpace(string(versionOut)) + if versionResult.Err == nil && !versionResult.Skipped && version != "" { + r.SetFact("docker_version", version) + } else { + r.SetFact("docker_version", "unavailable") } - r.SetFact("container_count", fmt.Sprintf("%d", len(lines))) - vllmCount := 0 - for _, line := range lines { - parts := strings.Split(line, "|") - if len(parts) < 3 { - continue + + type containerRow struct { + id string + name string + image string + } + isVLLM := func(row containerRow) bool { + return strings.Contains(strings.ToLower(row.name+" "+row.image), "vllm") + } + + listResult, list, _ := c.Exec.Capture(ctx, executor.CommandSpec{Name: "docker", Args: []string{"ps", "-a", "--format", "{{.ID}}|{{.Names}}|{{.Image}}"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, 2*1024*1024) + var rows []containerRow + listUsable := false + if listResult.Err == nil && !listResult.Skipped { + raw := 
strings.TrimSpace(string(list)) + if raw == "" { + listUsable = true + r.SetFact("container_count", "0") + } else { + parseFailed := false + for _, line := range strings.Split(raw, "\n") { + parts := strings.SplitN(line, "|", 3) + if len(parts) != 3 { + parseFailed = true + break + } + id := strings.TrimSpace(parts[0]) + name := strings.TrimSpace(parts[1]) + image := strings.TrimSpace(parts[2]) + if id == "" { + parseFailed = true + break + } + rows = append(rows, containerRow{id: id, name: name, image: image}) + } + if parseFailed { + r.SetFact("container_count", "unavailable") + } else { + listUsable = true + r.SetFact("container_count", fmt.Sprintf("%d", len(rows))) + } } - id, name, image := parts[0], parts[1], parts[2] - if strings.Contains(strings.ToLower(name+" "+image), "vllm") { - vllmCount++ - _, out, _ := c.Exec.Capture(ctx, executor.CommandSpec{Name: "docker", Args: []string{"inspect", id}, NeedsRoot: true, Timeout: config.TimeoutMedium}, 2*1024*1024) - path := fmt.Sprintf("docker/vllm_logs/%s_inspect.json", name) - inspectSpec := executor.CommandSpec{Name: "docker", Args: []string{"inspect", id}, NeedsRoot: true, Timeout: config.TimeoutMedium} - c.saveCapturedProbe(r, path, inspectSpec, sanitize.DockerInspect(string(out))+"\n", "", "json", []string{"docker", "docker-security"}, "Environment variable values and common secret fields have been redacted") - c.saveCommand(ctx, r, fmt.Sprintf("docker/vllm_logs/%s_stats.txt", name), executor.CommandSpec{Name: "docker", Args: []string{"stats", "--no-stream", id}, NeedsRoot: true, Timeout: config.TimeoutQuick}, "docker", "docker") - if c.IncludeLogs { - c.saveCommand(ctx, r, fmt.Sprintf("docker/vllm_logs/%s_logs.txt", name), executor.CommandSpec{Name: "docker", Args: []string{"logs", "--tail", "10000", "--timestamps", id}, NeedsRoot: true, Timeout: config.TimeoutSlow}, "docker", "docker") + } else { + r.SetFact("container_count", "unavailable") + } + + vllmCount := 0 + if !listUsable { + 
r.SetFact("vllm_container_count", "unavailable") + } else { + for _, row := range rows { + if isVLLM(row) { + vllmCount++ + safeName := SanitizePathComponent(row.name) + inspectSpec := executor.CommandSpec{Name: "docker", Args: []string{"inspect", row.id}, NeedsRoot: true, Timeout: config.TimeoutMedium} + inspectResult, out, _ := c.Exec.Capture(ctx, inspectSpec, 2*1024*1024) + path := fmt.Sprintf("docker/vllm_logs/%s_inspect.txt", safeName) + c.saveCapturedProbe(r, path, inspectSpec, inspectResult, sanitize.DockerInspect(string(out))+"\n", "", "docker", []string{"docker", "docker-security"}, "Environment variable values and common secret fields have been redacted") + c.saveCommand(ctx, r, fmt.Sprintf("docker/vllm_logs/%s_stats.txt", safeName), executor.CommandSpec{Name: "docker", Args: []string{"stats", "--no-stream", row.id}, NeedsRoot: true, Timeout: config.TimeoutQuick}, "docker", "docker") + if c.IncludeLogs { + c.saveCommand(ctx, r, fmt.Sprintf("docker/vllm_logs/%s_logs.txt", safeName), executor.CommandSpec{Name: "docker", Args: []string{"logs", "--tail", "10000", "--timestamps", row.id}, NeedsRoot: true, Timeout: config.TimeoutSlow}, "docker", "docker") + } } } + r.SetFact("vllm_container_count", fmt.Sprintf("%d", vllmCount)) } - r.SetFact("vllm_container_count", fmt.Sprintf("%d", vllmCount)) - if c.IncludeLogs { + + if c.IncludeLogs && listUsable { count := 0 - for _, line := range lines { + for _, row := range rows { if count >= 20 { break } - parts := strings.Split(line, "|") - if len(parts) < 3 { - continue - } - id, name, image := parts[0], parts[1], parts[2] - if strings.Contains(strings.ToLower(name+" "+image), "vllm") { + if isVLLM(row) { continue } count++ - c.saveCommand(ctx, r, fmt.Sprintf("docker/other_logs/%s_logs_tail.txt", name), executor.CommandSpec{Name: "docker", Args: []string{"logs", "--tail", "500", "--timestamps", id}, NeedsRoot: true, Timeout: config.TimeoutMedium}, "docker", "docker") + safeName := SanitizePathComponent(row.name) + 
c.saveCommand(ctx, r, fmt.Sprintf("docker/other_logs/%s_logs_tail.txt", safeName), executor.CommandSpec{Name: "docker", Args: []string{"logs", "--tail", "500", "--timestamps", row.id}, NeedsRoot: true, Timeout: config.TimeoutMedium}, "docker", "docker") } } return r, nil diff --git a/customers/vm-troubleshooting/internal/collector/docker_test.go b/customers/vm-troubleshooting/internal/collector/docker_test.go index cee9e24..67e7b67 100644 --- a/customers/vm-troubleshooting/internal/collector/docker_test.go +++ b/customers/vm-troubleshooting/internal/collector/docker_test.go @@ -47,7 +47,7 @@ func TestDockerCollectorVLLMInspectSanitized(t *testing.T) { t.Fatalf("unexpected vllm count: %q", got) } - inspectData, err := os.ReadFile(filepath.Join(root, "docker/vllm_logs/vllm-main_inspect.json")) + inspectData, err := os.ReadFile(filepath.Join(root, "docker/vllm_logs/vllm-main_inspect.txt")) if err != nil { t.Fatalf("reading inspect artifact: %v", err) } @@ -59,3 +59,35 @@ func TestDockerCollectorVLLMInspectSanitized(t *testing.T) { t.Fatalf("inspect output missing redaction: %s", inspectText) } } + +func TestDockerCollectorMarksCountsUnavailableOnMalformedContainerList(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["docker"] = true + fake.Commands["docker info"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["docker version"] = executor.FakeResponse{Stdout: []byte("version\n")} + fake.Commands["docker version --format {{.Server.Version}}"] = executor.FakeResponse{Stdout: []byte("\n")} + fake.Commands[`docker ps -a --format table {{.ID}}`+"\t"+`{{.Image}}`+"\t"+`{{.Status}}`+"\t"+`{{.Names}}`+"\t"+`{{.Ports}}`] = executor.FakeResponse{Stdout: []byte("table\n")} + fake.Commands["docker system df -v"] = executor.FakeResponse{Stdout: []byte("df\n")} + fake.Commands["docker network ls"] = executor.FakeResponse{Stdout: []byte("bridge\n")} + // Malformed output: does not match expected ID|NAME|IMAGE 
shape. + fake.Commands["docker ps -a --format {{.ID}}|{{.Names}}|{{.Image}}"] = executor.FakeResponse{Stdout: []byte("malformed-line\n")} + + root := t.TempDir() + collector := NewDockerCollector(fake, output.NewWriter(root), ui.NoopUI{}, false) + res, err := collector.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["docker_version"]; got != "unavailable" { + t.Fatalf("expected docker_version=unavailable, got %q", got) + } + if got := res.Facts["container_count"]; got != "unavailable" { + t.Fatalf("expected container_count=unavailable, got %q", got) + } + if got := res.Facts["vllm_container_count"]; got != "unavailable" { + t.Fatalf("expected vllm_container_count=unavailable, got %q", got) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/infiniband.go b/customers/vm-troubleshooting/internal/collector/infiniband.go index e5ae981..6f14a39 100644 --- a/customers/vm-troubleshooting/internal/collector/infiniband.go +++ b/customers/vm-troubleshooting/internal/collector/infiniband.go @@ -37,7 +37,7 @@ func (c *InfiniBandCollector) Collect(ctx context.Context) (*CollectorResult, er c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, Timeout: config.TimeoutQuick}, spec.name, "infiniband") } if !collected { - r.RecordSkipped("InfiniBand tools unavailable") + r.RecordSkip(SkipCommandUnavailable, "InfiniBand tools unavailable") } return r, nil } diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index 50a86eb..57d5ac9 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -27,7 +27,7 @@ func (c *JournalCollector) ID() string { return "journal" } func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() if !c.Exec.CommandExists("journalctl") { - 
r.RecordSkipped("journalctl unavailable") + r.RecordSkip(SkipCommandUnavailable, "journalctl unavailable") return r, nil } @@ -60,9 +60,9 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error oomPath := "logs/oom_events.txt" if oomResult.Skipped { - r.RecordSkipped("journalctl unavailable for OOM scan") + r.RecordSkipForArtifact(SkipPermissionOrAccess, "journalctl OOM scan requires root", oomPath) r.SetFact("oom_event_count", "unavailable") - c.saveCapturedProbe(r, oomPath, oomSpec, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan unavailable") + c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan unavailable") for _, svc := range []string{"docker", "containerd", "nvidia-persistenced", "nvidia-fabricmanager", "kubelet", "NetworkManager", "systemd-networkd", "systemd-resolved"} { c.saveCommand(ctx, r, fmt.Sprintf("logs/journal_%s.txt", svc), executor.CommandSpec{Name: "journalctl", Args: append(append([]string{}, journalArgs...), "-u", svc), NeedsRoot: true, Timeout: config.TimeoutMedium}, "journalctl", "journal", "services") } @@ -93,15 +93,15 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error if oomRealErr { r.SetFact("oom_event_count", "unavailable") - r.RecordError(fmt.Sprintf("%s: %v", oomSpec.String(), oomResult.Err)) - c.saveCapturedProbe(r, oomPath, oomSpec, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan incomplete") + r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", oomSpec.String(), oomResult.Err), oomPath) + c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan incomplete") } else if oomCount > 0 { r.SetFact("oom_event_count", fmt.Sprintf("%d", oomCount)) r.AddIssue(SeverityCritical, "MEM", fmt.Sprintf("%d OOM killer event(s)", oomCount)) - c.saveCapturedProbe(r, oomPath, oomSpec, 
strings.Join(oom, "\n")+"\n", "", "journalctl", []string{"oom"}) + c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, strings.Join(oom, "\n")+"\n", "", "journalctl", []string{"oom"}) } else { r.SetFact("oom_event_count", "0") - c.saveCapturedProbe(r, oomPath, oomSpec, "No OOM events found\n", "", "journalctl", []string{"oom"}) + c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, "No OOM events found\n", "", "journalctl", []string{"oom"}) } for _, svc := range []string{"docker", "containerd", "nvidia-persistenced", "nvidia-fabricmanager", "kubelet", "NetworkManager", "systemd-networkd", "systemd-resolved"} { diff --git a/customers/vm-troubleshooting/internal/collector/network.go b/customers/vm-troubleshooting/internal/collector/network.go index c7850c5..8ee4c8e 100644 --- a/customers/vm-troubleshooting/internal/collector/network.go +++ b/customers/vm-troubleshooting/internal/collector/network.go @@ -3,8 +3,6 @@ package collector import ( "context" "os" - "path/filepath" - "strings" "github.com/NexGenCloud/vm-diagnostics/internal/config" "github.com/NexGenCloud/vm-diagnostics/internal/executor" @@ -55,24 +53,12 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error for _, dir := range []struct { path string dest string - }{{"/etc/netplan", "network/netplan_configs.txt"}, {"/etc/systemd/network", "network/systemd_network_configs.txt"}, {"/etc/network/interfaces.d", "network/interfaces_d.txt"}} { - entries, err := os.ReadDir(dir.path) - if err != nil { - continue - } - var b strings.Builder - for _, entry := range entries { - data, err := os.ReadFile(filepath.Join(dir.path, entry.Name())) - if err != nil { - continue - } - b.WriteString("=== " + entry.Name() + " ===\n") - b.WriteString(sanitize.SensitiveConfig(string(data))) - if !strings.HasSuffix(b.String(), "\n") { - b.WriteByte('\n') - } - } - c.saveProbeOutput(r, dir.dest, b.String(), "text", "network", "config") + }{ + {"/etc/netplan", "network/netplan_configs.txt"}, + 
{"/etc/systemd/network", "network/systemd_network_configs.txt"}, + {"/etc/network/interfaces.d", "network/interfaces_d.txt"}, + } { + c.saveDirConcat(r, dir.dest, dir.path, sanitize.SensitiveConfig, "network", "config") } for _, spec := range []struct { path string @@ -103,13 +89,7 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error netplanSpec := executor.CommandSpec{Name: "netplan", Args: []string{"get", "--all"}, NeedsRoot: true, Timeout: config.TimeoutMedium} result, stdout, stderr := c.Exec.Capture(ctx, netplanSpec, 2*1024*1024) content := sanitize.SensitiveConfig(string(stdout)) - _ = c.saveCapturedProbe(r, "network/netplan_get.txt", netplanSpec, content, string(stderr), "netplan", []string{"network", "config"}, "Sensitive values have been redacted") - if result.Skipped { - r.RecordSkipped("netplan get --all") - } - if result.Err != nil && !result.Skipped { - r.RecordError("netplan get --all: " + result.Err.Error()) - } + _ = c.saveCapturedProbe(r, "network/netplan_get.txt", netplanSpec, result, content, string(stderr), "netplan", []string{"network", "config"}, "Sensitive values have been redacted") } if data, err := os.ReadFile("/proc/net/vlan/config"); err == nil { c.saveProbeOutput(r, "network/vlan_config.txt", string(data), "procfs", "network") @@ -130,7 +110,7 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error {"network/ufw_status.txt", "ufw", []string{"status", "verbose"}, true, "ufw", []string{"firewall"}}, } { if !c.Exec.CommandExists(spec.name) { - r.RecordSkipped(spec.name + ": unavailable") + r.RecordSkip(SkipCommandUnavailable, spec.name+": unavailable") continue } c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, NeedsRoot: spec.root, Timeout: config.TimeoutMedium}, spec.hint, spec.tags...) 
diff --git a/customers/vm-troubleshooting/internal/collector/nvidia.go b/customers/vm-troubleshooting/internal/collector/nvidia.go index 6f3915b..1d0d6d8 100644 --- a/customers/vm-troubleshooting/internal/collector/nvidia.go +++ b/customers/vm-troubleshooting/internal/collector/nvidia.go @@ -27,28 +27,33 @@ func (c *NvidiaCollector) ID() string { return "nvidia" } func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() if !c.Enabled { - r.RecordSkipped("no NVIDIA hardware detected") + r.RecordSkip(SkipNotApplicable, "no NVIDIA hardware detected") return r, nil } if c.Exec.CommandExists("lspci") { lspciSpec := executor.CommandSpec{Name: "lspci", Timeout: config.TimeoutQuick} - _, out, _ := c.Exec.Capture(ctx, lspciSpec, 512*1024) - var lines []string - hasNVSwitch := false - for _, line := range strings.Split(string(out), "\n") { - lower := strings.ToLower(line) - if strings.Contains(lower, "nvidia") { - lines = append(lines, line) - } - if strings.Contains(lower, "nvswitch") { - hasNVSwitch = true + lspciResult, out, _ := c.Exec.Capture(ctx, lspciSpec, 512*1024) + if lspciResult.Err == nil && !lspciResult.Skipped { + var lines []string + hasNVSwitch := false + for _, line := range strings.Split(string(out), "\n") { + lower := strings.ToLower(line) + if strings.Contains(lower, "nvidia") { + lines = append(lines, line) + } + if strings.Contains(lower, "nvswitch") { + hasNVSwitch = true + } } + r.SetFact("nvswitch_present", fmt.Sprintf("%t", hasNVSwitch)) + c.saveCapturedProbe(r, "nvidia/pci_devices.txt", lspciSpec, lspciResult, strings.Join(lines, "\n")+"\n", "", "lspci", []string{"gpu", "hardware"}) + } else { + r.SetFact("nvswitch_present", "unavailable") + c.saveCapturedProbe(r, "nvidia/pci_devices.txt", lspciSpec, lspciResult, "", "", "lspci", []string{"gpu", "hardware"}) } - r.SetFact("nvswitch_present", fmt.Sprintf("%t", hasNVSwitch)) - c.saveCapturedProbe(r, "nvidia/pci_devices.txt", lspciSpec, strings.Join(lines, 
"\n")+"\n", "", "lspci", []string{"gpu", "hardware"}) } if !c.Exec.CommandExists("nvidia-smi") { - r.RecordSkipped("nvidia-smi unavailable") + r.RecordSkip(SkipCommandUnavailable, "nvidia-smi unavailable") return r, nil } for _, spec := range []struct { @@ -64,17 +69,33 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: "nvidia-smi", Args: spec.args, Timeout: config.TimeoutMedium}, "nvidia-smi", "gpu") } gpuSpec := executor.CommandSpec{Name: "nvidia-smi", Args: []string{"--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"}, Timeout: config.TimeoutMedium} + r.SetFact("gpu_count", "unavailable") + r.SetFact("gpu_model", "unavailable") + r.SetFact("driver_version", "unavailable") gpuResult, gpuCsv, gpuErr := c.Exec.Capture(ctx, gpuSpec, 512*1024) - c.saveCapturedProbe(r, "nvidia/gpu_summary.csv", gpuSpec, string(gpuCsv), string(gpuErr), "nvidia-smi-csv", []string{"gpu"}) + c.saveCapturedProbe(r, "nvidia/gpu_summary.txt", gpuSpec, gpuResult, string(gpuCsv), string(gpuErr), "nvidia-smi-csv", []string{"gpu"}) if gpuResult.Err != nil && !gpuResult.Skipped { - r.RecordError(fmt.Sprintf("%s: %v", gpuSpec.String(), gpuResult.Err)) + r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", gpuSpec.String(), gpuResult.Err), "nvidia/gpu_summary.txt") } - rows, _ := csv.NewReader(strings.NewReader(string(gpuCsv))).ReadAll() - if len(rows) > 0 { - r.SetFact("gpu_count", fmt.Sprintf("%d", len(rows))) - r.SetFact("gpu_model", strings.TrimSpace(rows[0][0])) - if len(rows[0]) > 2 { - r.SetFact("driver_version", strings.TrimSpace(rows[0][2])) + if gpuResult.Err == nil && !gpuResult.Skipped { + rows, parseErr := csv.NewReader(strings.NewReader(string(gpuCsv))).ReadAll() + if parseErr == nil && len(rows) > 0 { + validShape := true + for _, row := range rows { + if len(row) < 3 { + validShape = false + break + } + } + if validShape { + r.SetFact("gpu_count", 
fmt.Sprintf("%d", len(rows))) + if model := strings.TrimSpace(rows[0][0]); model != "" { + r.SetFact("gpu_model", model) + } + if driver := strings.TrimSpace(rows[0][2]); driver != "" { + r.SetFact("driver_version", driver) + } + } } } // Capture raw dmesg for the archive. Xid classification is handled by the @@ -83,8 +104,8 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) dmesgSpec := executor.CommandSpec{Name: "dmesg", NeedsRoot: true, Timeout: config.TimeoutMedium} dmesgResult, dmesg, dmesgErr := c.Exec.Capture(ctx, dmesgSpec, 2*1024*1024) if dmesgResult.Skipped { - r.RecordSkipped("dmesg unavailable for Xid scan") - c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, string(dmesg), string(dmesgErr), "dmesg", []string{"gpu", "gpu-errors"}, "Xid scan unavailable") + r.RecordSkipForArtifact(SkipPermissionOrAccess, "dmesg requires root for Xid scan", "nvidia/xid_errors.txt") + c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, dmesgResult, string(dmesg), string(dmesgErr), "dmesg", []string{"gpu", "gpu-errors"}, "Xid scan unavailable") } else { var xid []string for _, line := range strings.Split(string(dmesg), "\n") { @@ -98,12 +119,12 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) } } if dmesgResult.Err != nil { - r.RecordError(fmt.Sprintf("%s: %v", dmesgSpec.String(), dmesgResult.Err)) - c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, string(dmesg), string(dmesgErr), "dmesg", []string{"gpu", "gpu-errors"}, "Xid scan incomplete") + r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", dmesgSpec.String(), dmesgResult.Err), "nvidia/xid_errors.txt") + c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, dmesgResult, string(dmesg), string(dmesgErr), "dmesg", []string{"gpu", "gpu-errors"}, "Xid scan incomplete") } else if len(xid) > 0 { - c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, strings.Join(xid, "\n")+"\n", "", "dmesg", []string{"gpu", 
"gpu-errors"}) + c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, dmesgResult, strings.Join(xid, "\n")+"\n", "", "dmesg", []string{"gpu", "gpu-errors"}) } else { - c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, "No Xid/GPU errors found\n", "", "dmesg", []string{"gpu", "gpu-errors"}) + c.saveCapturedProbe(r, "nvidia/xid_errors.txt", dmesgSpec, dmesgResult, "No Xid/GPU errors found\n", "", "dmesg", []string{"gpu", "gpu-errors"}) } } for _, args := range [][]string{ @@ -112,7 +133,12 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) {"--query-gpu=index,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max", "--format=csv"}, {"--query-gpu=index,power.draw,power.limit,temperature.gpu", "--format=csv"}, } { - name := strings.TrimPrefix(strings.Split(args[0], "=")[1], "index,") + // Derive filename from the query parameter (e.g. "--query-gpu=index,power.draw" → "power_draw") + query := args[0] + if _, after, ok := strings.Cut(query, "="); ok { + query = after + } + name := strings.TrimPrefix(query, "index,") file := strings.ReplaceAll(strings.ReplaceAll(name, ",", "_"), ".", "_") c.saveCommand(ctx, r, "nvidia/"+file+".txt", executor.CommandSpec{Name: "nvidia-smi", Args: args, Timeout: config.TimeoutMedium}, "nvidia-smi-csv", "gpu") } diff --git a/customers/vm-troubleshooting/internal/collector/packages.go b/customers/vm-troubleshooting/internal/collector/packages.go index 194da55..0fccf4a 100644 --- a/customers/vm-troubleshooting/internal/collector/packages.go +++ b/customers/vm-troubleshooting/internal/collector/packages.go @@ -3,8 +3,6 @@ package collector import ( "context" "os" - "path/filepath" - "strings" "github.com/NexGenCloud/vm-diagnostics/internal/config" "github.com/NexGenCloud/vm-diagnostics/internal/executor" @@ -54,36 +52,7 @@ func (c *PackagesCollector) Collect(ctx context.Context) (*CollectorResult, erro switch c.Distro.Family { case "debian": c.saveFile(r, 
"packages/apt_sources.txt", "/etc/apt/sources.list", nil, "packages", "config") - if entries, err := os.ReadDir("/etc/apt/sources.list.d"); err == nil { - var b strings.Builder - for _, entry := range entries { - if entry.IsDir() { - continue - } - data, err := os.ReadFile(filepath.Join("/etc/apt/sources.list.d", entry.Name())) - if err != nil { - continue - } - b.WriteString("=== " + entry.Name() + " ===\n") - b.Write(data) - if len(data) == 0 || data[len(data)-1] != '\n' { - b.WriteByte('\n') - } - } - if b.Len() > 0 { - path := "packages/apt_sources_d.txt" - if err := ValidateTagsAndHint("text", []string{"packages", "config"}); err != nil { - r.RecordError(path + ": " + err.Error()) - } else if reserveErr := c.Writer.ReservePath(path); reserveErr != nil { - r.RecordError(path + ": " + reserveErr.Error()) - } else if writeErr := c.Writer.SaveReadFile(path, "/etc/apt/sources.list.d", b.String(), false); writeErr != nil { - c.Writer.ReleasePath(path) - r.RecordError(path + ": " + writeErr.Error()) - } else { - r.AddFileArtifact(path, "/etc/apt/sources.list.d", false, "packages", "config") - } - } - } + c.saveDirConcat(r, "packages/apt_sources_d.txt", "/etc/apt/sources.list.d", nil, "packages", "config") if _, err := os.Stat("/var/log/dpkg.log"); err == nil { c.saveCommand(ctx, r, "packages/dpkg_recent.txt", executor.CommandSpec{Name: "sh", Args: []string{"-c", `grep -E "(install|upgrade)" /var/log/dpkg.log | tail -100`}, Timeout: config.TimeoutMedium, IgnoreExit: true}, "sh", "packages") } diff --git a/customers/vm-troubleshooting/internal/collector/services.go b/customers/vm-troubleshooting/internal/collector/services.go index f93cf01..15c605d 100644 --- a/customers/vm-troubleshooting/internal/collector/services.go +++ b/customers/vm-troubleshooting/internal/collector/services.go @@ -60,11 +60,14 @@ func (c *ServicesCollector) Collect(ctx context.Context) (*CollectorResult, erro Timeout: config.TimeoutQuick, } result, stdout, _ := c.Exec.Capture(ctx, spec, 
512*1024) - c.saveCapturedProbe(r, "services/failed_services.txt", spec, + c.saveCapturedProbe(r, "services/failed_services.txt", spec, result, string(stdout), "", "systemctl", []string{"services"}) - if result.Err != nil && result.ExitCode != 0 { - r.RecordError("failed to enumerate failed services: " + result.Err.Error()) + if result.Skipped { + r.RecordSkipForArtifact(SkipPermissionOrAccess, "systemctl failed-services scan unavailable", "services/failed_services.txt") + r.SetFact("failed_service_count", "unavailable") + } else if result.Err != nil && result.ExitCode != 0 { + r.RecordErrorForArtifact(ErrEnumerationFailed, "failed to enumerate failed services: "+result.Err.Error(), "services/failed_services.txt") r.SetFact("failed_service_count", "unavailable") } else { var shellFailedNames []string @@ -147,7 +150,7 @@ func (c *ServicesCollector) resolveExisting(ctx context.Context, names []string, Name: "systemctl", Args: []string{"list-unit-files", "--type=service", "--no-pager"}, Timeout: config.TimeoutQuick, }, 1*1024*1024) - if result.Err != nil { + if result.Skipped || result.Err != nil { return nil } unitFiles := probe.ParseUnitFileList(string(stdout)) diff --git a/customers/vm-troubleshooting/internal/collector/system.go b/customers/vm-troubleshooting/internal/collector/system.go index 64b8ece..2e3b5f0 100644 --- a/customers/vm-troubleshooting/internal/collector/system.go +++ b/customers/vm-troubleshooting/internal/collector/system.go @@ -52,7 +52,7 @@ func (c *SystemCollector) Collect(ctx context.Context) (*CollectorResult, error) {"hardware/block_devices.txt", executor.CommandSpec{Name: "lsblk", Args: []string{"-o", "NAME,SIZE,TYPE,FSTYPE,MOUNTPOINT,MODEL"}, Timeout: config.TimeoutQuick}, "lsblk", []string{"disk", "hardware"}}, } { if !c.Exec.CommandExists(spec.cmd.Name) { - r.RecordSkipped(spec.cmd.Name + ": unavailable") + r.RecordSkip(SkipCommandUnavailable, spec.cmd.Name+": unavailable") continue } c.saveCommand(ctx, r, spec.path, spec.cmd, 
spec.hint, spec.tags...) @@ -85,11 +85,11 @@ func (c *SystemCollector) Collect(ctx context.Context) (*CollectorResult, error) func (c *SystemCollector) saveProcess(ctx context.Context, r *CollectorResult, path string, spec executor.CommandSpec) { if err := ValidateTagsAndHint("ps", []string{"processes"}); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", path, err), path) return } if err := c.Writer.ReservePath(path); err != nil { - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", path, err), path) return } result, stdout, stderr := c.Exec.Capture(ctx, spec, 10*1024*1024) @@ -99,14 +99,19 @@ func (c *SystemCollector) saveProcess(ctx context.Context, r *CollectorResult, p } if err := c.Writer.SaveCapturedCommand(path, spec, strings.TrimSpace(content)+"\n", "", "Potential secrets in command arguments have been redacted"); err != nil { c.Writer.ReleasePath(path) - r.RecordError(fmt.Sprintf("%s: %v", path, err)) + r.RecordErrorForArtifact(ErrArtifactWrite, fmt.Sprintf("%s: %v", path, err), path) return } - r.AddCommandArtifact(path, spec.String(), result.ExitCode, "ok", false, result.TimedOut, result.Truncated, result.Duration, "ps", "processes") + status := "ok" if result.Skipped { - r.RecordSkipped(spec.String()) - } - if result.Err != nil && !result.Skipped { - r.RecordError(fmt.Sprintf("%s: %v", spec.String(), result.Err)) + status = "skipped" + r.RecordSkipForArtifact(SkipPermissionOrAccess, spec.String(), path) + } else if result.TimedOut { + status = "error" + r.RecordErrorForArtifact(ErrCommandTimedOut, fmt.Sprintf("%s: timed out", spec.String()), path) + } else if result.Err != nil { + status = "error" + r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", spec.String(), result.Err), path) } + r.AddCommandArtifact(path, spec.String(), result.ExitCode, status, false, result.TimedOut, 
result.Truncated, result.Duration, "ps", "processes") } diff --git a/customers/vm-troubleshooting/internal/install/dcgm.go b/customers/vm-troubleshooting/internal/install/dcgm.go index 40b1b63..f56efa4 100644 --- a/customers/vm-troubleshooting/internal/install/dcgm.go +++ b/customers/vm-troubleshooting/internal/install/dcgm.go @@ -53,6 +53,40 @@ func prodDeps() installDeps { } } +// EnableDCGMDaemon prompts the user to enable and start the nvidia-dcgm service +// when DCGM is installed but the daemon is not running. Returns true if the +// daemon is running after the call (either started successfully, or the caller +// should not gate on it — e.g. no systemctl). +func EnableDCGMDaemon(ctx context.Context, exec executor.Executor, u ui.UI) bool { + if !exec.CommandExists("systemctl") { + return false + } + if !u.IsInteractive() || !exec.HasRoot() { + return false + } + if !u.PromptYesNo("DCGM daemon is not running. Start it to collect GPU health data?") { + return false + } + sp := u.StartSpinner("Starting nvidia-dcgm service...") + res, _, stderr := exec.Capture(ctx, executor.CommandSpec{ + Name: "systemctl", + Args: []string{"--now", "enable", "nvidia-dcgm"}, + NeedsRoot: true, + Timeout: config.TimeoutQuick, + }, 128*1024) + if res.Err != nil { + sp.Fail(fmt.Sprintf("failed to start nvidia-dcgm: %s", strings.TrimSpace(string(stderr)))) + return false + } + // Verify via D-Bus that daemon is actually active now. + if !platform.IsDCGMDaemonActive(ctx) { + sp.Fail("nvidia-dcgm service started but daemon not responding") + return false + } + sp.Success("nvidia-dcgm service started") + return true +} + // PromptAndInstallDCGM is the entry point called from the runner. // It gates on interactivity, root, supported OS, and WSL before prompting. 
func PromptAndInstallDCGM(ctx context.Context, exec executor.Executor, u ui.UI, distro platform.DistroInfo) error { diff --git a/customers/vm-troubleshooting/internal/output/archive.go b/customers/vm-troubleshooting/internal/output/archive.go index 2e5b5b6..9fc6882 100644 --- a/customers/vm-troubleshooting/internal/output/archive.go +++ b/customers/vm-troubleshooting/internal/output/archive.go @@ -3,24 +3,38 @@ package output import ( "archive/tar" "compress/gzip" + "errors" "io" + "io/fs" "os" "path/filepath" ) -func CreateArchive(srcDir, destArchive string) error { +func CreateArchive(srcDir, destArchive string) (retErr error) { out, err := os.Create(destArchive) if err != nil { return err } - defer out.Close() + defer func() { + retErr = errors.Join(retErr, out.Close()) + }() + gzw := gzip.NewWriter(out) - defer gzw.Close() + defer func() { + retErr = errors.Join(retErr, gzw.Close()) + }() + tw := tar.NewWriter(gzw) - defer tw.Close() + defer func() { + retErr = errors.Join(retErr, tw.Close()) + }() + parent := filepath.Dir(srcDir) - base := filepath.Base(srcDir) - return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error { + return filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + info, err := d.Info() if err != nil { return err } @@ -36,15 +50,15 @@ func CreateArchive(srcDir, destArchive string) error { if err := tw.WriteHeader(header); err != nil { return err } - if info.IsDir() || rel == base { + if d.IsDir() { return nil } f, err := os.Open(path) if err != nil { return err } - defer f.Close() - _, err = io.Copy(tw, f) - return err + _, copyErr := io.Copy(tw, f) + closeErr := f.Close() + return errors.Join(copyErr, closeErr) }) } diff --git a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go index e02e747..2c41862 100644 --- 
a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go +++ b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go @@ -21,7 +21,7 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "1.1.0", + SchemaVersion: "2.0.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", @@ -69,7 +69,7 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { t.Fatalf("extractArchive failed: %v", err) } archiveRoot := filepath.Join(extracted, filepath.Base(root)) - for _, path := range []string{"schemas/manifest.schema.json", "schemas/report-record.schema.json", "manifest.json", "report.ndjson"} { + for _, path := range []string{"schemas/manifest.schema.json", "schemas/report-record.schema.json", "schemas/triage-result.schema.json", "manifest.json", "report.ndjson"} { if _, err := os.Stat(filepath.Join(archiveRoot, path)); err != nil { t.Fatalf("expected %s in archive: %v", path, err) } diff --git a/customers/vm-troubleshooting/internal/output/contract_test.go b/customers/vm-troubleshooting/internal/output/contract_test.go new file mode 100644 index 0000000..420146b --- /dev/null +++ b/customers/vm-troubleshooting/internal/output/contract_test.go @@ -0,0 +1,334 @@ +package output_test + +import ( + "encoding/json" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + schemaassets "github.com/NexGenCloud/vm-diagnostics/schemas" +) + +// TestSchemaTagEnumsMatchGoVocabulary ensures schema tag enums stay in sync with +// the Go ValidTags map. This prevents the drift that Phase 1A fixed from recurring. 
+func TestSchemaTagEnumsMatchGoVocabulary(t *testing.T) { + t.Parallel() + // Manifest schema: tags at $defs.artifact.properties.tags.items.enum + manifestTags := extractSchemaEnumAtPath(t, "manifest.schema.json", "$defs", "artifact", "properties", "tags", "items", "enum") + checkEnumMatchesMap(t, "manifest.schema.json tags", manifestTags, collector.ValidTags) + + // Report schema: tags inside oneOf[0] (artifact branch) at properties.tags.items.enum + reportTags := extractReportTagEnum(t) + checkEnumMatchesMap(t, "report-record.schema.json tags", reportTags, collector.ValidTags) +} + +func checkEnumMatchesMap(t *testing.T, label string, schemaEnum []string, goMap map[string]bool) { + t.Helper() + for _, val := range schemaEnum { + if !goMap[val] { + t.Errorf("%s has %q not in Go map", label, val) + } + } + for goVal := range goMap { + found := false + for _, val := range schemaEnum { + if val == goVal { + found = true + break + } + } + if !found { + t.Errorf("Go map has %q not in %s", goVal, label) + } + } +} + +// extractReportTagEnum extracts tags from the report schema's oneOf[0] (artifact branch). 
+func extractReportTagEnum(t *testing.T) []string { + t.Helper() + data, err := schemaassets.FS.ReadFile("report-record.schema.json") + if err != nil { + t.Fatalf("reading report schema: %v", err) + } + var doc map[string]any + if err := json.Unmarshal(data, &doc); err != nil { + t.Fatalf("parsing report schema: %v", err) + } + oneOf, ok := doc["oneOf"].([]any) + if !ok || len(oneOf) == 0 { + t.Fatal("report schema: missing oneOf") + } + // First branch is the artifact branch + branch, ok := oneOf[0].(map[string]any) + if !ok { + t.Fatal("report schema: oneOf[0] not an object") + } + props, ok := branch["properties"].(map[string]any) + if !ok { + t.Fatal("report schema: oneOf[0].properties not an object") + } + tags, ok := props["tags"].(map[string]any) + if !ok { + t.Fatal("report schema: tags not an object") + } + items, ok := tags["items"].(map[string]any) + if !ok { + t.Fatal("report schema: tags.items not an object") + } + enumRaw, ok := items["enum"].([]any) + if !ok { + t.Fatal("report schema: tags.items.enum not an array") + } + result := make([]string, len(enumRaw)) + for i, v := range enumRaw { + result[i], ok = v.(string) + if !ok { + t.Fatalf("report schema: tags.items.enum[%d] not a string", i) + } + } + return result +} + +// TestSchemaParserHintEnumsMatchGoVocabulary ensures parser_hint enums stay in sync. +func TestSchemaParserHintEnumsMatchGoVocabulary(t *testing.T) { + t.Parallel() + hints := extractSchemaParserHintEnum(t, "manifest.schema.json") + for _, hint := range hints { + if !collector.ValidParserHints[hint] { + t.Errorf("schema has parser_hint %q not in Go ValidParserHints", hint) + } + } + for goHint := range collector.ValidParserHints { + found := false + for _, hint := range hints { + if hint == goHint { + found = true + break + } + } + if !found { + t.Errorf("Go ValidParserHints has hint %q not in manifest schema", goHint) + } + } +} + +// TestSchemaSkipReasonEnumsMatchGoConstants ensures skip reason codes stay in sync. 
+func TestSchemaSkipReasonEnumsMatchGoConstants(t *testing.T) { + t.Parallel() + goSkipCodes := map[string]bool{ + string(collector.SkipDisabledByFlag): true, + string(collector.SkipCommandUnavailable): true, + string(collector.SkipSourceUnavailable): true, + string(collector.SkipNotApplicable): true, + string(collector.SkipPermissionOrAccess): true, + string(collector.SkipDaemonUnavailable): true, + } + schemaCodes := extractSchemaEnumAtPath(t, "manifest.schema.json", "$defs", "skip_reason", "properties", "reason", "enum") + for _, code := range schemaCodes { + if !goSkipCodes[code] { + t.Errorf("schema has skip reason %q not in Go SkipCode constants", code) + } + } + for code := range goSkipCodes { + found := false + for _, sc := range schemaCodes { + if sc == code { + found = true + break + } + } + if !found { + t.Errorf("Go SkipCode %q not in manifest schema skip_reason enum", code) + } + } +} + +// TestSchemaErrorCodeEnumsMatchGoConstants ensures error codes stay in sync. +func TestSchemaErrorCodeEnumsMatchGoConstants(t *testing.T) { + t.Parallel() + goErrorCodes := map[string]bool{ + string(collector.ErrCommandFailed): true, + string(collector.ErrCommandTimedOut): true, + string(collector.ErrProbeFailed): true, + string(collector.ErrArtifactValidation): true, + string(collector.ErrArtifactReserve): true, + string(collector.ErrArtifactWrite): true, + string(collector.ErrEnumerationFailed): true, + } + schemaCodes := extractSchemaEnumAtPath(t, "manifest.schema.json", "$defs", "structured_error", "properties", "code", "enum") + for _, code := range schemaCodes { + if !goErrorCodes[code] { + t.Errorf("schema has error code %q not in Go ErrorCode constants", code) + } + } + for code := range goErrorCodes { + found := false + for _, sc := range schemaCodes { + if sc == code { + found = true + break + } + } + if !found { + t.Errorf("Go ErrorCode %q not in manifest schema structured_error enum", code) + } + } +} + +// TestFactTypingConsistency ensures ConvertFacts 
produces consistent types. +func TestFactTypingConsistency(t *testing.T) { + t.Parallel() + facts := map[string]string{ + "cpu_cores": "8", + "gpu_count": "2", + "xid_classified_count": "3", + "firewall_posture": "restrictive", + "oom_event_count": "unavailable", + } + result := output.ConvertFacts(facts) + // Integer keys should be numbers + if v, ok := result["cpu_cores"].(int64); !ok || v != 8 { + t.Errorf("cpu_cores should be int64(8), got %T(%v)", result["cpu_cores"], result["cpu_cores"]) + } + if v, ok := result["gpu_count"].(int64); !ok || v != 2 { + t.Errorf("gpu_count should be int64(2), got %T(%v)", result["gpu_count"], result["gpu_count"]) + } + // String keys should stay strings + if v, ok := result["firewall_posture"].(string); !ok || v != "restrictive" { + t.Errorf("firewall_posture should be string, got %T(%v)", result["firewall_posture"], result["firewall_posture"]) + } + // "unavailable" → null + if result["oom_event_count"] != nil { + t.Errorf("oom_event_count with 'unavailable' should be nil, got %T(%v)", result["oom_event_count"], result["oom_event_count"]) + } +} + +// TestStructuredErrorSerialization verifies structured errors roundtrip correctly. +func TestStructuredErrorSerialization(t *testing.T) { + t.Parallel() + err := output.ManifestStructuredError{ + Code: "command_failed", + Message: "nvidia-smi: exit status 1", + ArtifactPath: "nvidia/nvidia_smi.txt", + } + data, jsonErr := json.Marshal(err) + if jsonErr != nil { + t.Fatal(jsonErr) + } + var decoded output.ManifestStructuredError + if jsonErr := json.Unmarshal(data, &decoded); jsonErr != nil { + t.Fatal(jsonErr) + } + if decoded.Code != err.Code || decoded.Message != err.Message || decoded.ArtifactPath != err.ArtifactPath { + t.Errorf("roundtrip mismatch: got %+v", decoded) + } +} + +// TestStructuredSkipSerialization verifies skip reasons roundtrip correctly. 
+func TestStructuredSkipSerialization(t *testing.T) { + t.Parallel() + skip := output.ManifestSkipReason{ + Reason: "command_unavailable", + Detail: "nvidia-smi unavailable", + ArtifactPath: "nvidia/nvidia_smi.txt", + } + data, err := json.Marshal(skip) + if err != nil { + t.Fatal(err) + } + var decoded output.ManifestSkipReason + if err := json.Unmarshal(data, &decoded); err != nil { + t.Fatal(err) + } + if decoded.Reason != skip.Reason || decoded.Detail != skip.Detail || decoded.ArtifactPath != skip.ArtifactPath { + t.Errorf("roundtrip mismatch: got %+v", decoded) + } +} + +// TestSchemaCollectorStatusEnumMatchesCode ensures the status enum in the manifest +// schema stays in sync with the collector status values produced in runner.go. +func TestSchemaCollectorStatusEnumMatchesCode(t *testing.T) { + t.Parallel() + goStatuses := map[string]bool{"ok": true, "partial": true, "failed": true, "skipped": true} + schemaStatuses := extractSchemaEnumAtPath(t, "manifest.schema.json", "$defs", "collector_summary", "properties", "status", "enum") + checkEnumMatchesMap(t, "manifest collector status", schemaStatuses, goStatuses) +} + +// TestTriageResultSchemaIsLoadable verifies the triage result schema exists and is valid JSON. 
+func TestTriageResultSchemaIsLoadable(t *testing.T) { + t.Parallel() + data, err := schemaassets.FS.ReadFile("triage-result.schema.json") + if err != nil { + t.Fatalf("missing triage-result.schema.json: %v", err) + } + var doc map[string]any + if err := json.Unmarshal(data, &doc); err != nil { + t.Fatalf("invalid JSON in triage-result.schema.json: %v", err) + } + // Verify key fields + if doc["$id"] == nil { + t.Error("triage-result.schema.json missing $id") + } + defs, ok := doc["$defs"].(map[string]any) + if !ok { + t.Fatal("triage-result.schema.json missing $defs") + } + if _, ok := defs["finding"]; !ok { + t.Error("triage-result.schema.json missing $defs.finding") + } +} + +// --- helpers --- + +func extractSchemaParserHintEnum(t *testing.T, schemaFile string) []string { + t.Helper() + return extractSchemaEnumAtPath(t, schemaFile, "$defs", "artifact", "properties", "parser_hint", "enum") +} + +// extractSchemaEnumAtPath navigates a JSON schema by a sequence of map keys, +// where the last key must point to a JSON array of strings. 
+func extractSchemaEnumAtPath(t *testing.T, schemaFile string, keys ...string) []string { + t.Helper() + data, err := schemaassets.FS.ReadFile(schemaFile) + if err != nil { + t.Fatalf("reading schema %s: %v", schemaFile, err) + } + var doc map[string]any + if err := json.Unmarshal(data, &doc); err != nil { + t.Fatalf("parsing schema %s: %v", schemaFile, err) + } + + var current any = doc + for i, key := range keys { + if key == "enum" { + // last key — extract the array + arr, ok := current.(map[string]any)[key] + if !ok { + t.Fatalf("schema %s: key %q not found at depth %d", schemaFile, key, i) + } + items, ok := arr.([]any) + if !ok { + t.Fatalf("schema %s: expected array at %q, got %T", schemaFile, key, arr) + } + result := make([]string, len(items)) + for j, item := range items { + s, ok := item.(string) + if !ok { + t.Fatalf("schema %s: expected string in array at %q[%d], got %T", schemaFile, key, j, item) + } + result[j] = s + } + return result + } + m, ok := current.(map[string]any) + if !ok { + t.Fatalf("schema %s: expected object at depth %d for key %q, got %T", schemaFile, i, key, current) + } + current, ok = m[key] + if !ok { + t.Fatalf("schema %s: key %q not found at depth %d", schemaFile, key, i) + } + } + t.Fatalf("schema %s: path did not end with 'enum'", schemaFile) + return nil +} diff --git a/customers/vm-troubleshooting/internal/output/manifest.go b/customers/vm-troubleshooting/internal/output/manifest.go index 5fb4cfc..30fe729 100644 --- a/customers/vm-troubleshooting/internal/output/manifest.go +++ b/customers/vm-troubleshooting/internal/output/manifest.go @@ -34,13 +34,13 @@ type ManifestArtifact struct { Type string `json:"type"` Command string `json:"command,omitempty"` Source string `json:"source,omitempty"` - ExitCode int `json:"exit_code,omitempty"` + ExitCode int `json:"exit_code"` Status string `json:"status"` - IgnoredExit bool `json:"ignored_exit,omitempty"` - TimedOut bool `json:"timed_out,omitempty"` + IgnoredExit bool 
`json:"ignored_exit"` + TimedOut bool `json:"timed_out"` Sanitized bool `json:"sanitized,omitempty"` - Truncated bool `json:"truncated,omitempty"` - DurationMS int64 `json:"duration_ms,omitempty"` + Truncated bool `json:"truncated"` + DurationMS int64 `json:"duration_ms"` SizeBytes int64 `json:"size_bytes"` SHA256 string `json:"sha256"` ContentType string `json:"content_type"` @@ -56,17 +56,31 @@ type ManifestIssueMeta struct { Hidden bool `json:"hidden,omitempty"` } +// ManifestSkipReason is the JSON representation of a skip reason. +type ManifestSkipReason struct { + Reason string `json:"reason"` + Detail string `json:"detail"` + ArtifactPath string `json:"artifact_path,omitempty"` +} + +// ManifestStructuredError is the JSON representation of a structured error. +type ManifestStructuredError struct { + Code string `json:"code"` + Message string `json:"message"` + ArtifactPath string `json:"artifact_path,omitempty"` +} + // ManifestCollector is the per-collector summary in the manifest. type ManifestCollector struct { - Status string `json:"status"` - DurationMS int64 `json:"duration_ms"` - ArtifactCount int `json:"artifact_count"` - SkippedCount int `json:"skipped_count"` - ErrorCount int `json:"error_count"` - Facts map[string]any `json:"facts"` - Issues []ManifestIssueMeta `json:"issues"` - Skipped []string `json:"skipped"` - Errors []string `json:"errors"` + Status string `json:"status"` + DurationMS int64 `json:"duration_ms"` + ArtifactCount int `json:"artifact_count"` + SkippedCount int `json:"skipped_count"` + ErrorCount int `json:"error_count"` + Facts map[string]any `json:"facts"` + Issues []ManifestIssueMeta `json:"issues"` + SkipReasons []ManifestSkipReason `json:"skip_reasons"` + Errors []ManifestStructuredError `json:"structured_errors"` } // ManifestJSON is the top-level manifest structure. 
@@ -91,8 +105,10 @@ var integerFactKeys = map[string]bool{ "xid_classified_count": true, "critical_event_count": true, "oom_event_count": true, } -// convertFacts converts string facts to typed JSON values per the explicit allowlist. -func convertFacts(facts map[string]string) map[string]any { +// ConvertFacts converts string facts to typed JSON values per the explicit allowlist. +// Integer-keyed facts become JSON numbers; "unavailable" or "" become null. +// Use this for manifest, report, and triage JSON to ensure consistent typing. +func ConvertFacts(facts map[string]string) map[string]any { out := make(map[string]any, len(facts)) keys := make([]string, 0, len(facts)) for k := range facts { @@ -116,23 +132,6 @@ func convertFacts(facts map[string]string) map[string]any { return out } -// ArtifactLike is the interface that collector results implement for artifact records. -type ArtifactLike interface { - GetPath() string - GetType() string - GetCommand() string - GetSource() string - GetExitCode() int - GetStatus() string - GetIgnoredExit() bool - GetTimedOut() bool - GetSanitized() bool - GetTruncated() bool - GetTags() []string - GetDuration() time.Duration - GetParserHint() string -} - // fileInfo computes sha256 and size for a file in the archive work directory. func fileInfo(root, path string) (size int64, hash string) { fullPath := filepath.Join(root, cleanRelativePath(path)) @@ -149,20 +148,24 @@ func fileInfo(root, path string) (size int64, hash string) { return n, hex.EncodeToString(h.Sum(nil)) } -func contentTypeForArtifact(path, hint string) string { +func contentTypeForArtifact(path, hint, artifactType string) string { if hint == "binary" { return "application/octet-stream" } - switch strings.ToLower(filepath.Ext(path)) { - case ".json": - return "application/json" - case ".ndjson": - return "application/x-ndjson" - case ".csv": - return "text/csv" - default: - return "text/plain" + // Only claim machine-parseable content types for probe artifacts. 
+ // Command artifacts get headers prepended by SaveCapturedCommand, + // making them invalid JSON/CSV on disk. + if artifactType == "probe" { + switch strings.ToLower(filepath.Ext(path)) { + case ".json": + return "application/json" + case ".ndjson": + return "application/x-ndjson" + case ".csv": + return "text/csv" + } } + return "text/plain" } func stageSchemas(w *Writer) error { @@ -215,9 +218,7 @@ func WriteManifestFromResults(w *Writer, meta ManifestMeta, collectorIDs []strin ArtifactCount: len(ri.Artifacts), SkippedCount: len(ri.Skipped), ErrorCount: len(ri.Errors), - Facts: convertFacts(ri.Facts), - Skipped: ri.Skipped, - Errors: ri.Errors, + Facts: ConvertFacts(ri.Facts), } for _, issue := range ri.Issues { mc.Issues = append(mc.Issues, ManifestIssueMeta{ @@ -227,14 +228,28 @@ func WriteManifestFromResults(w *Writer, meta ManifestMeta, collectorIDs []strin Hidden: issue.Hidden, }) } + for _, s := range ri.Skipped { + mc.SkipReasons = append(mc.SkipReasons, ManifestSkipReason{ + Reason: s.Reason, + Detail: s.Detail, + ArtifactPath: s.ArtifactPath, + }) + } + for _, e := range ri.Errors { + mc.Errors = append(mc.Errors, ManifestStructuredError{ + Code: e.Code, + Message: e.Message, + ArtifactPath: e.ArtifactPath, + }) + } if mc.Issues == nil { mc.Issues = []ManifestIssueMeta{} } - if mc.Skipped == nil { - mc.Skipped = []string{} + if mc.SkipReasons == nil { + mc.SkipReasons = []ManifestSkipReason{} } if mc.Errors == nil { - mc.Errors = []string{} + mc.Errors = []ManifestStructuredError{} } for _, a := range ri.Artifacts { sz, h := fileInfo(w.Root(), a.Path) @@ -253,7 +268,7 @@ func WriteManifestFromResults(w *Writer, meta ManifestMeta, collectorIDs []strin DurationMS: a.DurationMS, SizeBytes: sz, SHA256: h, - ContentType: contentTypeForArtifact(a.Path, a.ParserHint), + ContentType: contentTypeForArtifact(a.Path, a.ParserHint, a.Type), ParserHint: a.ParserHint, Tags: a.Tags, } @@ -272,6 +287,20 @@ func WriteManifestFromResults(w *Writer, meta ManifestMeta, 
collectorIDs []strin return w.SaveOutput("manifest.json", string(data)+"\n") } +// ManifestSkipReasonInput mirrors collector.SkipReason for the manifest writer bridge. +type ManifestSkipReasonInput struct { + Reason string + Detail string + ArtifactPath string +} + +// ManifestStructuredErrorInput mirrors collector.StructuredError for the manifest writer bridge. +type ManifestStructuredErrorInput struct { + Code string + Message string + ArtifactPath string +} + // ManifestResultInput is a simple struct to pass collector results to the manifest writer // without importing the collector package (avoids import cycle). type ManifestResultInput struct { @@ -280,8 +309,8 @@ type ManifestResultInput struct { Artifacts []ManifestArtifactInput Facts map[string]string Issues []ManifestIssueInput - Skipped []string - Errors []string + Skipped []ManifestSkipReasonInput + Errors []ManifestStructuredErrorInput } type ManifestArtifactInput struct { @@ -314,7 +343,8 @@ func BuildManifestInput( artifacts []ManifestArtifactInput, facts map[string]string, issues []ManifestIssueInput, - skipped, errors []string, + skipped []ManifestSkipReasonInput, + errors []ManifestStructuredErrorInput, ) ManifestResultInput { return ManifestResultInput{ Status: status, diff --git a/customers/vm-troubleshooting/internal/output/manifest_test.go b/customers/vm-troubleshooting/internal/output/manifest_test.go index 9789f20..34cf74a 100644 --- a/customers/vm-troubleshooting/internal/output/manifest_test.go +++ b/customers/vm-troubleshooting/internal/output/manifest_test.go @@ -37,7 +37,7 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "1.1.0", + SchemaVersion: "2.0.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", diff --git a/customers/vm-troubleshooting/internal/output/report.go b/customers/vm-troubleshooting/internal/output/report.go index 88e0004..6c34377 100644 --- 
a/customers/vm-troubleshooting/internal/output/report.go +++ b/customers/vm-troubleshooting/internal/output/report.go @@ -4,7 +4,6 @@ import ( "encoding/json" "os" "sort" - "strconv" "time" ) @@ -18,10 +17,13 @@ type ReportRecord struct { // type=artifact fields Path string `json:"path,omitempty"` Command string `json:"command,omitempty"` - ExitCode int `json:"exit_code,omitempty"` + ExitCode *int `json:"exit_code,omitempty"` Status string `json:"status,omitempty"` Tags []string `json:"tags,omitempty"` - DurationMS int64 `json:"duration_ms,omitempty"` + DurationMS *int64 `json:"duration_ms,omitempty"` + + // Note: exit_code/duration_ms use pointers so artifact and summary records + // can explicitly emit zero values, while non-artifact record types omit them. // type=issue fields Severity string `json:"severity,omitempty"` @@ -37,7 +39,7 @@ type ReportRecord struct { ArtifactCount int `json:"artifact_count,omitempty"` } -const reportSchemaVersion = "1.1.0" +const reportSchemaVersion = "2.0.0" // WriteReport writes report.ndjson from manifest input data. // Order is deterministic: per collector (registration order) → artifacts → issues → facts (sorted) → summary. 
@@ -49,6 +51,8 @@ func WriteReport(w *Writer, collectorIDs []string, resultsMap map[string]Manifes ri := resultsMap[cid] for _, a := range ri.Artifacts { + exitCode := a.ExitCode + durationMS := a.DurationMS if err := enc.Encode(ReportRecord{ SchemaVersion: reportSchemaVersion, Type: "artifact", @@ -56,10 +60,10 @@ func WriteReport(w *Writer, collectorIDs []string, resultsMap map[string]Manifes Collector: cid, Path: a.Path, Command: a.Command, - ExitCode: a.ExitCode, + ExitCode: &exitCode, Status: a.Status, Tags: a.Tags, - DurationMS: a.DurationMS, + DurationMS: &durationMS, }); err != nil { return err } @@ -80,33 +84,26 @@ func WriteReport(w *Writer, collectorIDs []string, resultsMap map[string]Manifes } } - factKeys := make([]string, 0, len(ri.Facts)) - for k := range ri.Facts { + typedFacts := ConvertFacts(ri.Facts) + factKeys := make([]string, 0, len(typedFacts)) + for k := range typedFacts { factKeys = append(factKeys, k) } sort.Strings(factKeys) for _, k := range factKeys { - v := ri.Facts[k] - var typedVal any = v - if integerFactKeys[k] { - if v == "unavailable" || v == "" { - typedVal = nil - } else if n, err := strconv.ParseInt(v, 10, 64); err == nil { - typedVal = n - } - } if err := enc.Encode(ReportRecord{ SchemaVersion: reportSchemaVersion, Type: "fact", Timestamp: ts, Collector: cid, Key: k, - Value: typedVal, + Value: typedFacts[k], }); err != nil { return err } } + durationMS := ri.DurationMS if err := enc.Encode(ReportRecord{ SchemaVersion: reportSchemaVersion, Type: "collector_summary", @@ -114,7 +111,7 @@ func WriteReport(w *Writer, collectorIDs []string, resultsMap map[string]Manifes Collector: cid, Status: ri.Status, ArtifactCount: len(ri.Artifacts), - DurationMS: ri.DurationMS, + DurationMS: &durationMS, }); err != nil { return err } diff --git a/customers/vm-troubleshooting/internal/output/report_test.go b/customers/vm-troubleshooting/internal/output/report_test.go index 71f29b8..736c2e9 100644 --- 
a/customers/vm-troubleshooting/internal/output/report_test.go +++ b/customers/vm-troubleshooting/internal/output/report_test.go @@ -28,7 +28,7 @@ func TestWriteReportProducesValidNDJSONInDeterministicOrder(t *testing.T) { ), "network": BuildManifestInput( "ok", - 20, + 0, []ManifestArtifactInput{{Path: "network/ip_addr.txt", Type: "probe", Status: "ok", ParserHint: "netlink", Tags: []string{"network"}}}, map[string]string{"hostname": "node-1"}, nil, @@ -81,8 +81,17 @@ func TestWriteReportProducesValidNDJSONInDeterministicOrder(t *testing.T) { if records[0]["collector"] != "system" || records[0]["type"] != "artifact" { t.Fatalf("unexpected first record ordering: %#v", records[0]) } + if _, ok := records[0]["exit_code"]; !ok { + t.Fatalf("artifact record should include explicit exit_code, got %#v", records[0]) + } + if _, ok := records[0]["duration_ms"]; !ok { + t.Fatalf("artifact record should include explicit duration_ms, got %#v", records[0]) + } last := records[len(records)-1] if last["collector"] != "network" || last["type"] != "collector_summary" { t.Fatalf("unexpected last record ordering: %#v", last) } + if _, ok := last["duration_ms"]; !ok { + t.Fatalf("collector_summary should include explicit duration_ms, got %#v", last) + } } diff --git a/customers/vm-troubleshooting/internal/output/summary.go b/customers/vm-troubleshooting/internal/output/summary.go index 61897ab..66d4617 100644 --- a/customers/vm-troubleshooting/internal/output/summary.go +++ b/customers/vm-troubleshooting/internal/output/summary.go @@ -21,13 +21,13 @@ type SummaryIssue struct { } type SummaryResult struct { - Name string - Issues []SummaryIssue - Facts map[string]string - Artifacts []string - Skipped []string - Errors []string - Duration time.Duration + Name string + Issues []SummaryIssue + Facts map[string]string + Artifacts []string + SkipDetails []string // human-readable skip details for SUMMARY.txt display + ErrorMessages []string // human-readable error messages for SUMMARY.txt 
display + Duration time.Duration } type CollectorMeta struct { @@ -153,7 +153,7 @@ func WriteSummary(w *Writer, hostname, version string, results []SummaryResult) } b.WriteString("Collectors:\n") for _, r := range results { - b.WriteString(fmt.Sprintf("- %s: artifacts=%d skipped=%d errors=%d duration=%s\n", r.Name, len(r.Artifacts), len(r.Skipped), len(r.Errors), r.Duration.Round(time.Millisecond))) + b.WriteString(fmt.Sprintf("- %s: artifacts=%d skipped=%d errors=%d duration=%s\n", r.Name, len(r.Artifacts), len(r.SkipDetails), len(r.ErrorMessages), r.Duration.Round(time.Millisecond))) } b.WriteString("\nKey Facts:\n") for _, r := range results { diff --git a/customers/vm-troubleshooting/internal/platform/dcgm.go b/customers/vm-troubleshooting/internal/platform/dcgm.go index db448a4..633e00a 100644 --- a/customers/vm-troubleshooting/internal/platform/dcgm.go +++ b/customers/vm-troubleshooting/internal/platform/dcgm.go @@ -5,11 +5,23 @@ import ( "strings" "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/probe" ) func DetectDCGM(ctx context.Context, exec executor.Executor, distro DistroInfo) bool { if exec.CommandExists("dcgmi") { - return true + // Verify dcgmi can actually talk to the driver with a lightweight probe. + // A broken install (library mismatch, corrupted package) will fail quickly. 
+ result, _, _ := exec.Capture(ctx, executor.CommandSpec{ + Name: "dcgmi", + Args: []string{"discovery", "-l"}, + NeedsRoot: true, + Timeout: detectTimeout, + }, 64*1024) + if result.Err == nil && !result.Skipped { + return true + } + // dcgmi exists but can't connect — fall through to package detection } switch distro.Family { case "debian": @@ -28,18 +40,24 @@ func DetectDCGM(ctx context.Context, exec executor.Executor, distro DistroInfo) return false } -func IsDCGMSupportedOS(d DistroInfo) bool { - if d.ID == "ubuntu" && (d.Version == "22.04" || d.Version == "24.04") { - return true - } - if d.ID == "debian" && d.Version == "12" { - return true - } - if d.Family == "rhel" { - return true +// IsDCGMDaemonActive checks whether the nvidia-dcgm (or dcgm) systemd service +// is currently active via D-Bus. Returns true optimistically when D-Bus is +// unavailable so that dcgmi gets a chance to connect on its own. +func IsDCGMDaemonActive(ctx context.Context) bool { + statuses := probe.BatchServiceStatus(ctx, []string{"nvidia-dcgm", "dcgm"}) + if statuses == nil { + return true // D-Bus unavailable — let dcgmi try and fail gracefully } - if d.Family == "suse" { - return true + for _, name := range []string{"nvidia-dcgm", "dcgm"} { + if info, ok := statuses[name]; ok && info.ActiveState == "active" { + return true + } } return false } + +// IsDCGMSupportedOS returns true for OS versions the DCGM installer can actually handle. +// This must stay aligned with install.distroSlug — only advertise what the installer supports. 
+func IsDCGMSupportedOS(d DistroInfo) bool { + return d.ID == "ubuntu" && (d.Version == "22.04" || d.Version == "24.04") +} diff --git a/customers/vm-troubleshooting/internal/platform/nvidia.go b/customers/vm-troubleshooting/internal/platform/nvidia.go index ed44dbf..173872f 100644 --- a/customers/vm-troubleshooting/internal/platform/nvidia.go +++ b/customers/vm-troubleshooting/internal/platform/nvidia.go @@ -21,7 +21,20 @@ func DetectNvidiaGPU(ctx context.Context, exec executor.Executor) bool { } if exec.CommandExists("lspci") { _, stdout, _ := exec.Capture(ctx, executor.CommandSpec{Name: "lspci", Timeout: detectTimeout}, 1024*1024) - return strings.Contains(strings.ToLower(string(stdout)), "nvidia") + // Only match NVIDIA devices in GPU-related PCI classes (3D controller, + // VGA compatible, Display controller) to avoid false positives from + // NVIDIA network adapters and other non-GPU devices. + for _, line := range strings.Split(string(stdout), "\n") { + lower := strings.ToLower(line) + if !strings.Contains(lower, "nvidia") { + continue + } + if strings.Contains(lower, "3d controller") || + strings.Contains(lower, "vga compatible") || + strings.Contains(lower, "display controller") { + return true + } + } } return false } diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index b0e42e3..6f12c8c 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -90,6 +90,16 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { if err := install.PromptAndInstallDCGM(ctx, r.Exec, r.UI, distro); err != nil { r.UI.Warn(err.Error()) } + hasDCGM = platform.DetectDCGM(ctx, r.Exec, distro) + } + + // DCGM daemon check — installed but service not running + dcgmDaemonRunning := !hasDCGM // if not installed, flag is irrelevant (collector skips on missing binary) + if hasDCGM { + dcgmDaemonRunning = 
platform.IsDCGMDaemonActive(ctx) + if !dcgmDaemonRunning { + dcgmDaemonRunning = install.EnableDCGMDaemon(ctx, r.Exec, r.UI) + } } // Validate output directory @@ -108,7 +118,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { writer := output.NewWriter(workDir) // Reserve framework-owned output paths before collectors run - for _, reserved := range []string{"metadata.json", "manifest.json", "report.ndjson", "SUMMARY.txt", "transfer_commands.txt", "schemas/manifest.schema.json", "schemas/report-record.schema.json"} { + for _, reserved := range []string{"metadata.json", "manifest.json", "report.ndjson", "SUMMARY.txt", "transfer_commands.txt", "schemas/manifest.schema.json", "schemas/report-record.schema.json", "schemas/triage-result.schema.json"} { _ = writer.ReservePath(reserved) } @@ -116,7 +126,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { registry.Register(collector.NewSystemCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewNetworkCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewNvidiaCollector(r.Exec, writer, r.UI, hasGPU)) - registry.Register(collector.NewDCGMCollector(r.Exec, writer, r.UI, r.Config.EnableActiveGPUDiag)) + registry.Register(collector.NewDCGMCollector(r.Exec, writer, r.UI, r.Config.EnableActiveGPUDiag, dcgmDaemonRunning)) registry.Register(collector.NewDockerCollector(r.Exec, writer, r.UI, r.Config.IncludeContainerLogs)) registry.Register(collector.NewServicesCollector(r.Exec, writer, r.UI, distro)) registry.Register(collector.NewJournalCollector(r.Exec, writer, r.UI, r.Config.JournalSince, r.Config.IncludeFullJournal)) @@ -138,7 +148,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { // Run triage analyzers on collected artifacts (before summary so findings appear in SUMMARY.txt) r.UI.Section("Analyzing findings") - triageResults, triageErr := triage.RunAllAnalyzers(ctx, workDir, writer, r.UI) + triageResults, triageErr := triage.RunAllAnalyzers(ctx, 
workDir, archiveName, writer, r.UI) if triageErr != nil { r.UI.Warn("Triage analysis failed: " + triageErr.Error()) } @@ -203,14 +213,23 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } partial := false for _, res := range results { + status := collectorStatus(res) + if status == "partial" || status == "failed" { + partial = true + } + sr := output.SummaryResult{ Name: res.Name, Facts: res.Facts, Artifacts: collector.ArtifactPaths(res.Artifacts), - Skipped: res.Skipped, - Errors: res.Errors, Duration: res.Duration, } + for _, s := range res.Skipped { + sr.SkipDetails = append(sr.SkipDetails, s.Detail) + } + for _, e := range res.Errors { + sr.ErrorMessages = append(sr.ErrorMessages, e.Message) + } for _, issue := range res.Issues { if !issue.Severity.Valid() { r.UI.Warn(fmt.Sprintf("collector %s: issue with invalid severity skipped: %s", res.ID, issue.Message)) @@ -225,13 +244,6 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } summaryResults = append(summaryResults, sr) - status := "ok" - if len(res.Errors) > 0 { - status = "partial" - partial = true - } else if len(res.Artifacts) == 0 && len(res.Skipped) > 0 { - status = "skipped" - } meta.Collectors[res.ID] = output.CollectorMeta{ Status: status, DurationMS: res.Duration.Milliseconds(), @@ -259,12 +271,8 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { platformOS, platformKernel := "", "" for _, res := range results { collectorIDs = append(collectorIDs, res.ID) - status := "ok" - if len(res.Errors) > 0 { - status = "partial" - } else if len(res.Artifacts) == 0 && len(res.Skipped) > 0 { - status = "skipped" - } + status := collectorStatus(res) + artifacts := make([]output.ManifestArtifactInput, 0, len(res.Artifacts)) for _, a := range res.Artifacts { artifacts = append(artifacts, output.ManifestArtifactInput{ @@ -286,10 +294,22 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { Hidden: issue.Hidden, }) } + skipped := 
make([]output.ManifestSkipReasonInput, 0, len(res.Skipped)) + for _, s := range res.Skipped { + skipped = append(skipped, output.ManifestSkipReasonInput{ + Reason: string(s.Reason), Detail: s.Detail, ArtifactPath: s.ArtifactPath, + }) + } + errors := make([]output.ManifestStructuredErrorInput, 0, len(res.Errors)) + for _, e := range res.Errors { + errors = append(errors, output.ManifestStructuredErrorInput{ + Code: string(e.Code), Message: e.Message, ArtifactPath: e.ArtifactPath, + }) + } resultsMap[res.ID] = output.ManifestResultInput{ Status: status, DurationMS: res.Duration.Milliseconds(), Artifacts: artifacts, Facts: res.Facts, Issues: issues, - Skipped: res.Skipped, Errors: res.Errors, + Skipped: skipped, Errors: errors, } if res.ID == "system" { platformOS = res.Facts["os"] @@ -298,7 +318,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } manifestMeta := output.ManifestMeta{ - SchemaVersion: "1.1.0", + SchemaVersion: "2.0.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: archiveName, Version: config.Version, @@ -369,6 +389,21 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { return &RunResult{ArchivePath: archivePath, WorkDir: workDir, ExitCode: exitCode}, nil } +// collectorStatus derives the terminal status from a collector result. +// ok: no errors. partial: some artifacts produced, some errors. failed: errors, zero artifacts. skipped: disabled or not applicable. 
+func collectorStatus(res *collector.CollectorResult) string { + if len(res.Errors) > 0 { + if len(res.Artifacts) > 0 { + return "partial" + } + return "failed" + } + if len(res.Artifacts) == 0 && len(res.Skipped) > 0 { + return "skipped" + } + return "ok" +} + func uiAllowedInstall(u ui.UI, cfg *config.Config) bool { return u.IsInteractive() && !cfg.NonInteractive } diff --git a/customers/vm-troubleshooting/internal/transfer/commands.go b/customers/vm-troubleshooting/internal/transfer/commands.go index 4773d70..e8355ae 100644 --- a/customers/vm-troubleshooting/internal/transfer/commands.go +++ b/customers/vm-troubleshooting/internal/transfer/commands.go @@ -1,6 +1,7 @@ package transfer import ( + "cmp" "context" "fmt" "io" @@ -8,7 +9,7 @@ import ( "net/http" "os" "os/user" - "sort" + "slices" "strings" "time" @@ -72,19 +73,43 @@ func DiscoverIPs() []IPInfo { }) } } - // Sort: default-route first, then public before private, alphabetical within group - sort.Slice(ips, func(i, j int) bool { - if ips[i].HasDefaultRoute != ips[j].HasDefaultRoute { - return ips[i].HasDefaultRoute - } - if ips[i].IsPrivate != ips[j].IsPrivate { - return !ips[i].IsPrivate // public first - } - return ips[i].Address < ips[j].Address - }) + // Sort: default-route first, then public before private, alphabetical within group. + // Total order: Address+Interface is the tiebreaker so no ties are possible. + slices.SortStableFunc(ips, compareIPs) return ips } +// compareIPs defines a total order for IP entries: default-route first, +// then public before private, then alphabetical by address, then by interface name. 
+func compareIPs(a, b IPInfo) int { + return cmp.Or( + boolDesc(a.HasDefaultRoute, b.HasDefaultRoute), // default-route first + boolAsc(a.IsPrivate, b.IsPrivate), // public (false) before private (true) + cmp.Compare(a.Address, b.Address), // address tiebreaker + cmp.Compare(a.Interface, b.Interface), // interface tiebreaker (same IP on multiple ifaces) + ) +} + +func boolDesc(a, b bool) int { + if a == b { + return 0 + } + if a { + return -1 + } + return 1 +} + +func boolAsc(a, b bool) int { + if a == b { + return 0 + } + if a { + return 1 + } + return -1 +} + // DetectUsername returns the likely SSH username. // Prefers SUDO_USER, falls back to os/user.Current(), then "user". func DetectUsername() string { @@ -182,15 +207,7 @@ func GenerateCandidates(hostname, username string, ips []IPInfo) string { // Sort display: default-route first, then public, then private sorted := make([]IPInfo, len(ips)) copy(sorted, ips) - sort.Slice(sorted, func(i, j int) bool { - if sorted[i].HasDefaultRoute != sorted[j].HasDefaultRoute { - return sorted[i].HasDefaultRoute - } - if sorted[i].IsPrivate != sorted[j].IsPrivate { - return !sorted[i].IsPrivate - } - return sorted[i].Address < sorted[j].Address - }) + slices.SortStableFunc(sorted, compareIPs) b.WriteString(fmt.Sprintf("Hostname: %s\n", hostname)) if len(sorted) > 0 { diff --git a/customers/vm-troubleshooting/internal/triage/artifact_state_test.go b/customers/vm-troubleshooting/internal/triage/artifact_state_test.go new file mode 100644 index 0000000..34db6d5 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/artifact_state_test.go @@ -0,0 +1,68 @@ +package triage + +import ( + "os" + "path/filepath" + "testing" +) + +func TestCheckArtifact_HeaderedSkippedSentinel(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 0o755); err != nil { + t.Fatal(err) + } + content := "# Command: dmesg\n# Timestamp: 2026-01-01T00:00:00Z\n---\n[SKIPPED - requires root 
privileges]\nextra context line\n" + if err := os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + state, body := checkArtifact(workDir, "logs/dmesg.txt") + if state != ArtifactSkipped { + t.Fatalf("expected ArtifactSkipped, got %v", state) + } + if body != "" { + t.Fatalf("expected empty body for skipped artifact, got %q", body) + } +} + +func TestCheckArtifact_EmbeddedSkippedTextIsUsable(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 0o755); err != nil { + t.Fatal(err) + } + content := "# Command: dmesg\n---\nnormal first payload line\nsome log says [SKIPPED - sample text]\n" + if err := os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + state, body := checkArtifact(workDir, "logs/dmesg.txt") + if state != ArtifactUsable { + t.Fatalf("expected ArtifactUsable, got %v", state) + } + if body == "" { + t.Fatal("expected non-empty body for usable artifact") + } +} + +func TestCheckArtifact_PlainHashLineIsUsable(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 0o755); err != nil { + t.Fatal(err) + } + // No header separator: this is plain payload content. 
+ content := "# plain log line\n" + if err := os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + state, body := checkArtifact(workDir, "logs/dmesg.txt") + if state != ArtifactUsable { + t.Fatalf("expected ArtifactUsable, got %v", state) + } + if body == "" { + t.Fatal("expected non-empty body for usable artifact") + } +} diff --git a/customers/vm-troubleshooting/internal/triage/critical.go b/customers/vm-troubleshooting/internal/triage/critical.go index 2eeafdf..992cc45 100644 --- a/customers/vm-troubleshooting/internal/triage/critical.go +++ b/customers/vm-troubleshooting/internal/triage/critical.go @@ -1,10 +1,12 @@ package triage import ( + "cmp" "context" "fmt" "regexp" - "sort" + "slices" + "strconv" "strings" "github.com/NexGenCloud/vm-diagnostics/internal/collector" @@ -20,9 +22,9 @@ type CriticalPattern struct { } // criticalPatterns are high-confidence patterns applied to all log sources. +// NOTE: Xid/SXid is owned by triage/xid.go; OOM is owned by collector/journal.go. +// Do not add patterns here that duplicate those owners. 
var criticalPatterns = []CriticalPattern{ - {"OOM Kill", regexp.MustCompile(`(?i)(out of memory|oom.kill|invoked oom-killer|killed process)`), collector.SeverityCritical, "MEM", false}, - {"Xid/SXid", regexp.MustCompile(`(?i)(NVRM:\s*(S?Xid))`), collector.SeverityCritical, "GPU", false}, {"Kernel Panic", regexp.MustCompile(`(?i)(kernel panic|BUG:|call trace)`), collector.SeverityCritical, "KERN", false}, {"Hardware Error", regexp.MustCompile(`(?i)(hardware error|machine check|mce:)`), collector.SeverityCritical, "HW", false}, {"Fallen Off Bus", regexp.MustCompile(`(?i)fallen off the bus`), collector.SeverityCritical, "GPU", false}, @@ -60,6 +62,36 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro {"logs/journal_errors.txt", true}, } + // Single-pass artifact check: classify each source once and cache the + // content for usable sources. This avoids re-reading files multiple times. + type checkedSource struct { + sourceSpec + state ArtifactState + content string + } + checked := make([]checkedSource, len(sources)) + anySkipped := false + anyUsable := false + for i, src := range sources { + state, content := checkArtifact(workDir, src.path) + checked[i] = checkedSource{sourceSpec: src, state: state, content: content} + if state == ArtifactSkipped { + anySkipped = true + } + if state == ArtifactUsable { + anyUsable = true + } + } + if !anySkipped && !anyUsable { + return nil, nil // all missing + } + if !anyUsable { + return &TriageResult{ + Name: "critical_events", + Facts: map[string]string{"critical_scan_status": "unavailable"}, + }, nil + } + // Dedup by (pattern_name, line_hash) to avoid counting the same line from multiple sources type dedupKey struct { pattern string @@ -68,11 +100,11 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro seen := make(map[dedupKey]*criticalEvent) var eventOrder []dedupKey // maintain insertion order - for _, src := range sources { - content := readArtifact(workDir, 
src.path) - if content == "" { + for _, src := range checked { + if src.state != ArtifactUsable { continue } + content := src.content patterns := criticalPatterns if src.lowConfidence { @@ -120,28 +152,31 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro for _, k := range eventOrder { events = append(events, seen[k]) } - sort.Slice(events, func(i, j int) bool { - if events[i].severity != events[j].severity { - return events[i].severity > events[j].severity - } - return events[i].count > events[j].count + // Sort: total order so output is deterministic regardless of input order. + // Tiebreakers extend to the dedup key (pattern + line) so no ties are possible. + slices.SortStableFunc(events, func(a, b *criticalEvent) int { + return cmp.Or( + cmp.Compare(b.severity, a.severity), // severity DESC + cmp.Compare(b.count, a.count), // count DESC + cmp.Compare(a.pattern, b.pattern), // pattern ASC (tiebreaker) + cmp.Compare(a.source, b.source), // source ASC (tiebreaker) + cmp.Compare(a.line, b.line), // line ASC (dedup key) + ) }) - // Cap at maxEvents - if len(events) > maxEvents { + // Cap at maxEvents (fact reports true total, findings are capped) + totalCount := len(events) + capped := totalCount > maxEvents + if capped { events = events[:maxEvents] } // Build findings and text var findings []Finding var textLines []string - textLines = append(textLines, fmt.Sprintf("Critical Log Analysis: %d event(s)\n", len(events))) + textLines = append(textLines, fmt.Sprintf("Critical Log Analysis: %d event(s)\n", totalCount)) - // Group by category for summary - catCounts := make(map[string]int) for _, ev := range events { - catCounts[ev.category]++ - // Truncate long lines for readability line := ev.line if len(line) > 200 { @@ -160,11 +195,15 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro textLines = append(textLines, fmt.Sprintf(" [%s] %s (%dx): %s", ev.severity.String(), ev.pattern, ev.count, line)) } + 
if capped { + textLines = append(textLines, fmt.Sprintf("\n ... %d additional event(s) not shown (capped at %d)", totalCount-maxEvents, maxEvents)) + } + return &TriageResult{ Name: "critical_events", Findings: findings, Facts: map[string]string{ - "critical_event_count": itoa(len(events)), + "critical_event_count": strconv.Itoa(totalCount), }, Text: strings.Join(textLines, "\n") + "\n", }, nil diff --git a/customers/vm-troubleshooting/internal/triage/critical_test.go b/customers/vm-troubleshooting/internal/triage/critical_test.go index 28b23b6..fc58538 100644 --- a/customers/vm-troubleshooting/internal/triage/critical_test.go +++ b/customers/vm-troubleshooting/internal/triage/critical_test.go @@ -4,13 +4,15 @@ import ( "context" "os" "path/filepath" + "strconv" "strings" "testing" "github.com/NexGenCloud/vm-diagnostics/internal/collector" ) -func TestAnalyzeCriticalLogs_OOMKill(t *testing.T) { +func TestAnalyzeCriticalLogs_OOMNotMatched(t *testing.T) { + // OOM is owned by collector/journal.go, not critical.go. 
t.Parallel() workDir := t.TempDir() os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) @@ -22,18 +24,11 @@ func TestAnalyzeCriticalLogs_OOMKill(t *testing.T) { if err != nil { t.Fatal(err) } - if len(tr.Findings) == 0 { - t.Fatal("expected OOM finding") - } - found := false for _, f := range tr.Findings { - if f.Title == "OOM Kill" && f.Severity == collector.SeverityCritical && !f.Hidden { - found = true + if f.Title == "OOM Kill" { + t.Error("OOM should not be matched by critical.go (owned by journal.go)") } } - if !found { - t.Error("expected visible CRITICAL OOM Kill finding") - } } func TestAnalyzeCriticalLogs_LowConfidenceHidden(t *testing.T) { @@ -92,8 +87,8 @@ func TestAnalyzeCriticalLogs_Dedup(t *testing.T) { t.Parallel() workDir := t.TempDir() os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) - // Same OOM line in both dmesg and journal_kernel — should dedup - line := "[12345.0] Out of memory: Killed process 1234 (python3)\n" + // Same kernel panic line in both dmesg and journal_kernel — should dedup + line := "[12345.0] kernel panic - not syncing: Fatal exception\n" os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("---\n"+line), 0o644) os.WriteFile(filepath.Join(workDir, "logs/journal_kernel.txt"), []byte("---\n"+line), 0o644) @@ -101,30 +96,99 @@ func TestAnalyzeCriticalLogs_Dedup(t *testing.T) { if err != nil { t.Fatal(err) } - oomCount := 0 + panicCount := 0 for _, f := range tr.Findings { - if f.Title == "OOM Kill" { - oomCount++ + if f.Title == "Kernel Panic" { + panicCount++ } } - if oomCount != 1 { - t.Errorf("expected 1 deduplicated OOM finding, got %d", oomCount) + if panicCount != 1 { + t.Errorf("expected 1 deduplicated Kernel Panic finding, got %d", panicCount) } } func TestAnalyzeCriticalLogs_MissingFiles(t *testing.T) { t.Parallel() workDir := t.TempDir() - // No logs/ directory at all + // No logs/ directory at all — should return nil, nil + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + 
t.Fatal(err) + } + if tr != nil { + t.Fatal("expected nil result when all source artifacts are missing") + } +} + +func TestAnalyzeCriticalLogs_SkippedArtifacts(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + // All sources are skipped + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("[SKIPPED]\n"), 0o644) + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) if err != nil { t.Fatal(err) } if tr == nil { - t.Fatal("should return a result even with no files") + t.Fatal("expected non-nil result for skipped artifacts") + } + if tr.Facts["critical_scan_status"] != "unavailable" { + t.Errorf("expected critical_scan_status=unavailable, got %q", tr.Facts["critical_scan_status"]) } if len(tr.Findings) != 0 { - t.Errorf("expected 0 findings with no files, got %d", len(tr.Findings)) + t.Error("skipped artifacts should produce no findings") + } +} + +func TestAnalyzeCriticalLogs_IgnoresSkippedSourceContentWhenOtherSourceUsable(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + + // Usable source with no critical patterns. + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte( + "# Command: dmesg\n---\n[1.0] benign startup line\n", + ), 0o644) + + // Skipped source includes "failed" text that should not be parsed as evidence. 
+ os.WriteFile(filepath.Join(workDir, "logs/journal_errors.txt"), []byte( + "# Command: journalctl\n---\n[SKIPPED permission_or_access] failed to read journal\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil { + t.Fatal("expected non-nil result") + } + if len(tr.Findings) != 0 { + t.Fatalf("expected no findings, got %d", len(tr.Findings)) + } + if tr.Facts["critical_event_count"] != "0" { + t.Fatalf("expected critical_event_count=0, got %q", tr.Facts["critical_event_count"]) + } +} + +func TestAnalyzeCriticalLogs_XidNotMatched(t *testing.T) { + // Xid is owned by triage/xid.go, not critical.go. + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte( + "# Command: dmesg\n---\n[1000.0] NVRM: Xid (PCI:0000:3b:00): 79, pid=1234\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + for _, f := range tr.Findings { + if f.Title == "Xid/SXid" { + t.Error("Xid should not be matched by critical.go (owned by xid.go)") + } } } @@ -136,7 +200,7 @@ func TestAnalyzeCriticalLogs_Cap(t *testing.T) { var lines strings.Builder lines.WriteString("---\n") for i := 0; i < 200; i++ { - lines.WriteString("[1.0] operation timed out on device " + itoa(i) + "\n") + lines.WriteString("[1.0] operation timed out on device " + strconv.Itoa(i) + "\n") } os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte(lines.String()), 0o644) diff --git a/customers/vm-troubleshooting/internal/triage/determinism_test.go b/customers/vm-troubleshooting/internal/triage/determinism_test.go new file mode 100644 index 0000000..b0ae59b --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/determinism_test.go @@ -0,0 +1,109 @@ +package triage + +import ( + "context" + "os" + "path/filepath" + "testing" + + 
"github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +// TestTriageDeterminism_IdenticalInputProducesIdenticalOutput runs the full +// triage pipeline twice on identical input and verifies the output files +// are byte-for-byte identical. +func TestTriageDeterminism_IdenticalInputProducesIdenticalOutput(t *testing.T) { + t.Parallel() + + makeWorkDir := func(t *testing.T) string { + t.Helper() + dir := t.TempDir() + os.MkdirAll(filepath.Join(dir, "logs"), 0o755) + os.MkdirAll(filepath.Join(dir, "network"), 0o755) + + // dmesg with multiple Xids at same severity to test tiebreakers + os.WriteFile(filepath.Join(dir, "logs/dmesg.txt"), []byte( + "# Command: dmesg\n---\n"+ + "[1.0] NVRM: Xid (PCI:0000:3b:00): 79, pid=1234\n"+ + "[2.0] NVRM: Xid (PCI:0000:86:00): 79, pid=5678\n"+ + "[3.0] NVRM: Xid (PCI:0000:3b:00): 48, pid=1234\n"+ + "[4.0] NVRM: Xid (PCI:0000:86:00): 48, pid=5678\n", + ), 0o644) + + // iptables + os.WriteFile(filepath.Join(dir, "network/iptables.txt"), []byte( + "Chain INPUT (policy DROP 0 packets, 0 bytes)\n"+ + "Chain FORWARD (policy DROP 0 packets, 0 bytes)\n"+ + "Chain OUTPUT (policy ACCEPT 0 packets, 0 bytes)\n", + ), 0o644) + + // journal with multiple error patterns at same severity + os.WriteFile(filepath.Join(dir, "logs/journal_errors.txt"), []byte( + "# Command: journalctl\n---\n"+ + "systemd[1]: Failed to start aaa.service\n"+ + "systemd[1]: Failed to start zzz.service\n", + ), 0o644) + + return dir + } + + // Run 1 + dir1 := makeWorkDir(t) + w1 := output.NewWriter(dir1) + results1, err := RunAllAnalyzers(context.Background(), dir1, "test-archive", w1, ui.NoopUI{}) + if err != nil { + t.Fatalf("run 1 failed: %v", err) + } + + // Run 2 + dir2 := makeWorkDir(t) + w2 := output.NewWriter(dir2) + results2, err := RunAllAnalyzers(context.Background(), dir2, "test-archive", w2, ui.NoopUI{}) + if err != nil { + t.Fatalf("run 2 failed: %v", err) + } + + // Compare triage output files + if 
len(results1) != len(results2) { + t.Fatalf("different result counts: %d vs %d", len(results1), len(results2)) + } + + for i, r1 := range results1 { + r2 := results2[i] + if r1.Name != r2.Name { + t.Errorf("result %d: name mismatch: %q vs %q", i, r1.Name, r2.Name) + } + if len(r1.Findings) != len(r2.Findings) { + t.Errorf("result %d (%s): finding count mismatch: %d vs %d", i, r1.Name, len(r1.Findings), len(r2.Findings)) + continue + } + for j, f1 := range r1.Findings { + f2 := r2.Findings[j] + if f1.Title != f2.Title || f1.Severity != f2.Severity || f1.Category != f2.Category { + t.Errorf("result %d finding %d: mismatch:\n run1: %s/%s/%s\n run2: %s/%s/%s", + i, j, f1.Severity, f1.Category, f1.Title, f2.Severity, f2.Category, f2.Title) + } + } + } + + // Compare the actual JSON files byte-for-byte + for _, name := range []string{ + "triage/_data/gpu_health.json", + "triage/_data/firewall_posture.json", + "triage/_data/critical_events.json", + } { + data1, err1 := os.ReadFile(filepath.Join(dir1, name)) + data2, err2 := os.ReadFile(filepath.Join(dir2, name)) + if err1 != nil && err2 != nil { + continue // both missing is fine + } + if (err1 != nil) != (err2 != nil) { + t.Errorf("file %s: exists in one run but not the other", name) + continue + } + if string(data1) != string(data2) { + t.Errorf("file %s: not byte-identical across runs\nrun1:\n%s\nrun2:\n%s", name, data1, data2) + } + } +} diff --git a/customers/vm-troubleshooting/internal/triage/firewall.go b/customers/vm-troubleshooting/internal/triage/firewall.go index e52a0b3..f10e587 100644 --- a/customers/vm-troubleshooting/internal/triage/firewall.go +++ b/customers/vm-troubleshooting/internal/triage/firewall.go @@ -159,14 +159,53 @@ func parseFirewalldPosture(content string) FirewallPosture { // AnalyzeFirewall reads collected firewall artifacts and classifies the overall posture. 
func AnalyzeFirewall(_ context.Context, workDir string) (*TriageResult, error) { - iptables := readArtifact(workDir, "network/iptables.txt") - ufw := readArtifact(workDir, "network/ufw_status.txt") - nft := readArtifact(workDir, "network/nftables.txt") - fwd := readArtifact(workDir, "network/firewalld_zones.txt") + type sourceCheck struct { + path string + state ArtifactState + content string + } + + fwSources := []sourceCheck{ + {path: "network/iptables.txt"}, + {path: "network/ufw_status.txt"}, + {path: "network/nftables.txt"}, + {path: "network/firewalld_zones.txt"}, + } - if iptables == "" && ufw == "" && nft == "" && fwd == "" { + // Single-pass artifact checks with cached usable content. + allMissing := true + anyUsable := false + for i := range fwSources { + state, content := checkArtifact(workDir, fwSources[i].path) + fwSources[i].state = state + fwSources[i].content = content + if state != ArtifactMissing { + allMissing = false + } + if state == ArtifactUsable { + anyUsable = true + } + } + if allMissing { return nil, nil // no firewall artifacts } + if !anyUsable { + // All present sources are skipped/unavailable — don't emit a noisy "unknown" finding + return &TriageResult{ + Name: "firewall_posture", + Facts: map[string]string{"firewall_scan_status": "unavailable"}, + }, nil + } + artifactContent := make(map[string]string, len(fwSources)) + for _, src := range fwSources { + if src.state == ArtifactUsable { + artifactContent[src.path] = src.content + } + } + iptables := artifactContent["network/iptables.txt"] + ufw := artifactContent["network/ufw_status.txt"] + nft := artifactContent["network/nftables.txt"] + fwd := artifactContent["network/firewalld_zones.txt"] // Determine posture from each source, pick the most authoritative. 
// Precedence: UFW → firewalld → nftables → iptables diff --git a/customers/vm-troubleshooting/internal/triage/integration_test.go b/customers/vm-troubleshooting/internal/triage/integration_test.go index e4f2c2e..40b3dc9 100644 --- a/customers/vm-troubleshooting/internal/triage/integration_test.go +++ b/customers/vm-troubleshooting/internal/triage/integration_test.go @@ -2,6 +2,7 @@ package triage import ( "context" + "encoding/json" "os" "path/filepath" "strings" @@ -37,7 +38,7 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { os.WriteFile(filepath.Join(workDir, "logs/journal_errors.txt"), []byte("# Command: journalctl\n---\nsystemd[1]: Failed to start some.service\n"), 0o644) - results, err := RunAllAnalyzers(context.Background(), workDir, writer, ui.NoopUI{}) + results, err := RunAllAnalyzers(context.Background(), workDir, "vm-diagnostics-test", writer, ui.NoopUI{}) if err != nil { t.Fatalf("RunAllAnalyzers failed: %v", err) } @@ -158,4 +159,47 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { if len(triageResult.Artifacts) != 6 { t.Errorf("expected 6 registered artifacts, got %d", len(triageResult.Artifacts)) } + + // --- Verify triage JSON files have valid envelope structure --- + for _, name := range []string{ + "triage/_data/gpu_health.json", + "triage/_data/firewall_posture.json", + "triage/_data/critical_events.json", + } { + data, err := os.ReadFile(filepath.Join(workDir, name)) + if err != nil { + t.Errorf("reading %s: %v", name, err) + continue + } + var envelope struct { + Kind string `json:"kind"` + SchemaVersion string `json:"schema_version"` + ArchiveID string `json:"archive_id"` + Analyzer string `json:"analyzer"` + Findings json.RawMessage `json:"findings"` + } + if err := json.Unmarshal(data, &envelope); err != nil { + t.Errorf("%s: invalid JSON: %v", name, err) + continue + } + if envelope.Kind != "triage_result" { + t.Errorf("%s: kind=%q, want triage_result", name, envelope.Kind) + } + if envelope.SchemaVersion != triageSchemaVersion { + 
t.Errorf("%s: schema_version=%q, want %s", name, envelope.SchemaVersion, triageSchemaVersion) + } + if envelope.ArchiveID != "vm-diagnostics-test" { + t.Errorf("%s: archive_id=%q, want vm-diagnostics-test", name, envelope.ArchiveID) + } + if envelope.Analyzer == "" { + t.Errorf("%s: analyzer is empty", name) + } + // findings must be a JSON array (not null) + if string(envelope.Findings) == "null" { + t.Errorf("%s: findings is null, must be array", name) + } + if len(envelope.Findings) == 0 || envelope.Findings[0] != '[' { + t.Errorf("%s: findings is not a JSON array: %s", name, string(envelope.Findings[:min(len(envelope.Findings), 20)])) + } + } } diff --git a/customers/vm-troubleshooting/internal/triage/triage.go b/customers/vm-troubleshooting/internal/triage/triage.go index 92d54fe..7a00551 100644 --- a/customers/vm-troubleshooting/internal/triage/triage.go +++ b/customers/vm-troubleshooting/internal/triage/triage.go @@ -3,8 +3,11 @@ package triage import ( "context" "encoding/json" + "fmt" "os" "path/filepath" + "strconv" + "strings" "github.com/NexGenCloud/vm-diagnostics/internal/collector" "github.com/NexGenCloud/vm-diagnostics/internal/output" @@ -35,9 +38,12 @@ type TriageResult struct { // produces a triage result. It should be resilient to missing files. type Analyzer func(ctx context.Context, workDir string) (*TriageResult, error) +// triageSchemaVersion is the schema version emitted in triage result JSON files. +const triageSchemaVersion = "2.0.0" + // RunAllAnalyzers executes all registered analyzers with spinner feedback. // Missing artifacts are handled gracefully — analyzers skip what isn't there. 
-func RunAllAnalyzers(ctx context.Context, workDir string, writer *output.Writer, u ui.UI) ([]*TriageResult, error) { +func RunAllAnalyzers(ctx context.Context, workDir string, archiveID string, writer *output.Writer, u ui.UI) ([]*TriageResult, error) { analyzers := []struct { name string fn Analyzer @@ -53,7 +59,14 @@ func RunAllAnalyzers(ctx context.Context, workDir string, writer *output.Writer, break } sp := u.StartSpinner("Analyzing " + a.name + "...") - tr, err := a.fn(ctx, workDir) + tr, err := func() (tr *TriageResult, err error) { + defer func() { + if recovered := recover(); recovered != nil { + err = fmt.Errorf("panic: %v", recovered) + } + }() + return a.fn(ctx, workDir) + }() if err != nil { sp.Fail("Failed: " + a.name + ": " + err.Error()) continue @@ -79,10 +92,26 @@ func RunAllAnalyzers(ctx context.Context, workDir string, writer *output.Writer, // Write machine-readable JSON (reserve → write → track) if len(tr.Findings) > 0 || len(tr.Facts) > 0 { jsonPath := "triage/_data/" + tr.Name + ".json" + typedFacts := output.ConvertFacts(tr.Facts) + findings := tr.Findings + if findings == nil { + findings = []Finding{} // schema requires array, not null + } jsonData, jErr := json.MarshalIndent(struct { - Findings []Finding `json:"findings"` - Facts map[string]string `json:"facts,omitempty"` - }{tr.Findings, tr.Facts}, "", " ") + Kind string `json:"kind"` + SchemaVersion string `json:"schema_version"` + ArchiveID string `json:"archive_id"` + Analyzer string `json:"analyzer"` + Findings []Finding `json:"findings"` + Facts map[string]any `json:"facts,omitempty"` + }{ + Kind: "triage_result", + SchemaVersion: triageSchemaVersion, + ArchiveID: archiveID, + Analyzer: tr.Name, + Findings: findings, + Facts: typedFacts, + }, "", " ") if jErr == nil { if rErr := writer.ReservePath(jsonPath); rErr != nil { u.Warn("Failed to reserve " + jsonPath + ": " + rErr.Error()) @@ -96,29 +125,62 @@ func RunAllAnalyzers(ctx context.Context, workDir string, writer 
*output.Writer, } count := len(tr.Findings) - sp.Success(a.name + " (" + itoa(count) + " finding(s))") + sp.Success(a.name + " (" + strconv.Itoa(count) + " finding(s))") results = append(results, tr) } return results, nil } -// readArtifact reads a collected artifact file. Returns "" if the file doesn't exist. -func readArtifact(workDir, relPath string) string { - data, err := os.ReadFile(filepath.Join(workDir, relPath)) - if err != nil { +// ArtifactState describes the usability of a collected artifact for triage. +type ArtifactState int + +const ( + ArtifactMissing ArtifactState = iota // file does not exist + ArtifactSkipped // file exists but content is skipped/unavailable + ArtifactUsable // file exists with usable payload +) + +// firstPayloadLine returns the first non-empty payload line. +// It tolerates headered artifacts (# metadata + --- separator) and plain text. +func firstPayloadLine(content string) string { + trimmed := strings.TrimSpace(content) + if trimmed == "" { return "" } - return string(data) + lines := strings.Split(trimmed, "\n") + + // Headered artifacts use an explicit separator line. + start := 0 + for i, raw := range lines { + if strings.TrimSpace(raw) == "---" { + start = i + 1 + break + } + } + for _, raw := range lines[start:] { + line := strings.TrimSpace(raw) + if line != "" { + return line + } + } + return "" } -func itoa(n int) string { - if n == 0 { - return "0" +// checkArtifact reads an artifact and classifies its state. +func checkArtifact(workDir, relPath string) (ArtifactState, string) { + data, err := os.ReadFile(filepath.Join(workDir, relPath)) + if err != nil { + return ArtifactMissing, "" + } + content := string(data) + firstLine := firstPayloadLine(content) + if firstLine == "" { + return ArtifactMissing, "" } - s := "" - for n > 0 { - s = string(rune('0'+n%10)) + s - n /= 10 + // The executor writes a skip sentinel as the first payload line when a + // command was intentionally not run (e.g., missing root permissions). 
+ if strings.HasPrefix(firstLine, "[SKIPPED") { + return ArtifactSkipped, "" } - return s + return ArtifactUsable, content } diff --git a/customers/vm-troubleshooting/internal/triage/xid.go b/customers/vm-troubleshooting/internal/triage/xid.go index 6c5e336..7e944a4 100644 --- a/customers/vm-troubleshooting/internal/triage/xid.go +++ b/customers/vm-troubleshooting/internal/triage/xid.go @@ -1,10 +1,11 @@ package triage import ( + "cmp" "context" "fmt" "regexp" - "sort" + "slices" "strconv" "strings" @@ -177,12 +178,15 @@ func parseXidEvents(dmesg string) []XidEvent { events = append(events, ev) } - // Sort: severity descending (critical first), then count descending - sort.Slice(events, func(i, j int) bool { - if events[i].Severity != events[j].Severity { - return events[i].Severity > events[j].Severity - } - return events[i].Count > events[j].Count + // Sort: total order so output is deterministic regardless of map iteration order. + // Tiebreakers extend to the dedup key (BDF + Code) so no ties are possible. + slices.SortStableFunc(events, func(a, b XidEvent) int { + return cmp.Or( + cmp.Compare(b.Severity, a.Severity), // severity DESC + cmp.Compare(b.Count, a.Count), // count DESC + cmp.Compare(a.BDF, b.BDF), // BDF ASC (tiebreaker) + cmp.Compare(a.Code, b.Code), // code ASC (dedup key) + ) }) return events @@ -190,14 +194,33 @@ func parseXidEvents(dmesg string) []XidEvent { // AnalyzeXid reads dmesg artifacts and classifies Xid/SXid errors. 
func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { - dmesg := readArtifact(workDir, "logs/dmesg.txt") - if dmesg == "" { - dmesg = readArtifact(workDir, "nvidia/xid_errors.txt") + // Try primary source, then fallback + sources := []string{"logs/dmesg.txt", "nvidia/xid_errors.txt"} + var dmesg string + anySkipped := false + for _, src := range sources { + state, content := checkArtifact(workDir, src) + switch state { + case ArtifactUsable: + if dmesg == "" { + dmesg = content + } + case ArtifactSkipped: + anySkipped = true + } } if dmesg == "" { + if anySkipped { + return &TriageResult{ + Name: "gpu_health", + Facts: map[string]string{"xid_scan_status": "unavailable"}, + }, nil + } return nil, nil // no dmesg available } + const maxXidEvents = 100 + events := parseXidEvents(dmesg) if len(events) == 0 { return &TriageResult{ @@ -207,9 +230,15 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { }, nil } + totalCount := len(events) + capped := totalCount > maxXidEvents + if capped { + events = events[:maxXidEvents] + } + var findings []Finding var textLines []string - textLines = append(textLines, fmt.Sprintf("Xid/SXid Analysis: %d unique event(s)\n", len(events))) + textLines = append(textLines, fmt.Sprintf("Xid/SXid Analysis: %d unique event(s)\n", totalCount)) for _, ev := range events { prefix := "Xid" @@ -239,11 +268,15 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { } } + if capped { + textLines = append(textLines, fmt.Sprintf("\n ... 
%d additional event(s) not shown (capped at %d)", totalCount-maxXidEvents, maxXidEvents)) + } + return &TriageResult{ Name: "gpu_health", Findings: findings, Facts: map[string]string{ - "xid_classified_count": itoa(len(events)), + "xid_classified_count": strconv.Itoa(totalCount), }, Text: strings.Join(textLines, "\n") + "\n", }, nil diff --git a/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go new file mode 100644 index 0000000..b010ec7 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go @@ -0,0 +1,69 @@ +package triage + +import ( + "context" + "os" + "path/filepath" + "testing" +) + +func TestAnalyzeXid_UsesUsableFallbackWhenPrimarySkipped(t *testing.T) { + t.Parallel() + + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.MkdirAll(filepath.Join(workDir, "nvidia"), 0o755); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("[SKIPPED permission_or_access]\n"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(workDir, "nvidia/xid_errors.txt"), []byte( + "# Command: dmesg\n---\n[1000.0] NVRM: Xid (PCI:0000:3b:00): 79, pid=1234\n", + ), 0o644); err != nil { + t.Fatal(err) + } + + tr, err := AnalyzeXid(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil { + t.Fatal("expected non-nil triage result") + } + if tr.Facts["xid_scan_status"] == "unavailable" { + t.Fatalf("expected usable fallback data, got unavailable facts: %#v", tr.Facts) + } + if tr.Facts["xid_classified_count"] != "1" { + t.Fatalf("expected xid_classified_count=1, got %q", tr.Facts["xid_classified_count"]) + } +} + +func TestAnalyzeXid_ReturnsUnavailableWhenOnlySkippedSourcesExist(t *testing.T) { + t.Parallel() + + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 
0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("[SKIPPED permission_or_access]\n"), 0o644); err != nil { + t.Fatal(err) + } + + tr, err := AnalyzeXid(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil { + t.Fatal("expected non-nil triage result for skipped input") + } + if tr.Facts["xid_scan_status"] != "unavailable" { + t.Fatalf("expected xid_scan_status=unavailable, got %q", tr.Facts["xid_scan_status"]) + } + if len(tr.Findings) != 0 { + t.Fatalf("expected no findings for skipped input, got %d", len(tr.Findings)) + } +} diff --git a/customers/vm-troubleshooting/non-selected.png b/customers/vm-troubleshooting/non-selected.png deleted file mode 100644 index 657813b5b49efc4cbca0a68cb0ca74cbc136f73c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 18482 zcmc$_cT`hv(=HrTL;(c_L_n$(>C&YaL3(e}L7H^wy$AwUdhaL@dWT2?L6KfU2^~U| z7FrSrC3H^wJ?AM6l9G?Q1}@c7i^prBKDWsiIhYYuPG*x-*_MO{1Z-fxd=xt zLZMg~`Jy;f;B}|i7v+v^C5jG*8qcf6mN#N`do}ve^KRz9b;zVH&3L}Zb_2yl6fd8& zW=0XQQIX##QWxcXx($_XIz@Mk${7PIb@HWvDH@es_$HH|6;0dc+Ea38psPIJ4_XC# zBt!&BqmrQZS0sOb(S!2-Qk8WAhN`EZ+q4AsTcutqCosCFd!ZMNj(oof^g~o(bHNwY z)#jH0b2j}oB4tDEbdO87f7$yCavsWF?FTK`Tz%o|P{i8E?R=Y2Zo7IL>!W(oIdG=B zfNp0w+NZiw&SB`hbzD$FBP_*qt>VlpX!5q%% zAGqVtwStqPWo z4NnyJ?gn>^!?vhS!(^Yhx%LizU}PTjmX=84NbFaYw`Pmnj}*|W%XJvcJj5-Wf6uyJ z9FFSUu1(&OuS%)>b5)gZeC($f>Vd8i{spCQ#MLw&)sFk@OxXdj|=H@}gsXjbn|}Ks(6LN!Try*WO)w zE@0a|8|GuOHW(D2jy}REl&@^N$wkJq$jZgZot~|&wcYhe&d)>(dS(>LLClU&77^=@ zCnY}}OOuw;?oS{1*#&(IYhH7riT<3y;dCdhlFtR^FnyZlieCbxQm+k;|be@imF`d@v z&mZ9EWop=Kl=8iBf<2r5WzUs%>dM871fN3C7GT+lTmG20EdZsSJc<+{Uch>_40IHg zft6NmJ7gS@6p!i#@&1(cUv;+7d3sJ675978CC5IY9&S*;er(ALq{`>YXb7C)@19^u zbo*z-g|cfU3neF^Cr5_CN6hq-qY7{A z=#+JCOXT9uASt7s3uXdds!X?!J8*{2R9G;0QU|Y0c{yOLOa~k}YqT$oFNUu!SH#mz zJmoME*6Um7_lTL*aFZttupy#eg_~xH)xRKQl0dw7J z=kNnR+|hDhu^XRrIoZiR_Wk)(KGg_*bNiR{Dzl^K_m%y0%yC}e07}OTTM>E5gNAlb 
zgk9dnw4Nti;8TS!zC`2UyBBAr(sB2l!{s77Q8tT@ZA#{>4=(%?Y##8Zs_wpV zX|bjJEKDx)<#f3cxcW4|#A5!ZJcC@!c z)vc06jxq(g8~? z(xtla`|EZ;-rO{~N)+({ZzEMMgX+m8L=1Y!YQO>KHd_1_*FmNg)Vg+4+P)8jCIp?o zfHefJX3<{_*+twc9h-9)n-@JI(TIWw$=E-skk+}w4c)e(N>e33G8jge&ZG9s-_oI# zl*Nek{0>5}Vf*`|9pO*hl|RDEwUxsoA`INRKQHtRCt}4&FfQTZFRkeQkHFZ?e@rbZe`aAM!`^d(dw4H z=EBI}Ceki(>M+c_TAx`;J62Q3S3SUSrTHlQ~zj23cBGs&dl zqMFH3eCODHh@g63Qkp9GJc9Q{&VKe3bZ3G03yNgmkT-%=P=}|ba7R|G%4wi`6wm25 zV|MbcrrsQ^REr3DX=UgC6LDME)nuyt+V_PKlqv45R)(sHcIN4!y!d`=L~U(9c&fXk zDib*2wj9P-J-Y@=%!qAWfDdvR0lZw6sdGE}Sx)_QDwp9#J>Zpa2U(+IYt$ZHlr#sV} zlf3119_(8Pyr)2}^#=npGU2C+t0c#aMObVj*h%~qLnogl53FdRk2)iETJLw~lbgdz zNzsv!v)ShcDE8;7Bm%Sy^1ls>D5&dU$ttdwGo?O`B z@iz3U))XYDvZq}WR4BXdOHMax#;M<_xfSLhH$#MiH_3i0XFq<}bzauA1`@Qx^%u1Z zEFjB9p_Z&9tAA*~Ppgl%1{KQ`5TW*W%Z6%JZVrq;UfLeC628#zk3$4=zQ4!Kyy@N) zagkaOwGwr{eoV~GGaZ>fyr{tLE!okL+9(#_Vc56u#^7V`$VU=Va4)iKMxt45=6A7h zPy0{-68imZJi+?rTz>Vobr4P0)zhC#-j7}7LFIt$K@Bq(lrp~3eOz&zU4n8vk|LQR zV8b50ElRGeL%G-IDExIJQTehjSiSO*FF9mV)XheGEkL^e*lRT)6#Y- zs`K|OcX)}>Hl7%ZJ(TYlS0@wemxMzt*GnMKOQ{nWt#;#n(tM{r+yRP@X?Qzxx`q}= zvuTucCd4Bn3&YmuN35&m_;QI9b(g}W>IR;lNA(u2MFW{QZeb+CEgQnnJ-wNnU zo}+6QBW=TCL)ph(x(;X3I@;DfmJA0nZ!-CYmzS^BB*0^MvT|)5dd$Has z?`4E{LCg1!K-uaSryjVUCuF#vuwQIENpD>HLe*$LuhqM&Z_rP&p?O}vrxA6ArBuWS zs@ccz%Zyg1;}P^Bhd6W>bu3-v?~212LHK>j8UHY^_MtJTRNDYLB(+(P*(11wg!q&a z7RejG#JHa9p%Vj8|sYjHw0Tmg~-;qQ6kDt-wz8_DWw$- zyT~4F%4c6BcFJ7*5@9T?&>v>R&my`U&EKA7|2az;5jxmH%EtKS>oEBQ#b2C#uj`ID zhc?Ox+MN4-1K*Ig%F&w1sK4$W1J&$cU%-ZyV-3oD5hRg#_e8;z=3s|7%-_^bYzxe|oe|UkU_|wA-1Zcr2Zp?KM)Yk0fTMaPgY3XF1p)p0c^Hth8_QmBY>NTsO4B^Wun5(vF#T zGUQJSuQ;9Myi%?TY7^UeqL|<65CAHgK7G zqW1gxJV>}8oL}SO;6;erJiNAPVDi?0^^O$Blw{AO+HZcJrl;3qFB)FSx8!$*SGJ^@WG& z4X>-CZn>GCTKnxh)a-$6>W~h0n@5uAAz`giA&9SYWxb=BXt^T6MQ#}vou4{ZKAote z?Epyj_n*e~etoni%X@R4O!|-mEqgX5kRHzu)$%3a? 
zcswFS(XjHUk>;}fHLR`8{lF6FrYY!e@$rAhgrP9kA{bUqO zNay1(&!NEl22mBOcGG#t%)^4_W&kR^L%@Ag78PWH9 zA9T3mJcd`%3?xs!j8I2$pxboz@giqM(--HT3J37eP6ZxaOcF5Rvqa&=3z}o=<4IKe zoa5vC?OQ00i1R&v=Ox`H2f)?-{bR(PV;FwkYxoVQ5MxOaKDr}3uib15|lR7~zi+3>27nggNUzu2KhVRB@a z2z@+}!i#3mq{x{5soZ6iI7zb=6I1FF-W76=XE=QjQW-)B~MA;sS3``-Ni(!HoT0^!vANy=WdvwI_>@A6vYBj|>_6#Wm7)mJsCW z^-zE8sL@HVq=Br;RBv7(Gsb>&0OxME6`3I&uok<(Mc zihX)>7=DG^;3M&-gF+J07vCWwcix=g!+2~cIy5l(3|>Gc*|tGHG9~)f(H5i%%q!_t z;u$T8I|Xr6XQ4c7NQL`JqGeERB$VpZB+n`zY-s8=7I>yfMR|O%KNF7nRAQw=sQ@63 z&A62b;!Vn!K#=H{1UPTOpfqgGuRrVt3NcHMNsvBH9R4zi(Nd?dzph7li;Y010F>|Z zy#MZcco9&E4+CR}rKcnsW#fu2hI@iH-P)c(Ezdp9LTn1mtlWUOh3DH_J4T|hK4FT{ z39__(ndml{No!vcIwGJQGN7{tN(si#RT#}NJs*=I>rps9Bj!%)^bk1mExPj1|NfBb zFOS1=Kl;|+$0k!M|6Sg{W4T6H6xeSNh${eqjrINCh4QxI4Z;$o_HPlDDF0u7)aM@t zNjiVpxC#q03}1o!3(i(7rJ{vypfv@i&7R!sItg>~ugU+oUhMB5Q2FN0f&J*wabx>s14fQn84kC%dV>13fRp#ICAw|;x_gQ{+ zv$@thyptgLKROw;y>w0rlZvzKKWbl95`bw6Bw(|rS^{`-;-{zP{;CYUaeIE$(p}Kv z#81xG8`dZbwREprL1+{HN)E3|34!9Z@foTwGjfW#Z(Smg#POJ$LSQ=Xs30a<~riu#WnnGdu6ZXtz22F_Gq;scip<-3JVO)2LV+d!PE;Z>kn*oA1D z9XRC6xrEEJX$*V8S#6q)yv|KVBo}jw^_z7^pOc`rVxQ_J0le5B-I70Jw48I1p{i@U zvG*D`;#@_r^H;$N1@gh!TFIj;3yd=6wYj2Nm!PXU*;WB^t;J;;f?10;ExL8mo4ncw zgKizrfkg-^QP)>mIExg~hbR2?_)*tN3g>Kl+5H%`gI_;Na{G7iDwd}DyZQf1)Jh?u zSC)AaFCV6Txzf?MW+x5YrN#;IB?Tf4&ua0=_Ddb>dTzq!o`gw~vn@tD7!cu>`bw>6 z_|}{wQPfWnN{#u24L>#oP{A$opdXx^++d@!xqg;2Jt3L%#Yj!bLfCfa00ciz zq0;(%9T5f$!-Hv81FZrc5We^3=SMg1Nw1FBqo%G*`-cl9eVgOGs*^O)&30r1-1{QW z8m(vzrrgeGeKrry*5s{&$I4zaI5g@&F!Dne0Xly+bM1pdP9l9^EB@G_d$QlQlg?@P zFXac?Gly0Gs(V8c4Lk5#m|4HNx8|w&^IDrnm+fI^I60W7{u1+!%0Y{lkxx~}VK3-_ z>$1sRviOqOqT+I3@=I^PJ)E|#j;}T5{0PN_F9~(q{8kM&gH>E~+IaF!6t*d4fHyfQ z2N!9cv>mc6-CybKDEQzLR&hoscSXJB0@xWT3$`a%{Z+$Q|$C!VV1uGK^N^QPaPqJr%P+-F++kdPXAcI3vjuxATR(~aS@2$ zBb!?;Lv|>3P{`e6R5+K18~<2q?JhUn#4+-4TpTsh`h{)6Ndq7Kt~2 zjfUcTsw6-JA&6Yoz3u=2=X+T;I$cFOxmv8V<7_~PvH*WeY@~&P9HPwa818Ffc-|h0y_=2K z1gh9iC0>_e;8vsI@FC>7M2{{8Fnf6#Uy=ikf!iqL88D80YPw1Z6w 
z%=8}*B*rQ*TU>niBLIKs)@qIP&Uuo!`#d}Uw)1SDNnM?9!_L%ph;Jz$$QM9vblHht z+umM287%7w5CB*%p7Ou{?yOQH)B5IfIIRSVfa>j|~)iwDX((!tgms=6AkcfP+U%Ph<+ zdtP%CE)d42TUV7DFFrkVTbjhB{W8rct1IZd#rVoa3Yvz>(^W%pjdGTZngda=B+ydz zy$>|&hd%ZWA3zFk`qa2nO-}m5mVPy}Yo#>JSysbqnp9f{_`=CSvNY08fbc)jei>RC zm0|s25m*4F$rWO`POa#Z$qp z*bErD(abZ$*ckTwPfp;}0Q_v&VcEyW+}a$UvtwdtPb=^#dkFc;z!o8%8%CB#k&0IJO_)v{Mc6f>q%!5;ww}YS}O2sMbqI$wWKP z*U}`s_8Nng%4FE>@-asTa>fDbzs5Au1FY!-TGXc&HMq6=bIS7p@TjSrnAARXWpV`Lc=WT zvYiXlT=R4O9CMeteMok-O|NhkDtUC`fUnM~Aq`%z9}+9qyr^Whv^ONtJVcldWt+O2 z<8QUCvF$Y0hlRIjz4o{@uJSrJY*I?C(#?aG)0pM>1Na%Un~%U`StvX#{Ia+IJ_wTP zQaa1obj=kY;ch^?a@XHGvBmS*e#{IVA3~}{%j)4rpzLqz2RqK3!)DyCGRL>oUzd&( zfAm*f&6@9Yv$ujy9#u`sftZT*l2xN|(S9eNazX8roD&!`aqA%6!uZctK|s={i-(d6 z7VGPRYv~373U#LZ@*uS(GmvUdeA>|P$Kl{rg)wlYBJ6XW&SqD0*u}ME{v(hW*32JP zHmED?r|%Ov3hmU^4e<&vGlxdFC|w#T0f=k?n=@~xK3L_nTV)>{xC9Rvh92SG!apDi z{XFut<-N86r+fRD=I&xRe|K}pN&Rb|eo+x3{`eS0j4nVmXunElC zGTimzez}6f&xeP>g`;b)@{2d-bwPl}v+sU)xih%et)vzln(m5MF77wKtY!3NUCa9_ z_&F)`@Xmw9nZ-4D!E6zvry^N2- zs^-G#w}gtuVX7dv(VCD+!|Vo|rOTaY>deng*!|%wC?I2kf)}I(gtTPys~Sa9I`K<|Cu?zJQP=(y^TpR47gvR#jA|>4qkwh-s5f0ooSlT zGoT`4_X}i(AO4`mjH>&4$lnzWAaUun9*Ajgf}fZ@Z=dS^%sgN@^g8+uNUtHppg)80 zKn{`r#QN-NtQH;sy`Mzukl%S@+)5gKdN2gCFlwYEn0D{?(F^WR#7ia7*!$f87hHg&54+Xs;SgL>I{}-9Cz|!|XvNW4O&B!@aGO@~)lSbuQjW7!B8?s}BHoB*qSrP_yOL0jT zJ2Mk>E|XPYA+r6kfn-tg28$y2Ktb3+hv-lDe7xMXlv?F7L=XYO{m=(48QtD@@mi*e zUw~%{Pi3PTObRylF4wt$G+*__Nf3u>#CI{@N%~mmYduek5sOyh@mc@eQ5-!I+CDhwt{8r}!d*kX8JRmrHgCP? zEy=JRasxp86-n42ns8fge~7S|pqJlG(%zh{DPFBFg;0wRfpoIFyFv?{bJ`*Z;z2Zc zgxCbn7~o2CkpU@h`>CkG`wqIT)-L@va51Lh1X)2kxWBTgY4lcdPe(Iid0Uirnbpv- z#(HUyAtHAeR}EkEDD7)WRNU5@&lD1KnwQ2r?)R-~&H7p3bd{P3zg`_*?jHhq&MG)e zFwl)P!5j?6;pQty;lRtNtCvd?5^-5>@{xhbqOtBD`o$CZ)V3siUtBp+=hJ32sCZt! 
z!Yr>|dK&*|DAn&~tP%qnwsNjfU3QKXAw%`d$Bt5P{ek(p&q|g??gB^+3Gt2Q`|HxB zYXl@xlynfPdE-sPTzy3x(bV7cijbg&cP}<dK4HS_U`cx-&ejAQe%y^m3Vx1-MFBpY7lAU#J04{6E1)$D}V18M{yksQX z4c+w2IAqvfdl&po@Ox9~gqHNbLg)^|Y60{K?BDSY=7UNfUMP-!PvUUwc^1`XT_QL} zCRKp$h!;jQ7?`^PH5$T0-GR=9F%#D|S+?FD2ZCCxHBzwOS8E~C!!buPUj=|yoW4?q z@@|(V5fsdBEs=&7?H}p0ZBed&5+eaw!9W?2z<1%k*7SeleJE4V+?jhpM$)L{`gxIw z=jI~px`35>=g`Bm+@wQ)Zr!+q$iURj84Mn=R`LlOh!DvA1Io-%C9a_Y+$@Xm_%KGK zJT-vE5jdiKVcn1x-JlNV9(Qh+kHAe@q8ncgV=qc!@Mg{Rip*f877NXXX%F0gaiTVx zoZ5m9ecZLQ4Ms|=@kWwx`WC$`w2$1htScUC=Cbi%^L*9p%$l~`kJ9L|V?!`(`xQ6f zi`926VoDXBH*fYcM=IG1w;x=)5Or^*wo;zP6L@K@)Ik4v?PA24%7&S-0Pu}kJ2yxZ2B_^U?!B^et5xJ^y)<;seA30}3D z%@6S^^SG9{_6NI2Q5z*ST3vS^dwcoS?e`+*d7ooUHc)W&gH?V;U0v#3U)C8Ha8Us` z@8okxr^9Epy=D^koRFRjO%ROb%yHbKfa`gkG+UoT5;=1n@qE4*KSPOvR4mGFn5PflpIXYHHh^0fI(a*7Ug}*sRtk!`%reYRgK!Y|VO7?(6SuB&S4lTFoT2Oa1U>sy z3&sbW5%!tyrTp^(0@R&JH5$tDqHux}$6;1*tqft+5sRDVLM1QDHt;l8FI%kWz*$s&za{wG)0p(=Wl7$AcTlpd>C(CTPuwAoVS*K2g5t zbwTk}` zfcU`&Ij&E=@&cL~^lK=6lGr+!PaD%mn&>Th6~E%o^hPw~egoXnA1C2I23mke#OIznEYF!K4J~tvji00Lu>ZKXBk>@;)*}&Wh*+?_)lrn}N z!{afjy?Re)pQ5a<3}F2a+r&-Y=d*f^&^^vC6r06cSGYJjwjPZ#icqF7-P&X<*-0Of zT2WML-%jU!GV%t(*U=^nmB$1{mtxwF8**!@8|V$JsOGm8GYqu*Fk!1rFukF*N%9l< z-m3l0wzH9_DFZ=l^9LhX2O*_2o0pX@_r`J1*e))QWR@j9MWp%+G!G{fq?M(tE{|;I znY<^rBA>zzFyeo!wM$0d9y5-L?1v`uaWngw^XIz&zGofUP}-fIV3O~SGO8b-tw+Kq z4)b+(WbnopXr^BIgEkN0s#^WQ^@aJp!2F-FODGAg-4gAK;SR6U2(#v%fW&bZUnxho zcMF_yAlFZ#?|~*ueun)4`8wKVB3TQyx}8Qua!>fbi(4mh1x+_4F4<$bmP|fB?X&fC z-aKi)Sg_^`ZwtIKr*L9o#rNW}99^9{G}K2j{g&Wxl*F?@kIHmo;~14eXTk};K@8q5 zqx<0jQeS(&e3OK`hVnz81}-7=DJagEC`iU;-T{2Oc3@Up2&#kPynTdsh*0HWKB)Hf zP>-n|SQoDcJpFNS_G|qwdqABL;r>q5Xodf$B3@KG$PUiwV&=sxZdP=&O9&8E zWm8C1%>3hT7$2VxLA~|yjnLx-`OTBe5Jj1)e31zVEKLj9XSj$O+O%4K>(vl%9r8U` z1HKXaAvj}VsYEXFNGjL~b4$9nhZxW4}QI^S>(;b`;j}&aXObOXRrn4{-Yve&xp2 zOB<`EcMv+1MZguJJ{Mz3(8(O{2%9mP{fXJG56{Ca1A&QVRbrtv;ifLwumXvARV8?N z!4hOPQr%Fo)Nbq%xZz$9eFv~%e6e(2KcGmbV2L)8xsF(?Qd_gUcwaa5ow;OU691#y zEwKY)(Pgay*6=L7PY;ta>R}}pYSsCZ#7RNgwtD6}$DtK#oHbL4ntfr(HTz8IB 
zgf(E#m8Imp-S#f_3kbxJ1G-r;piGs|i*&GwI<>QhxAEX8LO?UL9#&)zi0aCaw`ZGF zY0z+NW^_h$;oZ3#!~)7VwPVSv+_lZY%~T5#xp={r9=Ee$-tO2pJ6+N$5pMEcGi}2j zG(xm;@B4wT^vZCV*bAul6ShG=+*M2G7EW%QBcZjf9C(Em;HzIm5N<4G!+aItGcc<0}| znE;lyKLHCo(`y>&K^KjL4C!B(DdBTAJHrlpQlwnQb)?DQ*X*O3^x0DGrF8Db4BrVm zu9&Q-qMDwHCwlsd`$@Z2IpheQ;(fgf^LvDri@+~3GNvd)=6QVff$`=2-wElBEUb}x zq{;YWjm|t5p)8RX#J4RR#w>4G7vY57|B(lYq5u`fH8 z5npSQg>}i)zeY8Gv@>9PylOov)nQfOS2dvNvFC?C)Bu#(4P1Ml|caf(1_{jAgMrhMY;J7h?xctRUc{Ivq?_0r=s1?7%Ni`c-uZYtp!C*rI;sY&n9BweD`=yu z(Q|@U+Cvf%Cr7U3+jl2#>j3gg!>Qo`vA>M1l}B;h<3cc~L`3I#zrf__4;M zf(k{L5pjZC+rme20o1~QCR3sL&5_P6IEBvgRS?50WMpus7&0#RV26|SwEFnp` z%%*&D=y;jZyKOT_HyXqzYa8+*cP+j@^sc|98hVNP`(i}!SreJC-Q|i4#P)@?^Oz9e z2O!Cm(D_4QaMU&v+?B5sJ@M#vpaiLRW^(sgO<_h{8C!*b+(QciA=$mVT8b{W&x*gh z4D5HD5pY_4AM^6}kiJ)QXP#fuN*9gU5&5d@juC@adQq2OAp%QQ>{xc>m;O*y@%`*c zlz7RxIL&^pPO#3qv*DX4EAZM9xAn%E7Q;nwC*&CiT*7ONpH|2T?bINM%-c3d*Vl_ z&+okL`uMICp+x&bAx$2Uodgnhulzt!PAJ9vP)TID=0bqUiyjW@>ffr1dG8bAG~fR| z@g|nx>_RPJG`d6>PS4~}gv#f#43)D|;J51Ahc|W>)q<>5CPV_FC%%qJz+R613BP+2 zpbSq`RGzrL(ySI6amCW*7=3&6*Rtj5AFT+AKqdOX>a>jc3Ilf}(?W3Juuq|R{?Q*z zjo&^w9WNIGD>fr0ymW0SJ9K02faExA<>9sPGeE_SnnwSnZ@8lDBf-xOkiy~&-$V`v zq9-Rm&CQoX!V_3G!+H*ip2AVX&-ptj;xxk)N|JZ8yGi;uH51rhg+~BVqgZdHJNCX$h{-m2TfiSY+@V)!zrNj5LxVpj?I+Ux#4fYeEJVtm z1r`_DQRz;}m9e;4B`yuPV)XuMn1t7SHCr@2UDbr2ZGB}WFSJ@a^^c0de7JQWygly?0QE;1daAvKN3W^4=`fc4STp@xO1&(yGm6>|x^=bw z)&bU>7-4+Ir?kpzF;_US|KoEh66i(ahWMyqGq7Ceh4P`lbi(o`EA$N0SW!6IEGUJ@XVx8R7gQgb zv){-r@h^L%Dl@XscpEU+|BYptKD$<6Ogmn9rlepNIyZT()=SC@<*+WWpF(QnH$x13 zpx^OOBsqVt$6ztsaw&i;v5Gkm+)GZNssl1Qdwgn0Z^AM*b3F0U>m4k>dyb)mXmtbY z9Qu62-!s+B&P}KZF}k{D6m^(N;K(gPs*Xfgg|g5Esf^c)rMn3z!<0C}>bNcq?qge+ zj};2gXiEzK4an)fN#4lYkv3z8JAWLmFcNw6E}*jjxoM?J8=X%MW#MMfh`0R@ z0_ji>BVNs9*K|*tx4=?@+&o_8G}!>gs4ke4jbJVO*Q%T~)#0JHTOz|p1CrV5cU$aI z;if@NAmX2X+6Hfb5HM$%+t5?G)Ja(`yEf?;8re{m%tw2c!@d0}^(}8)K&T4G+`k3C z1nVLlS!h9>h;fUU5g`j+%Mb7~KvG)UAxzkO$7R#vSt6O2f5>bLOiRYN)7DYMNEkeuUWZmC>* z(CgOjADFj1{g{Y2h~wAMef|tb_|1WavSDrfCSJts*gp;h(vc1lsaIivT6S6G9uQ)| 
zd#G$_j!wI=Tb%z=ai7FHS?BAvPg^oaY;poSoXqbpxxf*8SihTU7Wng`7rmHJZF(@F zdT1jR==w3?ywo&{fu*~Ty&D&;8_G-gIbNbSpe7!Y)#WnnVo= zFr=+(9BV1%Ab@J4RS*09{xoqO@x^D&%1-*rUmW`~9Q;;96|9VBkX?O#>$jk3UU(mK zLbi!-T5XM^-1cjgJ`}-rOdH~RZguEDbZ?{ByY@CMnzp2QIsAr5n{*I^D>n^@`s!8E zn=T5dl6OCUmdV8Bz3)aTyoQM|?c|mmbc?E>bde^fa*;1kVvZz>8MkqnyLs5(yNEq( z+wEifQ5CgQblEDaI`YrwV67Z%yBQ|v2lScC?V;F3THv)`cOIFS{zA0X-^?S}EP`}yL{-!5QdRwJ6+Vb>--7*ZTxGfXryK6SrO%kq zIU_%NMfiG3Mn~@dG|~{B@jjuE`X=B%IEv-}yr21RA1^sm@HGHkShM4|d<}o}1Wf=b zAnMcgfBOIaXE(GuQJ^#U?@W1g4T+9x#m@1fHD^?m&6d|}6y7O$d< zes$QZZ+tuF>5I*mQTc|mK7laOKcOCYeQkfn&qoqd$QTaOc?mNn`(vy@n0(7OY14i@ zh;XCmZ}*z_A9J=0nO6cuOEZ6Gw3T`Ph;$|ao##(J3K*4^j1cCIQm4O>kq{<}luhRG zu}av$!IKTHxgE`mQ|4TyF~U^gPTqGy>Lo^)a^HxEU#?qvhK==s;ZUJRm0>oF$QDgx zYHL;G6J}ZO%zWmAyD|qSZQ{8s>T3*Sbu!!w{s6oEq<84;u-EoAlUWj&h)WH*XI@T( zRu${}#nN!~Iah)Ht(Lv|_dGJ+iRr1>hXvCq#ODu67nC17qxG@cH4F6*4KvH-T#3^d z4>s=SseQUV))EJgw@MN?6A-abcrt5p%Y)_7a~nw(in=i*Gf-jUfij84&Fb<5D94PyN7cssi+!9A?^k%OF^%Yw}hcYi2=LhgrP`k|XdKmE7i%_fGHy1Z> zRZMlO{?+Wh;pj*90dx5qlotpMnaj}$l=s7bX0v*{T^3)StsP08ebjSxER5Bh@M)Nz zFHkdg9`x>pzp{Gi^EC-(7P1MYnu{D2&ZoV?wnTo4_Z6_Ve1!xbQU)><83(+40WDtC7p0woooT-jAJVD>j8V-GRY42Q+j+4lJAJPPet4D@qAC)?# z^DudOXL=|6NvAK3)B$*5KWlq9kKcp@`Ai6++ zS(tZrBO?DZ#TkLDFkSo6Hqa>Amd^fo!M{4O69@p1-Z^( zF^%k#Ns~bFAxqAzw@rHO91Xsa?fV$azj|v3$=kh2c8by1l0Y!582C9V@jacX$sORo zE47jQtC2(7ZOu^nu4XXbErR!J%kZbP1M^fU#lma28+fCKCSTvJO|pQ0_TJcX*S}(V zdfjOSWW8f|<+@f;xQU@VU0akb>wDY%cfeGKK1A<({tm@>B9<^LH~Fv+(scmKkCBN% zBW6gYmpDdOMH5HmuCQR|j?>u6u8zC(9g6X_)Egr|kznCJ?Wu_ywORz} zLab-@IyOPhJdd3Q;E&Ov1WOyv5#Q5f5^8Bw4U}^srCN}u&3-2D!m>@(!e+6(7PB9- zdRp4Xn3Q8ZY9YROIluQdcWbuN_-(Im{6lzryUnQskjZ8Y+8plU_St!Q@Ud)-F(?+w z5fNjcZ3+sqtsuDxB(Dx|JNX|gwu!I|wV$_H?$y=zkq5rtG7S?w@m1lO^W3FIwT)Z` z=jL{bKAL;*-nX)|dk?+jGvD}2d{aS0PuAEn7N0fBMSDbMhuVG}s`;(2(QG zp=1*q{^o7W-D}LTr(eI-`*d;IQ6=k=-Yu6;ZTNa8B~m*2+RINyOD%3Q&6E>(wdQn* z{?1Qn>3f$c2>2JroqKR^Ug`D=f0w=e`{D8$;LcyO^ng{n@AkA#T9C{!>E{pN83x}h z=jT6PE1}i9G3slr0kJT>J9sZP?ox 
z?6+C@fZF7Mb*%$ONjejQLLaoc1#zglZ-}TZ4|w()(0!v5s)GeP{``L}?d#xaeSSYM PU>H1I{an^LB{Ts58=EFZ diff --git a/customers/vm-troubleshooting/schemas/manifest.schema.json b/customers/vm-troubleshooting/schemas/manifest.schema.json index 251ba7a..99708b1 100644 --- a/customers/vm-troubleshooting/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting/schemas/manifest.schema.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://nexgencloud.com/schemas/vm-diagnostics/manifest/v1", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/manifest/v2", "title": "VM Diagnostics Manifest", - "description": "Machine-readable index of a vm-diagnostics archive. Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", + "description": "Machine-readable index of a vm-diagnostics archive. artifact_index covers collector-produced payload and derived diagnostic files; framework control files (manifest.json, report.ndjson, SUMMARY.txt, metadata.json, transfer_commands.txt, schemas/*) are excluded. 
Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", "type": "object", "required": ["schema_version", "archive_id", "generated_at", "hostname", "artifact_index", "collectors"], "properties": { @@ -56,7 +56,7 @@ "journalctl", "dmesg", "systemctl", "dpkg", "rpm", "smartctl", "nvme", "json", "procfs", "netlink", "sysctl", "ps", "top", "text", "binary", "ss", "mount", "lsmod", "pip", "docker", "nmcli", "networkctl", "resolvectl", "bridge", "netplan", - "iptables", "nft", "ufw", "firewall-cmd", "ibstat", "rdma", "apt-mark", "sh", + "iptables", "nft", "ufw", "firewall-cmd", "ibstat", "ibstatus", "ibv_devinfo", "rdma", "apt-mark", "sh", "hostname", "date", "uptime", "uname", "csv" ] }, @@ -67,7 +67,7 @@ "enum": [ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", - "packages", "storage", "infiniband", "processes", "config" + "packages", "storage", "infiniband", "processes", "config", "triage" ] } } @@ -77,15 +77,15 @@ "type": "object", "required": ["status", "duration_ms"], "properties": { - "status": { "type": "string" }, + "status": { "type": "string", "enum": ["ok", "partial", "failed", "skipped"] }, "duration_ms": { "type": "integer" }, "artifact_count": { "type": "integer" }, "skipped_count": { "type": "integer" }, "error_count": { "type": "integer" }, "facts": { "type": "object" }, "issues": { "type": "array", "items": { "$ref": "#/$defs/issue" } }, - "skipped": { "type": "array", "items": { "type": "string" } }, - "errors": { "type": "array", "items": { "type": "string" } } + "skip_reasons": { "type": "array", "items": { "$ref": "#/$defs/skip_reason" } }, + "structured_errors": { "type": "array", "items": { "$ref": "#/$defs/structured_error" } } } }, "issue": { @@ -97,6 +97,30 @@ "message": { "type": "string" }, "hidden": { "type": "boolean" } } + }, + "skip_reason": { + 
"type": "object", + "required": ["reason", "detail"], + "properties": { + "reason": { + "type": "string", + "enum": ["disabled_by_flag", "command_unavailable", "source_unavailable", "not_applicable", "permission_or_access", "daemon_unavailable"] + }, + "detail": { "type": "string" }, + "artifact_path": { "type": "string" } + } + }, + "structured_error": { + "type": "object", + "required": ["code", "message"], + "properties": { + "code": { + "type": "string", + "enum": ["command_failed", "command_timed_out", "probe_failed", "artifact_validation_failed", "artifact_reserve_failed", "artifact_write_failed", "enumeration_failed"] + }, + "message": { "type": "string" }, + "artifact_path": { "type": "string" } + } } } } diff --git a/customers/vm-troubleshooting/schemas/report-record.schema.json b/customers/vm-troubleshooting/schemas/report-record.schema.json index ab8f228..a06ab0a 100644 --- a/customers/vm-troubleshooting/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting/schemas/report-record.schema.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://nexgencloud.com/schemas/vm-diagnostics/report-record/v1", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/report-record/v2", "title": "VM Diagnostics Report Record", - "description": "Schema for each NDJSON line in report.ndjson. Discriminated by 'type' field. Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", + "description": "Schema for each NDJSON line in report.ndjson. Discriminated by 'type' field. Wire rules: UTF-8 encoding, each line is one complete JSON object followed by \\n (0x0A), optionally preceded by \\r (0x0D). JSON texts must not contain literal newlines or carriage returns. Parsers may silently ignore empty lines. 
Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", "type": "object", "required": ["schema_version", "type", "ts", "collector"], "properties": { @@ -18,7 +18,7 @@ "path": { "type": "string" }, "command": { "type": "string" }, "exit_code": { "type": "integer" }, - "status": { "type": "string" }, + "status": { "type": "string", "enum": ["ok", "skipped", "error"] }, "tags": { "type": "array", "items": { @@ -26,13 +26,13 @@ "enum": [ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", - "packages", "storage", "infiniband", "processes", "config" + "packages", "storage", "infiniband", "processes", "config", "triage" ] } }, "duration_ms": { "type": "integer" } }, - "required": ["type", "path"] + "required": ["type", "path", "exit_code", "status", "duration_ms"] }, { "properties": { @@ -55,7 +55,7 @@ { "properties": { "type": { "const": "collector_summary" }, - "status": { "type": "string" }, + "status": { "type": "string", "enum": ["ok", "partial", "failed", "skipped"] }, "artifact_count": { "type": "integer" }, "duration_ms": { "type": "integer" } }, diff --git a/customers/vm-troubleshooting/schemas/triage-result.schema.json b/customers/vm-troubleshooting/schemas/triage-result.schema.json new file mode 100644 index 0000000..51c40f4 --- /dev/null +++ b/customers/vm-troubleshooting/schemas/triage-result.schema.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/triage-result/v2", + "title": "VM Diagnostics Triage Result", + "description": "Schema for triage/_data/*.json files. Each file is a per-analyzer result envelope containing classified findings and typed facts. 
The finding object shape is closed; facts remain open for additive analyzer growth.", + "type": "object", + "required": ["kind", "schema_version", "archive_id", "analyzer", "findings"], + "additionalProperties": false, + "properties": { + "kind": { "type": "string", "const": "triage_result" }, + "schema_version": { "type": "string", "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$" }, + "archive_id": { "type": "string" }, + "analyzer": { "type": "string" }, + "findings": { + "type": "array", + "items": { "$ref": "#/$defs/finding" } + }, + "facts": { + "type": "object", + "description": "Typed analyzer facts. Integer-keyed facts are JSON numbers; 'unavailable' maps to null. Open for additive growth." + } + }, + "$defs": { + "finding": { + "type": "object", + "required": ["severity", "category", "title", "description"], + "additionalProperties": false, + "properties": { + "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, + "hidden": { "type": "boolean" }, + "category": { "type": "string" }, + "title": { "type": "string" }, + "description": { "type": "string" }, + "action": { "type": "string" }, + "evidence": { "type": "array", "items": { "type": "string" } } + } + } + } +} diff --git a/customers/vm-troubleshooting/selected.png b/customers/vm-troubleshooting/selected.png deleted file mode 100644 index 1d9cd3e82a456fcf53928b97a40477d3d96a96c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 19212 zcmc$`XH-*Nv@VPw@&XEqA|ldzSE}@)^xh#fMT&F<0@9_4i1Zp-Kza*OLWfxBEl6)k zl-?r*h#@3=@jYjJW1KV2kMG_)#&>@(7JFr9WzDtbn)8{@oO>r28*1I4xlcnzMs`C- z`>81z83h9w*%g6*u914iItK+w%@u!BEp;-;#Dg8u#Z@;o12rQ~w} z{$yly-~awy8T74kCL_}=(Rr$7{>I?|apR4}Y~l7LxJNSl!y{@A0qUFFSFfwZUJDkE z{ju}*AK*2!o+0)YKugbD&fs83ILqeqcA$B|++OLyW6KVCn#jG<+wP*hs(Z9Uzbgda zU%US7MKm=H7w6T(7UZ72CmIT0bOVQ%B~S?6&Rh`d45w(^3s`Qlsj8l_WFiF1)zkz+|M+&AnNFY;7#Ha z#9mCisJ3fem$xx4$2KVJmoD?z_^mM>x9wuWY9Whbv`)l`eKNVvFB-z0Z@yy!hW?)a 
z0p~Oeiy($92UE)znc@zmQ8}0LFY@dj+Q4)-M!nH`89L+oR)sJxV)DwMlvLjl(AiNe zEfifStN}xDp+RzmE=Ni9LrIL_VLb(9X@B|YHys4_2ocQj{3sfmXCqevpNAe3e6zGM z?^K4aNXljgw+-!j)@*d4ipm`%%l0Spn*>=ZuWfTFwQHy|wO)3Kz=G4p9{Q_;+O@Oc z;%V=sFGWV#h;G3Zy~eJhxMZcv&A!vDaa>x+C3i<%-`J4$5MfL>lhf6-OwGZ%8jB8I zob_=9GG)0)=r3&$yqCv6rZOZOO~)nLx-2Hqx^#lpgr^UH9jSvko;u(ssfLf=6@*0~ zPVA_5%(lwah=gG(7aZJkop6{9=A{mv;36ce@3wSHzZbiy!cmK^Pj4=>7PFkjPDtt$ z{_5tHXl}QWQG`y94HeL7M+Bd~!9Xfaw?ac2!@73@Z35aNL^G)575peVt=adM%4kk+ ze<-gfx&(hUN25beS=~oiUcegf=;5A9f^ILbQ!`GAgua1xdv69f?s6&gYM1rs1jhQX zEs6<8LTGQnNBAW9yY*YCB3W?{B>K0uQZQ5gE_*?liQN`dw=5*8BA_BmGa^&nKZI70 zJ##!k(bZYJT&KQ6LYQ9c8=b?ss4qsABX=eFGcgmXYEvS>!Z*oLhXZaieD*OB9Un)V zw{-A6!LA#=F5pwmpzk-*so6@8d+QG`Gk}j;J zK-=EZ%{y9zs^BxNGi#IAXR?7+SX4}~bK@)l%~I7reF%QEOwd1XQB1vs1m`>0h}cI)wq`;}#Xu`^< z%81RJk+cA%$ajf+xj8uZ9NC6W+c08ls=&{q(@IN6$D0yHag{sn=(B<~W2T7kuqG`_ zf`c;=m7PdDebCU5Chcd^zJ|%#{j(p0^sj8gHUrcB9G%>i!7lXv>SXa17zmSjFR0`ZG z2*E1NcFdU;8o4=|dW+9KdP#Jr`d-j%?RR|sZHu!t7Ns%e%e(zYaZ7l=_r#G4*8S<} z8Nmaj`0(N`ZsVuyI#uDM-_6Pf(XCOF9-?)9R|hgTs5PT$`kVz};GXsoy){NOW;3A# zc7$}SNG*YOoa~$L*Snn0`0gL4hGG*gtO zW(a)7E5yul&0XB7!qg=VEZiiZjn86JJJ;=~fXRP9E&7fu>>xKdafXRI$DJB8$^Ygs z?cRYIG~$NmJcH4 zmsl7-1T%VN&3XL@fP1-sli2)Y+eCa*{qX`0SUdxlPyNxzx3fN$UbKj`o)EduNddc{ zcOS~ycx$Zs#Y9600qD7tcH^ zBb$4R$J3frYH2UU!P!jobg39PGf$)`W%@Xt#TlDCemvKDmsu+CW5Lns+Hu+r6QQ3N zWRE%3`29NAHUHe!$nj^(^7N=zs8yitVCBU3_2#mlZ%S=af*U4AZ50fyr(+D+c?M_V z&LfEc-_Y&?1t-{b6)6KSYM7#0mP zT{Z$<|BBzyCSF7*bDnNtSCdsLR~&u^4c{3EQ--AwrL;A#>RA z<{jxDgI$*!4|WJ58BOY2#Ax%u)N@(Zi$aw=2^BpeYX`R9|2jnRn8|k;49&iMRC1`?aLSrWv2$@L4^Hh(42J*W~n`0#XBLjtdYN@IuJfk8xij{tHo1IAu7 zK5piLP@3EpU3#`br1<5UHvvn#Q70S8r9!-%*BeKxWHiA#jeC_=C4*0@9jmc>8#9lZ z%i3L_c9>?1xlnwF^wWyV*q@NHJ=nMmyA zMoGGF85Hp{mTwJbUamNm%hH26n@6=OdbWOjNtUfY*n@#TLDr7;Dv(U!${E;=$yoN` z50~;*y|$lfg)uIfz{uE;B-}edFPKKPm-{<$`$2yAeq?BoVPc>`@}h$E#N}Cs#2=QU zP^}6>m^JGjm1*;NVnGGrMaAG-HsMB3v*8ZYJpL(5*zin*Z-&yjx`mXk^Qu5?c9(nn zG~g#gOF`*|Xz;4N(hm-*lizKI?=1<-)p|$ID{2XF0BlsMdu8V1P 
zhUl|?@)VWr@?fCSiv&VaB2C@q*<{$7;CgPq5k+ZMVoZI<)T+5=fuTYm|ZXK*0hH4uvirIy6y38pg!aI@^ zLL59~ZLxF(GA-(w07*Wbj>D6P$@P}%v*yx^WX`c5W$hWXX_Y_gE9Do`zny zRp_vruyE=Ot!o=V5K$)l%%yBUlPbcmOYzf4up1*bqx(mq{1IF1k%a#n72Kbc!@Ut+ zQxlP+U~~^R-tfcq`(D2Oqh|CS0I|T4cNH1wL4bDpUuy=2weg{HJw2|$yn)_kgSN|c zhKo3f*|A0sqQQCglBPR4M;zh{(7ZTv88sW-59JDAlBmu4w!&S~ri4~!M#00`6PDnP zkpc((+%Aov+X%hm{PPUIxy77Z>^a8R6Iq-hJ@3Bl$v~0tHO?{!eDNbEpC#q8Yiz21 zTNk9OC-_WGg=+akLyy|p=iSI-M{AgE08%cjv+(Zjz*Yc0-BExrUN$#(1_JqcHH(XA zY>iTeXe2L;FPv{2%Fl#ny1bdtoPe}=Zl8>&;w+BW7Rz^6E5zPTiD}X2!vxvIDs>XQ z8atIA&uEww)=bl!P?|kr+9{1kL}oBw9!L+tE+bMNSebqAbs-0{L<*p81RdzS`kjlT zJAXB$Nq3wm)ssA|W;V18JA7nA^siKSLWW5CO1)B-^z;NBr@Ig>^}4JV&HFL#ZCt z`9r7e7hn2@@FT;wd$NuxDp``<94%3ydQ>#0oCRKbkklL8&s6Ha3{h*IbXaah#!%1r z6~61`eU$om#)DCPc^BNKDfIznxx7>T)XqmcqUgL$`juSm8SJR-*h~`5Y~P-v$Slj- zSk9sI0r;Wd`RsL8Z~t@V1@!49GMjqunisVGm&P3X5bZH*hk={+dGA%va9@z?x{pAN zzf74bYo$fI8Pg%Z?1)LY$vmNges19W4?udD3d=j6>;9x>Kk=f?yl=CfdQ|(%*?q@s zu#N0JWr5nUqe6v6cMWQ|ysJg(afWhzj0YHcrb6^PWikp1vmAFidpvk}=b;(R3S0`6 zqn})2rh=8CYXihtg~L@ZB&QnO4o*0ji*g8i9vg4Zm#2 zT)?QI`&{_ahuVF0G~?F$D)=I#2ln(3ifoW6W!}kVfQqwj))0amo*ycGj;6u-*&J@6 zy{Vat8hRHwuO3v+u9}H3V)}Y_lsI3GyuN>)*{AyHDBRsR7)adSS!JyWaI`PApZ4y& z7;9g#5N7{um!6IuGDoOSHpx!;O`NUW@7v|#jY~ktnnEy=wXWK9ARnC=3yu<^LWs7x zYRT;teI{Z0%F|C?6S!ef21++;<|p5R8_O2p$+{gq?d$#(JNez_6^Au(ODEBt#)CfO zftutA@MeI>g@?95+f@Wg$&&1SJwNNTIeMDFw{fO;hIVhF;iT5*1fIKCSdN3mr?XTY zyEEFS_P-SleLnIx_&UGi$G~T*8y@zYmkqpgdjfE8Sz5>!ISrJ zahyLE1^=i58;}>Km-9Ypp}UcPpV7E#1HqJ3448BKZO_GBA3OC3JJ| z_e-n5`5LttaTD+!QxTk-WYhG*G`)|L&r5S}MW6UHI+fn64&K{naJE#d`4N4`Q%B@ba4FqvBk_+Zugcf~ zZb|#jDIo!`KGY^=p5tR*v}wjj7*rvpJS$KU=MTUY$*&hj;1|;cX@#a<@Su(|e^<@K zf0R5jI+#sUelxJ>@J_J39_W7kV5YR-eYjaEh_STGxcq)vKVU-olm671Vq`7vVoHQq;U9N4B=)#0BBN4{-Pr74*=;Xm0o_ z6jlzN8G@ggTStbPGoD+{ofpV@F23KA{}!CU9vOz%!A=7t-l9mP!!_QGAcvhgRBr!b zM2$b|iIU{#o3>wNRIE>o^ZL;R<8FT$N^J&srColIc``kwNQxLo4}**dM7gP;WN^88i4Z8|+qYfl2;q zsVJ`xs`=xp7n~U?xTI|{yw@DN#KgOytyyG}|0#?vDjI_BFnr?p>$diZgQS?%S5@^Z 
z?CzlGCt=p=Okk-{bGFwv!ta?(o=LchRWv5n+8P6VbK+jWN4Dl^UpbstyjAqWMLyWe zL&u7(6p>gOnut!09oFO7&*Uj#i$BGl+G56TwwzQ_g~TbJs&=-u9cH(7Kaz+vtOjwL z%Dnb-a@63F2>(nF3y6>v3v7qI{&F@+@f(L*>psy@3tkVR4;l}0+WmQDDx&+lEZ|M@zq5^iNIqkSHQ@Y{v)%Fqge<&NnAOX6>CXGgUMd! z7lle1zK_8fgGsVxKfIQq)B2}&J%j~GrCxoPwQlD%Nr2x@zpGGQmUypIMW<{s+!2x@ z&VnCq)XGVEfffW)bh3&52c7qSKGpvvDJzE-h8;off?&yheGSaNVl|l!BuSh7xLPl% zEz_E_Bj3z9jY+?;_*i`LaqsZjKz{Yh)`KPY3VEXnJOj^5mFC7CePAU9l;oeibL8?O z^xs|7Xg=<@8Z*ZZ9N|V!Nxdp^XWLD$_mwvMRp!3mq8t7dxz#Uw zmrkQVA2u^zzZf@{Qd`~=+}jrn`Hm}4^yvnDa9xHJlC-#~5v;VaAeptdwkHVwk*+5$ zWA>vi?1)ivw&!&i(Jwq~alt5AsnhF0s1feL73pw+991kStRLrEe9S*nf*A)T%%lg8eBseW32j3QRdbW?c!d!O7Rox2W^wJX0OEQ}=FTMflO zNBWnn9QAZ;8c2{|4Y>N(7OB!n`vefcv)z=*PusI;{t9hE++1;V6(&I7x93$D04=UL_Q##Ag0xN*1*nQ?>eK?@tacRsYGeOzK&o_b<)V;y?|TbicX& z+SP4_4Ch}nTo$RQ0zQrfG2a94ob={GG2;y9G0?$r{6Sy|Z2AUR*bMHKHm`4drA+;= z#ZG;~YgU+FUt9LjIjCflKELdAOafGl3TxKaRj$th8dx$+y8LUV{RASKPh)dA`4W&(wj8vNol*C4bXB#Vq;8;kDr@-x8E`=(8z0 z8UE9gZ7==xX#d^&)x}dW%#ZlaU*yl!_nJYZ{6mV0!mc-ORL^>#PKmY+_;s-U-&|5i zo8dn^1>edlZw3z+Hfb=5ZEWKA)ho=5WiZosx(YrHN}C~E05U!)aiV^>W6MpJ0p64Is*#xN<4ot4X<~*GLpTY zF>ufFcKn#8c8bah9%neG|K}zL|J=?>@AW@?6aMl-7%Pz~hmBp+sg=c;+@Ughxd`Dy9M}^NO zLOx#l{?`MAqeF3jaHlAT_P1X|YAF&lu%nW$J}qmQL(dD$85_mlRj!=q&|3XPuoMmY zx|me~j!d{dwSl6*Z;VLN)XRY%c;~3Otq87JaC~ldfD2}p-0GUyH`$vMWry|1wd;+m z5W>p#TRfI$JvV&hDzZCfJccJJ9n9V*moHKVXhqR*^?6YKkj4f};JA7UYJaoV&RXr+ zg;(lg>ZU^$<3hR^zbtCm56SGvDwiOs*kf)#?E^T^2(y*pe8v6M26N==w~#p`4SSoc zyU6upiRgwqz8pwjKe=eg7mN(_jH+f95&XHVjPDzU)CFRLkPm5^1bu6W)=M^u!rC$D_h=?5+q}P;t+WRH7k3F z79UqA#_BL@Z+~;>{K*+r8#;}~i#L(*vHAv^{q{^$tWcl0F{QQr@@1h4QqVR#bExPe zdBROf>t-)<;nXVVC6V^F2#?qIJDAHgR__I#?f7%=(q={^Bk$Px!+AXsJJUJxm z$is16)!S6~i=-_}Jzi{eZkXS3`?7T9w$p?Q@}AAl`_!XG$7i2t<$DDN=}3yuTZ-wF z5j13C4zgm|LsB0F{CH&VPuJBs%5$_+bO-J5qI1jI-(Ekc&~Go|h9vrnX0S=k`qUL= zce+~Iw6epcvM#dVuG_(v6Gfi>K$fOp;y{m*r0@W=E*zDhw4CnIQFcJ7bGYQJ6C9?S z`J!#IpM=~yWlGxiW>h~d(*jYX1ik9uE$!mAy*`Km0xWr_o`pal+2^<0=az}F=`;2M 
zQ}4A%EA?xVD+&pLQ~t%x9jZ1oJ@(+;f6ZaB|IPmz1@!!3OVAk9fOh+=|4eIo<%nB5^QrKlJ|4O)P1`qkf4`|EAF`JG>36gV|-*{4R0Z zi_QKVs zx$t-N9LQFl!zRe8JUHIRWmVlK0;3Lkjs#q6y1c!p&3GIPC(OFO`y@=wwM9KFf%8@l zLa|yF9Zw$KcBLcZ(9WhvAd%2IUF-DqsS=WdGCCuPOXVdbrw@O3aNPHuenbMmJ!Oln z8f&FlKbl{1m4=;8TN~IfAM)RxVp4`|S&oRa^%(P57SE5~$Ot}E|GF=^5g^CGZXJpK zL`!RwlFkjX`)pR<{KUh#MEM$k`|Se-PVcxm{muk}$R2vCFr*}!J^Wt4K^jSxvIqWZJ4Urjw!mm5zP}948M9sF?I_7o2fPpwH()^g{(gNSUSqRs+ zQgO%nK7Q^{)2)huJJNMx?#c=qJ)Vq7 z_r89Y-gN?GO}cPI20-mVj-KTzW+<`m{9{mn?8v6wI=~RN8W$JdpnMmPe|jEm$4l7r z9Fy<}H5IdRP#!MZ8g9pB`2{5Dc#5Z2L&A52@2*H|=;pV*+ZHqDW|smBWhEK}@mUC1 ze-fL#fj>xu&dv^ZraaFb^Y2u)sb`dKdRWZ&YKO2Jy;39gyU~&^G~%~>OgW%38=~-p znCrPu9)ruD8rjnrlL%3wu9xmKKKkR}Zj6_enIq_VN0u#ZZT8#|=}c&wUZ<=+X5#m? zL48ujE;Q{j3V*4bcrvo7Z>ILHSvfQmQ+QEcWp|FA+_M-p63-N+_sy*6{v}bu6VA5C z(z4QKp)3wcpDA?zYTx|SK?+u1Ta*c?PUUs)6qbXzp->gVJ(6AGqV1k)>O_eo4suoY zv>228ZbbeHBJU(1u`b0vmSsJTRW!lAB%r=P2W2(S;~g9)(HByACre{*Pw{o$$5GET z(FS?@nwhgXbeas5O>3I3a-(lQi1^I2cRV0x*Bu&>WA` zJk)CiPHGzo!7ydPd)%QdyJRl@b*Afa12==68-{W`)??;qKxmp?=+>d_ubRoKtQ$%E zJh0FC*G&354vQ$s?8W)N!G-H2~X(s*al*()z@)bZ#BuRYK$X+^T z&vte69i+t5H!H@KVYqJY{n@p}pB_o?8GqK#XPDZ)c`lMWX&+hV^N;0~%lGDDhu`v3 zEWV_^nQq4ppvPLIeUjRWtg#to4c;x}bd}FuBdStG6}Q7WJ=3OXiXHL+V`01;;zN%k zsXuiE=bi54_%aDLkOc634c`O$g^2u-QG~gp#PkqvC%EhJ-^U^($SB_Sa`8vSc7c&RUoM1|WP01T8{8ubZRvg14OHST6i-Sp`+C zLvpH}i?q%J&SlLWYeYO6ioh52=6Jl0dtV!B8pLKW>`@3k!`#4yjY(g@Toc;DC;UYq-KH`0js#FU`CSJetjj2`FC6ux+D=A|Qsy*&93OCEK-$P-I zIYj+9{5){)?cTd33R6Iby6fEvxyqrQx7}ZMEbrtLz#e${2hCqcZ*yi~Np>6e9KfT# zk=DcOI0XZ7OVl_Q2m9hLJ~KOiU03p{q?d?0-8pEF~9DhDG*^=!pD&5SJ}XZX50-%ii1{cIoVPTN`qu-e)IOmP{i%hqo$I6->0eX-1x;M_`IzjZ>#p z2Gy9<*dOa<8iBJ+Lyrs3Ol-XFba6H+`hH%L4WDl9MR5g37W{{i7DfrC-e^QM5%OQN z-Em9Wy(|2gJ#`5|Q9i{iIJ|CSDV%kh1C;tY2B7g5szWYB?nk(E+3XG~76g&+9fT5~fgzrc$Kb42N?ypqr z#05l+Cn>hByCIPAN9DJHas-e$n;vmFHacnf3I>!H9|r0G{8>X&*2PaUf@BYxMni`3 zgYy}yDH3Mr7xzmQVWam@R8xzqD0?(^3D>KzA8PoIl@?$TM1GH8`c2sQOs)>sNu zaYyc{XVJy(S9&gE_N^QRukKXph=U!9qpA2KgAJTV>;xru78~u0CYL`+bM*jIM!e>+ 
zrm#7gT3kJNfChVnZpL?cMm7MWlzUemf`THU(2p)>Aiqu;G3DP8^T3~k69Ks%UOgYt8p9 z!nGDIPFR~}@+!qQQ-+%7ZG1&&4*{I{^@$yZ({ZQk8+EWBa2tWU!|eV^I%W+Ee${i# zWtKke$l3m%M?Jgwfi>3^^k^%FVCUpik3M z#%*1EYxy5OuI1<`f3o$W)53^yk!7slX&`ZJT}M7e_h*+O*>+bGn|9_6v72&%0*$Va zqtpkCKn?G@jCW_Ge6XW^s-FJ+&hn`mLBGD4ZT{Jd@_-t1vB2zU=*D;FP5Z3lmGGe4 z(>ES0UCZB>14>GI)MOrC83G0^AViL7oHHXD$dh zq7QpU`XrdeM4JZN;-;7H@A?Q@d>AReOr~-PPaFYPFpXr#%j%jH?YsqXc}I_YKkGTq zh4#AbE+4?83vhe3GeJlMIsPIL!!$_(L%i$LKRwrDTnRlgS@-9Bcg@`w9BJyOU!++ z*gtCSz2=-Qg%_JySZapjKVll^tC^D-#u{e{vq;U?qwHzCCbH()NM2Ap9$Z_I>Yk z8N(J1JFB{sWIiKw1fjc*@>SGJr7+gR4;<~3^Vb_`g{vI zYI580F%&pDw}0!$I5FVGE>c;U&)c<0VG0_L35fbqx%o(7+{z(trKw$2#+ovo(Jy6I z>-_!7t2tJQhR=qUnmr~Lo!4{rimlaNS`@w)%QeZUA&<6{d&&3Xv~KR7AlhOEJQC2r zU*7ur+d_~P>L2eL%Ei+%Y>nhD(avN;^>tJXcskbGg7*sE_`gV?;aSgpaFSdw^{1)s zhdp_BWXirSBRg&jV9` z&1CM1(Qo$PiF6J6IeSujY*|_7eAC|~Utohz0P%p?Ei0al+vA}P-RjIve4;V5JC{h8 z0MEYaDPxv9KbPzC?$(~F5@ZSwqe8w!B~kH2j*0kR-X-H;YKm+=?ZTG{HrQYxv>_Wf+;?)@JWXQ^2p@-kfM zaQP>qITl$G%})!B=Bpm}SgwsWm1{WvIf;|_KGt*$9^G2PrPD}MbYjY>H7u>ly_x_o z<1Ndo1a`WD3laCzD!NL}mE^E$3k+2HpQ8g9*U{2wZp?a2;9j}X4YJ=k1l4c5*e;Xg z0ZpOC_-N>W$vk(lHKxD{`ca`Z4bSNZ(Yy<($(XLy7sXAV-H1QjQpoI_(v+58tC=6U zA)W`3kgq?^txI}Xw(5^ns+{{-Q`pui;@oN2r8(XEb-B(6X3nWBAlh!}fO7m@=E;s2 zL~5+mGe1xMdVhE76ua4cAROL>if};vwBu=Z3LewovURi+9LA+jgvuX>#WUSMy|Ho@ z2zgM*Z))z+u*`UGKlD@@U*Y2xS;dh+^^Z>dtpsj?4s^`tcS#m%ZxkIv`gSF9iNPXA zz$FU)TvW!Xf+?=pv#{-sPgcAHGuK_R^P5FpH%N>&WoG&0@<#E0ROkCX`Tv@i3}Dei zM?RfHxUgw2J~@5&g6!=>zuyn238@tx%sFmI579!*WCwE&O2}+^?E3-PmwIQ&iVV!| z7=at_i{nE3v>ul}SOC^|mFNDl{RPgyn)iIY+`$~{A4jW(G2(?9D#T}AuXXVw3~ zeazp!h{+nql@L-D^t$!|nB4BkDsZel@Aq-LME|<;=iq6LIhBjLZtwK4E#Q;Ro6617 zD>3Y)7v9!|jT2t`Od)(l+)+p|H&F(Ie6~y%`n`EgccGf}<;~2zJ7h!If$qWr8O)o7 ztzkNG`+v&pkK--1K_It$o(+uyKO3Ex?$4%M7Sc_%2)e5QNJe^jfcf*4bvMtm_a7lL zIGfH5b5-M#4}?<|mmDFYk-#I#>B+C-S*0ocZMff_TxOej=0V7sR$Jr#>82}Wav9Bc z6Nn0Inf%BkSjzmh5RX{~kKK!Do|3=`UUR+p%DG8f$(OTB{hkkQd)zjw2%ZCdW=8^a z?f-f0aW!U2%KxHpCOTsZ7&Npglm@k`PYoTb>kH00PSk2h({)qHN2R(4&E?7#Ny(+Q 
zvlZ7`Loef|j2(+3+#42Q76g`}NuklXL|qoK8X#G zE6$gWSTYuLCiotBy@>F!5kqAX62`! zC(_K{ZMKR8q}*q#$_l!oPiv~$_Ryp^qMr&{b z<4utsM|%$2Sq)50{>+|EwP^Rlxyzuh_Oy2H=Oo~Exbd?V>Nm4|K55N=;~>}+aLbpu z@X;sjCFN>b1VUibxVA+n>blPSw`UvSqD@+i=U7(x&1tT5{kCW8ZL-73WgMTd+GCDB z>CkaLHJvQ1B8k&#gw0Y0y4h4dW6yM3RO?f%ws*-k5x4(|9+HnqzCsq#TtLb2y%Hrf z>T4-p8#q2wY*4_rHWLTbtF!!n5!i+_$2h`(vxdZX?A0uKe(rj_PH#Ca^a@l=!*JSw zo4KGY-#NS~+N6B9uZpnpBi&m2pQWuYYl~t=+hp8(&^YNPS1h$4tBp6(>Tkepitp@w zySqctBx8xO6qcP~OT9=JZ2B^mwrbKeqfd^22QK>8*@@-8L<+e~_^O;KK)a-+XXJ1O zSCit`t3cPCC@Dqp`yv%b2T};ZRF7|l%Xf@8l8^N| zxbm%w`*0q!IacR11L%a}t5tu=bv!RDyl?nLo+5>`0&NdS+Q8kV2z@(+6rvh;PA_iJ z?+_o@WkhpHTb~aCbi2u?bj|bb42#UmBM{(#z2WohH=TH7YH^EQ=po{oyFYpKo~DjF z#uBftfY+C^e>dD+$(}YoF*?huI=@=)om?qtO5IsXsy9g^axbpBl4ZU*9E66DcNPwcK0s5(>w0RMVaI| z!dX#5;+Ge%`oy$AhqNRFAxENvJ@&MW66e2dF^=4nbZ52`z`0616y?rXU&3v0K@N@K zGi~`TPyA*;`V5&B9X`DQ4iLN;G6D&LkvxshSPNzO#!P2@QybzCSw>p!{h)aWYbw-G z5{Bs{ngmajk)FYjy!_ALdpjb)sv=sqLtj=|-VTv3aX43IEqcRGpY9qZXN8_y_he-4 zm{58PDb!~Y(3$qT1zTL2Wt`si+2Cf$lC)tgz?N;o3VN@xQjd+slmSQ($i4%nwTc*T zs_oIP*d=fFK{82tzwW&0O{edF$o+U>Q}$Rlq%k`#y+cA}j_Y|4@SbvbiIG+5TdCTI zraaB+Ntq1N)<0yQ)TnD2YCdnw-_;Yg8XEg1Z-Z$Ed6Rz(#ws9+q65~sZa8N^`X2L@ zKcO899p6C)r0I{h*iQGwS=PEvS4d%HcXV$meU1_QZQCFnx+=<5ZH;p&`j{^*3s80g zwh5&lTL+0{MDktL>lkx1wZ>M59d@edp`I!=EELgYVE4V1(pF{#Uv&T$C*P>+)*fmv zvPvHR$06$wy9d=tK(y~m-VOQo50{#IUXRb^kMs-PXR(kR4=SZI{=AD&kM#{6eN)a#w5eq?O@xN=wP?`#&zV^egD5 zg_ge)e<$ay-JCNFQ9e5NwRi^JP06DrFT-1tZeBqWTQDP z+uzrIQxvzwtf=O0vi()cwY3wR5oNc^Mtu@UDrZup&zk09ydY3e$1@tYvk7cX(!bJ; zAiHCl7v586Yj-R3jm7tYeCHBOz+NW(OH=Zk=`qUXZaqV6Mo8%Sr|k$2ul7^HD|nD! 
z;bFW!?_&|RVP13lu9h0V&}Fv_{+{t*%+SoZIky7hms-`W_!`L@y71}R)Gbh zg`ne#`KJ|QSrSb~H-M82t69;k8MrWI46QM%C-_<3g zP_GmXb{cK?OEQbRQMn!K;09H+4c2EfRqMYd$!!#6&Qalh?9Foybt+6F&2`33mu+P;jo+NeZtl zoG+cX#l3};w-tq()g9v=&G$_|3Bwk;WLzpEdbb1IA{0dly>GH52PYpD0r^<96vq+Y zsL2ZJX~`3GOd(=ZH-I}`4-P_4AJxkI2Au~=@+-N4PKB(0AnxriJw07z`!H5u!GSxt z(ezbN)gdSzVZhg*stK|o*0|Ks{F8irN*Xz#F&=JW$T@NbVc{)5Bs~&RGH=68`+d&U zwG)ioFi(X#f22ut*slZ3qz*pw#=iI3zG2Tz2i46NI@u$8*4{Ey6f_=|)AZrjuEGM3 zg430E^PHq4T2!Xd9nd57u=ywf!>o~hv-S70PF1#-?}qg&`!PRgEy(s-DypTbQ*F&l za2@!=&ztq$XDzL9+r%{ye5#S2({A`9wXnK!bJ%|NZ~(sPC&Y$JTz+(6@N-|Cq*+3J zt4Cqnk>zrqR49y$!`APv_I0oi;F=F4op{>N;c=xadPhv-)i2K-LLlmw^kMc@i(I?P zm?#opwUo_?GF{)P$@skTH#-+@BDS*ESxidTJ@;HQ^?_R~TT1KS_b6o|70xuQ?dfOx z6jpd@n11Cy_3Mn&BDbY9t(Od`_bymsM`U@F`WtD}&%e=5&Kow?-5l{YtldoSFeL@& zoFl!Rd^;*mC7GkjU_955<0p@W?Us9UBku-or#o8Lknh--f`Yy~>6?O--wpk{@zJKfOI0@|)HXNkhHwjS|m&yeEn{%&+Ak4M?_m(t@z zM5a##xKjM-Vcu>x2?D>4^7>3pZc*G49m_qnDl~a6R_Uu7Xg_%vY|ltukWI;N{JzJo zo-t~98j-pG<>`DMYjqfpR!|#zQ6my_P{dtg6$KGLTa2&8nBF6Gc<-m+hC8Xy% ztN!-4f>#-~ zdx9RSx&8>Jx2^asLp+#D`Tf`k{TlvxPZ(7k+1vfSXovxc%b^3Bh`sW$d>T|jyV$l3 zk*u|Y!r9p~WWqD*LyKk!0ufPI`-xo?c*WyZMu234jZEL!sv|}FFO4er`v~-(<=6R3`eZPa-IIF?Gs=O=zB!tOOu565o9x$r{fm7#qp43>0Clzu zYxYGKUFI4<)wzLVxxWx8;~H7VG-VVie=Sq;7qCmB3t1Yn5@_weiw2H;Z4De|2Ru*u zlKwHDvu8EVYu;dZ=`Y$Mm{)ht?4+9u!m1wdc_-P{e{&hxDAQDaX*o&&c5m7SjSu@G zS&ZLXO?io%K*!kqYozv#P#?h|Dm~vCBo$%6JQjHh+iR<%ydO8q99ih!ZJa+Gsg@m^+VbamiHCk2lfCXspN=a=w?1@t*#tsK$x55Ee++-5 z!2)PJjbr~P(KL2H!=78SHMLn#E=5p0a<-3oaMYqsc`&*aZ=m-HM*zOh(xS@r4z3jc7Tn26M8QWgKGw8zOOGthb1#vTcY=`ERL%TQeM`vq1@^$0CG75r5M zPYqG2w1-Le$sZgfdjzEP7j=tgPwq+KGhZ2UJpi+{TBNjRPHG5`$pbx8ZtGJ%_?}p! zzlwd35bO3bz+VTRrX6M&>#9QWt^25qzd}$xQr_|q6Lz1J@PmDKFX4!!h7;-N>XIVi z^~EY@D_7yZvV!zSVe!+Oj|6)$bAC6j;GHUYMVR~?_jRxHzj@37?V{3ksR4RG!jFum zm-UsCt(B`W*D7l>&hhbp${~ggQ)2z! 
zuciKPb*_OQJ{DeNfwKMI6}eVC1uS2RI1*9TfAJRApTvejuDIYxcjT2GqY>4=HQi*U zC4WKK|6&#Qnj8b!&!%%E)tGscMi9!Uf;7T^ts%9c-(CtxSPAvbDgYio30~(Hc1Z*+ zHwUD=$PGF5>u?WHAys>5+MHl=_T6~}L_hS)DsxDtG-R{?eTIK$u_|CTi~ic*ziIGp@E)=24$A0Y&7f@*;W?gveG*Dd_VyKnEzc&x$^I##!O=kp$#b7N zvf0i@1wLjK@Ks3gpquwt4;~V5<@sq(+mI8cw{j9+7@*v#&4+K=sCKbb+fF;ra5c0u z2KXf9EMI=b(e*nPzD9{kr2( zr~3SyY!N4%5fPL$q>p!k(5lEL5xIKnbTWS7(t}Lx5k|s|b0wbi!q*Bi12JMsX4>10 z^|t`NkKfrX3esOZjf~t^FEP7!AS(LcwbG74$(h9l7h;C*Ktk2NU8^QqyLkq8hb!RVIdWKYF;zm~kG_o5jnogpHE&4`Csy_titLw|_IUjxE>(zmJz#Ls-S7Wd-<^`jsvG}S zEWLR()_K2^h*#6@{Y!35zjAzfUi`AS+J#yHpmxcJzkFqrBWo6Gr5yj}y0+LMbj?&P z!?X9+zj-D3T*qV8dl&b8=`(BAf0wn_tlT*5kjXh~XSS1`mbX&wr6;|u`x()-w&uXb zA8(RGKfm$+@z!teubR*o_3A4pE&RE%{KD-u0Wb4~Ila0}}lo%Kh!How|)#msx2)&!+-!;*WmgHR%pZ=)k0#4XYx0|pbOG&vRf9W5G_$}s@ ztCrobcK^n!A-0i=qqpeF`X)Q3*y`V2#T#RCnx^^fwEUVf`;GDBSl!2ui`k1_J-r>O z#`XGh|NHi@v)=I>zr43(cJZ~lOMb<@zIKc;{X|IYg6V0NGTifk+jGhfEZFU|@_xzj z-K$rgU$Mn9d^Jy;fBJTXoeq_@o8q|90oW}qS-PdM zpI{ZpS|e+*i7LfZFZaLw zvVD8s+91)a80F&;wWoG3=`Z{A5m==^bYAxC2e8Ea+jnizwPO?%sQJaOu8C zi*Iij+JkS)ORq8x-}PZmX~6Qj^Gy4$_a%jD+z;O!oLk-yawf#_nE3poT%Ud!f93mp z$FS`Bj?G%rPoH6&H;ujYU9$7OWlN_2cRJN}J67%RMf2#@?-PzZ%m%vOV1D52JE{kP zXI~g^n)J?FR$oVL8L(Pbx1SQe&qDps34^?E_jkB6Z;DV}Z=$xmRcfMYwv zT(*1r(nZm0fR*c7cAJ}04*rwvCKT|#&-yr9JamNJe)EWU_D|xj*TVbpHV>RDRb0due<;ari`(u$-4HlU zVp@{R#2~)QjFCZr58Q^Rn#yl)nYV0$FGzy2U0P@@3!?UhL3hxAR%$zFY(ZT(if6+I i-FBWq{nX*|Pu}Ud?QWF^OA3HaW$<+Mb6Mw<&;$U78jX$s diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..0753256 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,215 @@ +# gather-info Architecture + +## Overview + +`gather-info` is a static Linux binary that collects VM diagnostics into a self-contained `.tar.gz` archive. It is designed for customer and support engineer use on machines we do not control and cannot access directly. 
+ +The binary runs 11 domain collectors, performs automated triage analysis, and produces structured machine-readable output alongside human-readable summaries. + +## Execution Pipeline + +``` +main.go + | + v +cli/root.go Cobra flags, signal handling, context creation + | + v +runner/runner.go Orchestration hub + | + |-- privilege/ Detect root/sudo availability + |-- platform/ Detect distro, GPU, DCGM, WSL + |-- install/ Optional interactive DCGM installation + | + |-- collector/ 11 domain collectors (sequential, skip-aware) + | |-- system CPU, memory, disk, processes, hardware + | |-- network Links, routes, neighbors, firewall rules, netplan + | |-- nvidia nvidia-smi, dmesg Xid extraction, driver params + | |-- dcgm dcgmi discovery, health, stats, optional level-2 diag + | |-- docker docker info/ps, sanitized container inspect + | |-- services Batch D-Bus service status, failed services, fabricmanager + | |-- journal dmesg, journalctl (kernel, errors, OOM), optional full journal + | |-- packages dpkg/rpm nvidia packages, pip, held packages + | |-- additional Limits, sysctl, LVM, sensors, mounts + | |-- storage nvme, smartctl + | '-- infiniband ibstat, rdma tools + | + |-- triage/ Post-collection analysis (after collectors, before output) + | |-- xid NVIDIA Xid/SXid classification (24-code catalog) + | |-- firewall Firewall posture detection (iptables/ufw/nft/firewalld) + | '-- critical Critical log extraction (panic, HW error, fallen off bus, timeout) + | + |-- output/ Generate structured output files + | |-- metadata.json Lightweight execution summary + | |-- SUMMARY.txt Human-readable report + | |-- manifest.json Full machine-readable archive index + | |-- report.ndjson Streaming event log (1 JSON line per record) + | '-- schemas/ Embedded JSON schemas (self-describing archive) + | + |-- transfer/ Floating IP detection, SCP command generation + | + '-- archive tar.gz the work directory +``` + +## Package Map + +| Package | Purpose | Depends on | 
+|---------|---------|------------| +| `cmd/gather-info` | Binary entrypoint | cli, config | +| `internal/cli` | Cobra command, flags, signal handling | config, executor, runner, ui | +| `internal/config` | Config struct, modes, timeouts, exit codes, build metadata | stdlib only | +| `internal/runner` | Orchestration: detect → collect → triage → output → archive | all internal packages | +| `internal/collector` | Collector interface, registry, core types (Issue, Severity, ArtifactRecord) | executor, output, ui, platform, probe, sanitize | +| `internal/triage` | Post-collection analysis (Xid, firewall, critical logs) | collector (types), output (writer), ui | +| `internal/output` | Writer, manifest, report, summary, archive creation | executor, schemas | +| `internal/transfer` | IP discovery, floating IP detection, SCP commands | netlink | +| `internal/executor` | Subprocess execution: timeouts, process groups, capture limits | stdlib only | +| `internal/ui` | TTY-aware terminal output (pterm), spinners, prompts | pterm, isatty, runewidth | +| `internal/probe` | Go-native probes: systemd D-Bus, procfs, netlink, GHW | go-systemd, ghw, procfs, netlink | +| `internal/platform` | Distro, GPU, DCGM, WSL detection | executor | +| `internal/sanitize` | Redaction of secrets from configs, process lists, Docker inspect | stdlib only | +| `internal/privilege` | Root/sudo detection and interactive acquisition | stdlib only | +| `internal/install` | Optional DCGM installation and daemon enablement (Ubuntu 22.04/24.04) | config, executor, platform, ui | + +## Core Types + +### `collector.Severity` (int enum) + +``` +SeverityUnspecified = 0 // sentinel, catches uninitialized Issue{} +SeverityInfo = 1 +SeverityWarning = 2 +SeverityCritical = 3 +``` + +Explicit integer values (not iota). MarshalJSON/UnmarshalJSON serialize as strings (`"info"`, `"warning"`, `"critical"`). 
+ +### `collector.Issue` + +```go +Severity Severity +Category string // "GPU", "SVC", "MEM", "DISK", "FW", "LOG", etc. +Message string +Hidden bool // omitted from SUMMARY.txt, present in manifest/report +``` + +### `collector.ArtifactRecord` + +Every collected file has structured metadata: path, type (`command`/`file`/`probe`), command string, exit code, status (`ok`/`skipped`/`error`), timing, SHA-256, content type, parser hint, and semantic tags. + +### `collector.CollectorResult` + +Aggregated per-collector output: ID, name, issues, facts (`map[string]string`), artifacts, skipped reasons, errors, duration. + +### `triage.Finding` + +Richer than Issue: includes severity, category, title, description, recommended action, and evidence lines. Findings are converted to synthetic issues for the manifest. + +## Machine-Readable Output + +### `manifest.json` — Archive Index + +The primary machine-readable file. Contains: +- **`artifact_index[]`** — flat list of every collector-produced file with SHA-256, size, parser hint, tags +- **`collectors{}`** — per-collector summary with status, duration, facts (typed), issues, skipped reasons, errors +- **`platform{}`** — OS and kernel +- Schema version, archive ID, tool version, generation timestamp + +Control files (manifest.json itself, report.ndjson, SUMMARY.txt, metadata.json, schemas/) are excluded from `artifact_index`. + +### `report.ndjson` — Event Stream + +Same data as manifest in streaming NDJSON format. Four record types discriminated by `type`: +- `artifact` — file was collected +- `issue` — problem detected +- `fact` — key-value observation +- `collector_summary` — collector finished + +Order: per collector (registration order) → artifacts → issues → facts (alphabetical) → summary. + +Wire rules (per NDJSON spec v1.0.0): UTF-8, `\n` delimited, no internal newlines, parsers may ignore empty lines. 
+ +### `triage/_data/*.json` — Analysis Detail + +Three JSON files with rich finding detail: +- `gpu_health.json` — Xid/SXid events with code, BDF, severity, count, action, evidence +- `firewall_posture.json` — posture classification, per-tool results +- `critical_events.json` — critical log events with pattern, severity, evidence + +### `metadata.json` — Execution Summary + +Lightweight backward-compatible summary: version, flags, per-collector counts (artifacts, skipped, errors, duration). Does not duplicate manifest detail. + +### `SUMMARY.txt` — Human Report + +Text report with issues grouped by severity (CRITICAL → WARNING → INFO), system/hardware/GPU summaries, collector status table, and archive contents listing. + +## Controlled Vocabularies + +**Tags** (on artifacts): identity, cpu, memory, disk, hardware, gpu, gpu-errors, gpu-health, network, firewall, docker, docker-security, services, journal, oom, packages, storage, infiniband, processes, config, triage + +**Parser hints** (on artifacts): ~40 values identifying the tool/format that produced the content (e.g., `nvidia-smi`, `dmesg`, `systemctl`, `json`, `text`) + +**Issue categories**: GPU, SVC, MEM, DISK, FW, KERN, HW, TIMEOUT, ERR + +## Collection Modes + +`--mode=safe|quick|standard|deep` + +| Collector | safe | quick | standard | deep | +|-----------|------|-------|----------|------| +| System | run | run | run | run | +| Network | run | run | run | run | +| NVIDIA | skip | run | run | run | +| DCGM | skip | skip | run | run + level-2 diag | +| Docker | skip | skip | run | run + container logs | +| Services | run | run | run | run | +| Journal | skip | skip | run | run + full journal | +| Packages | run | skip | run | run | +| Additional | run | run | run | run | +| Storage | run | run | run | run | +| InfiniBand | run | run | run | run | + +Explicit CLI flags (`--skip-*`, `--include-*`) always override mode defaults. + +## Key Invariants + +1. 
**Path reservation** — all artifact paths globally unique via `Writer.ReservePath()` +2. **Atomic writes** — all files written via temp → rename (no partial files) +3. **Process isolation** — all subprocesses in process groups for clean cleanup +4. **Context propagation** — cancellation flows through all layers +5. **TTY awareness** — stderr for progress, stdout reserved for archive path +6. **Fail per section** — collector errors don't stop other collectors +7. **Triage timing** — runs after collection, before summary generation + +## External Dependencies + +| Library | Purpose | +|---------|---------| +| `github.com/spf13/cobra` | CLI framework | +| `github.com/pterm/pterm` | Terminal UI (spinners, styled output) | +| `github.com/coreos/go-systemd/v22/dbus` | Systemd D-Bus for batch service status | +| `github.com/jaypipes/ghw` | Hardware detection (CPU, memory, PCI) | +| `github.com/prometheus/procfs` | /proc parsing | +| `github.com/vishvananda/netlink` | Netlink route/interface queries | +| `github.com/mattn/go-isatty` | TTY detection | +| `golang.org/x/sys` | Unix syscalls (disk space, process groups) | + +## Build + +```bash +CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -trimpath \ + -ldflags="-s -w -X .../config.Version=v1.2.0 -X .../config.Commit=$(git rev-parse --short HEAD) -X .../config.BuildDate=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + -o gather-info ./cmd/gather-info +``` + +Static binary, no CGO, ~15MB. Runs on Ubuntu 20.04/22.04/24.04 with no external dependencies. 
+ +## Exit Codes + +| Code | Meaning | +|------|---------| +| 0 | Archive created, no errors | +| 1 | Fatal error, no archive produced | +| 2 | Archive created, some collectors had errors | +| 3 | Interrupted (SIGINT/SIGTERM), partial work directory preserved | From 23959222ff3a63ad77c7474b9e15208259e18f18 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Wed, 8 Apr 2026 13:00:53 +0200 Subject: [PATCH 02/23] gather-info: fix banner and info box rendering in narrow terminals Add 3-tier adaptive banner: full BigText (84+ cols), stacked BigText (43-83 cols), compact styled text (<43 cols). Increase two-column info box threshold so it collapses to single-column at moderate widths instead of rendering cramped values. --- .../vm-troubleshooting/internal/ui/pterm.go | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/customers/vm-troubleshooting/internal/ui/pterm.go b/customers/vm-troubleshooting/internal/ui/pterm.go index 3f54570..7640edd 100644 --- a/customers/vm-troubleshooting/internal/ui/pterm.go +++ b/customers/vm-troubleshooting/internal/ui/pterm.go @@ -15,7 +15,11 @@ const ( boxHorizontalPadding = 1 boxColumnGap = 4 minBoxContentWidth = 40 - minValueColumnWidth = 14 + minValueColumnWidth = 20 + + // BigText visible widths (measured from pterm default font). 
+ bigTextFullWidth = 82 // "Hyperstack" on one line + bigTextWordWidth = 41 // "Hyper" or "stack" alone ) type boxLine struct { @@ -50,10 +54,30 @@ func (u *PtermUI) Banner() { fmt.Fprintln(os.Stderr, "=== Hyperstack - VM Diagnostics ===") return } - pterm.DefaultBigText.WithLetters( - pterm.NewLettersFromStringWithStyle("Hyper", pterm.NewStyle(pterm.FgMagenta)), - pterm.NewLettersFromStringWithStyle("stack", pterm.NewStyle(pterm.FgLightMagenta)), - ).Render() + width := pterm.GetTerminalWidth() + switch { + case width >= bigTextFullWidth+2: + // Wide: single-line BigText + pterm.DefaultBigText.WithLetters( + pterm.NewLettersFromStringWithStyle("Hyper", pterm.NewStyle(pterm.FgMagenta)), + pterm.NewLettersFromStringWithStyle("stack", pterm.NewStyle(pterm.FgLightMagenta)), + ).Render() + case width >= bigTextWordWidth+2: + // Medium: stacked BigText — each word on its own line + pterm.DefaultBigText.WithLetters( + pterm.NewLettersFromStringWithStyle("Hyper", pterm.NewStyle(pterm.FgMagenta)), + ).Render() + pterm.DefaultBigText.WithLetters( + pterm.NewLettersFromStringWithStyle("stack", pterm.NewStyle(pterm.FgLightMagenta)), + ).Render() + default: + // Narrow: compact styled text + fmt.Fprintf(os.Stderr, "\n %s%s %s\n", + pterm.NewStyle(pterm.FgMagenta, pterm.Bold).Sprint("Hyper"), + pterm.NewStyle(pterm.FgLightMagenta, pterm.Bold).Sprint("stack"), + pterm.NewStyle(pterm.FgLightWhite).Sprint("– VM Diagnostics"), + ) + } fmt.Fprintln(os.Stderr) } From 2e5762a404d7843f2660b4eb07a95106e4ac31e3 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Wed, 8 Apr 2026 13:37:45 +0200 Subject: [PATCH 03/23] gather-info: implement phase 2 stable identity end-to-end Add required issue/finding codes, confidence, and stable fingerprints across collector, triage, runner, manifest, report, schemas, and tests to provide machine-stable identity and recurrence tracking. 
Made-with: Cursor --- customers/vm-troubleshooting/AGENTS.md | 2 +- customers/vm-troubleshooting/CODEMAP.md | 2 +- .../internal/collector/collector.go | 61 ++++++++++-- .../internal/collector/collector_test.go | 36 ++++++- .../internal/collector/journal.go | 10 +- .../internal/collector/services.go | 37 ++++++- .../internal/collector/system.go | 22 ++++- .../internal/identity/fingerprint.go | 30 ++++++ .../internal/identity/fingerprint_test.go | 45 +++++++++ .../output/archive_consistency_test.go | 2 +- .../internal/output/contract_test.go | 88 +++++++++++++++++ .../internal/output/manifest.go | 36 ++++--- .../internal/output/manifest_test.go | 18 +++- .../internal/output/report.go | 34 ++++--- .../internal/output/report_test.go | 24 ++++- .../internal/output/summary.go | 10 +- .../internal/output/summary_test.go | 42 ++++++++ .../internal/runner/runner.go | 97 +++++++++++++------ .../internal/runner/runner_test.go | 57 +++++++++++ .../internal/triage/critical.go | 70 ++++++++----- .../internal/triage/critical_test.go | 17 +++- .../internal/triage/firewall.go | 34 ++++++- .../internal/triage/firewall_analyze_test.go | 47 +++++++++ .../internal/triage/integration_test.go | 51 +++++++--- .../internal/triage/triage.go | 36 +++++-- .../vm-troubleshooting/internal/triage/xid.go | 27 ++++-- .../internal/triage/xid_analyze_test.go | 21 ++++ .../schemas/manifest.schema.json | 22 ++++- .../schemas/report-record.schema.json | 22 ++++- .../schemas/triage-result.schema.json | 17 +++- 30 files changed, 869 insertions(+), 148 deletions(-) create mode 100644 customers/vm-troubleshooting/internal/identity/fingerprint.go create mode 100644 customers/vm-troubleshooting/internal/identity/fingerprint_test.go create mode 100644 customers/vm-troubleshooting/internal/output/summary_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/firewall_analyze_test.go diff --git a/customers/vm-troubleshooting/AGENTS.md b/customers/vm-troubleshooting/AGENTS.md index 
a9473e9..4305c5a 100644 --- a/customers/vm-troubleshooting/AGENTS.md +++ b/customers/vm-troubleshooting/AGENTS.md @@ -58,7 +58,7 @@ The archive contains three complementary output files for different consumers: Rules for the structured layer: - Schema files live in `schemas/` in-repo and are included in every archive. -- `schema_version` follows semver: minor adds fields, major changes types. Current: `2.0.0`. +- `schema_version` follows semver: minor adds fields, major changes types. Current: `3.0.0`. - Facts with integer keys (`cpu_cores`, `gpu_count`, `memory_total`, `oom_event_count`, `xid_classified_count`, `critical_event_count`, `container_count`, `vllm_container_count`, `failed_service_count`) are typed as integers in manifest/report/triage. `"unavailable"` maps to `null`. - All other facts remain strings. diff --git a/customers/vm-troubleshooting/CODEMAP.md b/customers/vm-troubleshooting/CODEMAP.md index 8e68ac3..535bb5c 100644 --- a/customers/vm-troubleshooting/CODEMAP.md +++ b/customers/vm-troubleshooting/CODEMAP.md @@ -101,7 +101,7 @@ Keep this file updated in the same change as architecture or collector changes. ### `internal/triage/` - Owns post-collection analysis: Xid classification, firewall posture detection, critical log extraction. - Runs after collectors, before summary/manifest generation. Findings injected as a synthetic "triage" collector result. -- Output: `triage/*.txt` (human) + `triage/_data/*.json` (machine). Hidden findings omitted from SUMMARY.txt but present in manifest/report. +- Output: `triage/*.txt` (human) + `triage/_data/*.json` (machine). Only `confidence=high` issues appear in `SUMMARY.txt`; both `high` and `low` are preserved in manifest/report. - Xid catalog: 24 datacenter-relevant codes from NVIDIA r590, with Xid 154 dynamic severity from recovery action text. 
### `internal/collector/` diff --git a/customers/vm-troubleshooting/internal/collector/collector.go b/customers/vm-troubleshooting/internal/collector/collector.go index 2a3855f..a64c54c 100644 --- a/customers/vm-troubleshooting/internal/collector/collector.go +++ b/customers/vm-troubleshooting/internal/collector/collector.go @@ -7,6 +7,7 @@ import ( "strings" "time" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" "github.com/NexGenCloud/vm-diagnostics/internal/ui" ) @@ -218,15 +219,63 @@ func (r *CollectorResult) SetFact(key, value string) { r.Facts[key] = value } } -func (r *CollectorResult) AddIssue(sev Severity, cat, msg string) { - r.Issues = append(r.Issues, Issue{Severity: sev, Category: cat, Message: msg}) +func (r *CollectorResult) AddIssue(code IssueCode, sev Severity, confidence Confidence, cat, msg string, fingerprintParts ...string) { + issue := Issue{ + Code: code, + Severity: sev, + Confidence: confidence, + Category: cat, + Message: msg, + } + if len(fingerprintParts) > 0 { + issue.Fingerprint = identity.Fingerprint(fingerprintParts...) + } + r.Issues = append(r.Issues, issue) +} + +type IssueCode string + +const ( + IssueOOMEvents IssueCode = "oom_events" + IssueDiskWarning IssueCode = "disk_warning" + IssueDiskCritical IssueCode = "disk_critical" + IssueSvcFailed IssueCode = "svc_failed" + IssueSvcFabricmanagerBenign IssueCode = "svc_fabricmanager_benign" +) + +// CollectorIssueCodes enumerates collector-owned issue codes. 
+var CollectorIssueCodes = map[string]bool{ + string(IssueOOMEvents): true, + string(IssueDiskWarning): true, + string(IssueDiskCritical): true, + string(IssueSvcFailed): true, + string(IssueSvcFabricmanagerBenign): true, +} + +type Confidence string + +const ( + ConfidenceHigh Confidence = "high" + ConfidenceLow Confidence = "low" +) + +func (c Confidence) String() string { + return string(c) +} + +func (c Confidence) Valid() bool { + return c == ConfidenceHigh || c == ConfidenceLow } type Issue struct { - Severity Severity - Category string - Message string - Hidden bool // default false = shown in SUMMARY.txt + Code IssueCode + Severity Severity + Confidence Confidence + Category string + Message string + Fingerprint string + RelatedArtifactPaths []string + UnresolvedArtifactPaths []string // populated when structured inputs land (Phase 3) } type Severity int diff --git a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index cc1257f..313d9cb 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -11,6 +11,7 @@ import ( "testing" "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" "github.com/NexGenCloud/vm-diagnostics/internal/output" "github.com/NexGenCloud/vm-diagnostics/internal/ui" ) @@ -258,11 +259,40 @@ func TestSeverityZeroValue(t *testing.T) { } } -func TestIssueHiddenDefault(t *testing.T) { +func TestIssueConfidenceDefault(t *testing.T) { t.Parallel() var issue Issue - if issue.Hidden { - t.Error("zero-value Issue should not be hidden") + if issue.Confidence.Valid() { + t.Error("zero-value Issue should have invalid confidence") + } +} + +func TestAddIssueComputesFingerprint(t *testing.T) { + t.Parallel() + + r := NewResult() + r.AddIssue( + IssueOOMEvents, + SeverityCritical, + ConfidenceHigh, + "MEM", + "1 
OOM killer event(s)", + "journal", + string(IssueOOMEvents), + ) + if len(r.Issues) != 1 { + t.Fatalf("expected 1 issue, got %d", len(r.Issues)) + } + issue := r.Issues[0] + if issue.Code != IssueOOMEvents { + t.Fatalf("expected code %q, got %q", IssueOOMEvents, issue.Code) + } + if issue.Confidence != ConfidenceHigh { + t.Fatalf("expected confidence %q, got %q", ConfidenceHigh, issue.Confidence) + } + want := identity.Fingerprint("journal", string(IssueOOMEvents)) + if issue.Fingerprint != want { + t.Fatalf("unexpected fingerprint: got %q want %q", issue.Fingerprint, want) } } diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index 57d5ac9..351c70e 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -97,7 +97,15 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan incomplete") } else if oomCount > 0 { r.SetFact("oom_event_count", fmt.Sprintf("%d", oomCount)) - r.AddIssue(SeverityCritical, "MEM", fmt.Sprintf("%d OOM killer event(s)", oomCount)) + r.AddIssue( + IssueOOMEvents, + SeverityCritical, + ConfidenceHigh, + "MEM", + fmt.Sprintf("%d OOM killer event(s)", oomCount), + "journal", + string(IssueOOMEvents), + ) c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, strings.Join(oom, "\n")+"\n", "", "journalctl", []string{"oom"}) } else { r.SetFact("oom_event_count", "0") diff --git a/customers/vm-troubleshooting/internal/collector/services.go b/customers/vm-troubleshooting/internal/collector/services.go index 15c605d..143a42d 100644 --- a/customers/vm-troubleshooting/internal/collector/services.go +++ b/customers/vm-troubleshooting/internal/collector/services.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "slices" "strings" 
"github.com/NexGenCloud/vm-diagnostics/internal/config" @@ -221,8 +222,15 @@ func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context } if fmIdx >= 0 && c.isFabricManagerBenign(ctx) { - r.AddIssue(SeverityWarning, "SVC", - "nvidia-fabricmanager inactive (expected: no NVSwitch/SXM detected)") + r.AddIssue( + IssueSvcFabricmanagerBenign, + SeverityWarning, + ConfidenceHigh, + "SVC", + "nvidia-fabricmanager inactive (expected: no NVSwitch/SXM detected)", + "svc", + string(IssueSvcFabricmanagerBenign), + ) // Remove from failed list for accurate counting failedNames = append(failedNames[:fmIdx], failedNames[fmIdx+1:]...) } @@ -231,8 +239,29 @@ func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context r.SetFact("failed_service_count", fmt.Sprintf("%d", len(failedNames))) if len(failedNames) > 0 { - r.AddIssue(SeverityCritical, "SVC", - fmt.Sprintf("%d failed systemd service(s)", len(failedNames))) + deduped := make([]string, 0, len(failedNames)) + seen := make(map[string]struct{}, len(failedNames)) + for _, name := range failedNames { + name = strings.TrimSpace(name) + if name == "" { + continue + } + if _, ok := seen[name]; ok { + continue + } + seen[name] = struct{}{} + deduped = append(deduped, name) + } + slices.Sort(deduped) + fingerprintParts := append([]string{"svc", string(IssueSvcFailed)}, deduped...) 
+ r.AddIssue( + IssueSvcFailed, + SeverityCritical, + ConfidenceHigh, + "SVC", + fmt.Sprintf("%d failed systemd service(s)", len(failedNames)), + fingerprintParts..., + ) } } diff --git a/customers/vm-troubleshooting/internal/collector/system.go b/customers/vm-troubleshooting/internal/collector/system.go index 2e3b5f0..7f05aab 100644 --- a/customers/vm-troubleshooting/internal/collector/system.go +++ b/customers/vm-troubleshooting/internal/collector/system.go @@ -66,9 +66,27 @@ func (c *SystemCollector) Collect(ctx context.Context) (*CollectorResult, error) continue } if d.UsedPct >= 95 { - r.AddIssue(SeverityCritical, "DISK", fmt.Sprintf("%s at %.0f%% capacity", d.Mountpoint, d.UsedPct)) + r.AddIssue( + IssueDiskCritical, + SeverityCritical, + ConfidenceHigh, + "DISK", + fmt.Sprintf("%s at %.0f%% capacity", d.Mountpoint, d.UsedPct), + "sys", + string(IssueDiskCritical), + d.Mountpoint, + ) } else if d.UsedPct >= 85 { - r.AddIssue(SeverityWarning, "DISK", fmt.Sprintf("%s at %.0f%% capacity", d.Mountpoint, d.UsedPct)) + r.AddIssue( + IssueDiskWarning, + SeverityWarning, + ConfidenceHigh, + "DISK", + fmt.Sprintf("%s at %.0f%% capacity", d.Mountpoint, d.UsedPct), + "sys", + string(IssueDiskWarning), + d.Mountpoint, + ) } } c.saveProcess(ctx, r, "processes/ps_aux.txt", executor.CommandSpec{Name: "ps", Args: []string{"aux", "--sort=-%mem"}, Timeout: config.TimeoutMedium}) diff --git a/customers/vm-troubleshooting/internal/identity/fingerprint.go b/customers/vm-troubleshooting/internal/identity/fingerprint.go new file mode 100644 index 0000000..8a215bd --- /dev/null +++ b/customers/vm-troubleshooting/internal/identity/fingerprint.go @@ -0,0 +1,30 @@ +package identity + +import ( + "crypto/sha256" + "encoding/hex" + "strings" +) + +const emptyPartSentinel = "_" + +// Fingerprint computes a stable 128-bit fingerprint from an ordered tuple. +// Caller must pre-sort/deduplicate any set-valued inputs before passing them. 
+func Fingerprint(parts ...string) string { + if len(parts) == 0 { + return "" + } + + normalized := make([]string, len(parts)) + for i, part := range parts { + value := strings.ToLower(strings.TrimSpace(part)) + if value == "" { + value = emptyPartSentinel + } + normalized[i] = value + } + + joined := strings.Join(normalized, "\x00") + sum := sha256.Sum256([]byte(joined)) + return hex.EncodeToString(sum[:16]) +} diff --git a/customers/vm-troubleshooting/internal/identity/fingerprint_test.go b/customers/vm-troubleshooting/internal/identity/fingerprint_test.go new file mode 100644 index 0000000..6943a3f --- /dev/null +++ b/customers/vm-troubleshooting/internal/identity/fingerprint_test.go @@ -0,0 +1,45 @@ +package identity + +import ( + "regexp" + "testing" +) + +func TestFingerprint_EmptyInput(t *testing.T) { + t.Parallel() + if got := Fingerprint(); got != "" { + t.Fatalf("Fingerprint() = %q, want empty string", got) + } +} + +func TestFingerprint_NormalizesAndUsesSentinel(t *testing.T) { + t.Parallel() + + a := Fingerprint(" XID ", "79", "", "3B:00") + b := Fingerprint("xid", "79", "_", "3b:00") + if a != b { + t.Fatalf("expected normalized fingerprints to match: %q vs %q", a, b) + } +} + +func TestFingerprint_OrderSensitive(t *testing.T) { + t.Parallel() + + a := Fingerprint("svc", "svc_failed", "a.service", "b.service") + b := Fingerprint("svc", "svc_failed", "b.service", "a.service") + if a == b { + t.Fatal("expected different fingerprints when tuple order differs") + } +} + +func TestFingerprint_Format(t *testing.T) { + t.Parallel() + + got := Fingerprint("fw", "restrictive", "ufw") + if len(got) != 32 { + t.Fatalf("expected 32 hex chars, got %d (%q)", len(got), got) + } + if !regexp.MustCompile(`^[0-9a-f]{32}$`).MatchString(got) { + t.Fatalf("fingerprint %q is not lowercase 32-char hex", got) + } +} diff --git a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go 
index 2c41862..957ddfa 100644 --- a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go +++ b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go @@ -21,7 +21,7 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "2.0.0", + SchemaVersion: "3.0.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", diff --git a/customers/vm-troubleshooting/internal/output/contract_test.go b/customers/vm-troubleshooting/internal/output/contract_test.go index 420146b..ebf5fbc 100644 --- a/customers/vm-troubleshooting/internal/output/contract_test.go +++ b/customers/vm-troubleshooting/internal/output/contract_test.go @@ -6,6 +6,7 @@ import ( "github.com/NexGenCloud/vm-diagnostics/internal/collector" "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/triage" schemaassets "github.com/NexGenCloud/vm-diagnostics/schemas" ) @@ -254,6 +255,44 @@ func TestSchemaCollectorStatusEnumMatchesCode(t *testing.T) { checkEnumMatchesMap(t, "manifest collector status", schemaStatuses, goStatuses) } +func TestSchemaIssueCodeEnumsMatchGoConstants(t *testing.T) { + t.Parallel() + + expected := make(map[string]bool, len(collector.CollectorIssueCodes)+len(triage.FindingCodes)) + for code := range collector.CollectorIssueCodes { + expected[code] = true + } + for code := range triage.FindingCodes { + expected[code] = true + } + + manifestCodes := extractSchemaEnumAtPath(t, "manifest.schema.json", "$defs", "issue", "properties", "code", "enum") + checkEnumMatchesMap(t, "manifest issue code", manifestCodes, expected) + + reportCodes := extractReportIssueEnumAtField(t, "code") + checkEnumMatchesMap(t, "report issue code", reportCodes, expected) + + triageCodes := extractSchemaEnumAtPath(t, "triage-result.schema.json", "$defs", "finding", "properties", "code", "enum") + checkEnumMatchesMap(t, "triage finding 
code", triageCodes, triage.FindingCodes) +} + +func TestSchemaConfidenceEnumsMatchGoConstants(t *testing.T) { + t.Parallel() + expected := map[string]bool{ + string(collector.ConfidenceHigh): true, + string(collector.ConfidenceLow): true, + } + + manifestConfidence := extractSchemaEnumAtPath(t, "manifest.schema.json", "$defs", "issue", "properties", "confidence", "enum") + checkEnumMatchesMap(t, "manifest issue confidence", manifestConfidence, expected) + + reportConfidence := extractReportIssueEnumAtField(t, "confidence") + checkEnumMatchesMap(t, "report issue confidence", reportConfidence, expected) + + triageConfidence := extractSchemaEnumAtPath(t, "triage-result.schema.json", "$defs", "finding", "properties", "confidence", "enum") + checkEnumMatchesMap(t, "triage finding confidence", triageConfidence, expected) +} + // TestTriageResultSchemaIsLoadable verifies the triage result schema exists and is valid JSON. func TestTriageResultSchemaIsLoadable(t *testing.T) { t.Parallel() @@ -332,3 +371,52 @@ func extractSchemaEnumAtPath(t *testing.T, schemaFile string, keys ...string) [] t.Fatalf("schema %s: path did not end with 'enum'", schemaFile) return nil } + +func extractReportIssueEnumAtField(t *testing.T, field string) []string { + t.Helper() + data, err := schemaassets.FS.ReadFile("report-record.schema.json") + if err != nil { + t.Fatalf("reading report schema: %v", err) + } + var doc map[string]any + if err := json.Unmarshal(data, &doc); err != nil { + t.Fatalf("parsing report schema: %v", err) + } + oneOf, ok := doc["oneOf"].([]any) + if !ok { + t.Fatal("report schema: missing oneOf") + } + for i, branchRaw := range oneOf { + branch, ok := branchRaw.(map[string]any) + if !ok { + continue + } + props, ok := branch["properties"].(map[string]any) + if !ok { + continue + } + typ, ok := props["type"].(map[string]any) + if !ok || typ["const"] != "issue" { + continue + } + fieldObj, ok := props[field].(map[string]any) + if !ok { + t.Fatalf("report schema: issue branch 
oneOf[%d] missing field %q", i, field) + } + enumRaw, ok := fieldObj["enum"].([]any) + if !ok { + t.Fatalf("report schema: issue branch oneOf[%d] field %q missing enum", i, field) + } + result := make([]string, len(enumRaw)) + for j, v := range enumRaw { + s, ok := v.(string) + if !ok { + t.Fatalf("report schema: issue enum %q[%d] not string", field, j) + } + result[j] = s + } + return result + } + t.Fatal("report schema: issue branch not found") + return nil +} diff --git a/customers/vm-troubleshooting/internal/output/manifest.go b/customers/vm-troubleshooting/internal/output/manifest.go index 30fe729..98747f4 100644 --- a/customers/vm-troubleshooting/internal/output/manifest.go +++ b/customers/vm-troubleshooting/internal/output/manifest.go @@ -50,10 +50,14 @@ type ManifestArtifact struct { // ManifestIssueMeta is the JSON representation of an issue. type ManifestIssueMeta struct { - Severity string `json:"severity"` - Category string `json:"category"` - Message string `json:"message"` - Hidden bool `json:"hidden,omitempty"` + Code string `json:"code"` + Severity string `json:"severity"` + Confidence string `json:"confidence"` + Category string `json:"category"` + Message string `json:"message"` + IssueFingerprint string `json:"issue_fingerprint,omitempty"` + RelatedArtifactPaths []string `json:"related_artifact_paths,omitempty"` + UnresolvedArtifactPaths []string `json:"unresolved_artifact_paths,omitempty"` } // ManifestSkipReason is the JSON representation of a skip reason. 
@@ -222,10 +226,14 @@ func WriteManifestFromResults(w *Writer, meta ManifestMeta, collectorIDs []strin } for _, issue := range ri.Issues { mc.Issues = append(mc.Issues, ManifestIssueMeta{ - Severity: issue.Severity, - Category: issue.Category, - Message: issue.Message, - Hidden: issue.Hidden, + Code: issue.Code, + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Message: issue.Message, + IssueFingerprint: issue.IssueFingerprint, + RelatedArtifactPaths: issue.RelatedArtifactPaths, + UnresolvedArtifactPaths: issue.UnresolvedArtifactPaths, }) } for _, s := range ri.Skipped { @@ -330,10 +338,14 @@ type ManifestArtifactInput struct { } type ManifestIssueInput struct { - Severity string - Category string - Message string - Hidden bool + Code string + Severity string + Confidence string + Category string + Message string + IssueFingerprint string + RelatedArtifactPaths []string + UnresolvedArtifactPaths []string } // BuildManifestInput converts from the exported types used in the runner. 
diff --git a/customers/vm-troubleshooting/internal/output/manifest_test.go b/customers/vm-troubleshooting/internal/output/manifest_test.go index 34cf74a..e697de6 100644 --- a/customers/vm-troubleshooting/internal/output/manifest_test.go +++ b/customers/vm-troubleshooting/internal/output/manifest_test.go @@ -37,7 +37,7 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "2.0.0", + SchemaVersion: "3.0.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", @@ -59,7 +59,13 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { Tags: []string{"cpu", "hardware"}, }}, map[string]string{"cpu_cores": "16", "oom_event_count": "unavailable"}, - []ManifestIssueInput{{Severity: "warning", Category: "TEST", Message: "example"}}, + []ManifestIssueInput{{ + Code: "disk_warning", + Severity: "warning", + Confidence: "high", + Category: "TEST", + Message: "example", + }}, nil, nil, ), @@ -105,4 +111,12 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { if facts["oom_event_count"] != nil { t.Fatalf("expected oom_event_count to map to null, got %#v", facts["oom_event_count"]) } + issues := manifest["collectors"].(map[string]any)["system"].(map[string]any)["issues"].([]any) + if len(issues) != 1 { + t.Fatalf("expected one issue, got %d", len(issues)) + } + issue := issues[0].(map[string]any) + if issue["code"] != "disk_warning" || issue["confidence"] != "high" { + t.Fatalf("unexpected issue identity fields: %#v", issue) + } } diff --git a/customers/vm-troubleshooting/internal/output/report.go b/customers/vm-troubleshooting/internal/output/report.go index 6c34377..82e0eda 100644 --- a/customers/vm-troubleshooting/internal/output/report.go +++ b/customers/vm-troubleshooting/internal/output/report.go @@ -26,10 +26,14 @@ type ReportRecord struct { // can explicitly emit zero values, while non-artifact record types omit them. 
// type=issue fields - Severity string `json:"severity,omitempty"` - Category string `json:"category,omitempty"` - Message string `json:"message,omitempty"` - Hidden bool `json:"hidden,omitempty"` + Code string `json:"code,omitempty"` + Severity string `json:"severity,omitempty"` + Confidence string `json:"confidence,omitempty"` + Category string `json:"category,omitempty"` + Message string `json:"message,omitempty"` + IssueFingerprint string `json:"issue_fingerprint,omitempty"` + RelatedArtifactPaths []string `json:"related_artifact_paths,omitempty"` + UnresolvedArtifactPaths []string `json:"unresolved_artifact_paths,omitempty"` // type=fact fields Key string `json:"key,omitempty"` @@ -39,7 +43,7 @@ type ReportRecord struct { ArtifactCount int `json:"artifact_count,omitempty"` } -const reportSchemaVersion = "2.0.0" +const reportSchemaVersion = "3.0.0" // WriteReport writes report.ndjson from manifest input data. // Order is deterministic: per collector (registration order) → artifacts → issues → facts (sorted) → summary. 
@@ -71,14 +75,18 @@ func WriteReport(w *Writer, collectorIDs []string, resultsMap map[string]Manifes for _, issue := range ri.Issues { if err := enc.Encode(ReportRecord{ - SchemaVersion: reportSchemaVersion, - Type: "issue", - Timestamp: ts, - Collector: cid, - Severity: issue.Severity, - Category: issue.Category, - Message: issue.Message, - Hidden: issue.Hidden, + SchemaVersion: reportSchemaVersion, + Type: "issue", + Timestamp: ts, + Collector: cid, + Code: issue.Code, + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Message: issue.Message, + IssueFingerprint: issue.IssueFingerprint, + RelatedArtifactPaths: issue.RelatedArtifactPaths, + UnresolvedArtifactPaths: issue.UnresolvedArtifactPaths, }); err != nil { return err } diff --git a/customers/vm-troubleshooting/internal/output/report_test.go b/customers/vm-troubleshooting/internal/output/report_test.go index 736c2e9..23df6ef 100644 --- a/customers/vm-troubleshooting/internal/output/report_test.go +++ b/customers/vm-troubleshooting/internal/output/report_test.go @@ -22,7 +22,13 @@ func TestWriteReportProducesValidNDJSONInDeterministicOrder(t *testing.T) { 10, []ManifestArtifactInput{{Path: "system/hostname.txt", Type: "command", Command: "hostname -f", Status: "ok", ParserHint: "hostname", Tags: []string{"identity"}}}, map[string]string{"cpu_cores": "16", "hostname": "node-1"}, - []ManifestIssueInput{{Severity: "warning", Category: "TEST", Message: "warn"}}, + []ManifestIssueInput{{ + Code: "disk_warning", + Severity: "warning", + Confidence: "high", + Category: "TEST", + Message: "warn", + }}, nil, nil, ), @@ -87,6 +93,22 @@ func TestWriteReportProducesValidNDJSONInDeterministicOrder(t *testing.T) { if _, ok := records[0]["duration_ms"]; !ok { t.Fatalf("artifact record should include explicit duration_ms, got %#v", records[0]) } + foundIssue := false + for _, rec := range records { + if rec["type"] != "issue" { + continue + } + foundIssue = true + if rec["code"] != 
"disk_warning" { + t.Fatalf("issue record code mismatch: %#v", rec) + } + if rec["confidence"] != "high" { + t.Fatalf("issue record confidence mismatch: %#v", rec) + } + } + if !foundIssue { + t.Fatal("expected at least one issue record") + } last := records[len(records)-1] if last["collector"] != "network" || last["type"] != "collector_summary" { t.Fatalf("unexpected last record ordering: %#v", last) diff --git a/customers/vm-troubleshooting/internal/output/summary.go b/customers/vm-troubleshooting/internal/output/summary.go index 66d4617..2cf7c46 100644 --- a/customers/vm-troubleshooting/internal/output/summary.go +++ b/customers/vm-troubleshooting/internal/output/summary.go @@ -14,10 +14,10 @@ import ( // Severity is a string ("info", "warning", "critical") — the runner converts // from collector.Severity via String() to avoid an import cycle. type SummaryIssue struct { - Severity string // "info", "warning", "critical" - Category string - Message string - Hidden bool + Severity string // "info", "warning", "critical" + Confidence string // "high", "low" + Category string + Message string } type SummaryResult struct { @@ -68,7 +68,7 @@ func WriteSummary(w *Writer, hostname, version string, results []SummaryResult) var critical, warning, info []string for _, r := range results { for _, issue := range r.Issues { - if issue.Hidden { + if issue.Confidence != "high" { continue } line := fmt.Sprintf("[%s] %s", issue.Category, issue.Message) diff --git a/customers/vm-troubleshooting/internal/output/summary_test.go b/customers/vm-troubleshooting/internal/output/summary_test.go new file mode 100644 index 0000000..68ac87c --- /dev/null +++ b/customers/vm-troubleshooting/internal/output/summary_test.go @@ -0,0 +1,42 @@ +package output + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestWriteSummary_ShowsOnlyHighConfidenceIssues(t *testing.T) { + t.Parallel() + + root := t.TempDir() + w := NewWriter(root) + results := []SummaryResult{ + { + Name: 
"triage", + Issues: []SummaryIssue{ + {Severity: "critical", Confidence: "high", Category: "GPU", Message: "High confidence issue"}, + {Severity: "warning", Confidence: "low", Category: "ERR", Message: "Low confidence issue"}, + }, + Facts: map[string]string{ + "hostname": "node-1", + }, + }, + } + if err := WriteSummary(w, "node-1", "dev", results); err != nil { + t.Fatalf("WriteSummary failed: %v", err) + } + + data, err := os.ReadFile(filepath.Join(root, "SUMMARY.txt")) + if err != nil { + t.Fatalf("reading SUMMARY.txt: %v", err) + } + text := string(data) + if !strings.Contains(text, "High confidence issue") { + t.Fatal("expected high-confidence issue in SUMMARY.txt") + } + if strings.Contains(text, "Low confidence issue") { + t.Fatal("did not expect low-confidence issue in SUMMARY.txt") + } +} diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index 6f12c8c..86a078e 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -158,16 +158,12 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { triageResult.Name = "Triage Analysis" for _, tr := range triageResults { for _, f := range tr.Findings { - if !f.Severity.Valid() { - r.UI.Warn(fmt.Sprintf("triage: skipping finding with invalid severity: %s", f.Title)) + issue, convErr := triageFindingToIssue(f) + if convErr != nil { + r.UI.Warn(fmt.Sprintf("triage: skipping finding %q: %v", f.Title, convErr)) continue } - triageResult.Issues = append(triageResult.Issues, collector.Issue{ - Severity: f.Severity, - Category: f.Category, - Message: f.Title + ": " + f.Description, - Hidden: f.Hidden, - }) + triageResult.Issues = append(triageResult.Issues, issue) } for k, v := range tr.Facts { triageResult.SetFact(k, v) @@ -211,6 +207,12 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { }, Collectors: map[string]output.CollectorMeta{}, } + // 
Validate issues once per collector; both summary and manifest use the filtered set. + validIssues := make(map[string][]collector.Issue, len(results)) + for _, res := range results { + validIssues[res.ID] = validateIssues(r.UI, res.ID, res.Issues) + } + partial := false for _, res := range results { status := collectorStatus(res) @@ -230,16 +232,12 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { for _, e := range res.Errors { sr.ErrorMessages = append(sr.ErrorMessages, e.Message) } - for _, issue := range res.Issues { - if !issue.Severity.Valid() { - r.UI.Warn(fmt.Sprintf("collector %s: issue with invalid severity skipped: %s", res.ID, issue.Message)) - continue - } + for _, issue := range validIssues[res.ID] { sr.Issues = append(sr.Issues, output.SummaryIssue{ - Severity: issue.Severity.String(), - Category: issue.Category, - Message: issue.Message, - Hidden: issue.Hidden, + Severity: issue.Severity.String(), + Confidence: issue.Confidence.String(), + Category: issue.Category, + Message: issue.Message, }) } summaryResults = append(summaryResults, sr) @@ -282,16 +280,18 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { DurationMS: a.Duration.Milliseconds(), ParserHint: a.ParserHint, Tags: a.Tags, }) } - issues := make([]output.ManifestIssueInput, 0, len(res.Issues)) - for _, issue := range res.Issues { - if !issue.Severity.Valid() { - continue // already warned above - } + filtered := validIssues[res.ID] + issues := make([]output.ManifestIssueInput, 0, len(filtered)) + for _, issue := range filtered { issues = append(issues, output.ManifestIssueInput{ - Severity: issue.Severity.String(), - Category: issue.Category, - Message: issue.Message, - Hidden: issue.Hidden, + Code: string(issue.Code), + Severity: issue.Severity.String(), + Confidence: issue.Confidence.String(), + Category: issue.Category, + Message: issue.Message, + IssueFingerprint: issue.Fingerprint, + RelatedArtifactPaths: issue.RelatedArtifactPaths, + 
UnresolvedArtifactPaths: issue.UnresolvedArtifactPaths, }) } skipped := make([]output.ManifestSkipReasonInput, 0, len(res.Skipped)) @@ -318,7 +318,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } manifestMeta := output.ManifestMeta{ - SchemaVersion: "2.0.0", + SchemaVersion: "3.0.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: archiveName, Version: config.Version, @@ -404,6 +404,49 @@ func collectorStatus(res *collector.CollectorResult) string { return "ok" } +func triageFindingToIssue(f triage.Finding) (collector.Issue, error) { + if !f.Severity.Valid() { + return collector.Issue{}, fmt.Errorf("invalid severity %q", f.Severity.String()) + } + if f.Code == "" { + return collector.Issue{}, fmt.Errorf("missing code") + } + if !f.Confidence.Valid() { + return collector.Issue{}, fmt.Errorf("invalid confidence %q", f.Confidence.String()) + } + return collector.Issue{ + Code: collector.IssueCode(f.Code), + Severity: f.Severity, + Confidence: f.Confidence, + Category: f.Category, + Message: f.Title + ": " + f.Description, + Fingerprint: f.Fingerprint, + RelatedArtifactPaths: append([]string(nil), f.SourceArtifacts...), + }, nil +} + +// validateIssues filters out issues with invalid severity, code, or confidence, +// warning about each one. Both the summary and manifest loops use the returned slice. 
+func validateIssues(u ui.UI, collectorID string, issues []collector.Issue) []collector.Issue { + valid := make([]collector.Issue, 0, len(issues)) + for _, issue := range issues { + if !issue.Severity.Valid() { + u.Warn(fmt.Sprintf("collector %s: issue with invalid severity skipped: %s", collectorID, issue.Message)) + continue + } + if issue.Code == "" { + u.Warn(fmt.Sprintf("collector %s: issue missing code skipped: %s", collectorID, issue.Message)) + continue + } + if !issue.Confidence.Valid() { + u.Warn(fmt.Sprintf("collector %s: issue with invalid confidence skipped: %s", collectorID, issue.Message)) + continue + } + valid = append(valid, issue) + } + return valid +} + func uiAllowedInstall(u ui.UI, cfg *config.Config) bool { return u.IsInteractive() && !cfg.NonInteractive } diff --git a/customers/vm-troubleshooting/internal/runner/runner_test.go b/customers/vm-troubleshooting/internal/runner/runner_test.go index 6bddf4a..feffa2b 100644 --- a/customers/vm-troubleshooting/internal/runner/runner_test.go +++ b/customers/vm-troubleshooting/internal/runner/runner_test.go @@ -4,6 +4,9 @@ import ( "os" "path/filepath" "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/triage" ) func TestValidateOutputDir(t *testing.T) { @@ -27,3 +30,57 @@ func TestValidateOutputDirRejectsFile(t *testing.T) { t.Fatal("expected validateOutputDir to reject non-directory path") } } + +func TestTriageFindingToIssue_PreservesIdentityFields(t *testing.T) { + t.Parallel() + + f := triage.Finding{ + Code: triage.FindingXid, + Severity: collector.SeverityCritical, + Confidence: collector.ConfidenceHigh, + Category: "GPU", + Title: "Xid 79 (GPU_FALLEN_OFF_BUS)", + Description: "GPU has fallen off the bus on 3b:00", + SourceArtifacts: []string{"logs/dmesg.txt"}, + Fingerprint: "0123456789abcdef0123456789abcdef", + } + issue, err := triageFindingToIssue(f) + if err != nil { + t.Fatalf("triageFindingToIssue failed: %v", err) + 
} + if issue.Code != collector.IssueCode(f.Code) { + t.Fatalf("code mismatch: got %q want %q", issue.Code, f.Code) + } + if issue.Confidence != f.Confidence { + t.Fatalf("confidence mismatch: got %q want %q", issue.Confidence, f.Confidence) + } + if len(issue.RelatedArtifactPaths) != 1 || issue.RelatedArtifactPaths[0] != "logs/dmesg.txt" { + t.Fatalf("related paths mismatch: %#v", issue.RelatedArtifactPaths) + } + if issue.Fingerprint != f.Fingerprint { + t.Fatalf("fingerprint mismatch: got %q want %q", issue.Fingerprint, f.Fingerprint) + } +} + +func TestTriageFindingToIssue_ValidatesRequiredIdentityFields(t *testing.T) { + t.Parallel() + + _, err := triageFindingToIssue(triage.Finding{ + Severity: collector.SeverityWarning, + Confidence: collector.ConfidenceHigh, + Title: "missing code", + }) + if err == nil { + t.Fatal("expected error for missing code") + } + + _, err = triageFindingToIssue(triage.Finding{ + Code: triage.FindingCriticalLog, + Severity: collector.SeverityWarning, + Confidence: collector.Confidence("maybe"), + Title: "bad confidence", + }) + if err == nil { + t.Fatal("expected error for invalid confidence") + } +} diff --git a/customers/vm-troubleshooting/internal/triage/critical.go b/customers/vm-troubleshooting/internal/triage/critical.go index 992cc45..e4f3835 100644 --- a/customers/vm-troubleshooting/internal/triage/critical.go +++ b/customers/vm-troubleshooting/internal/triage/critical.go @@ -10,6 +10,7 @@ import ( "strings" "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" ) // CriticalPattern describes a high-impact log pattern to search for. @@ -18,36 +19,37 @@ type CriticalPattern struct { Pattern *regexp.Regexp Severity collector.Severity Category string - Hidden bool // if true, finding omitted from SUMMARY.txt + // Confidence controls SUMMARY visibility (high shown, low hidden). 
+ Confidence collector.Confidence } // criticalPatterns are high-confidence patterns applied to all log sources. // NOTE: Xid/SXid is owned by triage/xid.go; OOM is owned by collector/journal.go. // Do not add patterns here that duplicate those owners. var criticalPatterns = []CriticalPattern{ - {"Kernel Panic", regexp.MustCompile(`(?i)(kernel panic|BUG:|call trace)`), collector.SeverityCritical, "KERN", false}, - {"Hardware Error", regexp.MustCompile(`(?i)(hardware error|machine check|mce:)`), collector.SeverityCritical, "HW", false}, - {"Fallen Off Bus", regexp.MustCompile(`(?i)fallen off the bus`), collector.SeverityCritical, "GPU", false}, - {"Timeout", regexp.MustCompile(`(?i)\b(timeout|timed out)\b`), collector.SeverityWarning, "TIMEOUT", false}, + {"Kernel Panic", regexp.MustCompile(`(?i)(kernel panic|BUG:|call trace)`), collector.SeverityCritical, "KERN", collector.ConfidenceHigh}, + {"Hardware Error", regexp.MustCompile(`(?i)(hardware error|machine check|mce:)`), collector.SeverityCritical, "HW", collector.ConfidenceHigh}, + {"Fallen Off Bus", regexp.MustCompile(`(?i)fallen off the bus`), collector.SeverityCritical, "GPU", collector.ConfidenceHigh}, + {"Timeout", regexp.MustCompile(`(?i)\b(timeout|timed out)\b`), collector.SeverityWarning, "TIMEOUT", collector.ConfidenceHigh}, } // lowConfidencePatterns are applied only to error-priority sources (journal_errors.txt). -// They produce Hidden=true findings to avoid flooding SUMMARY.txt. +// They produce confidence=low findings to avoid flooding SUMMARY.txt. var lowConfidencePatterns = []CriticalPattern{ - {"Error/Fail", regexp.MustCompile(`(?i)\b(error|failed|failure)\b`), collector.SeverityInfo, "ERR", true}, + {"Error/Fail", regexp.MustCompile(`(?i)\b(error|failed|failure)\b`), collector.SeverityInfo, "ERR", collector.ConfidenceLow}, } const maxEvents = 100 // criticalEvent is an internal deduplication record. 
type criticalEvent struct { - pattern string - line string - severity collector.Severity - category string - hidden bool - count int - source string + pattern string + line string + severity collector.Severity + category string + confidence collector.Confidence + count int + source string } // AnalyzeCriticalLogs scans collected log artifacts for high-impact patterns. @@ -123,13 +125,13 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro existing.count++ } else { seen[k] = &criticalEvent{ - pattern: p.Name, - line: line, - severity: p.Severity, - category: p.Category, - hidden: p.Hidden, - count: 1, - source: src.path, + pattern: p.Name, + line: line, + severity: p.Severity, + category: p.Category, + confidence: p.Confidence, + count: 1, + source: src.path, } eventOrder = append(eventOrder, k) } @@ -184,12 +186,17 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro } findings = append(findings, Finding{ + Code: FindingCriticalLog, Severity: ev.severity, - Hidden: ev.hidden, + Confidence: ev.confidence, Category: ev.category, Title: ev.pattern, Description: fmt.Sprintf("%s (%dx in %s)", line, ev.count, ev.source), Evidence: []string{line}, + SourceArtifacts: []string{ + ev.source, + }, + Fingerprint: identity.Fingerprint("crit", criticalPatternKey(ev.pattern), criticalSourceClass(ev.source)), }) textLines = append(textLines, fmt.Sprintf(" [%s] %s (%dx): %s", ev.severity.String(), ev.pattern, ev.count, line)) @@ -208,3 +215,22 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro Text: strings.Join(textLines, "\n") + "\n", }, nil } + +func criticalPatternKey(pattern string) string { + key := strings.ToLower(strings.TrimSpace(pattern)) + key = strings.ReplaceAll(key, " ", "_") + return key +} + +func criticalSourceClass(path string) string { + switch path { + case "logs/dmesg.txt": + return "dmesg" + case "logs/journal_kernel.txt": + return "journal_kernel" + case 
"logs/journal_errors.txt": + return "journal_errors" + default: + return "unknown" + } +} diff --git a/customers/vm-troubleshooting/internal/triage/critical_test.go b/customers/vm-troubleshooting/internal/triage/critical_test.go index fc58538..138ed0c 100644 --- a/customers/vm-troubleshooting/internal/triage/critical_test.go +++ b/customers/vm-troubleshooting/internal/triage/critical_test.go @@ -9,6 +9,7 @@ import ( "testing" "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" ) func TestAnalyzeCriticalLogs_OOMNotMatched(t *testing.T) { @@ -31,7 +32,7 @@ func TestAnalyzeCriticalLogs_OOMNotMatched(t *testing.T) { } } -func TestAnalyzeCriticalLogs_LowConfidenceHidden(t *testing.T) { +func TestAnalyzeCriticalLogs_LowConfidence(t *testing.T) { t.Parallel() workDir := t.TempDir() os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) @@ -49,12 +50,22 @@ func TestAnalyzeCriticalLogs_LowConfidenceHidden(t *testing.T) { } for _, f := range tr.Findings { if f.Title == "Error/Fail" { - if !f.Hidden { - t.Error("low-confidence Error/Fail finding should be Hidden=true") + if f.Confidence != collector.ConfidenceLow { + t.Errorf("low-confidence Error/Fail finding should have confidence=low, got %q", f.Confidence) } if f.Severity != collector.SeverityInfo { t.Errorf("low-confidence finding should be info, got %s", f.Severity) } + if f.Code != FindingCriticalLog { + t.Errorf("low-confidence finding should have code=%q, got %q", FindingCriticalLog, f.Code) + } + if len(f.SourceArtifacts) != 1 || f.SourceArtifacts[0] != "logs/journal_errors.txt" { + t.Errorf("expected source_artifacts to contain journal_errors, got %#v", f.SourceArtifacts) + } + wantFP := identity.Fingerprint("crit", criticalPatternKey(f.Title), criticalSourceClass("logs/journal_errors.txt")) + if f.Fingerprint != wantFP { + t.Errorf("unexpected fingerprint: got %q want %q", f.Fingerprint, wantFP) + } return } } diff --git 
a/customers/vm-troubleshooting/internal/triage/firewall.go b/customers/vm-troubleshooting/internal/triage/firewall.go index f10e587..32e44f2 100644 --- a/customers/vm-troubleshooting/internal/triage/firewall.go +++ b/customers/vm-troubleshooting/internal/triage/firewall.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" ) // FirewallPosture classifies the overall firewall stance. @@ -242,6 +243,25 @@ func AnalyzeFirewall(_ context.Context, workDir string) (*TriageResult, error) { posture = PostureUnknown source = "none" } + sourceArtifact := "" + switch source { + case "ufw": + sourceArtifact = "network/ufw_status.txt" + case "firewalld": + sourceArtifact = "network/firewalld_zones.txt" + case "nftables": + sourceArtifact = "network/nftables.txt" + case "iptables": + sourceArtifact = "network/iptables.txt" + } + confidence := collector.ConfidenceHigh + if posture == PostureUnknown { + confidence = collector.ConfidenceLow + } + var sourceArtifacts []string + if sourceArtifact != "" { + sourceArtifacts = []string{sourceArtifact} + } sev := collector.SeverityInfo var desc, action string @@ -264,11 +284,15 @@ func AnalyzeFirewall(_ context.Context, workDir string) (*TriageResult, error) { } finding := Finding{ - Severity: sev, - Category: "FW", - Title: "Firewall Posture: " + string(posture), - Description: desc, - Action: action, + Code: FindingFirewallPosture, + Severity: sev, + Confidence: confidence, + Category: "FW", + Title: "Firewall Posture: " + string(posture), + Description: desc, + Action: action, + SourceArtifacts: sourceArtifacts, + Fingerprint: identity.Fingerprint("fw", string(posture), source), } text := fmt.Sprintf("Firewall Posture Analysis\n\n Posture: %s\n Source: %s\n %s\n Action: %s\n", posture, source, desc, action) diff --git a/customers/vm-troubleshooting/internal/triage/firewall_analyze_test.go 
b/customers/vm-troubleshooting/internal/triage/firewall_analyze_test.go new file mode 100644 index 0000000..c3a934e --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/firewall_analyze_test.go @@ -0,0 +1,47 @@ +package triage + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" +) + +func TestAnalyzeFirewall_SetsIdentityFields(t *testing.T) { + t.Parallel() + + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "network"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(workDir, "network/ufw_status.txt"), []byte( + "Status: active\nDefault: deny (incoming), allow (outgoing)\n", + ), 0o644); err != nil { + t.Fatal(err) + } + + tr, err := AnalyzeFirewall(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil || len(tr.Findings) != 1 { + t.Fatalf("expected one finding, got %#v", tr) + } + f := tr.Findings[0] + if f.Code != FindingFirewallPosture { + t.Fatalf("expected code %q, got %q", FindingFirewallPosture, f.Code) + } + if f.Confidence != collector.ConfidenceHigh { + t.Fatalf("expected confidence=high, got %q", f.Confidence) + } + if len(f.SourceArtifacts) != 1 || f.SourceArtifacts[0] != "network/ufw_status.txt" { + t.Fatalf("expected source_artifacts=[network/ufw_status.txt], got %#v", f.SourceArtifacts) + } + wantFP := identity.Fingerprint("fw", "restrictive", "ufw") + if f.Fingerprint != wantFP { + t.Fatalf("unexpected fingerprint: got %q want %q", f.Fingerprint, wantFP) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/integration_test.go b/customers/vm-troubleshooting/internal/triage/integration_test.go index 40b3dc9..c73d267 100644 --- a/customers/vm-troubleshooting/internal/triage/integration_test.go +++ b/customers/vm-troubleshooting/internal/triage/integration_test.go @@ -15,7 +15,7 @@ import ( // 
TestTriagePipeline_EndToEnd runs all analyzers against a mock workDir // and verifies the full contract: files written, artifacts tracked, -// findings have valid severities, and Hidden behavior is correct. +// findings have valid severities, and confidence behavior is correct. func TestTriagePipeline_EndToEnd(t *testing.T) { t.Parallel() @@ -34,7 +34,7 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { os.WriteFile(filepath.Join(workDir, "network/iptables.txt"), []byte("Chain INPUT (policy DROP 0 packets, 0 bytes)\nChain FORWARD (policy DROP 0 packets, 0 bytes)\nChain OUTPUT (policy DROP 0 packets, 0 bytes)\n"), 0o644) - // journal_errors with low-confidence match (should be hidden) + // journal_errors with low-confidence match os.WriteFile(filepath.Join(workDir, "logs/journal_errors.txt"), []byte("# Command: journalctl\n---\nsystemd[1]: Failed to start some.service\n"), 0o644) @@ -42,6 +42,16 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { if err != nil { t.Fatalf("RunAllAnalyzers failed: %v", err) } + for _, tr := range results { + for _, f := range tr.Findings { + if f.Code == "" { + t.Fatalf("finding missing code: analyzer=%s finding=%+v", tr.Name, f) + } + if !f.Confidence.Valid() { + t.Fatalf("finding has invalid confidence: analyzer=%s finding=%+v", tr.Name, f) + } + } + } // --- Verify triage files exist on disk --- for _, name := range []string{ @@ -78,10 +88,13 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { continue } triageResult.Issues = append(triageResult.Issues, collector.Issue{ - Severity: f.Severity, - Category: f.Category, - Message: f.Title + ": " + f.Description, - Hidden: f.Hidden, + Code: collector.IssueCode(f.Code), + Severity: f.Severity, + Confidence: f.Confidence, + Category: f.Category, + Message: f.Title + ": " + f.Description, + Fingerprint: f.Fingerprint, + RelatedArtifactPaths: append([]string(nil), f.SourceArtifacts...), }) } for k, v := range tr.Facts { @@ -100,8 +113,16 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { 
if len(triageResult.Issues) == 0 { t.Fatal("expected triage issues") } + for _, issue := range triageResult.Issues { + if issue.Code == "" { + t.Fatalf("issue missing code: %+v", issue) + } + if !issue.Confidence.Valid() { + t.Fatalf("issue has invalid confidence: %+v", issue) + } + } - // Check Xid 79 is critical and visible + // Check Xid 79 is critical and high confidence foundXid := false for _, issue := range triageResult.Issues { if strings.Contains(issue.Message, "Xid 79") { @@ -109,8 +130,8 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { if issue.Severity != collector.SeverityCritical { t.Errorf("Xid 79 should be critical, got %s", issue.Severity) } - if issue.Hidden { - t.Error("Xid 79 should not be hidden") + if issue.Confidence != collector.ConfidenceHigh { + t.Errorf("Xid 79 should be high confidence, got %q", issue.Confidence) } } } @@ -118,7 +139,7 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { t.Error("expected Xid 79 finding") } - // Check firewall posture is info and visible + // Check firewall posture is info and high confidence foundFW := false for _, issue := range triageResult.Issues { if strings.Contains(issue.Message, "Firewall Posture") { @@ -126,8 +147,8 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { if issue.Severity != collector.SeverityInfo { t.Errorf("firewall posture should be info, got %s", issue.Severity) } - if issue.Hidden { - t.Error("firewall posture should not be hidden") + if issue.Confidence != collector.ConfidenceHigh { + t.Errorf("firewall posture should be high confidence, got %q", issue.Confidence) } } } @@ -135,11 +156,11 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { t.Error("expected firewall posture finding") } - // Check low-confidence Error/Fail is hidden + // Check low-confidence Error/Fail is low confidence for _, issue := range triageResult.Issues { if strings.Contains(issue.Message, "Error/Fail") { - if !issue.Hidden { - t.Error("Error/Fail finding should be hidden") + if issue.Confidence != 
collector.ConfidenceLow { + t.Errorf("Error/Fail finding should be low confidence, got %q", issue.Confidence) } if issue.Severity != collector.SeverityInfo { t.Errorf("Error/Fail should be info, got %s", issue.Severity) diff --git a/customers/vm-troubleshooting/internal/triage/triage.go b/customers/vm-troubleshooting/internal/triage/triage.go index 7a00551..e04494e 100644 --- a/customers/vm-troubleshooting/internal/triage/triage.go +++ b/customers/vm-troubleshooting/internal/triage/triage.go @@ -16,13 +16,33 @@ import ( // Finding represents a single analyzed finding from a triage analyzer. type Finding struct { - Severity collector.Severity `json:"severity"` - Hidden bool `json:"hidden,omitempty"` - Category string `json:"category"` - Title string `json:"title"` - Description string `json:"description"` - Action string `json:"action,omitempty"` - Evidence []string `json:"evidence,omitempty"` + Code FindingCode `json:"code"` + Severity collector.Severity `json:"severity"` + Confidence collector.Confidence `json:"confidence"` + Category string `json:"category"` + Title string `json:"title"` + Description string `json:"description"` + Action string `json:"action,omitempty"` + Evidence []string `json:"evidence,omitempty"` + SourceArtifacts []string `json:"source_artifacts,omitempty"` + Fingerprint string `json:"issue_fingerprint,omitempty"` +} + +type FindingCode string + +const ( + FindingXid FindingCode = "xid" + FindingSXid FindingCode = "sxid" + FindingFirewallPosture FindingCode = "firewall_posture" + FindingCriticalLog FindingCode = "critical_log" +) + +// FindingCodes enumerates triage-owned finding codes. +var FindingCodes = map[string]bool{ + string(FindingXid): true, + string(FindingSXid): true, + string(FindingFirewallPosture): true, + string(FindingCriticalLog): true, } // TriageResult holds the output of a single analyzer. 
@@ -39,7 +59,7 @@ type TriageResult struct { type Analyzer func(ctx context.Context, workDir string) (*TriageResult, error) // triageSchemaVersion is the schema version emitted in triage result JSON files. -const triageSchemaVersion = "2.0.0" +const triageSchemaVersion = "3.0.0" // RunAllAnalyzers executes all registered analyzers with spinner feedback. // Missing artifacts are handled gracefully — analyzers skip what isn't there. diff --git a/customers/vm-troubleshooting/internal/triage/xid.go b/customers/vm-troubleshooting/internal/triage/xid.go index 7e944a4..d38ae76 100644 --- a/customers/vm-troubleshooting/internal/triage/xid.go +++ b/customers/vm-troubleshooting/internal/triage/xid.go @@ -10,6 +10,7 @@ import ( "strings" "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" ) // XidEntry describes a known Xid error code. @@ -197,6 +198,7 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { // Try primary source, then fallback sources := []string{"logs/dmesg.txt", "nvidia/xid_errors.txt"} var dmesg string + var sourceArtifact string anySkipped := false for _, src := range sources { state, content := checkArtifact(workDir, src) @@ -204,6 +206,7 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { case ArtifactUsable: if dmesg == "" { dmesg = content + sourceArtifact = src } case ArtifactSkipped: anySkipped = true @@ -242,8 +245,12 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { for _, ev := range events { prefix := "Xid" + family := "xid" + findingCode := FindingXid if ev.IsSXid { prefix = "SXid" + family = "sxid" + findingCode = FindingSXid } resetNote := "" if ev.RequiresReset { @@ -252,14 +259,22 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { title := fmt.Sprintf("%s %d (%s)", prefix, ev.Code, ev.Name) desc := fmt.Sprintf("%s on %s (%dx)%s", ev.Description, ev.BDF, ev.Count, 
resetNote) + var sourceArtifacts []string + if sourceArtifact != "" { + sourceArtifacts = []string{sourceArtifact} + } findings = append(findings, Finding{ - Severity: ev.Severity, - Category: "GPU", - Title: title, - Description: desc, - Action: ev.Action, - Evidence: []string{fmt.Sprintf("BDF=%s count=%d", ev.BDF, ev.Count)}, + Code: findingCode, + Severity: ev.Severity, + Confidence: collector.ConfidenceHigh, + Category: "GPU", + Title: title, + Description: desc, + Action: ev.Action, + Evidence: []string{fmt.Sprintf("BDF=%s count=%d", ev.BDF, ev.Count)}, + SourceArtifacts: sourceArtifacts, + Fingerprint: identity.Fingerprint(family, strconv.Itoa(ev.Code), ev.BDF), }) textLines = append(textLines, fmt.Sprintf(" [%s] %s: %s", ev.Severity.String(), title, desc)) diff --git a/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go index b010ec7..a93599e 100644 --- a/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go +++ b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go @@ -5,6 +5,8 @@ import ( "os" "path/filepath" "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/identity" ) func TestAnalyzeXid_UsesUsableFallbackWhenPrimarySkipped(t *testing.T) { @@ -40,6 +42,25 @@ func TestAnalyzeXid_UsesUsableFallbackWhenPrimarySkipped(t *testing.T) { if tr.Facts["xid_classified_count"] != "1" { t.Fatalf("expected xid_classified_count=1, got %q", tr.Facts["xid_classified_count"]) } + if len(tr.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(tr.Findings)) + } + if tr.Findings[0].Code != FindingXid { + t.Fatalf("expected finding code %q, got %q", FindingXid, tr.Findings[0].Code) + } + if tr.Findings[0].Confidence != "high" { + t.Fatalf("expected high confidence, got %q", tr.Findings[0].Confidence) + } + if len(tr.Findings[0].SourceArtifacts) != 1 || tr.Findings[0].SourceArtifacts[0] != "nvidia/xid_errors.txt" { + t.Fatalf("expected fallback source 
artifact, got %#v", tr.Findings[0].SourceArtifacts) + } + if tr.Findings[0].Fingerprint == "" { + t.Fatal("expected non-empty finding fingerprint") + } + wantFP := identity.Fingerprint("xid", "79", "3b:00") + if tr.Findings[0].Fingerprint != wantFP { + t.Fatalf("unexpected fingerprint: got %q want %q", tr.Findings[0].Fingerprint, wantFP) + } } func TestAnalyzeXid_ReturnsUnavailableWhenOnlySkippedSourcesExist(t *testing.T) { diff --git a/customers/vm-troubleshooting/schemas/manifest.schema.json b/customers/vm-troubleshooting/schemas/manifest.schema.json index 99708b1..17cfee1 100644 --- a/customers/vm-troubleshooting/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting/schemas/manifest.schema.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/manifest/v2", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/manifest/v3", "title": "VM Diagnostics Manifest", "description": "Machine-readable index of a vm-diagnostics archive. artifact_index covers collector-produced payload and derived diagnostic files; framework control files (manifest.json, report.ndjson, SUMMARY.txt, metadata.json, transfer_commands.txt, schemas/*) are excluded. 
Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", "type": "object", @@ -90,12 +90,28 @@ }, "issue": { "type": "object", - "required": ["severity", "category", "message"], + "required": ["code", "severity", "confidence", "category", "message"], "properties": { + "code": { + "type": "string", + "enum": [ + "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", + "xid", "sxid", "firewall_posture", "critical_log" + ] + }, "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, + "confidence": { "type": "string", "enum": ["high", "low"] }, "category": { "type": "string" }, "message": { "type": "string" }, - "hidden": { "type": "boolean" } + "issue_fingerprint": { "type": "string", "pattern": "^[0-9a-f]{32}$" }, + "related_artifact_paths": { + "type": "array", + "items": { "type": "string" } + }, + "unresolved_artifact_paths": { + "type": "array", + "items": { "type": "string" } + } } }, "skip_reason": { diff --git a/customers/vm-troubleshooting/schemas/report-record.schema.json b/customers/vm-troubleshooting/schemas/report-record.schema.json index a06ab0a..4126bc5 100644 --- a/customers/vm-troubleshooting/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting/schemas/report-record.schema.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/report-record/v2", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/report-record/v3", "title": "VM Diagnostics Report Record", "description": "Schema for each NDJSON line in report.ndjson. Discriminated by 'type' field. Wire rules: UTF-8 encoding, each line is one complete JSON object followed by \\n (0x0A), optionally preceded by \\r (0x0D). JSON texts must not contain literal newlines or carriage returns. Parsers may silently ignore empty lines. 
Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", "type": "object", @@ -37,12 +37,28 @@ { "properties": { "type": { "const": "issue" }, + "code": { + "type": "string", + "enum": [ + "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", + "xid", "sxid", "firewall_posture", "critical_log" + ] + }, "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, + "confidence": { "type": "string", "enum": ["high", "low"] }, "category": { "type": "string" }, "message": { "type": "string" }, - "hidden": { "type": "boolean" } + "issue_fingerprint": { "type": "string", "pattern": "^[0-9a-f]{32}$" }, + "related_artifact_paths": { + "type": "array", + "items": { "type": "string" } + }, + "unresolved_artifact_paths": { + "type": "array", + "items": { "type": "string" } + } }, - "required": ["type", "severity", "category", "message"] + "required": ["type", "code", "severity", "confidence", "category", "message"] }, { "properties": { diff --git a/customers/vm-troubleshooting/schemas/triage-result.schema.json b/customers/vm-troubleshooting/schemas/triage-result.schema.json index 51c40f4..2a57a5d 100644 --- a/customers/vm-troubleshooting/schemas/triage-result.schema.json +++ b/customers/vm-troubleshooting/schemas/triage-result.schema.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/triage-result/v2", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/triage-result/v3", "title": "VM Diagnostics Triage Result", "description": "Schema for triage/_data/*.json files. Each file is a per-analyzer result envelope containing classified findings and typed facts. 
The finding object shape is closed; facts remain open for additive analyzer growth.", "type": "object", @@ -23,16 +23,25 @@ "$defs": { "finding": { "type": "object", - "required": ["severity", "category", "title", "description"], + "required": ["code", "severity", "confidence", "category", "title", "description"], "additionalProperties": false, "properties": { + "code": { + "type": "string", + "enum": ["xid", "sxid", "firewall_posture", "critical_log"] + }, "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, - "hidden": { "type": "boolean" }, + "confidence": { "type": "string", "enum": ["high", "low"] }, "category": { "type": "string" }, "title": { "type": "string" }, "description": { "type": "string" }, "action": { "type": "string" }, - "evidence": { "type": "array", "items": { "type": "string" } } + "evidence": { "type": "array", "items": { "type": "string" } }, + "source_artifacts": { + "type": "array", + "items": { "type": "string" } + }, + "issue_fingerprint": { "type": "string", "pattern": "^[0-9a-f]{32}$" } } } } From e5c040c1c1c578b1768052a0cadaa1eae95b19ca Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Wed, 8 Apr 2026 18:53:11 +0200 Subject: [PATCH 04/23] gather-info: complete phase 3/4 triage core hardening Add bounded structured journal ingestion and fallback-aware triage parsing, then split Xid catalog parsing from local policy while making critical pattern identity explicit for deterministic, stable findings and aligned docs. 
Made-with: Cursor --- customers/vm-troubleshooting/AGENTS.md | 7 + customers/vm-troubleshooting/CODEMAP.md | 7 +- .../internal/collector/collector_test.go | 16 +- .../internal/collector/journal.go | 221 +++++++++++++++++- .../internal/collector/journal_phase3_test.go | 172 ++++++++++++++ .../internal/runner/runner.go | 15 +- .../internal/runner/runner_test.go | 8 +- .../internal/triage/critical.go | 158 ++++++++++--- .../internal/triage/critical_test.go | 103 +++++++- .../internal/triage/journal_ndjson.go | 76 ++++++ .../internal/triage/journal_ndjson_test.go | 48 ++++ .../internal/triage/triage.go | 2 + .../vm-troubleshooting/internal/triage/xid.go | 148 ++++++------ .../internal/triage/xid_analyze_test.go | 39 ++++ .../internal/triage/xid_test.go | 67 ++++++ .../internal/triage/xidcatalog/UPSTREAM.md | 16 ++ .../internal/triage/xidcatalog/catalog.go | 117 ++++++++++ .../triage/xidcatalog/catalog_test.go | 38 +++ .../triage/xidcatalog/sync_catalog.sh | 11 + docs/architecture.md | 27 ++- 20 files changed, 1149 insertions(+), 147 deletions(-) create mode 100644 customers/vm-troubleshooting/internal/collector/journal_phase3_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/journal_ndjson.go create mode 100644 customers/vm-troubleshooting/internal/triage/journal_ndjson_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md create mode 100644 customers/vm-troubleshooting/internal/triage/xidcatalog/catalog.go create mode 100644 customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh diff --git a/customers/vm-troubleshooting/AGENTS.md b/customers/vm-troubleshooting/AGENTS.md index 4305c5a..89df119 100644 --- a/customers/vm-troubleshooting/AGENTS.md +++ b/customers/vm-troubleshooting/AGENTS.md @@ -59,9 +59,16 @@ The archive contains three complementary output files for different consumers: Rules for the 
structured layer: - Schema files live in `schemas/` in-repo and are included in every archive. - `schema_version` follows semver: minor adds fields, major changes types. Current: `3.0.0`. +- Issues and findings are identity-bearing records: keep `code`, `severity`, `confidence`, and deterministic fingerprints populated. - Facts with integer keys (`cpu_cores`, `gpu_count`, `memory_total`, `oom_event_count`, `xid_classified_count`, `critical_event_count`, `container_count`, `vllm_container_count`, `failed_service_count`) are typed as integers in manifest/report/triage. `"unavailable"` maps to `null`. - All other facts remain strings. +## Triage ownership and policy boundaries +- Keep analyzer family ownership singular (no duplicate classification across analyzers). +- `xidcatalog` is the neutral Xid data/parsing boundary. Keep local support policy (severity, reset/reboot guidance, operational overrides like Xid 154) in `triage/xid.go`. +- Do not reintroduce presentation-only flags (`hidden`). Summary visibility is derived from `confidence`. +- Preserve deterministic sorting and explicit fingerprint inputs in analyzers. + ## Artifact registration - Every artifact must go through `saveCommand`, `saveFile`, `saveCapturedProbe`, `saveProbeOutput`, or `saveDirConcat` in `common.go`, or use `Writer.ReservePath` + one of the `Add*Artifact` helpers for custom flows. - Every artifact requires a `parserHint` (from `ValidParserHints`) and 1-3 `tags` (from `ValidTags`). Both are validated before write — invalid values record an error and skip the write. diff --git a/customers/vm-troubleshooting/CODEMAP.md b/customers/vm-troubleshooting/CODEMAP.md index 535bb5c..a0ae39d 100644 --- a/customers/vm-troubleshooting/CODEMAP.md +++ b/customers/vm-troubleshooting/CODEMAP.md @@ -102,7 +102,8 @@ Keep this file updated in the same change as architecture or collector changes. - Owns post-collection analysis: Xid classification, firewall posture detection, critical log extraction. 
- Runs after collectors, before summary/manifest generation. Findings injected as a synthetic "triage" collector result. - Output: `triage/*.txt` (human) + `triage/_data/*.json` (machine). Only `confidence=high` issues appear in `SUMMARY.txt`; both `high` and `low` are preserved in manifest/report. -- Xid catalog: 24 datacenter-relevant codes from NVIDIA r590, with Xid 154 dynamic severity from recovery action text. +- Xid catalog: local package boundary in `internal/triage/xidcatalog` (neutral lookup/parser), with local support policy and Xid 154 operational overrides kept in `internal/triage/xid.go`. +- Finding identity is explicit and stable (`code`, `confidence`, `issue_fingerprint`, artifact linkage fields), then bridged by runner into issue records. ### `internal/collector/` - Owns domain collectors and shared collector helpers. @@ -118,7 +119,7 @@ Keep this file updated in the same change as architecture or collector changes. | `DcgmCollector` | `dcgm/` | No | `dcgmi` discovery, health, stats, and level 2 diag (1-min cap) | | `DockerCollector` | `docker/` | Mostly shell | Docker CLI plus Go sanitization | | `ServicesCollector` | `services/` | Mixed | Batch D-Bus resolution (`ListUnitsByNamesContext`), distro-aware SSH name (`sshd`→`ssh` on Debian), existence-gated per-service artifacts, fabricmanager false-positive handling (tri-state NVSwitch detection) | -| `JournalCollector` | `logs/` | No | `journalctl`/`dmesg` are authoritative | +| `JournalCollector` | `logs/` | No | `journalctl`/`dmesg` are authoritative; emits text plus bounded sanitized NDJSON (`journal_kernel.ndjson`, `journal_errors.ndjson`) using one fixed `--until` bound per run | | `PackagesCollector` | `packages/` | No | Package managers remain distro authority | | `AdditionalCollector` | `system/`, `hardware/` | Mixed | Limits, sysctl, LVM, sensors, mounts | | `StorageCollector` | `hardware/` | No | `nvme` and `smartctl` | @@ -174,7 +175,7 @@ Run from `customers/vm-troubleshooting/`: gofmt -w 
. go test ./... go vet ./... -go build -o bin/gather-info ./cmd/gather-info +CGO_ENABLED=0 go build ./cmd/gather-info ``` For smoke testing: diff --git a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index 313d9cb..d08c017 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/NexGenCloud/vm-diagnostics/internal/executor" "github.com/NexGenCloud/vm-diagnostics/internal/identity" @@ -116,27 +117,32 @@ func TestJournalCollectorTreatsGrepExitCode1AsNoMatches(t *testing.T) { fake.RootAccess = true fake.Binaries["journalctl"] = true fake.Binaries["dmesg"] = true + const until = "2026-04-08T10:00:00Z" + base := "-b --no-pager --until=" + until // Provide responses for all journalctl commands that run before the OOM grep for _, args := range []string{ - "journalctl -b --no-pager -k", - "journalctl -b --no-pager -p err", - "journalctl -b --no-pager -p warning", + "journalctl " + base + " -k", + "journalctl " + base + " -p err", + "journalctl " + base + " -p warning", + "journalctl " + base + " -k -o json --output-fields=" + journalOutputFields + " --lines=50001", + "journalctl " + base + " -p err -o json --output-fields=" + journalOutputFields + " --lines=50001", } { fake.Commands[args] = executor.FakeResponse{Stdout: []byte("log line\n")} } fake.Commands["dmesg -T"] = executor.FakeResponse{Stdout: []byte("ok\n")} // journalctl --grep returns exit 1 when no entries match (like grep) - fake.Commands["journalctl -b --no-pager --grep=oom|out of memory|killed process --case-sensitive=false"] = executor.FakeResponse{ + fake.Commands["journalctl "+base+" --grep=oom|out of memory|killed process --case-sensitive=false"] = executor.FakeResponse{ ExitCode: 1, Err: fmt.Errorf("exit status 1"), } // Per-service journal 
commands for _, svc := range []string{"docker", "containerd", "nvidia-persistenced", "nvidia-fabricmanager", "kubelet", "NetworkManager", "systemd-networkd", "systemd-resolved"} { - fake.Commands["journalctl -b --no-pager -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} } root := t.TempDir() collector := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + collector.nowUTC = func() time.Time { return time.Date(2026, 4, 8, 10, 0, 0, 0, time.UTC) } res, err := collector.Collect(context.Background()) if err != nil { t.Fatalf("Collect failed: %v", err) diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index 351c70e..6026c8e 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -2,12 +2,16 @@ package collector import ( "context" + "encoding/json" "fmt" + "strconv" "strings" + "time" "github.com/NexGenCloud/vm-diagnostics/internal/config" "github.com/NexGenCloud/vm-diagnostics/internal/executor" "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/sanitize" "github.com/NexGenCloud/vm-diagnostics/internal/ui" ) @@ -15,15 +19,34 @@ type JournalCollector struct { Base Since string IncludeFull bool + nowUTC func() time.Time } func NewJournalCollector(exec executor.Executor, writer *output.Writer, ui ui.UI, since string, includeFull bool) *JournalCollector { - return &JournalCollector{Base: Base{Exec: exec, Writer: writer, UI: ui}, Since: since, IncludeFull: includeFull} + return &JournalCollector{ + Base: Base{Exec: exec, Writer: writer, UI: ui}, + Since: since, + IncludeFull: includeFull, + nowUTC: time.Now, + } } func (c *JournalCollector) Name() string { return "Journal" } func (c *JournalCollector) ID() string { return 
"journal" } +const ( + journalNDJSONRecordLimit = 50000 + journalNDJSONByteLimit = 10 * 1024 * 1024 + journalNDJSONSentinelReserve = 256 +) + +var journalServiceUnits = []string{ + "docker", "containerd", "nvidia-persistenced", "nvidia-fabricmanager", + "kubelet", "NetworkManager", "systemd-networkd", "systemd-resolved", +} + +const journalOutputFields = "MESSAGE,PRIORITY,SYSLOG_IDENTIFIER,_SYSTEMD_UNIT,_TRANSPORT,__REALTIME_TIMESTAMP,_BOOT_ID" + func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() if !c.Exec.CommandExists("journalctl") { @@ -31,12 +54,8 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } - journalArgs := []string{"--no-pager"} - if c.Since == "" || c.Since == "boot" { - journalArgs = append([]string{"-b"}, journalArgs...) - } else { - journalArgs = append([]string{"--since=" + c.Since}, journalArgs...) - } + untilUTC := c.nowUTC().UTC().Format(time.RFC3339) + journalArgs := c.journalBaseArgs(untilUTC) if c.IncludeFull { c.saveCommand(ctx, r, "logs/journal_full.txt", executor.CommandSpec{Name: "journalctl", Args: journalArgs, NeedsRoot: true, Timeout: config.TimeoutSlow}, "journalctl", "journal") @@ -44,6 +63,8 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCommand(ctx, r, "logs/journal_kernel.txt", executor.CommandSpec{Name: "journalctl", Args: append(append([]string{}, journalArgs...), "-k"), NeedsRoot: true, Timeout: config.TimeoutSlow}, "journalctl", "journal") c.saveCommand(ctx, r, "logs/journal_errors.txt", executor.CommandSpec{Name: "journalctl", Args: append(append([]string{}, journalArgs...), "-p", "err"), NeedsRoot: true, Timeout: config.TimeoutSlow}, "journalctl", "journal") c.saveCommand(ctx, r, "logs/journal_warnings.txt", executor.CommandSpec{Name: "journalctl", Args: append(append([]string{}, journalArgs...), "-p", "warning"), NeedsRoot: true, Timeout: config.TimeoutSlow}, "journalctl", "journal") 
+ c.saveStructuredJournalNDJSON(ctx, r, "logs/journal_kernel.ndjson", append(append([]string{}, journalArgs...), "-k"), config.TimeoutSlow) + c.saveStructuredJournalNDJSON(ctx, r, "logs/journal_errors.ndjson", append(append([]string{}, journalArgs...), "-p", "err"), config.TimeoutSlow) if c.Exec.CommandExists("dmesg") { c.saveCommand(ctx, r, "logs/dmesg.txt", executor.CommandSpec{Name: "dmesg", Args: []string{"-T"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, "dmesg", "journal") @@ -63,7 +84,7 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error r.RecordSkipForArtifact(SkipPermissionOrAccess, "journalctl OOM scan requires root", oomPath) r.SetFact("oom_event_count", "unavailable") c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan unavailable") - for _, svc := range []string{"docker", "containerd", "nvidia-persistenced", "nvidia-fabricmanager", "kubelet", "NetworkManager", "systemd-networkd", "systemd-resolved"} { + for _, svc := range journalServiceUnits { c.saveCommand(ctx, r, fmt.Sprintf("logs/journal_%s.txt", svc), executor.CommandSpec{Name: "journalctl", Args: append(append([]string{}, journalArgs...), "-u", svc), NeedsRoot: true, Timeout: config.TimeoutMedium}, "journalctl", "journal", "services") } return r, nil @@ -112,9 +133,191 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, "No OOM events found\n", "", "journalctl", []string{"oom"}) } - for _, svc := range []string{"docker", "containerd", "nvidia-persistenced", "nvidia-fabricmanager", "kubelet", "NetworkManager", "systemd-networkd", "systemd-resolved"} { + for _, svc := range journalServiceUnits { c.saveCommand(ctx, r, fmt.Sprintf("logs/journal_%s.txt", svc), executor.CommandSpec{Name: "journalctl", Args: append(append([]string{}, journalArgs...), "-u", svc), NeedsRoot: true, Timeout: config.TimeoutMedium}, 
"journalctl", "journal", "services") } return r, nil } + +func (c *JournalCollector) journalBaseArgs(untilUTC string) []string { + args := []string{"--no-pager", "--until=" + untilUTC} + if c.Since == "" || c.Since == "boot" { + return append([]string{"-b"}, args...) + } + return append([]string{"--since=" + c.Since}, args...) +} + +func (c *JournalCollector) saveStructuredJournalNDJSON(ctx context.Context, r *CollectorResult, path string, baseArgs []string, timeout time.Duration) { + specArgs := append(append([]string{}, baseArgs...), + "-o", "json", + "--output-fields="+journalOutputFields, + fmt.Sprintf("--lines=%d", journalNDJSONRecordLimit+1), + ) + spec := executor.CommandSpec{ + Name: "journalctl", + Args: specArgs, + NeedsRoot: true, + Timeout: timeout, + } + result, stdout, _ := c.Exec.Capture(ctx, spec, journalNDJSONByteLimit+journalNDJSONSentinelReserve) + if result.Skipped { + r.RecordSkipForArtifact(SkipPermissionOrAccess, "journalctl structured output requires root", path) + return + } + if result.Err != nil && result.ExitCode >= 2 { + r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", spec.String(), result.Err), path) + return + } + + content, recordsWritten, truncated, reason := buildJournalNDJSONContent(stdout, result.Truncated) + tags := []string{"journal"} + if err := ValidateTagsAndHint("json", tags); err != nil { + r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", path, err), path) + return + } + if reservedErr := c.Writer.ReservePath(path); reservedErr != nil { + r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", path, reservedErr), path) + return + } + if writeErr := c.Writer.SaveOutput(path, content); writeErr != nil { + c.Writer.ReleasePath(path) + r.RecordErrorForArtifact(ErrArtifactWrite, fmt.Sprintf("%s: %v", path, writeErr), path) + return + } + // Probe type: Go-processed command output (not raw command artifact). 
+ r.Artifacts = append(r.Artifacts, ArtifactRecord{ + Path: path, + Type: "probe", + Command: spec.String(), + ExitCode: result.ExitCode, + Status: "ok", + Truncated: truncated, + Tags: sortedTags(tags), + Duration: result.Duration, + ParserHint: "json", + }) + if truncated { + c.UI.Verbose(fmt.Sprintf(" structured journal truncated: %s (%s, records=%d)", path, reason, recordsWritten)) + } +} + +func buildJournalNDJSONContent(raw []byte, forceByteTruncated bool) (string, int, bool, string) { + lines := strings.Split(string(raw), "\n") + written := make([]string, 0, len(lines)) + recordsWritten := 0 + usedBytes := 0 + truncated := false + truncationReason := "" + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + rec := map[string]any{} + if err := json.Unmarshal([]byte(line), &rec); err != nil { + // Keep output parseable NDJSON by skipping malformed lines. + continue + } + normalized := normalizeJournalRecord(rec) + encoded, err := json.Marshal(normalized) + if err != nil { + continue + } + + if recordsWritten >= journalNDJSONRecordLimit { + truncated = true + truncationReason = "record_limit" + break + } + needed := len(encoded) + 1 // include newline + if usedBytes+needed+journalNDJSONSentinelReserve > journalNDJSONByteLimit { + truncated = true + truncationReason = "byte_limit" + break + } + written = append(written, string(encoded)) + recordsWritten++ + usedBytes += needed + } + + if forceByteTruncated { + truncated = true + if truncationReason == "" { + truncationReason = "byte_limit" + } + } + + if truncated { + // Use a worst-case sentinel size for the eviction loop (the actual + // sentinel can only be smaller after recordsWritten decreases). 
+ worstSentinel, _ := json.Marshal(map[string]any{ + "_truncated": true, + "records_written": recordsWritten, + "reason": truncationReason, + }) + for len(written) > 0 && usedBytes+len(worstSentinel)+1 > journalNDJSONByteLimit { + last := written[len(written)-1] + written = written[:len(written)-1] + usedBytes -= len(last) + 1 + recordsWritten-- + } + // Re-marshal with the actual post-eviction count. + sentinel, _ := json.Marshal(map[string]any{ + "_truncated": true, + "records_written": recordsWritten, + "reason": truncationReason, + }) + if usedBytes+len(sentinel)+1 <= journalNDJSONByteLimit { + written = append(written, string(sentinel)) + } + } + + if len(written) == 0 { + return "", recordsWritten, truncated, truncationReason + } + return strings.Join(written, "\n") + "\n", recordsWritten, truncated, truncationReason +} + +func normalizeJournalRecord(rec map[string]any) map[string]any { + out := map[string]any{ + "MESSAGE": sanitize.SensitiveConfig(journalValueString(rec["MESSAGE"])), + "PRIORITY": journalValueString(rec["PRIORITY"]), + "SYSLOG_IDENTIFIER": journalValueString(rec["SYSLOG_IDENTIFIER"]), + "_SYSTEMD_UNIT": journalValueString(rec["_SYSTEMD_UNIT"]), + "_TRANSPORT": journalValueString(rec["_TRANSPORT"]), + "__REALTIME_TIMESTAMP": journalValueString(rec["__REALTIME_TIMESTAMP"]), + "_BOOT_ID": journalValueString(rec["_BOOT_ID"]), + } + return out +} + +func journalValueString(v any) string { + switch t := v.(type) { + case nil: + return "" + case string: + return t + case float64: + return strconv.FormatInt(int64(t), 10) + case bool: + if t { + return "true" + } + return "false" + case []any: + bytes := make([]byte, 0, len(t)) + for _, item := range t { + n, ok := item.(float64) + if !ok || n < 0 || n > 255 { + return "" + } + bytes = append(bytes, byte(n)) + } + return string(bytes) + default: + return "" + } +} diff --git a/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go 
b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go new file mode 100644 index 0000000..8f1f6e7 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go @@ -0,0 +1,172 @@ +package collector + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +func TestJournalCollectorWritesParseableStructuredNDJSON(t *testing.T) { + t.Parallel() + + const until = "2026-04-08T12:00:00Z" + base := "-b --no-pager --until=" + until + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["journalctl"] = true + fake.Commands["journalctl "+base+" -k"] = executor.FakeResponse{Stdout: []byte("kernel\n")} + fake.Commands["journalctl "+base+" -p err"] = executor.FakeResponse{Stdout: []byte("err\n")} + fake.Commands["journalctl "+base+" -p warning"] = executor.FakeResponse{Stdout: []byte("warn\n")} + fake.Commands["journalctl "+base+" -k -o json --output-fields="+journalOutputFields+" --lines=50001"] = executor.FakeResponse{ + Stdout: []byte( + `{"MESSAGE":"token=abc123","PRIORITY":"3","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"kernel.service","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"boot1"}` + "\n", + ), + } + fake.Commands["journalctl "+base+" -p err -o json --output-fields="+journalOutputFields+" --lines=50001"] = executor.FakeResponse{ + Stdout: []byte( + `{"MESSAGE":"password=secret","PRIORITY":"3","SYSLOG_IDENTIFIER":"systemd","_SYSTEMD_UNIT":"x.service","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"2","_BOOT_ID":"boot1"}` + "\n", + ), + } + fake.Commands["journalctl "+base+" --grep=oom|out of memory|killed process --case-sensitive=false"] = executor.FakeResponse{ + ExitCode: 1, // no OOM matches + Err: context.DeadlineExceeded, // ignored because exit code < 2 logic gates 
real errors + } + for _, svc := range journalServiceUnits { + fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} + } + + root := t.TempDir() + c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + c.nowUTC = func() time.Time { return time.Date(2026, 4, 8, 12, 0, 0, 0, time.UTC) } + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if len(res.Errors) != 0 { + t.Fatalf("expected no collection errors, got: %+v", res.Errors) + } + + for _, path := range []string{"logs/journal_kernel.ndjson", "logs/journal_errors.ndjson"} { + data, readErr := os.ReadFile(filepath.Join(root, path)) + if readErr != nil { + t.Fatalf("reading %s: %v", path, readErr) + } + for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { + if strings.TrimSpace(line) == "" { + continue + } + var obj map[string]any + if err := json.Unmarshal([]byte(line), &obj); err != nil { + t.Fatalf("%s has non-JSON line %q: %v", path, line, err) + } + } + } + kernelData, _ := os.ReadFile(filepath.Join(root, "logs/journal_kernel.ndjson")) + if strings.Contains(string(kernelData), "abc123") { + t.Fatal("expected token value to be redacted in structured NDJSON") + } +} + +func TestJournalCollectorStructuredNDJSONSkipStateIsExplicit(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.Binaries["journalctl"] = true + // No root access -> all journalctl calls are skipped. 
+ root := t.TempDir() + c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + hasKernelSkip := false + hasErrorSkip := false + for _, s := range res.Skipped { + if s.ArtifactPath == "logs/journal_kernel.ndjson" { + hasKernelSkip = true + } + if s.ArtifactPath == "logs/journal_errors.ndjson" { + hasErrorSkip = true + } + } + if !hasKernelSkip || !hasErrorSkip { + t.Fatalf("expected explicit structured artifact skips, got: %+v", res.Skipped) + } +} + +func TestJournalCollectorStructuredNDJSONErrorStateIsExplicit(t *testing.T) { + t.Parallel() + + const until = "2026-04-08T13:00:00Z" + base := "-b --no-pager --until=" + until + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["journalctl"] = true + fake.Commands["journalctl "+base+" -k"] = executor.FakeResponse{Stdout: []byte("kernel\n")} + fake.Commands["journalctl "+base+" -p err"] = executor.FakeResponse{Stdout: []byte("err\n")} + fake.Commands["journalctl "+base+" -p warning"] = executor.FakeResponse{Stdout: []byte("warn\n")} + // Structured kernel command fails with a real error (exit >= 2). 
+ fake.Commands["journalctl "+base+" -k -o json --output-fields="+journalOutputFields+" --lines=50001"] = executor.FakeResponse{ + ExitCode: 2, + Err: context.DeadlineExceeded, + } + fake.Commands["journalctl "+base+" -p err -o json --output-fields="+journalOutputFields+" --lines=50001"] = executor.FakeResponse{ + Stdout: []byte(`{"MESSAGE":"ok","PRIORITY":"3","SYSLOG_IDENTIFIER":"a","_SYSTEMD_UNIT":"u","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b"}` + "\n"), + } + fake.Commands["journalctl "+base+" --grep=oom|out of memory|killed process --case-sensitive=false"] = executor.FakeResponse{ExitCode: 1} + for _, svc := range journalServiceUnits { + fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} + } + + root := t.TempDir() + c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + c.nowUTC = func() time.Time { return time.Date(2026, 4, 8, 13, 0, 0, 0, time.UTC) } + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + foundKernelErr := false + for _, e := range res.Errors { + if e.ArtifactPath == "logs/journal_kernel.ndjson" { + foundKernelErr = true + } + } + if !foundKernelErr { + t.Fatalf("expected explicit structured artifact error for kernel NDJSON, got: %+v", res.Errors) + } +} + +func TestBuildJournalNDJSONContentAddsTruncationSentinel(t *testing.T) { + t.Parallel() + + raw := []byte(`{"MESSAGE":"hello","PRIORITY":"5","SYSLOG_IDENTIFIER":"a","_SYSTEMD_UNIT":"u","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b"}` + "\n") + content, recordsWritten, truncated, reason := buildJournalNDJSONContent(raw, true) + if !truncated || reason != "byte_limit" { + t.Fatalf("expected forced byte_limit truncation, got truncated=%v reason=%q", truncated, reason) + } + if recordsWritten != 1 { + t.Fatalf("expected records_written=1, got %d", recordsWritten) + } + lines := strings.Split(strings.TrimSpace(content), "\n") + if 
len(lines) != 2 { + t.Fatalf("expected payload + sentinel lines, got %d", len(lines)) + } + last := map[string]any{} + if err := json.Unmarshal([]byte(lines[1]), &last); err != nil { + t.Fatalf("invalid sentinel JSON: %v", err) + } + if last["_truncated"] != true { + t.Fatalf("expected _truncated=true, got %#v", last["_truncated"]) + } +} diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index 86a078e..858b053 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -415,13 +415,14 @@ func triageFindingToIssue(f triage.Finding) (collector.Issue, error) { return collector.Issue{}, fmt.Errorf("invalid confidence %q", f.Confidence.String()) } return collector.Issue{ - Code: collector.IssueCode(f.Code), - Severity: f.Severity, - Confidence: f.Confidence, - Category: f.Category, - Message: f.Title + ": " + f.Description, - Fingerprint: f.Fingerprint, - RelatedArtifactPaths: append([]string(nil), f.SourceArtifacts...), + Code: collector.IssueCode(f.Code), + Severity: f.Severity, + Confidence: f.Confidence, + Category: f.Category, + Message: f.Title + ": " + f.Description, + Fingerprint: f.Fingerprint, + RelatedArtifactPaths: append([]string(nil), f.SourceArtifacts...), + UnresolvedArtifactPaths: append([]string(nil), f.UnresolvedArtifactPaths...), }, nil } diff --git a/customers/vm-troubleshooting/internal/runner/runner_test.go b/customers/vm-troubleshooting/internal/runner/runner_test.go index feffa2b..b551195 100644 --- a/customers/vm-troubleshooting/internal/runner/runner_test.go +++ b/customers/vm-troubleshooting/internal/runner/runner_test.go @@ -42,7 +42,10 @@ func TestTriageFindingToIssue_PreservesIdentityFields(t *testing.T) { Title: "Xid 79 (GPU_FALLEN_OFF_BUS)", Description: "GPU has fallen off the bus on 3b:00", SourceArtifacts: []string{"logs/dmesg.txt"}, - Fingerprint: "0123456789abcdef0123456789abcdef", + 
UnresolvedArtifactPaths: []string{ + "logs/journal_kernel.ndjson", + }, + Fingerprint: "0123456789abcdef0123456789abcdef", } issue, err := triageFindingToIssue(f) if err != nil { @@ -57,6 +60,9 @@ func TestTriageFindingToIssue_PreservesIdentityFields(t *testing.T) { if len(issue.RelatedArtifactPaths) != 1 || issue.RelatedArtifactPaths[0] != "logs/dmesg.txt" { t.Fatalf("related paths mismatch: %#v", issue.RelatedArtifactPaths) } + if len(issue.UnresolvedArtifactPaths) != 1 || issue.UnresolvedArtifactPaths[0] != "logs/journal_kernel.ndjson" { + t.Fatalf("unresolved paths mismatch: %#v", issue.UnresolvedArtifactPaths) + } if issue.Fingerprint != f.Fingerprint { t.Fatalf("fingerprint mismatch: got %q want %q", issue.Fingerprint, f.Fingerprint) } diff --git a/customers/vm-troubleshooting/internal/triage/critical.go b/customers/vm-troubleshooting/internal/triage/critical.go index e4f3835..4178e42 100644 --- a/customers/vm-troubleshooting/internal/triage/critical.go +++ b/customers/vm-troubleshooting/internal/triage/critical.go @@ -15,10 +15,12 @@ import ( // CriticalPattern describes a high-impact log pattern to search for. type CriticalPattern struct { - Name string - Pattern *regexp.Regexp - Severity collector.Severity - Category string + Name string + Code FindingCode + FingerprintKey string + Pattern *regexp.Regexp + Severity collector.Severity + Category string // Confidence controls SUMMARY visibility (high shown, low hidden). Confidence collector.Confidence } @@ -27,41 +29,86 @@ type CriticalPattern struct { // NOTE: Xid/SXid is owned by triage/xid.go; OOM is owned by collector/journal.go. // Do not add patterns here that duplicate those owners. 
var criticalPatterns = []CriticalPattern{ - {"Kernel Panic", regexp.MustCompile(`(?i)(kernel panic|BUG:|call trace)`), collector.SeverityCritical, "KERN", collector.ConfidenceHigh}, - {"Hardware Error", regexp.MustCompile(`(?i)(hardware error|machine check|mce:)`), collector.SeverityCritical, "HW", collector.ConfidenceHigh}, - {"Fallen Off Bus", regexp.MustCompile(`(?i)fallen off the bus`), collector.SeverityCritical, "GPU", collector.ConfidenceHigh}, - {"Timeout", regexp.MustCompile(`(?i)\b(timeout|timed out)\b`), collector.SeverityWarning, "TIMEOUT", collector.ConfidenceHigh}, + { + Name: "Kernel Panic", + Code: FindingCriticalLog, + FingerprintKey: "kernel_panic", + Pattern: regexp.MustCompile(`(?i)(kernel panic|BUG:|call trace)`), + Severity: collector.SeverityCritical, + Category: "KERN", + Confidence: collector.ConfidenceHigh, + }, + { + Name: "Hardware Error", + Code: FindingCriticalLog, + FingerprintKey: "hardware_error", + Pattern: regexp.MustCompile(`(?i)(hardware error|machine check|mce:)`), + Severity: collector.SeverityCritical, + Category: "HW", + Confidence: collector.ConfidenceHigh, + }, + { + Name: "Fallen Off Bus", + Code: FindingCriticalLog, + FingerprintKey: "fallen_off_bus", + Pattern: regexp.MustCompile(`(?i)fallen off the bus`), + Severity: collector.SeverityCritical, + Category: "GPU", + Confidence: collector.ConfidenceHigh, + }, + { + Name: "Timeout", + Code: FindingCriticalLog, + FingerprintKey: "timeout", + Pattern: regexp.MustCompile(`(?i)\b(timeout|timed out)\b`), + Severity: collector.SeverityWarning, + Category: "TIMEOUT", + Confidence: collector.ConfidenceHigh, + }, } // lowConfidencePatterns are applied only to error-priority sources (journal_errors.txt). // They produce confidence=low findings to avoid flooding SUMMARY.txt. 
var lowConfidencePatterns = []CriticalPattern{ - {"Error/Fail", regexp.MustCompile(`(?i)\b(error|failed|failure)\b`), collector.SeverityInfo, "ERR", collector.ConfidenceLow}, + { + Name: "Error/Fail", + Code: FindingCriticalLog, + FingerprintKey: "error_fail", + Pattern: regexp.MustCompile(`(?i)\b(error|failed|failure)\b`), + Severity: collector.SeverityInfo, + Category: "ERR", + Confidence: collector.ConfidenceLow, + }, } const maxEvents = 100 // criticalEvent is an internal deduplication record. type criticalEvent struct { - pattern string - line string - severity collector.Severity - category string - confidence collector.Confidence - count int - source string + pattern string + code FindingCode + fingerprintKey string + line string + severity collector.Severity + category string + confidence collector.Confidence + count int + source string + unresolved []string } // AnalyzeCriticalLogs scans collected log artifacts for high-impact patterns. func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, error) { type sourceSpec struct { path string + fallback string + kind string lowConfidence bool // also apply lowConfidencePatterns } sources := []sourceSpec{ - {"logs/dmesg.txt", false}, - {"logs/journal_kernel.txt", false}, - {"logs/journal_errors.txt", true}, + {path: "logs/dmesg.txt", kind: "text", lowConfidence: false}, + {path: "logs/journal_kernel.ndjson", fallback: "logs/journal_kernel.txt", kind: "ndjson", lowConfidence: false}, + {path: "logs/journal_errors.ndjson", fallback: "logs/journal_errors.txt", kind: "ndjson", lowConfidence: true}, } // Single-pass artifact check: classify each source once and cache the @@ -70,6 +117,9 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro sourceSpec state ArtifactState content string + lines []string + // canonical structured artifacts that should exist but were unavailable. 
+ unresolved []string } checked := make([]checkedSource, len(sources)) anySkipped := false @@ -77,10 +127,26 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro for i, src := range sources { state, content := checkArtifact(workDir, src.path) checked[i] = checkedSource{sourceSpec: src, state: state, content: content} + if state == ArtifactUsable { + checked[i].lines = sourceLinesForCritical(src.kind, content) + } + // Structured source unavailable -> fallback to text while retaining unresolved path. + if state != ArtifactUsable && src.fallback != "" { + fbState, fbContent := checkArtifact(workDir, src.fallback) + if fbState == ArtifactUsable { + checked[i].state = ArtifactUsable + checked[i].content = fbContent + checked[i].lines = sourceLinesForCritical("text", fbContent) + checked[i].path = src.fallback + checked[i].unresolved = []string{src.path} + } else if fbState == ArtifactSkipped { + anySkipped = true + } + } if state == ArtifactSkipped { anySkipped = true } - if state == ArtifactUsable { + if checked[i].state == ArtifactUsable { anyUsable = true } } @@ -106,14 +172,13 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro if src.state != ArtifactUsable { continue } - content := src.content patterns := criticalPatterns if src.lowConfidence { patterns = append(patterns, lowConfidencePatterns...) 
} - for _, line := range strings.Split(content, "\n") { + for _, line := range src.lines { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "#") { continue @@ -125,13 +190,16 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro existing.count++ } else { seen[k] = &criticalEvent{ - pattern: p.Name, - line: line, - severity: p.Severity, - category: p.Category, - confidence: p.Confidence, - count: 1, - source: src.path, + pattern: p.Name, + code: p.Code, + fingerprintKey: p.FingerprintKey, + line: line, + severity: p.Severity, + category: p.Category, + confidence: p.Confidence, + count: 1, + source: src.path, + unresolved: append([]string(nil), src.unresolved...), } eventOrder = append(eventOrder, k) } @@ -186,7 +254,7 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro } findings = append(findings, Finding{ - Code: FindingCriticalLog, + Code: ev.code, Severity: ev.severity, Confidence: ev.confidence, Category: ev.category, @@ -196,7 +264,8 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro SourceArtifacts: []string{ ev.source, }, - Fingerprint: identity.Fingerprint("crit", criticalPatternKey(ev.pattern), criticalSourceClass(ev.source)), + Fingerprint: identity.Fingerprint("crit", ev.fingerprintKey, criticalSourceClass(ev.source)), + UnresolvedArtifactPaths: append([]string(nil), ev.unresolved...), }) textLines = append(textLines, fmt.Sprintf(" [%s] %s (%dx): %s", ev.severity.String(), ev.pattern, ev.count, line)) @@ -216,21 +285,36 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro }, nil } -func criticalPatternKey(pattern string) string { - key := strings.ToLower(strings.TrimSpace(pattern)) - key = strings.ReplaceAll(key, " ", "_") - return key -} - func criticalSourceClass(path string) string { switch path { case "logs/dmesg.txt": return "dmesg" + case "logs/journal_kernel.ndjson": + return "journal_kernel" case 
"logs/journal_kernel.txt": return "journal_kernel" + case "logs/journal_errors.ndjson": + return "journal_errors" case "logs/journal_errors.txt": return "journal_errors" default: return "unknown" } } + +func sourceLinesForCritical(kind, content string) []string { + switch kind { + case "ndjson": + events := parseJournalNDJSON(content) + lines := make([]string, 0, len(events)) + for _, ev := range events { + if strings.TrimSpace(ev.Message) == "" { + continue + } + lines = append(lines, ev.Message) + } + return lines + default: + return strings.Split(content, "\n") + } +} diff --git a/customers/vm-troubleshooting/internal/triage/critical_test.go b/customers/vm-troubleshooting/internal/triage/critical_test.go index 138ed0c..aacf8fb 100644 --- a/customers/vm-troubleshooting/internal/triage/critical_test.go +++ b/customers/vm-troubleshooting/internal/triage/critical_test.go @@ -62,7 +62,7 @@ func TestAnalyzeCriticalLogs_LowConfidence(t *testing.T) { if len(f.SourceArtifacts) != 1 || f.SourceArtifacts[0] != "logs/journal_errors.txt" { t.Errorf("expected source_artifacts to contain journal_errors, got %#v", f.SourceArtifacts) } - wantFP := identity.Fingerprint("crit", criticalPatternKey(f.Title), criticalSourceClass("logs/journal_errors.txt")) + wantFP := identity.Fingerprint("crit", "error_fail", criticalSourceClass("logs/journal_errors.txt")) if f.Fingerprint != wantFP { t.Errorf("unexpected fingerprint: got %q want %q", f.Fingerprint, wantFP) } @@ -183,6 +183,83 @@ func TestAnalyzeCriticalLogs_IgnoresSkippedSourceContentWhenOtherSourceUsable(t } } +func TestAnalyzeCriticalLogs_TextFallbackMarksUnresolvedStructuredPath(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + + // Structured NDJSON is absent; analyzer should fall back to text artifact. 
+ os.WriteFile(filepath.Join(workDir, "logs/journal_errors.txt"), []byte( + "# Command: journalctl\n---\nsystemd[1]: Failed to start some-service.service\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil || len(tr.Findings) == 0 { + t.Fatal("expected findings from text fallback") + } + found := false + for _, f := range tr.Findings { + if f.Title != "Error/Fail" { + continue + } + found = true + if len(f.SourceArtifacts) != 1 || f.SourceArtifacts[0] != "logs/journal_errors.txt" { + t.Fatalf("unexpected source artifacts: %#v", f.SourceArtifacts) + } + if len(f.UnresolvedArtifactPaths) != 1 || f.UnresolvedArtifactPaths[0] != "logs/journal_errors.ndjson" { + t.Fatalf("expected unresolved structured path, got %#v", f.UnresolvedArtifactPaths) + } + } + if !found { + t.Fatal("expected Error/Fail finding from fallback text source") + } +} + +func TestAnalyzeCriticalLogs_NDJSONPrimarySource(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + + // Provide structured NDJSON as the primary source (no text fallback needed). 
+ ndjson := `{"MESSAGE":"kernel panic - not syncing: Fatal exception","PRIORITY":"0","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1000","_BOOT_ID":"b1"}` + "\n" + + `{"MESSAGE":"hardware error detected on CPU 0","PRIORITY":"2","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1001","_BOOT_ID":"b1"}` + "\n" + os.WriteFile(filepath.Join(workDir, "logs/journal_kernel.ndjson"), []byte(ndjson), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil { + t.Fatal("expected non-nil result") + } + + foundPanic := false + foundHW := false + for _, f := range tr.Findings { + switch f.Title { + case "Kernel Panic": + foundPanic = true + if f.SourceArtifacts[0] != "logs/journal_kernel.ndjson" { + t.Errorf("expected NDJSON source, got %v", f.SourceArtifacts) + } + if len(f.UnresolvedArtifactPaths) != 0 { + t.Errorf("NDJSON primary source should have no unresolved paths, got %v", f.UnresolvedArtifactPaths) + } + case "Hardware Error": + foundHW = true + } + } + if !foundPanic { + t.Error("expected Kernel Panic finding from NDJSON source") + } + if !foundHW { + t.Error("expected Hardware Error finding from NDJSON source") + } +} + func TestAnalyzeCriticalLogs_XidNotMatched(t *testing.T) { // Xid is owned by triage/xid.go, not critical.go. 
t.Parallel() @@ -223,3 +300,27 @@ func TestAnalyzeCriticalLogs_Cap(t *testing.T) { t.Errorf("expected at most %d findings, got %d", maxEvents, len(tr.Findings)) } } + +func TestAnalyzeCriticalLogs_DeterministicTiebreakers(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte( + "# Command: dmesg\n---\n"+ + "[1.0] operation timed out on device b\n"+ + "[1.1] operation timed out on device a\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if len(tr.Findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(tr.Findings)) + } + first := tr.Findings[0].Evidence[0] + second := tr.Findings[1].Evidence[0] + if first > second { + t.Fatalf("expected lexicographic tiebreak ordering, got first=%q second=%q", first, second) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/journal_ndjson.go b/customers/vm-troubleshooting/internal/triage/journal_ndjson.go new file mode 100644 index 0000000..529a442 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/journal_ndjson.go @@ -0,0 +1,76 @@ +package triage + +import ( + "encoding/json" + "strconv" + "strings" +) + +// normalizedJournalEvent is intentionally narrow and local to triage analyzers. +// Keep _TRANSPORT available for future filtering/ownership work. 
// ---- internal/triage/journal_ndjson.go ----

// normalizedJournalEvent is a narrow, triage-local view of one journald
// record; only the fields the analyzers consume are kept.
type normalizedJournalEvent struct {
	Message           string
	Priority          string
	SyslogIdentifier  string
	SystemdUnit       string
	Transport         string
	RealtimeTimestamp string
	BootID            string
}

// parseJournalNDJSON converts `journalctl -o json` NDJSON output into
// normalized events.
//
// Skipped input: blank lines, lines that are not valid JSON objects, and
// the collector's `{"_truncated": ...}` sentinel record. Skipping (rather
// than failing) is deliberate — a partially corrupted journal artifact
// should still yield its parseable events.
func parseJournalNDJSON(content string) []normalizedJournalEvent {
	lines := strings.Split(content, "\n")
	events := make([]normalizedJournalEvent, 0, len(lines))
	for _, raw := range lines {
		line := strings.TrimSpace(raw)
		if line == "" {
			continue
		}
		obj := map[string]any{}
		if err := json.Unmarshal([]byte(line), &obj); err != nil {
			continue // malformed line: skip it, do not abort the artifact
		}
		// Skip truncation sentinel lines emitted by the collector.
		if _, ok := obj["_truncated"]; ok {
			continue
		}
		events = append(events, normalizedJournalEvent{
			Message:           triageString(obj["MESSAGE"]),
			Priority:          triageString(obj["PRIORITY"]),
			SyslogIdentifier:  triageString(obj["SYSLOG_IDENTIFIER"]),
			SystemdUnit:       triageString(obj["_SYSTEMD_UNIT"]),
			Transport:         triageString(obj["_TRANSPORT"]),
			RealtimeTimestamp: triageString(obj["__REALTIME_TIMESTAMP"]),
			BootID:            triageString(obj["_BOOT_ID"]),
		})
	}
	return events
}

// triageString coerces a decoded JSON value to a string.
//
// journald emits most fields as JSON strings, but MESSAGE arrives as a
// byte array when the payload is not valid UTF-8, and some producers emit
// bare numbers or booleans.
//
// BUGFIX: the float64 branch previously truncated through int64(t), so a
// fractional value such as 1.5 became "1" and values outside int64 range
// overflowed. strconv.FormatFloat with precision -1 renders integral
// values identically ("3" stays "3") while preserving fractions ("1.5").
func triageString(v any) string {
	switch t := v.(type) {
	case nil:
		return ""
	case string:
		return t
	case float64:
		return strconv.FormatFloat(t, 'f', -1, 64)
	case bool:
		if t {
			return "true"
		}
		return "false"
	case []any:
		// journald byte-array form: every element must decode to a single
		// byte, otherwise the value is treated as unrepresentable.
		// (local renamed from `bytes` to avoid shadowing the stdlib package name)
		buf := make([]byte, 0, len(t))
		for _, item := range t {
			n, ok := item.(float64)
			if !ok || n < 0 || n > 255 {
				return ""
			}
			buf = append(buf, byte(n))
		}
		return string(buf)
	default:
		return ""
	}
}

// ---- internal/triage/journal_ndjson_test.go ----

// TestParseJournalNDJSON_SkipsSentinel verifies the collector's truncation
// sentinel record is not surfaced as an event.
func TestParseJournalNDJSON_SkipsSentinel(t *testing.T) {
	t.Parallel()
	content := `{"MESSAGE":"real event","PRIORITY":"3","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"}
{"_truncated":true,"records_written":1,"reason":"record_limit"}
`
	events := parseJournalNDJSON(content)
	if len(events) != 1 {
		t.Fatalf("expected 1 event (sentinel should be skipped), got %d", len(events))
	}
	if events[0].Message != "real event" {
		t.Errorf("expected 'real event', got %q", events[0].Message)
	}
}

// TestParseJournalNDJSON_HandlesEmptyAndMalformed verifies blank and
// non-JSON lines are skipped rather than aborting the parse.
func TestParseJournalNDJSON_HandlesEmptyAndMalformed(t *testing.T) {
	t.Parallel()
	content := `
not valid json
{"MESSAGE":"ok","PRIORITY":"6"}
`
	events := parseJournalNDJSON(content)
	if len(events) != 1 {
		t.Fatalf("expected 1 event (malformed and empty skipped), got %d", len(events))
	}
	if events[0].Message != "ok" {
		t.Errorf("expected 'ok', got %q", events[0].Message)
	}
}

// TestParseJournalNDJSON_ByteArrayMessage verifies the journald byte-array
// MESSAGE form is decoded back to text.
func TestParseJournalNDJSON_ByteArrayMessage(t *testing.T) {
	t.Parallel()
	// journalctl can emit MESSAGE as byte array instead of string.
	content := `{"MESSAGE":[104,101,108,108,111],"PRIORITY":"3"}
`
	events := parseJournalNDJSON(content)
	if len(events) != 1 {
		t.Fatalf("expected 1 event, got %d", len(events))
	}
	if events[0].Message != "hello" {
		t.Errorf("expected byte array decoded to 'hello', got %q", events[0].Message)
	}
}
+ UnresolvedArtifactPaths []string `json:"-"` } type FindingCode string diff --git a/customers/vm-troubleshooting/internal/triage/xid.go b/customers/vm-troubleshooting/internal/triage/xid.go index d38ae76..292415e 100644 --- a/customers/vm-troubleshooting/internal/triage/xid.go +++ b/customers/vm-troubleshooting/internal/triage/xid.go @@ -11,48 +11,42 @@ import ( "github.com/NexGenCloud/vm-diagnostics/internal/collector" "github.com/NexGenCloud/vm-diagnostics/internal/identity" + "github.com/NexGenCloud/vm-diagnostics/internal/triage/xidcatalog" ) -// XidEntry describes a known Xid error code. -// Based on NVIDIA Xid Catalog r590, with team policy overrides documented inline. -type XidEntry struct { - Code int - Name string - Description string - Severity collector.Severity - RequiresReset bool - Action string +// xidPolicy keeps local support policy separate from neutral catalog data. +type xidPolicy struct { + severity collector.Severity + requiresReset bool + action string } -// xidCatalog contains the 24 datacenter-relevant Xid codes. 
-var xidCatalog = map[int]XidEntry{ - 13: {13, "GR_EXCEPTION", "Graphics engine exception", collector.SeverityWarning, false, "Restart application; run under cuda-gdb or compute sanitizer if recurring"}, - 31: {31, "MMU_ERR_FLT", "GPU memory page fault", collector.SeverityWarning, false, "Restart application; check for out-of-bounds access"}, - 43: {43, "RESETCHANNEL_VERIF_ERROR", "GPU stopped processing (software fault)", collector.SeverityInfo, false, "Usually benign — application terminated, GPU healthy"}, - 45: {45, "PREEMPTIVE_REMOVAL", "Preemptive cleanup due to prior error", collector.SeverityInfo, false, "Secondary effect of another Xid; check preceding errors"}, - 46: {46, "GPU_TIMEOUT_ERROR", "GPU stopped processing (timeout)", collector.SeverityCritical, true, "Reset GPU; contact support if recurring"}, - 48: {48, "GPU_ECC_DBE", "Double-bit ECC error (uncorrectable)", collector.SeverityCritical, true, "Reset GPU; check SRAM DBE thresholds for RMA"}, - 56: {56, "DISPLAY_CHANNEL_EXCEPTION", "Display channel exception (unused in datacenter)", collector.SeverityInfo, false, "Contact support"}, - 57: {57, "FB_LINK_TRAINING_FAILURE", "Framebuffer link training failure (unused)", collector.SeverityInfo, false, "Contact support"}, - // Xid 61/62: historically swapped in some references. 61=breakpoint (unused), 62=halt (critical). - 61: {61, "PMU_BREAKPOINT", "PMU breakpoint (unused)", collector.SeverityInfo, false, "Contact support"}, - 62: {62, "PMU_HALT_ERROR", "Internal micro-controller halt", collector.SeverityCritical, true, "Reset GPU; contact support"}, - // Xid 63/64: 63=benign remap event (INFO), 64=remap failure (CRITICAL). 
- 63: {63, "DRAM_RETIREMENT_EVENT", "GPU memory remapping event (benign)", collector.SeverityInfo, false, "No action required"}, - 64: {64, "DRAM_RETIREMENT_FAILURE", "GPU memory remapping failure", collector.SeverityCritical, true, "Reset GPU; contact support for potential RMA"}, - 68: {68, "NVDEC0_ERROR", "NVDEC0 exception", collector.SeverityWarning, false, "Restart application"}, - 69: {69, "GR_CLASS_ERROR", "Graphics engine class error", collector.SeverityWarning, false, "Restart application; check CUDA code"}, - 74: {74, "NVLINK_ERROR", "NVLink error", collector.SeverityCritical, true, "Reset GPU or reboot node; contact support"}, - 79: {79, "GPU_FALLEN_OFF_BUS", "GPU has fallen off the bus", collector.SeverityCritical, true, "Reboot node; check PCIe seating and power"}, - 92: {92, "EXCESSIVE_SBE_INTERRUPTS", "High single-bit ECC error rate", collector.SeverityWarning, false, "Monitor; contact support if persistent"}, - 94: {94, "CONTAINED_ERROR", "Contained memory error (app-local)", collector.SeverityWarning, false, "Restart affected application; reset GPU when convenient"}, - 95: {95, "UNCONTAINED_ERROR", "Uncontained memory error (all apps affected)", collector.SeverityCritical, true, "Reset GPU before restarting any application"}, - 109: {109, "CTXSW_TIMEOUT_ERROR", "Context switch timeout", collector.SeverityCritical, true, "Reset GPU; contact support"}, - // Xid 119: CRITICAL (GSP RPC timeout — requires GPU reset). - 119: {119, "GSP_RPC_TIMEOUT", "GSP RPC timeout", collector.SeverityCritical, true, "Reset GPU or power cycle node"}, - 120: {120, "GSP_ERROR", "GSP error", collector.SeverityCritical, true, "Reset GPU or power cycle node"}, - 150: {150, "NVLINK_MSE_ERROR", "NVLink MSE error", collector.SeverityCritical, true, "Follow NVLink5 error workflow"}, - 154: {154, "GPU_RECOVERY_ACTION", "GPU recovery action changed", collector.SeverityInfo, false, "Informational — see accompanying Xid"}, +// Local policy overrides and support guidance. 
// xidPolicies maps an Xid code to the locally-owned support policy:
// severity, whether a GPU reset is required, and operator guidance.
// Neutral catalog metadata (name/description) lives in
// internal/triage/xidcatalog; codes present in the catalog but absent
// here fall back to a default warning-severity / "Contact support"
// policy at event-build time.
var xidPolicies = map[int]xidPolicy{
	13:  {collector.SeverityWarning, false, "Restart application; run under cuda-gdb or compute sanitizer if recurring"},
	31:  {collector.SeverityWarning, false, "Restart application; check for out-of-bounds access"},
	43:  {collector.SeverityInfo, false, "Usually benign — application terminated, GPU healthy"},
	45:  {collector.SeverityInfo, false, "Secondary effect of another Xid; check preceding errors"},
	46:  {collector.SeverityCritical, true, "Reset GPU; contact support if recurring"},
	48:  {collector.SeverityCritical, true, "Reset GPU; check SRAM DBE thresholds for RMA"},
	56:  {collector.SeverityInfo, false, "Contact support"},
	57:  {collector.SeverityInfo, false, "Contact support"},
	61:  {collector.SeverityInfo, false, "Contact support"},
	62:  {collector.SeverityCritical, true, "Reset GPU; contact support"},
	63:  {collector.SeverityInfo, false, "No action required"},
	64:  {collector.SeverityCritical, true, "Reset GPU; contact support for potential RMA"},
	68:  {collector.SeverityWarning, false, "Restart application"},
	69:  {collector.SeverityWarning, false, "Restart application; check CUDA code"},
	74:  {collector.SeverityCritical, true, "Reset GPU or reboot node; contact support"},
	79:  {collector.SeverityCritical, true, "Reboot node; check PCIe seating and power"},
	92:  {collector.SeverityWarning, false, "Monitor; contact support if persistent"},
	94:  {collector.SeverityWarning, false, "Restart affected application; reset GPU when convenient"},
	95:  {collector.SeverityCritical, true, "Reset GPU before restarting any application"},
	109: {collector.SeverityCritical, true, "Reset GPU; contact support"},
	119: {collector.SeverityCritical, true, "Reset GPU or power cycle node"},
	120: {collector.SeverityCritical, true, "Reset GPU or power cycle node"},
	150: {collector.SeverityCritical, true, "Follow NVLink5 error workflow"},
	154: {collector.SeverityInfo, false, "Informational — see accompanying Xid"},
}
Xid 154 recovery action map: action text → (severity, requires_reset). @@ -82,21 +76,9 @@ type XidEvent struct { } var ( - xidRe = regexp.MustCompile(`NVRM:\s*Xid\s*\(PCI:([^)]+)\):\s*(\d+)`) - sxidRe = regexp.MustCompile(`NVRM:\s*SXid\s*\(PCI:([^)]+)\):\s*(\d+)`) xid154ActRe = regexp.MustCompile(`(?i)to\s+0x[0-9a-fA-F]+\s*\(([^)]+)\)`) ) -// normalizeBDF strips the domain prefix (0000:) and lowercases. -func normalizeBDF(bdf string) string { - bdf = strings.ToLower(strings.TrimSpace(bdf)) - if parts := strings.SplitN(bdf, ":", 2); len(parts) == 2 && len(parts[0]) == 4 { - // Strip "0000:" domain prefix - bdf = parts[1] - } - return bdf -} - // parseXidEvents extracts Xid/SXid events from dmesg text. func parseXidEvents(dmesg string) []XidEvent { type key struct { @@ -108,10 +90,7 @@ func parseXidEvents(dmesg string) []XidEvent { recoveryActions := make(map[key]string) // Xid 154 recovery action text for _, line := range strings.Split(dmesg, "\n") { - // Try Xid match - if m := xidRe.FindStringSubmatch(line); m != nil { - bdf := normalizeBDF(m[1]) - code, _ := strconv.Atoi(m[2]) + if code, bdf, ok := xidcatalog.ParseKernelLine(line); ok { k := key{bdf, code, false} counts[k]++ @@ -122,10 +101,7 @@ func parseXidEvents(dmesg string) []XidEvent { } } } - // Try SXid match - if m := sxidRe.FindStringSubmatch(line); m != nil { - bdf := normalizeBDF(m[1]) - code, _ := strconv.Atoi(m[2]) + if code, bdf, ok := xidcatalog.ParseKernelSXidLine(line); ok { k := key{bdf, code, true} counts[k]++ } @@ -133,19 +109,20 @@ func parseXidEvents(dmesg string) []XidEvent { var events []XidEvent for k, count := range counts { - entry, known := xidCatalog[k.code] - sev := collector.SeverityWarning - requiresReset := false + info, known := xidcatalog.Lookup(k.code) + policy := xidPolicy{ + severity: collector.SeverityWarning, + requiresReset: false, + action: "Contact support", + } + if override, ok := xidPolicies[k.code]; ok { + policy = override + } name := fmt.Sprintf("Unknown Xid %d", 
k.code) desc := "Unknown Xid error" - action := "Contact support" - if known { - sev = entry.Severity - requiresReset = entry.RequiresReset - name = entry.Name - desc = entry.Description - action = entry.Action + name = info.Name + desc = info.Description } if k.isSXid { @@ -159,9 +136,9 @@ func parseXidEvents(dmesg string) []XidEvent { IsSXid: k.isSXid, Name: name, Description: desc, - Severity: sev, - RequiresReset: requiresReset, - Action: action, + Severity: policy.severity, + RequiresReset: policy.requiresReset, + Action: policy.action, Count: count, } @@ -195,18 +172,39 @@ func parseXidEvents(dmesg string) []XidEvent { // AnalyzeXid reads dmesg artifacts and classifies Xid/SXid errors. func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { - // Try primary source, then fallback - sources := []string{"logs/dmesg.txt", "nvidia/xid_errors.txt"} + // Try primary source, then structured journal, then collector fallback artifact. + sources := []struct { + path string + kind string + }{ + {path: "logs/dmesg.txt", kind: "text"}, + {path: "logs/journal_kernel.ndjson", kind: "ndjson"}, + {path: "nvidia/xid_errors.txt", kind: "text"}, + } var dmesg string var sourceArtifact string anySkipped := false for _, src := range sources { - state, content := checkArtifact(workDir, src) + state, content := checkArtifact(workDir, src.path) switch state { case ArtifactUsable: if dmesg == "" { + if src.kind == "ndjson" { + events := parseJournalNDJSON(content) + lines := make([]string, 0, len(events)) + for _, ev := range events { + if strings.TrimSpace(ev.Message) == "" { + continue + } + lines = append(lines, ev.Message) + } + content = strings.Join(lines, "\n") + } + if strings.TrimSpace(content) == "" { + continue + } dmesg = content - sourceArtifact = src + sourceArtifact = src.path } case ArtifactSkipped: anySkipped = true diff --git a/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go 
b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go index a93599e..d7eeafa 100644 --- a/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go +++ b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go @@ -4,6 +4,7 @@ import ( "context" "os" "path/filepath" + "slices" "testing" "github.com/NexGenCloud/vm-diagnostics/internal/identity" @@ -88,3 +89,41 @@ func TestAnalyzeXid_ReturnsUnavailableWhenOnlySkippedSourcesExist(t *testing.T) t.Fatalf("expected no findings for skipped input, got %d", len(tr.Findings)) } } + +func TestAnalyzeXid_FingerprintsStableAcrossRuns(t *testing.T) { + t.Parallel() + + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 0o755); err != nil { + t.Fatal(err) + } + content := "# Command: dmesg\n---\n" + + "[1000.0] NVRM: Xid (PCI:0000:3b:00): 79, pid=1\n" + + "[1001.0] NVRM: Xid (PCI:0000:86:00): 79, pid=2\n" + if err := os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + getFingerprints := func() []string { + t.Helper() + tr, err := AnalyzeXid(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil { + t.Fatal("expected non-nil result") + } + fps := make([]string, 0, len(tr.Findings)) + for _, f := range tr.Findings { + fps = append(fps, f.Fingerprint) + } + slices.Sort(fps) + return fps + } + + first := getFingerprints() + second := getFingerprints() + if !slices.Equal(first, second) { + t.Fatalf("fingerprints not stable across runs: first=%v second=%v", first, second) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/xid_test.go b/customers/vm-troubleshooting/internal/triage/xid_test.go index df21295..96ed468 100644 --- a/customers/vm-troubleshooting/internal/triage/xid_test.go +++ b/customers/vm-troubleshooting/internal/triage/xid_test.go @@ -1,6 +1,9 @@ package triage import ( + "context" + "os" + "path/filepath" "strings" "testing" @@ -112,6 +115,42 @@ func 
TestParseXidEvents_UnknownCode(t *testing.T) { } } +func TestParseXidEvents_KnownCatalogCodeUsesDefaultPolicy(t *testing.T) { + t.Parallel() + // 121 exists in the catalog, but has no explicit local policy override. + dmesg := `[1000.0] NVRM: Xid (PCI:0000:3b:00): 121, pid=0 +` + events := parseXidEvents(dmesg) + if len(events) != 1 { + t.Fatalf("expected 1 event, got %d", len(events)) + } + ev := events[0] + if ev.Name != "C2C_LINK_ERROR" { + t.Fatalf("expected catalog name C2C_LINK_ERROR, got %q", ev.Name) + } + if ev.Severity != collector.SeverityWarning { + t.Fatalf("expected default warning severity, got %s", ev.Severity) + } + if ev.Action != "Contact support" { + t.Fatalf("expected default action, got %q", ev.Action) + } +} + +func TestParseXidEvents_DeterministicTiebreakers(t *testing.T) { + t.Parallel() + dmesg := `[1000.0] NVRM: Xid (PCI:0000:86:00): 79, pid=0 +[1001.0] NVRM: Xid (PCI:0000:3b:00): 79, pid=0 +` + events := parseXidEvents(dmesg) + if len(events) != 2 { + t.Fatalf("expected 2 events, got %d", len(events)) + } + // Same severity/count => BDF ASC tiebreaker. + if events[0].BDF != "3b:00" || events[1].BDF != "86:00" { + t.Fatalf("unexpected ordering: %+v", events) + } +} + func TestParseXidEvents_Empty(t *testing.T) { t.Parallel() events := parseXidEvents("") @@ -119,3 +158,31 @@ func TestParseXidEvents_Empty(t *testing.T) { t.Errorf("expected 0 events from empty dmesg, got %d", len(events)) } } + +func TestAnalyzeXid_NDJSONSource(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + + // No dmesg.txt — Xid analyzer should fall back to journal_kernel.ndjson. 
+ ndjson := `{"MESSAGE":"[12345.0] NVRM: Xid (PCI:0000:3b:00): 79, pid=1234, name=python3","PRIORITY":"3","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1000","_BOOT_ID":"b1"}` + "\n" + os.WriteFile(filepath.Join(workDir, "logs/journal_kernel.ndjson"), []byte(ndjson), 0o644) + + tr, err := AnalyzeXid(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil || len(tr.Findings) == 0 { + t.Fatal("expected Xid findings from NDJSON source") + } + f := tr.Findings[0] + if f.Code != FindingXid { + t.Errorf("expected code %q, got %q", FindingXid, f.Code) + } + if f.Severity != collector.SeverityCritical { + t.Errorf("Xid 79 should be critical, got %s", f.Severity) + } + if len(f.SourceArtifacts) != 1 || f.SourceArtifacts[0] != "logs/journal_kernel.ndjson" { + t.Errorf("expected NDJSON source artifact, got %v", f.SourceArtifacts) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md b/customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md new file mode 100644 index 0000000..dbc5a42 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md @@ -0,0 +1,16 @@ +# Xid Catalog Source Notes + +This package keeps a local, minimal Xid catalog and parser surface for triage. + +Upstream reference used for content curation: +- NVIDIA Xid documentation and datacenter guidance (r590 era) + +Local policy boundary: +- `internal/triage/xidcatalog` stores neutral catalog metadata and kernel line parsing. +- `internal/triage/xid.go` owns local support policy (severity, reset guidance, actions, and operational overrides such as Xid 154). + +Update flow: +1. Review upstream Xid docs for added/changed codes. +2. Update `catalog.go` entries (name + neutral description only). +3. Keep or adjust local policy in `xid.go` only when support policy changes. +4. Run `gofmt`, `go test`, `go vet`, and `CGO_ENABLED=0 go build ./cmd/gather-info`. 
// ---- internal/triage/xidcatalog/catalog.go (new file) ----

// Package xidcatalog holds neutral NVIDIA Xid metadata (code, name,
// description) and kernel-log parsing. Support policy — severity, reset
// guidance, operator actions — deliberately lives in internal/triage,
// not here (see UPSTREAM.md for the boundary and update flow).
package xidcatalog

import (
	"regexp"
	"strconv"
	"strings"
)

// XidInfo is neutral catalog metadata (not support policy).
type XidInfo struct {
	Code        int
	Name        string
	Description string
}

// catalog is the curated code table. Entries carry name and neutral
// description only; do not add severity or action text here.
var catalog = map[int]XidInfo{
	8:   {Code: 8, Name: "GPU_CHANNEL_TIMEOUT", Description: "GPU channel timeout detected"},
	9:   {Code: 9, Name: "DRIVER_ERROR", Description: "Driver error detected"},
	10:  {Code: 10, Name: "GPU_CONTEXT_SWITCH_TIMEOUT", Description: "GPU context switch timeout"},
	11:  {Code: 11, Name: "PBDMA_ERROR", Description: "PBDMA pushbuffer DMA error"},
	12:  {Code: 12, Name: "GRAPHICS_ENGINE_EXCEPTION", Description: "Graphics engine exception"},
	13:  {Code: 13, Name: "GR_EXCEPTION", Description: "Graphics engine exception"},
	14:  {Code: 14, Name: "GRAPHICS_SM_EXCEPTION", Description: "Graphics SM exception"},
	15:  {Code: 15, Name: "GRAPHICS_SW_NOTIFY", Description: "Graphics software notification error"},
	16:  {Code: 16, Name: "GRAPHICS_CLASS_ERROR", Description: "Graphics class error"},
	17:  {Code: 17, Name: "GRAPHICS_METHOD_ERROR", Description: "Graphics method error"},
	18:  {Code: 18, Name: "GRAPHICS_FIRMWARE_ERROR", Description: "Graphics firmware error"},
	19:  {Code: 19, Name: "GRAPHICS_EXCEPTION", Description: "Graphics exception"},
	20:  {Code: 20, Name: "GPU_DMA_PUSHER_ERROR", Description: "DMA pusher error"},
	21:  {Code: 21, Name: "GPU_DMA_FETCH_ERROR", Description: "DMA fetch error"},
	22:  {Code: 22, Name: "GPU_DMA_SEMAPHORE_ERROR", Description: "DMA semaphore error"},
	23:  {Code: 23, Name: "GPU_DMA_ILLEGAL_METHOD", Description: "Illegal DMA method"},
	24:  {Code: 24, Name: "GPU_CHANNEL_ERROR", Description: "GPU channel error"},
	25:  {Code: 25, Name: "GPU_PCI_ERROR", Description: "PCIe transaction error"},
	26:  {Code: 26, Name: "GPU_MEMORY_TRANSFER_ERROR", Description: "Memory transfer error"},
	27:  {Code: 27, Name: "GPU_DISPLAY_ERROR", Description: "Display subsystem error"},
	28:  {Code: 28, Name: "GPU_FIRMWARE_TIMEOUT", Description: "Firmware timeout"},
	29:  {Code: 29, Name: "GPU_FIRMWARE_COMM_ERROR", Description: "Firmware communication error"},
	30:  {Code: 30, Name: "GPU_FIRMWARE_EXCEPTION", Description: "Firmware exception"},
	31:  {Code: 31, Name: "MMU_ERR_FLT", Description: "GPU memory page fault"},
	43:  {Code: 43, Name: "RESETCHANNEL_VERIF_ERROR", Description: "GPU stopped processing (software fault)"},
	45:  {Code: 45, Name: "PREEMPTIVE_REMOVAL", Description: "Preemptive cleanup due to prior error"},
	46:  {Code: 46, Name: "GPU_TIMEOUT_ERROR", Description: "GPU stopped processing (timeout)"},
	48:  {Code: 48, Name: "GPU_ECC_DBE", Description: "Double-bit ECC error (uncorrectable)"},
	56:  {Code: 56, Name: "DISPLAY_CHANNEL_EXCEPTION", Description: "Display channel exception"},
	57:  {Code: 57, Name: "FB_LINK_TRAINING_FAILURE", Description: "Framebuffer link training failure"},
	61:  {Code: 61, Name: "PMU_BREAKPOINT", Description: "PMU breakpoint"},
	62:  {Code: 62, Name: "PMU_HALT_ERROR", Description: "Internal micro-controller halt"},
	63:  {Code: 63, Name: "DRAM_RETIREMENT_EVENT", Description: "GPU memory remapping event"},
	64:  {Code: 64, Name: "DRAM_RETIREMENT_FAILURE", Description: "GPU memory remapping failure"},
	68:  {Code: 68, Name: "NVDEC0_ERROR", Description: "NVDEC0 exception"},
	69:  {Code: 69, Name: "GR_CLASS_ERROR", Description: "Graphics engine class error"},
	74:  {Code: 74, Name: "NVLINK_ERROR", Description: "NVLink error"},
	79:  {Code: 79, Name: "GPU_FALLEN_OFF_BUS", Description: "GPU has fallen off the bus"},
	92:  {Code: 92, Name: "EXCESSIVE_SBE_INTERRUPTS", Description: "High single-bit ECC error rate"},
	94:  {Code: 94, Name: "CONTAINED_ERROR", Description: "Contained memory error (app-local)"},
	95:  {Code: 95, Name: "UNCONTAINED_ERROR", Description: "Uncontained memory error (all apps affected)"},
	109: {Code: 109, Name: "CTXSW_TIMEOUT_ERROR", Description: "Context switch timeout"},
	119: {Code: 119, Name: "GSP_RPC_TIMEOUT", Description: "GSP RPC timeout"},
	120: {Code: 120, Name: "GSP_ERROR", Description: "GSP error"},
	121: {Code: 121, Name: "C2C_LINK_ERROR", Description: "Chip-to-chip link error"},
	137: {Code: 137, Name: "NVLINK_FLA_PRIV_ERROR", Description: "NVLink fabric address fault"},
	140: {Code: 140, Name: "NVLINK_SUBLINK_ERROR", Description: "NVLink sublink error"},
	143: {Code: 143, Name: "NVLINK_FATAL_ERROR", Description: "NVLink fatal error"},
	150: {Code: 150, Name: "NVLINK_MSE_ERROR", Description: "NVLink MSE error"},
	154: {Code: 154, Name: "GPU_RECOVERY_ACTION", Description: "GPU recovery action changed"},
}

// Kernel log line shapes: `NVRM: Xid (PCI:<bdf>): <code>, ...` and the
// switch-side `NVRM: SXid (PCI:<bdf>): <code>, ...` variant.
var (
	xidRe  = regexp.MustCompile(`NVRM:\s*Xid\s*\(PCI:([^)]+)\):\s*(\d+)`)
	sxidRe = regexp.MustCompile(`NVRM:\s*SXid\s*\(PCI:([^)]+)\):\s*(\d+)`)
)

// Lookup returns neutral catalog metadata for a code.
func Lookup(code int) (XidInfo, bool) {
	info, ok := catalog[code]
	return info, ok
}

// Size returns catalog entry count for tests.
func Size() int {
	return len(catalog)
}

// ParseKernelLine parses a kernel Xid log line.
// Returns ok=false when the line is not an Xid line or the code is not a
// valid integer. The BDF is normalized via normalizeBDF — assumed to be
// defined later in this file (moved here from triage/xid.go); the "strings"
// import above is presumably consumed there. TODO(review): confirm.
func ParseKernelLine(line string) (code int, bdf string, ok bool) {
	m := xidRe.FindStringSubmatch(line)
	if m == nil {
		return 0, "", false
	}
	n, err := strconv.Atoi(m[2])
	if err != nil {
		return 0, "", false
	}
	return n, normalizeBDF(m[1]), true
}

// ParseKernelSXidLine parses a kernel SXid log line.
+func ParseKernelSXidLine(line string) (code int, bdf string, ok bool) { + m := sxidRe.FindStringSubmatch(line) + if m == nil { + return 0, "", false + } + n, err := strconv.Atoi(m[2]) + if err != nil { + return 0, "", false + } + return n, normalizeBDF(m[1]), true +} + +func normalizeBDF(bdf string) string { + bdf = strings.ToLower(strings.TrimSpace(bdf)) + if parts := strings.SplitN(bdf, ":", 2); len(parts) == 2 && len(parts[0]) == 4 { + bdf = parts[1] + } + return bdf +} diff --git a/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_test.go b/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_test.go new file mode 100644 index 0000000..40d14eb --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_test.go @@ -0,0 +1,38 @@ +package xidcatalog + +import "testing" + +func TestLookupCoverageGrowth(t *testing.T) { + t.Parallel() + // Phase 4 expands coverage beyond the prior 24 hand-maintained entries. + if got := Size(); got <= 24 { + t.Fatalf("expected catalog size > 24, got %d", got) + } +} + +func TestParseKernelLine_NormalizesBDF(t *testing.T) { + t.Parallel() + line := `NVRM: Xid (PCI:0000:3B:00): 79, pid=1234, name=python3` + code, bdf, ok := ParseKernelLine(line) + if !ok { + t.Fatal("expected ParseKernelLine to match") + } + if code != 79 { + t.Fatalf("expected code 79, got %d", code) + } + if bdf != "3b:00" { + t.Fatalf("expected normalized bdf 3b:00, got %q", bdf) + } +} + +func TestParseKernelSXidLine(t *testing.T) { + t.Parallel() + line := `NVRM: SXid (PCI:0000:86:00): 31, pid=0` + code, bdf, ok := ParseKernelSXidLine(line) + if !ok { + t.Fatal("expected ParseKernelSXidLine to match") + } + if code != 31 || bdf != "86:00" { + t.Fatalf("unexpected parse result code=%d bdf=%q", code, bdf) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh b/customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh new file mode 100644 index 0000000..064b97c --- 
/dev/null +++ b/customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Manual sync helper for xidcatalog package. +# Usage: +# 1) Review current NVIDIA Xid docs and identify changed codes. +# 2) Edit catalog.go entries (neutral metadata only). +# 3) Keep support policy changes in ../xid.go. +# 4) Run repository verification commands. + +echo "xidcatalog sync is a manually curated process; see UPSTREAM.md" diff --git a/docs/architecture.md b/docs/architecture.md index 0753256..06487ae 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -35,7 +35,8 @@ runner/runner.go Orchestration hub | '-- infiniband ibstat, rdma tools | |-- triage/ Post-collection analysis (after collectors, before output) - | |-- xid NVIDIA Xid/SXid classification (24-code catalog) + | |-- xid NVIDIA Xid/SXid classification (catalog + local policy overrides) + | |-- xidcatalog Local Xid catalog/parsing boundary (neutral data) | |-- firewall Firewall posture detection (iptables/ufw/nft/firewalld) | '-- critical Critical log extraction (panic, HW error, fallen off bus, timeout) | @@ -60,7 +61,9 @@ runner/runner.go Orchestration hub | `internal/config` | Config struct, modes, timeouts, exit codes, build metadata | stdlib only | | `internal/runner` | Orchestration: detect → collect → triage → output → archive | all internal packages | | `internal/collector` | Collector interface, registry, core types (Issue, Severity, ArtifactRecord) | executor, output, ui, platform, probe, sanitize | -| `internal/triage` | Post-collection analysis (Xid, firewall, critical logs) | collector (types), output (writer), ui | +| `internal/triage` | Post-collection analysis (Xid, firewall, critical logs) | collector (types), identity, output (writer), ui | +| `internal/triage/xidcatalog` | Neutral Xid catalog lookup + kernel line parsers | stdlib only | +| `internal/identity` | Stable issue fingerprint helper (normalized tuple hashing) 
| stdlib only | | `internal/output` | Writer, manifest, report, summary, archive creation | executor, schemas | | `internal/transfer` | IP discovery, floating IP detection, SCP commands | netlink | | `internal/executor` | Subprocess execution: timeouts, process groups, capture limits | stdlib only | @@ -87,10 +90,14 @@ Explicit integer values (not iota). MarshalJSON/UnmarshalJSON serialize as strin ### `collector.Issue` ```go -Severity Severity -Category string // "GPU", "SVC", "MEM", "DISK", "FW", "LOG", etc. -Message string -Hidden bool // omitted from SUMMARY.txt, present in manifest/report +Code IssueCode +Severity Severity +Confidence Confidence // "high" or "low" +Category string // "GPU", "SVC", "MEM", "DISK", "FW", etc. +Message string +Fingerprint string +RelatedArtifactPaths []string +UnresolvedArtifactPaths []string ``` ### `collector.ArtifactRecord` @@ -103,7 +110,7 @@ Aggregated per-collector output: ID, name, issues, facts (`map[string]string`), ### `triage.Finding` -Richer than Issue: includes severity, category, title, description, recommended action, and evidence lines. Findings are converted to synthetic issues for the manifest. +Richer than Issue: includes `code`, `severity`, `confidence`, title/description/action, evidence, source artifact paths, and issue fingerprint. Findings are converted to synthetic issues by the runner bridge. ## Machine-Readable Output @@ -112,6 +119,7 @@ Richer than Issue: includes severity, category, title, description, recommended The primary machine-readable file. 
Contains: - **`artifact_index[]`** — flat list of every collector-produced file with SHA-256, size, parser hint, tags - **`collectors{}`** — per-collector summary with status, duration, facts (typed), issues, skipped reasons, errors +- issue records include `code`, `severity`, `confidence`, `message`, `issue_fingerprint`, and path linkage (`related_artifact_paths`, `unresolved_artifact_paths`) - **`platform{}`** — OS and kernel - Schema version, archive ID, tool version, generation timestamp @@ -132,9 +140,9 @@ Wire rules (per NDJSON spec v1.0.0): UTF-8, `\n` delimited, no internal newlines ### `triage/_data/*.json` — Analysis Detail Three JSON files with rich finding detail: -- `gpu_health.json` — Xid/SXid events with code, BDF, severity, count, action, evidence +- `gpu_health.json` — Xid/SXid findings with `code`, `confidence`, fingerprint, source artifacts, and typed facts - `firewall_posture.json` — posture classification, per-tool results -- `critical_events.json` — critical log events with pattern, severity, evidence +- `critical_events.json` — critical log findings with explicit pattern metadata and deterministic fingerprints ### `metadata.json` — Execution Summary @@ -143,6 +151,7 @@ Lightweight backward-compatible summary: version, flags, per-collector counts (a ### `SUMMARY.txt` — Human Report Text report with issues grouped by severity (CRITICAL → WARNING → INFO), system/hardware/GPU summaries, collector status table, and archive contents listing. +Only `confidence=high` issues are shown in `SUMMARY.txt`; low-confidence issues remain available in machine-readable outputs. 
## Controlled Vocabularies From f295418616aa80a6bf0607d0d1b85b8887673745 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Wed, 8 Apr 2026 20:24:10 +0200 Subject: [PATCH 05/23] gather-info v0.2.0: fix skipped artifact placeholders, fingerprint uniqueness, and report completeness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Write placeholder files for skipped artifacts so skip_reasons never reference paths missing from the archive (common.go saveSkippedArtifact) - Fix critical event fingerprints: include log line content in the hash so distinct findings produce unique fingerprints for the UI - Add skip_count and error_count to report.ndjson collector_summary records - Fix journalctl --until to use local time format instead of UTC/RFC3339 - Rename nowUTC → nowFunc to reflect local-time semantics - Add verbose logging for structured journal probe paths - Strip git tag namespace prefix from version display in info box - Bump version to 0.2.0 --- .../internal/collector/collector_test.go | 4 +- .../internal/collector/common.go | 45 +++++++++--- .../internal/collector/common_skip_test.go | 68 +++++++++++++++++++ .../internal/collector/journal.go | 19 ++++-- .../internal/collector/journal_phase3_test.go | 18 +++-- .../internal/collector/network.go | 7 +- .../internal/collector/network_skip_test.go | 55 +++++++++++++++ .../internal/config/version.go | 2 +- .../output/archive_consistency_test.go | 44 +++++++++++- .../internal/output/report.go | 4 ++ .../internal/output/report_test.go | 20 +++++- .../internal/runner/runner.go | 5 +- .../internal/triage/critical.go | 2 +- .../internal/triage/critical_test.go | 2 +- .../schemas/report-record.schema.json | 2 + 15 files changed, 266 insertions(+), 31 deletions(-) create mode 100644 customers/vm-troubleshooting/internal/collector/common_skip_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/network_skip_test.go diff --git 
a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index d08c017..b726708 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -117,7 +117,7 @@ func TestJournalCollectorTreatsGrepExitCode1AsNoMatches(t *testing.T) { fake.RootAccess = true fake.Binaries["journalctl"] = true fake.Binaries["dmesg"] = true - const until = "2026-04-08T10:00:00Z" + const until = "2026-04-08 10:00:00" base := "-b --no-pager --until=" + until // Provide responses for all journalctl commands that run before the OOM grep for _, args := range []string{ @@ -142,7 +142,7 @@ func TestJournalCollectorTreatsGrepExitCode1AsNoMatches(t *testing.T) { root := t.TempDir() collector := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) - collector.nowUTC = func() time.Time { return time.Date(2026, 4, 8, 10, 0, 0, 0, time.UTC) } + collector.nowFunc = func() time.Time { return time.Date(2026, 4, 8, 10, 0, 0, 0, time.UTC) } res, err := collector.Collect(context.Background()) if err != nil { t.Fatalf("Collect failed: %v", err) diff --git a/customers/vm-troubleshooting/internal/collector/common.go b/customers/vm-troubleshooting/internal/collector/common.go index fa3985c..dceb66b 100644 --- a/customers/vm-troubleshooting/internal/collector/common.go +++ b/customers/vm-troubleshooting/internal/collector/common.go @@ -18,6 +18,36 @@ type Base struct { UI ui.UI } +func (b Base) saveSkippedArtifact(r *CollectorResult, path, artifactType, command, source string, reason SkipCode, detail string, tags ...string) string { + b.UI.Verbose(fmt.Sprintf(" skip-artifact: %s -> %s", detail, path)) + if err := ValidateTagsAndHint("text", tags); err != nil { + r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", path, err), path) + return "" + } + if err := b.Writer.ReservePath(path); err != nil { + 
r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", path, err), path) + return "" + } + content := fmt.Sprintf("[SKIPPED %s] %s\n", reason, detail) + if err := b.Writer.SaveOutput(path, content); err != nil { + b.Writer.ReleasePath(path) + r.RecordErrorForArtifact(ErrArtifactWrite, fmt.Sprintf("%s: %v", path, err), path) + return "" + } + r.Artifacts = append(r.Artifacts, ArtifactRecord{ + Path: path, + Type: artifactType, + Command: command, + Source: source, + ExitCode: 0, + Status: "skipped", + Tags: sortedTags(tags), + ParserHint: "text", + }) + r.RecordSkipForArtifact(reason, detail, path) + return path +} + func (b Base) saveProbeOutput(r *CollectorResult, path, content, hint string, tags ...string) string { b.UI.Verbose(fmt.Sprintf(" probe: %s -> %s", hint, path)) if err := ValidateTagsAndHint(hint, tags); err != nil { @@ -123,7 +153,7 @@ func (b Base) saveCommand(ctx context.Context, r *CollectorResult, path string, func (b Base) saveDirConcat(r *CollectorResult, dest, dir string, sanitize func(string) string, tags ...string) { entries, err := os.ReadDir(dir) if err != nil { - r.RecordSkipForArtifact(SkipSourceUnavailable, dir+": unavailable", dest) + b.saveSkippedArtifact(r, dest, "file", "", dir, SkipSourceUnavailable, dir+": unavailable", tags...) return } var buf strings.Builder @@ -156,7 +186,7 @@ func (b Base) saveDirConcat(r *CollectorResult, dest, dir string, sanitize func( return } if filesSeen > 0 && filesRead == 0 && hadReadErr { - r.RecordSkipForArtifact(SkipSourceUnavailable, dir+": unavailable", dest) + b.saveSkippedArtifact(r, dest, "file", "", dir, SkipSourceUnavailable, dir+": unavailable", tags...) 
} } @@ -197,15 +227,14 @@ func (b Base) saveFile(r *CollectorResult, dest, src string, sanitize func(strin r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", dest, err), dest) return } - if err := b.Writer.ReservePath(dest); err != nil { - r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", dest, err), dest) - return - } data, err := os.ReadFile(src) if err != nil { - b.Writer.ReleasePath(dest) - r.RecordSkipForArtifact(SkipSourceUnavailable, src+": unavailable", dest) + b.saveSkippedArtifact(r, dest, "file", "", src, SkipSourceUnavailable, src+": unavailable", tags...) + return + } + if err := b.Writer.ReservePath(dest); err != nil { + r.RecordErrorForArtifact(ErrArtifactReserve, fmt.Sprintf("%s: %v", dest, err), dest) return } content := string(data) diff --git a/customers/vm-troubleshooting/internal/collector/common_skip_test.go b/customers/vm-troubleshooting/internal/collector/common_skip_test.go new file mode 100644 index 0000000..8e08678 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/common_skip_test.go @@ -0,0 +1,68 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +func TestSaveFileWritesSkippedPlaceholderForMissingSource(t *testing.T) { + t.Parallel() + + root := t.TempDir() + b := Base{Writer: output.NewWriter(root), UI: ui.NoopUI{}} + r := NewResult() + + b.saveFile(r, "network/interfaces.txt", "/definitely/missing/interfaces", nil, "network") + + if len(r.Artifacts) != 1 { + t.Fatalf("expected 1 artifact, got %d", len(r.Artifacts)) + } + if r.Artifacts[0].Path != "network/interfaces.txt" || r.Artifacts[0].Status != "skipped" || r.Artifacts[0].Type != "file" { + t.Fatalf("unexpected artifact metadata: %+v", r.Artifacts[0]) + } + if len(r.Skipped) != 1 || r.Skipped[0].ArtifactPath != "network/interfaces.txt" { + t.Fatalf("expected skipped artifact record, got 
%+v", r.Skipped) + } + + data, err := os.ReadFile(filepath.Join(root, "network/interfaces.txt")) + if err != nil { + t.Fatalf("reading skipped artifact: %v", err) + } + want := "[SKIPPED source_unavailable] /definitely/missing/interfaces: unavailable\n" + if string(data) != want { + t.Fatalf("unexpected skipped artifact content:\nwant %q\ngot %q", want, string(data)) + } +} + +func TestSaveDirConcatWritesSkippedPlaceholderForMissingDirectory(t *testing.T) { + t.Parallel() + + root := t.TempDir() + b := Base{Writer: output.NewWriter(root), UI: ui.NoopUI{}} + r := NewResult() + + b.saveDirConcat(r, "network/interfaces_d.txt", "/definitely/missing/interfaces.d", nil, "network", "config") + + if len(r.Artifacts) != 1 { + t.Fatalf("expected 1 artifact, got %d", len(r.Artifacts)) + } + if r.Artifacts[0].Path != "network/interfaces_d.txt" || r.Artifacts[0].Status != "skipped" || r.Artifacts[0].Type != "file" { + t.Fatalf("unexpected artifact metadata: %+v", r.Artifacts[0]) + } + if len(r.Skipped) != 1 || r.Skipped[0].ArtifactPath != "network/interfaces_d.txt" { + t.Fatalf("expected skipped artifact record, got %+v", r.Skipped) + } + + data, err := os.ReadFile(filepath.Join(root, "network/interfaces_d.txt")) + if err != nil { + t.Fatalf("reading skipped artifact: %v", err) + } + want := "[SKIPPED source_unavailable] /definitely/missing/interfaces.d: unavailable\n" + if string(data) != want { + t.Fatalf("unexpected skipped artifact content:\nwant %q\ngot %q", want, string(data)) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index 6026c8e..daeb30b 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -19,7 +19,7 @@ type JournalCollector struct { Base Since string IncludeFull bool - nowUTC func() time.Time + nowFunc func() time.Time } func NewJournalCollector(exec executor.Executor, writer 
*output.Writer, ui ui.UI, since string, includeFull bool) *JournalCollector { @@ -27,7 +27,7 @@ func NewJournalCollector(exec executor.Executor, writer *output.Writer, ui ui.UI Base: Base{Exec: exec, Writer: writer, UI: ui}, Since: since, IncludeFull: includeFull, - nowUTC: time.Now, + nowFunc: time.Now, } } @@ -54,8 +54,10 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } - untilUTC := c.nowUTC().UTC().Format(time.RFC3339) - journalArgs := c.journalBaseArgs(untilUTC) + // Use local time: journalctl interprets --until in the system's local timezone. + untilLocal := c.nowFunc().Format("2006-01-02 15:04:05") + c.UI.Verbose(fmt.Sprintf(" journal upper bound: --until=%s", untilLocal)) + journalArgs := c.journalBaseArgs(untilLocal) if c.IncludeFull { c.saveCommand(ctx, r, "logs/journal_full.txt", executor.CommandSpec{Name: "journalctl", Args: journalArgs, NeedsRoot: true, Timeout: config.TimeoutSlow}, "journalctl", "journal") @@ -140,8 +142,8 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } -func (c *JournalCollector) journalBaseArgs(untilUTC string) []string { - args := []string{"--no-pager", "--until=" + untilUTC} +func (c *JournalCollector) journalBaseArgs(until string) []string { + args := []string{"--no-pager", "--until=" + until} if c.Since == "" || c.Since == "boot" { return append([]string{"-b"}, args...) 
} @@ -160,9 +162,10 @@ func (c *JournalCollector) saveStructuredJournalNDJSON(ctx context.Context, r *C NeedsRoot: true, Timeout: timeout, } + c.UI.Verbose(fmt.Sprintf(" probe-ndjson: %s -> %s", spec.String(), path)) result, stdout, _ := c.Exec.Capture(ctx, spec, journalNDJSONByteLimit+journalNDJSONSentinelReserve) if result.Skipped { - r.RecordSkipForArtifact(SkipPermissionOrAccess, "journalctl structured output requires root", path) + c.saveSkippedArtifact(r, path, "command", spec.String(), "", SkipPermissionOrAccess, "journalctl structured output requires root", "journal") return } if result.Err != nil && result.ExitCode >= 2 { @@ -199,6 +202,8 @@ func (c *JournalCollector) saveStructuredJournalNDJSON(ctx context.Context, r *C }) if truncated { c.UI.Verbose(fmt.Sprintf(" structured journal truncated: %s (%s, records=%d)", path, reason, recordsWritten)) + } else { + c.UI.Verbose(fmt.Sprintf(" structured journal written: %s (records=%d)", path, recordsWritten)) } } diff --git a/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go index 8f1f6e7..3deabbe 100644 --- a/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go +++ b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go @@ -17,7 +17,7 @@ import ( func TestJournalCollectorWritesParseableStructuredNDJSON(t *testing.T) { t.Parallel() - const until = "2026-04-08T12:00:00Z" + const until = "2026-04-08 12:00:00" base := "-b --no-pager --until=" + until fake := executor.NewFake() @@ -46,7 +46,7 @@ func TestJournalCollectorWritesParseableStructuredNDJSON(t *testing.T) { root := t.TempDir() c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) - c.nowUTC = func() time.Time { return time.Date(2026, 4, 8, 12, 0, 0, 0, time.UTC) } + c.nowFunc = func() time.Time { return time.Date(2026, 4, 8, 12, 0, 0, 0, time.UTC) } res, err := c.Collect(context.Background()) if err 
!= nil { t.Fatalf("Collect failed: %v", err) @@ -102,12 +102,22 @@ func TestJournalCollectorStructuredNDJSONSkipStateIsExplicit(t *testing.T) { if !hasKernelSkip || !hasErrorSkip { t.Fatalf("expected explicit structured artifact skips, got: %+v", res.Skipped) } + + for _, path := range []string{"logs/journal_kernel.ndjson", "logs/journal_errors.ndjson"} { + data, readErr := os.ReadFile(filepath.Join(root, path)) + if readErr != nil { + t.Fatalf("expected skipped structured artifact %s to be written: %v", path, readErr) + } + if !strings.HasPrefix(string(data), "[SKIPPED ") { + t.Fatalf("expected skipped placeholder content for %s, got %q", path, string(data)) + } + } } func TestJournalCollectorStructuredNDJSONErrorStateIsExplicit(t *testing.T) { t.Parallel() - const until = "2026-04-08T13:00:00Z" + const until = "2026-04-08 13:00:00" base := "-b --no-pager --until=" + until fake := executor.NewFake() @@ -131,7 +141,7 @@ func TestJournalCollectorStructuredNDJSONErrorStateIsExplicit(t *testing.T) { root := t.TempDir() c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) - c.nowUTC = func() time.Time { return time.Date(2026, 4, 8, 13, 0, 0, 0, time.UTC) } + c.nowFunc = func() time.Time { return time.Date(2026, 4, 8, 13, 0, 0, 0, time.UTC) } res, err := c.Collect(context.Background()) if err != nil { t.Fatalf("Collect failed: %v", err) diff --git a/customers/vm-troubleshooting/internal/collector/network.go b/customers/vm-troubleshooting/internal/collector/network.go index 8ee4c8e..dcbdae3 100644 --- a/customers/vm-troubleshooting/internal/collector/network.go +++ b/customers/vm-troubleshooting/internal/collector/network.go @@ -78,18 +78,23 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error {"network/bridge_fdb.txt", executor.CommandSpec{Name: "bridge", Args: []string{"fdb", "show"}, Timeout: config.TimeoutMedium}, "bridge"}, } { if !c.Exec.CommandExists(spec.cmd.Name) { + c.saveSkippedArtifact(r, spec.path, 
"command", spec.cmd.String(), "", SkipCommandUnavailable, spec.cmd.Name+": unavailable", "network") continue } c.saveCommand(ctx, r, spec.path, spec.cmd, spec.hint, "network") } if _, err := os.Stat("/etc/NetworkManager/system-connections"); err == nil && c.Exec.CommandExists("ls") { c.saveCommand(ctx, r, "network/nm_connection_files.txt", executor.CommandSpec{Name: "ls", Args: []string{"-la", "/etc/NetworkManager/system-connections"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, "text", "network") + } else if err == nil { + c.saveSkippedArtifact(r, "network/nm_connection_files.txt", "command", "ls -la /etc/NetworkManager/system-connections", "", SkipCommandUnavailable, "ls: unavailable", "network") } if c.Exec.CommandExists("netplan") { netplanSpec := executor.CommandSpec{Name: "netplan", Args: []string{"get", "--all"}, NeedsRoot: true, Timeout: config.TimeoutMedium} result, stdout, stderr := c.Exec.Capture(ctx, netplanSpec, 2*1024*1024) content := sanitize.SensitiveConfig(string(stdout)) _ = c.saveCapturedProbe(r, "network/netplan_get.txt", netplanSpec, result, content, string(stderr), "netplan", []string{"network", "config"}, "Sensitive values have been redacted") + } else { + c.saveSkippedArtifact(r, "network/netplan_get.txt", "command", "netplan get --all", "", SkipCommandUnavailable, "netplan: unavailable", "network", "config") } if data, err := os.ReadFile("/proc/net/vlan/config"); err == nil { c.saveProbeOutput(r, "network/vlan_config.txt", string(data), "procfs", "network") @@ -110,7 +115,7 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error {"network/ufw_status.txt", "ufw", []string{"status", "verbose"}, true, "ufw", []string{"firewall"}}, } { if !c.Exec.CommandExists(spec.name) { - r.RecordSkip(SkipCommandUnavailable, spec.name+": unavailable") + c.saveSkippedArtifact(r, spec.path, "command", executor.CommandSpec{Name: spec.name, Args: spec.args}.String(), "", SkipCommandUnavailable, spec.name+": unavailable", 
spec.tags...) continue } c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, NeedsRoot: spec.root, Timeout: config.TimeoutMedium}, spec.hint, spec.tags...) diff --git a/customers/vm-troubleshooting/internal/collector/network_skip_test.go b/customers/vm-troubleshooting/internal/collector/network_skip_test.go new file mode 100644 index 0000000..ba902f8 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/network_skip_test.go @@ -0,0 +1,55 @@ +package collector + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +func TestNetworkCollectorWritesSkippedArtifactsForUnavailableCommands(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + root := t.TempDir() + c := NewNetworkCollector(fake, output.NewWriter(root), ui.NoopUI{}) + + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + for _, path := range []string{ + "network/nm_general.txt", + "network/networkctl_list.txt", + "network/netplan_get.txt", + "network/firewalld_zones.txt", + } { + data, readErr := os.ReadFile(filepath.Join(root, path)) + if readErr != nil { + t.Fatalf("expected skipped artifact %s to be written: %v", path, readErr) + } + if !strings.HasPrefix(string(data), "[SKIPPED ") { + t.Fatalf("expected skipped placeholder content for %s, got %q", path, string(data)) + } + } + + foundNMCLI := false + foundFirewalld := false + for _, s := range res.Skipped { + switch s.ArtifactPath { + case "network/nm_general.txt": + foundNMCLI = true + case "network/firewalld_zones.txt": + foundFirewalld = true + } + } + if !foundNMCLI || !foundFirewalld { + t.Fatalf("expected skipped artifact paths for unavailable commands, got %+v", res.Skipped) + } +} diff --git 
a/customers/vm-troubleshooting/internal/config/version.go b/customers/vm-troubleshooting/internal/config/version.go index c5a9ec3..00579b5 100644 --- a/customers/vm-troubleshooting/internal/config/version.go +++ b/customers/vm-troubleshooting/internal/config/version.go @@ -1,7 +1,7 @@ package config var ( - Version = "0.1.1" + Version = "0.2.0" Commit = "unknown" BuildDate = "unknown" ) diff --git a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go index 957ddfa..1eb474a 100644 --- a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go +++ b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go @@ -19,6 +19,9 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { if err := w.SaveOutput("system/hostname.txt", "# Command: hostname -f\n---\nnode-1\n"); err != nil { t.Fatalf("SaveOutput failed: %v", err) } + if err := w.SaveOutput("network/interfaces.txt", "[SKIPPED source_unavailable] /etc/network/interfaces: unavailable\n"); err != nil { + t.Fatalf("SaveOutput failed: %v", err) + } meta := ManifestMeta{ SchemaVersion: "3.0.0", @@ -47,8 +50,28 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { nil, nil, ), + "network": BuildManifestInput( + "ok", + 3, + []ManifestArtifactInput{{ + Path: "network/interfaces.txt", + Type: "file", + Source: "/etc/network/interfaces", + Status: "skipped", + ParserHint: "text", + Tags: []string{"network"}, + }}, + nil, + nil, + []ManifestSkipReasonInput{{ + Reason: "source_unavailable", + Detail: "/etc/network/interfaces: unavailable", + ArtifactPath: "network/interfaces.txt", + }}, + nil, + ), } - collectorIDs := []string{"system"} + collectorIDs := []string{"system", "network"} if err := WriteManifestFromResults(w, meta, collectorIDs, results); err != nil { t.Fatalf("WriteManifestFromResults failed: %v", err) } @@ -84,8 +107,8 @@ func 
TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { t.Fatalf("unmarshal manifest: %v", err) } artifactIndex := manifest["artifact_index"].([]any) - if len(artifactIndex) != 1 { - t.Fatalf("expected 1 manifest artifact, got %d", len(artifactIndex)) + if len(artifactIndex) != 2 { + t.Fatalf("expected 2 manifest artifacts, got %d", len(artifactIndex)) } reportBytes, err := os.ReadFile(filepath.Join(archiveRoot, "report.ndjson")) @@ -115,6 +138,21 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { if summaryLines != len(manifest["collectors"].(map[string]any)) { t.Fatalf("collector summary count mismatch: report=%d manifest=%d", summaryLines, len(manifest["collectors"].(map[string]any))) } + + for collectorID, rawCollector := range manifest["collectors"].(map[string]any) { + collectorMap := rawCollector.(map[string]any) + skipReasons := collectorMap["skip_reasons"].([]any) + for _, rawSkip := range skipReasons { + skipMap := rawSkip.(map[string]any) + artifactPath, _ := skipMap["artifact_path"].(string) + if artifactPath == "" { + continue + } + if _, err := os.Stat(filepath.Join(archiveRoot, artifactPath)); err != nil { + t.Fatalf("collector %s references missing skipped artifact %s: %v", collectorID, artifactPath, err) + } + } + } } func extractArchive(t *testing.T, archivePath, dest string) error { diff --git a/customers/vm-troubleshooting/internal/output/report.go b/customers/vm-troubleshooting/internal/output/report.go index 82e0eda..4e91280 100644 --- a/customers/vm-troubleshooting/internal/output/report.go +++ b/customers/vm-troubleshooting/internal/output/report.go @@ -41,6 +41,8 @@ type ReportRecord struct { // type=collector_summary fields ArtifactCount int `json:"artifact_count,omitempty"` + SkipCount int `json:"skip_count,omitempty"` + ErrorCount int `json:"error_count,omitempty"` } const reportSchemaVersion = "3.0.0" @@ -119,6 +121,8 @@ func WriteReport(w *Writer, collectorIDs []string, resultsMap 
map[string]Manifes Collector: cid, Status: ri.Status, ArtifactCount: len(ri.Artifacts), + SkipCount: len(ri.Skipped), + ErrorCount: len(ri.Errors), DurationMS: &durationMS, }); err != nil { return err diff --git a/customers/vm-troubleshooting/internal/output/report_test.go b/customers/vm-troubleshooting/internal/output/report_test.go index 23df6ef..817ab7d 100644 --- a/customers/vm-troubleshooting/internal/output/report_test.go +++ b/customers/vm-troubleshooting/internal/output/report_test.go @@ -35,10 +35,13 @@ func TestWriteReportProducesValidNDJSONInDeterministicOrder(t *testing.T) { "network": BuildManifestInput( "ok", 0, - []ManifestArtifactInput{{Path: "network/ip_addr.txt", Type: "probe", Status: "ok", ParserHint: "netlink", Tags: []string{"network"}}}, + []ManifestArtifactInput{ + {Path: "network/ip_addr.txt", Type: "probe", Status: "ok", ParserHint: "netlink", Tags: []string{"network"}}, + {Path: "network/interfaces.txt", Type: "file", Source: "/etc/network/interfaces", Status: "skipped", ParserHint: "text", Tags: []string{"network"}}, + }, map[string]string{"hostname": "node-1"}, nil, - nil, + []ManifestSkipReasonInput{{Reason: "source_unavailable", Detail: "/etc/network/interfaces: unavailable", ArtifactPath: "network/interfaces.txt"}}, nil, ), } @@ -109,6 +112,19 @@ func TestWriteReportProducesValidNDJSONInDeterministicOrder(t *testing.T) { if !foundIssue { t.Fatal("expected at least one issue record") } + foundSkippedArtifact := false + for _, rec := range records { + if rec["type"] != "artifact" || rec["path"] != "network/interfaces.txt" { + continue + } + foundSkippedArtifact = true + if rec["status"] != "skipped" { + t.Fatalf("expected skipped artifact status, got %#v", rec) + } + } + if !foundSkippedArtifact { + t.Fatal("expected skipped artifact record in report") + } last := records[len(records)-1] if last["collector"] != "network" || last["type"] != "collector_summary" { t.Fatalf("unexpected last record ordering: %#v", last) diff --git 
a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index 858b053..3389936 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -67,9 +67,12 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { if hasDCGM { dcgmStatus = "installed" } + // Strip git tag namespace prefix (e.g. "gather-info/v0.1.1" → "v0.1.1") + // for the info box. Archive metadata keeps the full string. + displayVersion := strings.TrimPrefix(config.Version, "gather-info/") r.UI.SystemInfo("VM Diagnostics Tool", [][2]string{ {"Hostname", host}, - {"Version", config.Version}, + {"Version", displayVersion}, {"OS", distro.Pretty}, {"Commit", config.Commit}, {"Root access", rootStatus}, diff --git a/customers/vm-troubleshooting/internal/triage/critical.go b/customers/vm-troubleshooting/internal/triage/critical.go index 4178e42..8b482e3 100644 --- a/customers/vm-troubleshooting/internal/triage/critical.go +++ b/customers/vm-troubleshooting/internal/triage/critical.go @@ -264,7 +264,7 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro SourceArtifacts: []string{ ev.source, }, - Fingerprint: identity.Fingerprint("crit", ev.fingerprintKey, criticalSourceClass(ev.source)), + Fingerprint: identity.Fingerprint("crit", ev.fingerprintKey, criticalSourceClass(ev.source), ev.line), UnresolvedArtifactPaths: append([]string(nil), ev.unresolved...), }) diff --git a/customers/vm-troubleshooting/internal/triage/critical_test.go b/customers/vm-troubleshooting/internal/triage/critical_test.go index aacf8fb..2fa4939 100644 --- a/customers/vm-troubleshooting/internal/triage/critical_test.go +++ b/customers/vm-troubleshooting/internal/triage/critical_test.go @@ -62,7 +62,7 @@ func TestAnalyzeCriticalLogs_LowConfidence(t *testing.T) { if len(f.SourceArtifacts) != 1 || f.SourceArtifacts[0] != "logs/journal_errors.txt" { t.Errorf("expected 
source_artifacts to contain journal_errors, got %#v", f.SourceArtifacts) } - wantFP := identity.Fingerprint("crit", "error_fail", criticalSourceClass("logs/journal_errors.txt")) + wantFP := identity.Fingerprint("crit", "error_fail", criticalSourceClass("logs/journal_errors.txt"), f.Evidence[0]) if f.Fingerprint != wantFP { t.Errorf("unexpected fingerprint: got %q want %q", f.Fingerprint, wantFP) } diff --git a/customers/vm-troubleshooting/schemas/report-record.schema.json b/customers/vm-troubleshooting/schemas/report-record.schema.json index 4126bc5..4ff2bf1 100644 --- a/customers/vm-troubleshooting/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting/schemas/report-record.schema.json @@ -73,6 +73,8 @@ "type": { "const": "collector_summary" }, "status": { "type": "string", "enum": ["ok", "partial", "failed", "skipped"] }, "artifact_count": { "type": "integer" }, + "skip_count": { "type": "integer" }, + "error_count": { "type": "integer" }, "duration_ms": { "type": "integer" } }, "required": ["type", "status"] From 5b5d40e1188be47ac5283995c1080577c33739c7 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Thu, 9 Apr 2026 18:39:01 +0200 Subject: [PATCH 06/23] gather-info v0.2.1: post-audit triage fixes and single sudo re-exec Addresses all findings from the 3-machine production audit (5090-076, 5090-087, rtx6000pro). Key changes: Privilege escalation: replace per-command sudo with a single re-exec under sudo at startup via syscall.Exec, eliminating overhead from 48 individual sudo invocations per run. 
Triage hardening: - Split "Kernel Panic" pattern into separate Kernel Panic (CRITICAL), Kernel BUG (WARNING), and Kernel Stack Trace (INFO) findings - Add source-aware pattern matching so kernel-only patterns skip non-kernel journal_errors lines (prevents false positives) - Add NVIDIA RPC Failure, NVIDIA Driver Assertion, and Segfault patterns - Add hex address normalization for segfault dedup - Detect NDJSON truncation and emit FindingDataQuality warnings (survives zero-event early returns in both critical.go and xid.go) - Bump schema version to 3.1.0 Collector fixes: - Parse GPU CSV per-line to handle nvidia-smi error lines mixed with valid rows (partial failure when GPU fallen off bus) - Handle OOM probe timeout separately from exit errors - Fix fabricmanager label ("not running" not "inactive") - iptables uses IgnoreExit to avoid false warnings on nf_tables systems - Strip ANSI escapes from nvidia-smi topo output (TERM=dumb + StripANSI) Also adds Xid catalog codegen tooling and Makefile updates. 
--- customers/vm-troubleshooting/Makefile | 24 +- .../vm-troubleshooting/internal/cli/root.go | 11 +- .../internal/collector/additional.go | 3 +- .../internal/collector/collector.go | 4 +- .../internal/collector/collector_test.go | 293 +++++++- .../internal/collector/journal.go | 55 +- .../internal/collector/journal_phase3_test.go | 4 +- .../internal/collector/network.go | 17 +- .../internal/collector/network_skip_test.go | 31 + .../internal/collector/nvidia.go | 83 ++- .../internal/collector/services.go | 2 +- .../internal/collector/services_test.go | 8 + .../internal/collector/storage.go | 6 + .../output/archive_consistency_test.go | 2 +- .../internal/output/manifest_test.go | 2 +- .../internal/output/report.go | 2 +- .../internal/platform/dcgm.go | 15 +- .../internal/platform/dcgm_test.go | 33 + .../internal/privilege/privilege.go | 91 ++- .../internal/runner/runner.go | 49 +- .../internal/runner/runner_test.go | 40 ++ .../internal/sanitize/sanitize.go | 6 + .../internal/sanitize/sanitize_test.go | 17 + .../internal/transfer/commands.go | 69 +- .../internal/transfer/commands_test.go | 69 +- .../internal/triage/critical.go | 213 +++++- .../internal/triage/critical_test.go | 308 ++++++++- .../internal/triage/journal_ndjson.go | 23 +- .../internal/triage/journal_ndjson_test.go | 17 + .../internal/triage/triage.go | 4 +- .../vm-troubleshooting/internal/triage/xid.go | 100 ++- .../internal/triage/xid_analyze_test.go | 65 ++ .../internal/triage/xid_generated_policies.go | 118 ++++ .../internal/triage/xid_test.go | 69 +- .../internal/triage/xidcatalog/UPSTREAM.md | 43 +- .../internal/triage/xidcatalog/catalog.go | 54 +- .../triage/xidcatalog/catalog_generated.go | 116 ++++ .../triage/xidcatalog/sync_catalog.sh | 13 +- .../schemas/manifest.schema.json | 2 +- .../schemas/report-record.schema.json | 2 +- .../schemas/triage-result.schema.json | 2 +- .../tools/update-xid-catalog.py | 625 ++++++++++++++++++ 42 files changed, 2435 insertions(+), 275 deletions(-) create mode 
100644 customers/vm-troubleshooting/internal/platform/dcgm_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/xid_generated_policies.go create mode 100644 customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_generated.go create mode 100755 customers/vm-troubleshooting/tools/update-xid-catalog.py diff --git a/customers/vm-troubleshooting/Makefile b/customers/vm-troubleshooting/Makefile index 2cc2818..27d8840 100644 --- a/customers/vm-troubleshooting/Makefile +++ b/customers/vm-troubleshooting/Makefile @@ -1,4 +1,7 @@ -VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev) +VERSION_FROM_FILE := $(shell sed -n 's/^[[:space:]]*Version[[:space:]]*=[[:space:]]*"\([^"]*\)".*/\1/p' internal/config/version.go | head -1) +BASE_VERSION ?= $(if $(VERSION_FROM_FILE),$(VERSION_FROM_FILE),dev) +GIT_SUFFIX ?= $(shell sh -c 'c=$$(git rev-parse --short HEAD 2>/dev/null || echo unknown); d=""; git diff --quiet --ignore-submodules HEAD -- 2>/dev/null || d="-dirty"; printf -- "-g%s%s" "$$c" "$$d"') +VERSION ?= $(BASE_VERSION)$(GIT_SUFFIX) COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -s -w \ @@ -6,10 +9,15 @@ LDFLAGS := -s -w \ -X github.com/NexGenCloud/vm-diagnostics/internal/config.Commit=$(COMMIT) \ -X github.com/NexGenCloud/vm-diagnostics/internal/config.BuildDate=$(DATE) -build: +build: update-xid-catalog build-only + +build-only: CGO_ENABLED=0 go build -ldflags='$(LDFLAGS)' -o bin/gather-info ./cmd/gather-info -build-arm64: +build-arm64: update-xid-catalog + CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -ldflags='$(LDFLAGS)' -o bin/gather-info-arm64 ./cmd/gather-info + +build-arm64-only: CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -ldflags='$(LDFLAGS)' -o bin/gather-info-arm64 ./cmd/gather-info fmt: @@ -26,6 +34,14 @@ lint: vet clean: rm -rf bin +update-xid-catalog: + @python3 tools/update-xid-catalog.py \ + --xid-md ../../xid.md \ + 
--catalog-out internal/triage/xidcatalog/catalog_generated.go \ + --policy-out internal/triage/xid_generated_policies.go \ + && gofmt -w internal/triage/xidcatalog/catalog_generated.go internal/triage/xid_generated_policies.go \ + || echo "Warning: XID catalog update failed (network?), building with existing catalog" + all: fmt vet test build -.PHONY: build build-arm64 fmt vet test lint clean all +.PHONY: build build-only build-arm64 build-arm64-only fmt vet test lint clean all update-xid-catalog diff --git a/customers/vm-troubleshooting/internal/cli/root.go b/customers/vm-troubleshooting/internal/cli/root.go index 6f294f6..5178e60 100644 --- a/customers/vm-troubleshooting/internal/cli/root.go +++ b/customers/vm-troubleshooting/internal/cli/root.go @@ -8,6 +8,7 @@ import ( "github.com/NexGenCloud/vm-diagnostics/internal/config" "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/privilege" "github.com/NexGenCloud/vm-diagnostics/internal/runner" "github.com/NexGenCloud/vm-diagnostics/internal/ui" "github.com/spf13/cobra" @@ -32,6 +33,14 @@ func NewRootCmd() *cobra.Command { return err } + // Escalate to root before doing anything else. On success + // the process is replaced by sudo and never returns. + // If already root or re-exec'd, this returns nil immediately. + ctx0 := context.Background() + if err := privilege.ReexecUnderSudo(ctx0, os.Args[1:]); err != nil { + return exitError{code: config.ExitFatal, err: err} + } + uiImpl := ui.New(cfg.Verbosity, cfg.NonInteractive) ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) @@ -42,7 +51,7 @@ func NewRootCmd() *cobra.Command { uiImpl.Warn("Interrupted. Cleaning up... 
(press Ctrl+C again to force quit)") }() - execImpl := executor.NewReal(false) + execImpl := executor.NewReal(os.Geteuid() == 0) r := runner.New(cfg, uiImpl, execImpl) res, err := r.Run(ctx) if err != nil { diff --git a/customers/vm-troubleshooting/internal/collector/additional.go b/customers/vm-troubleshooting/internal/collector/additional.go index 0fc0734..7135dab 100644 --- a/customers/vm-troubleshooting/internal/collector/additional.go +++ b/customers/vm-troubleshooting/internal/collector/additional.go @@ -42,7 +42,8 @@ func (c *AdditionalCollector) Collect(ctx context.Context) (*CollectorResult, er {"hardware/sensors.txt", "sensors", nil, false, config.TimeoutQuick, "text", []string{"hardware"}}, } { if !c.Exec.CommandExists(spec.name) { - r.RecordSkip(SkipCommandUnavailable, spec.name+": unavailable") + c.saveSkippedArtifact(r, spec.path, "command", spec.name, spec.hint, + SkipCommandUnavailable, spec.name+": unavailable", spec.tags...) continue } c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, NeedsRoot: spec.root, Timeout: spec.timeout}, spec.hint, spec.tags...) 
diff --git a/customers/vm-troubleshooting/internal/collector/collector.go b/customers/vm-troubleshooting/internal/collector/collector.go index a64c54c..7a22a0b 100644 --- a/customers/vm-troubleshooting/internal/collector/collector.go +++ b/customers/vm-troubleshooting/internal/collector/collector.go @@ -334,7 +334,9 @@ func (r *Registry) RunAll(ctx context.Context, skip map[string]bool, u ui.UI) ([ results = append(results, res) dur := res.Duration.Round(time.Millisecond) - if len(res.Errors) > 0 { + if len(res.Errors) > 0 && len(res.Artifacts) > 0 { + sp.Warn(fmt.Sprintf("[%d/%d] %s (%s, %d artifact(s), %d error(s))", i+1, total, c.Name(), dur, len(res.Artifacts), len(res.Errors))) + } else if len(res.Errors) > 0 { sp.Fail(fmt.Sprintf("[%d/%d] %s (%s, %d error(s))", i+1, total, c.Name(), dur, len(res.Errors))) } else if len(res.Artifacts) == 0 && len(res.Skipped) > 0 { sp.Success(fmt.Sprintf("[%d/%d] %s (skipped: %s)", i+1, total, c.Name(), res.Skipped[0].Detail)) diff --git a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index b726708..d4e78b6 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -131,7 +131,7 @@ func TestJournalCollectorTreatsGrepExitCode1AsNoMatches(t *testing.T) { } fake.Commands["dmesg -T"] = executor.FakeResponse{Stdout: []byte("ok\n")} // journalctl --grep returns exit 1 when no entries match (like grep) - fake.Commands["journalctl "+base+" --grep=oom|out of memory|killed process --case-sensitive=false"] = executor.FakeResponse{ + fake.Commands["journalctl "+base+" -k --grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm --case-sensitive=false"] = executor.FakeResponse{ ExitCode: 1, Err: fmt.Errorf("exit status 1"), } @@ -156,6 +156,180 @@ func TestJournalCollectorTreatsGrepExitCode1AsNoMatches(t *testing.T) { } } +func 
TestJournalCollectorOOMFilterRejectsFalsePositives(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["journalctl"] = true + fake.Binaries["dmesg"] = true + const until = "2026-04-08 11:00:00" + base := "-b --no-pager --until=" + until + for _, args := range []string{ + "journalctl " + base + " -k", + "journalctl " + base + " -p err", + "journalctl " + base + " -p warning", + "journalctl " + base + " -k -o json --output-fields=" + journalOutputFields + " --lines=50001", + "journalctl " + base + " -p err -o json --output-fields=" + journalOutputFields + " --lines=50001", + } { + fake.Commands[args] = executor.FakeResponse{Stdout: []byte("log line\n")} + } + fake.Commands["dmesg -T"] = executor.FakeResponse{Stdout: []byte("ok\n")} + // Return MooseFS "out of memory killer disabled" — a false positive that previously + // matched the old broad "out of memory" pattern. + fake.Commands["journalctl "+base+" -k --grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm --case-sensitive=false"] = executor.FakeResponse{ + Stdout: []byte( + "Apr 04 15:36:40 host mfsmount[3804]: [info] out of memory killer disabled\n" + + "Apr 04 15:36:40 host mfsmount[3805]: [info] out of memory killer disabled\n", + ), + } + for _, svc := range journalServiceUnits { + fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} + } + root := t.TempDir() + + collector := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + collector.nowFunc = func() time.Time { return time.Date(2026, 4, 8, 11, 0, 0, 0, time.UTC) } + res, err := collector.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["oom_event_count"]; got != "0" { + t.Fatalf("expected oom_event_count=0 for MooseFS false positive, got %q", got) + } + if len(res.Issues) > 0 { + for _, iss := range res.Issues { + if iss.Code == IssueOOMEvents { 
+ t.Fatalf("expected no OOM issue for false-positive lines, got: %+v", iss) + } + } + } +} + +func TestJournalCollectorOOMFilterMatchesRealEvents(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["journalctl"] = true + fake.Binaries["dmesg"] = true + const until = "2026-04-08 11:30:00" + base := "-b --no-pager --until=" + until + for _, args := range []string{ + "journalctl " + base + " -k", + "journalctl " + base + " -p err", + "journalctl " + base + " -p warning", + "journalctl " + base + " -k -o json --output-fields=" + journalOutputFields + " --lines=50001", + "journalctl " + base + " -p err -o json --output-fields=" + journalOutputFields + " --lines=50001", + } { + fake.Commands[args] = executor.FakeResponse{Stdout: []byte("log line\n")} + } + fake.Commands["dmesg -T"] = executor.FakeResponse{Stdout: []byte("ok\n")} + // Return real OOM killer lines — each variant should be detected. + fake.Commands["journalctl "+base+" -k --grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm --case-sensitive=false"] = executor.FakeResponse{ + Stdout: []byte( + "Apr 04 12:00:01 host kernel: python3 invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0\n" + + "Apr 04 12:00:02 host kernel: Out of memory: Killed process 12345 (python3) total-vm:123456kB\n" + + "Apr 04 15:36:40 host mfsmount[3804]: [info] out of memory killer disabled\n", + ), + } + for _, svc := range journalServiceUnits { + fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} + } + root := t.TempDir() + + collector := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + collector.nowFunc = func() time.Time { return time.Date(2026, 4, 8, 11, 30, 0, 0, time.UTC) } + res, err := collector.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + // One OOM incident represented by two matching lines + one false positive. 
+ if got := res.Facts["oom_event_count"]; got != "1" { + t.Fatalf("expected oom_event_count=1 for one real OOM incident, got %q", got) + } + foundOOMIssue := false + for _, iss := range res.Issues { + if iss.Code == IssueOOMEvents { + foundOOMIssue = true + } + } + if !foundOOMIssue { + t.Fatal("expected OOM issue to be raised for real OOM events") + } +} + +func TestCountOOMIncidents(t *testing.T) { + t.Parallel() + + lines := []string{ + "Apr 04 12:00:01 host kernel: python3 invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", + "Apr 04 12:00:02 host kernel: Memory cgroup out of memory: Killed process 111 (python3) total-vm:1234kB", + "Apr 04 12:10:01 host kernel: node invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", + "Apr 04 12:10:02 host kernel: Memory cgroup out of memory: Killed process 222 (python3) total-vm:5678kB", + } + if got := countOOMIncidents(lines); got != 2 { + t.Fatalf("expected 2 incidents from timestamped OOM bursts, got %d", got) + } + + fallbackOnly := []string{ + "kernel: invoked oom-killer: gfp_mask=0x100cca", + "kernel: invoked oom-killer: gfp_mask=0x100cca", + } + if got := countOOMIncidents(fallbackOnly); got != 2 { + t.Fatalf("expected fallback count=2 from invoked-only lines, got %d", got) + } +} + +func TestJournalCollectorOOMTimeoutSetsUnavailable(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["journalctl"] = true + fake.Binaries["dmesg"] = true + const until = "2026-04-08 12:30:00" + base := "-b --no-pager --until=" + until + for _, args := range []string{ + "journalctl " + base + " -k", + "journalctl " + base + " -p err", + "journalctl " + base + " -p warning", + "journalctl " + base + " -k -o json --output-fields=" + journalOutputFields + " --lines=50001", + "journalctl " + base + " -p err -o json --output-fields=" + journalOutputFields + " --lines=50001", + } { + fake.Commands[args] = executor.FakeResponse{Stdout: []byte("log line\n")} 
+ } + fake.Commands["dmesg -T"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["journalctl "+base+" -k --grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm --case-sensitive=false"] = executor.FakeResponse{ + Stdout: []byte("kernel: Out of memory: Killed process 123 (x) total-vm:1kB\n"), + TimedOut: true, + Err: context.DeadlineExceeded, + } + for _, svc := range journalServiceUnits { + fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} + } + + root := t.TempDir() + c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + c.nowFunc = func() time.Time { return time.Date(2026, 4, 8, 12, 30, 0, 0, time.UTC) } + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["oom_event_count"]; got != "unavailable" { + t.Fatalf("expected oom_event_count=unavailable, got %q", got) + } + foundTimeoutErr := false + for _, e := range res.Errors { + if e.Code == ErrCommandTimedOut && e.ArtifactPath == "logs/oom_events.txt" { + foundTimeoutErr = true + } + } + if !foundTimeoutErr { + t.Fatalf("expected timeout error for logs/oom_events.txt, got %+v", res.Errors) + } +} + func TestNvidiaCollectorSavesRawXidArtifactWhenDmesgSkipped(t *testing.T) { t.Parallel() @@ -198,6 +372,123 @@ func TestNvidiaCollectorSavesRawXidArtifactWhenDmesgSkipped(t *testing.T) { } } +func newNvidiaCollectorFake(csvResp executor.FakeResponse) *executor.FakeExecutor { + fake := executor.NewFake() + fake.Binaries["nvidia-smi"] = true + fake.Commands["nvidia-smi"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["nvidia-smi -q"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["nvidia-smi topo -m"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["nvidia-smi nvlink --status"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["nvidia-smi pmon -s um -c 1"] = 
executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader"] = csvResp + fake.Commands["nvidia-smi --query-gpu=index,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total --format=csv"] = executor.FakeResponse{Stdout: []byte("0,0,0\n")} + fake.Commands["nvidia-smi --query-gpu=index,clocks_throttle_reasons.active --format=csv"] = executor.FakeResponse{Stdout: []byte("0,Not Active\n")} + fake.Commands["nvidia-smi --query-gpu=index,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max --format=csv"] = executor.FakeResponse{Stdout: []byte("0,4,4,16,16\n")} + fake.Commands["nvidia-smi --query-gpu=index,power.draw,power.limit,temperature.gpu --format=csv"] = executor.FakeResponse{Stdout: []byte("0,100,300,55\n")} + return fake +} + +func TestNvidiaCollectorParsesMixedRowsOnExitZero(t *testing.T) { + t.Parallel() + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte( + "NVIDIA-SMI has failed because one GPU is lost\n" + + "NVIDIA RTX PRO 6000 Blackwell Server Edition, 97871 MiB, 575.57.08\n" + + "NVIDIA RTX PRO 6000 Blackwell Server Edition, 97871 MiB, 575.57.08\n", + ), + }) + root := t.TempDir() + c := NewNvidiaCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["gpu_count"]; got != "2" { + t.Fatalf("expected gpu_count=2, got %q", got) + } + if got := res.Facts["driver_version"]; got != "575.57.08" { + t.Fatalf("expected driver_version from parsed row, got %q", got) + } +} + +func TestNvidiaCollectorParsesMixedRowsOnNonZeroExit(t *testing.T) { + t.Parallel() + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte( + "NVIDIA-SMI has failed because one GPU is lost\n" + + "NVIDIA RTX PRO 6000 Blackwell Server Edition, 97871 MiB, 575.57.08\n" + + "NVIDIA RTX PRO 6000 Blackwell 
Server Edition, 97871 MiB, 575.57.08\n", + ), + ExitCode: 1, + Err: fmt.Errorf("exit status 1"), + }) + root := t.TempDir() + c := NewNvidiaCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["gpu_count"]; got != "2" { + t.Fatalf("expected gpu_count=2 from mixed rows on non-zero exit, got %q", got) + } +} + +func TestNvidiaCollectorLeavesFactsUnavailableOnTimedOutCSV(t *testing.T) { + t.Parallel() + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte("NVIDIA A100, 40960 MiB, 550.00\n"), + TimedOut: true, + Err: context.DeadlineExceeded, + }) + root := t.TempDir() + c := NewNvidiaCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["gpu_count"]; got != "unavailable" { + t.Fatalf("expected gpu_count=unavailable on timeout, got %q", got) + } +} + +func TestNvidiaCollectorLeavesFactsUnavailableWhenNoValidCSVRows(t *testing.T) { + t.Parallel() + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte("NVIDIA-SMI has failed because one GPU is lost\nstill broken\n"), + }) + root := t.TempDir() + c := NewNvidiaCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["gpu_count"]; got != "unavailable" { + t.Fatalf("expected gpu_count=unavailable, got %q", got) + } +} + +func TestNvidiaCollectorStripsANSIFromTopoArtifact(t *testing.T) { + t.Parallel() + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte("NVIDIA A100, 40960 MiB, 550.00\n"), + }) + fake.Commands["nvidia-smi topo -m"] = executor.FakeResponse{ + Stdout: []byte("\x1b[4mGPU0\x1b[0m CPU Affinity\n"), + } + root := t.TempDir() + c := NewNvidiaCollector(fake, 
output.NewWriter(root), ui.NoopUI{}, true) + if _, err := c.Collect(context.Background()); err != nil { + t.Fatalf("Collect failed: %v", err) + } + data, err := os.ReadFile(filepath.Join(root, "nvidia/nvidia-smi-topo.txt")) + if err != nil { + t.Fatalf("reading topo artifact: %v", err) + } + if strings.Contains(string(data), "\x1b[") { + t.Fatalf("expected ANSI escapes to be stripped, got: %q", string(data)) + } +} + func TestSeverityString(t *testing.T) { t.Parallel() tests := []struct { diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index daeb30b..5f0a21e 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -38,6 +38,7 @@ const ( journalNDJSONRecordLimit = 50000 journalNDJSONByteLimit = 10 * 1024 * 1024 journalNDJSONSentinelReserve = 256 + oomIncidentGap = 5 * time.Second ) var journalServiceUnits = []string{ @@ -75,7 +76,7 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error // Grep journal for OOM events oomSpec := executor.CommandSpec{ Name: "journalctl", - Args: append(append([]string{}, journalArgs...), "--grep=oom|out of memory|killed process", "--case-sensitive=false"), + Args: append(append(append([]string{}, journalArgs...), "-k"), "--grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm", "--case-sensitive=false"), NeedsRoot: true, Timeout: config.TimeoutMedium, } @@ -104,28 +105,35 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error continue } l := strings.ToLower(line) - if strings.Contains(l, "out of memory") || strings.Contains(l, "oom-kill") || strings.Contains(l, "invoked oom-killer") || strings.Contains(l, "killed process") { + if strings.Contains(l, "invoked oom-killer") || + strings.Contains(l, "oom-kill") || + strings.Contains(l, "out of memory: killed") || + (strings.Contains(l, 
"killed process") && strings.Contains(l, "total-vm")) { oom = append(oom, line) } } - oomCount := len(oom) + oomIncidentCount := countOOMIncidents(oom) // journalctl --grep returns exit 1 when no entries match (like grep). // Only treat exit codes >= 2 as real errors. oomRealErr := oomResult.Err != nil && oomResult.ExitCode >= 2 - if oomRealErr { + if oomResult.TimedOut { + r.SetFact("oom_event_count", "unavailable") + r.RecordErrorForArtifact(ErrCommandTimedOut, fmt.Sprintf("%s: timed out", oomSpec.String()), oomPath) + c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan incomplete (timed out)") + } else if oomRealErr { r.SetFact("oom_event_count", "unavailable") r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", oomSpec.String(), oomResult.Err), oomPath) c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan incomplete") - } else if oomCount > 0 { - r.SetFact("oom_event_count", fmt.Sprintf("%d", oomCount)) + } else if oomIncidentCount > 0 { + r.SetFact("oom_event_count", fmt.Sprintf("%d", oomIncidentCount)) r.AddIssue( IssueOOMEvents, SeverityCritical, ConfidenceHigh, "MEM", - fmt.Sprintf("%d OOM killer event(s)", oomCount), + fmt.Sprintf("%d OOM killer event(s)", oomIncidentCount), "journal", string(IssueOOMEvents), ) @@ -142,6 +150,39 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } +func countOOMIncidents(lines []string) int { + count := 0 + var lastIncidentAt time.Time + haveLastIncidentAt := false + for _, line := range lines { + ts, ok := oomLineTimestamp(line) + if !ok { + // Unexpected formatting: preserve support value by counting the line. 
+ count++ + continue + } + if !haveLastIncidentAt || ts.Sub(lastIncidentAt) > oomIncidentGap { + count++ + lastIncidentAt = ts + haveLastIncidentAt = true + } + } + return count +} + +func oomLineTimestamp(line string) (time.Time, bool) { + fields := strings.Fields(strings.TrimSpace(line)) + if len(fields) < 4 { + return time.Time{}, false + } + tsText := strings.Join(fields[:3], " ") + ts, err := time.ParseInLocation("Jan 02 15:04:05", tsText, time.Local) + if err != nil { + return time.Time{}, false + } + return time.Date(time.Now().Year(), ts.Month(), ts.Day(), ts.Hour(), ts.Minute(), ts.Second(), 0, time.Local), true +} + func (c *JournalCollector) journalBaseArgs(until string) []string { args := []string{"--no-pager", "--until=" + until} if c.Since == "" || c.Since == "boot" { diff --git a/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go index 3deabbe..5b927ff 100644 --- a/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go +++ b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go @@ -36,7 +36,7 @@ func TestJournalCollectorWritesParseableStructuredNDJSON(t *testing.T) { `{"MESSAGE":"password=secret","PRIORITY":"3","SYSLOG_IDENTIFIER":"systemd","_SYSTEMD_UNIT":"x.service","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"2","_BOOT_ID":"boot1"}` + "\n", ), } - fake.Commands["journalctl "+base+" --grep=oom|out of memory|killed process --case-sensitive=false"] = executor.FakeResponse{ + fake.Commands["journalctl "+base+" -k --grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm --case-sensitive=false"] = executor.FakeResponse{ ExitCode: 1, // no OOM matches Err: context.DeadlineExceeded, // ignored because exit code < 2 logic gates real errors } @@ -134,7 +134,7 @@ func TestJournalCollectorStructuredNDJSONErrorStateIsExplicit(t *testing.T) { fake.Commands["journalctl "+base+" -p err -o json 
--output-fields="+journalOutputFields+" --lines=50001"] = executor.FakeResponse{ Stdout: []byte(`{"MESSAGE":"ok","PRIORITY":"3","SYSLOG_IDENTIFIER":"a","_SYSTEMD_UNIT":"u","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b"}` + "\n"), } - fake.Commands["journalctl "+base+" --grep=oom|out of memory|killed process --case-sensitive=false"] = executor.FakeResponse{ExitCode: 1} + fake.Commands["journalctl "+base+" -k --grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm --case-sensitive=false"] = executor.FakeResponse{ExitCode: 1} for _, svc := range journalServiceUnits { fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} } diff --git a/customers/vm-troubleshooting/internal/collector/network.go b/customers/vm-troubleshooting/internal/collector/network.go index dcbdae3..a005a37 100644 --- a/customers/vm-troubleshooting/internal/collector/network.go +++ b/customers/vm-troubleshooting/internal/collector/network.go @@ -103,22 +103,23 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error path, name string args []string root bool + ignoreExit bool hint string tags []string }{ - {"network/ss_listen.txt", "ss", []string{"-tulpn"}, false, "ss", []string{"network"}}, - {"network/ss_all.txt", "ss", []string{"-tunap"}, false, "ss", []string{"network"}}, - {"network/iptables.txt", "iptables", []string{"-L", "-n", "-v"}, true, "iptables", []string{"firewall"}}, - {"network/iptables_nat.txt", "iptables", []string{"-t", "nat", "-L", "-n", "-v"}, true, "iptables", []string{"firewall"}}, - {"network/nftables.txt", "nft", []string{"list", "ruleset"}, true, "nft", []string{"firewall"}}, - {"network/firewalld_zones.txt", "firewall-cmd", []string{"--list-all-zones"}, true, "firewall-cmd", []string{"firewall"}}, - {"network/ufw_status.txt", "ufw", []string{"status", "verbose"}, true, "ufw", []string{"firewall"}}, + {"network/ss_listen.txt", "ss", []string{"-tulpn"}, false, false, 
"ss", []string{"network"}}, + {"network/ss_all.txt", "ss", []string{"-tunap"}, false, false, "ss", []string{"network"}}, + {"network/iptables.txt", "iptables", []string{"-L", "-n", "-v"}, true, true, "iptables", []string{"firewall"}}, + {"network/iptables_nat.txt", "iptables", []string{"-t", "nat", "-L", "-n", "-v"}, true, true, "iptables", []string{"firewall"}}, + {"network/nftables.txt", "nft", []string{"list", "ruleset"}, true, false, "nft", []string{"firewall"}}, + {"network/firewalld_zones.txt", "firewall-cmd", []string{"--list-all-zones"}, true, false, "firewall-cmd", []string{"firewall"}}, + {"network/ufw_status.txt", "ufw", []string{"status", "verbose"}, true, false, "ufw", []string{"firewall"}}, } { if !c.Exec.CommandExists(spec.name) { c.saveSkippedArtifact(r, spec.path, "command", executor.CommandSpec{Name: spec.name, Args: spec.args}.String(), "", SkipCommandUnavailable, spec.name+": unavailable", spec.tags...) continue } - c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, NeedsRoot: spec.root, Timeout: config.TimeoutMedium}, spec.hint, spec.tags...) + c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, NeedsRoot: spec.root, IgnoreExit: spec.ignoreExit, Timeout: config.TimeoutMedium}, spec.hint, spec.tags...) 
} return r, nil } diff --git a/customers/vm-troubleshooting/internal/collector/network_skip_test.go b/customers/vm-troubleshooting/internal/collector/network_skip_test.go index ba902f8..5b7f65d 100644 --- a/customers/vm-troubleshooting/internal/collector/network_skip_test.go +++ b/customers/vm-troubleshooting/internal/collector/network_skip_test.go @@ -2,6 +2,7 @@ package collector import ( "context" + "fmt" "os" "path/filepath" "strings" @@ -53,3 +54,33 @@ func TestNetworkCollectorWritesSkippedArtifactsForUnavailableCommands(t *testing t.Fatalf("expected skipped artifact paths for unavailable commands, got %+v", res.Skipped) } } + +func TestNetworkCollector_IptablesIgnoreExitDoesNotRecordError(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["iptables"] = true + fake.Commands["iptables -L -n -v"] = executor.FakeResponse{ + Stdout: []byte("Chain INPUT (policy ACCEPT)\n"), + ExitCode: 1, + Err: fmt.Errorf("exit status 1"), + } + fake.Commands["iptables -t nat -L -n -v"] = executor.FakeResponse{ + Stdout: []byte("Chain PREROUTING (policy ACCEPT)\n"), + ExitCode: 1, + Err: fmt.Errorf("exit status 1"), + } + + root := t.TempDir() + c := NewNetworkCollector(fake, output.NewWriter(root), ui.NoopUI{}) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + for _, e := range res.Errors { + if e.ArtifactPath == "network/iptables.txt" || e.ArtifactPath == "network/iptables_nat.txt" { + t.Fatalf("iptables non-zero exit should not be recorded as error: %+v", e) + } + } +} diff --git a/customers/vm-troubleshooting/internal/collector/nvidia.go b/customers/vm-troubleshooting/internal/collector/nvidia.go index 1d0d6d8..f9d7981 100644 --- a/customers/vm-troubleshooting/internal/collector/nvidia.go +++ b/customers/vm-troubleshooting/internal/collector/nvidia.go @@ -4,11 +4,15 @@ import ( "context" "encoding/csv" "fmt" + "io" + "os" + "path/filepath" "strings" 
"github.com/NexGenCloud/vm-diagnostics/internal/config" "github.com/NexGenCloud/vm-diagnostics/internal/executor" "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/sanitize" "github.com/NexGenCloud/vm-diagnostics/internal/ui" ) @@ -59,14 +63,20 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) for _, spec := range []struct { path string args []string + env []string }{ - {"nvidia/nvidia-smi.txt", []string{}}, - {"nvidia/nvidia-smi-q.txt", []string{"-q"}}, - {"nvidia/nvidia-smi-topo.txt", []string{"topo", "-m"}}, - {"nvidia/nvidia-smi-nvlink.txt", []string{"nvlink", "--status"}}, - {"nvidia/nvidia-smi-pmon.txt", []string{"pmon", "-s", "um", "-c", "1"}}, + {"nvidia/nvidia-smi.txt", []string{}, nil}, + {"nvidia/nvidia-smi-q.txt", []string{"-q"}, nil}, + {"nvidia/nvidia-smi-topo.txt", []string{"topo", "-m"}, []string{"TERM=dumb"}}, + {"nvidia/nvidia-smi-nvlink.txt", []string{"nvlink", "--status"}, nil}, + {"nvidia/nvidia-smi-pmon.txt", []string{"pmon", "-s", "um", "-c", "1"}, nil}, } { - c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: "nvidia-smi", Args: spec.args, Timeout: config.TimeoutMedium}, "nvidia-smi", "gpu") + c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: "nvidia-smi", Args: spec.args, Env: spec.env, Timeout: config.TimeoutMedium}, "nvidia-smi", "gpu") + if spec.path == "nvidia/nvidia-smi-topo.txt" { + if err := stripANSIArtifact(filepath.Join(c.Writer.Root(), spec.path)); err != nil { + r.RecordErrorForArtifact(ErrProbeFailed, fmt.Sprintf("strip ANSI for %s: %v", spec.path, err), spec.path) + } + } } gpuSpec := executor.CommandSpec{Name: "nvidia-smi", Args: []string{"--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"}, Timeout: config.TimeoutMedium} r.SetFact("gpu_count", "unavailable") @@ -77,24 +87,15 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) if gpuResult.Err != nil && 
!gpuResult.Skipped { r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", gpuSpec.String(), gpuResult.Err), "nvidia/gpu_summary.txt") } - if gpuResult.Err == nil && !gpuResult.Skipped { - rows, parseErr := csv.NewReader(strings.NewReader(string(gpuCsv))).ReadAll() - if parseErr == nil && len(rows) > 0 { - validShape := true - for _, row := range rows { - if len(row) < 3 { - validShape = false - break - } + if !gpuResult.Skipped && !gpuResult.TimedOut { + rows := parseGPUCSVRows(string(gpuCsv)) + if len(rows) > 0 { + r.SetFact("gpu_count", fmt.Sprintf("%d", len(rows))) + if model := strings.TrimSpace(rows[0][0]); model != "" { + r.SetFact("gpu_model", model) } - if validShape { - r.SetFact("gpu_count", fmt.Sprintf("%d", len(rows))) - if model := strings.TrimSpace(rows[0][0]); model != "" { - r.SetFact("gpu_model", model) - } - if driver := strings.TrimSpace(rows[0][2]); driver != "" { - r.SetFact("driver_version", driver) - } + if driver := strings.TrimSpace(rows[0][2]); driver != "" { + r.SetFact("driver_version", driver) } } } @@ -147,3 +148,39 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) } return r, nil } + +func parseGPUCSVRows(raw string) [][]string { + lines := strings.Split(raw, "\n") + rows := make([][]string, 0, len(lines)) + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + reader := csv.NewReader(strings.NewReader(line)) + record, err := reader.Read() + if err != nil && err != io.EOF { + continue + } + if len(record) < 3 { + continue + } + rows = append(rows, record) + } + return rows +} + +func stripANSIArtifact(path string) error { + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + clean := sanitize.StripANSI(string(data)) + if clean == string(data) { + return nil + } + return os.WriteFile(path, []byte(clean), 0o644) +} diff --git a/customers/vm-troubleshooting/internal/collector/services.go 
b/customers/vm-troubleshooting/internal/collector/services.go index 143a42d..f4dfefc 100644 --- a/customers/vm-troubleshooting/internal/collector/services.go +++ b/customers/vm-troubleshooting/internal/collector/services.go @@ -227,7 +227,7 @@ func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context SeverityWarning, ConfidenceHigh, "SVC", - "nvidia-fabricmanager inactive (expected: no NVSwitch/SXM detected)", + "nvidia-fabricmanager not running (expected: no NVSwitch/SXM detected)", "svc", string(IssueSvcFabricmanagerBenign), ) diff --git a/customers/vm-troubleshooting/internal/collector/services_test.go b/customers/vm-troubleshooting/internal/collector/services_test.go index 6ee02bc..737e544 100644 --- a/customers/vm-troubleshooting/internal/collector/services_test.go +++ b/customers/vm-troubleshooting/internal/collector/services_test.go @@ -163,10 +163,12 @@ func TestFabricManagerDowngrade_RequiresBothConditions(t *testing.T) { c.reportFailedServices(r, context.Background(), failedNames) hasWarning := false + warningMessage := "" hasCritical := false for _, issue := range r.Issues { if issue.Severity == SeverityWarning && strings.Contains(issue.Message, "no NVSwitch") { hasWarning = true + warningMessage = issue.Message } if issue.Severity == SeverityCritical { hasCritical = true @@ -177,6 +179,12 @@ func TestFabricManagerDowngrade_RequiresBothConditions(t *testing.T) { if !hasWarning { t.Error("expected WARNING issue for benign fabricmanager") } + if strings.Contains(strings.ToLower(warningMessage), "inactive") { + t.Errorf("benign message should not say inactive: %q", warningMessage) + } + if !strings.Contains(strings.ToLower(warningMessage), "not running") { + t.Errorf("benign message should say not running: %q", warningMessage) + } if hasCritical { t.Error("should not have CRITICAL issue when downgraded") } diff --git a/customers/vm-troubleshooting/internal/collector/storage.go 
b/customers/vm-troubleshooting/internal/collector/storage.go index 6cd7524..13a6b10 100644 --- a/customers/vm-troubleshooting/internal/collector/storage.go +++ b/customers/vm-troubleshooting/internal/collector/storage.go @@ -35,6 +35,9 @@ func (c *StorageCollector) Collect(ctx context.Context) (*CollectorResult, error } c.saveCommand(ctx, r, "hardware/nvme_"+filepath.Base(dev)+"_smart.txt", executor.CommandSpec{Name: "nvme", Args: []string{"smart-log", dev}, NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true}, "nvme", "storage") } + } else { + c.saveSkippedArtifact(r, "hardware/nvme_list.txt", "command", "nvme", "nvme", + SkipCommandUnavailable, "nvme: unavailable", "storage") } // SMART data for block devices (sd*, vd*, xvd*) @@ -52,6 +55,9 @@ func (c *StorageCollector) Collect(ctx context.Context) (*CollectorResult, error } } } + } else { + c.saveSkippedArtifact(r, "hardware/smart_devices.txt", "command", "smartctl", "smartctl", + SkipCommandUnavailable, "smartctl: unavailable", "storage") } return r, nil diff --git a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go index 1eb474a..d789230 100644 --- a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go +++ b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go @@ -24,7 +24,7 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "3.0.0", + SchemaVersion: "3.1.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", diff --git a/customers/vm-troubleshooting/internal/output/manifest_test.go b/customers/vm-troubleshooting/internal/output/manifest_test.go index e697de6..aa9d3da 100644 --- a/customers/vm-troubleshooting/internal/output/manifest_test.go +++ b/customers/vm-troubleshooting/internal/output/manifest_test.go @@ -37,7 +37,7 @@ func 
TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "3.0.0", + SchemaVersion: "3.1.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", diff --git a/customers/vm-troubleshooting/internal/output/report.go b/customers/vm-troubleshooting/internal/output/report.go index 4e91280..ae5fea1 100644 --- a/customers/vm-troubleshooting/internal/output/report.go +++ b/customers/vm-troubleshooting/internal/output/report.go @@ -45,7 +45,7 @@ type ReportRecord struct { ErrorCount int `json:"error_count,omitempty"` } -const reportSchemaVersion = "3.0.0" +const reportSchemaVersion = "3.1.0" // WriteReport writes report.ndjson from manifest input data. // Order is deterministic: per collector (registration order) → artifacts → issues → facts (sorted) → summary. diff --git a/customers/vm-troubleshooting/internal/platform/dcgm.go b/customers/vm-troubleshooting/internal/platform/dcgm.go index 633e00a..36c51b2 100644 --- a/customers/vm-troubleshooting/internal/platform/dcgm.go +++ b/customers/vm-troubleshooting/internal/platform/dcgm.go @@ -10,25 +10,14 @@ import ( func DetectDCGM(ctx context.Context, exec executor.Executor, distro DistroInfo) bool { if exec.CommandExists("dcgmi") { - // Verify dcgmi can actually talk to the driver with a lightweight probe. - // A broken install (library mismatch, corrupted package) will fail quickly. 
- result, _, _ := exec.Capture(ctx, executor.CommandSpec{ - Name: "dcgmi", - Args: []string{"discovery", "-l"}, - NeedsRoot: true, - Timeout: detectTimeout, - }, 64*1024) - if result.Err == nil && !result.Skipped { - return true - } - // dcgmi exists but can't connect — fall through to package detection + return true } switch distro.Family { case "debian": if !exec.CommandExists("dpkg-query") { return false } - _, stdout, _ := exec.Capture(ctx, executor.CommandSpec{Name: "dpkg-query", Args: []string{"-W", "-f=${Package}\n", "datacenter-gpu-manager-4-cuda*"}, Timeout: detectTimeout}, 128*1024) + _, stdout, _ := exec.Capture(ctx, executor.CommandSpec{Name: "dpkg-query", Args: []string{"-W", "-f=${Package}\n", "datacenter-gpu-manager", "datacenter-gpu-manager-4-cuda*"}, Timeout: detectTimeout}, 128*1024) return strings.Contains(string(stdout), "datacenter-gpu-manager") case "rhel", "suse": if !exec.CommandExists("rpm") { diff --git a/customers/vm-troubleshooting/internal/platform/dcgm_test.go b/customers/vm-troubleshooting/internal/platform/dcgm_test.go new file mode 100644 index 0000000..d829096 --- /dev/null +++ b/customers/vm-troubleshooting/internal/platform/dcgm_test.go @@ -0,0 +1,33 @@ +package platform + +import ( + "context" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/executor" +) + +func TestDetectDCGMReturnsTrueWhenDcgmiExists(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.Binaries["dcgmi"] = true + + if !DetectDCGM(context.Background(), fake, DistroInfo{Family: "debian"}) { + t.Fatal("expected dcgm detection to succeed when dcgmi is present") + } +} + +func TestDetectDCGMFallsBackToPackageQuery(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.Binaries["dpkg-query"] = true + fake.Commands["dpkg-query -W -f=${Package}\n datacenter-gpu-manager datacenter-gpu-manager-4-cuda*"] = executor.FakeResponse{ + Stdout: []byte("datacenter-gpu-manager\n"), + } + + if !DetectDCGM(context.Background(), fake, 
DistroInfo{Family: "debian"}) { + t.Fatal("expected dcgm detection to succeed from package metadata") + } +} diff --git a/customers/vm-troubleshooting/internal/privilege/privilege.go b/customers/vm-troubleshooting/internal/privilege/privilege.go index 94842dd..c1a6f6d 100644 --- a/customers/vm-troubleshooting/internal/privilege/privilege.go +++ b/customers/vm-troubleshooting/internal/privilege/privilege.go @@ -2,40 +2,97 @@ package privilege import ( "context" + "fmt" "os" "os/exec" + "syscall" "time" + + "github.com/mattn/go-isatty" +) + +const ( + sudoProbeTimeout = 15 * time.Second + + // envReexecGuard prevents infinite re-exec loops. Set by ReexecUnderSudo + // before replacing the process with sudo, so the re-exec'd child knows + // not to try again. + envReexecGuard = "GATHER_INFO_SUDO_REEXEC" ) +// State reports whether root-level privilege is available and why. type State struct { HasRoot bool + Reason string } +// Detect checks for root without any user interaction. func Detect(ctx context.Context) State { if os.Geteuid() == 0 { - return State{HasRoot: true} + return State{HasRoot: true, Reason: "effective uid is 0"} } if _, err := exec.LookPath("sudo"); err != nil { - return State{} + return State{Reason: "sudo not found on PATH"} } - probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second) - defer cancel() - if err := exec.CommandContext(probeCtx, "sudo", "-n", "true").Run(); err == nil { - return State{HasRoot: true} - } - return State{} + return probeSudo(ctx, sudoProbeTimeout) } -func Acquire(ctx context.Context) State { +// ReexecUnderSudo replaces the current process with sudo . +// If the current process is already root or this is already a re-exec'd +// child, it returns nil immediately and the caller should continue normally. +// On successful exec the function never returns (the process is replaced). +// An error return means sudo is unavailable or exec failed; the caller +// should treat this as a fatal startup error. 
+func ReexecUnderSudo(ctx context.Context, args []string) error { if os.Geteuid() == 0 { - return State{HasRoot: true} + return nil // already root + } + if os.Getenv(envReexecGuard) == "1" { + // We already tried re-exec and still aren't root. Don't loop. + return fmt.Errorf("re-exec under sudo did not yield root (euid=%d)", os.Geteuid()) + } + + sudoPath, err := exec.LookPath("sudo") + if err != nil { + return fmt.Errorf("sudo not found on PATH: %w", err) + } + + binary, err := os.Executable() + if err != nil { + return fmt.Errorf("cannot determine executable path: %w", err) + } + + // Check whether sudo will need a password so we can show a message. + needsPassword := probeSudo(ctx, sudoProbeTimeout).HasRoot == false + + if needsPassword { + if !isatty.IsTerminal(os.Stdin.Fd()) { + return fmt.Errorf("sudo requires a password but stdin is not a terminal") + } + fmt.Fprintln(os.Stderr, "Root privileges required for full diagnostics.") + fmt.Fprintln(os.Stderr, "You may be prompted for your password.") + fmt.Fprintln(os.Stderr) } - cmd := exec.CommandContext(ctx, "sudo", "-v") - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stderr - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - return State{} + + // Build argv: sudo [-E] + // -E preserves environment (e.g. TERM, locale) for consistent output. + argv := make([]string, 0, 3+len(args)) + argv = append(argv, "sudo", "-E", binary) + argv = append(argv, args...) + + // Set the re-exec guard so the child doesn't loop. + env := append(os.Environ(), envReexecGuard+"=1") + + // Replace process. On success this never returns. 
+ return syscall.Exec(sudoPath, argv, env) +} + +func probeSudo(ctx context.Context, timeout time.Duration) State { + probeCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + if err := exec.CommandContext(probeCtx, "sudo", "-n", "true").Run(); err == nil { + return State{HasRoot: true, Reason: "sudo -n true succeeded"} + } else { + return State{Reason: fmt.Sprintf("sudo -n true failed: %v", err)} } - return State{HasRoot: true} } diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index 3389936..b4ec71a 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -37,28 +37,52 @@ type rootSetter interface { SetHasRoot(bool) } +var detectPrivilege = privilege.Detect + +// bootstrapPrivileges verifies that root was obtained by the earlier +// ReexecUnderSudo call and marks the executor accordingly. +// Privilege escalation happens once at process start (cli/root.go) via +// re-exec under sudo, so this is just a confirmation check. 
+func bootstrapPrivileges(ctx context.Context, u ui.UI, exec executor.Executor) (privilege.State, error) { + priv := detectPrivilege(ctx) + u.Verbose(fmt.Sprintf("Privilege check: root=%t (%s)", priv.HasRoot, priv.Reason)) + if !priv.HasRoot { + return privilege.State{}, fmt.Errorf("root privileges required but not available (%s)", priv.Reason) + } + if re, ok := exec.(rootSetter); ok { + re.SetHasRoot(true) + } + return priv, nil +} + func New(cfg *config.Config, u ui.UI, exec executor.Executor) *Runner { return &Runner{Config: cfg, UI: u, Exec: exec} } func (r *Runner) Run(ctx context.Context) (*RunResult, error) { r.UI.Banner() + startupSp := r.UI.StartSpinner("Checking system state...") - // Detect system state - priv := privilege.Detect(ctx) - if re, ok := r.Exec.(rootSetter); ok { - re.SetHasRoot(priv.HasRoot) + // Detect privilege first so the tool either escalates or fails before + // collecting partial data from a non-root session. + startupSp.Update("Detecting privilege level...") + if _, err := bootstrapPrivileges(ctx, r.UI, r.Exec); err != nil { + startupSp.Fail("Root escalation failed") + return &RunResult{ExitCode: config.ExitFatal}, err } + + // Detect system state + startupSp.Update("Detecting distro...") distro := platform.DetectDistro() + startupSp.Update("Detecting NVIDIA GPU...") hasGPU := platform.DetectNvidiaGPU(ctx, r.Exec) + startupSp.Update("Detecting DCGM...") hasDCGM := platform.DetectDCGM(ctx, r.Exec, distro) host, _ := os.Hostname() + startupSp.Success("System state detected") // System info box rootStatus := "yes" - if !priv.HasRoot { - rootStatus = "no (some diagnostics will be skipped)" - } gpuStatus := "not detected" if hasGPU { gpuStatus = "detected" @@ -83,13 +107,6 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { // DCGM install prompt if hasGPU && !hasDCGM && uiAllowedInstall(r.UI, r.Config) { - if !r.Exec.HasRoot() && r.UI.IsInteractive() { - if st := privilege.Acquire(ctx); st.HasRoot { - if re, ok := 
r.Exec.(rootSetter); ok { - re.SetHasRoot(true) - } - } - } if err := install.PromptAndInstallDCGM(ctx, r.Exec, r.UI, distro); err != nil { r.UI.Warn(err.Error()) } @@ -321,7 +338,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } manifestMeta := output.ManifestMeta{ - SchemaVersion: "3.0.0", + SchemaVersion: "3.1.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: archiveName, Version: config.Version, @@ -357,7 +374,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } cmds := transfer.GenerateTransferCommands(archivePath, floatingIP, privateIP, usr) - candidates := transfer.GenerateCandidates(host, usr, ips) + candidates := transfer.GenerateCandidates(host, usr, floatingIP, ips) if err := writer.SaveOutput("transfer_commands.txt", cmds+"\n"+candidates); err != nil { r.UI.Warn("Failed to write transfer_commands.txt: " + err.Error()) } diff --git a/customers/vm-troubleshooting/internal/runner/runner_test.go b/customers/vm-troubleshooting/internal/runner/runner_test.go index b551195..bd5f172 100644 --- a/customers/vm-troubleshooting/internal/runner/runner_test.go +++ b/customers/vm-troubleshooting/internal/runner/runner_test.go @@ -1,14 +1,54 @@ package runner import ( + "context" "os" "path/filepath" "testing" "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/privilege" "github.com/NexGenCloud/vm-diagnostics/internal/triage" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" ) +func patchDetectPrivilege(t *testing.T, fn func(context.Context) privilege.State) { + t.Helper() + old := detectPrivilege + detectPrivilege = fn + t.Cleanup(func() { detectPrivilege = old }) +} + +func TestBootstrapPrivilegesFailsWithoutRoot(t *testing.T) { + patchDetectPrivilege(t, func(context.Context) privilege.State { + return privilege.State{Reason: "not root"} + }) + + exec := executor.NewReal(false) + if _, err := 
bootstrapPrivileges(context.Background(), ui.NoopUI{}, exec); err == nil { + t.Fatal("expected bootstrap to fail without root") + } + if exec.HasRoot() { + t.Fatal("exec should not be marked root on bootstrap failure") + } +} + +func TestBootstrapPrivilegesSetsRootOnExecutor(t *testing.T) { + patchDetectPrivilege(t, func(context.Context) privilege.State { + return privilege.State{HasRoot: true, Reason: "test"} + }) + + exec := executor.NewReal(false) + state, err := bootstrapPrivileges(context.Background(), ui.NoopUI{}, exec) + if err != nil { + t.Fatalf("expected bootstrap to succeed, got %v", err) + } + if !state.HasRoot || !exec.HasRoot() { + t.Fatal("expected root to be set on executor") + } +} + func TestValidateOutputDir(t *testing.T) { t.Parallel() diff --git a/customers/vm-troubleshooting/internal/sanitize/sanitize.go b/customers/vm-troubleshooting/internal/sanitize/sanitize.go index 6c6ce3b..f59c747 100644 --- a/customers/vm-troubleshooting/internal/sanitize/sanitize.go +++ b/customers/vm-troubleshooting/internal/sanitize/sanitize.go @@ -14,6 +14,7 @@ var ( processFlagRE = regexp.MustCompile(`(?i)--(password|passwd|secret|token|api[_-]?key|credential)(?:=|\s+)([^\s]+)`) authorizationRE = regexp.MustCompile(`(?i)(Authorization:\s*)(Bearer|Basic)\s+[^\s]+`) commonEnvRE = regexp.MustCompile(`(?i)\b(AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|OPENAI_API_KEY|ANTHROPIC_API_KEY|HF_TOKEN|HUGGINGFACE_API_KEY|DATABASE_URL|PGPASSWORD|MYSQL_PWD|REDIS_PASSWORD)=[^\s]+`) + ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;?]*[ -/]*[@-~]`) ) func DockerInspect(input string) string { @@ -132,3 +133,8 @@ func GrepLines(input, pattern string) []string { } return matches } + +// StripANSI removes ANSI escape sequences from captured command output. 
+func StripANSI(input string) string { + return ansiEscapeRE.ReplaceAllString(input, "") +} diff --git a/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go b/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go index f63469a..35c9833 100644 --- a/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go +++ b/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go @@ -153,3 +153,20 @@ func TestGrepLines(t *testing.T) { t.Errorf("expected nil for invalid regex, got %v", got) } } + +func TestStripANSI(t *testing.T) { + t.Parallel() + in := "\x1b[4mGPU0\x1b[0m \x1b[1;31mX\x1b[0m" + got := StripANSI(in) + if got != "GPU0 X" { + t.Fatalf("unexpected ANSI stripping result: %q", got) + } +} + +func TestStripANSINoSequences(t *testing.T) { + t.Parallel() + in := "plain text" + if got := StripANSI(in); got != in { + t.Fatalf("expected unchanged input, got %q", got) + } +} diff --git a/customers/vm-troubleshooting/internal/transfer/commands.go b/customers/vm-troubleshooting/internal/transfer/commands.go index e8355ae..067e093 100644 --- a/customers/vm-troubleshooting/internal/transfer/commands.go +++ b/customers/vm-troubleshooting/internal/transfer/commands.go @@ -22,8 +22,23 @@ var currentUser = user.Current type IPInfo struct { Address string Interface string - IsPrivate bool - HasDefaultRoute bool // interface carries the default IPv4 route + Visibility string // "public", "private", or "shared" (RFC 6598 CGNAT) + HasDefaultRoute bool // interface carries the default IPv4 route +} + +// cgnatBlock is the RFC 6598 Shared Address Space (100.64.0.0/10), +// used by Carrier-Grade NAT. Go's net.IP.IsPrivate() only covers RFC 1918. +var cgnatBlock = net.IPNet{IP: net.IP{100, 64, 0, 0}, Mask: net.CIDRMask(10, 32)} + +// ipVisibility classifies an IPv4 address for the transfer display. 
+func ipVisibility(ip net.IP) string { + if ip.IsPrivate() { + return "private" // RFC 1918 + } + if cgnatBlock.Contains(ip) { + return "shared" // RFC 6598 CGNAT + } + return "public" } // defaultRouteIfaceIndex returns the interface index for the default IPv4 route @@ -68,46 +83,50 @@ func DiscoverIPs() []IPInfo { ips = append(ips, IPInfo{ Address: ip.String(), Interface: iface.Name, - IsPrivate: ip.IsPrivate(), + Visibility: ipVisibility(ip), HasDefaultRoute: iface.Index == defaultIfIdx, }) } } - // Sort: default-route first, then public before private, alphabetical within group. + // Sort: default-route first, then public before shared before private, alphabetical within group. // Total order: Address+Interface is the tiebreaker so no ties are possible. slices.SortStableFunc(ips, compareIPs) return ips } // compareIPs defines a total order for IP entries: default-route first, -// then public before private, then alphabetical by address, then by interface name. +// then public before shared before private, then alphabetical by address, then by interface name. func compareIPs(a, b IPInfo) int { return cmp.Or( - boolDesc(a.HasDefaultRoute, b.HasDefaultRoute), // default-route first - boolAsc(a.IsPrivate, b.IsPrivate), // public (false) before private (true) - cmp.Compare(a.Address, b.Address), // address tiebreaker - cmp.Compare(a.Interface, b.Interface), // interface tiebreaker (same IP on multiple ifaces) + boolDesc(a.HasDefaultRoute, b.HasDefaultRoute), // default-route first + cmp.Compare(visibilityOrder(a.Visibility), visibilityOrder(b.Visibility)), // public → shared → private + cmp.Compare(a.Address, b.Address), // address tiebreaker + cmp.Compare(a.Interface, b.Interface), // interface tiebreaker (same IP on multiple ifaces) ) } -func boolDesc(a, b bool) int { - if a == b { +// visibilityOrder returns the sort rank: public first, shared second, private last. 
+func visibilityOrder(v string) int { + switch v { + case "public": return 0 + case "shared": + return 1 + case "private": + return 2 + default: + return 3 } - if a { - return -1 - } - return 1 } -func boolAsc(a, b bool) int { +func boolDesc(a, b bool) int { if a == b { return 0 } if a { - return 1 + return -1 } - return -1 + return 1 } // DetectUsername returns the likely SSH username. @@ -201,15 +220,19 @@ func GenerateTransferCommands(archivePath, floatingIP, privateIP, username strin } // GenerateCandidates produces the candidate address section text. -func GenerateCandidates(hostname, username string, ips []IPInfo) string { +func GenerateCandidates(hostname, username, floatingIP string, ips []IPInfo) string { var b strings.Builder - // Sort display: default-route first, then public, then private + // Sort display: default-route first, then public, shared, then private sorted := make([]IPInfo, len(ips)) copy(sorted, ips) slices.SortStableFunc(sorted, compareIPs) b.WriteString(fmt.Sprintf("Hostname: %s\n", hostname)) + if floatingIP != "" { + b.WriteString("\nFloating/Public IP (metadata):\n") + b.WriteString(fmt.Sprintf(" %s\n", floatingIP)) + } if len(sorted) > 0 { addressWidth := len("Address") ifaceWidth := len("Interface") @@ -225,15 +248,11 @@ func GenerateCandidates(hostname, username string, ips []IPInfo) string { b.WriteString("\n") b.WriteString(fmt.Sprintf("%-*s %-*s %-10s %s\n", addressWidth, "Address", ifaceWidth, "Interface", "Visibility", "Route")) for _, ip := range sorted { - visibility := "public" - if ip.IsPrivate { - visibility = "private" - } route := "-" if ip.HasDefaultRoute { route = "default ★" } - b.WriteString(fmt.Sprintf("%-*s %-*s %-10s %s\n", addressWidth, ip.Address, ifaceWidth, ip.Interface, visibility, route)) + b.WriteString(fmt.Sprintf("%-*s %-*s %-10s %s\n", addressWidth, ip.Address, ifaceWidth, ip.Interface, ip.Visibility, route)) } } diff --git a/customers/vm-troubleshooting/internal/transfer/commands_test.go 
b/customers/vm-troubleshooting/internal/transfer/commands_test.go index e82ba06..983a37e 100644 --- a/customers/vm-troubleshooting/internal/transfer/commands_test.go +++ b/customers/vm-troubleshooting/internal/transfer/commands_test.go @@ -8,32 +8,40 @@ import ( "testing" ) -func TestClassifyIP(t *testing.T) { +func TestIPVisibility(t *testing.T) { t.Parallel() tests := []struct { - ip string - private bool + ip string + want string }{ - {"10.0.0.1", true}, - {"10.255.255.255", true}, - {"172.16.0.1", true}, - {"172.31.255.255", true}, - {"192.168.0.1", true}, - {"192.168.255.255", true}, - {"203.0.113.10", false}, - {"8.8.8.8", false}, - {"1.1.1.1", false}, - {"172.15.0.1", false}, - {"172.32.0.1", false}, + // RFC 1918 private + {"10.0.0.1", "private"}, + {"10.255.255.255", "private"}, + {"172.16.0.1", "private"}, + {"172.31.255.255", "private"}, + {"192.168.0.1", "private"}, + {"192.168.255.255", "private"}, + // RFC 6598 CGNAT (100.64.0.0/10) + {"100.64.0.1", "shared"}, + {"100.65.30.184", "shared"}, + {"100.127.255.254", "shared"}, + // Public + {"203.0.113.10", "public"}, + {"8.8.8.8", "public"}, + {"1.1.1.1", "public"}, + {"172.15.0.1", "public"}, + {"172.32.0.1", "public"}, + {"100.128.0.1", "public"}, // just above CGNAT range + {"100.63.255.255", "public"}, // just below CGNAT range } for _, tt := range tests { ip := net.ParseIP(tt.ip) if ip == nil { t.Fatalf("invalid IP: %s", tt.ip) } - got := ip.IsPrivate() - if got != tt.private { - t.Errorf("IP %s: IsPrivate()=%v, want %v", tt.ip, got, tt.private) + got := ipVisibility(ip) + if got != tt.want { + t.Errorf("IP %s: ipVisibility()=%q, want %q", tt.ip, got, tt.want) } } } @@ -144,19 +152,30 @@ func TestTransferCommands_NoFloatingIP_NoPrivateIP(t *testing.T) { func TestGenerateCandidates_AlwaysHasHostname(t *testing.T) { t.Parallel() - out := GenerateCandidates("my-server-42", "ubuntu", nil) + out := GenerateCandidates("my-server-42", "ubuntu", "", nil) if !strings.Contains(out, "my-server-42") { 
t.Error("hostname missing from candidates") } } +func TestGenerateCandidates_IncludesFloatingIPMetadata(t *testing.T) { + t.Parallel() + out := GenerateCandidates("my-server-42", "ubuntu", "203.0.113.10", nil) + if !strings.Contains(out, "Floating/Public IP (metadata)") { + t.Fatal("floating/public IP metadata header missing") + } + if !strings.Contains(out, "203.0.113.10") { + t.Fatal("floating/public IP value missing") + } +} + func TestGenerateCandidates_PublicIPFirst(t *testing.T) { t.Parallel() ips := []IPInfo{ - {Address: "10.0.0.5", Interface: "eth0", IsPrivate: true}, - {Address: "203.0.113.10", Interface: "eth1", IsPrivate: false}, + {Address: "10.0.0.5", Interface: "eth0", Visibility: "private"}, + {Address: "203.0.113.10", Interface: "eth1", Visibility: "public"}, } - out := GenerateCandidates("myhost", "ubuntu", ips) + out := GenerateCandidates("myhost", "ubuntu", "", ips) pubIdx := strings.Index(out, "203.0.113.10") privIdx := strings.Index(out, "10.0.0.5") if pubIdx < 0 || privIdx < 0 { @@ -170,10 +189,10 @@ func TestGenerateCandidates_PublicIPFirst(t *testing.T) { func TestGenerateCandidates_DefaultRouteFirst(t *testing.T) { t.Parallel() ips := []IPInfo{ - {Address: "10.0.0.5", Interface: "eth0", IsPrivate: true, HasDefaultRoute: false}, - {Address: "10.0.0.1", Interface: "eth1", IsPrivate: true, HasDefaultRoute: true}, + {Address: "10.0.0.5", Interface: "eth0", Visibility: "private", HasDefaultRoute: false}, + {Address: "10.0.0.1", Interface: "eth1", Visibility: "private", HasDefaultRoute: true}, } - out := GenerateCandidates("myhost", "ubuntu", ips) + out := GenerateCandidates("myhost", "ubuntu", "", ips) defIdx := strings.Index(out, "10.0.0.1") otherIdx := strings.Index(out, "10.0.0.5") if defIdx < 0 || otherIdx < 0 { @@ -190,7 +209,7 @@ func TestGenerateCandidates_DefaultRouteFirst(t *testing.T) { func TestGenerateCandidates_DefaultRouteColumn(t *testing.T) { t.Parallel() ips := []IPInfo{{Address: "203.0.113.10", Interface: "eth0", 
HasDefaultRoute: true}} - out := GenerateCandidates("myhost", "ubuntu", ips) + out := GenerateCandidates("myhost", "ubuntu", "", ips) if !strings.Contains(out, "default ★") { t.Fatal("default route column marker missing") } diff --git a/customers/vm-troubleshooting/internal/triage/critical.go b/customers/vm-troubleshooting/internal/triage/critical.go index 8b482e3..9e46ea6 100644 --- a/customers/vm-troubleshooting/internal/triage/critical.go +++ b/customers/vm-troubleshooting/internal/triage/critical.go @@ -11,6 +11,7 @@ import ( "github.com/NexGenCloud/vm-diagnostics/internal/collector" "github.com/NexGenCloud/vm-diagnostics/internal/identity" + "github.com/NexGenCloud/vm-diagnostics/internal/triage/xidcatalog" ) // CriticalPattern describes a high-impact log pattern to search for. @@ -23,6 +24,7 @@ type CriticalPattern struct { Category string // Confidence controls SUMMARY visibility (high shown, low hidden). Confidence collector.Confidence + KernelOnly bool } // criticalPatterns are high-confidence patterns applied to all log sources. 
@@ -33,10 +35,31 @@ var criticalPatterns = []CriticalPattern{ Name: "Kernel Panic", Code: FindingCriticalLog, FingerprintKey: "kernel_panic", - Pattern: regexp.MustCompile(`(?i)(kernel panic|BUG:|call trace)`), + Pattern: regexp.MustCompile(`(?i)\bkernel panic\b|\bpanic - not syncing\b`), Severity: collector.SeverityCritical, Category: "KERN", Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "Kernel BUG", + Code: FindingCriticalLog, + FingerprintKey: "kernel_bug", + Pattern: regexp.MustCompile(`(?i)\bBUG(?::|\s)`), + Severity: collector.SeverityWarning, + Category: "KERN", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "Kernel Stack Trace", + Code: FindingCriticalLog, + FingerprintKey: "kernel_stack_trace", + Pattern: regexp.MustCompile(`(?i)\bCall Trace:`), + Severity: collector.SeverityInfo, + Category: "KERN", + Confidence: collector.ConfidenceLow, + KernelOnly: true, }, { Name: "Hardware Error", @@ -46,6 +69,7 @@ var criticalPatterns = []CriticalPattern{ Severity: collector.SeverityCritical, Category: "HW", Confidence: collector.ConfidenceHigh, + KernelOnly: true, }, { Name: "Fallen Off Bus", @@ -55,6 +79,34 @@ var criticalPatterns = []CriticalPattern{ Severity: collector.SeverityCritical, Category: "GPU", Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "NVRM RPC Failure", + Code: FindingCriticalLog, + FingerprintKey: "nvrm_rpc_failure", + Pattern: regexp.MustCompile(`(?i)NVRM:.*rpcSendMessage failed`), + Severity: collector.SeverityWarning, + Category: "GPU", + Confidence: collector.ConfidenceHigh, + }, + { + Name: "NVRM Driver Assertion", + Code: FindingCriticalLog, + FingerprintKey: "nvrm_driver_assertion", + Pattern: regexp.MustCompile(`(?i)NVRM:.*(?:Assertion failed|Check failed)`), + Severity: collector.SeverityWarning, + Category: "GPU", + Confidence: collector.ConfidenceHigh, + }, + { + Name: "Segfault", + Code: FindingCriticalLog, + FingerprintKey: "segfault", + Pattern: 
regexp.MustCompile(`(?i)\bsegfault\b`), + Severity: collector.SeverityWarning, + Category: "PROC", + Confidence: collector.ConfidenceHigh, }, { Name: "Timeout", @@ -83,6 +135,32 @@ var lowConfidencePatterns = []CriticalPattern{ const maxEvents = 100 +var ( + pidBracketRe = regexp.MustCompile(`\[\s*\d+\]`) + kernelTsRe = regexp.MustCompile(`\[\s*\d+\.\d+\]`) + // Matches "from port " patterns in SSH/syslog messages. + ipPortRe = regexp.MustCompile(`from \d+\.\d+\.\d+\.\d+ port \d+`) + explicitHexRe = regexp.MustCompile(`\b0x[0-9a-fA-F]+\b`) + bareHexTokenRe = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`) +) + +// normalizeCriticalLine strips high-cardinality tokens for deduplication. +// The original line is preserved in evidence; the normalized form is only +// used for dedup keys and fingerprint generation. +func normalizeCriticalLine(line string) string { + n := pidBracketRe.ReplaceAllString(line, "[_]") + n = kernelTsRe.ReplaceAllString(n, "[_]") + n = ipPortRe.ReplaceAllString(n, "_._._._:_") + n = explicitHexRe.ReplaceAllString(n, "0xHEX") + n = bareHexTokenRe.ReplaceAllStringFunc(n, func(token string) string { + if !strings.ContainsAny(strings.ToLower(token), "abcdef") { + return token + } + return "HEX" + }) + return n +} + // criticalEvent is an internal deduplication record. type criticalEvent struct { pattern string @@ -97,6 +175,12 @@ type criticalEvent struct { unresolved []string } +type criticalSourceLine struct { + text string + kernelOrigin bool + xidFormatted bool +} + // AnalyzeCriticalLogs scans collected log artifacts for high-impact patterns. func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, error) { type sourceSpec struct { @@ -117,7 +201,10 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro sourceSpec state ArtifactState content string - lines []string + lines []criticalSourceLine + // NDJSON truncation metadata from the source itself. 
+ truncated bool + truncateReason string // canonical structured artifacts that should exist but were unavailable. unresolved []string } @@ -128,7 +215,10 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro state, content := checkArtifact(workDir, src.path) checked[i] = checkedSource{sourceSpec: src, state: state, content: content} if state == ArtifactUsable { - checked[i].lines = sourceLinesForCritical(src.kind, content) + lines, truncated, reason := sourceLinesForCritical(src.kind, content, src.path) + checked[i].lines = lines + checked[i].truncated = truncated + checked[i].truncateReason = reason } // Structured source unavailable -> fallback to text while retaining unresolved path. if state != ArtifactUsable && src.fallback != "" { @@ -136,7 +226,9 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro if fbState == ArtifactUsable { checked[i].state = ArtifactUsable checked[i].content = fbContent - checked[i].lines = sourceLinesForCritical("text", fbContent) + lines, _, _ := sourceLinesForCritical("text", fbContent, src.fallback) + checked[i].lines = lines + checked[i].kind = "text" checked[i].path = src.fallback checked[i].unresolved = []string{src.path} } else if fbState == ArtifactSkipped { @@ -160,6 +252,30 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro }, nil } + var dataQualityFindings []Finding + for _, src := range checked { + if src.state != ArtifactUsable || src.kind != "ndjson" || !src.truncated { + continue + } + reasonText := "size limit" + if src.truncateReason == "record_limit" { + reasonText = "record limit" + } + dataQualityFindings = append(dataQualityFindings, Finding{ + Code: FindingDataQuality, + Severity: collector.SeverityWarning, + Confidence: collector.ConfidenceHigh, + Category: "DATA", + Title: "Critical Log Source Truncated", + Description: fmt.Sprintf("%s was truncated by %s; findings may be incomplete", src.path, reasonText), + Action: 
"Rerun with a narrower --journal-since window to reduce journal volume", + SourceArtifacts: []string{ + src.path, + }, + Fingerprint: identity.Fingerprint("triage", "critical_events", "data_quality", src.path, reasonText), + }) + } + // Dedup by (pattern_name, line_hash) to avoid counting the same line from multiple sources type dedupKey struct { pattern string @@ -178,14 +294,22 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro patterns = append(patterns, lowConfidencePatterns...) } - for _, line := range src.lines { - line = strings.TrimSpace(line) + for _, srcLine := range src.lines { + line := strings.TrimSpace(srcLine.text) if line == "" || strings.HasPrefix(line, "#") { continue } for _, p := range patterns { + if p.KernelOnly && !srcLine.kernelOrigin { + continue + } + if p.Name == "Fallen Off Bus" && srcLine.xidFormatted { + // Xid/SXid ownership belongs to xid.go; do not duplicate those + // lines in critical_log findings. + continue + } if p.Pattern.MatchString(line) { - k := dedupKey{p.Name, line} + k := dedupKey{p.Name, normalizeCriticalLine(line)} if existing, ok := seen[k]; ok { existing.count++ } else { @@ -210,10 +334,20 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro } if len(eventOrder) == 0 { + text := "No critical log events found.\n" + if len(dataQualityFindings) > 0 { + lines := []string{"Critical Log Analysis: 0 event(s)\n"} + for _, dq := range dataQualityFindings { + lines = append(lines, fmt.Sprintf(" [warning] %s: %s", dq.Title, dq.Description)) + } + lines = append(lines, "", "No critical log events found in the available log subset.") + text = strings.Join(lines, "\n") + "\n" + } return &TriageResult{ - Name: "critical_events", - Facts: map[string]string{"critical_event_count": "0"}, - Text: "No critical log events found.\n", + Name: "critical_events", + Findings: dataQualityFindings, + Facts: map[string]string{"critical_event_count": "0"}, + Text: text, }, nil } @@ 
-242,9 +376,13 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro } // Build findings and text - var findings []Finding + findings := make([]Finding, 0, len(dataQualityFindings)+len(events)) + findings = append(findings, dataQualityFindings...) var textLines []string textLines = append(textLines, fmt.Sprintf("Critical Log Analysis: %d event(s)\n", totalCount)) + for _, dq := range dataQualityFindings { + textLines = append(textLines, fmt.Sprintf(" [warning] %s: %s", dq.Title, dq.Description)) + } for _, ev := range events { // Truncate long lines for readability @@ -264,7 +402,7 @@ func AnalyzeCriticalLogs(_ context.Context, workDir string) (*TriageResult, erro SourceArtifacts: []string{ ev.source, }, - Fingerprint: identity.Fingerprint("crit", ev.fingerprintKey, criticalSourceClass(ev.source), ev.line), + Fingerprint: identity.Fingerprint("crit", ev.fingerprintKey, criticalSourceClass(ev.source), normalizeCriticalLine(ev.line)), UnresolvedArtifactPaths: append([]string(nil), ev.unresolved...), }) @@ -302,19 +440,56 @@ func criticalSourceClass(path string) string { } } -func sourceLinesForCritical(kind, content string) []string { +func sourceLinesForCritical(kind, content, path string) ([]criticalSourceLine, bool, string) { switch kind { case "ndjson": - events := parseJournalNDJSON(content) - lines := make([]string, 0, len(events)) - for _, ev := range events { + parsed := parseJournalNDJSONWithMeta(content) + lines := make([]criticalSourceLine, 0, len(parsed.Events)) + for _, ev := range parsed.Events { if strings.TrimSpace(ev.Message) == "" { continue } - lines = append(lines, ev.Message) + lines = append(lines, criticalSourceLine{ + text: ev.Message, + kernelOrigin: isKernelEvent(path, ev), + xidFormatted: isXidOrSXidLine(ev.Message), + }) } - return lines + return lines, parsed.Truncated, parsed.TruncationReason default: - return strings.Split(content, "\n") + rawLines := strings.Split(content, "\n") + lines := 
make([]criticalSourceLine, 0, len(rawLines)) + kernelOrigin := path == "logs/dmesg.txt" || path == "logs/journal_kernel.txt" + for _, line := range rawLines { + lines = append(lines, criticalSourceLine{ + text: line, + kernelOrigin: kernelOrigin, + xidFormatted: isXidOrSXidLine(line), + }) + } + return lines, false, "" + } +} + +func isKernelEvent(sourcePath string, ev normalizedJournalEvent) bool { + if sourcePath == "logs/journal_kernel.ndjson" { + return true + } + if strings.EqualFold(ev.Transport, "kernel") { + return true + } + if strings.EqualFold(ev.SyslogIdentifier, "kernel") { + return true + } + return false +} + +func isXidOrSXidLine(line string) bool { + if _, _, ok := xidcatalog.ParseKernelLine(line); ok { + return true + } + if _, _, ok := xidcatalog.ParseKernelSXidLine(line); ok { + return true } + return false } diff --git a/customers/vm-troubleshooting/internal/triage/critical_test.go b/customers/vm-troubleshooting/internal/triage/critical_test.go index 2fa4939..0126f73 100644 --- a/customers/vm-troubleshooting/internal/triage/critical_test.go +++ b/customers/vm-troubleshooting/internal/triage/critical_test.go @@ -62,7 +62,7 @@ func TestAnalyzeCriticalLogs_LowConfidence(t *testing.T) { if len(f.SourceArtifacts) != 1 || f.SourceArtifacts[0] != "logs/journal_errors.txt" { t.Errorf("expected source_artifacts to contain journal_errors, got %#v", f.SourceArtifacts) } - wantFP := identity.Fingerprint("crit", "error_fail", criticalSourceClass("logs/journal_errors.txt"), f.Evidence[0]) + wantFP := identity.Fingerprint("crit", "error_fail", criticalSourceClass("logs/journal_errors.txt"), normalizeCriticalLine(f.Evidence[0])) if f.Fingerprint != wantFP { t.Errorf("unexpected fingerprint: got %q want %q", f.Fingerprint, wantFP) } @@ -301,6 +301,76 @@ func TestAnalyzeCriticalLogs_Cap(t *testing.T) { } } +func TestAnalyzeCriticalLogs_DedupNormalizesPIDs(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) 
+ // Same mlx5_core error with different PIDs → should deduplicate to one finding. + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte( + "---\n"+ + "[100.0] mlx5_core 0000:3b:00.0: cmd_work_handler[1234]: timeout reached\n"+ + "[100.0] mlx5_core 0000:3b:00.0: cmd_work_handler[5678]: timeout reached\n"+ + "[100.0] mlx5_core 0000:3b:00.0: cmd_work_handler[9012]: timeout reached\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + timeoutCount := 0 + for _, f := range tr.Findings { + if f.Title == "Timeout" { + timeoutCount++ + } + } + if timeoutCount != 1 { + t.Errorf("expected 1 deduplicated Timeout finding for PID-varying lines, got %d", timeoutCount) + } + // Verify count reflects all 3 lines + for _, f := range tr.Findings { + if f.Title == "Timeout" && !strings.Contains(f.Description, "3x") { + t.Errorf("expected count=3 in description, got %q", f.Description) + } + } +} + +func TestAnalyzeCriticalLogs_DedupNormalizesSSHIPs(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + // SSH preauth errors from different IPs → should deduplicate. 
+ os.WriteFile(filepath.Join(workDir, "logs/journal_errors.txt"), []byte( + "---\n"+ + "sshd[1001]: error: maximum authentication attempts exceeded for root from 1.2.3.4 port 54321\n"+ + "sshd[1002]: error: maximum authentication attempts exceeded for root from 5.6.7.8 port 12345\n"+ + "sshd[1003]: error: maximum authentication attempts exceeded for root from 9.10.11.12 port 33333\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + errorCount := 0 + for _, f := range tr.Findings { + if f.Title == "Error/Fail" { + errorCount++ + } + } + if errorCount != 1 { + t.Errorf("expected 1 deduplicated Error/Fail finding for SSH IP-varying lines, got %d", errorCount) + } +} + +func TestNormalizeCriticalLine_PreservesNonMatchingContent(t *testing.T) { + t.Parallel() + // A line with no PIDs, kernel timestamps, or SSH IP patterns should be unchanged. + line := "mlx5_core 0000:3b:00.0: some normal error without brackets" + got := normalizeCriticalLine(line) + if got != line { + t.Errorf("expected unchanged line, got %q", got) + } +} + func TestAnalyzeCriticalLogs_DeterministicTiebreakers(t *testing.T) { t.Parallel() workDir := t.TempDir() @@ -324,3 +394,239 @@ func TestAnalyzeCriticalLogs_DeterministicTiebreakers(t *testing.T) { t.Fatalf("expected lexicographic tiebreak ordering, got first=%q second=%q", first, second) } } + +func TestAnalyzeCriticalLogs_CallTraceIsLowConfidenceInfo(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("---\nCall Trace:\n"), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if len(tr.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(tr.Findings)) + } + f := tr.Findings[0] + if f.Title != "Kernel Stack Trace" { + t.Fatalf("expected Kernel Stack Trace title, got %q", f.Title) + } + if 
f.Severity != collector.SeverityInfo || f.Confidence != collector.ConfidenceLow { + t.Fatalf("expected info+low, got severity=%s confidence=%s", f.Severity, f.Confidence) + } +} + +func TestAnalyzeCriticalLogs_KernelBugPatterns(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte( + "---\nBUG uvm_gpu_chunk_5 invalid list access\nBUG: unable to handle page fault for address: 0x0\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + kernelBUGCount := 0 + for _, f := range tr.Findings { + if f.Title != "Kernel BUG" { + continue + } + kernelBUGCount++ + if f.Severity != collector.SeverityWarning { + t.Fatalf("Kernel BUG should be warning, got %s", f.Severity) + } + } + if kernelBUGCount != 2 { + t.Fatalf("expected 2 Kernel BUG findings, got %d", kernelBUGCount) + } +} + +func TestAnalyzeCriticalLogs_KernelPanicOnlyMatchesRealPanicText(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("---\nkernel panic - not syncing: fatal exception\n"), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if len(tr.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(tr.Findings)) + } + if tr.Findings[0].Title != "Kernel Panic" || tr.Findings[0].Severity != collector.SeverityCritical { + t.Fatalf("expected critical Kernel Panic, got %#v", tr.Findings[0]) + } +} + +func TestAnalyzeCriticalLogs_NonKernelBUGDoesNotMatchKernelRules(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/journal_errors.ndjson"), []byte( + `{"MESSAGE":"app BUG in 
parser","PRIORITY":"3","SYSLOG_IDENTIFIER":"my-app","_SYSTEMD_UNIT":"my-app.service","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"}`+"\n"+ + `{"MESSAGE":"app hardware error detected","PRIORITY":"3","SYSLOG_IDENTIFIER":"my-app","_SYSTEMD_UNIT":"my-app.service","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"2","_BOOT_ID":"b1"}`+"\n"+ + `{"MESSAGE":"app has fallen off the bus","PRIORITY":"3","SYSLOG_IDENTIFIER":"my-app","_SYSTEMD_UNIT":"my-app.service","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"3","_BOOT_ID":"b1"}`+"\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + for _, f := range tr.Findings { + if f.Title == "Kernel BUG" || f.Title == "Kernel Panic" || f.Title == "Kernel Stack Trace" || f.Title == "Hardware Error" || f.Title == "Fallen Off Bus" { + t.Fatalf("unexpected kernel finding for non-kernel source: %#v", f) + } + } +} + +func TestAnalyzeCriticalLogs_SegfaultDedupWithHexNormalization(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/journal_errors.txt"), []byte( + "---\n"+ + "python3[100]: segfault at 0 ip 0x7f1a2b3c4d5e sp 0x7ffd1234abcd error 4 in libc.so[7f1a2b300000+1a000]\n"+ + "python3[101]: segfault at 0 ip 0x8f1a2b3c4d5e sp 0x8ffd1234abce error 4 in libc.so[8f1a2b300000+1a000]\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + segfaultCount := 0 + for _, f := range tr.Findings { + if f.Title == "Segfault" { + segfaultCount++ + if !strings.Contains(f.Description, "2x") { + t.Fatalf("expected deduped segfault count 2x, got %q", f.Description) + } + } + } + if segfaultCount != 1 { + t.Fatalf("expected one deduped segfault finding, got %d", segfaultCount) + } +} + +func TestAnalyzeCriticalLogs_NVRMAssertionDedup(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + 
os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte( + "---\n"+ + "NVRM: Assertion failed at 0xabc123ef code 0x01\n"+ + "NVRM: Assertion failed at 0xdef456aa code 0x02\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + count := 0 + for _, f := range tr.Findings { + if f.Title == "NVRM Driver Assertion" { + count++ + if !strings.Contains(f.Description, "2x") { + t.Fatalf("expected deduped assertion count 2x, got %q", f.Description) + } + } + } + if count != 1 { + t.Fatalf("expected one deduped assertion finding, got %d", count) + } +} + +func TestAnalyzeCriticalLogs_XidFormattedFallenOffBusSkipped(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte( + "---\nNVRM: Xid (PCI:0000:3b:00): 79, GPU has fallen off the bus.\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + for _, f := range tr.Findings { + if f.Title == "Fallen Off Bus" { + t.Fatalf("expected Xid-formatted fallen-off-bus line to be owned by xid analyzer") + } + } +} + +func TestAnalyzeCriticalLogs_PlainFallenOffBusStillMatches(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("---\nGPU 0000:3b:00.0 has fallen off the bus.\n"), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + found := false + for _, f := range tr.Findings { + if f.Title == "Fallen Off Bus" { + found = true + } + } + if !found { + t.Fatal("expected plain fallen-off-bus line to match") + } +} + +func TestAnalyzeCriticalLogs_TruncatedNDJSONStillWarnsWhenNoMatches(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + 
os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/journal_errors.ndjson"), []byte( + `{"MESSAGE":"benign line","PRIORITY":"6","SYSLOG_IDENTIFIER":"app","_SYSTEMD_UNIT":"app.service","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"}`+"\n"+ + `{"_truncated":true,"records_written":1,"reason":"record_limit"}`+"\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + foundDataQuality := false + for _, f := range tr.Findings { + if f.Code == FindingDataQuality { + foundDataQuality = true + } + } + if !foundDataQuality { + t.Fatalf("expected data_quality warning for truncated NDJSON, got %#v", tr.Findings) + } +} + +func TestAnalyzeCriticalLogs_TruncatedNDJSONWithMatchesIncludesWarningAndFinding(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/journal_kernel.ndjson"), []byte( + `{"MESSAGE":"kernel panic - not syncing: fatal","PRIORITY":"0","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"}`+"\n"+ + `{"_truncated":true,"records_written":1,"reason":"byte_limit"}`+"\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + hasPanic := false + hasDataQuality := false + for _, f := range tr.Findings { + if f.Title == "Kernel Panic" { + hasPanic = true + } + if f.Code == FindingDataQuality { + hasDataQuality = true + } + } + if !hasPanic || !hasDataQuality { + t.Fatalf("expected panic + data_quality findings, got %#v", tr.Findings) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/journal_ndjson.go b/customers/vm-troubleshooting/internal/triage/journal_ndjson.go index 529a442..8cbdc8d 100644 --- a/customers/vm-troubleshooting/internal/triage/journal_ndjson.go +++ 
b/customers/vm-troubleshooting/internal/triage/journal_ndjson.go @@ -18,9 +18,21 @@ type normalizedJournalEvent struct { BootID string } +type parsedJournalNDJSON struct { + Events []normalizedJournalEvent + Truncated bool + TruncationReason string +} + func parseJournalNDJSON(content string) []normalizedJournalEvent { + return parseJournalNDJSONWithMeta(content).Events +} + +func parseJournalNDJSONWithMeta(content string) parsedJournalNDJSON { lines := strings.Split(content, "\n") - events := make([]normalizedJournalEvent, 0, len(lines)) + result := parsedJournalNDJSON{ + Events: make([]normalizedJournalEvent, 0, len(lines)), + } for _, raw := range lines { line := strings.TrimSpace(raw) if line == "" { @@ -30,11 +42,12 @@ func parseJournalNDJSON(content string) []normalizedJournalEvent { if err := json.Unmarshal([]byte(line), &obj); err != nil { continue } - // Skip truncation sentinel lines emitted by the collector. - if _, ok := obj["_truncated"]; ok { + if truncated, ok := obj["_truncated"].(bool); ok && truncated { + result.Truncated = true + result.TruncationReason = triageString(obj["reason"]) continue } - events = append(events, normalizedJournalEvent{ + result.Events = append(result.Events, normalizedJournalEvent{ Message: triageString(obj["MESSAGE"]), Priority: triageString(obj["PRIORITY"]), SyslogIdentifier: triageString(obj["SYSLOG_IDENTIFIER"]), @@ -44,7 +57,7 @@ func parseJournalNDJSON(content string) []normalizedJournalEvent { BootID: triageString(obj["_BOOT_ID"]), }) } - return events + return result } func triageString(v any) string { diff --git a/customers/vm-troubleshooting/internal/triage/journal_ndjson_test.go b/customers/vm-troubleshooting/internal/triage/journal_ndjson_test.go index e57ab17..95bf00b 100644 --- a/customers/vm-troubleshooting/internal/triage/journal_ndjson_test.go +++ b/customers/vm-troubleshooting/internal/triage/journal_ndjson_test.go @@ -46,3 +46,20 @@ func TestParseJournalNDJSON_ByteArrayMessage(t *testing.T) { 
t.Errorf("expected byte array decoded to 'hello', got %q", events[0].Message) } } + +func TestParseJournalNDJSONWithMeta_Truncation(t *testing.T) { + t.Parallel() + content := `{"MESSAGE":"real event","PRIORITY":"3","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"} +{"_truncated":true,"records_written":1,"reason":"record_limit"} +` + parsed := parseJournalNDJSONWithMeta(content) + if !parsed.Truncated { + t.Fatal("expected truncated=true") + } + if parsed.TruncationReason != "record_limit" { + t.Fatalf("unexpected truncation reason: %q", parsed.TruncationReason) + } + if len(parsed.Events) != 1 { + t.Fatalf("expected 1 event, got %d", len(parsed.Events)) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/triage.go b/customers/vm-troubleshooting/internal/triage/triage.go index cd99b1d..9715b7d 100644 --- a/customers/vm-troubleshooting/internal/triage/triage.go +++ b/customers/vm-troubleshooting/internal/triage/triage.go @@ -37,6 +37,7 @@ const ( FindingSXid FindingCode = "sxid" FindingFirewallPosture FindingCode = "firewall_posture" FindingCriticalLog FindingCode = "critical_log" + FindingDataQuality FindingCode = "data_quality" ) // FindingCodes enumerates triage-owned finding codes. @@ -45,6 +46,7 @@ var FindingCodes = map[string]bool{ string(FindingSXid): true, string(FindingFirewallPosture): true, string(FindingCriticalLog): true, + string(FindingDataQuality): true, } // TriageResult holds the output of a single analyzer. @@ -61,7 +63,7 @@ type TriageResult struct { type Analyzer func(ctx context.Context, workDir string) (*TriageResult, error) // triageSchemaVersion is the schema version emitted in triage result JSON files. -const triageSchemaVersion = "3.0.0" +const triageSchemaVersion = "3.1.0" // RunAllAnalyzers executes all registered analyzers with spinner feedback. // Missing artifacts are handled gracefully — analyzers skip what isn't there. 
diff --git a/customers/vm-troubleshooting/internal/triage/xid.go b/customers/vm-troubleshooting/internal/triage/xid.go index 292415e..a1eb0eb 100644 --- a/customers/vm-troubleshooting/internal/triage/xid.go +++ b/customers/vm-troubleshooting/internal/triage/xid.go @@ -45,20 +45,35 @@ var xidPolicies = map[int]xidPolicy{ 109: {collector.SeverityCritical, true, "Reset GPU; contact support"}, 119: {collector.SeverityCritical, true, "Reset GPU or power cycle node"}, 120: {collector.SeverityCritical, true, "Reset GPU or power cycle node"}, + 149: {collector.SeverityCritical, true, "Reset GPU; contact support for NVLink NETIR errors"}, 150: {collector.SeverityCritical, true, "Follow NVLink5 error workflow"}, 154: {collector.SeverityInfo, false, "Informational — see accompanying Xid"}, } -// Xid 154 recovery action map: action text → (severity, requires_reset). +// resolvePolicy merges hand-maintained overrides with generated baseline policies. +// Hand-maintained entries always win; generated provides baseline coverage for codes +// not explicitly overridden. Unknown Xids fall through to a default. +func resolvePolicy(code int) xidPolicy { + if p, ok := xidPolicies[code]; ok { + return p + } + if p, ok := generatedXidPolicies[code]; ok { + return p + } + return xidPolicy{collector.SeverityWarning, false, "Contact support"} +} + +// Xid 154 recovery action map: action text → (severity, requires_reset, emitted action). 
var xid154Actions = map[string]struct { severity collector.Severity requiresReset bool + action string }{ - "none": {collector.SeverityInfo, false}, - "drain p2p": {collector.SeverityWarning, false}, - "drain and reset": {collector.SeverityCritical, true}, - "gpu reset required": {collector.SeverityCritical, true}, - "node reboot required": {collector.SeverityCritical, true}, + "none": {collector.SeverityInfo, false, "No action required"}, + "drain p2p": {collector.SeverityWarning, false, "Drain P2P traffic"}, + "drain and reset": {collector.SeverityCritical, true, "Reset GPU"}, + "gpu reset required": {collector.SeverityCritical, true, "Reset GPU"}, + "node reboot required": {collector.SeverityCritical, true, "Reboot node"}, } // XidEvent is a parsed and classified Xid/SXid occurrence. @@ -110,14 +125,7 @@ func parseXidEvents(dmesg string) []XidEvent { var events []XidEvent for k, count := range counts { info, known := xidcatalog.Lookup(k.code) - policy := xidPolicy{ - severity: collector.SeverityWarning, - requiresReset: false, - action: "Contact support", - } - if override, ok := xidPolicies[k.code]; ok { - policy = override - } + policy := resolvePolicy(k.code) name := fmt.Sprintf("Unknown Xid %d", k.code) desc := "Unknown Xid error" if known { @@ -149,6 +157,7 @@ func parseXidEvents(dmesg string) []XidEvent { if override, found := xid154Actions[ra]; found { ev.Severity = override.severity ev.RequiresReset = override.requiresReset + ev.Action = override.action } } } @@ -183,16 +192,23 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { } var dmesg string var sourceArtifact string + var sourceKind string + ndjsonTruncated := false + ndjsonTruncateReason := "" anySkipped := false for _, src := range sources { state, content := checkArtifact(workDir, src.path) switch state { case ArtifactUsable: if dmesg == "" { + truncated := false + truncateReason := "" if src.kind == "ndjson" { - events := parseJournalNDJSON(content) - lines := 
make([]string, 0, len(events)) - for _, ev := range events { + parsed := parseJournalNDJSONWithMeta(content) + truncated = parsed.Truncated + truncateReason = parsed.TruncationReason + lines := make([]string, 0, len(parsed.Events)) + for _, ev := range parsed.Events { if strings.TrimSpace(ev.Message) == "" { continue } @@ -200,11 +216,16 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { } content = strings.Join(lines, "\n") } - if strings.TrimSpace(content) == "" { + if strings.TrimSpace(content) == "" && !truncated { continue } dmesg = content sourceArtifact = src.path + sourceKind = src.kind + if src.kind == "ndjson" { + ndjsonTruncated = truncated + ndjsonTruncateReason = truncateReason + } } case ArtifactSkipped: anySkipped = true @@ -220,14 +241,45 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { return nil, nil // no dmesg available } + var dataQualityFindings []Finding + if sourceKind == "ndjson" && ndjsonTruncated { + reasonText := "size limit" + if ndjsonTruncateReason == "record_limit" { + reasonText = "record limit" + } + dataQualityFindings = append(dataQualityFindings, Finding{ + Code: FindingDataQuality, + Severity: collector.SeverityWarning, + Confidence: collector.ConfidenceHigh, + Category: "DATA", + Title: "Xid Source Truncated", + Description: fmt.Sprintf("%s was truncated by %s; Xid findings may be incomplete", sourceArtifact, reasonText), + Action: "Rerun with a narrower --journal-since window to reduce journal volume", + SourceArtifacts: []string{ + sourceArtifact, + }, + Fingerprint: identity.Fingerprint("triage", "gpu_health", "data_quality", sourceArtifact, reasonText), + }) + } + const maxXidEvents = 100 events := parseXidEvents(dmesg) if len(events) == 0 { + text := "No Xid/SXid errors found in kernel logs.\n" + if len(dataQualityFindings) > 0 { + lines := []string{"Xid/SXid Analysis: 0 unique event(s)\n"} + for _, dq := range dataQualityFindings { + lines = append(lines, 
fmt.Sprintf(" [warning] %s: %s", dq.Title, dq.Description)) + } + lines = append(lines, "", "No Xid/SXid errors found in the available log subset.") + text = strings.Join(lines, "\n") + "\n" + } return &TriageResult{ - Name: "gpu_health", - Facts: map[string]string{"xid_classified_count": "0"}, - Text: "No Xid/SXid errors found in kernel logs.\n", + Name: "gpu_health", + Findings: dataQualityFindings, + Facts: map[string]string{"xid_classified_count": "0"}, + Text: text, }, nil } @@ -237,9 +289,13 @@ func AnalyzeXid(_ context.Context, workDir string) (*TriageResult, error) { events = events[:maxXidEvents] } - var findings []Finding + findings := make([]Finding, 0, len(dataQualityFindings)+len(events)) + findings = append(findings, dataQualityFindings...) var textLines []string textLines = append(textLines, fmt.Sprintf("Xid/SXid Analysis: %d unique event(s)\n", totalCount)) + for _, dq := range dataQualityFindings { + textLines = append(textLines, fmt.Sprintf(" [warning] %s: %s", dq.Title, dq.Description)) + } for _, ev := range events { prefix := "Xid" diff --git a/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go index d7eeafa..717fa2c 100644 --- a/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go +++ b/customers/vm-troubleshooting/internal/triage/xid_analyze_test.go @@ -127,3 +127,68 @@ func TestAnalyzeXid_FingerprintsStableAcrossRuns(t *testing.T) { t.Fatalf("fingerprints not stable across runs: first=%v second=%v", first, second) } } + +func TestAnalyzeXid_TruncatedNDJSONWithXidsEmitsWarning(t *testing.T) { + t.Parallel() + + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 0o755); err != nil { + t.Fatal(err) + } + content := `{"MESSAGE":"[1000.0] NVRM: Xid (PCI:0000:3b:00): 79, pid=1","PRIORITY":"3","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"} 
+{"_truncated":true,"records_written":1,"reason":"record_limit"} +` + if err := os.WriteFile(filepath.Join(workDir, "logs/journal_kernel.ndjson"), []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + tr, err := AnalyzeXid(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil { + t.Fatal("expected non-nil triage result") + } + hasXid := false + hasDataQuality := false + for _, f := range tr.Findings { + if f.Code == FindingXid { + hasXid = true + } + if f.Code == FindingDataQuality { + hasDataQuality = true + } + } + if !hasXid || !hasDataQuality { + t.Fatalf("expected both xid and data_quality findings, got %#v", tr.Findings) + } +} + +func TestAnalyzeXid_TruncatedNDJSONWithoutXidsStillWarns(t *testing.T) { + t.Parallel() + + workDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(workDir, "logs"), 0o755); err != nil { + t.Fatal(err) + } + content := `{"MESSAGE":"kernel: benign message","PRIORITY":"5","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"} +{"_truncated":true,"records_written":1,"reason":"byte_limit"} +` + if err := os.WriteFile(filepath.Join(workDir, "logs/journal_kernel.ndjson"), []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + tr, err := AnalyzeXid(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + if tr == nil { + t.Fatal("expected non-nil triage result") + } + if tr.Facts["xid_classified_count"] != "0" { + t.Fatalf("expected xid_classified_count=0, got %q", tr.Facts["xid_classified_count"]) + } + if len(tr.Findings) != 1 || tr.Findings[0].Code != FindingDataQuality { + t.Fatalf("expected only data_quality warning, got %#v", tr.Findings) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/xid_generated_policies.go b/customers/vm-troubleshooting/internal/triage/xid_generated_policies.go new file mode 100644 index 0000000..8a48898 --- /dev/null +++ 
b/customers/vm-troubleshooting/internal/triage/xid_generated_policies.go @@ -0,0 +1,118 @@ +// Code generated by tools/update-xid-catalog.py; DO NOT EDIT. +// Source: https://docs.nvidia.com/deploy/xid-errors/_downloads/4586dadb59119a55d1e93a181caa4272/Xid-Catalog.xlsx + +package triage + +import "github.com/NexGenCloud/vm-diagnostics/internal/collector" + +var generatedXidPolicies = map[int]xidPolicy{ + 8: {collector.SeverityWarning, false, "Restart application"}, + 11: {collector.SeverityWarning, false, "Restart application"}, + 13: {collector.SeverityWarning, false, "Restart application"}, + 14: {collector.SeverityInfo, false, "No action required"}, + 25: {collector.SeverityWarning, false, "Restart application"}, + 31: {collector.SeverityWarning, false, "Restart application"}, + 32: {collector.SeverityWarning, false, "Restart application"}, + 37: {collector.SeverityInfo, false, "No action required"}, + 38: {collector.SeverityInfo, false, "No action required"}, + 39: {collector.SeverityWarning, false, "Restart application"}, + 40: {collector.SeverityWarning, false, "Restart application"}, + 41: {collector.SeverityWarning, false, "Restart application"}, + 43: {collector.SeverityInfo, false, "No action required"}, + 44: {collector.SeverityInfo, false, "No action required"}, + 45: {collector.SeverityWarning, false, "Contact support"}, + 46: {collector.SeverityCritical, true, "Reset GPU"}, + 48: {collector.SeverityWarning, false, "Contact support"}, + 54: {collector.SeverityWarning, false, "Contact support"}, + 60: {collector.SeverityWarning, false, "Restart application"}, + 62: {collector.SeverityCritical, true, "Reset GPU"}, + 63: {collector.SeverityInfo, false, "No action required"}, + 64: {collector.SeverityCritical, true, "Reset GPU"}, + 66: {collector.SeverityInfo, false, "No action required"}, + 67: {collector.SeverityInfo, false, "No action required"}, + 68: {collector.SeverityWarning, false, "Restart application"}, + 69: {collector.SeverityWarning, false, 
"Restart application"}, + 70: {collector.SeverityWarning, false, "Restart application"}, + 71: {collector.SeverityWarning, false, "Restart application"}, + 72: {collector.SeverityWarning, false, "Restart application"}, + 74: {collector.SeverityWarning, false, "Contact support"}, + 75: {collector.SeverityWarning, false, "Restart application"}, + 76: {collector.SeverityWarning, false, "Restart application"}, + 77: {collector.SeverityWarning, false, "Restart application"}, + 78: {collector.SeverityWarning, false, "Contact support"}, + 79: {collector.SeverityCritical, true, "Reboot node"}, + 80: {collector.SeverityWarning, false, "Restart application"}, + 82: {collector.SeverityWarning, false, "Restart application"}, + 83: {collector.SeverityWarning, false, "Restart application"}, + 84: {collector.SeverityWarning, false, "Restart application"}, + 85: {collector.SeverityWarning, false, "Restart application"}, + 86: {collector.SeverityWarning, false, "Restart application"}, + 88: {collector.SeverityWarning, false, "Restart application"}, + 89: {collector.SeverityWarning, false, "Restart application"}, + 92: {collector.SeverityInfo, false, "No action required"}, + 93: {collector.SeverityInfo, false, "No action required"}, + 94: {collector.SeverityWarning, false, "Restart application"}, + 95: {collector.SeverityCritical, true, "Reset GPU"}, + 96: {collector.SeverityWarning, false, "Restart application"}, + 97: {collector.SeverityWarning, false, "Restart application"}, + 98: {collector.SeverityWarning, false, "Restart application"}, + 99: {collector.SeverityWarning, false, "Restart application"}, + 100: {collector.SeverityWarning, false, "Restart application"}, + 101: {collector.SeverityWarning, false, "Restart application"}, + 102: {collector.SeverityWarning, false, "Restart application"}, + 103: {collector.SeverityWarning, false, "Restart application"}, + 104: {collector.SeverityWarning, false, "Restart application"}, + 105: {collector.SeverityWarning, false, "Restart 
application"}, + 106: {collector.SeverityInfo, false, "No action required"}, + 107: {collector.SeverityInfo, false, "No action required"}, + 108: {collector.SeverityInfo, false, "No action required"}, + 109: {collector.SeverityCritical, true, "Reset GPU"}, + 110: {collector.SeverityCritical, true, "Reset GPU"}, + 119: {collector.SeverityCritical, true, "Reset GPU"}, + 120: {collector.SeverityCritical, true, "Reset GPU"}, + 121: {collector.SeverityInfo, false, "No action required"}, + 126: {collector.SeverityWarning, false, "Restart application"}, + 127: {collector.SeverityWarning, false, "Restart application"}, + 128: {collector.SeverityWarning, false, "Restart application"}, + 129: {collector.SeverityWarning, false, "Restart application"}, + 130: {collector.SeverityWarning, false, "Restart application"}, + 131: {collector.SeverityWarning, false, "Restart application"}, + 132: {collector.SeverityWarning, false, "Restart application"}, + 133: {collector.SeverityWarning, false, "Restart application"}, + 134: {collector.SeverityWarning, false, "Restart application"}, + 135: {collector.SeverityWarning, false, "Restart application"}, + 136: {collector.SeverityCritical, true, "Reset GPU"}, + 137: {collector.SeverityInfo, false, "No action required"}, + 139: {collector.SeverityWarning, false, "Restart application"}, + 140: {collector.SeverityCritical, true, "Reset GPU"}, + 141: {collector.SeverityInfo, false, "No action required"}, + 142: {collector.SeverityWarning, false, "Contact support"}, + 143: {collector.SeverityCritical, true, "Reset GPU"}, + 144: {collector.SeverityWarning, false, "Contact support"}, + 145: {collector.SeverityWarning, false, "Contact support"}, + 146: {collector.SeverityWarning, false, "Contact support"}, + 147: {collector.SeverityWarning, false, "Contact support"}, + 148: {collector.SeverityWarning, false, "Contact support"}, + 149: {collector.SeverityWarning, false, "Contact support"}, + 150: {collector.SeverityWarning, false, "Contact 
support"}, + 151: {collector.SeverityWarning, false, "Contact support"}, + 154: {collector.SeverityWarning, false, "Contact support"}, + 155: {collector.SeverityCritical, true, "Reset GPU"}, + 156: {collector.SeverityCritical, true, "Reset GPU"}, + 157: {collector.SeverityInfo, false, "No action required"}, + 158: {collector.SeverityCritical, true, "Reset GPU"}, + 159: {collector.SeverityWarning, false, "Contact support"}, + 160: {collector.SeverityInfo, false, "No action required"}, + 161: {collector.SeverityInfo, false, "No action required"}, + 162: {collector.SeverityWarning, false, "Contact support"}, + 163: {collector.SeverityWarning, false, "Contact support"}, + 164: {collector.SeverityWarning, false, "Contact support"}, + 165: {collector.SeverityWarning, false, "Contact support"}, + 166: {collector.SeverityWarning, false, "Contact support"}, + 167: {collector.SeverityWarning, false, "Contact support"}, + 168: {collector.SeverityWarning, false, "Contact support"}, + 169: {collector.SeverityWarning, false, "Contact support"}, + 170: {collector.SeverityWarning, false, "Contact support"}, + 171: {collector.SeverityWarning, false, "Contact support"}, + 172: {collector.SeverityWarning, false, "Contact support"}, +} diff --git a/customers/vm-troubleshooting/internal/triage/xid_test.go b/customers/vm-troubleshooting/internal/triage/xid_test.go index 96ed468..72c6f85 100644 --- a/customers/vm-troubleshooting/internal/triage/xid_test.go +++ b/customers/vm-troubleshooting/internal/triage/xid_test.go @@ -71,6 +71,9 @@ func TestParseXidEvents_Xid154Dynamic(t *testing.T) { if ev.RecoveryAction != "gpu reset required" { t.Errorf("recovery action should be 'gpu reset required', got %q", ev.RecoveryAction) } + if ev.Action != "Reset GPU" { + t.Errorf("expected action to be overridden to reset GPU, got %q", ev.Action) + } } func TestParseXidEvents_Xid154None(t *testing.T) { @@ -84,6 +87,29 @@ func TestParseXidEvents_Xid154None(t *testing.T) { if events[0].Severity != 
collector.SeverityInfo { t.Errorf("Xid 154 with None should be info, got %s", events[0].Severity) } + if events[0].Action != "No action required" { + t.Errorf("Xid 154 with None should emit no-action text, got %q", events[0].Action) + } +} + +func TestParseXidEvents_Xid154NodeRebootRequired(t *testing.T) { + t.Parallel() + dmesg := `[1000.0] NVRM: Xid (PCI:0000:3b:00): 154, pid=0 GPU recovery action changed from 0x0 (None) to 0x8 (Node Reboot Required) +` + events := parseXidEvents(dmesg) + if len(events) != 1 { + t.Fatalf("expected 1 event, got %d", len(events)) + } + ev := events[0] + if ev.Severity != collector.SeverityCritical { + t.Fatalf("expected critical severity, got %s", ev.Severity) + } + if !ev.RequiresReset { + t.Fatal("expected requiresReset=true") + } + if ev.Action != "Reboot node" { + t.Fatalf("expected action 'Reboot node', got %q", ev.Action) + } } func TestParseXidEvents_SXid(t *testing.T) { @@ -115,9 +141,10 @@ func TestParseXidEvents_UnknownCode(t *testing.T) { } } -func TestParseXidEvents_KnownCatalogCodeUsesDefaultPolicy(t *testing.T) { +func TestParseXidEvents_KnownCatalogCodeUsesGeneratedPolicy(t *testing.T) { t.Parallel() - // 121 exists in the catalog, but has no explicit local policy override. + // 121 exists in the catalog. No hand-maintained override, but has a + // generated baseline policy → resolvePolicy should return the generated policy. dmesg := `[1000.0] NVRM: Xid (PCI:0000:3b:00): 121, pid=0 ` events := parseXidEvents(dmesg) @@ -125,14 +152,40 @@ func TestParseXidEvents_KnownCatalogCodeUsesDefaultPolicy(t *testing.T) { t.Fatalf("expected 1 event, got %d", len(events)) } ev := events[0] - if ev.Name != "C2C_LINK_ERROR" { - t.Fatalf("expected catalog name C2C_LINK_ERROR, got %q", ev.Name) + // Verify catalog lookup succeeded (name should not start with "Unknown"). 
+ if ev.Name == "Unknown Xid 121" { + t.Fatal("expected catalog name for Xid 121, got Unknown") + } + // Policy comes from generatedXidPolicies since 121 has no hand-maintained override. + genPolicy := generatedXidPolicies[121] + if ev.Severity != genPolicy.severity { + t.Fatalf("expected generated severity %s, got %s", genPolicy.severity, ev.Severity) + } + if ev.Action != genPolicy.action { + t.Fatalf("expected generated action %q, got %q", genPolicy.action, ev.Action) + } +} + +func TestResolvePolicy_HandMaintainedOverridesGenerated(t *testing.T) { + t.Parallel() + // Xid 46 is in both hand-maintained and generated policies. + // Hand-maintained should win. + p := resolvePolicy(46) + hand := xidPolicies[46] + if p.severity != hand.severity || p.requiresReset != hand.requiresReset || p.action != hand.action { + t.Fatalf("expected hand-maintained policy for Xid 46, got %+v", p) } - if ev.Severity != collector.SeverityWarning { - t.Fatalf("expected default warning severity, got %s", ev.Severity) +} + +func TestResolvePolicy_UnknownCodeFallsThrough(t *testing.T) { + t.Parallel() + // Xid 99999 is not in any map. + p := resolvePolicy(99999) + if p.severity != collector.SeverityWarning { + t.Fatalf("expected warning for unknown code, got %s", p.severity) } - if ev.Action != "Contact support" { - t.Fatalf("expected default action, got %q", ev.Action) + if p.action != "Contact support" { + t.Fatalf("expected 'Contact support' for unknown code, got %q", p.action) } } diff --git a/customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md b/customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md index dbc5a42..e1f95fd 100644 --- a/customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md +++ b/customers/vm-troubleshooting/internal/triage/xidcatalog/UPSTREAM.md @@ -2,15 +2,40 @@ This package keeps a local, minimal Xid catalog and parser surface for triage. 
-Upstream reference used for content curation: -- NVIDIA Xid documentation and datacenter guidance (r590 era) +## Automated update flow + +The catalog is auto-generated from the NVIDIA Xid-Catalog.xlsx: + +```bash +cd customers/vm-troubleshooting +make update-xid-catalog +``` + +This runs `tools/update-xid-catalog.py` which: +1. Discovers the XLSX download link from https://docs.nvidia.com/deploy/xid-errors/analyzing-xid-catalog.html +2. Downloads the XLSX fresh (no local file assumed) +3. Parses all sheets natively (Python stdlib: zipfile + xml.etree) +4. Generates three files: + - `xid.md` (repo root) — human-readable reference, all sheets + - `internal/triage/xidcatalog/catalog_generated.go` — Go catalog map + - `internal/triage/xid_generated_policies.go` — Go baseline policy map +5. Runs `gofmt`, `go vet`, `go test` + +**Requirements:** Python 3.8+ (stdlib only, no pip packages), internet access, Go toolchain. + +## Two-layer policy design + +- **`xidcatalog/catalog_generated.go`**: Neutral metadata (code, name, description). Generated. +- **`triage/xid_generated_policies.go`**: Baseline policies derived from NVIDIA resolution buckets. Generated. +- **`triage/xid.go` `xidPolicies`**: Hand-maintained local overrides. Always win over generated. +- **`triage/xid.go` `resolvePolicy()`**: Merges hand + generated + default fallback. + +## Manual override + +To override a generated policy, add or update the entry in `xid.go:xidPolicies`. +Hand-maintained entries always take precedence over generated baseline policies. + +## Local policy boundary -Local policy boundary: - `internal/triage/xidcatalog` stores neutral catalog metadata and kernel line parsing. - `internal/triage/xid.go` owns local support policy (severity, reset guidance, actions, and operational overrides such as Xid 154). - -Update flow: -1. Review upstream Xid docs for added/changed codes. -2. Update `catalog.go` entries (name + neutral description only). -3. 
Keep or adjust local policy in `xid.go` only when support policy changes. -4. Run `gofmt`, `go test`, `go vet`, and `CGO_ENABLED=0 go build ./cmd/gather-info`. diff --git a/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog.go b/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog.go index da63493..7c8e73c 100644 --- a/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog.go +++ b/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog.go @@ -13,58 +13,8 @@ type XidInfo struct { Description string } -var catalog = map[int]XidInfo{ - 8: {Code: 8, Name: "GPU_CHANNEL_TIMEOUT", Description: "GPU channel timeout detected"}, - 9: {Code: 9, Name: "DRIVER_ERROR", Description: "Driver error detected"}, - 10: {Code: 10, Name: "GPU_CONTEXT_SWITCH_TIMEOUT", Description: "GPU context switch timeout"}, - 11: {Code: 11, Name: "PBDMA_ERROR", Description: "PBDMA pushbuffer DMA error"}, - 12: {Code: 12, Name: "GRAPHICS_ENGINE_EXCEPTION", Description: "Graphics engine exception"}, - 13: {Code: 13, Name: "GR_EXCEPTION", Description: "Graphics engine exception"}, - 14: {Code: 14, Name: "GRAPHICS_SM_EXCEPTION", Description: "Graphics SM exception"}, - 15: {Code: 15, Name: "GRAPHICS_SW_NOTIFY", Description: "Graphics software notification error"}, - 16: {Code: 16, Name: "GRAPHICS_CLASS_ERROR", Description: "Graphics class error"}, - 17: {Code: 17, Name: "GRAPHICS_METHOD_ERROR", Description: "Graphics method error"}, - 18: {Code: 18, Name: "GRAPHICS_FIRMWARE_ERROR", Description: "Graphics firmware error"}, - 19: {Code: 19, Name: "GRAPHICS_EXCEPTION", Description: "Graphics exception"}, - 20: {Code: 20, Name: "GPU_DMA_PUSHER_ERROR", Description: "DMA pusher error"}, - 21: {Code: 21, Name: "GPU_DMA_FETCH_ERROR", Description: "DMA fetch error"}, - 22: {Code: 22, Name: "GPU_DMA_SEMAPHORE_ERROR", Description: "DMA semaphore error"}, - 23: {Code: 23, Name: "GPU_DMA_ILLEGAL_METHOD", Description: "Illegal DMA method"}, - 24: {Code: 24, Name: 
"GPU_CHANNEL_ERROR", Description: "GPU channel error"}, - 25: {Code: 25, Name: "GPU_PCI_ERROR", Description: "PCIe transaction error"}, - 26: {Code: 26, Name: "GPU_MEMORY_TRANSFER_ERROR", Description: "Memory transfer error"}, - 27: {Code: 27, Name: "GPU_DISPLAY_ERROR", Description: "Display subsystem error"}, - 28: {Code: 28, Name: "GPU_FIRMWARE_TIMEOUT", Description: "Firmware timeout"}, - 29: {Code: 29, Name: "GPU_FIRMWARE_COMM_ERROR", Description: "Firmware communication error"}, - 30: {Code: 30, Name: "GPU_FIRMWARE_EXCEPTION", Description: "Firmware exception"}, - 31: {Code: 31, Name: "MMU_ERR_FLT", Description: "GPU memory page fault"}, - 43: {Code: 43, Name: "RESETCHANNEL_VERIF_ERROR", Description: "GPU stopped processing (software fault)"}, - 45: {Code: 45, Name: "PREEMPTIVE_REMOVAL", Description: "Preemptive cleanup due to prior error"}, - 46: {Code: 46, Name: "GPU_TIMEOUT_ERROR", Description: "GPU stopped processing (timeout)"}, - 48: {Code: 48, Name: "GPU_ECC_DBE", Description: "Double-bit ECC error (uncorrectable)"}, - 56: {Code: 56, Name: "DISPLAY_CHANNEL_EXCEPTION", Description: "Display channel exception"}, - 57: {Code: 57, Name: "FB_LINK_TRAINING_FAILURE", Description: "Framebuffer link training failure"}, - 61: {Code: 61, Name: "PMU_BREAKPOINT", Description: "PMU breakpoint"}, - 62: {Code: 62, Name: "PMU_HALT_ERROR", Description: "Internal micro-controller halt"}, - 63: {Code: 63, Name: "DRAM_RETIREMENT_EVENT", Description: "GPU memory remapping event"}, - 64: {Code: 64, Name: "DRAM_RETIREMENT_FAILURE", Description: "GPU memory remapping failure"}, - 68: {Code: 68, Name: "NVDEC0_ERROR", Description: "NVDEC0 exception"}, - 69: {Code: 69, Name: "GR_CLASS_ERROR", Description: "Graphics engine class error"}, - 74: {Code: 74, Name: "NVLINK_ERROR", Description: "NVLink error"}, - 79: {Code: 79, Name: "GPU_FALLEN_OFF_BUS", Description: "GPU has fallen off the bus"}, - 92: {Code: 92, Name: "EXCESSIVE_SBE_INTERRUPTS", Description: "High single-bit ECC error 
rate"}, - 94: {Code: 94, Name: "CONTAINED_ERROR", Description: "Contained memory error (app-local)"}, - 95: {Code: 95, Name: "UNCONTAINED_ERROR", Description: "Uncontained memory error (all apps affected)"}, - 109: {Code: 109, Name: "CTXSW_TIMEOUT_ERROR", Description: "Context switch timeout"}, - 119: {Code: 119, Name: "GSP_RPC_TIMEOUT", Description: "GSP RPC timeout"}, - 120: {Code: 120, Name: "GSP_ERROR", Description: "GSP error"}, - 121: {Code: 121, Name: "C2C_LINK_ERROR", Description: "Chip-to-chip link error"}, - 137: {Code: 137, Name: "NVLINK_FLA_PRIV_ERROR", Description: "NVLink fabric address fault"}, - 140: {Code: 140, Name: "NVLINK_SUBLINK_ERROR", Description: "NVLink sublink error"}, - 143: {Code: 143, Name: "NVLINK_FATAL_ERROR", Description: "NVLink fatal error"}, - 150: {Code: 150, Name: "NVLINK_MSE_ERROR", Description: "NVLink MSE error"}, - 154: {Code: 154, Name: "GPU_RECOVERY_ACTION", Description: "GPU recovery action changed"}, -} +// catalog is populated by catalog_generated.go (generated by tools/update-xid-catalog.py). +// To update: make update-xid-catalog var ( xidRe = regexp.MustCompile(`NVRM:\s*Xid\s*\(PCI:([^)]+)\):\s*(\d+)`) diff --git a/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_generated.go b/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_generated.go new file mode 100644 index 0000000..53bb149 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/xidcatalog/catalog_generated.go @@ -0,0 +1,116 @@ +// Code generated by tools/update-xid-catalog.py; DO NOT EDIT. 
+// Source: https://docs.nvidia.com/deploy/xid-errors/_downloads/4586dadb59119a55d1e93a181caa4272/Xid-Catalog.xlsx + +package xidcatalog + +var catalog = map[int]XidInfo{ + 8: {Code: 8, Name: "ROBUST_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT", Description: "GPU stopped processing"}, + 11: {Code: 11, Name: "ROBUST_CHANNEL_GR_ERROR_MISSING_HW", Description: "Invalid or corrupted push buffer stream"}, + 13: {Code: 13, Name: "ROBUST_CHANNEL_GR_EXCEPTION___ROBUST_CHANNEL_GR_ERROR_SW_NOTIFY", Description: "Graphics Engine Exception"}, + 14: {Code: 14, Name: "ROBUST_CHANNEL_FAKE_ERROR", Description: "Unused"}, + 25: {Code: 25, Name: "ROBUST_CHANNEL_GR_ILLEGAL_NOTIFY", Description: "Invalid or illegal push buffer stream"}, + 31: {Code: 31, Name: "ROBUST_CHANNEL_FIFO_ERROR_MMU_ERR_FLT", Description: "GPU memory page fault"}, + 32: {Code: 32, Name: "ROBUST_CHANNEL_PBDMA_ERROR", Description: "Invalid or corrupted push buffer stream"}, + 37: {Code: 37, Name: "ROBUST_CHANNEL_FECS_ERR_UNIMP_FIRMWARE_METHOD", Description: "Driver firmware error"}, + 38: {Code: 38, Name: "ROBUST_CHANNEL_FECS_ERR_WATCHDOG_TIMEOUT", Description: "Driver firmware error"}, + 39: {Code: 39, Name: "ROBUST_CHANNEL_CE0_ERROR", Description: "Copy Engine Exception"}, + 40: {Code: 40, Name: "ROBUST_CHANNEL_CE1_ERROR", Description: "Copy Engine Exception"}, + 41: {Code: 41, Name: "ROBUST_CHANNEL_CE2_ERROR", Description: "Copy Engine Exception"}, + 43: {Code: 43, Name: "ROBUST_CHANNEL_RESETCHANNEL_VERIF_ERROR", Description: "GPU stopped processing"}, + 44: {Code: 44, Name: "ROBUST_CHANNEL_GR_FAULT_DURING_CTXSW", Description: "Graphics Engine fault during context switch"}, + 45: {Code: 45, Name: "ROBUST_CHANNEL_PREEMPTIVE_REMOVAL", Description: "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE"}, + 46: {Code: 46, Name: "ROBUST_CHANNEL_GPU_TIMEOUT_ERROR", Description: "GPU stopped processing"}, + 48: {Code: 48, Name: "ROBUST_CHANNEL_GPU_ECC_DBE", 
Description: "Double Bit ECC Error"}, + 54: {Code: 54, Name: "SILENT_RUNNING_PWR_REDUCED_CLOCKING", Description: "Auxiliary power is not connected to the GPU board"}, + 60: {Code: 60, Name: "ROBUST_CHANNEL_SEC2_ERROR", Description: "Video processor exception"}, + 62: {Code: 62, Name: "PMU_HALT_ERROR", Description: "Internal micro-controller halt (newer drivers)"}, + 63: {Code: 63, Name: "INFOROM_DRAM_RETIREMENT_EVENT", Description: "GPU memory remapping event"}, + 64: {Code: 64, Name: "INFOROM_DRAM_RETIREMENT_FAILURE", Description: "GPU memory remapping failure"}, + 66: {Code: 66, Name: "ROBUST_CHANNEL_FECS_ERR_REG_ACCESS_VIOLATION", Description: "Illegal access by driver"}, + 67: {Code: 67, Name: "ROBUST_CHANNEL_FECS_ERR_VERIF_VIOLATION", Description: "Illegal access by driver"}, + 68: {Code: 68, Name: "ROBUST_CHANNEL_NVDEC0_ERROR", Description: "NVDEC0 Exception"}, + 69: {Code: 69, Name: "ROBUST_CHANNEL_GR_CLASS_ERROR", Description: "Graphics Engine class error"}, + 70: {Code: 70, Name: "ROBUST_CHANNEL_CE3_ERROR", Description: "CE3: Unknown Error"}, + 71: {Code: 71, Name: "ROBUST_CHANNEL_CE4_ERROR", Description: "CE4: Unknown Error"}, + 72: {Code: 72, Name: "ROBUST_CHANNEL_CE5_ERROR", Description: "CE5: Unknown Error"}, + 74: {Code: 74, Name: "NVLINK_ERROR", Description: "NVLINK Error"}, + 75: {Code: 75, Name: "ROBUST_CHANNEL_CE6_ERROR", Description: "CE6: Unknown Error"}, + 76: {Code: 76, Name: "ROBUST_CHANNEL_CE7_ERROR", Description: "CE7: Unknown Error"}, + 77: {Code: 77, Name: "ROBUST_CHANNEL_CE8_ERROR", Description: "CE8: Unknown Error"}, + 78: {Code: 78, Name: "VGPU_START_ERROR", Description: "vGPU Start Error"}, + 79: {Code: 79, Name: "ROBUST_CHANNEL_GPU_HAS_FALLEN_OFF_THE_BUS", Description: "GPU has fallen off the bus"}, + 80: {Code: 80, Name: "PBDMA_PUSHBUFFER_CRC_MISMATCH", Description: "Corrupted data sent to GPU"}, + 82: {Code: 82, Name: "ROBUST_CHANNEL_NVJPG0_ERROR", Description: "NVJPG0 Error"}, + 83: {Code: 83, Name: "ROBUST_CHANNEL_NVDEC1_ERROR", 
Description: "NVDEC1 Error"}, + 84: {Code: 84, Name: "ROBUST_CHANNEL_NVDEC2_ERROR", Description: "NVDEC2 Error"}, + 85: {Code: 85, Name: "ROBUST_CHANNEL_CE9_ERROR", Description: "CE9: Unknown Error"}, + 86: {Code: 86, Name: "ROBUST_CHANNEL_OFA0_ERROR", Description: "OFA Exception"}, + 88: {Code: 88, Name: "ROBUST_CHANNEL_NVDEC3_ERROR", Description: "NVDEC3 Error"}, + 89: {Code: 89, Name: "ROBUST_CHANNEL_NVDEC4_ERROR", Description: "NVDEC4 Error"}, + 92: {Code: 92, Name: "EXCESSIVE_SBE_INTERRUPTS", Description: "High single-bit ECC error rate"}, + 93: {Code: 93, Name: "INFOROM_ERASE_LIMIT_EXCEEDED", Description: "Non-fatal violation of provisioned InfoROM wear limit"}, + 94: {Code: 94, Name: "ROBUST_CHANNEL_CONTAINED_ERROR", Description: "Contained memory error"}, + 95: {Code: 95, Name: "ROBUST_CHANNEL_UNCONTAINED_ERROR", Description: "Uncontained memory error"}, + 96: {Code: 96, Name: "ROBUST_CHANNEL_NVDEC5_ERROR", Description: "NVDEC5 Error"}, + 97: {Code: 97, Name: "ROBUST_CHANNEL_NVDEC6_ERROR", Description: "NVDEC6 Error"}, + 98: {Code: 98, Name: "ROBUST_CHANNEL_NVDEC7_ERROR", Description: "NVDEC7 Error"}, + 99: {Code: 99, Name: "ROBUST_CHANNEL_NVJPG1_ERROR", Description: "NVJPG1 Error"}, + 100: {Code: 100, Name: "ROBUST_CHANNEL_NVJPG2_ERROR", Description: "NVJPG2 Error"}, + 101: {Code: 101, Name: "ROBUST_CHANNEL_NVJPG3_ERROR", Description: "NVJPG3 Error"}, + 102: {Code: 102, Name: "ROBUST_CHANNEL_NVJPG4_ERROR", Description: "NVJPG4 Error"}, + 103: {Code: 103, Name: "ROBUST_CHANNEL_NVJPG5_ERROR", Description: "NVJPG5 Error"}, + 104: {Code: 104, Name: "ROBUST_CHANNEL_NVJPG6_ERROR", Description: "NVJPG6 Error"}, + 105: {Code: 105, Name: "ROBUST_CHANNEL_NVJPG7_ERROR", Description: "NVJPG7 Error"}, + 106: {Code: 106, Name: "SMBPBI_TEST_MESSAGE", Description: "SMBPBI Test Message"}, + 107: {Code: 107, Name: "SMBPBI_TEST_MESSAGE_SILENT", Description: "SMBPBI Test Message Silent"}, + 108: {Code: 108, Name: "NVLINK_REMOTE_TRANSLATION_ERROR", Description: "Unused"}, + 
109: {Code: 109, Name: "ROBUST_CHANNEL_CTXSW_TIMEOUT_ERROR", Description: "Context Switch Timeout Error"}, + 110: {Code: 110, Name: "SEC_FAULT_ERROR", Description: "Security Fault Error"}, + 119: {Code: 119, Name: "GSP_RPC_TIMEOUT", Description: "GSP RPC Timeout"}, + 120: {Code: 120, Name: "GSP_ERROR", Description: "GSP Error"}, + 121: {Code: 121, Name: "C2C_ERROR", Description: "C2C Error"}, + 126: {Code: 126, Name: "ROBUST_CHANNEL_CE10_ERROR", Description: "CE10: Unknown Error"}, + 127: {Code: 127, Name: "ROBUST_CHANNEL_CE11_ERROR", Description: "CE11: Unknown Error"}, + 128: {Code: 128, Name: "ROBUST_CHANNEL_CE12_ERROR", Description: "CE12: Unknown Error"}, + 129: {Code: 129, Name: "ROBUST_CHANNEL_CE13_ERROR", Description: "CE13: Unknown Error"}, + 130: {Code: 130, Name: "ROBUST_CHANNEL_CE14_ERROR", Description: "CE14: Unknown Error"}, + 131: {Code: 131, Name: "ROBUST_CHANNEL_CE15_ERROR", Description: "CE15: Unknown Error"}, + 132: {Code: 132, Name: "ROBUST_CHANNEL_CE16_ERROR", Description: "CE16: Unknown Error"}, + 133: {Code: 133, Name: "ROBUST_CHANNEL_CE17_ERROR", Description: "CE17: Unknown Error"}, + 134: {Code: 134, Name: "ROBUST_CHANNEL_CE18_ERROR", Description: "CE18: Unknown Error"}, + 135: {Code: 135, Name: "ROBUST_CHANNEL_CE19_ERROR", Description: "CE19: Unknown Error"}, + 136: {Code: 136, Name: "ALI_TRAINING_FAIL", Description: "Link Training Failed"}, + 137: {Code: 137, Name: "NVLINK_PRIV_ERR", Description: "NVLink Privilege Error"}, + 139: {Code: 139, Name: "ROBUST_CHANNEL_OFA1_ERROR", Description: "OFA1 Error"}, + 140: {Code: 140, Name: "UNRECOVERABLE_ECC_ERROR_ESCAPE", Description: "ECC Unrecovered Error"}, + 141: {Code: 141, Name: "ROBUST_CHANNEL_FAST_PATH_ERROR", Description: "CUDA Fast Path Error"}, + 142: {Code: 142, Name: "ROBUST_CHANNEL_NVENC3_ERROR", Description: "NVENC3 Error"}, + 143: {Code: 143, Name: "GPU_INIT_ERROR", Description: "GPU Initialization Error"}, + 144: {Code: 144, Name: "NVLINK_SAW_ERROR", Description: "NVLINK: SAW 
Error"}, + 145: {Code: 145, Name: "NVLINK_RLW_ERROR", Description: "NVLINK: RLW Error"}, + 146: {Code: 146, Name: "NVLINK_TLW_ERROR", Description: "NVLINK: TLW Error"}, + 147: {Code: 147, Name: "NVLINK_TREX_ERROR", Description: "NVLINK: TREX Error"}, + 148: {Code: 148, Name: "NVLINK_NVLPW_CTRL_ERROR", Description: "NVLINK: NVLPW_CTRL Error"}, + 149: {Code: 149, Name: "NVLINK_NETIR_ERROR", Description: "NVLINK: NETIR Error"}, + 150: {Code: 150, Name: "NVLINK_MSE_ERROR", Description: "NVLINK: MSE Error"}, + 151: {Code: 151, Name: "ROBUST_CHANNEL_KEY_ROTATION_ERROR", Description: "Key rotation Error"}, + 154: {Code: 154, Name: "GPU_RECOVERY_ACTION_CHANGED", Description: "GPU Recovery Action Changed"}, + 155: {Code: 155, Name: "NVLINK_SW_DEFINED_ERROR", Description: "NVLINK: SW Defined Error"}, + 156: {Code: 156, Name: "RESOURCE_RETIREMENT_EVENT", Description: "Resource Retirement Event"}, + 157: {Code: 157, Name: "RESOURCE_RETIREMENT_FAILURE", Description: "Resource Retirement Failure"}, + 158: {Code: 158, Name: "GPU_FATAL_TIMEOUT", Description: "GPU Fatal Timeout"}, + 159: {Code: 159, Name: "ROBUST_CHANNEL_CHI_NON_DATA_ERROR", Description: "CHI Non-Data Error"}, + 160: {Code: 160, Name: "CHANNEL_RETIREMENT_EVENT", Description: "Channel Retirement Event"}, + 161: {Code: 161, Name: "CHANNEL_RETIREMENT_FAILURE", Description: "Channel Retirement Failure"}, + 162: {Code: 162, Name: "PSHC_REENGAGED", Description: "Power Smoothing HW Circuitry capability reengaged"}, + 163: {Code: 163, Name: "PSHC_DISENGAGED", Description: "Power Smoothing HW Circuitry capability disengaged"}, + 164: {Code: 164, Name: "PSHC_LOW_LIFETIME", Description: "Power Smoothing HW Circuitry low lifetime reached"}, + 165: {Code: 165, Name: "PSHC_ZERO_LIFETIME", Description: "Power Smoothing HW Circuitry lifetime exhausted"}, + 166: {Code: 166, Name: "NVLINK_SECURE_CRYPTO_ERR", Description: "CC traffic seen prior to link properly being configured for encrypted traffic"}, + 167: {Code: 167, Name: 
"PCIE_FATAL_TIMEOUT", Description: "PCIE_FATAL_TIMEOUT"}, + 168: {Code: 168, Name: "REDUCED_GPU_MEMORY_CAPACITY", Description: "Errors found in WPR (write protected region)"}, + 169: {Code: 169, Name: "SEC2_HALT_ERROR", Description: "Internal micro-controller halt"}, + 170: {Code: 170, Name: "NVLINK_SECURE_OTHER", Description: "Interrupt seen in CC mode"}, + 171: {Code: 171, Name: "UNCORRECTABLE_DRAM_ERROR", Description: "Additional to Xid 48 providing more details on particulars of fault to differentiate DRAM/SRAM"}, + 172: {Code: 172, Name: "UNCORRECTABLE_SRAM_ERROR", Description: "Additional to Xid 48 providing more details on particulars of fault to differentiate DRAM/SRAM"}, +} diff --git a/customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh b/customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh index 064b97c..c9bce57 100644 --- a/customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh +++ b/customers/vm-troubleshooting/internal/triage/xidcatalog/sync_catalog.sh @@ -1,11 +1,10 @@ #!/usr/bin/env bash set -euo pipefail -# Manual sync helper for xidcatalog package. -# Usage: -# 1) Review current NVIDIA Xid docs and identify changed codes. -# 2) Edit catalog.go entries (neutral metadata only). -# 3) Keep support policy changes in ../xid.go. -# 4) Run repository verification commands. +# This script is a convenience wrapper. The canonical command is: +# cd customers/vm-troubleshooting && make update-xid-catalog +# +# See UPSTREAM.md for details on the automated update flow. -echo "xidcatalog sync is a manual curated process; see UPSTREAM.md" +cd "$(dirname "$0")/../../.." 
+exec make update-xid-catalog diff --git a/customers/vm-troubleshooting/schemas/manifest.schema.json b/customers/vm-troubleshooting/schemas/manifest.schema.json index 17cfee1..b2745eb 100644 --- a/customers/vm-troubleshooting/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting/schemas/manifest.schema.json @@ -96,7 +96,7 @@ "type": "string", "enum": [ "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", - "xid", "sxid", "firewall_posture", "critical_log" + "xid", "sxid", "firewall_posture", "critical_log", "data_quality" ] }, "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, diff --git a/customers/vm-troubleshooting/schemas/report-record.schema.json b/customers/vm-troubleshooting/schemas/report-record.schema.json index 4ff2bf1..24e3451 100644 --- a/customers/vm-troubleshooting/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting/schemas/report-record.schema.json @@ -41,7 +41,7 @@ "type": "string", "enum": [ "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", - "xid", "sxid", "firewall_posture", "critical_log" + "xid", "sxid", "firewall_posture", "critical_log", "data_quality" ] }, "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, diff --git a/customers/vm-troubleshooting/schemas/triage-result.schema.json b/customers/vm-troubleshooting/schemas/triage-result.schema.json index 2a57a5d..d036273 100644 --- a/customers/vm-troubleshooting/schemas/triage-result.schema.json +++ b/customers/vm-troubleshooting/schemas/triage-result.schema.json @@ -28,7 +28,7 @@ "properties": { "code": { "type": "string", - "enum": ["xid", "sxid", "firewall_posture", "critical_log"] + "enum": ["xid", "sxid", "firewall_posture", "critical_log", "data_quality"] }, "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, "confidence": { "type": "string", "enum": ["high", "low"] }, diff --git 
a/customers/vm-troubleshooting/tools/update-xid-catalog.py b/customers/vm-troubleshooting/tools/update-xid-catalog.py new file mode 100755 index 0000000..83607df --- /dev/null +++ b/customers/vm-troubleshooting/tools/update-xid-catalog.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python3 +""" +Fetch the NVIDIA Xid-Catalog.xlsx from docs.nvidia.com and generate: + 1. xid.md — human-readable reference (all sheets) + 2. catalog_generated.go — Go catalog map (code -> name + description) + 3. xid_generated_policies.go — Go policy map (code -> severity + action) + +Dependencies: Python 3.8+ stdlib only (no third-party packages). +Runs on developer machines (internet access required); outputs are committed +to the repo and compiled into a static, offline binary. + +Usage: + python3 tools/update-xid-catalog.py \\ + --xid-md xid.md \\ + --catalog-out internal/triage/xidcatalog/catalog_generated.go \\ + --policy-out internal/triage/xid_generated_policies.go +""" + +import argparse +import html.parser +import os +import re +import shutil +import subprocess +import sys +import tempfile +import time +import urllib.error +import urllib.request +import xml.etree.ElementTree as ET +import zipfile +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Dict, List, Optional, Tuple +from urllib.parse import urljoin + +CATALOG_PAGE = "https://docs.nvidia.com/deploy/xid-errors/analyzing-xid-catalog.html" +MAX_RETRIES = 3 +RETRY_BACKOFF = 2 # seconds, doubled each retry + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + +def _fetch_url(url: str, timeout: int = 30) -> bytes: + """Fetch URL with retries and exponential backoff.""" + req = urllib.request.Request(url, headers={"User-Agent": "update-xid-catalog/1.0"}) + delay = RETRY_BACKOFF + last_err: Optional[Exception] = None + for attempt in range(MAX_RETRIES): + try: + with 
urllib.request.urlopen(req, timeout=timeout) as resp: + return resp.read() + except (urllib.error.URLError, OSError) as e: + last_err = e + if attempt < MAX_RETRIES - 1: + print(f" Retry {attempt + 1}/{MAX_RETRIES} after {delay}s: {e}", file=sys.stderr) + time.sleep(delay) + delay *= 2 + raise RuntimeError(f"Failed after {MAX_RETRIES} attempts: {last_err}") + + +# --------------------------------------------------------------------------- +# HTML link extractor (find the .xlsx download link) +# --------------------------------------------------------------------------- + +class _LinkExtractor(html.parser.HTMLParser): + """Extract tags whose href ends with .xlsx.""" + + def __init__(self) -> None: + super().__init__() + self.links: List[str] = [] + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + if tag == "a": + href = dict(attrs).get("href", "") + if href and href.endswith(".xlsx"): + self.links.append(href) + + +def discover_xlsx_url() -> str: + """Scrape the NVIDIA Xid catalog page for the .xlsx download link.""" + print(f"Fetching catalog page: {CATALOG_PAGE}", file=sys.stderr) + page = _fetch_url(CATALOG_PAGE).decode("utf-8", errors="replace") + + parser = _LinkExtractor() + parser.feed(page) + if not parser.links: + raise RuntimeError("No .xlsx download link found on catalog page") + + url = urljoin(CATALOG_PAGE, parser.links[0]) + print(f" XLSX URL: {url}", file=sys.stderr) + return url + + +# --------------------------------------------------------------------------- +# XLSX native parser (Python stdlib: zipfile + xml.etree) +# --------------------------------------------------------------------------- + +_NS = { + "s": "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "pr": "http://schemas.openxmlformats.org/package/2006/relationships", +} + +_CELL_REF_RE = re.compile(r"^([A-Z]+)(\d+)$") + + +def _col_letter_to_index(col: str) 
-> int: + """Convert column letter (A, B, ..., AA, ...) to 0-based index.""" + result = 0 + for ch in col.upper(): + result = result * 26 + (ord(ch) - ord("A") + 1) + return result - 1 + + +def _parse_shared_strings(zf: zipfile.ZipFile) -> List[str]: + """Parse xl/sharedStrings.xml -> list of string values.""" + try: + data = zf.read("xl/sharedStrings.xml") + except KeyError: + return [] + root = ET.fromstring(data) + strings: List[str] = [] + for si in root.findall("s:si", _NS): + t_el = si.find("s:t", _NS) + if t_el is not None and t_el.text: + strings.append(t_el.text) + else: + # Rich text: concatenate fragments. + strings.append("".join( + rt.text for r_el in si.findall("s:r", _NS) + for rt in [r_el.find("s:t", _NS)] + if rt is not None and rt.text + )) + return strings + + +def _parse_worksheet(zf: zipfile.ZipFile, path: str, shared: List[str]) -> List[List[str]]: + """Parse a worksheet XML file into a list of rows (list of cell strings).""" + root = ET.fromstring(zf.read(path)) + rows: Dict[int, Dict[int, str]] = {} + max_col = 0 + + for row_el in root.findall(".//s:sheetData/s:row", _NS): + row_idx = int(row_el.get("r", "0")) + for c_el in row_el.findall("s:c", _NS): + m = _CELL_REF_RE.match(c_el.get("r", "")) + if not m: + continue + col_idx = _col_letter_to_index(m.group(1)) + max_col = max(max_col, col_idx) + + cell_type = c_el.get("t", "") + v_el = c_el.find("s:v", _NS) + val = "" + if v_el is not None and v_el.text: + if cell_type == "s": + idx = int(v_el.text) + val = shared[idx] if idx < len(shared) else "" + elif cell_type == "b": + val = "TRUE" if v_el.text == "1" else "FALSE" + else: + val = v_el.text + elif cell_type == "inlineStr": + is_el = c_el.find("s:is/s:t", _NS) + if is_el is not None and is_el.text: + val = is_el.text + + rows.setdefault(row_idx, {})[col_idx] = val + + if not rows: + return [] + + result: List[List[str]] = [] + for row_idx in sorted(rows): + row_data = rows[row_idx] + row = [""] * (max_col + 1) + for ci, v in 
row_data.items(): + row[ci] = v + result.append(row) + return result + + +def parse_xlsx(path: str) -> Dict[str, List[List[str]]]: + """Parse an XLSX file into {sheet_name: [[cell, ...], ...]}.""" + with zipfile.ZipFile(path, "r") as zf: + if "xl/workbook.xml" not in zf.namelist(): + raise RuntimeError("Invalid XLSX: missing xl/workbook.xml") + + shared = _parse_shared_strings(zf) + + wb = ET.fromstring(zf.read("xl/workbook.xml")) + sheet_info: List[Tuple[str, str]] = [ + (s.get("name", ""), + s.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id", "")) + for s in wb.findall(".//s:sheets/s:sheet", _NS) + ] + + rels_root = ET.fromstring(zf.read("xl/_rels/workbook.xml.rels")) + rid_to_path = { + rel.get("Id", ""): rel.get("Target", "") + for rel in rels_root.findall("pr:Relationship", _NS) + } + + sheets: Dict[str, List[List[str]]] = {} + for name, rid in sheet_info: + target = rid_to_path.get(rid, "") + if not name or not target: + continue + ws_path = f"xl/{target}" if not target.startswith("/") else target.lstrip("/") + try: + sheets[name] = _parse_worksheet(zf, ws_path, shared) + except KeyError: + print(f" Warning: worksheet {ws_path} not found", file=sys.stderr) + + return sheets + + +# --------------------------------------------------------------------------- +# Column discovery +# --------------------------------------------------------------------------- + +def _normalize_header(row: List[str]) -> List[str]: + """Flatten multiline cells to single lowercase strings.""" + return [" ".join(c.split()).strip().lower() for c in row] + + +def _find_header_row(rows: List[List[str]], required: List[str]) -> int: + """Find the first row containing all required column names (case-insensitive substring).""" + for i, row in enumerate(rows): + norm = _normalize_header(row) + if all( + any(cell and (req.lower() in cell or cell in req.lower()) for cell in norm) + for req in required + ): + return i + raise RuntimeError(f"Header row with columns 
{required} not found") + + +def _find_col(normalized_header: List[str], *candidates: str) -> int: + """Find column index matching any candidate (case-insensitive substring). + Expects a pre-normalized header (from _normalize_header).""" + for cand in candidates: + cl = cand.lower() + for i, h in enumerate(normalized_header): + if h and (cl in h or h in cl): + return i + return -1 + + +# --------------------------------------------------------------------------- +# Extract Xid entries from the "Xids" sheet +# --------------------------------------------------------------------------- + +@dataclass +class XidEntry: + """A single Xid entry extracted from the XLSX.""" + code: int + name: str + description: str + resolution_bucket: str + + def go_severity_and_reset(self) -> Tuple[str, bool, str]: + """Derive (severity_const, requires_reset, action) from resolution bucket.""" + bucket = self.resolution_bucket.strip().upper() + if "RESTART_BM" in bucket: + return "collector.SeverityCritical", True, "Reboot node" + if "RESET_GPU" in bucket: + return "collector.SeverityCritical", True, "Reset GPU" + if "WORKFLOW" in bucket and "RESET" in bucket: + return "collector.SeverityCritical", True, "Reset GPU" + if "RESTART_APP" in bucket: + return "collector.SeverityWarning", False, "Restart application" + if "IGNORE" in bucket: + return "collector.SeverityInfo", False, "No action required" + return "collector.SeverityWarning", False, "Contact support" + + +def _cell_int(val: str) -> Optional[int]: + """Parse a cell as int, handling float strings like '8.0'.""" + val = val.strip() + if not val: + return None + try: + return int(val) + except ValueError: + try: + return int(float(val)) + except (ValueError, OverflowError): + return None + + +def extract_xid_entries(sheets: Dict[str, List[List[str]]]) -> List[XidEntry]: + """Extract active Xid entries from the 'Xids' sheet.""" + xid_sheet = None + for name, rows in sheets.items(): + if name.lower().strip() in ("xids", "xid"): + 
xid_sheet = rows + break + if xid_sheet is None: + raise RuntimeError("'Xids' sheet not found in XLSX") + if len(xid_sheet) < 2: + raise RuntimeError("Xids sheet has no data rows") + + header_idx = _find_header_row(xid_sheet, ["Code", "Mnemonic"]) + header = xid_sheet[header_idx] + norm = _normalize_header(header) + data_rows = xid_sheet[header_idx + 1:] + + # Column indices — "code" before "xid" to avoid false match on "Type (XID)". + col_code = _find_col(norm, "code", "xid #", "xid#") + col_type = _find_col(norm, "type") + col_name = _find_col(norm, "mnemonic", "name", "xid name") + col_desc = _find_col(norm, "description", "desc") + col_bucket = _find_col(norm, "resolution bucket", "immediate action") + + if col_code < 0: + raise RuntimeError(f"Could not find 'Code' column in header: {header}") + if col_name < 0: + raise RuntimeError(f"Could not find 'Mnemonic'/'Name' column in header: {header}") + + # "Applies to" columns — match GPU family names + gpu_keywords = ("applies", "a100", "h100", "h200", "b100", "b200", "gb200", "gb300") + applies_cols = [i for i, h in enumerate(norm) if any(kw in h for kw in gpu_keywords)] + + print(f" Xids: {len(data_rows)} data rows, {len(applies_cols)} applies-to columns", file=sys.stderr) + + def _cell(row: List[str], col: int) -> str: + return row[col].strip() if 0 <= col < len(row) else "" + + entries: List[XidEntry] = [] + for row in data_rows: + code = _cell_int(_cell(row, col_code)) + if code is None: + continue + + xid_type = _cell(row, col_type).upper() + if xid_type and xid_type not in ("XID", "SXID"): + continue + + # Filter by "Applies to" — skip if all relevant columns say NO + if applies_cols and not any( + _cell(row, c).upper() in ("YES", "Y", "TRUE", "1") + for c in applies_cols + ): + continue + + name = _cell(row, col_name) or f"XID_{code}" + name = re.sub(r"[^A-Za-z0-9_]", "_", name).strip("_").upper() + desc = _cell(row, col_desc) or name + + entries.append(XidEntry(code=code, name=name, description=desc, + 
resolution_bucket=_cell(row, col_bucket))) + + entries.sort(key=lambda e: e.code) + return entries + + +# --------------------------------------------------------------------------- +# Output generators +# --------------------------------------------------------------------------- + +def _go_string(value: str) -> str: + """Escape a Python string for a double-quoted Go string literal.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("\r", " ").replace("\n", " ") + + +def generate_xid_md(sheets: Dict[str, List[List[str]]], source_url: str) -> str: + """Generate xid.md with all sheets as markdown tables.""" + lines = [ + f"", + f"", + "", "# NVIDIA Xid Error Catalog", "", + ] + + for sheet_name, rows in sheets.items(): + lines.append(f"## {sheet_name}") + lines.append("") + if not rows: + lines.extend(["*(empty sheet)*", ""]) + continue + + hdr = [c.strip().replace("\n", " ") if c else "" for c in rows[0]] + lines.append("| " + " | ".join(hdr) + " |") + lines.append("| " + " | ".join(["---"] * len(hdr)) + " |") + + for row in rows[1:]: + padded = list(row) + [""] * (len(hdr) - len(row)) + cells = [c.strip().replace("\n", " ").replace("|", "\\|") for c in padded[:len(hdr)]] + lines.append("| " + " | ".join(cells) + " |") + + lines.append("") + + return "\n".join(lines) + "\n" + + +def generate_catalog_go(entries: List[XidEntry], source_url: str) -> str: + """Generate catalog_generated.go.""" + lines = [ + "// Code generated by tools/update-xid-catalog.py; DO NOT EDIT.", + f"// Source: {source_url}", "", + "package xidcatalog", "", + "var catalog = map[int]XidInfo{", + ] + for e in entries: + lines.append(f'\t{e.code}: {{Code: {e.code}, Name: "{_go_string(e.name)}", ' + f'Description: "{_go_string(e.description)}"}},') + lines.extend(["}", ""]) + return "\n".join(lines) + + +def generate_policies_go(entries: List[XidEntry], source_url: str) -> str: + """Generate xid_generated_policies.go.""" + lines = [ + "// Code generated by 
tools/update-xid-catalog.py; DO NOT EDIT.", + f"// Source: {source_url}", "", + "package triage", "", + 'import "github.com/NexGenCloud/vm-diagnostics/internal/collector"', "", + "var generatedXidPolicies = map[int]xidPolicy{", + ] + for e in entries: + sev, reset, action = e.go_severity_and_reset() + lines.append(f'\t{e.code}: {{{sev}, {"true" if reset else "false"}, "{_go_string(action)}"}},') + lines.extend(["}", ""]) + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Change summary +# --------------------------------------------------------------------------- + +_CATALOG_LINE_RE = re.compile( + r'^\s*(\d+):\s*\{Code:\s*\d+,\s*Name:\s*"([^"]*)",\s*Description:\s*"([^"]*)"\}', re.MULTILINE) +_POLICY_LINE_RE = re.compile( + r'^\s*(\d+):\s*\{([^,]+),\s*(true|false),\s*"([^"]*)"\}', re.MULTILINE) + + +def _parse_go_catalog(text: str) -> Dict[int, Tuple[str, str]]: + """Parse catalog_generated.go -> {code: (name, description)}.""" + return {int(m.group(1)): (m.group(2), m.group(3)) for m in _CATALOG_LINE_RE.finditer(text)} + + +def _parse_go_policies(text: str) -> Dict[int, Tuple[str, bool, str]]: + """Parse xid_generated_policies.go -> {code: (severity, reset, action)}.""" + return { + int(m.group(1)): (m.group(2).strip(), m.group(3) == "true", m.group(4)) + for m in _POLICY_LINE_RE.finditer(text) + } + + +def _read_file(path: str) -> str: + """Read file contents, returning empty string if missing.""" + try: + with open(path, "r") as f: + return f.read() + except FileNotFoundError: + return "" + + +def _print_changes(catalog_path: str, policy_path: str, + new_catalog: str, new_policy: str) -> None: + """Compare old vs new generated files and print a concise human-friendly diff.""" + old_cat = _parse_go_catalog(_read_file(catalog_path)) + new_cat = _parse_go_catalog(new_catalog) + old_pol = _parse_go_policies(_read_file(policy_path)) + new_pol = _parse_go_policies(new_policy) + + old_codes = set(old_cat) + 
new_codes = set(new_cat) + added = sorted(new_codes - old_codes) + removed = sorted(old_codes - new_codes) + common = sorted(old_codes & new_codes) + + changed: List[str] = [] + for code in common: + diffs: List[str] = [] + if old_cat[code] != new_cat[code]: + oname, odesc = old_cat[code] + nname, ndesc = new_cat[code] + if oname != nname: + diffs.append(f"name: {oname} -> {nname}") + if odesc != ndesc: + diffs.append(f"desc: {odesc} -> {ndesc}") + if old_pol.get(code) != new_pol.get(code): + op = old_pol.get(code, ("?", False, "?")) + np = new_pol.get(code, ("?", False, "?")) + if op[0] != np[0]: + diffs.append(f"severity: {op[0].split('.')[-1]} -> {np[0].split('.')[-1]}") + if op[1] != np[1]: + diffs.append(f"reset: {op[1]} -> {np[1]}") + if op[2] != np[2]: + diffs.append(f"action: {op[2]} -> {np[2]}") + if diffs: + changed.append(f" Xid {code}: {'; '.join(diffs)}") + + if not added and not removed and not changed: + print(" No changes to catalog.", file=sys.stderr) + return + + if added: + names = [f"{c} ({new_cat[c][0]})" for c in added] + print(f" + {len(added)} added: {', '.join(names)}", file=sys.stderr) + if removed: + names = [f"{c} ({old_cat[c][0]})" for c in removed] + print(f" - {len(removed)} removed: {', '.join(names)}", file=sys.stderr) + if changed: + print(f" ~ {len(changed)} modified:", file=sys.stderr) + for line in changed: + print(line, file=sys.stderr) + + +# --------------------------------------------------------------------------- +# File writing helpers +# --------------------------------------------------------------------------- + +def _write_atomic(path: str, content: str) -> None: + """Write to temp file then move — atomic on same filesystem.""" + abs_path = os.path.abspath(path) + dir_name = os.path.dirname(abs_path) + os.makedirs(dir_name, exist_ok=True) + fd, tmp = tempfile.mkstemp(dir=dir_name, suffix=".tmp") + try: + os.write(fd, content.encode("utf-8")) + os.close(fd) + fd = -1 + shutil.move(tmp, abs_path) + except Exception: + 
if fd >= 0: + os.close(fd) + try: + os.unlink(tmp) + except OSError: + pass + raise + + +def _gofmt(path: str) -> None: + """Run gofmt on a Go file. Non-fatal if gofmt is not available.""" + try: + subprocess.run(["gofmt", "-w", path], check=True, capture_output=True) + except FileNotFoundError: + print(f" Warning: gofmt not found, skipping format of {path}", file=sys.stderr) + except subprocess.CalledProcessError as e: + print(f" Warning: gofmt failed on {path}: {e.stderr.decode()}", file=sys.stderr) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser(description="Update Xid catalog from NVIDIA XLSX") + parser.add_argument("--xid-md", required=True, help="Output path for xid.md") + parser.add_argument("--catalog-out", required=True, help="Output path for catalog_generated.go") + parser.add_argument("--policy-out", required=True, help="Output path for xid_generated_policies.go") + parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing") + parser.add_argument("--local-xlsx", help="Use a local XLSX file instead of downloading") + args = parser.parse_args() + + # Step 1: Get XLSX + if args.local_xlsx: + tmp_xlsx = os.path.abspath(args.local_xlsx) + xlsx_url = f"file://{tmp_xlsx}" + owns_tmp = False + if not os.path.exists(tmp_xlsx): + print(f"ERROR: Local XLSX not found: {tmp_xlsx}", file=sys.stderr) + sys.exit(1) + else: + xlsx_url = discover_xlsx_url() + print("Downloading XLSX...", file=sys.stderr) + fd, tmp_xlsx = tempfile.mkstemp(suffix=".xlsx") + os.close(fd) + owns_tmp = True + data = _fetch_url(xlsx_url) + with open(tmp_xlsx, "wb") as f: + f.write(data) + print(f" Downloaded {len(data):,} bytes", file=sys.stderr) + + try: + # Step 2: Parse XLSX + print("Parsing XLSX...", file=sys.stderr) + sheets = parse_xlsx(tmp_xlsx) + print(f" {len(sheets)} 
sheet(s): {', '.join(sheets.keys())}", file=sys.stderr) + + # Step 3: Extract Xid entries + entries = extract_xid_entries(sheets) + print(f" {len(entries)} active Xid entries", file=sys.stderr) + + if len(entries) < 10: + print("ERROR: Suspiciously few entries — possible schema change", file=sys.stderr) + sys.exit(1) + + # Step 4: Generate outputs + md_content = generate_xid_md(sheets, xlsx_url) + catalog_content = generate_catalog_go(entries, xlsx_url) + policy_content = generate_policies_go(entries, xlsx_url) + + # Step 5: Show what changed + _print_changes(args.catalog_out, args.policy_out, catalog_content, policy_content) + + if args.dry_run: + print(f"\nDry run: {args.xid_md} ({len(md_content):,}B), " + f"{args.catalog_out} ({len(entries)} entries), " + f"{args.policy_out}", file=sys.stderr) + return + + # Step 6: Write files atomically + for path, content in [(args.xid_md, md_content), + (args.catalog_out, catalog_content), + (args.policy_out, policy_content)]: + _write_atomic(path, content) + + _gofmt(args.catalog_out) + _gofmt(args.policy_out) + + print(f"Done: {len(entries)} Xid entries -> 3 files.", file=sys.stderr) + + finally: + if owns_tmp: + os.unlink(tmp_xlsx) + + +if __name__ == "__main__": + main() From 4edca230201e08d01b60e09c43eba873ba9176a9 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Thu, 9 Apr 2026 18:48:23 +0200 Subject: [PATCH 07/23] gather-info v0.2.1: final cleanup Add __pycache__/ to gitignore. Replace stringly-typed path check for ANSI stripping with a struct field in nvidia collector. 
--- customers/vm-troubleshooting/.gitignore | 1 + .../internal/collector/nvidia.go | 19 ++++++++++--------- .../internal/config/version.go | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/customers/vm-troubleshooting/.gitignore b/customers/vm-troubleshooting/.gitignore index 3a12ca2..c15005a 100644 --- a/customers/vm-troubleshooting/.gitignore +++ b/customers/vm-troubleshooting/.gitignore @@ -1,2 +1,3 @@ bin/ gather-info +__pycache__/ diff --git a/customers/vm-troubleshooting/internal/collector/nvidia.go b/customers/vm-troubleshooting/internal/collector/nvidia.go index f9d7981..5778dc8 100644 --- a/customers/vm-troubleshooting/internal/collector/nvidia.go +++ b/customers/vm-troubleshooting/internal/collector/nvidia.go @@ -61,18 +61,19 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) return r, nil } for _, spec := range []struct { - path string - args []string - env []string + path string + args []string + env []string + stripANSI bool }{ - {"nvidia/nvidia-smi.txt", []string{}, nil}, - {"nvidia/nvidia-smi-q.txt", []string{"-q"}, nil}, - {"nvidia/nvidia-smi-topo.txt", []string{"topo", "-m"}, []string{"TERM=dumb"}}, - {"nvidia/nvidia-smi-nvlink.txt", []string{"nvlink", "--status"}, nil}, - {"nvidia/nvidia-smi-pmon.txt", []string{"pmon", "-s", "um", "-c", "1"}, nil}, + {"nvidia/nvidia-smi.txt", []string{}, nil, false}, + {"nvidia/nvidia-smi-q.txt", []string{"-q"}, nil, false}, + {"nvidia/nvidia-smi-topo.txt", []string{"topo", "-m"}, []string{"TERM=dumb"}, true}, + {"nvidia/nvidia-smi-nvlink.txt", []string{"nvlink", "--status"}, nil, false}, + {"nvidia/nvidia-smi-pmon.txt", []string{"pmon", "-s", "um", "-c", "1"}, nil, false}, } { c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: "nvidia-smi", Args: spec.args, Env: spec.env, Timeout: config.TimeoutMedium}, "nvidia-smi", "gpu") - if spec.path == "nvidia/nvidia-smi-topo.txt" { + if spec.stripANSI { if err := stripANSIArtifact(filepath.Join(c.Writer.Root(), 
spec.path)); err != nil { r.RecordErrorForArtifact(ErrProbeFailed, fmt.Sprintf("strip ANSI for %s: %v", spec.path, err), spec.path) } diff --git a/customers/vm-troubleshooting/internal/config/version.go b/customers/vm-troubleshooting/internal/config/version.go index 00579b5..788a997 100644 --- a/customers/vm-troubleshooting/internal/config/version.go +++ b/customers/vm-troubleshooting/internal/config/version.go @@ -1,7 +1,7 @@ package config var ( - Version = "0.2.0" + Version = "0.2.1" Commit = "unknown" BuildDate = "unknown" ) From 445e13157642fb235b53c05703c40b21329ed1eb Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Thu, 9 Apr 2026 18:54:49 +0200 Subject: [PATCH 08/23] gather-info: make --xid-md optional in update-xid-catalog The Makefile was writing a dangling xid.md to the repo root on every build. Make the flag optional and drop it from the Makefile so only the two Go source files needed for compilation are generated. --- customers/vm-troubleshooting/Makefile | 1 - .../tools/update-xid-catalog.py | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/customers/vm-troubleshooting/Makefile b/customers/vm-troubleshooting/Makefile index 27d8840..9a390f3 100644 --- a/customers/vm-troubleshooting/Makefile +++ b/customers/vm-troubleshooting/Makefile @@ -36,7 +36,6 @@ clean: update-xid-catalog: @python3 tools/update-xid-catalog.py \ - --xid-md ../../xid.md \ --catalog-out internal/triage/xidcatalog/catalog_generated.go \ --policy-out internal/triage/xid_generated_policies.go \ && gofmt -w internal/triage/xidcatalog/catalog_generated.go internal/triage/xid_generated_policies.go \ diff --git a/customers/vm-troubleshooting/tools/update-xid-catalog.py b/customers/vm-troubleshooting/tools/update-xid-catalog.py index 83607df..f123ffd 100755 --- a/customers/vm-troubleshooting/tools/update-xid-catalog.py +++ b/customers/vm-troubleshooting/tools/update-xid-catalog.py @@ -11,9 +11,9 @@ Usage: python3 tools/update-xid-catalog.py \\ - 
--xid-md xid.md \\ --catalog-out internal/triage/xidcatalog/catalog_generated.go \\ - --policy-out internal/triage/xid_generated_policies.go + --policy-out internal/triage/xid_generated_policies.go \\ + [--xid-md xid.md] """ import argparse @@ -551,7 +551,7 @@ def _gofmt(path: str) -> None: def main() -> None: parser = argparse.ArgumentParser(description="Update Xid catalog from NVIDIA XLSX") - parser.add_argument("--xid-md", required=True, help="Output path for xid.md") + parser.add_argument("--xid-md", help="Output path for xid.md (optional)") parser.add_argument("--catalog-out", required=True, help="Output path for catalog_generated.go") parser.add_argument("--policy-out", required=True, help="Output path for xid_generated_policies.go") parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing") @@ -592,7 +592,7 @@ def main() -> None: sys.exit(1) # Step 4: Generate outputs - md_content = generate_xid_md(sheets, xlsx_url) + md_content = generate_xid_md(sheets, xlsx_url) if args.xid_md else None catalog_content = generate_catalog_go(entries, xlsx_url) policy_content = generate_policies_go(entries, xlsx_url) @@ -600,21 +600,27 @@ def main() -> None: _print_changes(args.catalog_out, args.policy_out, catalog_content, policy_content) if args.dry_run: - print(f"\nDry run: {args.xid_md} ({len(md_content):,}B), " - f"{args.catalog_out} ({len(entries)} entries), " - f"{args.policy_out}", file=sys.stderr) + parts = [f"{args.catalog_out} ({len(entries)} entries)", args.policy_out] + if args.xid_md and md_content: + parts.insert(0, f"{args.xid_md} ({len(md_content):,}B)") + print(f"\nDry run: {', '.join(parts)}", file=sys.stderr) return # Step 6: Write files atomically - for path, content in [(args.xid_md, md_content), - (args.catalog_out, catalog_content), - (args.policy_out, policy_content)]: + outputs: list[tuple[str, str]] = [ + (args.catalog_out, catalog_content), + (args.policy_out, policy_content), + ] + if args.xid_md and md_content: 
+ outputs.append((args.xid_md, md_content)) + for path, content in outputs: _write_atomic(path, content) _gofmt(args.catalog_out) _gofmt(args.policy_out) - print(f"Done: {len(entries)} Xid entries -> 3 files.", file=sys.stderr) + file_count = len(outputs) + print(f"Done: {len(entries)} Xid entries -> {file_count} files.", file=sys.stderr) finally: if owns_tmp: From e1f8c1cbc89876cd7872c0c672a1becafc603079 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 13 Apr 2026 13:46:25 +0200 Subject: [PATCH 09/23] feat: add vm-troubleshooting-dashboard; harden gather-info; docs & gitignore - Dashboard: Go API, SQLite ingest/store, pathutil, React+Vite UI, mirrored schemas - Collector: journal NDJSON streaming, Docker/sanitize, triage enrichment+evidence, runner/output/schema updates - Repo: monorepo AGENTS/CLAUDE/README, ARCHITECTURE+SCHEMA-COMPATIBILITY, docs index, .gitignore refresh --- .gitignore | 23 +- AGENTS.md | 50 +- CLAUDE.md | 6 +- README.md | 34 +- .../vm-troubleshooting-dashboard/.gitignore | 37 + .../vm-troubleshooting-dashboard/AGENTS.md | 102 + .../vm-troubleshooting-dashboard/CLAUDE.md | 1 + .../vm-troubleshooting-dashboard/CODEMAP.md | 96 + .../vm-troubleshooting-dashboard/Makefile | 42 + .../frontend/.gitignore | 28 + .../frontend/components.json | 25 + .../frontend/eslint.config.js | 30 + .../frontend/index.html | 16 + .../frontend/package.json | 44 + .../frontend/pnpm-lock.yaml | 4485 +++++++++++++++++ .../frontend/public/favicon.svg | 1 + .../frontend/public/icons.svg | 24 + .../frontend/src/api/archives.ts | 74 + .../frontend/src/api/artifacts.ts | 30 + .../frontend/src/api/client.ts | 31 + .../frontend/src/api/issues.ts | 43 + .../artifacts/ArtifactBrowserPage.tsx | 311 ++ .../issue-detail/IssueDetailPage.tsx | 283 ++ .../src/components/issues/IssuesPage.tsx | 273 + .../src/components/layout/AppShell.tsx | 57 + .../src/components/layout/ArchiveHeader.tsx | 103 + .../frontend/src/components/ui/badge.tsx | 52 + 
.../frontend/src/components/ui/button.tsx | 58 + .../frontend/src/components/ui/card.tsx | 103 + .../frontend/src/components/ui/input.tsx | 20 + .../frontend/src/components/ui/kv.tsx | 8 + .../frontend/src/components/ui/pill.tsx | 7 + .../frontend/src/components/ui/select.tsx | 48 + .../src/components/ui/severity-badge.tsx | 31 + .../frontend/src/components/ui/table.tsx | 114 + .../frontend/src/components/ui/tabs.tsx | 82 + .../src/components/upload/HomePage.tsx | 193 + .../frontend/src/index.css | 103 + .../frontend/src/lib/utils.ts | 175 + .../frontend/src/main.tsx | 26 + .../frontend/src/router.tsx | 23 + .../frontend/src/types.ts | 125 + .../frontend/tsconfig.app.json | 32 + .../frontend/tsconfig.json | 13 + .../frontend/tsconfig.node.json | 24 + .../frontend/vite.config.ts | 25 + customers/vm-troubleshooting-dashboard/go.mod | 20 + customers/vm-troubleshooting-dashboard/go.sum | 53 + .../internal/api/server.go | 625 +++ .../internal/api/server_test.go | 229 + .../internal/ingest/ingest.go | 342 ++ .../internal/ingest/ingest_test.go | 225 + .../internal/model/types.go | 241 + .../internal/pathutil/safejoin.go | 56 + .../internal/pathutil/safejoin_test.go | 50 + .../internal/store/evidence.go | 251 + .../internal/store/schema.sql | 55 + .../internal/store/store.go | 613 +++ .../internal/store/store_test.go | 136 + .../schemas/manifest.schema.json | 143 + .../schemas/report-record.schema.json | 84 + .../schemas/triage-result.schema.json | 48 + customers/vm-troubleshooting/.gitignore | 14 + customers/vm-troubleshooting/AGENTS.md | 19 +- customers/vm-troubleshooting/ARCHITECTURE.md | 235 + customers/vm-troubleshooting/CLAUDE.md | 1 + customers/vm-troubleshooting/CODEMAP.md | 1 + .../SCHEMA-COMPATIBILITY.md | 104 + .../internal/collector/collector.go | 17 +- .../internal/collector/collector_test.go | 296 +- .../internal/collector/docker.go | 44 +- .../internal/collector/docker_test.go | 123 + .../internal/collector/journal.go | 87 +- 
.../collector/journal_ndjson_build_test.go | 105 + .../internal/collector/nvidia.go | 66 +- .../internal/collector/services.go | 6 +- .../internal/collector/system.go | 7 +- .../output/archive_consistency_test.go | 2 +- .../internal/output/contract_test.go | 20 +- .../internal/output/manifest.go | 1 + .../internal/output/manifest_test.go | 15 +- .../internal/output/report.go | 2 +- .../internal/output/summary.go | 15 +- .../internal/output/summary_test.go | 68 + .../internal/runner/runner.go | 125 +- .../internal/runner/runner_enrichment_test.go | 50 + .../internal/sanitize/sanitize.go | 90 +- .../internal/sanitize/sanitize_test.go | 133 +- .../internal/triage/enrichment.go | 431 ++ .../internal/triage/enrichment_test.go | 477 ++ .../internal/triage/evidence.go | 107 + .../internal/triage/evidence_test.go | 155 + .../internal/triage/triage.go | 35 +- .../schemas/manifest.schema.json | 1 + .../schemas/report-record.schema.json | 1 + .../schemas/triage-result.schema.json | 2 +- docs/README.md | 19 + docs/architecture.md | 228 +- 98 files changed, 13466 insertions(+), 318 deletions(-) create mode 100644 customers/vm-troubleshooting-dashboard/.gitignore create mode 100644 customers/vm-troubleshooting-dashboard/AGENTS.md create mode 100644 customers/vm-troubleshooting-dashboard/CLAUDE.md create mode 100644 customers/vm-troubleshooting-dashboard/CODEMAP.md create mode 100644 customers/vm-troubleshooting-dashboard/Makefile create mode 100644 customers/vm-troubleshooting-dashboard/frontend/.gitignore create mode 100644 customers/vm-troubleshooting-dashboard/frontend/components.json create mode 100644 customers/vm-troubleshooting-dashboard/frontend/eslint.config.js create mode 100644 customers/vm-troubleshooting-dashboard/frontend/index.html create mode 100644 customers/vm-troubleshooting-dashboard/frontend/package.json create mode 100644 customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml create mode 100644 
customers/vm-troubleshooting-dashboard/frontend/public/favicon.svg create mode 100644 customers/vm-troubleshooting-dashboard/frontend/public/icons.svg create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/api/artifacts.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/api/client.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/layout/ArchiveHeader.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/badge.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/button.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/card.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/input.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/pill.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/select.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/table.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/tabs.tsx 
create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/upload/HomePage.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/index.css create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/main.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/router.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/types.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/tsconfig.app.json create mode 100644 customers/vm-troubleshooting-dashboard/frontend/tsconfig.json create mode 100644 customers/vm-troubleshooting-dashboard/frontend/tsconfig.node.json create mode 100644 customers/vm-troubleshooting-dashboard/frontend/vite.config.ts create mode 100644 customers/vm-troubleshooting-dashboard/go.mod create mode 100644 customers/vm-troubleshooting-dashboard/go.sum create mode 100644 customers/vm-troubleshooting-dashboard/internal/api/server.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/api/server_test.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/model/types.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/evidence.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/schema.sql create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/store.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/store_test.go create mode 100644 
customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json create mode 100644 customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json create mode 100644 customers/vm-troubleshooting-dashboard/schemas/triage-result.schema.json create mode 100644 customers/vm-troubleshooting/ARCHITECTURE.md create mode 100644 customers/vm-troubleshooting/CLAUDE.md create mode 100644 customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md create mode 100644 customers/vm-troubleshooting/internal/collector/journal_ndjson_build_test.go create mode 100644 customers/vm-troubleshooting/internal/runner/runner_enrichment_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/enrichment.go create mode 100644 customers/vm-troubleshooting/internal/triage/enrichment_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/evidence.go create mode 100644 customers/vm-troubleshooting/internal/triage/evidence_test.go create mode 100644 docs/README.md diff --git a/.gitignore b/.gitignore index 3c5a9d3..bdf6ce5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,11 +3,19 @@ *.exe bin/ +# Go (repo-wide) +coverage.out +coverage.html +*.coverprofile +*.prof +cpu.prof +mem.prof + # OS metadata .DS_Store Thumbs.db -# Editor/IDE +# Editor/IDE (local only; subprojects may un-ignore e.g. !.vscode/extensions.json) *.swp *.swo *~ @@ -16,13 +24,20 @@ Thumbs.db *.sublime-project *.sublime-workspace -# Environment and secrets +# Environment and secrets (allow committed templates) .env .env.* +!.env.example +!.env*.example *.pem *.key +*.p12 +*.pfx -docs/plans/ - +# Local tool caches (not shared project config) .mcp_data/ .serena/ +.cursor/ +.codex/ +.claude/ +docs/plans/ \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 6edf9fb..a2dd2cd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,6 +4,17 @@ This repository contains customer-facing support scripts and binaries for collecting diagnostics or applying narrow workarounds on customer VMs. 
Assume the operator is a customer or support engineer running the tool locally on a machine we do not control and cannot access directly. +## Monorepo model +- First-class projects live under `customers//`. Each project has its own **`AGENTS.md`** with verification commands, stack-specific rules, and links to `CODEMAP.md` / `ARCHITECTURE.md` as needed. +- When changing code under a project directory, follow **that project's `AGENTS.md`** for how to build, test, and review. +- New projects may use any stack (Go, Node, Python, shell, etc.). The **root file stays policy and discovery**—not a catalog of every toolchain's commands. + +## Cross-project contracts +- **`customers/vm-troubleshooting/`** (`gather-info`) **produces** diagnostic archives (manifest, report stream, triage data, schemas). +- **`customers/vm-troubleshooting-dashboard/`** **consumes** those archives for ingest and UI. +- Authoritative compatibility and versioning rules: **`customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md`**. Producer or consumer changes that affect archive shape, schema majors, or mirrored JSON files usually belong in a **coordinated** change (both sides + that doc when applicable). +- Dashboard `schemas/` mirrors collector `schemas/`; keep them aligned per each project's `AGENTS.md` checklists. + ## Operating constraints - Prefer self-contained tooling with minimal external dependencies. - Assume common Linux distributions first: Ubuntu 20.04/22.04/24.04. Treat other distros as best-effort unless explicitly supported. @@ -37,34 +48,31 @@ Assume the operator is a customer or support engineer running the tool locally o - Prefer machine-readable sources when available. - Do not silently ignore errors that affect support value; record them in output. -## Verification -Before considering work complete, run the narrowest relevant checks that exist. +## Verification (repository root only) +Before considering work complete, run the narrowest relevant checks for what you changed. 
-Current commands: +**Root-level assets only** (e.g. scripts in the repo root): - Bash lint: `shellcheck nvidia-drm-disable-modeset.sh` - Bash syntax: `bash -n nvidia-drm-disable-modeset.sh` -If a Go implementation exists: -- Format: `cd customers/vm-troubleshooting && gofmt -w .` -- Vet: `cd customers/vm-troubleshooting && go vet ./...` -- Test: `cd customers/vm-troubleshooting && go test ./...` -- Build: `cd customers/vm-troubleshooting && CGO_ENABLED=0 go build ./cmd/gather-info` +**Anything under `customers/`:** use that project's **`AGENTS.md`** verification section (Go, frontend, etc.). ## Repo structure -- `customers/`: customer-run support tooling and related assets. -- Root scripts: focused one-off support or remediation utilities. -- `customers/vm-troubleshooting/`: current Go-based diagnostics collector. -- `customers/vm-troubleshooting/CODEMAP.md`: architecture and collector map for the diagnostics collector. +- `customers/`: shipped or support-facing tools and assets; each subfolder is a project with its own `AGENTS.md`. +- Root scripts: focused one-off support or remediation utilities (verify with root-only commands above). +- `customers/vm-troubleshooting/`: diagnostics collector (`gather-info`). Maps: `CODEMAP.md`, `ARCHITECTURE.md`. +- `customers/vm-troubleshooting-dashboard/`: dashboard (Go API + UI). Map: `CODEMAP.md`. +- `docs/`: planning notes and indexes (`docs/README.md`, `docs/architecture.md` point to project-local maps). + +## Doc maintenance (cross-cutting) +When architecture or boundaries change materially, update the relevant **project** orientation docs in the same change: +- Collector: `customers/vm-troubleshooting/CODEMAP.md`; `ARCHITECTURE.md` when pipeline/types/modes narratives change. +- Dashboard: `customers/vm-troubleshooting-dashboard/CODEMAP.md` when package layout or ingest/API flow changes. 
+ +For **Go-based diagnostics** behavior (modular collectors, timeouts, static builds, graceful skips), follow the collector project's `AGENTS.md`; do not duplicate those rules here. -## Diagnostics collector guidance -For the Go-based diagnostics collector: -- Preserve user-visible behavior and output structure unless there is a clear improvement. -- Keep collectors modular. -- Keep command execution timeout-bound. -- Check command availability before execution. -- Make unsupported probes report "not available" rather than fail the whole run. -- Prefer static builds for portability unless there is a strong reason not to. -- Keep `customers/vm-troubleshooting/CODEMAP.md` current when changing architecture or collector ownership. +## Optional: CI at scale +If you add continuous integration, **path-scoped** jobs (build/test only projects whose paths changed) are a practical way to keep feedback fast as `customers/` grows. This is optional operational practice, not a requirement of this repository. ## Done means A change is not done until: diff --git a/CLAUDE.md b/CLAUDE.md index 2e18543..0886dd3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,9 +1,7 @@ # CLAUDE.md -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. +See @AGENTS.md for repository-wide policy, safety, monorepo discovery, and cross-project contracts. -See @AGENTS.md for all project instructions, constraints, and verification commands. - -For the diagnostics collector, also see `customers/vm-troubleshooting/AGENTS.md` and `customers/vm-troubleshooting/CODEMAP.md`. +This repository is a **monorepo**. Prefer the **nearest** `CLAUDE.md` or `AGENTS.md` under the tree you are editing (for example `customers/vm-troubleshooting/` or `customers/vm-troubleshooting-dashboard/`). Keep changes scoped to the active project unless the task explicitly spans multiple projects or shared contracts. ShellCheck is pre-allowed in `.claude/settings.local.json`. 
diff --git a/README.md b/README.md index c1e3638..18d3c0a 100644 --- a/README.md +++ b/README.md @@ -1 +1,33 @@ -This repository consists of support scripts that Hyperstack Customer Experience team would send to Hyperstack users, with instructions, to fix or workaround issues, or to gather information related to a ticket. +# support-scripts + +This repository holds **customer-run support tooling**: scripts and small programs that operators use on Linux VMs to gather diagnostics or apply narrow, documented workarounds. Tools are meant to work on partially broken hosts, with minimal assumptions about network, GPU, containers, or privilege. + +## What lives here + +| Area | Purpose | +|------|---------| +| [`customers/`](customers/) | Packaged tools distributed with support (Go binaries, dashboards, assets). | +| [`docs/`](docs/) | Planning notes and indexes; authoritative behavior is always the code plus per-project `AGENTS.md`. | +| Root `.sh` scripts | Focused one-off utilities (when present). | + +## Main projects + +- **[`customers/vm-troubleshooting/`](customers/vm-troubleshooting/)** — `gather-info`: static Go binary that collects VM diagnostics into a single `.tar.gz` with manifest, report stream, and summaries. + - Quick map: [`customers/vm-troubleshooting/CODEMAP.md`](customers/vm-troubleshooting/CODEMAP.md) + - Extended reference: [`customers/vm-troubleshooting/ARCHITECTURE.md`](customers/vm-troubleshooting/ARCHITECTURE.md) + +- **[`customers/vm-troubleshooting-dashboard/`](customers/vm-troubleshooting-dashboard/)** — Local web app (Go API + SQLite + React) to ingest and browse `gather-info` archives. + - Map: [`customers/vm-troubleshooting-dashboard/CODEMAP.md`](customers/vm-troubleshooting-dashboard/CODEMAP.md) + +## Contributing / agents + +- Repo-wide policy, monorepo discovery, and **root-only** verification (e.g. 
root shell scripts): [`AGENTS.md`](AGENTS.md) +- **Per-project** build/test commands and stack rules: that project's [`AGENTS.md`](customers/vm-troubleshooting/AGENTS.md) (collector) or [`AGENTS.md`](customers/vm-troubleshooting-dashboard/AGENTS.md) (dashboard). + +## Plans and hardening + +Workstreams and follow-up hardening are described under [`docs/plans/`](docs/plans/). Those documents are **specifications and history**: always confirm behavior in the current tree and tests (for example [`docs/plans/post-audit-hardening.md`](docs/plans/post-audit-hardening.md) outlines post-audit dashboard and collector hardening goals). + +## Quick verification + +Use the **`AGENTS.md` inside the project** you are changing (collector or dashboard) for exact commands. The repo root [`AGENTS.md`](AGENTS.md) only lists checks for **root-level** assets (e.g. specific shell scripts). diff --git a/customers/vm-troubleshooting-dashboard/.gitignore b/customers/vm-troubleshooting-dashboard/.gitignore new file mode 100644 index 0000000..c2a37ec --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/.gitignore @@ -0,0 +1,37 @@ +# Build output +bin/ +dashboard + +# Runtime / local data +dashboard-data/ +*.db +*.db-journal +*.db-wal +*.db-shm + +# Frontend +frontend/dist/ +frontend/node_modules/ + +# Go test & coverage +*.test +*.out +coverage.out +coverage.html +*.coverprofile + +# OS +.DS_Store +Thumbs.db + +# Editor +*.swp +*.swo +*~ +.idea/ +.vscode/ + +# Environment (templates may be committed from repo root rules) +.env +.env.* +!.env.example diff --git a/customers/vm-troubleshooting-dashboard/AGENTS.md b/customers/vm-troubleshooting-dashboard/AGENTS.md new file mode 100644 index 0000000..4841c0b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/AGENTS.md @@ -0,0 +1,102 @@ +# AGENTS.md + +## Scope +This file applies to everything under `customers/vm-troubleshooting-dashboard/`. 
+The dashboard is a Go + frontend web app that ingests `gather-info` archives +and renders them for support engineers. + +See `customers/vm-troubleshooting-dashboard/CODEMAP.md` for package layout, request flow, and verification commands. + +## Monorepo boundary +- This project **owns** ingest, persistence, HTTP API, and UI for archives produced by **`customers/vm-troubleshooting/`** (`gather-info`). +- Stay **forward-compatible** with supported schema majors and newer minors; authoritative rules live in `customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md`. + +## Simplicity (KISS) +- Keep ingest, model, store, and API layers straightforward; avoid extra indirection unless it removes real duplication. +- Favor tolerant parsing and generic UI fallbacks over hardcoded enums for codes/tags/hints. + +## Project goals +- Render any archive produced by a supported collector major version. +- Treat archives as the source of truth — never modify them on ingest. +- Be tolerant of unknown fields and unknown enum values; never block ingest on + cosmetic schema drift. + +## Architecture rules +- `cmd/` is the entrypoint (HTTP server). Keep it thin. +- `internal/ingest/` parses archives into the model. +- `internal/model/` defines the in-memory shape and the schema-version gate. +- `internal/store/` persists ingested archives to SQLite. +- `frontend/` is the UI (TypeScript). Issue codes / tags / parser hints are + opaque strings — do not hardcode enums. + +## Schema compatibility (READ THIS BEFORE TOUCHING SCHEMA HANDLING) +Authoritative compatibility rules live in +`customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md`. The dashboard's job is +to honor the consumer side of that contract: + +- **Accept any archive whose `schema_version` major matches + `SupportedSchemaMajor`** (`internal/model/types.go`). Do not pin to a specific + minor. Do not add minor checks that would reject newer-minor archives. 
+- **Treat unknown fields as ignorable.** The default Go `encoding/json` + behavior already does this. Do not add `DisallowUnknownFields()` to ingest + decoders. +- **Treat issue codes, tags, parser hints, and finding codes as opaque + strings.** Never assume a fixed enum in Go or TS. Renderers should fall back + to a generic display for codes they do not recognize, not error or hide them. +- **Schema files in `schemas/` are mirrors of the collector schemas.** They + must match the collector copies byte-for-byte (modulo `$id` if intentional). + When the collector adds a new enum value or field, mirror the change here in + the same PR. +- **Never validate archives against the JSON Schema at runtime.** The schemas + are documentation/contract. Runtime parsing uses Go structs and is + deliberately permissive. +- **When extending support to a new major version**, prefer extending the + version gate to accept multiple majors (range check) over flipping it. This + preserves the ability to view historical archives. + +## Forward compatibility checklist for any dashboard change +- [ ] Does this change reject archives that today's dashboard accepts? If yes, + stop and reconsider. +- [ ] Does this change hardcode an enum that the collector treats as + extensible? If yes, replace with a fallback-friendly lookup. +- [ ] If schema mirror files were edited, do they match the collector's + `customers/vm-troubleshooting/schemas/` byte-for-byte? +- [ ] Did you test ingest with both an old archive (e.g. `3.0.x` or `3.1.x`) + and a current-version archive? + +## UX and rendering +- Issue / finding codes are displayed by code string with optional friendly + label lookup. Unknown codes render as the raw code, not as an error or + blank. +- Severity / confidence are constrained enums in the schema, but renderers + should still tolerate unexpected values rather than crash. +- Facts are an open `map[string]any`. The UI should render any well-formed + fact, not just a known set. 
+ +## Safety and privacy +- The dashboard ingests data the collector already sanitized. Do not + re-collect from customer systems. +- Storage paths are server-controlled. Never pass user input directly into + filesystem paths without validation. +- Do not log archive contents at info level; archives may contain hostnames + and IPs that count as customer-identifiable. + +## Tests and verification +Before considering a change done: +- `gofmt -w .` +- `go vet ./...` +- `go test ./...` +- `go build ./cmd/dashboard` +- For frontend changes: `cd frontend && pnpm build`, then load a known-good archive in a browser. Type checks and unit tests do not catch all UI regressions. + +When changing ingest or model code, always include a test that ingests an +archive with a slightly newer minor schema version (e.g. one with an +unknown enum value or fact key) and confirms it succeeds. + +## Change management +- Preserve URL routes and API response shapes used by the frontend unless + there is a clear improvement. +- Database migrations must be additive (new columns nullable; never drop + columns in the same release that adds the replacement). +- If schema compat rules change, update both this file and + `customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md` in the same change. diff --git a/customers/vm-troubleshooting-dashboard/CLAUDE.md b/customers/vm-troubleshooting-dashboard/CLAUDE.md new file mode 100644 index 0000000..43c994c --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/customers/vm-troubleshooting-dashboard/CODEMAP.md b/customers/vm-troubleshooting-dashboard/CODEMAP.md new file mode 100644 index 0000000..5797529 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/CODEMAP.md @@ -0,0 +1,96 @@ +# CODEMAP — vm-troubleshooting-dashboard + +## Purpose + +`customers/vm-troubleshooting-dashboard/` is a **local support tool**: a small Go HTTP server with a SQLite store and a React (Vite) frontend. 
It **ingests** `gather-info` `.tar.gz` archives (read-only on disk under a managed storage directory), indexes metadata and issues, and serves a JSON API + static SPA. + +This file is the skimmable map. Schema compatibility rules and consumer contracts are in **[AGENTS.md](./AGENTS.md)** and **`../vm-troubleshooting/SCHEMA-COMPATIBILITY.md`**. + +## Read this first + +- Server entrypoint: [`cmd/dashboard/main.go`](cmd/dashboard/main.go) +- HTTP routes and handlers: [`internal/api/server.go`](internal/api/server.go) +- Archive extraction + manifest parsing: [`internal/ingest/ingest.go`](internal/ingest/ingest.go) +- Persistence: [`internal/store/store.go`](internal/store/store.go) +- Path containment helper: [`internal/pathutil/safejoin.go`](internal/pathutil/safejoin.go) +- Supported schema major + types: [`internal/model/types.go`](internal/model/types.go) +- Frontend app: [`frontend/`](frontend/) + +## Request flow (high level) + +1. **Startup** — `main` opens/creates SQLite, applies migrations, constructs `store.Store`, optional static `webRoot`, builds `api.Server` with options (listen/auth/limits), runs `http.Server`. +2. **Upload** — `POST` multipart archive → temp file → `ingest.Ingest` (bounded tar/gzip extraction, manifest load) → transactional `store.Save` → JSON response. +3. **Browse** — `GET` APIs load archive rows, issues, artifacts; many handlers resolve `archive_id` then call `store.Get` to hydrate `ArchiveDetail` (issues, collectors, manifest artifacts, optional evidence suggestions). +4. **Artifacts** — preview/download use `pathutil.SafeJoin` against the archive storage directory before `Stat`/`Open`/`ServeFile` (see AGENTS for path-trust rules). +5. **Static UI** — `GET` non-API paths serve files from `webRoot` with SPA fallback to `index.html` where configured. + +## Package ownership + +### `cmd/dashboard/` + +- CLI flags, SQLite path, listen address, auth-related flags, graceful shutdown. +- Keep thin; no business logic. 
+ +### `internal/api/` + +- Routing, middleware (security headers, optional API auth), JSON helpers. +- Handlers for archives, issues, artifacts, upload, delete. +- Pagination and query parsing for list endpoints. + +### `internal/ingest/` + +- `tar.gz` extraction with size/entry limits, path safety, unsupported tar types rejected. +- Builds in-memory/detail structs used by `store.Save`. +- Does **not** mutate customer archive bytes beyond writing extracted files into server-controlled dirs. + +### `internal/store/` + +- SQLite schema, migrations, CRUD for archives/issues/collectors. +- Denormalized fields where needed for list performance (see migrations in `store.go`). +- `evidence.go`: heuristic suggestions linking issues to artifacts (scoring); keep tolerant of unknown issue shapes. + +### `internal/model/` + +- Shared types for API/store boundary; schema major gate for ingest. + +### `internal/pathutil/` + +- `SafeJoin(root, rel)` for lexical containment; callers must still respect symlink/open semantics documented in AGENTS. + +### `schemas/` + +- JSON Schema mirrors of `customers/vm-troubleshooting/schemas/` — keep in sync per AGENTS checklist. + +### `frontend/` + +- Vite + React + TanStack Query; calls same-origin `/api/...`. +- Does not embed shared secrets; bearer auth in production typically requires a reverse proxy or custom build if used. + +## Frontend layout (conceptual) + +- `src/api/` — thin hooks and `fetch` helpers. +- `src/components/` — pages (archives, issues, artifacts, upload). +- `src/lib/` — shared utilities (e.g. finding sort/title helpers). + +## Verification + +From `customers/vm-troubleshooting-dashboard/`: + +```bash +gofmt -w . +go vet ./... +go test ./... +go build ./cmd/dashboard +``` + +Frontend: + +```bash +cd frontend && pnpm build +``` + +## Related docs + +- [`AGENTS.md`](./AGENTS.md) — dashboard rules, schema compat, tests. 
+- [`../vm-troubleshooting/CODEMAP.md`](../vm-troubleshooting/CODEMAP.md) — collector output the dashboard consumes. +- [`../../docs/plans/post-audit-hardening.md`](../../docs/plans/post-audit-hardening.md) — planned hardening goals (verify in code). diff --git a/customers/vm-troubleshooting-dashboard/Makefile b/customers/vm-troubleshooting-dashboard/Makefile new file mode 100644 index 0000000..986a6a3 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/Makefile @@ -0,0 +1,42 @@ +.PHONY: all build build-backend build-frontend dev dev-backend dev-frontend fmt vet test lint clean + +# Build the production binary with embedded frontend +all: build + +build: build-frontend build-backend + +build-backend: + go build -o bin/dashboard ./cmd/dashboard + +build-frontend: + cd frontend && pnpm install --frozen-lockfile && pnpm build + +# Development: run backend and frontend dev servers concurrently. +# Both streams are prefixed so you can tell which process logged what. +dev: + @trap 'kill 0' EXIT; \ + $(MAKE) dev-backend 2>&1 | sed -u 's/^/[go] /' & \ + $(MAKE) dev-frontend 2>&1 | sed -u 's/^/[vite] /' & \ + wait + +dev-backend: + go run ./cmd/dashboard -addr :8080 -data-dir ./dashboard-data -web-root "" + +dev-frontend: + cd frontend && pnpm dev + +fmt: + gofmt -w . + cd frontend && pnpm lint --fix || true + +vet: + go vet ./... + +test: + go test ./... 
+ +lint: vet + cd frontend && pnpm lint + +clean: + rm -rf bin/ dashboard-data/ frontend/dist/ diff --git a/customers/vm-troubleshooting-dashboard/frontend/.gitignore b/customers/vm-troubleshooting-dashboard/frontend/.gitignore new file mode 100644 index 0000000..c2f3522 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/.gitignore @@ -0,0 +1,28 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +# Dependencies & build +node_modules +dist +dist-ssr +*.local + +# TypeScript incremental +*.tsbuildinfo + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/customers/vm-troubleshooting-dashboard/frontend/components.json b/customers/vm-troubleshooting-dashboard/frontend/components.json new file mode 100644 index 0000000..15addee --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/components.json @@ -0,0 +1,25 @@ +{ + "$schema": "https://ui.shadcn.com/schema.json", + "style": "base-nova", + "rsc": false, + "tsx": true, + "tailwind": { + "config": "", + "css": "src/index.css", + "baseColor": "neutral", + "cssVariables": true, + "prefix": "" + }, + "iconLibrary": "lucide", + "rtl": false, + "aliases": { + "components": "@/components", + "utils": "@/lib/utils", + "ui": "@/components/ui", + "lib": "@/lib", + "hooks": "@/hooks" + }, + "menuColor": "default", + "menuAccent": "subtle", + "registries": {} +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/eslint.config.js b/customers/vm-troubleshooting-dashboard/frontend/eslint.config.js new file mode 100644 index 0000000..6cae6cb --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/eslint.config.js @@ -0,0 +1,30 @@ +import js from '@eslint/js' +import globals from 'globals' +import reactHooks from 'eslint-plugin-react-hooks' +import reactRefresh from 'eslint-plugin-react-refresh' +import tseslint from 'typescript-eslint' 
+import { defineConfig, globalIgnores } from 'eslint/config' + +export default defineConfig([ + globalIgnores(['dist']), + { + files: ['**/*.{ts,tsx}'], + extends: [ + js.configs.recommended, + tseslint.configs.recommended, + reactHooks.configs.flat.recommended, + reactRefresh.configs.vite, + ], + languageOptions: { + ecmaVersion: 2020, + globals: globals.browser, + }, + }, + // shadcn/ui generated files export variant helpers alongside components + { + files: ['src/components/ui/**/*.{ts,tsx}'], + rules: { + 'react-refresh/only-export-components': 'off', + }, + }, +]) diff --git a/customers/vm-troubleshooting-dashboard/frontend/index.html b/customers/vm-troubleshooting-dashboard/frontend/index.html new file mode 100644 index 0000000..51ff75e --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/index.html @@ -0,0 +1,16 @@ + + + + + + + Diagnostic Dashboard + + +
+ + + diff --git a/customers/vm-troubleshooting-dashboard/frontend/package.json b/customers/vm-troubleshooting-dashboard/frontend/package.json new file mode 100644 index 0000000..0b2bab4 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/package.json @@ -0,0 +1,44 @@ +{ + "name": "frontend", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc -b && vite build", + "lint": "eslint .", + "preview": "vite preview" + }, + "dependencies": { + "@base-ui/react": "^1.3.0", + "@fontsource-variable/geist": "^5.2.8", + "@fontsource-variable/outfit": "^5.2.8", + "@tailwindcss/vite": "^4.2.2", + "@tanstack/react-query": "^5.97.0", + "@tanstack/react-virtual": "^3.13.23", + "class-variance-authority": "^0.7.1", + "clsx": "^2.1.1", + "lucide-react": "^1.8.0", + "react": "^19.2.4", + "react-dom": "^19.2.4", + "react-router": "^7.14.0", + "shadcn": "^4.2.0", + "tailwind-merge": "^3.5.0", + "tailwindcss": "^4.2.2", + "tw-animate-css": "^1.4.0" + }, + "devDependencies": { + "@eslint/js": "^9.39.4", + "@types/node": "^24.12.2", + "@types/react": "^19.2.14", + "@types/react-dom": "^19.2.3", + "@vitejs/plugin-react": "^6.0.1", + "eslint": "^9.39.4", + "eslint-plugin-react-hooks": "^7.0.1", + "eslint-plugin-react-refresh": "^0.5.2", + "globals": "^17.4.0", + "typescript": "~6.0.2", + "typescript-eslint": "^8.58.0", + "vite": "^8.0.4" + } +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml b/customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml new file mode 100644 index 0000000..776761b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml @@ -0,0 +1,4485 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + '@base-ui/react': + specifier: ^1.3.0 + version: 1.3.0(@types/react@19.2.14)(react-dom@19.2.5(react@19.2.5))(react@19.2.5) + '@fontsource-variable/geist': + 
specifier: ^5.2.8 + version: 5.2.8 + '@fontsource-variable/outfit': + specifier: ^5.2.8 + version: 5.2.8 + '@tailwindcss/vite': + specifier: ^4.2.2 + version: 4.2.2(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1)) + '@tanstack/react-query': + specifier: ^5.97.0 + version: 5.97.0(react@19.2.5) + '@tanstack/react-virtual': + specifier: ^3.13.23 + version: 3.13.23(react-dom@19.2.5(react@19.2.5))(react@19.2.5) + class-variance-authority: + specifier: ^0.7.1 + version: 0.7.1 + clsx: + specifier: ^2.1.1 + version: 2.1.1 + lucide-react: + specifier: ^1.8.0 + version: 1.8.0(react@19.2.5) + react: + specifier: ^19.2.4 + version: 19.2.5 + react-dom: + specifier: ^19.2.4 + version: 19.2.5(react@19.2.5) + react-router: + specifier: ^7.14.0 + version: 7.14.0(react-dom@19.2.5(react@19.2.5))(react@19.2.5) + shadcn: + specifier: ^4.2.0 + version: 4.2.0(@types/node@24.12.2)(typescript@6.0.2) + tailwind-merge: + specifier: ^3.5.0 + version: 3.5.0 + tailwindcss: + specifier: ^4.2.2 + version: 4.2.2 + tw-animate-css: + specifier: ^1.4.0 + version: 1.4.0 + devDependencies: + '@eslint/js': + specifier: ^9.39.4 + version: 9.39.4 + '@types/node': + specifier: ^24.12.2 + version: 24.12.2 + '@types/react': + specifier: ^19.2.14 + version: 19.2.14 + '@types/react-dom': + specifier: ^19.2.3 + version: 19.2.3(@types/react@19.2.14) + '@vitejs/plugin-react': + specifier: ^6.0.1 + version: 6.0.1(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1)) + eslint: + specifier: ^9.39.4 + version: 9.39.4(jiti@2.6.1) + eslint-plugin-react-hooks: + specifier: ^7.0.1 + version: 7.0.1(eslint@9.39.4(jiti@2.6.1)) + eslint-plugin-react-refresh: + specifier: ^0.5.2 + version: 0.5.2(eslint@9.39.4(jiti@2.6.1)) + globals: + specifier: ^17.4.0 + version: 17.4.0 + typescript: + specifier: ~6.0.2 + version: 6.0.2 + typescript-eslint: + specifier: ^8.58.0 + version: 8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + vite: + specifier: ^8.0.4 + version: 8.0.8(@types/node@24.12.2)(jiti@2.6.1) + +packages: + + 
'@babel/code-frame@7.29.0': + resolution: {integrity: sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==} + engines: {node: '>=6.9.0'} + + '@babel/compat-data@7.29.0': + resolution: {integrity: sha512-T1NCJqT/j9+cn8fvkt7jtwbLBfLC/1y1c7NtCeXFRgzGTsafi68MRv8yzkYSapBnFA6L3U2VSc02ciDzoAJhJg==} + engines: {node: '>=6.9.0'} + + '@babel/core@7.29.0': + resolution: {integrity: sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==} + engines: {node: '>=6.9.0'} + + '@babel/generator@7.29.1': + resolution: {integrity: sha512-qsaF+9Qcm2Qv8SRIMMscAvG4O3lJ0F1GuMo5HR/Bp02LopNgnZBC/EkbevHFeGs4ls/oPz9v+Bsmzbkbe+0dUw==} + engines: {node: '>=6.9.0'} + + '@babel/helper-annotate-as-pure@7.27.3': + resolution: {integrity: sha512-fXSwMQqitTGeHLBC08Eq5yXz2m37E4pJX1qAU1+2cNedz/ifv/bVXft90VeSav5nFO61EcNgwr0aJxbyPaWBPg==} + engines: {node: '>=6.9.0'} + + '@babel/helper-compilation-targets@7.28.6': + resolution: {integrity: sha512-JYtls3hqi15fcx5GaSNL7SCTJ2MNmjrkHXg4FSpOA/grxK8KwyZ5bubHsCq8FXCkua6xhuaaBit+3b7+VZRfcA==} + engines: {node: '>=6.9.0'} + + '@babel/helper-create-class-features-plugin@7.28.6': + resolution: {integrity: sha512-dTOdvsjnG3xNT9Y0AUg1wAl38y+4Rl4sf9caSQZOXdNqVn+H+HbbJ4IyyHaIqNR6SW9oJpA/RuRjsjCw2IdIow==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0 + + '@babel/helper-globals@7.28.0': + resolution: {integrity: sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==} + engines: {node: '>=6.9.0'} + + '@babel/helper-member-expression-to-functions@7.28.5': + resolution: {integrity: sha512-cwM7SBRZcPCLgl8a7cY0soT1SptSzAlMH39vwiRpOQkJlh53r5hdHwLSCZpQdVLT39sZt+CRpNwYG4Y2v77atg==} + engines: {node: '>=6.9.0'} + + '@babel/helper-module-imports@7.28.6': + resolution: {integrity: sha512-l5XkZK7r7wa9LucGw9LwZyyCUscb4x37JWTPz7swwFE/0FMQAGpiWUZn8u9DzkSBWEcK25jmvubfpw2dnAMdbw==} + engines: {node: '>=6.9.0'} + + 
'@babel/helper-module-transforms@7.28.6': + resolution: {integrity: sha512-67oXFAYr2cDLDVGLXTEABjdBJZ6drElUSI7WKp70NrpyISso3plG9SAGEF6y7zbha/wOzUByWWTJvEDVNIUGcA==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0 + + '@babel/helper-optimise-call-expression@7.27.1': + resolution: {integrity: sha512-URMGH08NzYFhubNSGJrpUEphGKQwMQYBySzat5cAByY1/YgIRkULnIy3tAMeszlL/so2HbeilYloUmSpd7GdVw==} + engines: {node: '>=6.9.0'} + + '@babel/helper-plugin-utils@7.28.6': + resolution: {integrity: sha512-S9gzZ/bz83GRysI7gAD4wPT/AI3uCnY+9xn+Mx/KPs2JwHJIz1W8PZkg2cqyt3RNOBM8ejcXhV6y8Og7ly/Dug==} + engines: {node: '>=6.9.0'} + + '@babel/helper-replace-supers@7.28.6': + resolution: {integrity: sha512-mq8e+laIk94/yFec3DxSjCRD2Z0TAjhVbEJY3UQrlwVo15Lmt7C2wAUbK4bjnTs4APkwsYLTahXRraQXhb1WCg==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0 + + '@babel/helper-skip-transparent-expression-wrappers@7.27.1': + resolution: {integrity: sha512-Tub4ZKEXqbPjXgWLl2+3JpQAYBJ8+ikpQ2Ocj/q/r0LwE3UhENh7EUabyHjz2kCEsrRY83ew2DQdHluuiDQFzg==} + engines: {node: '>=6.9.0'} + + '@babel/helper-string-parser@7.27.1': + resolution: {integrity: sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==} + engines: {node: '>=6.9.0'} + + '@babel/helper-validator-identifier@7.28.5': + resolution: {integrity: sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==} + engines: {node: '>=6.9.0'} + + '@babel/helper-validator-option@7.27.1': + resolution: {integrity: sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==} + engines: {node: '>=6.9.0'} + + '@babel/helpers@7.29.2': + resolution: {integrity: sha512-HoGuUs4sCZNezVEKdVcwqmZN8GoHirLUcLaYVNBK2J0DadGtdcqgr3BCbvH8+XUo4NGjNl3VOtSjEKNzqfFgKw==} + engines: {node: '>=6.9.0'} + + '@babel/parser@7.29.2': + resolution: {integrity: 
sha512-4GgRzy/+fsBa72/RZVJmGKPmZu9Byn8o4MoLpmNe1m8ZfYnz5emHLQz3U4gLud6Zwl0RZIcgiLD7Uq7ySFuDLA==} + engines: {node: '>=6.0.0'} + hasBin: true + + '@babel/plugin-syntax-jsx@7.28.6': + resolution: {integrity: sha512-wgEmr06G6sIpqr8YDwA2dSRTE3bJ+V0IfpzfSY3Lfgd7YWOaAdlykvJi13ZKBt8cZHfgH1IXN+CL656W3uUa4w==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-syntax-typescript@7.28.6': + resolution: {integrity: sha512-+nDNmQye7nlnuuHDboPbGm00Vqg3oO8niRRL27/4LYHUsHYh0zJ1xWOz0uRwNFmM1Avzk8wZbc6rdiYhomzv/A==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-transform-modules-commonjs@7.28.6': + resolution: {integrity: sha512-jppVbf8IV9iWWwWTQIxJMAJCWBuuKx71475wHwYytrRGQ2CWiDvYlADQno3tcYpS/T2UUWFQp3nVtYfK/YBQrA==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/plugin-transform-typescript@7.28.6': + resolution: {integrity: sha512-0YWL2RFxOqEm9Efk5PvreamxPME8OyY0wM5wh5lHjF+VtVhdneCWGzZeSqzOfiobVqQaNCd2z0tQvnI9DaPWPw==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/preset-typescript@7.28.5': + resolution: {integrity: sha512-+bQy5WOI2V6LJZpPVxY+yp66XdZ2yifu0Mc1aP5CQKgjn4QM5IN2i5fAZ4xKop47pr8rpVhiAeu+nDQa12C8+g==} + engines: {node: '>=6.9.0'} + peerDependencies: + '@babel/core': ^7.0.0-0 + + '@babel/runtime@7.29.2': + resolution: {integrity: sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g==} + engines: {node: '>=6.9.0'} + + '@babel/template@7.28.6': + resolution: {integrity: sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==} + engines: {node: '>=6.9.0'} + + '@babel/traverse@7.29.0': + resolution: {integrity: sha512-4HPiQr0X7+waHfyXPZpWPfWL/J7dcN1mx9gL6WdQVMbPnF3+ZhSMs8tCxN7oHddJE9fhNE7+lxdnlyemKfJRuA==} + engines: {node: '>=6.9.0'} + + '@babel/types@7.29.0': + resolution: {integrity: 
sha512-LwdZHpScM4Qz8Xw2iKSzS+cfglZzJGvofQICy7W7v4caru4EaAmyUuO6BGrbyQ2mYV11W0U8j5mBhd14dd3B0A==} + engines: {node: '>=6.9.0'} + + '@base-ui/react@1.3.0': + resolution: {integrity: sha512-FwpKqZbPz14AITp1CVgf4AjhKPe1OeeVKSBMdgD10zbFlj3QSWelmtCMLi2+/PFZZcIm3l87G7rwtCZJwHyXWA==} + engines: {node: '>=14.0.0'} + peerDependencies: + '@types/react': ^17 || ^18 || ^19 + react: ^17 || ^18 || ^19 + react-dom: ^17 || ^18 || ^19 + peerDependenciesMeta: + '@types/react': + optional: true + + '@base-ui/utils@0.2.6': + resolution: {integrity: sha512-yQ+qeuqohwhsNpoYDqqXaLllYAkPCP4vYdDrVo8FQXaAPfHWm1pG/Vm+jmGTA5JFS0BAIjookyapuJFY8F9PIw==} + peerDependencies: + '@types/react': ^17 || ^18 || ^19 + react: ^17 || ^18 || ^19 + react-dom: ^17 || ^18 || ^19 + peerDependenciesMeta: + '@types/react': + optional: true + + '@dotenvx/dotenvx@1.61.0': + resolution: {integrity: sha512-utL3cpZoFzflyqUkjYbxYujI6STBTmO5LFn4bbin/NZnRWN6wQ7eErhr3/Vpa5h/jicPFC6kTa42r940mQftJQ==} + hasBin: true + + '@ecies/ciphers@0.2.6': + resolution: {integrity: sha512-patgsRPKGkhhoBjETV4XxD0En4ui5fbX0hzayqI3M8tvNMGUoUvmyYAIWwlxBc1KX5cturfqByYdj5bYGRpN9g==} + engines: {bun: '>=1', deno: '>=2.7.10', node: '>=16'} + peerDependencies: + '@noble/ciphers': ^1.0.0 + + '@emnapi/core@1.9.2': + resolution: {integrity: sha512-UC+ZhH3XtczQYfOlu3lNEkdW/p4dsJ1r/bP7H8+rhao3TTTMO1ATq/4DdIi23XuGoFY+Cz0JmCbdVl0hz9jZcA==} + + '@emnapi/runtime@1.9.2': + resolution: {integrity: sha512-3U4+MIWHImeyu1wnmVygh5WlgfYDtyf0k8AbLhMFxOipihf6nrWC4syIm/SwEeec0mNSafiiNnMJwbza/Is6Lw==} + + '@emnapi/wasi-threads@1.2.1': + resolution: {integrity: sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w==} + + '@eslint-community/eslint-utils@4.9.1': + resolution: {integrity: sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ==} + engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} + peerDependencies: + eslint: ^6.0.0 || ^7.0.0 || >=8.0.0 + + 
'@eslint-community/regexpp@4.12.2': + resolution: {integrity: sha512-EriSTlt5OC9/7SXkRSCAhfSxxoSUgBm33OH+IkwbdpgoqsSsUg7y3uh+IICI/Qg4BBWr3U2i39RpmycbxMq4ew==} + engines: {node: ^12.0.0 || ^14.0.0 || >=16.0.0} + + '@eslint/config-array@0.21.2': + resolution: {integrity: sha512-nJl2KGTlrf9GjLimgIru+V/mzgSK0ABCDQRvxw5BjURL7WfH5uoWmizbH7QB6MmnMBd8cIC9uceWnezL1VZWWw==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@eslint/config-helpers@0.4.2': + resolution: {integrity: sha512-gBrxN88gOIf3R7ja5K9slwNayVcZgK6SOUORm2uBzTeIEfeVaIhOpCtTox3P6R7o2jLFwLFTLnC7kU/RGcYEgw==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@eslint/core@0.17.0': + resolution: {integrity: sha512-yL/sLrpmtDaFEiUj1osRP4TI2MDz1AddJL+jZ7KSqvBuliN4xqYY54IfdN8qD8Toa6g1iloph1fxQNkjOxrrpQ==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@eslint/eslintrc@3.3.5': + resolution: {integrity: sha512-4IlJx0X0qftVsN5E+/vGujTRIFtwuLbNsVUe7TO6zYPDR1O6nFwvwhIKEKSrl6dZchmYBITazxKoUYOjdtjlRg==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@eslint/js@9.39.4': + resolution: {integrity: sha512-nE7DEIchvtiFTwBw4Lfbu59PG+kCofhjsKaCWzxTpt4lfRjRMqG6uMBzKXuEcyXhOHoUp9riAm7/aWYGhXZ9cw==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@eslint/object-schema@2.1.7': + resolution: {integrity: sha512-VtAOaymWVfZcmZbp6E2mympDIHvyjXs/12LqWYjVw6qjrfF+VK+fyG33kChz3nnK+SU5/NeHOqrTEHS8sXO3OA==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@eslint/plugin-kit@0.4.1': + resolution: {integrity: sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@floating-ui/core@1.7.5': + resolution: {integrity: sha512-1Ih4WTWyw0+lKyFMcBHGbb5U5FtuHJuujoyyr5zTaWS5EYMeT6Jb2AuDeftsCsEuchO+mM2ij5+q9crhydzLhQ==} + + '@floating-ui/dom@1.7.6': + resolution: {integrity: sha512-9gZSAI5XM36880PPMm//9dfiEngYoC6Am2izES1FF406YFsjvyBMmeJ2g4SAju3xWwtuynNRFL2s9hgxpLI5SQ==} + + '@floating-ui/react-dom@2.1.8': + 
resolution: {integrity: sha512-cC52bHwM/n/CxS87FH0yWdngEZrjdtLW/qVruo68qg+prK7ZQ4YGdut2GyDVpoGeAYe/h899rVeOVm6Oi40k2A==} + peerDependencies: + react: '>=16.8.0' + react-dom: '>=16.8.0' + + '@floating-ui/utils@0.2.11': + resolution: {integrity: sha512-RiB/yIh78pcIxl6lLMG0CgBXAZ2Y0eVHqMPYugu+9U0AeT6YBeiJpf7lbdJNIugFP5SIjwNRgo4DhR1Qxi26Gg==} + + '@fontsource-variable/geist@5.2.8': + resolution: {integrity: sha512-cJ6m9e+8MQ5dCYJsLylfZrgBh6KkG4bOLckB35Tr9J/EqdkEM6QllH5PxqP1dhTvFup+HtMRPuz9xOjxXJggxw==} + + '@fontsource-variable/outfit@5.2.8': + resolution: {integrity: sha512-4oUDCZx/Tcz6HZP423w/niqEH31Gks5IsqHV2ZZz1qKHaVIZdj2f0/S1IK2n8jl6Xo0o3N+3RjNHlV9R73ozQA==} + + '@hono/node-server@1.19.13': + resolution: {integrity: sha512-TsQLe4i2gvoTtrHje625ngThGBySOgSK3Xo2XRYOdqGN1teR8+I7vchQC46uLJi8OF62YTYA3AhSpumtkhsaKQ==} + engines: {node: '>=18.14.1'} + peerDependencies: + hono: ^4 + + '@humanfs/core@0.19.1': + resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==} + engines: {node: '>=18.18.0'} + + '@humanfs/node@0.16.7': + resolution: {integrity: sha512-/zUx+yOsIrG4Y43Eh2peDeKCxlRt/gET6aHfaKpuq267qXdYDFViVHfMaLyygZOnl0kGWxFIgsBy8QFuTLUXEQ==} + engines: {node: '>=18.18.0'} + + '@humanwhocodes/module-importer@1.0.1': + resolution: {integrity: sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==} + engines: {node: '>=12.22'} + + '@humanwhocodes/retry@0.4.3': + resolution: {integrity: sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ==} + engines: {node: '>=18.18'} + + '@inquirer/ansi@1.0.2': + resolution: {integrity: sha512-S8qNSZiYzFd0wAcyG5AXCvUHC5Sr7xpZ9wZ2py9XR88jUz8wooStVx5M6dRzczbBWjic9NP7+rY0Xi7qqK/aMQ==} + engines: {node: '>=18'} + + '@inquirer/confirm@5.1.21': + resolution: {integrity: sha512-KR8edRkIsUayMXV+o3Gv+q4jlhENF9nMYUZs9PA2HzrXeHI8M5uDag70U7RJn9yyiMZSbtF5/UexBtAVtZGSbQ==} + engines: {node: '>=18'} + 
peerDependencies: + '@types/node': '>=18' + peerDependenciesMeta: + '@types/node': + optional: true + + '@inquirer/core@10.3.2': + resolution: {integrity: sha512-43RTuEbfP8MbKzedNqBrlhhNKVwoK//vUFNW3Q3vZ88BLcrs4kYpGg+B2mm5p2K/HfygoCxuKwJJiv8PbGmE0A==} + engines: {node: '>=18'} + peerDependencies: + '@types/node': '>=18' + peerDependenciesMeta: + '@types/node': + optional: true + + '@inquirer/figures@1.0.15': + resolution: {integrity: sha512-t2IEY+unGHOzAaVM5Xx6DEWKeXlDDcNPeDyUpsRc6CUhBfU3VQOEl+Vssh7VNp1dR8MdUJBWhuObjXCsVpjN5g==} + engines: {node: '>=18'} + + '@inquirer/type@3.0.10': + resolution: {integrity: sha512-BvziSRxfz5Ov8ch0z/n3oijRSEcEsHnhggm4xFZe93DHcUCTlutlq9Ox4SVENAfcRD22UQq7T/atg9Wr3k09eA==} + engines: {node: '>=18'} + peerDependencies: + '@types/node': '>=18' + peerDependenciesMeta: + '@types/node': + optional: true + + '@jridgewell/gen-mapping@0.3.13': + resolution: {integrity: sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==} + + '@jridgewell/remapping@2.3.5': + resolution: {integrity: sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==} + + '@jridgewell/resolve-uri@3.1.2': + resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==} + engines: {node: '>=6.0.0'} + + '@jridgewell/sourcemap-codec@1.5.5': + resolution: {integrity: sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==} + + '@jridgewell/trace-mapping@0.3.31': + resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==} + + '@modelcontextprotocol/sdk@1.29.0': + resolution: {integrity: sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ==} + engines: {node: '>=18'} + peerDependencies: + '@cfworker/json-schema': ^4.1.1 + zod: ^3.25 || ^4.0 + peerDependenciesMeta: + '@cfworker/json-schema': + 
optional: true + + '@mswjs/interceptors@0.41.3': + resolution: {integrity: sha512-cXu86tF4VQVfwz8W1SPbhoRyHJkti6mjH/XJIxp40jhO4j2k1m4KYrEykxqWPkFF3vrK4rgQppBh//AwyGSXPA==} + engines: {node: '>=18'} + + '@napi-rs/wasm-runtime@1.1.3': + resolution: {integrity: sha512-xK9sGVbJWYb08+mTJt3/YV24WxvxpXcXtP6B172paPZ+Ts69Re9dAr7lKwJoeIx8OoeuimEiRZ7umkiUVClmmQ==} + peerDependencies: + '@emnapi/core': ^1.7.1 + '@emnapi/runtime': ^1.7.1 + + '@noble/ciphers@1.3.0': + resolution: {integrity: sha512-2I0gnIVPtfnMw9ee9h1dJG7tp81+8Ob3OJb3Mv37rx5L40/b0i7djjCVvGOVqc9AEIQyvyu1i6ypKdFw8R8gQw==} + engines: {node: ^14.21.3 || >=16} + + '@noble/curves@1.9.7': + resolution: {integrity: sha512-gbKGcRUYIjA3/zCCNaWDciTMFI0dCkvou3TL8Zmy5Nc7sJ47a0jtOeZoTaMxkuqRo9cRhjOdZJXegxYE5FN/xw==} + engines: {node: ^14.21.3 || >=16} + + '@noble/hashes@1.8.0': + resolution: {integrity: sha512-jCs9ldd7NwzpgXDIf6P3+NrHh9/sD6CQdxHyjQI+h/6rDNo88ypBxxz45UDuZHz9r3tNz7N/VInSVoVdtXEI4A==} + engines: {node: ^14.21.3 || >=16} + + '@nodelib/fs.scandir@2.1.5': + resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==} + engines: {node: '>= 8'} + + '@nodelib/fs.stat@2.0.5': + resolution: {integrity: sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==} + engines: {node: '>= 8'} + + '@nodelib/fs.walk@1.2.8': + resolution: {integrity: sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==} + engines: {node: '>= 8'} + + '@open-draft/deferred-promise@2.2.0': + resolution: {integrity: sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA==} + + '@open-draft/logger@0.3.0': + resolution: {integrity: sha512-X2g45fzhxH238HKO4xbSr7+wBS8Fvw6ixhTDuvLd5mqh6bJJCFAPwU9mPDxbcrRtfxv4u5IHCEH77BmxvXmmxQ==} + + '@open-draft/until@2.1.0': + resolution: {integrity: 
sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg==} + + '@oxc-project/types@0.124.0': + resolution: {integrity: sha512-VBFWMTBvHxS11Z5Lvlr3IWgrwhMTXV+Md+EQF0Xf60+wAdsGFTBx7X7K/hP4pi8N7dcm1RvcHwDxZ16Qx8keUg==} + + '@rolldown/binding-android-arm64@1.0.0-rc.15': + resolution: {integrity: sha512-YYe6aWruPZDtHNpwu7+qAHEMbQ/yRl6atqb/AhznLTnD3UY99Q1jE7ihLSahNWkF4EqRPVC4SiR4O0UkLK02tA==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [arm64] + os: [android] + + '@rolldown/binding-darwin-arm64@1.0.0-rc.15': + resolution: {integrity: sha512-oArR/ig8wNTPYsXL+Mzhs0oxhxfuHRfG7Ikw7jXsw8mYOtk71W0OkF2VEVh699pdmzjPQsTjlD1JIOoHkLP1Fg==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [arm64] + os: [darwin] + + '@rolldown/binding-darwin-x64@1.0.0-rc.15': + resolution: {integrity: sha512-YzeVqOqjPYvUbJSWJ4EDL8ahbmsIXQpgL3JVipmN+MX0XnXMeWomLN3Fb+nwCmP/jfyqte5I3XRSm7OfQrbyxw==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [x64] + os: [darwin] + + '@rolldown/binding-freebsd-x64@1.0.0-rc.15': + resolution: {integrity: sha512-9Erhx956jeQ0nNTyif1+QWAXDRD38ZNjr//bSHrt6wDwB+QkAfl2q6Mn1k6OBPerznjRmbM10lgRb1Pli4xZPw==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [x64] + os: [freebsd] + + '@rolldown/binding-linux-arm-gnueabihf@1.0.0-rc.15': + resolution: {integrity: sha512-cVwk0w8QbZJGTnP/AHQBs5yNwmpgGYStL88t4UIaqcvYJWBfS0s3oqVLZPwsPU6M0zlW4GqjP0Zq5MnAGwFeGA==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [arm] + os: [linux] + + '@rolldown/binding-linux-arm64-gnu@1.0.0-rc.15': + resolution: {integrity: sha512-eBZ/u8iAK9SoHGanqe/jrPnY0JvBN6iXbVOsbO38mbz+ZJsaobExAm1Iu+rxa4S1l2FjG0qEZn4Rc6X8n+9M+w==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [arm64] + os: [linux] + + '@rolldown/binding-linux-arm64-musl@1.0.0-rc.15': + resolution: {integrity: sha512-ZvRYMGrAklV9PEkgt4LQM6MjQX2P58HPAuecwYObY2DhS2t35R0I810bKi0wmaYORt6m/2Sm+Z+nFgb0WhXNcQ==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [arm64] + os: [linux] + + 
'@rolldown/binding-linux-ppc64-gnu@1.0.0-rc.15': + resolution: {integrity: sha512-VDpgGBzgfg5hLg+uBpCLoFG5kVvEyafmfxGUV0UHLcL5irxAK7PKNeC2MwClgk6ZAiNhmo9FLhRYgvMmedLtnQ==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [ppc64] + os: [linux] + + '@rolldown/binding-linux-s390x-gnu@1.0.0-rc.15': + resolution: {integrity: sha512-y1uXY3qQWCzcPgRJATPSOUP4tCemh4uBdY7e3EZbVwCJTY3gLJWnQABgeUetvED+bt1FQ01OeZwvhLS2bpNrAQ==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [s390x] + os: [linux] + + '@rolldown/binding-linux-x64-gnu@1.0.0-rc.15': + resolution: {integrity: sha512-023bTPBod7J3Y/4fzAN6QtpkSABR0rigtrwaP+qSEabUh5zf6ELr9Nc7GujaROuPY3uwdSIXWrvhn1KxOvurWA==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [x64] + os: [linux] + + '@rolldown/binding-linux-x64-musl@1.0.0-rc.15': + resolution: {integrity: sha512-witB2O0/hU4CgfOOKUoeFgQ4GktPi1eEbAhaLAIpgD6+ZnhcPkUtPsoKKHRzmOoWPZue46IThdSgdo4XneOLYw==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [x64] + os: [linux] + + '@rolldown/binding-openharmony-arm64@1.0.0-rc.15': + resolution: {integrity: sha512-UCL68NJ0Ud5zRipXZE9dF5PmirzJE4E4BCIOOssEnM7wLDsxjc6Qb0sGDxTNRTP53I6MZpygyCpY8Aa8sPfKPg==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [arm64] + os: [openharmony] + + '@rolldown/binding-wasm32-wasi@1.0.0-rc.15': + resolution: {integrity: sha512-ApLruZq/ig+nhaE7OJm4lDjayUnOHVUa77zGeqnqZ9pn0ovdVbbNPerVibLXDmWeUZXjIYIT8V3xkT58Rm9u5Q==} + engines: {node: '>=14.0.0'} + cpu: [wasm32] + + '@rolldown/binding-win32-arm64-msvc@1.0.0-rc.15': + resolution: {integrity: sha512-KmoUoU7HnN+Si5YWJigfTws1jz1bKBYDQKdbLspz0UaqjjFkddHsqorgiW1mxcAj88lYUE6NC/zJNwT+SloqtA==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [arm64] + os: [win32] + + '@rolldown/binding-win32-x64-msvc@1.0.0-rc.15': + resolution: {integrity: sha512-3P2A8L+x75qavWLe/Dll3EYBJLQmtkJN8rfh+U/eR3MqMgL/h98PhYI+JFfXuDPgPeCB7iZAKiqii5vqOvnA0g==} + engines: {node: ^20.19.0 || >=22.12.0} + cpu: [x64] + os: [win32] + + '@rolldown/pluginutils@1.0.0-rc.15': + resolution: 
{integrity: sha512-UromN0peaE53IaBRe9W7CjrZgXl90fqGpK+mIZbA3qSTeYqg3pqpROBdIPvOG3F5ereDHNwoHBI2e50n1BDr1g==} + + '@rolldown/pluginutils@1.0.0-rc.7': + resolution: {integrity: sha512-qujRfC8sFVInYSPPMLQByRh7zhwkGFS4+tyMQ83srV1qrxL4g8E2tyxVVyxd0+8QeBM1mIk9KbWxkegRr76XzA==} + + '@sec-ant/readable-stream@0.4.1': + resolution: {integrity: sha512-831qok9r2t8AlxLko40y2ebgSDhenenCatLVeW/uBtnHPyhHOvG0C7TvfgecV+wHzIm5KUICgzmVpWS+IMEAeg==} + + '@sindresorhus/merge-streams@4.0.0': + resolution: {integrity: sha512-tlqY9xq5ukxTUZBmoOp+m61cqwQD5pHJtFY3Mn8CA8ps6yghLH/Hw8UPdqg4OLmFW3IFlcXnQNmo/dh8HzXYIQ==} + engines: {node: '>=18'} + + '@tailwindcss/node@4.2.2': + resolution: {integrity: sha512-pXS+wJ2gZpVXqFaUEjojq7jzMpTGf8rU6ipJz5ovJV6PUGmlJ+jvIwGrzdHdQ80Sg+wmQxUFuoW1UAAwHNEdFA==} + + '@tailwindcss/oxide-android-arm64@4.2.2': + resolution: {integrity: sha512-dXGR1n+P3B6748jZO/SvHZq7qBOqqzQ+yFrXpoOWWALWndF9MoSKAT3Q0fYgAzYzGhxNYOoysRvYlpixRBBoDg==} + engines: {node: '>= 20'} + cpu: [arm64] + os: [android] + + '@tailwindcss/oxide-darwin-arm64@4.2.2': + resolution: {integrity: sha512-iq9Qjr6knfMpZHj55/37ouZeykwbDqF21gPFtfnhCCKGDcPI/21FKC9XdMO/XyBM7qKORx6UIhGgg6jLl7BZlg==} + engines: {node: '>= 20'} + cpu: [arm64] + os: [darwin] + + '@tailwindcss/oxide-darwin-x64@4.2.2': + resolution: {integrity: sha512-BlR+2c3nzc8f2G639LpL89YY4bdcIdUmiOOkv2GQv4/4M0vJlpXEa0JXNHhCHU7VWOKWT/CjqHdTP8aUuDJkuw==} + engines: {node: '>= 20'} + cpu: [x64] + os: [darwin] + + '@tailwindcss/oxide-freebsd-x64@4.2.2': + resolution: {integrity: sha512-YUqUgrGMSu2CDO82hzlQ5qSb5xmx3RUrke/QgnoEx7KvmRJHQuZHZmZTLSuuHwFf0DJPybFMXMYf+WJdxHy/nQ==} + engines: {node: '>= 20'} + cpu: [x64] + os: [freebsd] + + '@tailwindcss/oxide-linux-arm-gnueabihf@4.2.2': + resolution: {integrity: sha512-FPdhvsW6g06T9BWT0qTwiVZYE2WIFo2dY5aCSpjG/S/u1tby+wXoslXS0kl3/KXnULlLr1E3NPRRw0g7t2kgaQ==} + engines: {node: '>= 20'} + cpu: [arm] + os: [linux] + + '@tailwindcss/oxide-linux-arm64-gnu@4.2.2': + resolution: {integrity: 
sha512-4og1V+ftEPXGttOO7eCmW7VICmzzJWgMx+QXAJRAhjrSjumCwWqMfkDrNu1LXEQzNAwz28NCUpucgQPrR4S2yw==} + engines: {node: '>= 20'} + cpu: [arm64] + os: [linux] + + '@tailwindcss/oxide-linux-arm64-musl@4.2.2': + resolution: {integrity: sha512-oCfG/mS+/+XRlwNjnsNLVwnMWYH7tn/kYPsNPh+JSOMlnt93mYNCKHYzylRhI51X+TbR+ufNhhKKzm6QkqX8ag==} + engines: {node: '>= 20'} + cpu: [arm64] + os: [linux] + + '@tailwindcss/oxide-linux-x64-gnu@4.2.2': + resolution: {integrity: sha512-rTAGAkDgqbXHNp/xW0iugLVmX62wOp2PoE39BTCGKjv3Iocf6AFbRP/wZT/kuCxC9QBh9Pu8XPkv/zCZB2mcMg==} + engines: {node: '>= 20'} + cpu: [x64] + os: [linux] + + '@tailwindcss/oxide-linux-x64-musl@4.2.2': + resolution: {integrity: sha512-XW3t3qwbIwiSyRCggeO2zxe3KWaEbM0/kW9e8+0XpBgyKU4ATYzcVSMKteZJ1iukJ3HgHBjbg9P5YPRCVUxlnQ==} + engines: {node: '>= 20'} + cpu: [x64] + os: [linux] + + '@tailwindcss/oxide-wasm32-wasi@4.2.2': + resolution: {integrity: sha512-eKSztKsmEsn1O5lJ4ZAfyn41NfG7vzCg496YiGtMDV86jz1q/irhms5O0VrY6ZwTUkFy/EKG3RfWgxSI3VbZ8Q==} + engines: {node: '>=14.0.0'} + cpu: [wasm32] + bundledDependencies: + - '@napi-rs/wasm-runtime' + - '@emnapi/core' + - '@emnapi/runtime' + - '@tybys/wasm-util' + - '@emnapi/wasi-threads' + - tslib + + '@tailwindcss/oxide-win32-arm64-msvc@4.2.2': + resolution: {integrity: sha512-qPmaQM4iKu5mxpsrWZMOZRgZv1tOZpUm+zdhhQP0VhJfyGGO3aUKdbh3gDZc/dPLQwW4eSqWGrrcWNBZWUWaXQ==} + engines: {node: '>= 20'} + cpu: [arm64] + os: [win32] + + '@tailwindcss/oxide-win32-x64-msvc@4.2.2': + resolution: {integrity: sha512-1T/37VvI7WyH66b+vqHj/cLwnCxt7Qt3WFu5Q8hk65aOvlwAhs7rAp1VkulBJw/N4tMirXjVnylTR72uI0HGcA==} + engines: {node: '>= 20'} + cpu: [x64] + os: [win32] + + '@tailwindcss/oxide@4.2.2': + resolution: {integrity: sha512-qEUA07+E5kehxYp9BVMpq9E8vnJuBHfJEC0vPC5e7iL/hw7HR61aDKoVoKzrG+QKp56vhNZe4qwkRmMC0zDLvg==} + engines: {node: '>= 20'} + + '@tailwindcss/vite@4.2.2': + resolution: {integrity: sha512-mEiF5HO1QqCLXoNEfXVA1Tzo+cYsrqV7w9Juj2wdUFyW07JRenqMG225MvPwr3ZD9N1bFQj46X7r33iHxLUW0w==} + 
peerDependencies: + vite: ^5.2.0 || ^6 || ^7 || ^8 + + '@tanstack/query-core@5.97.0': + resolution: {integrity: sha512-QdpLP5VzVMgo4VtaPppRA2W04UFjIqX+bxke/ZJhE5cfd5UPkRzqIAJQt9uXkQJjqE8LBOMbKv7f8HCsZltXlg==} + + '@tanstack/react-query@5.97.0': + resolution: {integrity: sha512-y4So4eGcQoK2WVMAcDNZE9ofB/p5v1OlKvtc1F3uqHwrtifobT7q+ZnXk2mRkc8E84HKYSlAE9z6HXl2V0+ySQ==} + peerDependencies: + react: ^18 || ^19 + + '@tanstack/react-virtual@3.13.23': + resolution: {integrity: sha512-XnMRnHQ23piOVj2bzJqHrRrLg4r+F86fuBcwteKfbIjJrtGxb4z7tIvPVAe4B+4UVwo9G4Giuz5fmapcrnZ0OQ==} + peerDependencies: + react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + react-dom: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + + '@tanstack/virtual-core@3.13.23': + resolution: {integrity: sha512-zSz2Z2HNyLjCplANTDyl3BcdQJc2k1+yyFoKhNRmCr7V7dY8o8q5m8uFTI1/Pg1kL+Hgrz6u3Xo6eFUB7l66cg==} + + '@ts-morph/common@0.27.0': + resolution: {integrity: sha512-Wf29UqxWDpc+i61k3oIOzcUfQt79PIT9y/MWfAGlrkjg6lBC1hwDECLXPVJAhWjiGbfBCxZd65F/LIZF3+jeJQ==} + + '@tybys/wasm-util@0.10.1': + resolution: {integrity: sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==} + + '@types/estree@1.0.8': + resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==} + + '@types/json-schema@7.0.15': + resolution: {integrity: sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==} + + '@types/node@24.12.2': + resolution: {integrity: sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==} + + '@types/react-dom@19.2.3': + resolution: {integrity: sha512-jp2L/eY6fn+KgVVQAOqYItbF0VY/YApe5Mz2F0aykSO8gx31bYCZyvSeYxCHKvzHG5eZjc+zyaS5BrBWya2+kQ==} + peerDependencies: + '@types/react': ^19.2.0 + + '@types/react@19.2.14': + resolution: {integrity: sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w==} + + '@types/statuses@2.0.6': 
+ resolution: {integrity: sha512-xMAgYwceFhRA2zY+XbEA7mxYbA093wdiW8Vu6gZPGWy9cmOyU9XesH1tNcEWsKFd5Vzrqx5T3D38PWx1FIIXkA==} + + '@types/validate-npm-package-name@4.0.2': + resolution: {integrity: sha512-lrpDziQipxCEeK5kWxvljWYhUvOiB2A9izZd9B2AFarYAkqZshb4lPbRs7zKEic6eGtH8V/2qJW+dPp9OtF6bw==} + + '@typescript-eslint/eslint-plugin@8.58.1': + resolution: {integrity: sha512-eSkwoemjo76bdXl2MYqtxg51HNwUSkWfODUOQ3PaTLZGh9uIWWFZIjyjaJnex7wXDu+TRx+ATsnSxdN9YWfRTQ==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + '@typescript-eslint/parser': ^8.58.1 + eslint: ^8.57.0 || ^9.0.0 || ^10.0.0 + typescript: '>=4.8.4 <6.1.0' + + '@typescript-eslint/parser@8.58.1': + resolution: {integrity: sha512-gGkiNMPqerb2cJSVcruigx9eHBlLG14fSdPdqMoOcBfh+vvn4iCq2C8MzUB89PrxOXk0y3GZ1yIWb9aOzL93bw==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + eslint: ^8.57.0 || ^9.0.0 || ^10.0.0 + typescript: '>=4.8.4 <6.1.0' + + '@typescript-eslint/project-service@8.58.1': + resolution: {integrity: sha512-gfQ8fk6cxhtptek+/8ZIqw8YrRW5048Gug8Ts5IYcMLCw18iUgrZAEY/D7s4hkI0FxEfGakKuPK/XUMPzPxi5g==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + typescript: '>=4.8.4 <6.1.0' + + '@typescript-eslint/scope-manager@8.58.1': + resolution: {integrity: sha512-TPYUEqJK6avLcEjumWsIuTpuYODTTDAtoMdt8ZZa93uWMTX13Nb8L5leSje1NluammvU+oI3QRr5lLXPgihX3w==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@typescript-eslint/tsconfig-utils@8.58.1': + resolution: {integrity: sha512-JAr2hOIct2Q+qk3G+8YFfqkqi7sC86uNryT+2i5HzMa2MPjw4qNFvtjnw1IiA1rP7QhNKVe21mSSLaSjwA1Olw==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + typescript: '>=4.8.4 <6.1.0' + + '@typescript-eslint/type-utils@8.58.1': + resolution: {integrity: sha512-HUFxvTJVroT+0rXVJC7eD5zol6ID+Sn5npVPWoFuHGg9Ncq5Q4EYstqR+UOqaNRFXi5TYkpXXkLhoCHe3G0+7w==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + eslint: ^8.57.0 || ^9.0.0 || ^10.0.0 + 
typescript: '>=4.8.4 <6.1.0' + + '@typescript-eslint/types@8.58.1': + resolution: {integrity: sha512-io/dV5Aw5ezwzfPBBWLoT+5QfVtP8O7q4Kftjn5azJ88bYyp/ZMCsyW1lpKK46EXJcaYMZ1JtYj+s/7TdzmQMw==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@typescript-eslint/typescript-estree@8.58.1': + resolution: {integrity: sha512-w4w7WR7GHOjqqPnvAYbazq+Y5oS68b9CzasGtnd6jIeOIeKUzYzupGTB2T4LTPSv4d+WPeccbxuneTFHYgAAWg==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + typescript: '>=4.8.4 <6.1.0' + + '@typescript-eslint/utils@8.58.1': + resolution: {integrity: sha512-Ln8R0tmWC7pTtLOzgJzYTXSCjJ9rDNHAqTaVONF4FEi2qwce8mD9iSOxOpLFFvWp/wBFlew0mjM1L1ihYWfBdQ==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + eslint: ^8.57.0 || ^9.0.0 || ^10.0.0 + typescript: '>=4.8.4 <6.1.0' + + '@typescript-eslint/visitor-keys@8.58.1': + resolution: {integrity: sha512-y+vH7QE8ycjoa0bWciFg7OpFcipUuem1ujhrdLtq1gByKwfbC7bPeKsiny9e0urg93DqwGcHey+bGRKCnF1nZQ==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@vitejs/plugin-react@6.0.1': + resolution: {integrity: sha512-l9X/E3cDb+xY3SWzlG1MOGt2usfEHGMNIaegaUGFsLkb3RCn/k8/TOXBcab+OndDI4TBtktT8/9BwwW8Vi9KUQ==} + engines: {node: ^20.19.0 || >=22.12.0} + peerDependencies: + '@rolldown/plugin-babel': ^0.1.7 || ^0.2.0 + babel-plugin-react-compiler: ^1.0.0 + vite: ^8.0.0 + peerDependenciesMeta: + '@rolldown/plugin-babel': + optional: true + babel-plugin-react-compiler: + optional: true + + accepts@2.0.0: + resolution: {integrity: sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==} + engines: {node: '>= 0.6'} + + acorn-jsx@5.3.2: + resolution: {integrity: sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==} + peerDependencies: + acorn: ^6.0.0 || ^7.0.0 || ^8.0.0 + + acorn@8.16.0: + resolution: {integrity: sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==} + engines: 
{node: '>=0.4.0'} + hasBin: true + + agent-base@7.1.4: + resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} + engines: {node: '>= 14'} + + ajv-formats@3.0.1: + resolution: {integrity: sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==} + peerDependencies: + ajv: ^8.0.0 + peerDependenciesMeta: + ajv: + optional: true + + ajv@6.14.0: + resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==} + + ajv@8.18.0: + resolution: {integrity: sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==} + + ansi-regex@5.0.1: + resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==} + engines: {node: '>=8'} + + ansi-regex@6.2.2: + resolution: {integrity: sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg==} + engines: {node: '>=12'} + + ansi-styles@4.3.0: + resolution: {integrity: sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==} + engines: {node: '>=8'} + + argparse@2.0.1: + resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==} + + ast-types@0.16.1: + resolution: {integrity: sha512-6t10qk83GOG8p0vKmaCr8eiilZwO171AvbROMtvvNiwrTly62t+7XkA8RdIIVbpMhCASAsxgAzdRSwh6nw/5Dg==} + engines: {node: '>=4'} + + balanced-match@1.0.2: + resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} + + balanced-match@4.0.4: + resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} + engines: {node: 18 || 20 || >=22} + + baseline-browser-mapping@2.10.17: + resolution: {integrity: 
sha512-HdrkN8eVG2CXxeifv/VdJ4A4RSra1DTW8dc/hdxzhGHN8QePs6gKaWM9pHPcpCoxYZJuOZ8drHmbdpLHjCYjLA==} + engines: {node: '>=6.0.0'} + hasBin: true + + body-parser@2.2.2: + resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} + engines: {node: '>=18'} + + brace-expansion@1.1.13: + resolution: {integrity: sha512-9ZLprWS6EENmhEOpjCYW2c8VkmOvckIJZfkr7rBW6dObmfgJ/L1GpSYW5Hpo9lDz4D1+n0Ckz8rU7FwHDQiG/w==} + + brace-expansion@5.0.5: + resolution: {integrity: sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ==} + engines: {node: 18 || 20 || >=22} + + braces@3.0.3: + resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} + engines: {node: '>=8'} + + browserslist@4.28.2: + resolution: {integrity: sha512-48xSriZYYg+8qXna9kwqjIVzuQxi+KYWp2+5nCYnYKPTr0LvD89Jqk2Or5ogxz0NUMfIjhh2lIUX/LyX9B4oIg==} + engines: {node: ^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7} + hasBin: true + + bundle-name@4.1.0: + resolution: {integrity: sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==} + engines: {node: '>=18'} + + bytes@3.1.2: + resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==} + engines: {node: '>= 0.8'} + + call-bind-apply-helpers@1.0.2: + resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==} + engines: {node: '>= 0.4'} + + call-bound@1.0.4: + resolution: {integrity: sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==} + engines: {node: '>= 0.4'} + + callsites@3.1.0: + resolution: {integrity: sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==} + engines: {node: '>=6'} + + caniuse-lite@1.0.30001787: + resolution: {integrity: 
sha512-mNcrMN9KeI68u7muanUpEejSLghOKlVhRqS/Za2IeyGllJ9I9otGpR9g3nsw7n4W378TE/LyIteA0+/FOZm4Kg==} + + chalk@4.1.2: + resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} + engines: {node: '>=10'} + + chalk@5.6.2: + resolution: {integrity: sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==} + engines: {node: ^12.17.0 || ^14.13 || >=16.0.0} + + class-variance-authority@0.7.1: + resolution: {integrity: sha512-Ka+9Trutv7G8M6WT6SeiRWz792K5qEqIGEGzXKhAE6xOWAY6pPH8U+9IY3oCMv6kqTmLsv7Xh/2w2RigkePMsg==} + + cli-cursor@5.0.0: + resolution: {integrity: sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==} + engines: {node: '>=18'} + + cli-spinners@2.9.2: + resolution: {integrity: sha512-ywqV+5MmyL4E7ybXgKys4DugZbX0FC6LnwrhjuykIjnK9k8OQacQ7axGKnjDXWNhns0xot3bZI5h55H8yo9cJg==} + engines: {node: '>=6'} + + cli-width@4.1.0: + resolution: {integrity: sha512-ouuZd4/dm2Sw5Gmqy6bGyNNNe1qt9RpmxveLSO7KcgsTnU7RXfsw+/bukWGo1abgBiMAic068rclZsO4IWmmxQ==} + engines: {node: '>= 12'} + + cliui@8.0.1: + resolution: {integrity: sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==} + engines: {node: '>=12'} + + clsx@2.1.1: + resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==} + engines: {node: '>=6'} + + code-block-writer@13.0.3: + resolution: {integrity: sha512-Oofo0pq3IKnsFtuHqSF7TqBfr71aeyZDVJ0HpmqB7FBM2qEigL0iPONSCZSO9pE9dZTAxANe5XHG9Uy0YMv8cg==} + + color-convert@2.0.1: + resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==} + engines: {node: '>=7.0.0'} + + color-name@1.1.4: + resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} + + commander@11.1.0: + resolution: {integrity: 
sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ==} + engines: {node: '>=16'} + + commander@14.0.3: + resolution: {integrity: sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw==} + engines: {node: '>=20'} + + concat-map@0.0.1: + resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} + + content-disposition@1.1.0: + resolution: {integrity: sha512-5jRCH9Z/+DRP7rkvY83B+yGIGX96OYdJmzngqnw2SBSxqCFPd0w2km3s5iawpGX8krnwSGmF0FW5Nhr0Hfai3g==} + engines: {node: '>=18'} + + content-type@1.0.5: + resolution: {integrity: sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==} + engines: {node: '>= 0.6'} + + convert-source-map@2.0.0: + resolution: {integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==} + + cookie-signature@1.2.2: + resolution: {integrity: sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==} + engines: {node: '>=6.6.0'} + + cookie@0.7.2: + resolution: {integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==} + engines: {node: '>= 0.6'} + + cookie@1.1.1: + resolution: {integrity: sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==} + engines: {node: '>=18'} + + cors@2.8.6: + resolution: {integrity: sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==} + engines: {node: '>= 0.10'} + + cosmiconfig@9.0.1: + resolution: {integrity: sha512-hr4ihw+DBqcvrsEDioRO31Z17x71pUYoNe/4h6Z0wB72p7MU7/9gH8Q3s12NFhHPfYBBOV3qyfUxmr/Yn3shnQ==} + engines: {node: '>=14'} + peerDependencies: + typescript: '>=4.9.5' + peerDependenciesMeta: + typescript: + optional: true + + cross-spawn@7.0.6: + resolution: {integrity: 
sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} + engines: {node: '>= 8'} + + cssesc@3.0.0: + resolution: {integrity: sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==} + engines: {node: '>=4'} + hasBin: true + + csstype@3.2.3: + resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} + + data-uri-to-buffer@4.0.1: + resolution: {integrity: sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==} + engines: {node: '>= 12'} + + debug@4.4.3: + resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==} + engines: {node: '>=6.0'} + peerDependencies: + supports-color: '*' + peerDependenciesMeta: + supports-color: + optional: true + + dedent@1.7.2: + resolution: {integrity: sha512-WzMx3mW98SN+zn3hgemf4OzdmyNhhhKz5Ay0pUfQiMQ3e1g+xmTJWp/pKdwKVXhdSkAEGIIzqeuWrL3mV/AXbA==} + peerDependencies: + babel-plugin-macros: ^3.1.0 + peerDependenciesMeta: + babel-plugin-macros: + optional: true + + deep-is@0.1.4: + resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==} + + deepmerge@4.3.1: + resolution: {integrity: sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==} + engines: {node: '>=0.10.0'} + + default-browser-id@5.0.1: + resolution: {integrity: sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==} + engines: {node: '>=18'} + + default-browser@5.5.0: + resolution: {integrity: sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==} + engines: {node: '>=18'} + + define-lazy-prop@3.0.0: + resolution: {integrity: sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==} + engines: {node: '>=12'} + 
+ depd@2.0.0: + resolution: {integrity: sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==} + engines: {node: '>= 0.8'} + + detect-libc@2.1.2: + resolution: {integrity: sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==} + engines: {node: '>=8'} + + diff@8.0.4: + resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==} + engines: {node: '>=0.3.1'} + + dotenv@17.4.1: + resolution: {integrity: sha512-k8DaKGP6r1G30Lx8V4+pCsLzKr8vLmV2paqEj1Y55GdAgJuIqpRp5FfajGF8KtwMxCz9qJc6wUIJnm053d/WCw==} + engines: {node: '>=12'} + + dunder-proto@1.0.1: + resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==} + engines: {node: '>= 0.4'} + + eciesjs@0.4.18: + resolution: {integrity: sha512-wG99Zcfcys9fZux7Cft8BAX/YrOJLJSZ3jyYPfhZHqN2E+Ffx+QXBDsv3gubEgPtV6dTzJMSQUwk1H98/t/0wQ==} + engines: {bun: '>=1', deno: '>=2', node: '>=16'} + + ee-first@1.1.1: + resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==} + + electron-to-chromium@1.5.334: + resolution: {integrity: sha512-mgjZAz7Jyx1SRCwEpy9wefDS7GvNPazLthHg8eQMJ76wBdGQQDW33TCrUTvQ4wzpmOrv2zrFoD3oNufMdyMpog==} + + emoji-regex@10.6.0: + resolution: {integrity: sha512-toUI84YS5YmxW219erniWD0CIVOo46xGKColeNQRgOzDorgBi1v4D71/OFzgD9GO2UGKIv1C3Sp8DAn0+j5w7A==} + + emoji-regex@8.0.0: + resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==} + + encodeurl@2.0.0: + resolution: {integrity: sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==} + engines: {node: '>= 0.8'} + + enhanced-resolve@5.20.1: + resolution: {integrity: sha512-Qohcme7V1inbAfvjItgw0EaxVX5q2rdVEZHRBrEQdRZTssLDGsL8Lwrznl8oQ/6kuTJONLaDcGjkNP247XEhcA==} + engines: {node: '>=10.13.0'} + + 
env-paths@2.2.1: + resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} + engines: {node: '>=6'} + + error-ex@1.3.4: + resolution: {integrity: sha512-sqQamAnR14VgCr1A618A3sGrygcpK+HEbenA/HiEAkkUwcZIIB/tgWqHFxWgOyDh4nB4JCRimh79dR5Ywc9MDQ==} + + es-define-property@1.0.1: + resolution: {integrity: sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==} + engines: {node: '>= 0.4'} + + es-errors@1.3.0: + resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==} + engines: {node: '>= 0.4'} + + es-object-atoms@1.1.1: + resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==} + engines: {node: '>= 0.4'} + + escalade@3.2.0: + resolution: {integrity: sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==} + engines: {node: '>=6'} + + escape-html@1.0.3: + resolution: {integrity: sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==} + + escape-string-regexp@4.0.0: + resolution: {integrity: sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==} + engines: {node: '>=10'} + + eslint-plugin-react-hooks@7.0.1: + resolution: {integrity: sha512-O0d0m04evaNzEPoSW+59Mezf8Qt0InfgGIBJnpC0h3NH/WjUAR7BIKUfysC6todmtiZ/A0oUVS8Gce0WhBrHsA==} + engines: {node: '>=18'} + peerDependencies: + eslint: ^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0-0 || ^9.0.0 + + eslint-plugin-react-refresh@0.5.2: + resolution: {integrity: sha512-hmgTH57GfzoTFjVN0yBwTggnsVUF2tcqi7RJZHqi9lIezSs4eFyAMktA68YD4r5kNw1mxyY4dmkyoFDb3FIqrA==} + peerDependencies: + eslint: ^9 || ^10 + + eslint-scope@8.4.0: + resolution: {integrity: sha512-sNXOfKCn74rt8RICKMvJS7XKV/Xk9kA7DyJr8mJik3S7Cwgy3qlkkmyS2uQB3jiJg6VNdZd/pDBJu0nvG2NlTg==} + engines: {node: ^18.18.0 || 
^20.9.0 || >=21.1.0} + + eslint-visitor-keys@3.4.3: + resolution: {integrity: sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==} + engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} + + eslint-visitor-keys@4.2.1: + resolution: {integrity: sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + eslint-visitor-keys@5.0.1: + resolution: {integrity: sha512-tD40eHxA35h0PEIZNeIjkHoDR4YjjJp34biM0mDvplBe//mB+IHCqHDGV7pxF+7MklTvighcCPPZC7ynWyjdTA==} + engines: {node: ^20.19.0 || ^22.13.0 || >=24} + + eslint@9.39.4: + resolution: {integrity: sha512-XoMjdBOwe/esVgEvLmNsD3IRHkm7fbKIUGvrleloJXUZgDHig2IPWNniv+GwjyJXzuNqVjlr5+4yVUZjycJwfQ==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + hasBin: true + peerDependencies: + jiti: '*' + peerDependenciesMeta: + jiti: + optional: true + + espree@10.4.0: + resolution: {integrity: sha512-j6PAQ2uUr79PZhBjP5C5fhl8e39FmRnOjsD5lGnWrFU8i2G776tBK7+nP8KuQUTTyAZUwfQqXAgrVH5MbH9CYQ==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + esprima@4.0.1: + resolution: {integrity: sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==} + engines: {node: '>=4'} + hasBin: true + + esquery@1.7.0: + resolution: {integrity: sha512-Ap6G0WQwcU/LHsvLwON1fAQX9Zp0A2Y6Y/cJBl9r/JbW90Zyg4/zbG6zzKa2OTALELarYHmKu0GhpM5EO+7T0g==} + engines: {node: '>=0.10'} + + esrecurse@4.3.0: + resolution: {integrity: sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==} + engines: {node: '>=4.0'} + + estraverse@5.3.0: + resolution: {integrity: sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==} + engines: {node: '>=4.0'} + + esutils@2.0.3: + resolution: {integrity: sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==} + engines: {node: '>=0.10.0'} + + etag@1.8.1: 
+ resolution: {integrity: sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==} + engines: {node: '>= 0.6'} + + eventsource-parser@3.0.6: + resolution: {integrity: sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==} + engines: {node: '>=18.0.0'} + + eventsource@3.0.7: + resolution: {integrity: sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==} + engines: {node: '>=18.0.0'} + + execa@5.1.1: + resolution: {integrity: sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==} + engines: {node: '>=10'} + + execa@9.6.1: + resolution: {integrity: sha512-9Be3ZoN4LmYR90tUoVu2te2BsbzHfhJyfEiAVfz7N5/zv+jduIfLrV2xdQXOHbaD6KgpGdO9PRPM1Y4Q9QkPkA==} + engines: {node: ^18.19.0 || >=20.5.0} + + express-rate-limit@8.3.2: + resolution: {integrity: sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg==} + engines: {node: '>= 16'} + peerDependencies: + express: '>= 4.11' + + express@5.2.1: + resolution: {integrity: sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==} + engines: {node: '>= 18'} + + fast-deep-equal@3.1.3: + resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==} + + fast-glob@3.3.3: + resolution: {integrity: sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==} + engines: {node: '>=8.6.0'} + + fast-json-stable-stringify@2.1.0: + resolution: {integrity: sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==} + + fast-levenshtein@2.0.6: + resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==} + + fast-uri@3.1.0: + resolution: {integrity: 
sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==} + + fastq@1.20.1: + resolution: {integrity: sha512-GGToxJ/w1x32s/D2EKND7kTil4n8OVk/9mycTc4VDza13lOvpUZTGX3mFSCtV9ksdGBVzvsyAVLM6mHFThxXxw==} + + fdir@6.5.0: + resolution: {integrity: sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==} + engines: {node: '>=12.0.0'} + peerDependencies: + picomatch: ^3 || ^4 + peerDependenciesMeta: + picomatch: + optional: true + + fetch-blob@3.2.0: + resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==} + engines: {node: ^12.20 || >= 14.13} + + figures@6.1.0: + resolution: {integrity: sha512-d+l3qxjSesT4V7v2fh+QnmFnUWv9lSpjarhShNTgBOfA0ttejbQUAlHLitbjkoRiDulW0OPoQPYIGhIC8ohejg==} + engines: {node: '>=18'} + + file-entry-cache@8.0.0: + resolution: {integrity: sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==} + engines: {node: '>=16.0.0'} + + fill-range@7.1.1: + resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==} + engines: {node: '>=8'} + + finalhandler@2.1.1: + resolution: {integrity: sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==} + engines: {node: '>= 18.0.0'} + + find-up@5.0.0: + resolution: {integrity: sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==} + engines: {node: '>=10'} + + flat-cache@4.0.1: + resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==} + engines: {node: '>=16'} + + flatted@3.4.2: + resolution: {integrity: sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==} + + formdata-polyfill@4.0.10: + resolution: {integrity: 
sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==} + engines: {node: '>=12.20.0'} + + forwarded@0.2.0: + resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==} + engines: {node: '>= 0.6'} + + fresh@2.0.0: + resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==} + engines: {node: '>= 0.8'} + + fs-extra@11.3.4: + resolution: {integrity: sha512-CTXd6rk/M3/ULNQj8FBqBWHYBVYybQ3VPBw0xGKFe3tuH7ytT6ACnvzpIQ3UZtB8yvUKC2cXn1a+x+5EVQLovA==} + engines: {node: '>=14.14'} + + fsevents@2.3.3: + resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} + engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} + os: [darwin] + + function-bind@1.1.2: + resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} + + fuzzysort@3.1.0: + resolution: {integrity: sha512-sR9BNCjBg6LNgwvxlBd0sBABvQitkLzoVY9MYYROQVX/FvfJ4Mai9LsGhDgd8qYdds0bY77VzYd5iuB+v5rwQQ==} + + gensync@1.0.0-beta.2: + resolution: {integrity: sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==} + engines: {node: '>=6.9.0'} + + get-caller-file@2.0.5: + resolution: {integrity: sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==} + engines: {node: 6.* || 8.* || >= 10.*} + + get-east-asian-width@1.5.0: + resolution: {integrity: sha512-CQ+bEO+Tva/qlmw24dCejulK5pMzVnUOFOijVogd3KQs07HnRIgp8TGipvCCRT06xeYEbpbgwaCxglFyiuIcmA==} + engines: {node: '>=18'} + + get-intrinsic@1.3.0: + resolution: {integrity: sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==} + engines: {node: '>= 0.4'} + + get-own-enumerable-keys@1.0.0: + resolution: {integrity: 
sha512-PKsK2FSrQCyxcGHsGrLDcK0lx+0Ke+6e8KFFozA9/fIQLhQzPaRvJFdcz7+Axg3jUH/Mq+NI4xa5u/UT2tQskA==} + engines: {node: '>=14.16'} + + get-proto@1.0.1: + resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==} + engines: {node: '>= 0.4'} + + get-stream@6.0.1: + resolution: {integrity: sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==} + engines: {node: '>=10'} + + get-stream@9.0.1: + resolution: {integrity: sha512-kVCxPF3vQM/N0B1PmoqVUqgHP+EeVjmZSQn+1oCRPxd2P21P2F19lIgbR3HBosbB1PUhOAoctJnfEn2GbN2eZA==} + engines: {node: '>=18'} + + glob-parent@5.1.2: + resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==} + engines: {node: '>= 6'} + + glob-parent@6.0.2: + resolution: {integrity: sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==} + engines: {node: '>=10.13.0'} + + globals@14.0.0: + resolution: {integrity: sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==} + engines: {node: '>=18'} + + globals@17.4.0: + resolution: {integrity: sha512-hjrNztw/VajQwOLsMNT1cbJiH2muO3OROCHnbehc8eY5JyD2gqz4AcMHPqgaOR59DjgUjYAYLeH699g/eWi2jw==} + engines: {node: '>=18'} + + gopd@1.2.0: + resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==} + engines: {node: '>= 0.4'} + + graceful-fs@4.2.11: + resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} + + graphql@16.13.2: + resolution: {integrity: sha512-5bJ+nf/UCpAjHM8i06fl7eLyVC9iuNAjm9qzkiu2ZGhM0VscSvS6WDPfAwkdkBuoXGM9FJSbKl6wylMwP9Ktig==} + engines: {node: ^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0} + + has-flag@4.0.0: + resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==} + 
engines: {node: '>=8'} + + has-symbols@1.1.0: + resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==} + engines: {node: '>= 0.4'} + + hasown@2.0.2: + resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} + engines: {node: '>= 0.4'} + + headers-polyfill@4.0.3: + resolution: {integrity: sha512-IScLbePpkvO846sIwOtOTDjutRMWdXdJmXdMvk6gCBHxFO8d+QKOQedyZSxFTTFYRSmlgSTDtXqqq4pcenBXLQ==} + + hermes-estree@0.25.1: + resolution: {integrity: sha512-0wUoCcLp+5Ev5pDW2OriHC2MJCbwLwuRx+gAqMTOkGKJJiBCLjtrvy4PWUGn6MIVefecRpzoOZ/UV6iGdOr+Cw==} + + hermes-parser@0.25.1: + resolution: {integrity: sha512-6pEjquH3rqaI6cYAXYPcz9MS4rY6R4ngRgrgfDshRptUZIc3lw0MCIJIGDj9++mfySOuPTHB4nrSW99BCvOPIA==} + + hono@4.12.12: + resolution: {integrity: sha512-p1JfQMKaceuCbpJKAPKVqyqviZdS0eUxH9v82oWo1kb9xjQ5wA6iP3FNVAPDFlz5/p7d45lO+BpSk1tuSZMF4Q==} + engines: {node: '>=16.9.0'} + + http-errors@2.0.1: + resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==} + engines: {node: '>= 0.8'} + + https-proxy-agent@7.0.6: + resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==} + engines: {node: '>= 14'} + + human-signals@2.1.0: + resolution: {integrity: sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==} + engines: {node: '>=10.17.0'} + + human-signals@8.0.1: + resolution: {integrity: sha512-eKCa6bwnJhvxj14kZk5NCPc6Hb6BdsU9DZcOnmQKSnO1VKrfV0zCvtttPZUsBvjmNDn8rpcJfpwSYnHBjc95MQ==} + engines: {node: '>=18.18.0'} + + iconv-lite@0.7.2: + resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==} + engines: {node: '>=0.10.0'} + + ignore@5.3.2: + resolution: {integrity: 
sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==} + engines: {node: '>= 4'} + + ignore@7.0.5: + resolution: {integrity: sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==} + engines: {node: '>= 4'} + + import-fresh@3.3.1: + resolution: {integrity: sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==} + engines: {node: '>=6'} + + imurmurhash@0.1.4: + resolution: {integrity: sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==} + engines: {node: '>=0.8.19'} + + inherits@2.0.4: + resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} + + ip-address@10.1.0: + resolution: {integrity: sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==} + engines: {node: '>= 12'} + + ipaddr.js@1.9.1: + resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==} + engines: {node: '>= 0.10'} + + is-arrayish@0.2.1: + resolution: {integrity: sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==} + + is-docker@3.0.0: + resolution: {integrity: sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + hasBin: true + + is-extglob@2.1.1: + resolution: {integrity: sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==} + engines: {node: '>=0.10.0'} + + is-fullwidth-code-point@3.0.0: + resolution: {integrity: sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==} + engines: {node: '>=8'} + + is-glob@4.0.3: + resolution: {integrity: sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==} + engines: 
{node: '>=0.10.0'} + + is-in-ssh@1.0.0: + resolution: {integrity: sha512-jYa6Q9rH90kR1vKB6NM7qqd1mge3Fx4Dhw5TVlK1MUBqhEOuCagrEHMevNuCcbECmXZ0ThXkRm+Ymr51HwEPAw==} + engines: {node: '>=20'} + + is-inside-container@1.0.0: + resolution: {integrity: sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==} + engines: {node: '>=14.16'} + hasBin: true + + is-interactive@2.0.0: + resolution: {integrity: sha512-qP1vozQRI+BMOPcjFzrjXuQvdak2pHNUMZoeG2eRbiSqyvbEf/wQtEOTOX1guk6E3t36RkaqiSt8A/6YElNxLQ==} + engines: {node: '>=12'} + + is-node-process@1.2.0: + resolution: {integrity: sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==} + + is-number@7.0.0: + resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==} + engines: {node: '>=0.12.0'} + + is-obj@3.0.0: + resolution: {integrity: sha512-IlsXEHOjtKhpN8r/tRFj2nDyTmHvcfNeu/nrRIcXE17ROeatXchkojffa1SpdqW4cr/Fj6QkEf/Gn4zf6KKvEQ==} + engines: {node: '>=12'} + + is-plain-obj@4.1.0: + resolution: {integrity: sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==} + engines: {node: '>=12'} + + is-promise@4.0.0: + resolution: {integrity: sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==} + + is-regexp@3.1.0: + resolution: {integrity: sha512-rbku49cWloU5bSMI+zaRaXdQHXnthP6DZ/vLnfdSKyL4zUzuWnomtOEiZZOd+ioQ+avFo/qau3KPTc7Fjy1uPA==} + engines: {node: '>=12'} + + is-stream@2.0.1: + resolution: {integrity: sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==} + engines: {node: '>=8'} + + is-stream@4.0.1: + resolution: {integrity: sha512-Dnz92NInDqYckGEUJv689RbRiTSEHCQ7wOVeALbkOz999YpqT46yMRIGtSNl2iCL1waAZSx40+h59NV/EwzV/A==} + engines: {node: '>=18'} + + is-unicode-supported@1.3.0: + resolution: {integrity: 
sha512-43r2mRvz+8JRIKnWJ+3j8JtjRKZ6GmjzfaE/qiBJnikNnYv/6bagRJ1kUhNk8R5EX/GkobD+r+sfxCPJsiKBLQ==} + engines: {node: '>=12'} + + is-unicode-supported@2.1.0: + resolution: {integrity: sha512-mE00Gnza5EEB3Ds0HfMyllZzbBrmLOX3vfWoj9A9PEnTfratQ/BcaJOuMhnkhjXvb2+FkY3VuHqtAGpTPmglFQ==} + engines: {node: '>=18'} + + is-wsl@3.1.1: + resolution: {integrity: sha512-e6rvdUCiQCAuumZslxRJWR/Doq4VpPR82kqclvcS0efgt430SlGIk05vdCN58+VrzgtIcfNODjozVielycD4Sw==} + engines: {node: '>=16'} + + isexe@2.0.0: + resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} + + isexe@3.1.5: + resolution: {integrity: sha512-6B3tLtFqtQS4ekarvLVMZ+X+VlvQekbe4taUkf/rhVO3d/h0M2rfARm/pXLcPEsjjMsFgrFgSrhQIxcSVrBz8w==} + engines: {node: '>=18'} + + jiti@2.6.1: + resolution: {integrity: sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ==} + hasBin: true + + jose@6.2.2: + resolution: {integrity: sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ==} + + js-tokens@4.0.0: + resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} + + js-yaml@4.1.1: + resolution: {integrity: sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==} + hasBin: true + + jsesc@3.1.0: + resolution: {integrity: sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==} + engines: {node: '>=6'} + hasBin: true + + json-buffer@3.0.1: + resolution: {integrity: sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==} + + json-parse-even-better-errors@2.3.1: + resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} + + json-schema-traverse@0.4.1: + resolution: {integrity: 
sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==} + + json-schema-traverse@1.0.0: + resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + + json-schema-typed@8.0.2: + resolution: {integrity: sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA==} + + json-stable-stringify-without-jsonify@1.0.1: + resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==} + + json5@2.2.3: + resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} + engines: {node: '>=6'} + hasBin: true + + jsonfile@6.2.0: + resolution: {integrity: sha512-FGuPw30AdOIUTRMC2OMRtQV+jkVj2cfPqSeWXv1NEAJ1qZ5zb1X6z1mFhbfOB/iy3ssJCD+3KuZ8r8C3uVFlAg==} + + keyv@4.5.4: + resolution: {integrity: sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==} + + kleur@3.0.3: + resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==} + engines: {node: '>=6'} + + kleur@4.1.5: + resolution: {integrity: sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==} + engines: {node: '>=6'} + + levn@0.4.1: + resolution: {integrity: sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==} + engines: {node: '>= 0.8.0'} + + lightningcss-android-arm64@1.32.0: + resolution: {integrity: sha512-YK7/ClTt4kAK0vo6w3X+Pnm0D2cf2vPHbhOXdoNti1Ga0al1P4TBZhwjATvjNwLEBCnKvjJc2jQgHXH0NEwlAg==} + engines: {node: '>= 12.0.0'} + cpu: [arm64] + os: [android] + + lightningcss-darwin-arm64@1.32.0: + resolution: {integrity: sha512-RzeG9Ju5bag2Bv1/lwlVJvBE3q6TtXskdZLLCyfg5pt+HLz9BqlICO7LZM7VHNTTn/5PRhHFBSjk5lc4cmscPQ==} + engines: {node: '>= 12.0.0'} + cpu: [arm64] + os: [darwin] + + 
lightningcss-darwin-x64@1.32.0: + resolution: {integrity: sha512-U+QsBp2m/s2wqpUYT/6wnlagdZbtZdndSmut/NJqlCcMLTWp5muCrID+K5UJ6jqD2BFshejCYXniPDbNh73V8w==} + engines: {node: '>= 12.0.0'} + cpu: [x64] + os: [darwin] + + lightningcss-freebsd-x64@1.32.0: + resolution: {integrity: sha512-JCTigedEksZk3tHTTthnMdVfGf61Fky8Ji2E4YjUTEQX14xiy/lTzXnu1vwiZe3bYe0q+SpsSH/CTeDXK6WHig==} + engines: {node: '>= 12.0.0'} + cpu: [x64] + os: [freebsd] + + lightningcss-linux-arm-gnueabihf@1.32.0: + resolution: {integrity: sha512-x6rnnpRa2GL0zQOkt6rts3YDPzduLpWvwAF6EMhXFVZXD4tPrBkEFqzGowzCsIWsPjqSK+tyNEODUBXeeVHSkw==} + engines: {node: '>= 12.0.0'} + cpu: [arm] + os: [linux] + + lightningcss-linux-arm64-gnu@1.32.0: + resolution: {integrity: sha512-0nnMyoyOLRJXfbMOilaSRcLH3Jw5z9HDNGfT/gwCPgaDjnx0i8w7vBzFLFR1f6CMLKF8gVbebmkUN3fa/kQJpQ==} + engines: {node: '>= 12.0.0'} + cpu: [arm64] + os: [linux] + + lightningcss-linux-arm64-musl@1.32.0: + resolution: {integrity: sha512-UpQkoenr4UJEzgVIYpI80lDFvRmPVg6oqboNHfoH4CQIfNA+HOrZ7Mo7KZP02dC6LjghPQJeBsvXhJod/wnIBg==} + engines: {node: '>= 12.0.0'} + cpu: [arm64] + os: [linux] + + lightningcss-linux-x64-gnu@1.32.0: + resolution: {integrity: sha512-V7Qr52IhZmdKPVr+Vtw8o+WLsQJYCTd8loIfpDaMRWGUZfBOYEJeyJIkqGIDMZPwPx24pUMfwSxxI8phr/MbOA==} + engines: {node: '>= 12.0.0'} + cpu: [x64] + os: [linux] + + lightningcss-linux-x64-musl@1.32.0: + resolution: {integrity: sha512-bYcLp+Vb0awsiXg/80uCRezCYHNg1/l3mt0gzHnWV9XP1W5sKa5/TCdGWaR/zBM2PeF/HbsQv/j2URNOiVuxWg==} + engines: {node: '>= 12.0.0'} + cpu: [x64] + os: [linux] + + lightningcss-win32-arm64-msvc@1.32.0: + resolution: {integrity: sha512-8SbC8BR40pS6baCM8sbtYDSwEVQd4JlFTOlaD3gWGHfThTcABnNDBda6eTZeqbofalIJhFx0qKzgHJmcPTnGdw==} + engines: {node: '>= 12.0.0'} + cpu: [arm64] + os: [win32] + + lightningcss-win32-x64-msvc@1.32.0: + resolution: {integrity: sha512-Amq9B/SoZYdDi1kFrojnoqPLxYhQ4Wo5XiL8EVJrVsB8ARoC1PWW6VGtT0WKCemjy8aC+louJnjS7U18x3b06Q==} + engines: {node: '>= 12.0.0'} + cpu: [x64] + os: [win32] + + 
lightningcss@1.32.0: + resolution: {integrity: sha512-NXYBzinNrblfraPGyrbPoD19C1h9lfI/1mzgWYvXUTe414Gz/X1FD2XBZSZM7rRTrMA8JL3OtAaGifrIKhQ5yQ==} + engines: {node: '>= 12.0.0'} + + lines-and-columns@1.2.4: + resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} + + locate-path@6.0.0: + resolution: {integrity: sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==} + engines: {node: '>=10'} + + lodash.merge@4.6.2: + resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==} + + log-symbols@6.0.0: + resolution: {integrity: sha512-i24m8rpwhmPIS4zscNzK6MSEhk0DUWa/8iYQWxhffV8jkI4Phvs3F+quL5xvS0gdQR0FyTCMMH33Y78dDTzzIw==} + engines: {node: '>=18'} + + lru-cache@5.1.1: + resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==} + + lucide-react@1.8.0: + resolution: {integrity: sha512-WuvlsjngSk7TnTBJ1hsCy3ql9V9VOdcPkd3PKcSmM34vJD8KG6molxz7m7zbYFgICwsanQWmJ13JlYs4Zp7Arw==} + peerDependencies: + react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0 + + magic-string@0.30.21: + resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==} + + math-intrinsics@1.1.0: + resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} + engines: {node: '>= 0.4'} + + media-typer@1.1.0: + resolution: {integrity: sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==} + engines: {node: '>= 0.8'} + + merge-descriptors@2.0.0: + resolution: {integrity: sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==} + engines: {node: '>=18'} + + merge-stream@2.0.0: + resolution: {integrity: 
sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==} + + merge2@1.4.1: + resolution: {integrity: sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==} + engines: {node: '>= 8'} + + micromatch@4.0.8: + resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==} + engines: {node: '>=8.6'} + + mime-db@1.54.0: + resolution: {integrity: sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==} + engines: {node: '>= 0.6'} + + mime-types@3.0.2: + resolution: {integrity: sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==} + engines: {node: '>=18'} + + mimic-fn@2.1.0: + resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} + engines: {node: '>=6'} + + mimic-function@5.0.1: + resolution: {integrity: sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==} + engines: {node: '>=18'} + + minimatch@10.2.5: + resolution: {integrity: sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==} + engines: {node: 18 || 20 || >=22} + + minimatch@3.1.5: + resolution: {integrity: sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==} + + minimist@1.2.8: + resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} + + ms@2.1.3: + resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} + + msw@2.13.2: + resolution: {integrity: sha512-go2H1TIERKkC48pXiwec5l6sbNqYuvqOk3/vHGo1Zd+pq/H63oFawDQerH+WQdUw/flJFHDG7F+QdWMwhntA/A==} + engines: {node: '>=18'} + hasBin: true + peerDependencies: + typescript: '>= 4.8.x' + peerDependenciesMeta: + typescript: + 
optional: true + + mute-stream@2.0.0: + resolution: {integrity: sha512-WWdIxpyjEn+FhQJQQv9aQAYlHoNVdzIzUySNV1gHUPDSdZJ3yZn7pAAbQcV7B56Mvu881q9FZV+0Vx2xC44VWA==} + engines: {node: ^18.17.0 || >=20.5.0} + + nanoid@3.3.11: + resolution: {integrity: sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==} + engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} + hasBin: true + + natural-compare@1.4.0: + resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} + + negotiator@1.0.0: + resolution: {integrity: sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==} + engines: {node: '>= 0.6'} + + node-domexception@1.0.0: + resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} + engines: {node: '>=10.5.0'} + deprecated: Use your platform's native DOMException instead + + node-fetch@3.3.2: + resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + + node-releases@2.0.37: + resolution: {integrity: sha512-1h5gKZCF+pO/o3Iqt5Jp7wc9rH3eJJ0+nh/CIoiRwjRxde/hAHyLPXYN4V3CqKAbiZPSeJFSWHmJsbkicta0Eg==} + + npm-run-path@4.0.1: + resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} + engines: {node: '>=8'} + + npm-run-path@6.0.0: + resolution: {integrity: sha512-9qny7Z9DsQU8Ou39ERsPU4OZQlSTP47ShQzuKZ6PRXpYLtIFgl/DEBYEXKlvcEa+9tHVcK8CF81Y2V72qaZhWA==} + engines: {node: '>=18'} + + object-assign@4.1.1: + resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} + engines: {node: '>=0.10.0'} + + object-inspect@1.13.4: + resolution: {integrity: 
sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==} + engines: {node: '>= 0.4'} + + object-treeify@1.1.33: + resolution: {integrity: sha512-EFVjAYfzWqWsBMRHPMAXLCDIJnpMhdWAqR7xG6M6a2cs6PMFpl/+Z20w9zDW4vkxOFfddegBKq9Rehd0bxWE7A==} + engines: {node: '>= 10'} + + on-finished@2.4.1: + resolution: {integrity: sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==} + engines: {node: '>= 0.8'} + + once@1.4.0: + resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==} + + onetime@5.1.2: + resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==} + engines: {node: '>=6'} + + onetime@7.0.0: + resolution: {integrity: sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==} + engines: {node: '>=18'} + + open@11.0.0: + resolution: {integrity: sha512-smsWv2LzFjP03xmvFoJ331ss6h+jixfA4UUV/Bsiyuu4YJPfN+FIQGOIiv4w9/+MoHkfkJ22UIaQWRVFRfH6Vw==} + engines: {node: '>=20'} + + optionator@0.9.4: + resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==} + engines: {node: '>= 0.8.0'} + + ora@8.2.0: + resolution: {integrity: sha512-weP+BZ8MVNnlCm8c0Qdc1WSWq4Qn7I+9CJGm7Qali6g44e/PUzbjNqJX5NJ9ljlNMosfJvg1fKEGILklK9cwnw==} + engines: {node: '>=18'} + + outvariant@1.4.3: + resolution: {integrity: sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA==} + + p-limit@3.1.0: + resolution: {integrity: sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==} + engines: {node: '>=10'} + + p-locate@5.0.0: + resolution: {integrity: sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==} + engines: {node: '>=10'} + + parent-module@1.0.1: + resolution: {integrity: 
sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} + engines: {node: '>=6'} + + parse-json@5.2.0: + resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==} + engines: {node: '>=8'} + + parse-ms@4.0.0: + resolution: {integrity: sha512-TXfryirbmq34y8QBwgqCVLi+8oA3oWx2eAnSn62ITyEhEYaWRlVZ2DvMM9eZbMs/RfxPu/PK/aBLyGj4IrqMHw==} + engines: {node: '>=18'} + + parseurl@1.3.3: + resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==} + engines: {node: '>= 0.8'} + + path-browserify@1.0.1: + resolution: {integrity: sha512-b7uo2UCUOYZcnF/3ID0lulOJi/bafxa1xPe7ZPsammBSpjSWQkjNxlt635YGS2MiR9GjvuXCtz2emr3jbsz98g==} + + path-exists@4.0.0: + resolution: {integrity: sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==} + engines: {node: '>=8'} + + path-key@3.1.1: + resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} + engines: {node: '>=8'} + + path-key@4.0.0: + resolution: {integrity: sha512-haREypq7xkM7ErfgIyA0z+Bj4AGKlMSdlQE2jvJo6huWD1EdkKYV+G/T4nq0YEF2vgTT8kqMFKo1uHn950r4SQ==} + engines: {node: '>=12'} + + path-to-regexp@6.3.0: + resolution: {integrity: sha512-Yhpw4T9C6hPpgPeA28us07OJeqZ5EzQTkbfwuhsUg0c237RomFoETJgmp2sa3F/41gfLE6G5cqcYwznmeEeOlQ==} + + path-to-regexp@8.4.2: + resolution: {integrity: sha512-qRcuIdP69NPm4qbACK+aDogI5CBDMi1jKe0ry5rSQJz8JVLsC7jV8XpiJjGRLLol3N+R5ihGYcrPLTno6pAdBA==} + + picocolors@1.1.1: + resolution: {integrity: sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==} + + picomatch@2.3.2: + resolution: {integrity: sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==} + engines: {node: '>=8.6'} + + picomatch@4.0.4: + resolution: {integrity: 
sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==} + engines: {node: '>=12'} + + pkce-challenge@5.0.1: + resolution: {integrity: sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ==} + engines: {node: '>=16.20.0'} + + postcss-selector-parser@7.1.1: + resolution: {integrity: sha512-orRsuYpJVw8LdAwqqLykBj9ecS5/cRHlI5+nvTo8LcCKmzDmqVORXtOIYEEQuL9D4BxtA1lm5isAqzQZCoQ6Eg==} + engines: {node: '>=4'} + + postcss@8.5.9: + resolution: {integrity: sha512-7a70Nsot+EMX9fFU3064K/kdHWZqGVY+BADLyXc8Dfv+mTLLVl6JzJpPaCZ2kQL9gIJvKXSLMHhqdRRjwQeFtw==} + engines: {node: ^10 || ^12 || >=14} + + powershell-utils@0.1.0: + resolution: {integrity: sha512-dM0jVuXJPsDN6DvRpea484tCUaMiXWjuCn++HGTqUWzGDjv5tZkEZldAJ/UMlqRYGFrD/etByo4/xOuC/snX2A==} + engines: {node: '>=20'} + + prelude-ls@1.2.1: + resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==} + engines: {node: '>= 0.8.0'} + + pretty-ms@9.3.0: + resolution: {integrity: sha512-gjVS5hOP+M3wMm5nmNOucbIrqudzs9v/57bWRHQWLYklXqoXKrVfYW2W9+glfGsqtPgpiz5WwyEEB+ksXIx3gQ==} + engines: {node: '>=18'} + + prompts@2.4.2: + resolution: {integrity: sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==} + engines: {node: '>= 6'} + + proxy-addr@2.0.7: + resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} + engines: {node: '>= 0.10'} + + punycode@2.3.1: + resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} + engines: {node: '>=6'} + + qs@6.15.1: + resolution: {integrity: sha512-6YHEFRL9mfgcAvql/XhwTvf5jKcOiiupt2FiJxHkiX1z4j7WL8J/jRHYLluORvc1XxB5rV20KoeK00gVJamspg==} + engines: {node: '>=0.6'} + + queue-microtask@1.2.3: + resolution: {integrity: 
sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==} + + range-parser@1.2.1: + resolution: {integrity: sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==} + engines: {node: '>= 0.6'} + + raw-body@3.0.2: + resolution: {integrity: sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==} + engines: {node: '>= 0.10'} + + react-dom@19.2.5: + resolution: {integrity: sha512-J5bAZz+DXMMwW/wV3xzKke59Af6CHY7G4uYLN1OvBcKEsWOs4pQExj86BBKamxl/Ik5bx9whOrvBlSDfWzgSag==} + peerDependencies: + react: ^19.2.5 + + react-router@7.14.0: + resolution: {integrity: sha512-m/xR9N4LQLmAS0ZhkY2nkPA1N7gQ5TUVa5n8TgANuDTARbn1gt+zLPXEm7W0XDTbrQ2AJSJKhoa6yx1D8BcpxQ==} + engines: {node: '>=20.0.0'} + peerDependencies: + react: '>=18' + react-dom: '>=18' + peerDependenciesMeta: + react-dom: + optional: true + + react@19.2.5: + resolution: {integrity: sha512-llUJLzz1zTUBrskt2pwZgLq59AemifIftw4aB7JxOqf1HY2FDaGDxgwpAPVzHU1kdWabH7FauP4i1oEeer2WCA==} + engines: {node: '>=0.10.0'} + + recast@0.23.11: + resolution: {integrity: sha512-YTUo+Flmw4ZXiWfQKGcwwc11KnoRAYgzAE2E7mXKCjSviTKShtxBsN6YUUBB2gtaBzKzeKunxhUwNHQuRryhWA==} + engines: {node: '>= 4'} + + require-directory@2.1.1: + resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} + engines: {node: '>=0.10.0'} + + require-from-string@2.0.2: + resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} + engines: {node: '>=0.10.0'} + + reselect@5.1.1: + resolution: {integrity: sha512-K/BG6eIky/SBpzfHZv/dd+9JBFiS4SWV7FIujVyJRux6e45+73RaUHXLmIR1f7WOMaQ0U1km6qwklRQxpJJY0w==} + + resolve-from@4.0.0: + resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==} + engines: {node: '>=4'} + + restore-cursor@5.1.0: + resolution: {integrity: 
sha512-oMA2dcrw6u0YfxJQXm342bFKX/E4sG9rbTzO9ptUcR/e8A33cHuvStiYOwH7fszkZlZ1z/ta9AAoPk2F4qIOHA==} + engines: {node: '>=18'} + + rettime@0.10.1: + resolution: {integrity: sha512-uyDrIlUEH37cinabq0AX4QbgV4HbFZ/gqoiunWQ1UqBtRvTTytwhNYjE++pO/MjPTZL5KQCf2bEoJ/BJNVQ5Kw==} + + reusify@1.1.0: + resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==} + engines: {iojs: '>=1.0.0', node: '>=0.10.0'} + + rolldown@1.0.0-rc.15: + resolution: {integrity: sha512-Ff31guA5zT6WjnGp0SXw76X6hzGRk/OQq2hE+1lcDe+lJdHSgnSX6nK3erbONHyCbpSj9a9E+uX/OvytZoWp2g==} + engines: {node: ^20.19.0 || >=22.12.0} + hasBin: true + + router@2.2.0: + resolution: {integrity: sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==} + engines: {node: '>= 18'} + + run-applescript@7.1.0: + resolution: {integrity: sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==} + engines: {node: '>=18'} + + run-parallel@1.2.0: + resolution: {integrity: sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==} + + safer-buffer@2.1.2: + resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==} + + scheduler@0.27.0: + resolution: {integrity: sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==} + + semver@6.3.1: + resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} + hasBin: true + + semver@7.7.4: + resolution: {integrity: sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==} + engines: {node: '>=10'} + hasBin: true + + send@1.2.1: + resolution: {integrity: sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==} + engines: {node: '>= 18'} + + serve-static@2.2.1: + resolution: {integrity: 
sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==} + engines: {node: '>= 18'} + + set-cookie-parser@2.7.2: + resolution: {integrity: sha512-oeM1lpU/UvhTxw+g3cIfxXHyJRc/uidd3yK1P242gzHds0udQBYzs3y8j4gCCW+ZJ7ad0yctld8RYO+bdurlvw==} + + setprototypeof@1.2.0: + resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==} + + shadcn@4.2.0: + resolution: {integrity: sha512-ZDuV340itidaUd4Gi1BxQX+Y7Ush6BHp6URZBM2RyxUUBZ6yFtOWIr4nVY+Ro+YRSpo82v7JrsmtcU5xoBCMJQ==} + hasBin: true + + shebang-command@2.0.0: + resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} + engines: {node: '>=8'} + + shebang-regex@3.0.0: + resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==} + engines: {node: '>=8'} + + side-channel-list@1.0.1: + resolution: {integrity: sha512-mjn/0bi/oUURjc5Xl7IaWi/OJJJumuoJFQJfDDyO46+hBWsfaVM65TBHq2eoZBhzl9EchxOijpkbRC8SVBQU0w==} + engines: {node: '>= 0.4'} + + side-channel-map@1.0.1: + resolution: {integrity: sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==} + engines: {node: '>= 0.4'} + + side-channel-weakmap@1.0.2: + resolution: {integrity: sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==} + engines: {node: '>= 0.4'} + + side-channel@1.1.0: + resolution: {integrity: sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==} + engines: {node: '>= 0.4'} + + signal-exit@3.0.7: + resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==} + + signal-exit@4.1.0: + resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} + engines: {node: '>=14'} + + sisteransi@1.0.5: + resolution: 
{integrity: sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==} + + source-map-js@1.2.1: + resolution: {integrity: sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==} + engines: {node: '>=0.10.0'} + + source-map@0.6.1: + resolution: {integrity: sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==} + engines: {node: '>=0.10.0'} + + statuses@2.0.2: + resolution: {integrity: sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==} + engines: {node: '>= 0.8'} + + stdin-discarder@0.2.2: + resolution: {integrity: sha512-UhDfHmA92YAlNnCfhmq0VeNL5bDbiZGg7sZ2IvPsXubGkiNa9EC+tUTsjBRsYUAz87btI6/1wf4XoVvQ3uRnmQ==} + engines: {node: '>=18'} + + strict-event-emitter@0.5.1: + resolution: {integrity: sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ==} + + string-width@4.2.3: + resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} + engines: {node: '>=8'} + + string-width@7.2.0: + resolution: {integrity: sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==} + engines: {node: '>=18'} + + stringify-object@5.0.0: + resolution: {integrity: sha512-zaJYxz2FtcMb4f+g60KsRNFOpVMUyuJgA51Zi5Z1DOTC3S59+OQiVOzE9GZt0x72uBGWKsQIuBKeF9iusmKFsg==} + engines: {node: '>=14.16'} + + strip-ansi@6.0.1: + resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} + engines: {node: '>=8'} + + strip-ansi@7.2.0: + resolution: {integrity: sha512-yDPMNjp4WyfYBkHnjIRLfca1i6KMyGCtsVgoKe/z1+6vukgaENdgGBZt+ZmKPc4gavvEZ5OgHfHdrazhgNyG7w==} + engines: {node: '>=12'} + + strip-bom@3.0.0: + resolution: {integrity: sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==} + engines: {node: '>=4'} 
+ + strip-final-newline@2.0.0: + resolution: {integrity: sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==} + engines: {node: '>=6'} + + strip-final-newline@4.0.0: + resolution: {integrity: sha512-aulFJcD6YK8V1G7iRB5tigAP4TsHBZZrOV8pjV++zdUwmeV8uzbY7yn6h9MswN62adStNZFuCIx4haBnRuMDaw==} + engines: {node: '>=18'} + + strip-json-comments@3.1.1: + resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} + engines: {node: '>=8'} + + supports-color@7.2.0: + resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==} + engines: {node: '>=8'} + + tabbable@6.4.0: + resolution: {integrity: sha512-05PUHKSNE8ou2dwIxTngl4EzcnsCDZGJ/iCLtDflR/SHB/ny14rXc+qU5P4mG9JkusiV7EivzY9Mhm55AzAvCg==} + + tagged-tag@1.0.0: + resolution: {integrity: sha512-yEFYrVhod+hdNyx7g5Bnkkb0G6si8HJurOoOEgC8B/O0uXLHlaey/65KRv6cuWBNhBgHKAROVpc7QyYqE5gFng==} + engines: {node: '>=20'} + + tailwind-merge@3.5.0: + resolution: {integrity: sha512-I8K9wewnVDkL1NTGoqWmVEIlUcB9gFriAEkXkfCjX5ib8ezGxtR3xD7iZIxrfArjEsH7F1CHD4RFUtxefdqV/A==} + + tailwindcss@4.2.2: + resolution: {integrity: sha512-KWBIxs1Xb6NoLdMVqhbhgwZf2PGBpPEiwOqgI4pFIYbNTfBXiKYyWoTsXgBQ9WFg/OlhnvHaY+AEpW7wSmFo2Q==} + + tapable@2.3.2: + resolution: {integrity: sha512-1MOpMXuhGzGL5TTCZFItxCc0AARf1EZFQkGqMm7ERKj8+Hgr5oLvJOVFcC+lRmR8hCe2S3jC4T5D7Vg/d7/fhA==} + engines: {node: '>=6'} + + tiny-invariant@1.3.3: + resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} + + tinyglobby@0.2.16: + resolution: {integrity: sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg==} + engines: {node: '>=12.0.0'} + + tldts-core@7.0.28: + resolution: {integrity: sha512-7W5Efjhsc3chVdFhqtaU0KtK32J37Zcr9RKtID54nG+tIpcY79CQK/veYPODxtD/LJ4Lue66jvrQzIX2Z2/pUQ==} + + tldts@7.0.28: + resolution: 
{integrity: sha512-+Zg3vWhRUv8B1maGSTFdev9mjoo8Etn2Ayfs4cnjlD3CsGkxXX4QyW3j2WJ0wdjYcYmy7Lx2RDsZMhgCWafKIw==} + hasBin: true + + to-regex-range@5.0.1: + resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==} + engines: {node: '>=8.0'} + + toidentifier@1.0.1: + resolution: {integrity: sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==} + engines: {node: '>=0.6'} + + tough-cookie@6.0.1: + resolution: {integrity: sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw==} + engines: {node: '>=16'} + + ts-api-utils@2.5.0: + resolution: {integrity: sha512-OJ/ibxhPlqrMM0UiNHJ/0CKQkoKF243/AEmplt3qpRgkW8VG7IfOS41h7V8TjITqdByHzrjcS/2si+y4lIh8NA==} + engines: {node: '>=18.12'} + peerDependencies: + typescript: '>=4.8.4' + + ts-morph@26.0.0: + resolution: {integrity: sha512-ztMO++owQnz8c/gIENcM9XfCEzgoGphTv+nKpYNM1bgsdOVC/jRZuEBf6N+mLLDNg68Kl+GgUZfOySaRiG1/Ug==} + + tsconfig-paths@4.2.0: + resolution: {integrity: sha512-NoZ4roiN7LnbKn9QqE1amc9DJfzvZXxF4xDavcOWt1BPkdx+m+0gJuPM+S0vCe7zTJMYUP0R8pO2XMr+Y8oLIg==} + engines: {node: '>=6'} + + tslib@2.8.1: + resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} + + tw-animate-css@1.4.0: + resolution: {integrity: sha512-7bziOlRqH0hJx80h/3mbicLW7o8qLsH5+RaLR2t+OHM3D0JlWGODQKQ4cxbK7WlvmUxpcj6Kgu6EKqjrGFe3QQ==} + + type-check@0.4.0: + resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==} + engines: {node: '>= 0.8.0'} + + type-fest@5.5.0: + resolution: {integrity: sha512-PlBfpQwiUvGViBNX84Yxwjsdhd1TUlXr6zjX7eoirtCPIr08NAmxwa+fcYBTeRQxHo9YC9wwF3m9i700sHma8g==} + engines: {node: '>=20'} + + type-is@2.0.1: + resolution: {integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==} + engines: {node: '>= 0.6'} + + 
typescript-eslint@8.58.1: + resolution: {integrity: sha512-gf6/oHChByg9HJvhMO1iBexJh12AqqTfnuxscMDOVqfJW3htsdRJI/GfPpHTTcyeB8cSTUY2JcZmVgoyPqcrDg==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + eslint: ^8.57.0 || ^9.0.0 || ^10.0.0 + typescript: '>=4.8.4 <6.1.0' + + typescript@6.0.2: + resolution: {integrity: sha512-bGdAIrZ0wiGDo5l8c++HWtbaNCWTS4UTv7RaTH/ThVIgjkveJt83m74bBHMJkuCbslY8ixgLBVZJIOiQlQTjfQ==} + engines: {node: '>=14.17'} + hasBin: true + + undici-types@7.16.0: + resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==} + + unicorn-magic@0.3.0: + resolution: {integrity: sha512-+QBBXBCvifc56fsbuxZQ6Sic3wqqc3WWaqxs58gvJrcOuN83HGTCwz3oS5phzU9LthRNE9VrJCFCLUgHeeFnfA==} + engines: {node: '>=18'} + + universalify@2.0.1: + resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} + engines: {node: '>= 10.0.0'} + + unpipe@1.0.0: + resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} + engines: {node: '>= 0.8'} + + until-async@3.0.2: + resolution: {integrity: sha512-IiSk4HlzAMqTUseHHe3VhIGyuFmN90zMTpD3Z3y8jeQbzLIq500MVM7Jq2vUAnTKAFPJrqwkzr6PoTcPhGcOiw==} + + update-browserslist-db@1.2.3: + resolution: {integrity: sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==} + hasBin: true + peerDependencies: + browserslist: '>= 4.21.0' + + uri-js@4.4.1: + resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==} + + use-sync-external-store@1.6.0: + resolution: {integrity: sha512-Pp6GSwGP/NrPIrxVFAIkOQeyw8lFenOHijQWkUTrDvrF4ALqylP2C/KCkeS9dpUM3KvYRQhna5vt7IL95+ZQ9w==} + peerDependencies: + react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + + util-deprecate@1.0.2: + resolution: {integrity: 
sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} + + validate-npm-package-name@7.0.2: + resolution: {integrity: sha512-hVDIBwsRruT73PbK7uP5ebUt+ezEtCmzZz3F59BSr2F6OVFnJ/6h8liuvdLrQ88Xmnk6/+xGGuq+pG9WwTuy3A==} + engines: {node: ^20.17.0 || >=22.9.0} + + vary@1.1.2: + resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==} + engines: {node: '>= 0.8'} + + vite@8.0.8: + resolution: {integrity: sha512-dbU7/iLVa8KZALJyLOBOQ88nOXtNG8vxKuOT4I2mD+Ya70KPceF4IAmDsmU0h1Qsn5bPrvsY9HJstCRh3hG6Uw==} + engines: {node: ^20.19.0 || >=22.12.0} + hasBin: true + peerDependencies: + '@types/node': ^20.19.0 || >=22.12.0 + '@vitejs/devtools': ^0.1.0 + esbuild: ^0.27.0 || ^0.28.0 + jiti: '>=1.21.0' + less: ^4.0.0 + sass: ^1.70.0 + sass-embedded: ^1.70.0 + stylus: '>=0.54.8' + sugarss: ^5.0.0 + terser: ^5.16.0 + tsx: ^4.8.1 + yaml: ^2.4.2 + peerDependenciesMeta: + '@types/node': + optional: true + '@vitejs/devtools': + optional: true + esbuild: + optional: true + jiti: + optional: true + less: + optional: true + sass: + optional: true + sass-embedded: + optional: true + stylus: + optional: true + sugarss: + optional: true + terser: + optional: true + tsx: + optional: true + yaml: + optional: true + + web-streams-polyfill@3.3.3: + resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==} + engines: {node: '>= 8'} + + which@2.0.2: + resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} + engines: {node: '>= 8'} + hasBin: true + + which@4.0.0: + resolution: {integrity: sha512-GlaYyEb07DPxYCKhKzplCWBJtvxZcZMrL+4UkrTSJHHPyZU4mYYTv3qaOe77H7EODLSSopAUFAc6W8U4yqvscg==} + engines: {node: ^16.13.0 || >=18.0.0} + hasBin: true + + word-wrap@1.2.5: + resolution: {integrity: 
sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==} + engines: {node: '>=0.10.0'} + + wrap-ansi@6.2.0: + resolution: {integrity: sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==} + engines: {node: '>=8'} + + wrap-ansi@7.0.0: + resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==} + engines: {node: '>=10'} + + wrappy@1.0.2: + resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} + + wsl-utils@0.3.1: + resolution: {integrity: sha512-g/eziiSUNBSsdDJtCLB8bdYEUMj4jR7AGeUo96p/3dTafgjHhpF4RiCFPiRILwjQoDXx5MqkBr4fwWtR3Ky4Wg==} + engines: {node: '>=20'} + + y18n@5.0.8: + resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} + engines: {node: '>=10'} + + yallist@3.1.1: + resolution: {integrity: sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==} + + yargs-parser@21.1.1: + resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==} + engines: {node: '>=12'} + + yargs@17.7.2: + resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==} + engines: {node: '>=12'} + + yocto-queue@0.1.0: + resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} + engines: {node: '>=10'} + + yocto-spinner@1.1.0: + resolution: {integrity: sha512-/BY0AUXnS7IKO354uLLA2eRcWiqDifEbd6unXCsOxkFDAkhgUL3PH9X2bFoaU0YchnDXsF+iKleeTLJGckbXfA==} + engines: {node: '>=18.19'} + + yoctocolors-cjs@2.1.3: + resolution: {integrity: sha512-U/PBtDf35ff0D8X8D0jfdzHYEPFxAI7jJlxZXwCSez5M3190m+QobIfh+sWDWSHMCWWJN2AWamkegn6vr6YBTw==} + engines: {node: '>=18'} + + yoctocolors@2.1.2: + resolution: {integrity: 
sha512-CzhO+pFNo8ajLM2d2IW/R93ipy99LWjtwblvC1RsoSUMZgyLbYFr221TnSNT7GjGdYui6P459mw9JH/g/zW2ug==} + engines: {node: '>=18'} + + zod-to-json-schema@3.25.2: + resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==} + peerDependencies: + zod: ^3.25.28 || ^4 + + zod-validation-error@4.0.2: + resolution: {integrity: sha512-Q6/nZLe6jxuU80qb/4uJ4t5v2VEZ44lzQjPDhYJNztRQ4wyWc6VF3D3Kb/fAuPetZQnhS3hnajCf9CsWesghLQ==} + engines: {node: '>=18.0.0'} + peerDependencies: + zod: ^3.25.0 || ^4.0.0 + + zod@3.25.76: + resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==} + + zod@4.3.6: + resolution: {integrity: sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==} + +snapshots: + + '@babel/code-frame@7.29.0': + dependencies: + '@babel/helper-validator-identifier': 7.28.5 + js-tokens: 4.0.0 + picocolors: 1.1.1 + + '@babel/compat-data@7.29.0': {} + + '@babel/core@7.29.0': + dependencies: + '@babel/code-frame': 7.29.0 + '@babel/generator': 7.29.1 + '@babel/helper-compilation-targets': 7.28.6 + '@babel/helper-module-transforms': 7.28.6(@babel/core@7.29.0) + '@babel/helpers': 7.29.2 + '@babel/parser': 7.29.2 + '@babel/template': 7.28.6 + '@babel/traverse': 7.29.0 + '@babel/types': 7.29.0 + '@jridgewell/remapping': 2.3.5 + convert-source-map: 2.0.0 + debug: 4.4.3 + gensync: 1.0.0-beta.2 + json5: 2.2.3 + semver: 6.3.1 + transitivePeerDependencies: + - supports-color + + '@babel/generator@7.29.1': + dependencies: + '@babel/parser': 7.29.2 + '@babel/types': 7.29.0 + '@jridgewell/gen-mapping': 0.3.13 + '@jridgewell/trace-mapping': 0.3.31 + jsesc: 3.1.0 + + '@babel/helper-annotate-as-pure@7.27.3': + dependencies: + '@babel/types': 7.29.0 + + '@babel/helper-compilation-targets@7.28.6': + dependencies: + '@babel/compat-data': 7.29.0 + '@babel/helper-validator-option': 7.27.1 + browserslist: 4.28.2 + lru-cache: 5.1.1 + semver: 
6.3.1 + + '@babel/helper-create-class-features-plugin@7.28.6(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-annotate-as-pure': 7.27.3 + '@babel/helper-member-expression-to-functions': 7.28.5 + '@babel/helper-optimise-call-expression': 7.27.1 + '@babel/helper-replace-supers': 7.28.6(@babel/core@7.29.0) + '@babel/helper-skip-transparent-expression-wrappers': 7.27.1 + '@babel/traverse': 7.29.0 + semver: 6.3.1 + transitivePeerDependencies: + - supports-color + + '@babel/helper-globals@7.28.0': {} + + '@babel/helper-member-expression-to-functions@7.28.5': + dependencies: + '@babel/traverse': 7.29.0 + '@babel/types': 7.29.0 + transitivePeerDependencies: + - supports-color + + '@babel/helper-module-imports@7.28.6': + dependencies: + '@babel/traverse': 7.29.0 + '@babel/types': 7.29.0 + transitivePeerDependencies: + - supports-color + + '@babel/helper-module-transforms@7.28.6(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-module-imports': 7.28.6 + '@babel/helper-validator-identifier': 7.28.5 + '@babel/traverse': 7.29.0 + transitivePeerDependencies: + - supports-color + + '@babel/helper-optimise-call-expression@7.27.1': + dependencies: + '@babel/types': 7.29.0 + + '@babel/helper-plugin-utils@7.28.6': {} + + '@babel/helper-replace-supers@7.28.6(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-member-expression-to-functions': 7.28.5 + '@babel/helper-optimise-call-expression': 7.27.1 + '@babel/traverse': 7.29.0 + transitivePeerDependencies: + - supports-color + + '@babel/helper-skip-transparent-expression-wrappers@7.27.1': + dependencies: + '@babel/traverse': 7.29.0 + '@babel/types': 7.29.0 + transitivePeerDependencies: + - supports-color + + '@babel/helper-string-parser@7.27.1': {} + + '@babel/helper-validator-identifier@7.28.5': {} + + '@babel/helper-validator-option@7.27.1': {} + + '@babel/helpers@7.29.2': + dependencies: + '@babel/template': 7.28.6 + '@babel/types': 7.29.0 + + 
'@babel/parser@7.29.2': + dependencies: + '@babel/types': 7.29.0 + + '@babel/plugin-syntax-jsx@7.28.6(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-plugin-utils': 7.28.6 + + '@babel/plugin-syntax-typescript@7.28.6(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-plugin-utils': 7.28.6 + + '@babel/plugin-transform-modules-commonjs@7.28.6(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-module-transforms': 7.28.6(@babel/core@7.29.0) + '@babel/helper-plugin-utils': 7.28.6 + transitivePeerDependencies: + - supports-color + + '@babel/plugin-transform-typescript@7.28.6(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-annotate-as-pure': 7.27.3 + '@babel/helper-create-class-features-plugin': 7.28.6(@babel/core@7.29.0) + '@babel/helper-plugin-utils': 7.28.6 + '@babel/helper-skip-transparent-expression-wrappers': 7.27.1 + '@babel/plugin-syntax-typescript': 7.28.6(@babel/core@7.29.0) + transitivePeerDependencies: + - supports-color + + '@babel/preset-typescript@7.28.5(@babel/core@7.29.0)': + dependencies: + '@babel/core': 7.29.0 + '@babel/helper-plugin-utils': 7.28.6 + '@babel/helper-validator-option': 7.27.1 + '@babel/plugin-syntax-jsx': 7.28.6(@babel/core@7.29.0) + '@babel/plugin-transform-modules-commonjs': 7.28.6(@babel/core@7.29.0) + '@babel/plugin-transform-typescript': 7.28.6(@babel/core@7.29.0) + transitivePeerDependencies: + - supports-color + + '@babel/runtime@7.29.2': {} + + '@babel/template@7.28.6': + dependencies: + '@babel/code-frame': 7.29.0 + '@babel/parser': 7.29.2 + '@babel/types': 7.29.0 + + '@babel/traverse@7.29.0': + dependencies: + '@babel/code-frame': 7.29.0 + '@babel/generator': 7.29.1 + '@babel/helper-globals': 7.28.0 + '@babel/parser': 7.29.2 + '@babel/template': 7.28.6 + '@babel/types': 7.29.0 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + + '@babel/types@7.29.0': + dependencies: + 
'@babel/helper-string-parser': 7.27.1 + '@babel/helper-validator-identifier': 7.28.5 + + '@base-ui/react@1.3.0(@types/react@19.2.14)(react-dom@19.2.5(react@19.2.5))(react@19.2.5)': + dependencies: + '@babel/runtime': 7.29.2 + '@base-ui/utils': 0.2.6(@types/react@19.2.14)(react-dom@19.2.5(react@19.2.5))(react@19.2.5) + '@floating-ui/react-dom': 2.1.8(react-dom@19.2.5(react@19.2.5))(react@19.2.5) + '@floating-ui/utils': 0.2.11 + react: 19.2.5 + react-dom: 19.2.5(react@19.2.5) + tabbable: 6.4.0 + use-sync-external-store: 1.6.0(react@19.2.5) + optionalDependencies: + '@types/react': 19.2.14 + + '@base-ui/utils@0.2.6(@types/react@19.2.14)(react-dom@19.2.5(react@19.2.5))(react@19.2.5)': + dependencies: + '@babel/runtime': 7.29.2 + '@floating-ui/utils': 0.2.11 + react: 19.2.5 + react-dom: 19.2.5(react@19.2.5) + reselect: 5.1.1 + use-sync-external-store: 1.6.0(react@19.2.5) + optionalDependencies: + '@types/react': 19.2.14 + + '@dotenvx/dotenvx@1.61.0': + dependencies: + commander: 11.1.0 + dotenv: 17.4.1 + eciesjs: 0.4.18 + execa: 5.1.1 + fdir: 6.5.0(picomatch@4.0.4) + ignore: 5.3.2 + object-treeify: 1.1.33 + picomatch: 4.0.4 + which: 4.0.0 + yocto-spinner: 1.1.0 + + '@ecies/ciphers@0.2.6(@noble/ciphers@1.3.0)': + dependencies: + '@noble/ciphers': 1.3.0 + + '@emnapi/core@1.9.2': + dependencies: + '@emnapi/wasi-threads': 1.2.1 + tslib: 2.8.1 + optional: true + + '@emnapi/runtime@1.9.2': + dependencies: + tslib: 2.8.1 + optional: true + + '@emnapi/wasi-threads@1.2.1': + dependencies: + tslib: 2.8.1 + optional: true + + '@eslint-community/eslint-utils@4.9.1(eslint@9.39.4(jiti@2.6.1))': + dependencies: + eslint: 9.39.4(jiti@2.6.1) + eslint-visitor-keys: 3.4.3 + + '@eslint-community/regexpp@4.12.2': {} + + '@eslint/config-array@0.21.2': + dependencies: + '@eslint/object-schema': 2.1.7 + debug: 4.4.3 + minimatch: 3.1.5 + transitivePeerDependencies: + - supports-color + + '@eslint/config-helpers@0.4.2': + dependencies: + '@eslint/core': 0.17.0 + + '@eslint/core@0.17.0': + 
dependencies: + '@types/json-schema': 7.0.15 + + '@eslint/eslintrc@3.3.5': + dependencies: + ajv: 6.14.0 + debug: 4.4.3 + espree: 10.4.0 + globals: 14.0.0 + ignore: 5.3.2 + import-fresh: 3.3.1 + js-yaml: 4.1.1 + minimatch: 3.1.5 + strip-json-comments: 3.1.1 + transitivePeerDependencies: + - supports-color + + '@eslint/js@9.39.4': {} + + '@eslint/object-schema@2.1.7': {} + + '@eslint/plugin-kit@0.4.1': + dependencies: + '@eslint/core': 0.17.0 + levn: 0.4.1 + + '@floating-ui/core@1.7.5': + dependencies: + '@floating-ui/utils': 0.2.11 + + '@floating-ui/dom@1.7.6': + dependencies: + '@floating-ui/core': 1.7.5 + '@floating-ui/utils': 0.2.11 + + '@floating-ui/react-dom@2.1.8(react-dom@19.2.5(react@19.2.5))(react@19.2.5)': + dependencies: + '@floating-ui/dom': 1.7.6 + react: 19.2.5 + react-dom: 19.2.5(react@19.2.5) + + '@floating-ui/utils@0.2.11': {} + + '@fontsource-variable/geist@5.2.8': {} + + '@fontsource-variable/outfit@5.2.8': {} + + '@hono/node-server@1.19.13(hono@4.12.12)': + dependencies: + hono: 4.12.12 + + '@humanfs/core@0.19.1': {} + + '@humanfs/node@0.16.7': + dependencies: + '@humanfs/core': 0.19.1 + '@humanwhocodes/retry': 0.4.3 + + '@humanwhocodes/module-importer@1.0.1': {} + + '@humanwhocodes/retry@0.4.3': {} + + '@inquirer/ansi@1.0.2': {} + + '@inquirer/confirm@5.1.21(@types/node@24.12.2)': + dependencies: + '@inquirer/core': 10.3.2(@types/node@24.12.2) + '@inquirer/type': 3.0.10(@types/node@24.12.2) + optionalDependencies: + '@types/node': 24.12.2 + + '@inquirer/core@10.3.2(@types/node@24.12.2)': + dependencies: + '@inquirer/ansi': 1.0.2 + '@inquirer/figures': 1.0.15 + '@inquirer/type': 3.0.10(@types/node@24.12.2) + cli-width: 4.1.0 + mute-stream: 2.0.0 + signal-exit: 4.1.0 + wrap-ansi: 6.2.0 + yoctocolors-cjs: 2.1.3 + optionalDependencies: + '@types/node': 24.12.2 + + '@inquirer/figures@1.0.15': {} + + '@inquirer/type@3.0.10(@types/node@24.12.2)': + optionalDependencies: + '@types/node': 24.12.2 + + '@jridgewell/gen-mapping@0.3.13': + dependencies: + 
'@jridgewell/sourcemap-codec': 1.5.5 + '@jridgewell/trace-mapping': 0.3.31 + + '@jridgewell/remapping@2.3.5': + dependencies: + '@jridgewell/gen-mapping': 0.3.13 + '@jridgewell/trace-mapping': 0.3.31 + + '@jridgewell/resolve-uri@3.1.2': {} + + '@jridgewell/sourcemap-codec@1.5.5': {} + + '@jridgewell/trace-mapping@0.3.31': + dependencies: + '@jridgewell/resolve-uri': 3.1.2 + '@jridgewell/sourcemap-codec': 1.5.5 + + '@modelcontextprotocol/sdk@1.29.0(zod@3.25.76)': + dependencies: + '@hono/node-server': 1.19.13(hono@4.12.12) + ajv: 8.18.0 + ajv-formats: 3.0.1(ajv@8.18.0) + content-type: 1.0.5 + cors: 2.8.6 + cross-spawn: 7.0.6 + eventsource: 3.0.7 + eventsource-parser: 3.0.6 + express: 5.2.1 + express-rate-limit: 8.3.2(express@5.2.1) + hono: 4.12.12 + jose: 6.2.2 + json-schema-typed: 8.0.2 + pkce-challenge: 5.0.1 + raw-body: 3.0.2 + zod: 3.25.76 + zod-to-json-schema: 3.25.2(zod@3.25.76) + transitivePeerDependencies: + - supports-color + + '@mswjs/interceptors@0.41.3': + dependencies: + '@open-draft/deferred-promise': 2.2.0 + '@open-draft/logger': 0.3.0 + '@open-draft/until': 2.1.0 + is-node-process: 1.2.0 + outvariant: 1.4.3 + strict-event-emitter: 0.5.1 + + '@napi-rs/wasm-runtime@1.1.3(@emnapi/core@1.9.2)(@emnapi/runtime@1.9.2)': + dependencies: + '@emnapi/core': 1.9.2 + '@emnapi/runtime': 1.9.2 + '@tybys/wasm-util': 0.10.1 + optional: true + + '@noble/ciphers@1.3.0': {} + + '@noble/curves@1.9.7': + dependencies: + '@noble/hashes': 1.8.0 + + '@noble/hashes@1.8.0': {} + + '@nodelib/fs.scandir@2.1.5': + dependencies: + '@nodelib/fs.stat': 2.0.5 + run-parallel: 1.2.0 + + '@nodelib/fs.stat@2.0.5': {} + + '@nodelib/fs.walk@1.2.8': + dependencies: + '@nodelib/fs.scandir': 2.1.5 + fastq: 1.20.1 + + '@open-draft/deferred-promise@2.2.0': {} + + '@open-draft/logger@0.3.0': + dependencies: + is-node-process: 1.2.0 + outvariant: 1.4.3 + + '@open-draft/until@2.1.0': {} + + '@oxc-project/types@0.124.0': {} + + '@rolldown/binding-android-arm64@1.0.0-rc.15': + optional: true + + 
'@rolldown/binding-darwin-arm64@1.0.0-rc.15': + optional: true + + '@rolldown/binding-darwin-x64@1.0.0-rc.15': + optional: true + + '@rolldown/binding-freebsd-x64@1.0.0-rc.15': + optional: true + + '@rolldown/binding-linux-arm-gnueabihf@1.0.0-rc.15': + optional: true + + '@rolldown/binding-linux-arm64-gnu@1.0.0-rc.15': + optional: true + + '@rolldown/binding-linux-arm64-musl@1.0.0-rc.15': + optional: true + + '@rolldown/binding-linux-ppc64-gnu@1.0.0-rc.15': + optional: true + + '@rolldown/binding-linux-s390x-gnu@1.0.0-rc.15': + optional: true + + '@rolldown/binding-linux-x64-gnu@1.0.0-rc.15': + optional: true + + '@rolldown/binding-linux-x64-musl@1.0.0-rc.15': + optional: true + + '@rolldown/binding-openharmony-arm64@1.0.0-rc.15': + optional: true + + '@rolldown/binding-wasm32-wasi@1.0.0-rc.15': + dependencies: + '@emnapi/core': 1.9.2 + '@emnapi/runtime': 1.9.2 + '@napi-rs/wasm-runtime': 1.1.3(@emnapi/core@1.9.2)(@emnapi/runtime@1.9.2) + optional: true + + '@rolldown/binding-win32-arm64-msvc@1.0.0-rc.15': + optional: true + + '@rolldown/binding-win32-x64-msvc@1.0.0-rc.15': + optional: true + + '@rolldown/pluginutils@1.0.0-rc.15': {} + + '@rolldown/pluginutils@1.0.0-rc.7': {} + + '@sec-ant/readable-stream@0.4.1': {} + + '@sindresorhus/merge-streams@4.0.0': {} + + '@tailwindcss/node@4.2.2': + dependencies: + '@jridgewell/remapping': 2.3.5 + enhanced-resolve: 5.20.1 + jiti: 2.6.1 + lightningcss: 1.32.0 + magic-string: 0.30.21 + source-map-js: 1.2.1 + tailwindcss: 4.2.2 + + '@tailwindcss/oxide-android-arm64@4.2.2': + optional: true + + '@tailwindcss/oxide-darwin-arm64@4.2.2': + optional: true + + '@tailwindcss/oxide-darwin-x64@4.2.2': + optional: true + + '@tailwindcss/oxide-freebsd-x64@4.2.2': + optional: true + + '@tailwindcss/oxide-linux-arm-gnueabihf@4.2.2': + optional: true + + '@tailwindcss/oxide-linux-arm64-gnu@4.2.2': + optional: true + + '@tailwindcss/oxide-linux-arm64-musl@4.2.2': + optional: true + + '@tailwindcss/oxide-linux-x64-gnu@4.2.2': + optional: true 
+ + '@tailwindcss/oxide-linux-x64-musl@4.2.2': + optional: true + + '@tailwindcss/oxide-wasm32-wasi@4.2.2': + optional: true + + '@tailwindcss/oxide-win32-arm64-msvc@4.2.2': + optional: true + + '@tailwindcss/oxide-win32-x64-msvc@4.2.2': + optional: true + + '@tailwindcss/oxide@4.2.2': + optionalDependencies: + '@tailwindcss/oxide-android-arm64': 4.2.2 + '@tailwindcss/oxide-darwin-arm64': 4.2.2 + '@tailwindcss/oxide-darwin-x64': 4.2.2 + '@tailwindcss/oxide-freebsd-x64': 4.2.2 + '@tailwindcss/oxide-linux-arm-gnueabihf': 4.2.2 + '@tailwindcss/oxide-linux-arm64-gnu': 4.2.2 + '@tailwindcss/oxide-linux-arm64-musl': 4.2.2 + '@tailwindcss/oxide-linux-x64-gnu': 4.2.2 + '@tailwindcss/oxide-linux-x64-musl': 4.2.2 + '@tailwindcss/oxide-wasm32-wasi': 4.2.2 + '@tailwindcss/oxide-win32-arm64-msvc': 4.2.2 + '@tailwindcss/oxide-win32-x64-msvc': 4.2.2 + + '@tailwindcss/vite@4.2.2(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1))': + dependencies: + '@tailwindcss/node': 4.2.2 + '@tailwindcss/oxide': 4.2.2 + tailwindcss: 4.2.2 + vite: 8.0.8(@types/node@24.12.2)(jiti@2.6.1) + + '@tanstack/query-core@5.97.0': {} + + '@tanstack/react-query@5.97.0(react@19.2.5)': + dependencies: + '@tanstack/query-core': 5.97.0 + react: 19.2.5 + + '@tanstack/react-virtual@3.13.23(react-dom@19.2.5(react@19.2.5))(react@19.2.5)': + dependencies: + '@tanstack/virtual-core': 3.13.23 + react: 19.2.5 + react-dom: 19.2.5(react@19.2.5) + + '@tanstack/virtual-core@3.13.23': {} + + '@ts-morph/common@0.27.0': + dependencies: + fast-glob: 3.3.3 + minimatch: 10.2.5 + path-browserify: 1.0.1 + + '@tybys/wasm-util@0.10.1': + dependencies: + tslib: 2.8.1 + optional: true + + '@types/estree@1.0.8': {} + + '@types/json-schema@7.0.15': {} + + '@types/node@24.12.2': + dependencies: + undici-types: 7.16.0 + + '@types/react-dom@19.2.3(@types/react@19.2.14)': + dependencies: + '@types/react': 19.2.14 + + '@types/react@19.2.14': + dependencies: + csstype: 3.2.3 + + '@types/statuses@2.0.6': {} + + 
'@types/validate-npm-package-name@4.0.2': {} + + '@typescript-eslint/eslint-plugin@8.58.1(@typescript-eslint/parser@8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2))(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2)': + dependencies: + '@eslint-community/regexpp': 4.12.2 + '@typescript-eslint/parser': 8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + '@typescript-eslint/scope-manager': 8.58.1 + '@typescript-eslint/type-utils': 8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + '@typescript-eslint/utils': 8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + '@typescript-eslint/visitor-keys': 8.58.1 + eslint: 9.39.4(jiti@2.6.1) + ignore: 7.0.5 + natural-compare: 1.4.0 + ts-api-utils: 2.5.0(typescript@6.0.2) + typescript: 6.0.2 + transitivePeerDependencies: + - supports-color + + '@typescript-eslint/parser@8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2)': + dependencies: + '@typescript-eslint/scope-manager': 8.58.1 + '@typescript-eslint/types': 8.58.1 + '@typescript-eslint/typescript-estree': 8.58.1(typescript@6.0.2) + '@typescript-eslint/visitor-keys': 8.58.1 + debug: 4.4.3 + eslint: 9.39.4(jiti@2.6.1) + typescript: 6.0.2 + transitivePeerDependencies: + - supports-color + + '@typescript-eslint/project-service@8.58.1(typescript@6.0.2)': + dependencies: + '@typescript-eslint/tsconfig-utils': 8.58.1(typescript@6.0.2) + '@typescript-eslint/types': 8.58.1 + debug: 4.4.3 + typescript: 6.0.2 + transitivePeerDependencies: + - supports-color + + '@typescript-eslint/scope-manager@8.58.1': + dependencies: + '@typescript-eslint/types': 8.58.1 + '@typescript-eslint/visitor-keys': 8.58.1 + + '@typescript-eslint/tsconfig-utils@8.58.1(typescript@6.0.2)': + dependencies: + typescript: 6.0.2 + + '@typescript-eslint/type-utils@8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2)': + dependencies: + '@typescript-eslint/types': 8.58.1 + '@typescript-eslint/typescript-estree': 8.58.1(typescript@6.0.2) + '@typescript-eslint/utils': 
8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + debug: 4.4.3 + eslint: 9.39.4(jiti@2.6.1) + ts-api-utils: 2.5.0(typescript@6.0.2) + typescript: 6.0.2 + transitivePeerDependencies: + - supports-color + + '@typescript-eslint/types@8.58.1': {} + + '@typescript-eslint/typescript-estree@8.58.1(typescript@6.0.2)': + dependencies: + '@typescript-eslint/project-service': 8.58.1(typescript@6.0.2) + '@typescript-eslint/tsconfig-utils': 8.58.1(typescript@6.0.2) + '@typescript-eslint/types': 8.58.1 + '@typescript-eslint/visitor-keys': 8.58.1 + debug: 4.4.3 + minimatch: 10.2.5 + semver: 7.7.4 + tinyglobby: 0.2.16 + ts-api-utils: 2.5.0(typescript@6.0.2) + typescript: 6.0.2 + transitivePeerDependencies: + - supports-color + + '@typescript-eslint/utils@8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2)': + dependencies: + '@eslint-community/eslint-utils': 4.9.1(eslint@9.39.4(jiti@2.6.1)) + '@typescript-eslint/scope-manager': 8.58.1 + '@typescript-eslint/types': 8.58.1 + '@typescript-eslint/typescript-estree': 8.58.1(typescript@6.0.2) + eslint: 9.39.4(jiti@2.6.1) + typescript: 6.0.2 + transitivePeerDependencies: + - supports-color + + '@typescript-eslint/visitor-keys@8.58.1': + dependencies: + '@typescript-eslint/types': 8.58.1 + eslint-visitor-keys: 5.0.1 + + '@vitejs/plugin-react@6.0.1(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1))': + dependencies: + '@rolldown/pluginutils': 1.0.0-rc.7 + vite: 8.0.8(@types/node@24.12.2)(jiti@2.6.1) + + accepts@2.0.0: + dependencies: + mime-types: 3.0.2 + negotiator: 1.0.0 + + acorn-jsx@5.3.2(acorn@8.16.0): + dependencies: + acorn: 8.16.0 + + acorn@8.16.0: {} + + agent-base@7.1.4: {} + + ajv-formats@3.0.1(ajv@8.18.0): + optionalDependencies: + ajv: 8.18.0 + + ajv@6.14.0: + dependencies: + fast-deep-equal: 3.1.3 + fast-json-stable-stringify: 2.1.0 + json-schema-traverse: 0.4.1 + uri-js: 4.4.1 + + ajv@8.18.0: + dependencies: + fast-deep-equal: 3.1.3 + fast-uri: 3.1.0 + json-schema-traverse: 1.0.0 + require-from-string: 2.0.2 + + 
ansi-regex@5.0.1: {} + + ansi-regex@6.2.2: {} + + ansi-styles@4.3.0: + dependencies: + color-convert: 2.0.1 + + argparse@2.0.1: {} + + ast-types@0.16.1: + dependencies: + tslib: 2.8.1 + + balanced-match@1.0.2: {} + + balanced-match@4.0.4: {} + + baseline-browser-mapping@2.10.17: {} + + body-parser@2.2.2: + dependencies: + bytes: 3.1.2 + content-type: 1.0.5 + debug: 4.4.3 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + on-finished: 2.4.1 + qs: 6.15.1 + raw-body: 3.0.2 + type-is: 2.0.1 + transitivePeerDependencies: + - supports-color + + brace-expansion@1.1.13: + dependencies: + balanced-match: 1.0.2 + concat-map: 0.0.1 + + brace-expansion@5.0.5: + dependencies: + balanced-match: 4.0.4 + + braces@3.0.3: + dependencies: + fill-range: 7.1.1 + + browserslist@4.28.2: + dependencies: + baseline-browser-mapping: 2.10.17 + caniuse-lite: 1.0.30001787 + electron-to-chromium: 1.5.334 + node-releases: 2.0.37 + update-browserslist-db: 1.2.3(browserslist@4.28.2) + + bundle-name@4.1.0: + dependencies: + run-applescript: 7.1.0 + + bytes@3.1.2: {} + + call-bind-apply-helpers@1.0.2: + dependencies: + es-errors: 1.3.0 + function-bind: 1.1.2 + + call-bound@1.0.4: + dependencies: + call-bind-apply-helpers: 1.0.2 + get-intrinsic: 1.3.0 + + callsites@3.1.0: {} + + caniuse-lite@1.0.30001787: {} + + chalk@4.1.2: + dependencies: + ansi-styles: 4.3.0 + supports-color: 7.2.0 + + chalk@5.6.2: {} + + class-variance-authority@0.7.1: + dependencies: + clsx: 2.1.1 + + cli-cursor@5.0.0: + dependencies: + restore-cursor: 5.1.0 + + cli-spinners@2.9.2: {} + + cli-width@4.1.0: {} + + cliui@8.0.1: + dependencies: + string-width: 4.2.3 + strip-ansi: 6.0.1 + wrap-ansi: 7.0.0 + + clsx@2.1.1: {} + + code-block-writer@13.0.3: {} + + color-convert@2.0.1: + dependencies: + color-name: 1.1.4 + + color-name@1.1.4: {} + + commander@11.1.0: {} + + commander@14.0.3: {} + + concat-map@0.0.1: {} + + content-disposition@1.1.0: {} + + content-type@1.0.5: {} + + convert-source-map@2.0.0: {} + + cookie-signature@1.2.2: {} + + 
cookie@0.7.2: {} + + cookie@1.1.1: {} + + cors@2.8.6: + dependencies: + object-assign: 4.1.1 + vary: 1.1.2 + + cosmiconfig@9.0.1(typescript@6.0.2): + dependencies: + env-paths: 2.2.1 + import-fresh: 3.3.1 + js-yaml: 4.1.1 + parse-json: 5.2.0 + optionalDependencies: + typescript: 6.0.2 + + cross-spawn@7.0.6: + dependencies: + path-key: 3.1.1 + shebang-command: 2.0.0 + which: 2.0.2 + + cssesc@3.0.0: {} + + csstype@3.2.3: {} + + data-uri-to-buffer@4.0.1: {} + + debug@4.4.3: + dependencies: + ms: 2.1.3 + + dedent@1.7.2: {} + + deep-is@0.1.4: {} + + deepmerge@4.3.1: {} + + default-browser-id@5.0.1: {} + + default-browser@5.5.0: + dependencies: + bundle-name: 4.1.0 + default-browser-id: 5.0.1 + + define-lazy-prop@3.0.0: {} + + depd@2.0.0: {} + + detect-libc@2.1.2: {} + + diff@8.0.4: {} + + dotenv@17.4.1: {} + + dunder-proto@1.0.1: + dependencies: + call-bind-apply-helpers: 1.0.2 + es-errors: 1.3.0 + gopd: 1.2.0 + + eciesjs@0.4.18: + dependencies: + '@ecies/ciphers': 0.2.6(@noble/ciphers@1.3.0) + '@noble/ciphers': 1.3.0 + '@noble/curves': 1.9.7 + '@noble/hashes': 1.8.0 + + ee-first@1.1.1: {} + + electron-to-chromium@1.5.334: {} + + emoji-regex@10.6.0: {} + + emoji-regex@8.0.0: {} + + encodeurl@2.0.0: {} + + enhanced-resolve@5.20.1: + dependencies: + graceful-fs: 4.2.11 + tapable: 2.3.2 + + env-paths@2.2.1: {} + + error-ex@1.3.4: + dependencies: + is-arrayish: 0.2.1 + + es-define-property@1.0.1: {} + + es-errors@1.3.0: {} + + es-object-atoms@1.1.1: + dependencies: + es-errors: 1.3.0 + + escalade@3.2.0: {} + + escape-html@1.0.3: {} + + escape-string-regexp@4.0.0: {} + + eslint-plugin-react-hooks@7.0.1(eslint@9.39.4(jiti@2.6.1)): + dependencies: + '@babel/core': 7.29.0 + '@babel/parser': 7.29.2 + eslint: 9.39.4(jiti@2.6.1) + hermes-parser: 0.25.1 + zod: 4.3.6 + zod-validation-error: 4.0.2(zod@4.3.6) + transitivePeerDependencies: + - supports-color + + eslint-plugin-react-refresh@0.5.2(eslint@9.39.4(jiti@2.6.1)): + dependencies: + eslint: 9.39.4(jiti@2.6.1) + + 
eslint-scope@8.4.0: + dependencies: + esrecurse: 4.3.0 + estraverse: 5.3.0 + + eslint-visitor-keys@3.4.3: {} + + eslint-visitor-keys@4.2.1: {} + + eslint-visitor-keys@5.0.1: {} + + eslint@9.39.4(jiti@2.6.1): + dependencies: + '@eslint-community/eslint-utils': 4.9.1(eslint@9.39.4(jiti@2.6.1)) + '@eslint-community/regexpp': 4.12.2 + '@eslint/config-array': 0.21.2 + '@eslint/config-helpers': 0.4.2 + '@eslint/core': 0.17.0 + '@eslint/eslintrc': 3.3.5 + '@eslint/js': 9.39.4 + '@eslint/plugin-kit': 0.4.1 + '@humanfs/node': 0.16.7 + '@humanwhocodes/module-importer': 1.0.1 + '@humanwhocodes/retry': 0.4.3 + '@types/estree': 1.0.8 + ajv: 6.14.0 + chalk: 4.1.2 + cross-spawn: 7.0.6 + debug: 4.4.3 + escape-string-regexp: 4.0.0 + eslint-scope: 8.4.0 + eslint-visitor-keys: 4.2.1 + espree: 10.4.0 + esquery: 1.7.0 + esutils: 2.0.3 + fast-deep-equal: 3.1.3 + file-entry-cache: 8.0.0 + find-up: 5.0.0 + glob-parent: 6.0.2 + ignore: 5.3.2 + imurmurhash: 0.1.4 + is-glob: 4.0.3 + json-stable-stringify-without-jsonify: 1.0.1 + lodash.merge: 4.6.2 + minimatch: 3.1.5 + natural-compare: 1.4.0 + optionator: 0.9.4 + optionalDependencies: + jiti: 2.6.1 + transitivePeerDependencies: + - supports-color + + espree@10.4.0: + dependencies: + acorn: 8.16.0 + acorn-jsx: 5.3.2(acorn@8.16.0) + eslint-visitor-keys: 4.2.1 + + esprima@4.0.1: {} + + esquery@1.7.0: + dependencies: + estraverse: 5.3.0 + + esrecurse@4.3.0: + dependencies: + estraverse: 5.3.0 + + estraverse@5.3.0: {} + + esutils@2.0.3: {} + + etag@1.8.1: {} + + eventsource-parser@3.0.6: {} + + eventsource@3.0.7: + dependencies: + eventsource-parser: 3.0.6 + + execa@5.1.1: + dependencies: + cross-spawn: 7.0.6 + get-stream: 6.0.1 + human-signals: 2.1.0 + is-stream: 2.0.1 + merge-stream: 2.0.0 + npm-run-path: 4.0.1 + onetime: 5.1.2 + signal-exit: 3.0.7 + strip-final-newline: 2.0.0 + + execa@9.6.1: + dependencies: + '@sindresorhus/merge-streams': 4.0.0 + cross-spawn: 7.0.6 + figures: 6.1.0 + get-stream: 9.0.1 + human-signals: 8.0.1 + is-plain-obj: 
4.1.0 + is-stream: 4.0.1 + npm-run-path: 6.0.0 + pretty-ms: 9.3.0 + signal-exit: 4.1.0 + strip-final-newline: 4.0.0 + yoctocolors: 2.1.2 + + express-rate-limit@8.3.2(express@5.2.1): + dependencies: + express: 5.2.1 + ip-address: 10.1.0 + + express@5.2.1: + dependencies: + accepts: 2.0.0 + body-parser: 2.2.2 + content-disposition: 1.1.0 + content-type: 1.0.5 + cookie: 0.7.2 + cookie-signature: 1.2.2 + debug: 4.4.3 + depd: 2.0.0 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + finalhandler: 2.1.1 + fresh: 2.0.0 + http-errors: 2.0.1 + merge-descriptors: 2.0.0 + mime-types: 3.0.2 + on-finished: 2.4.1 + once: 1.4.0 + parseurl: 1.3.3 + proxy-addr: 2.0.7 + qs: 6.15.1 + range-parser: 1.2.1 + router: 2.2.0 + send: 1.2.1 + serve-static: 2.2.1 + statuses: 2.0.2 + type-is: 2.0.1 + vary: 1.1.2 + transitivePeerDependencies: + - supports-color + + fast-deep-equal@3.1.3: {} + + fast-glob@3.3.3: + dependencies: + '@nodelib/fs.stat': 2.0.5 + '@nodelib/fs.walk': 1.2.8 + glob-parent: 5.1.2 + merge2: 1.4.1 + micromatch: 4.0.8 + + fast-json-stable-stringify@2.1.0: {} + + fast-levenshtein@2.0.6: {} + + fast-uri@3.1.0: {} + + fastq@1.20.1: + dependencies: + reusify: 1.1.0 + + fdir@6.5.0(picomatch@4.0.4): + optionalDependencies: + picomatch: 4.0.4 + + fetch-blob@3.2.0: + dependencies: + node-domexception: 1.0.0 + web-streams-polyfill: 3.3.3 + + figures@6.1.0: + dependencies: + is-unicode-supported: 2.1.0 + + file-entry-cache@8.0.0: + dependencies: + flat-cache: 4.0.1 + + fill-range@7.1.1: + dependencies: + to-regex-range: 5.0.1 + + finalhandler@2.1.1: + dependencies: + debug: 4.4.3 + encodeurl: 2.0.0 + escape-html: 1.0.3 + on-finished: 2.4.1 + parseurl: 1.3.3 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + + find-up@5.0.0: + dependencies: + locate-path: 6.0.0 + path-exists: 4.0.0 + + flat-cache@4.0.1: + dependencies: + flatted: 3.4.2 + keyv: 4.5.4 + + flatted@3.4.2: {} + + formdata-polyfill@4.0.10: + dependencies: + fetch-blob: 3.2.0 + + forwarded@0.2.0: {} + + 
fresh@2.0.0: {} + + fs-extra@11.3.4: + dependencies: + graceful-fs: 4.2.11 + jsonfile: 6.2.0 + universalify: 2.0.1 + + fsevents@2.3.3: + optional: true + + function-bind@1.1.2: {} + + fuzzysort@3.1.0: {} + + gensync@1.0.0-beta.2: {} + + get-caller-file@2.0.5: {} + + get-east-asian-width@1.5.0: {} + + get-intrinsic@1.3.0: + dependencies: + call-bind-apply-helpers: 1.0.2 + es-define-property: 1.0.1 + es-errors: 1.3.0 + es-object-atoms: 1.1.1 + function-bind: 1.1.2 + get-proto: 1.0.1 + gopd: 1.2.0 + has-symbols: 1.1.0 + hasown: 2.0.2 + math-intrinsics: 1.1.0 + + get-own-enumerable-keys@1.0.0: {} + + get-proto@1.0.1: + dependencies: + dunder-proto: 1.0.1 + es-object-atoms: 1.1.1 + + get-stream@6.0.1: {} + + get-stream@9.0.1: + dependencies: + '@sec-ant/readable-stream': 0.4.1 + is-stream: 4.0.1 + + glob-parent@5.1.2: + dependencies: + is-glob: 4.0.3 + + glob-parent@6.0.2: + dependencies: + is-glob: 4.0.3 + + globals@14.0.0: {} + + globals@17.4.0: {} + + gopd@1.2.0: {} + + graceful-fs@4.2.11: {} + + graphql@16.13.2: {} + + has-flag@4.0.0: {} + + has-symbols@1.1.0: {} + + hasown@2.0.2: + dependencies: + function-bind: 1.1.2 + + headers-polyfill@4.0.3: {} + + hermes-estree@0.25.1: {} + + hermes-parser@0.25.1: + dependencies: + hermes-estree: 0.25.1 + + hono@4.12.12: {} + + http-errors@2.0.1: + dependencies: + depd: 2.0.0 + inherits: 2.0.4 + setprototypeof: 1.2.0 + statuses: 2.0.2 + toidentifier: 1.0.1 + + https-proxy-agent@7.0.6: + dependencies: + agent-base: 7.1.4 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + + human-signals@2.1.0: {} + + human-signals@8.0.1: {} + + iconv-lite@0.7.2: + dependencies: + safer-buffer: 2.1.2 + + ignore@5.3.2: {} + + ignore@7.0.5: {} + + import-fresh@3.3.1: + dependencies: + parent-module: 1.0.1 + resolve-from: 4.0.0 + + imurmurhash@0.1.4: {} + + inherits@2.0.4: {} + + ip-address@10.1.0: {} + + ipaddr.js@1.9.1: {} + + is-arrayish@0.2.1: {} + + is-docker@3.0.0: {} + + is-extglob@2.1.1: {} + + is-fullwidth-code-point@3.0.0: 
{} + + is-glob@4.0.3: + dependencies: + is-extglob: 2.1.1 + + is-in-ssh@1.0.0: {} + + is-inside-container@1.0.0: + dependencies: + is-docker: 3.0.0 + + is-interactive@2.0.0: {} + + is-node-process@1.2.0: {} + + is-number@7.0.0: {} + + is-obj@3.0.0: {} + + is-plain-obj@4.1.0: {} + + is-promise@4.0.0: {} + + is-regexp@3.1.0: {} + + is-stream@2.0.1: {} + + is-stream@4.0.1: {} + + is-unicode-supported@1.3.0: {} + + is-unicode-supported@2.1.0: {} + + is-wsl@3.1.1: + dependencies: + is-inside-container: 1.0.0 + + isexe@2.0.0: {} + + isexe@3.1.5: {} + + jiti@2.6.1: {} + + jose@6.2.2: {} + + js-tokens@4.0.0: {} + + js-yaml@4.1.1: + dependencies: + argparse: 2.0.1 + + jsesc@3.1.0: {} + + json-buffer@3.0.1: {} + + json-parse-even-better-errors@2.3.1: {} + + json-schema-traverse@0.4.1: {} + + json-schema-traverse@1.0.0: {} + + json-schema-typed@8.0.2: {} + + json-stable-stringify-without-jsonify@1.0.1: {} + + json5@2.2.3: {} + + jsonfile@6.2.0: + dependencies: + universalify: 2.0.1 + optionalDependencies: + graceful-fs: 4.2.11 + + keyv@4.5.4: + dependencies: + json-buffer: 3.0.1 + + kleur@3.0.3: {} + + kleur@4.1.5: {} + + levn@0.4.1: + dependencies: + prelude-ls: 1.2.1 + type-check: 0.4.0 + + lightningcss-android-arm64@1.32.0: + optional: true + + lightningcss-darwin-arm64@1.32.0: + optional: true + + lightningcss-darwin-x64@1.32.0: + optional: true + + lightningcss-freebsd-x64@1.32.0: + optional: true + + lightningcss-linux-arm-gnueabihf@1.32.0: + optional: true + + lightningcss-linux-arm64-gnu@1.32.0: + optional: true + + lightningcss-linux-arm64-musl@1.32.0: + optional: true + + lightningcss-linux-x64-gnu@1.32.0: + optional: true + + lightningcss-linux-x64-musl@1.32.0: + optional: true + + lightningcss-win32-arm64-msvc@1.32.0: + optional: true + + lightningcss-win32-x64-msvc@1.32.0: + optional: true + + lightningcss@1.32.0: + dependencies: + detect-libc: 2.1.2 + optionalDependencies: + lightningcss-android-arm64: 1.32.0 + lightningcss-darwin-arm64: 1.32.0 + 
lightningcss-darwin-x64: 1.32.0 + lightningcss-freebsd-x64: 1.32.0 + lightningcss-linux-arm-gnueabihf: 1.32.0 + lightningcss-linux-arm64-gnu: 1.32.0 + lightningcss-linux-arm64-musl: 1.32.0 + lightningcss-linux-x64-gnu: 1.32.0 + lightningcss-linux-x64-musl: 1.32.0 + lightningcss-win32-arm64-msvc: 1.32.0 + lightningcss-win32-x64-msvc: 1.32.0 + + lines-and-columns@1.2.4: {} + + locate-path@6.0.0: + dependencies: + p-locate: 5.0.0 + + lodash.merge@4.6.2: {} + + log-symbols@6.0.0: + dependencies: + chalk: 5.6.2 + is-unicode-supported: 1.3.0 + + lru-cache@5.1.1: + dependencies: + yallist: 3.1.1 + + lucide-react@1.8.0(react@19.2.5): + dependencies: + react: 19.2.5 + + magic-string@0.30.21: + dependencies: + '@jridgewell/sourcemap-codec': 1.5.5 + + math-intrinsics@1.1.0: {} + + media-typer@1.1.0: {} + + merge-descriptors@2.0.0: {} + + merge-stream@2.0.0: {} + + merge2@1.4.1: {} + + micromatch@4.0.8: + dependencies: + braces: 3.0.3 + picomatch: 2.3.2 + + mime-db@1.54.0: {} + + mime-types@3.0.2: + dependencies: + mime-db: 1.54.0 + + mimic-fn@2.1.0: {} + + mimic-function@5.0.1: {} + + minimatch@10.2.5: + dependencies: + brace-expansion: 5.0.5 + + minimatch@3.1.5: + dependencies: + brace-expansion: 1.1.13 + + minimist@1.2.8: {} + + ms@2.1.3: {} + + msw@2.13.2(@types/node@24.12.2)(typescript@6.0.2): + dependencies: + '@inquirer/confirm': 5.1.21(@types/node@24.12.2) + '@mswjs/interceptors': 0.41.3 + '@open-draft/deferred-promise': 2.2.0 + '@types/statuses': 2.0.6 + cookie: 1.1.1 + graphql: 16.13.2 + headers-polyfill: 4.0.3 + is-node-process: 1.2.0 + outvariant: 1.4.3 + path-to-regexp: 6.3.0 + picocolors: 1.1.1 + rettime: 0.10.1 + statuses: 2.0.2 + strict-event-emitter: 0.5.1 + tough-cookie: 6.0.1 + type-fest: 5.5.0 + until-async: 3.0.2 + yargs: 17.7.2 + optionalDependencies: + typescript: 6.0.2 + transitivePeerDependencies: + - '@types/node' + + mute-stream@2.0.0: {} + + nanoid@3.3.11: {} + + natural-compare@1.4.0: {} + + negotiator@1.0.0: {} + + node-domexception@1.0.0: {} + + 
node-fetch@3.3.2: + dependencies: + data-uri-to-buffer: 4.0.1 + fetch-blob: 3.2.0 + formdata-polyfill: 4.0.10 + + node-releases@2.0.37: {} + + npm-run-path@4.0.1: + dependencies: + path-key: 3.1.1 + + npm-run-path@6.0.0: + dependencies: + path-key: 4.0.0 + unicorn-magic: 0.3.0 + + object-assign@4.1.1: {} + + object-inspect@1.13.4: {} + + object-treeify@1.1.33: {} + + on-finished@2.4.1: + dependencies: + ee-first: 1.1.1 + + once@1.4.0: + dependencies: + wrappy: 1.0.2 + + onetime@5.1.2: + dependencies: + mimic-fn: 2.1.0 + + onetime@7.0.0: + dependencies: + mimic-function: 5.0.1 + + open@11.0.0: + dependencies: + default-browser: 5.5.0 + define-lazy-prop: 3.0.0 + is-in-ssh: 1.0.0 + is-inside-container: 1.0.0 + powershell-utils: 0.1.0 + wsl-utils: 0.3.1 + + optionator@0.9.4: + dependencies: + deep-is: 0.1.4 + fast-levenshtein: 2.0.6 + levn: 0.4.1 + prelude-ls: 1.2.1 + type-check: 0.4.0 + word-wrap: 1.2.5 + + ora@8.2.0: + dependencies: + chalk: 5.6.2 + cli-cursor: 5.0.0 + cli-spinners: 2.9.2 + is-interactive: 2.0.0 + is-unicode-supported: 2.1.0 + log-symbols: 6.0.0 + stdin-discarder: 0.2.2 + string-width: 7.2.0 + strip-ansi: 7.2.0 + + outvariant@1.4.3: {} + + p-limit@3.1.0: + dependencies: + yocto-queue: 0.1.0 + + p-locate@5.0.0: + dependencies: + p-limit: 3.1.0 + + parent-module@1.0.1: + dependencies: + callsites: 3.1.0 + + parse-json@5.2.0: + dependencies: + '@babel/code-frame': 7.29.0 + error-ex: 1.3.4 + json-parse-even-better-errors: 2.3.1 + lines-and-columns: 1.2.4 + + parse-ms@4.0.0: {} + + parseurl@1.3.3: {} + + path-browserify@1.0.1: {} + + path-exists@4.0.0: {} + + path-key@3.1.1: {} + + path-key@4.0.0: {} + + path-to-regexp@6.3.0: {} + + path-to-regexp@8.4.2: {} + + picocolors@1.1.1: {} + + picomatch@2.3.2: {} + + picomatch@4.0.4: {} + + pkce-challenge@5.0.1: {} + + postcss-selector-parser@7.1.1: + dependencies: + cssesc: 3.0.0 + util-deprecate: 1.0.2 + + postcss@8.5.9: + dependencies: + nanoid: 3.3.11 + picocolors: 1.1.1 + source-map-js: 1.2.1 + + 
powershell-utils@0.1.0: {} + + prelude-ls@1.2.1: {} + + pretty-ms@9.3.0: + dependencies: + parse-ms: 4.0.0 + + prompts@2.4.2: + dependencies: + kleur: 3.0.3 + sisteransi: 1.0.5 + + proxy-addr@2.0.7: + dependencies: + forwarded: 0.2.0 + ipaddr.js: 1.9.1 + + punycode@2.3.1: {} + + qs@6.15.1: + dependencies: + side-channel: 1.1.0 + + queue-microtask@1.2.3: {} + + range-parser@1.2.1: {} + + raw-body@3.0.2: + dependencies: + bytes: 3.1.2 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + unpipe: 1.0.0 + + react-dom@19.2.5(react@19.2.5): + dependencies: + react: 19.2.5 + scheduler: 0.27.0 + + react-router@7.14.0(react-dom@19.2.5(react@19.2.5))(react@19.2.5): + dependencies: + cookie: 1.1.1 + react: 19.2.5 + set-cookie-parser: 2.7.2 + optionalDependencies: + react-dom: 19.2.5(react@19.2.5) + + react@19.2.5: {} + + recast@0.23.11: + dependencies: + ast-types: 0.16.1 + esprima: 4.0.1 + source-map: 0.6.1 + tiny-invariant: 1.3.3 + tslib: 2.8.1 + + require-directory@2.1.1: {} + + require-from-string@2.0.2: {} + + reselect@5.1.1: {} + + resolve-from@4.0.0: {} + + restore-cursor@5.1.0: + dependencies: + onetime: 7.0.0 + signal-exit: 4.1.0 + + rettime@0.10.1: {} + + reusify@1.1.0: {} + + rolldown@1.0.0-rc.15: + dependencies: + '@oxc-project/types': 0.124.0 + '@rolldown/pluginutils': 1.0.0-rc.15 + optionalDependencies: + '@rolldown/binding-android-arm64': 1.0.0-rc.15 + '@rolldown/binding-darwin-arm64': 1.0.0-rc.15 + '@rolldown/binding-darwin-x64': 1.0.0-rc.15 + '@rolldown/binding-freebsd-x64': 1.0.0-rc.15 + '@rolldown/binding-linux-arm-gnueabihf': 1.0.0-rc.15 + '@rolldown/binding-linux-arm64-gnu': 1.0.0-rc.15 + '@rolldown/binding-linux-arm64-musl': 1.0.0-rc.15 + '@rolldown/binding-linux-ppc64-gnu': 1.0.0-rc.15 + '@rolldown/binding-linux-s390x-gnu': 1.0.0-rc.15 + '@rolldown/binding-linux-x64-gnu': 1.0.0-rc.15 + '@rolldown/binding-linux-x64-musl': 1.0.0-rc.15 + '@rolldown/binding-openharmony-arm64': 1.0.0-rc.15 + '@rolldown/binding-wasm32-wasi': 1.0.0-rc.15 + 
'@rolldown/binding-win32-arm64-msvc': 1.0.0-rc.15 + '@rolldown/binding-win32-x64-msvc': 1.0.0-rc.15 + + router@2.2.0: + dependencies: + debug: 4.4.3 + depd: 2.0.0 + is-promise: 4.0.0 + parseurl: 1.3.3 + path-to-regexp: 8.4.2 + transitivePeerDependencies: + - supports-color + + run-applescript@7.1.0: {} + + run-parallel@1.2.0: + dependencies: + queue-microtask: 1.2.3 + + safer-buffer@2.1.2: {} + + scheduler@0.27.0: {} + + semver@6.3.1: {} + + semver@7.7.4: {} + + send@1.2.1: + dependencies: + debug: 4.4.3 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + fresh: 2.0.0 + http-errors: 2.0.1 + mime-types: 3.0.2 + ms: 2.1.3 + on-finished: 2.4.1 + range-parser: 1.2.1 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + + serve-static@2.2.1: + dependencies: + encodeurl: 2.0.0 + escape-html: 1.0.3 + parseurl: 1.3.3 + send: 1.2.1 + transitivePeerDependencies: + - supports-color + + set-cookie-parser@2.7.2: {} + + setprototypeof@1.2.0: {} + + shadcn@4.2.0(@types/node@24.12.2)(typescript@6.0.2): + dependencies: + '@babel/core': 7.29.0 + '@babel/parser': 7.29.2 + '@babel/plugin-transform-typescript': 7.28.6(@babel/core@7.29.0) + '@babel/preset-typescript': 7.28.5(@babel/core@7.29.0) + '@dotenvx/dotenvx': 1.61.0 + '@modelcontextprotocol/sdk': 1.29.0(zod@3.25.76) + '@types/validate-npm-package-name': 4.0.2 + browserslist: 4.28.2 + commander: 14.0.3 + cosmiconfig: 9.0.1(typescript@6.0.2) + dedent: 1.7.2 + deepmerge: 4.3.1 + diff: 8.0.4 + execa: 9.6.1 + fast-glob: 3.3.3 + fs-extra: 11.3.4 + fuzzysort: 3.1.0 + https-proxy-agent: 7.0.6 + kleur: 4.1.5 + msw: 2.13.2(@types/node@24.12.2)(typescript@6.0.2) + node-fetch: 3.3.2 + open: 11.0.0 + ora: 8.2.0 + postcss: 8.5.9 + postcss-selector-parser: 7.1.1 + prompts: 2.4.2 + recast: 0.23.11 + stringify-object: 5.0.0 + tailwind-merge: 3.5.0 + ts-morph: 26.0.0 + tsconfig-paths: 4.2.0 + validate-npm-package-name: 7.0.2 + zod: 3.25.76 + zod-to-json-schema: 3.25.2(zod@3.25.76) + transitivePeerDependencies: + - 
'@cfworker/json-schema' + - '@types/node' + - babel-plugin-macros + - supports-color + - typescript + + shebang-command@2.0.0: + dependencies: + shebang-regex: 3.0.0 + + shebang-regex@3.0.0: {} + + side-channel-list@1.0.1: + dependencies: + es-errors: 1.3.0 + object-inspect: 1.13.4 + + side-channel-map@1.0.1: + dependencies: + call-bound: 1.0.4 + es-errors: 1.3.0 + get-intrinsic: 1.3.0 + object-inspect: 1.13.4 + + side-channel-weakmap@1.0.2: + dependencies: + call-bound: 1.0.4 + es-errors: 1.3.0 + get-intrinsic: 1.3.0 + object-inspect: 1.13.4 + side-channel-map: 1.0.1 + + side-channel@1.1.0: + dependencies: + es-errors: 1.3.0 + object-inspect: 1.13.4 + side-channel-list: 1.0.1 + side-channel-map: 1.0.1 + side-channel-weakmap: 1.0.2 + + signal-exit@3.0.7: {} + + signal-exit@4.1.0: {} + + sisteransi@1.0.5: {} + + source-map-js@1.2.1: {} + + source-map@0.6.1: {} + + statuses@2.0.2: {} + + stdin-discarder@0.2.2: {} + + strict-event-emitter@0.5.1: {} + + string-width@4.2.3: + dependencies: + emoji-regex: 8.0.0 + is-fullwidth-code-point: 3.0.0 + strip-ansi: 6.0.1 + + string-width@7.2.0: + dependencies: + emoji-regex: 10.6.0 + get-east-asian-width: 1.5.0 + strip-ansi: 7.2.0 + + stringify-object@5.0.0: + dependencies: + get-own-enumerable-keys: 1.0.0 + is-obj: 3.0.0 + is-regexp: 3.1.0 + + strip-ansi@6.0.1: + dependencies: + ansi-regex: 5.0.1 + + strip-ansi@7.2.0: + dependencies: + ansi-regex: 6.2.2 + + strip-bom@3.0.0: {} + + strip-final-newline@2.0.0: {} + + strip-final-newline@4.0.0: {} + + strip-json-comments@3.1.1: {} + + supports-color@7.2.0: + dependencies: + has-flag: 4.0.0 + + tabbable@6.4.0: {} + + tagged-tag@1.0.0: {} + + tailwind-merge@3.5.0: {} + + tailwindcss@4.2.2: {} + + tapable@2.3.2: {} + + tiny-invariant@1.3.3: {} + + tinyglobby@0.2.16: + dependencies: + fdir: 6.5.0(picomatch@4.0.4) + picomatch: 4.0.4 + + tldts-core@7.0.28: {} + + tldts@7.0.28: + dependencies: + tldts-core: 7.0.28 + + to-regex-range@5.0.1: + dependencies: + is-number: 7.0.0 + + 
toidentifier@1.0.1: {} + + tough-cookie@6.0.1: + dependencies: + tldts: 7.0.28 + + ts-api-utils@2.5.0(typescript@6.0.2): + dependencies: + typescript: 6.0.2 + + ts-morph@26.0.0: + dependencies: + '@ts-morph/common': 0.27.0 + code-block-writer: 13.0.3 + + tsconfig-paths@4.2.0: + dependencies: + json5: 2.2.3 + minimist: 1.2.8 + strip-bom: 3.0.0 + + tslib@2.8.1: {} + + tw-animate-css@1.4.0: {} + + type-check@0.4.0: + dependencies: + prelude-ls: 1.2.1 + + type-fest@5.5.0: + dependencies: + tagged-tag: 1.0.0 + + type-is@2.0.1: + dependencies: + content-type: 1.0.5 + media-typer: 1.1.0 + mime-types: 3.0.2 + + typescript-eslint@8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2): + dependencies: + '@typescript-eslint/eslint-plugin': 8.58.1(@typescript-eslint/parser@8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2))(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + '@typescript-eslint/parser': 8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + '@typescript-eslint/typescript-estree': 8.58.1(typescript@6.0.2) + '@typescript-eslint/utils': 8.58.1(eslint@9.39.4(jiti@2.6.1))(typescript@6.0.2) + eslint: 9.39.4(jiti@2.6.1) + typescript: 6.0.2 + transitivePeerDependencies: + - supports-color + + typescript@6.0.2: {} + + undici-types@7.16.0: {} + + unicorn-magic@0.3.0: {} + + universalify@2.0.1: {} + + unpipe@1.0.0: {} + + until-async@3.0.2: {} + + update-browserslist-db@1.2.3(browserslist@4.28.2): + dependencies: + browserslist: 4.28.2 + escalade: 3.2.0 + picocolors: 1.1.1 + + uri-js@4.4.1: + dependencies: + punycode: 2.3.1 + + use-sync-external-store@1.6.0(react@19.2.5): + dependencies: + react: 19.2.5 + + util-deprecate@1.0.2: {} + + validate-npm-package-name@7.0.2: {} + + vary@1.1.2: {} + + vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1): + dependencies: + lightningcss: 1.32.0 + picomatch: 4.0.4 + postcss: 8.5.9 + rolldown: 1.0.0-rc.15 + tinyglobby: 0.2.16 + optionalDependencies: + '@types/node': 24.12.2 + fsevents: 2.3.3 + jiti: 2.6.1 + + web-streams-polyfill@3.3.3: {} + + 
which@2.0.2: + dependencies: + isexe: 2.0.0 + + which@4.0.0: + dependencies: + isexe: 3.1.5 + + word-wrap@1.2.5: {} + + wrap-ansi@6.2.0: + dependencies: + ansi-styles: 4.3.0 + string-width: 4.2.3 + strip-ansi: 6.0.1 + + wrap-ansi@7.0.0: + dependencies: + ansi-styles: 4.3.0 + string-width: 4.2.3 + strip-ansi: 6.0.1 + + wrappy@1.0.2: {} + + wsl-utils@0.3.1: + dependencies: + is-wsl: 3.1.1 + powershell-utils: 0.1.0 + + y18n@5.0.8: {} + + yallist@3.1.1: {} + + yargs-parser@21.1.1: {} + + yargs@17.7.2: + dependencies: + cliui: 8.0.1 + escalade: 3.2.0 + get-caller-file: 2.0.5 + require-directory: 2.1.1 + string-width: 4.2.3 + y18n: 5.0.8 + yargs-parser: 21.1.1 + + yocto-queue@0.1.0: {} + + yocto-spinner@1.1.0: + dependencies: + yoctocolors: 2.1.2 + + yoctocolors-cjs@2.1.3: {} + + yoctocolors@2.1.2: {} + + zod-to-json-schema@3.25.2(zod@3.25.76): + dependencies: + zod: 3.25.76 + + zod-validation-error@4.0.2(zod@4.3.6): + dependencies: + zod: 4.3.6 + + zod@3.25.76: {} + + zod@4.3.6: {} diff --git a/customers/vm-troubleshooting-dashboard/frontend/public/favicon.svg b/customers/vm-troubleshooting-dashboard/frontend/public/favicon.svg new file mode 100644 index 0000000..6893eb1 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/public/favicon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/customers/vm-troubleshooting-dashboard/frontend/public/icons.svg b/customers/vm-troubleshooting-dashboard/frontend/public/icons.svg new file mode 100644 index 0000000..e952219 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/public/icons.svg @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts b/customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts new file mode 100644 index 0000000..e90b8b9 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts @@ -0,0 +1,74 @@ +import { useQuery, useMutation, 
useQueryClient } from "@tanstack/react-query" +import type { + ArchiveDetail, + ArchiveListResponse, + ArchiveSummary, +} from "@/types" +import { fetchJson, apiUrl } from "./client" + +export function useArchives(page = 1, pageSize = 25) { + return useQuery({ + queryKey: ["archives", page, pageSize], + queryFn: () => { + const params = new URLSearchParams({ + page: String(page), + page_size: String(pageSize), + }) + return fetchJson( + apiUrl(`/archives?${params.toString()}`), + ) + }, + }) +} + +export function useArchive(archiveId: string) { + return useQuery({ + queryKey: ["archive", archiveId], + queryFn: () => + fetchJson( + apiUrl(`/archives/${encodeURIComponent(archiveId)}`), + ), + }) +} + +export function useUploadArchive() { + const queryClient = useQueryClient() + return useMutation({ + mutationFn: async (file: File) => { + const formData = new FormData() + formData.append("archive", file) + return fetchJson<{ + archive: ArchiveSummary + redirect_url: string + }>(apiUrl("/archives"), { method: "POST", body: formData }) + }, + onSuccess: () => { + void queryClient.invalidateQueries({ queryKey: ["archives"] }) + }, + }) +} + +export function useDeleteArchive() { + const queryClient = useQueryClient() + return useMutation({ + mutationFn: async (archiveId: string) => { + const response = await fetch( + apiUrl(`/archives/${encodeURIComponent(archiveId)}`), + { method: "DELETE" }, + ) + if (!response.ok && response.status !== 204) { + let message = response.statusText + try { + const payload = (await response.json()) as { error?: string } + message = payload.error ?? 
message + } catch { + // keep status text + } + throw new Error(message) + } + }, + onSuccess: () => { + void queryClient.invalidateQueries({ queryKey: ["archives"] }) + }, + }) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/api/artifacts.ts b/customers/vm-troubleshooting-dashboard/frontend/src/api/artifacts.ts new file mode 100644 index 0000000..4830751 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/api/artifacts.ts @@ -0,0 +1,30 @@ +import { useQuery } from "@tanstack/react-query" +import type { ArtifactPreviewResponse, ArtifactsResponse } from "@/types" +import { fetchJson, apiUrl } from "./client" +import { encodePath } from "@/lib/utils" + +export function useArtifacts(archiveId: string) { + return useQuery({ + queryKey: ["artifacts", archiveId], + queryFn: () => + fetchJson( + apiUrl(`/archives/${encodeURIComponent(archiveId)}/artifacts`), + ), + }) +} + +export function useArtifactContent( + archiveId: string, + path: string | undefined, +) { + return useQuery({ + queryKey: ["artifact-content", archiveId, path], + queryFn: () => + fetchJson( + apiUrl( + `/archives/${encodeURIComponent(archiveId)}/artifacts/view/${encodePath(path!)}`, + ), + ), + enabled: !!path, + }) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/api/client.ts b/customers/vm-troubleshooting-dashboard/frontend/src/api/client.ts new file mode 100644 index 0000000..3eea1ca --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/api/client.ts @@ -0,0 +1,31 @@ +const baseUrl = "/api/v1" + +export class ApiError extends Error { + status: number + + constructor(status: number, message: string) { + super(message) + this.name = "ApiError" + this.status = status + } +} + +export async function fetchJson( + input: RequestInfo | URL, + init?: RequestInit, +): Promise { + const response = await fetch(input, init) + if (!response.ok) { + let message = response.statusText + try { + const payload = (await response.json()) as 
{ error?: string } + message = payload.error ?? message + } catch { /* use status text */ } + throw new ApiError(response.status, message) + } + return (await response.json()) as T +} + +export function apiUrl(path: string) { + return `${baseUrl}${path}` +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts b/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts new file mode 100644 index 0000000..29f648b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts @@ -0,0 +1,43 @@ +import { keepPreviousData, useQuery } from "@tanstack/react-query" +import type { IssueRecord, IssuesResponse } from "@/types" +import type { ArchiveSummary } from "@/types" +import { fetchJson, apiUrl } from "./client" + +export type IssueFilters = { + severity?: string + confidence?: string + category?: string + collector?: string + code?: string + q?: string +} + +export function useIssues(archiveId: string, filters: IssueFilters) { + return useQuery({ + queryKey: ["issues", archiveId, filters], + queryFn: () => { + const search = new URLSearchParams() + for (const [key, value] of Object.entries(filters)) { + if (value) search.set(key, value) + } + return fetchJson( + apiUrl( + `/archives/${encodeURIComponent(archiveId)}/issues?${search.toString()}`, + ), + ) + }, + placeholderData: keepPreviousData, + }) +} + +export function useIssueDetail(archiveId: string, issueId: string) { + return useQuery({ + queryKey: ["issue", archiveId, issueId], + queryFn: () => + fetchJson<{ archive: ArchiveSummary; issue: IssueRecord }>( + apiUrl( + `/archives/${encodeURIComponent(archiveId)}/issues/${encodeURIComponent(issueId)}`, + ), + ), + }) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx new file mode 100644 index 0000000..6f309d8 --- /dev/null +++ 
b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx @@ -0,0 +1,311 @@ +import { useRef, useState, useEffect, useMemo } from "react" +import { useParams, useNavigate } from "react-router" +import { Card, CardContent } from "@/components/ui/card" +import { buttonVariants } from "@/components/ui/button" +import { ArchiveHeader } from "@/components/layout/ArchiveHeader" +import { useArchive } from "@/api/archives" +import { useArtifacts, useArtifactContent } from "@/api/artifacts" +import { formatBytes, encodeSegment, encodePath, cn } from "@/lib/utils" +import { apiUrl } from "@/api/client" +import { useVirtualizer } from "@tanstack/react-virtual" +import { Folder, File, Download, ChevronRight, FileWarning } from "lucide-react" +import type { ArtifactRecord } from "@/types" + +export function ArtifactBrowserPage() { + const { archiveId, "*": splat } = useParams<{ + archiveId: string + "*": string + }>() + const navigate = useNavigate() + const artifactPath = splat ?? "" + + const { data: archive } = useArchive(archiveId!) + const { data: artifactsData, isLoading } = useArtifacts(archiveId!) + const artifacts = useMemo( + () => artifactsData?.items ?? [], + [artifactsData?.items], + ) + + const [selectedPath, setSelectedPath] = useState(artifactPath) + + useEffect(() => { + setSelectedPath(artifactPath) + }, [artifactPath]) + + useEffect(() => { + if (!selectedPath && artifacts.length > 0) { + const first = + artifacts.find((a) => a.exists_on_disk)?.path ?? 
artifacts[0].path + setSelectedPath(first) + } + }, [artifacts.length, selectedPath]) // eslint-disable-line react-hooks/exhaustive-deps + + const { data: preview, isLoading: previewLoading } = useArtifactContent( + archiveId!, + selectedPath || undefined, + ) + + const selectArtifact = (path: string) => { + setSelectedPath(path) + navigate( + `/archives/${encodeSegment(archiveId!)}/artifacts/${encodePath(path)}`, + { replace: true }, + ) + } + + const tree = useMemo(() => buildTree(artifacts), [artifacts]) + + const previewLines = useMemo( + () => preview?.content?.split(/\r?\n/) ?? [], + [preview?.content], + ) + + if (isLoading) { + return ( +
+ Loading artifacts... +
+ ) + } + + return ( +
+ {archive ? ( + + ) : null} + +
+ {/* File tree */} + + +

+ Files +

+
+ {tree.map((node) => ( + + ))} +
+
+
+ + {/* Preview */} + + +
+ + {previewLoading ? ( +
+ Loading preview... +
+ ) : preview?.binary ? ( +
+ +

+ Binary file ({formatBytes(preview.size_bytes)}) +

+ + Download file + +
+ ) : preview?.content != null ? ( +
+ {preview.truncated ? ( +
+ File truncated — showing{" "} + {previewLines.length.toLocaleString()} lines of{" "} + ~{formatBytes(preview.size_bytes)}. Download for the full + file. +
+ ) : null} + +
+ ) : !selectedPath ? ( +
+ +

+ Choose a file from the tree to preview it. +

+
+ ) : null} + + +
+
+ ) +} + +// --- Tree logic --- + +type TreeNode = { + name: string + path: string + kind: "file" | "folder" + exists_on_disk: boolean + children: TreeNode[] +} + +function TreeNodeView({ + node, + selectedPath, + onSelect, +}: { + node: TreeNode + selectedPath: string + onSelect: (path: string) => void +}) { + if (node.kind === "file") { + return ( + + ) + } + + return ( +
+ + + + {node.name} + +
+ {node.children.map((child) => ( + + ))} +
+
+ ) +} + +function VirtualLogView({ lines }: { lines: string[] }) { + const parentRef = useRef(null) + const virtualizer = useVirtualizer({ + count: lines.length, + getScrollElement: () => parentRef.current, + estimateSize: () => 20, + overscan: 20, + }) + + return ( +
+
+ {virtualizer.getVirtualItems().map((virtualRow) => ( +
+ + {virtualRow.index + 1} + + + {lines[virtualRow.index].length > 0 + ? lines[virtualRow.index] + : " "} + +
+ ))} +
+
+ ) +} + +function buildTree(artifacts: ArtifactRecord[]): TreeNode[] { + const root: TreeNode[] = [] + for (const artifact of artifacts) { + const segments = artifact.path.split("/").filter(Boolean) + if (segments.length === 0) continue + let current = root + let currentPath = "" + for (let i = 0; i < segments.length; i++) { + const name = segments[i] + currentPath = currentPath ? `${currentPath}/${name}` : name + const isLeaf = i === segments.length - 1 + const existing = current.find((n) => n.name === name) + if (existing) { + if (isLeaf) existing.exists_on_disk = artifact.exists_on_disk + current = existing.children + continue + } + const node: TreeNode = { + name, + path: currentPath, + kind: isLeaf ? "file" : "folder", + exists_on_disk: isLeaf ? artifact.exists_on_disk : true, + children: [], + } + current.push(node) + current = node.children + } + } + sortTree(root) + return root +} + +function sortTree(nodes: TreeNode[]) { + nodes.sort((a, b) => { + if (a.kind !== b.kind) return a.kind === "folder" ? 
-1 : 1 + return a.name.localeCompare(b.name) + }) + for (const n of nodes) sortTree(n.children) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx new file mode 100644 index 0000000..1197655 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -0,0 +1,283 @@ +import { useParams, Link } from "react-router" +import { Card, CardContent } from "@/components/ui/card" +import { SeverityBadge } from "@/components/ui/severity-badge" +import { ArchiveHeader } from "@/components/layout/ArchiveHeader" +import { useArchive } from "@/api/archives" +import { useIssueDetail } from "@/api/issues" +import { KV } from "@/components/ui/kv" +import { shortFingerprint, encodeSegment, encodePath, sortFindings } from "@/lib/utils" +import { FileText, ArrowRight } from "lucide-react" + +export function IssueDetailPage() { + const { archiveId, issueId } = useParams<{ + archiveId: string + issueId: string + }>() + const { data: archive } = useArchive(archiveId!) + const { data, isLoading, error } = useIssueDetail(archiveId!, issueId!) + + if (isLoading) { + return ( +
+ Loading issue... +
+ ) + } + if (error || !data) { + return ( +
+ {error ? String(error) : "Issue not found"} +
+ ) + } + + const issue = data.issue + const findings = sortFindings(issue.triage_findings ?? []) + const primaryFinding = findings[0] + const relatedArtifacts = issue.related_artifact_paths ?? [] + const triageArtifacts = uniquePaths( + findings.flatMap((finding) => finding.source_artifacts ?? []), + ) + const suggestedArtifacts = issue.suggested_artifact_paths ?? [] + const evidencePaths = + relatedArtifacts.length > 0 + ? relatedArtifacts + : triageArtifacts.length > 0 + ? triageArtifacts + : suggestedArtifacts + const evidenceLabel = + relatedArtifacts.length > 0 + ? "Related artifacts" + : triageArtifacts.length > 0 + ? "Triage evidence" + : "Suggested evidence" + + return ( +
+ {archive ? ( + + ) : null} + + {/* Issue header */} +
+
+

+ {primaryFinding?.title || issue.message} +

+ {primaryFinding ? ( +

+ {issue.message} +

+ ) : null} +

+ {issue.collector} · {issue.code} · {issue.category} +

+
+ +
+ +
+ {/* Main content */} +
+ {/* What happened */} + + +

+ What happened +

+ {primaryFinding ? ( + <> +

+ {primaryFinding.title} +

+

+ {primaryFinding.description} +

+ + ) : ( +

+ {issue.message} +

+ )} + {findings.length === 0 ? ( +
+ No triage match for this fingerprint. Showing the best + evidence we could find from the archive. +
+ ) : null} +
+
+ + {/* How serious */} + + +

+ Classification +

+
+ + + {issue.confidence} confidence + + + {issue.category} + +
+
+
+ + {/* What to do next */} + {primaryFinding?.action ? ( + + +

+ Recommended action +

+
+ +

+ {primaryFinding.action} +

+
+
+
+ ) : null} + + {/* Supporting findings */} + {findings.length > 1 ? ( +
+

+ Supporting findings +

+ {findings.slice(1).map((finding, i) => ( + + +
+
+

+ {finding.title} +

+

+ {finding.code} · {finding.category} +

+
+ +
+

+ {finding.description} +

+ {finding.action ? ( +

+ + Next step: + {" "} + + {finding.action} + +

+ ) : null} + {finding.evidence?.length ? ( +
+
    + {finding.evidence.map((ev, j) => ( +
  • {ev}
  • + ))} +
+
+ ) : null} + {finding.source_artifacts?.length ? ( +
+

+ Source artifacts +

+
+ {finding.source_artifacts.map((path) => ( + + {path} + + ))} +
+
+ ) : null} +
+
+ ))} +
+ ) : null} +
+ + {/* Sidebar */} +
+ + +

+ Metadata +

+
+ + + + + +
+
+
+ + {/* Related artifacts */} + + +

+ {evidenceLabel} +

+ {relatedArtifacts.length === 0 && triageArtifacts.length === 0 && suggestedArtifacts.length > 0 ? ( +

+ Derived from the archive because no explicit triage link was + available. +

+ ) : null} + {evidencePaths.length === 0 ? ( +

+ No artifact links on this issue. +

+ ) : ( +
+ {evidencePaths.map((path) => ( + + + {path} + + ))} +
+ )} + {triageArtifacts.length > 0 && relatedArtifacts.length === 0 ? ( +

+ Showing artifacts mentioned by the triage findings. +

+ ) : null} +
+
+
+
+
+ ) +} + +function uniquePaths(paths: string[]) { + return Array.from(new Set(paths.filter(Boolean))) +} + diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx new file mode 100644 index 0000000..b33215b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx @@ -0,0 +1,273 @@ +import { useEffect, useMemo, useState } from "react" +import { useParams, useSearchParams, useNavigate } from "react-router" +import { Card } from "@/components/ui/card" +import { Input } from "@/components/ui/input" +import { Select, type SelectOption } from "@/components/ui/select" +import { Button } from "@/components/ui/button" +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, +} from "@/components/ui/table" +import { SeverityBadge } from "@/components/ui/severity-badge" +import { ArchiveHeader } from "@/components/layout/ArchiveHeader" +import { useArchive } from "@/api/archives" +import { useIssues, type IssueFilters } from "@/api/issues" +import { encodeSegment, categoryLabel, primaryFindingTitle } from "@/lib/utils" +import { Search, X } from "lucide-react" + +export function IssuesPage() { + const { archiveId } = useParams<{ archiveId: string }>() + const [searchParams] = useSearchParams() + const navigate = useNavigate() + + const committedFilters: IssueFilters = { + severity: searchParams.get("severity") ?? "", + confidence: searchParams.get("confidence") ?? "", + category: searchParams.get("category") ?? "", + collector: searchParams.get("collector") ?? "", + code: searchParams.get("code") ?? "", + q: searchParams.get("q") ?? "", + } + + const [filters, setFilters] = useState(committedFilters) + + const { data: archive, isLoading: archiveLoading } = useArchive(archiveId!) 
+ const { + data: issuesData, + isLoading: issuesLoading, + error, + } = useIssues(archiveId!, committedFilters) + const issues = issuesData?.items ?? [] + const isLoading = archiveLoading || issuesLoading + + // Build dropdown options from archive issues, narrowed by the OTHER active filters. + const allIssues = archive?.issues ?? [] + + const severityOptions = useMemo(() => { + const filtered = allIssues.filter((i) => { + if (filters.category && i.category !== filters.category) return false + if (filters.q) { + const q = filters.q.toLowerCase() + const title = primaryFindingTitle(i.triage_findings) ?? "" + if (!`${i.message} ${i.code} ${i.collector} ${title}`.toLowerCase().includes(q)) return false + } + return true + }) + const seen = new Set(filtered.map((i) => i.severity.toLowerCase())) + const order = ["critical", "warning", "info"] + const opts: SelectOption[] = [{ value: "", label: "All severities" }] + for (const s of order) { + if (seen.has(s)) { + opts.push({ value: s, label: s.charAt(0).toUpperCase() + s.slice(1) }) + } + } + return opts + }, [allIssues, filters.category, filters.q]) + + const sourceOptions = useMemo(() => { + const filtered = allIssues.filter((i) => { + if (filters.severity && i.severity.toLowerCase() !== filters.severity.toLowerCase()) return false + if (filters.q) { + const q = filters.q.toLowerCase() + const title = primaryFindingTitle(i.triage_findings) ?? 
"" + if (!`${i.message} ${i.code} ${i.collector} ${title}`.toLowerCase().includes(q)) return false + } + return true + }) + const seen = new Map() + for (const issue of filtered) { + if (!seen.has(issue.category)) { + seen.set(issue.category, categoryLabel(issue.category)) + } + } + const sorted = [...seen.entries()].sort((a, b) => + a[1].localeCompare(b[1]), + ) + return [ + { value: "", label: "All sources" }, + ...sorted.map(([value, label]) => ({ value, label })), + ] + }, [allIssues, filters.severity, filters.q]) + + // Auto-reset a dropdown if its selected value was eliminated by the other filters. + useEffect(() => { + const sevValues = new Set(severityOptions.map((o) => o.value)) + const srcValues = new Set(sourceOptions.map((o) => o.value)) + const sevStale = !!filters.severity && !sevValues.has(filters.severity) + const srcStale = !!filters.category && !srcValues.has(filters.category) + if (sevStale || srcStale) { + const next = { + ...filters, + severity: sevStale ? "" : filters.severity, + category: srcStale ? "" : filters.category, + } + setFilters(next) + commitFilters(next) + } + }, [severityOptions, sourceOptions]) + + const commitFilters = (next: IssueFilters) => { + const params = new URLSearchParams() + for (const [key, value] of Object.entries(next)) { + if (value) params.set(key, value) + } + const suffix = params.toString() ? `?${params.toString()}` : "" + navigate( + `/archives/${encodeSegment(archiveId!)}/issues${suffix}`, + { replace: true }, + ) + } + + const applyFilters = (e: React.FormEvent) => { + e.preventDefault() + commitFilters(filters) + } + + const clearFilters = () => { + const cleared: IssueFilters = { + severity: "", + confidence: "", + category: "", + collector: "", + code: "", + q: "", + } + setFilters(cleared) + commitFilters(cleared) + } + + const hasActiveFilters = Object.values(filters).some(Boolean) + + if (isLoading) { + return ( +
+ Loading issues... +
+ ) + } + + return ( +
+ {archive ? ( + + ) : null} + + {/* Filter bar */} +
+
+
+ + setFilters({ ...filters, q: e.target.value })} + className="border-border bg-card pl-8 focus-visible:border-ring" + /> +
+ { + const next = { ...filters, category } + setFilters(next) + commitFilters(next) + }} + className="w-[140px] border-border bg-card focus-visible:border-ring" + /> + +
+ {hasActiveFilters ? ( +

+ Showing {issues.length} of {allIssues.length} issues +

+ ) : null} +
+ + {/* Results */} + {error ? ( +
+ {String(error)} +
+ ) : issues.length === 0 ? ( +
+

No matching issues

+

+ Try a broader search or clear filters. +

+
+ ) : ( + + + + + Severity + Source + Issue + + + + {issues.map((issue) => { + const title = primaryFindingTitle(issue.triage_findings) + return ( + + navigate( + `/archives/${encodeSegment(archiveId!)}/issues/${encodeSegment(issue.id)}`, + ) + } + > + + + + + {categoryLabel(issue.category)} + + + + {title || issue.message} + + {title ? ( + + {issue.message} + + ) : null} + + + ) + })} + +
+
+ )} +
+ ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx new file mode 100644 index 0000000..0b7b238 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx @@ -0,0 +1,57 @@ +import { Outlet, Link, useLocation } from "react-router" +import { cn } from "@/lib/utils" + +export function AppShell() { + const location = useLocation() + const isHome = location.pathname === "/" + + return ( +
+
+
+ + + Dx + + + Diagnostics + + + +
+
+ +
+ +
+
+ ) +} + +function NavLink({ + to, + active, + children, +}: { + to: string + active: boolean + children: React.ReactNode +}) { + return ( + + {children} + + ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/ArchiveHeader.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/ArchiveHeader.tsx new file mode 100644 index 0000000..b1810cc --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/ArchiveHeader.tsx @@ -0,0 +1,103 @@ +import { Link, useNavigate } from "react-router" +import { Button } from "@/components/ui/button" +import { SeverityBadge } from "@/components/ui/severity-badge" +import { useDeleteArchive } from "@/api/archives" +import { overallSeverity, encodeSegment } from "@/lib/utils" +import { cn } from "@/lib/utils" +import type { ArchiveSummary } from "@/types" +import { ChevronRight, Trash2 } from "lucide-react" + +type Tab = "overview" | "issues" | "artifacts" + +export function ArchiveHeader({ + summary, + activeTab, +}: { + summary: ArchiveSummary + activeTab: Tab +}) { + const navigate = useNavigate() + const deleteArchive = useDeleteArchive() + const id = encodeSegment(summary.archive_id) + + const handleDelete = () => { + if (!window.confirm(`Delete archive ${summary.archive_id}? This cannot be undone.`)) return + deleteArchive.mutate(summary.archive_id, { + onSuccess: () => navigate("/"), + }) + } + + return ( +
+ + +
+
+

+ {summary.hostname} +

+

+ {summary.archive_id} +

+
+
+ + +
+
+ +
+ + Overview + + + Issues + + + Artifacts + +
+
+ ) +} + +function TabLink({ + to, + active, + children, +}: { + to: string + active: boolean + children: React.ReactNode +}) { + return ( + + {children} + + ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/badge.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/badge.tsx new file mode 100644 index 0000000..b20959d --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/badge.tsx @@ -0,0 +1,52 @@ +import { mergeProps } from "@base-ui/react/merge-props" +import { useRender } from "@base-ui/react/use-render" +import { cva, type VariantProps } from "class-variance-authority" + +import { cn } from "@/lib/utils" + +const badgeVariants = cva( + "group/badge inline-flex h-5 w-fit shrink-0 items-center justify-center gap-1 overflow-hidden rounded-4xl border border-transparent px-2 py-0.5 text-xs font-medium whitespace-nowrap transition-all focus-visible:border-ring focus-visible:ring-[3px] focus-visible:ring-ring/50 has-data-[icon=inline-end]:pr-1.5 has-data-[icon=inline-start]:pl-1.5 aria-invalid:border-destructive aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 [&>svg]:pointer-events-none [&>svg]:size-3!", + { + variants: { + variant: { + default: "bg-primary text-primary-foreground [a]:hover:bg-primary/80", + secondary: + "bg-secondary text-secondary-foreground [a]:hover:bg-secondary/80", + destructive: + "bg-destructive/10 text-destructive focus-visible:ring-destructive/20 dark:bg-destructive/20 dark:focus-visible:ring-destructive/40 [a]:hover:bg-destructive/20", + outline: + "border-border text-foreground [a]:hover:bg-muted [a]:hover:text-muted-foreground", + ghost: + "hover:bg-muted hover:text-muted-foreground dark:hover:bg-muted/50", + link: "text-primary underline-offset-4 hover:underline", + }, + }, + defaultVariants: { + variant: "default", + }, + } +) + +function Badge({ + className, + variant = "default", + render, + ...props +}: 
useRender.ComponentProps<"span"> & VariantProps) { + return useRender({ + defaultTagName: "span", + props: mergeProps<"span">( + { + className: cn(badgeVariants({ variant }), className), + }, + props + ), + render, + state: { + slot: "badge", + variant, + }, + }) +} + +export { Badge, badgeVariants } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/button.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/button.tsx new file mode 100644 index 0000000..09df753 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/button.tsx @@ -0,0 +1,58 @@ +import { Button as ButtonPrimitive } from "@base-ui/react/button" +import { cva, type VariantProps } from "class-variance-authority" + +import { cn } from "@/lib/utils" + +const buttonVariants = cva( + "group/button inline-flex shrink-0 items-center justify-center rounded-lg border border-transparent bg-clip-padding text-sm font-medium whitespace-nowrap transition-all outline-none select-none focus-visible:border-ring focus-visible:ring-3 focus-visible:ring-ring/50 active:not-aria-[haspopup]:translate-y-px disabled:pointer-events-none disabled:opacity-50 aria-invalid:border-destructive aria-invalid:ring-3 aria-invalid:ring-destructive/20 dark:aria-invalid:border-destructive/50 dark:aria-invalid:ring-destructive/40 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4", + { + variants: { + variant: { + default: "bg-primary text-primary-foreground [a]:hover:bg-primary/80", + outline: + "border-border bg-background hover:bg-muted hover:text-foreground aria-expanded:bg-muted aria-expanded:text-foreground dark:border-input dark:bg-input/30 dark:hover:bg-input/50", + secondary: + "bg-secondary text-secondary-foreground hover:bg-secondary/80 aria-expanded:bg-secondary aria-expanded:text-secondary-foreground", + ghost: + "hover:bg-muted hover:text-foreground aria-expanded:bg-muted aria-expanded:text-foreground 
dark:hover:bg-muted/50", + destructive: + "bg-destructive/10 text-destructive hover:bg-destructive/20 focus-visible:border-destructive/40 focus-visible:ring-destructive/20 dark:bg-destructive/20 dark:hover:bg-destructive/30 dark:focus-visible:ring-destructive/40", + link: "text-primary underline-offset-4 hover:underline", + }, + size: { + default: + "h-8 gap-1.5 px-2.5 has-data-[icon=inline-end]:pr-2 has-data-[icon=inline-start]:pl-2", + xs: "h-6 gap-1 rounded-[min(var(--radius-md),10px)] px-2 text-xs in-data-[slot=button-group]:rounded-lg has-data-[icon=inline-end]:pr-1.5 has-data-[icon=inline-start]:pl-1.5 [&_svg:not([class*='size-'])]:size-3", + sm: "h-7 gap-1 rounded-[min(var(--radius-md),12px)] px-2.5 text-[0.8rem] in-data-[slot=button-group]:rounded-lg has-data-[icon=inline-end]:pr-1.5 has-data-[icon=inline-start]:pl-1.5 [&_svg:not([class*='size-'])]:size-3.5", + lg: "h-9 gap-1.5 px-2.5 has-data-[icon=inline-end]:pr-2 has-data-[icon=inline-start]:pl-2", + icon: "size-8", + "icon-xs": + "size-6 rounded-[min(var(--radius-md),10px)] in-data-[slot=button-group]:rounded-lg [&_svg:not([class*='size-'])]:size-3", + "icon-sm": + "size-7 rounded-[min(var(--radius-md),12px)] in-data-[slot=button-group]:rounded-lg", + "icon-lg": "size-9", + }, + }, + defaultVariants: { + variant: "default", + size: "default", + }, + } +) + +function Button({ + className, + variant = "default", + size = "default", + ...props +}: ButtonPrimitive.Props & VariantProps) { + return ( + + ) +} + +export { Button, buttonVariants } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/card.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/card.tsx new file mode 100644 index 0000000..3b77793 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/card.tsx @@ -0,0 +1,103 @@ +import * as React from "react" + +import { cn } from "@/lib/utils" + +function Card({ + className, + size = "default", + ...props +}: 
React.ComponentProps<"div"> & { size?: "default" | "sm" }) { + return ( +
img:first-child]:pt-0 data-[size=sm]:gap-3 data-[size=sm]:py-3 data-[size=sm]:has-data-[slot=card-footer]:pb-0 *:[img:first-child]:rounded-t-xl *:[img:last-child]:rounded-b-xl", + className + )} + {...props} + /> + ) +} + +function CardHeader({ className, ...props }: React.ComponentProps<"div">) { + return ( +
+ ) +} + +function CardTitle({ className, ...props }: React.ComponentProps<"div">) { + return ( +
+ ) +} + +function CardDescription({ className, ...props }: React.ComponentProps<"div">) { + return ( +
+ ) +} + +function CardAction({ className, ...props }: React.ComponentProps<"div">) { + return ( +
+ ) +} + +function CardContent({ className, ...props }: React.ComponentProps<"div">) { + return ( +
+ ) +} + +function CardFooter({ className, ...props }: React.ComponentProps<"div">) { + return ( +
+ ) +} + +export { + Card, + CardHeader, + CardFooter, + CardTitle, + CardAction, + CardDescription, + CardContent, +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/input.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/input.tsx new file mode 100644 index 0000000..7d21bab --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/input.tsx @@ -0,0 +1,20 @@ +import * as React from "react" +import { Input as InputPrimitive } from "@base-ui/react/input" + +import { cn } from "@/lib/utils" + +function Input({ className, type, ...props }: React.ComponentProps<"input">) { + return ( + + ) +} + +export { Input } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx new file mode 100644 index 0000000..7ae1ec8 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx @@ -0,0 +1,8 @@ +export function KV({ label, value }: { label: string; value: string }) { + return ( +
+

{label}

+

{value}

+
+ ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/pill.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/pill.tsx new file mode 100644 index 0000000..33d9916 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/pill.tsx @@ -0,0 +1,7 @@ +export function Pill({ children }: { children: React.ReactNode }) { + return ( + + {children} + + ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/select.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/select.tsx new file mode 100644 index 0000000..1aff289 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/select.tsx @@ -0,0 +1,48 @@ +import * as React from "react" +import { ChevronDown } from "lucide-react" + +import { cn } from "@/lib/utils" + +interface SelectOption { + value: string + label: string +} + +function Select({ + options, + value, + onChange, + className, + ...props +}: { + options: SelectOption[] + value: string + onChange: (value: string) => void +} & Omit, "value" | "onChange">) { + return ( +
+ + +
+ ) +} + +export { Select, type SelectOption } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx new file mode 100644 index 0000000..8ee0b5b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx @@ -0,0 +1,31 @@ +import { cn } from "@/lib/utils" + +const styles: Record = { + critical: "bg-severity-critical-muted text-severity-critical", + warning: "bg-severity-warning-muted text-severity-warning", + info: "bg-severity-info-muted text-severity-info", + ok: "bg-success-muted text-success", +} + +export function SeverityBadge({ + severity, + label, + className, +}: { + severity: string + label?: string + className?: string +}) { + const key = severity.toLowerCase() + return ( + + {label ?? severity} + + ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/table.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/table.tsx new file mode 100644 index 0000000..ac9585e --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/table.tsx @@ -0,0 +1,114 @@ +import * as React from "react" + +import { cn } from "@/lib/utils" + +function Table({ className, ...props }: React.ComponentProps<"table">) { + return ( +
+ + + ) +} + +function TableHeader({ className, ...props }: React.ComponentProps<"thead">) { + return ( + + ) +} + +function TableBody({ className, ...props }: React.ComponentProps<"tbody">) { + return ( + + ) +} + +function TableFooter({ className, ...props }: React.ComponentProps<"tfoot">) { + return ( + tr]:last:border-b-0", + className + )} + {...props} + /> + ) +} + +function TableRow({ className, ...props }: React.ComponentProps<"tr">) { + return ( + + ) +} + +function TableHead({ className, ...props }: React.ComponentProps<"th">) { + return ( +
+ ) +} + +function TableCell({ className, ...props }: React.ComponentProps<"td">) { + return ( + + ) +} + +function TableCaption({ + className, + ...props +}: React.ComponentProps<"caption">) { + return ( +
+ ) +} + +export { + Table, + TableHeader, + TableBody, + TableFooter, + TableHead, + TableRow, + TableCell, + TableCaption, +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/tabs.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/tabs.tsx new file mode 100644 index 0000000..8ee8054 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/tabs.tsx @@ -0,0 +1,82 @@ +"use client" + +import { Tabs as TabsPrimitive } from "@base-ui/react/tabs" +import { cva, type VariantProps } from "class-variance-authority" + +import { cn } from "@/lib/utils" + +function Tabs({ + className, + orientation = "horizontal", + ...props +}: TabsPrimitive.Root.Props) { + return ( + + ) +} + +const tabsListVariants = cva( + "group/tabs-list inline-flex w-fit items-center justify-center rounded-lg p-[3px] text-muted-foreground group-data-horizontal/tabs:h-8 group-data-vertical/tabs:h-fit group-data-vertical/tabs:flex-col data-[variant=line]:rounded-none", + { + variants: { + variant: { + default: "bg-muted", + line: "gap-1 bg-transparent", + }, + }, + defaultVariants: { + variant: "default", + }, + } +) + +function TabsList({ + className, + variant = "default", + ...props +}: TabsPrimitive.List.Props & VariantProps) { + return ( + + ) +} + +function TabsTrigger({ className, ...props }: TabsPrimitive.Tab.Props) { + return ( + + ) +} + +function TabsContent({ className, ...props }: TabsPrimitive.Panel.Props) { + return ( + + ) +} + +export { Tabs, TabsList, TabsTrigger, TabsContent, tabsListVariants } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/upload/HomePage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/upload/HomePage.tsx new file mode 100644 index 0000000..faf991f --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/upload/HomePage.tsx @@ -0,0 +1,193 @@ +import { useState } from "react" +import { useNavigate } from 
"react-router" +import { Card, CardContent } from "@/components/ui/card" +import { buttonVariants } from "@/components/ui/button" +import { SeverityBadge } from "@/components/ui/severity-badge" +import { useArchives, useUploadArchive } from "@/api/archives" +import { + formatBytes, + formatDate, + plainLanguageSummary, + overallSeverity, + encodeSegment, +} from "@/lib/utils" +import { Upload, Server, HardDrive, ChevronRight } from "lucide-react" + +export function HomePage() { + const navigate = useNavigate() + const { data, isLoading, error } = useArchives() + + const archives = data?.items ?? [] + const totalStorageBytes = data?.total_storage_bytes ?? 0 + + return ( +
+ {/* Page header */} +
+
+

Archives

+

+ Upload and review diagnostic archives +

+
+
+ +
+ +
+
+ + {/* Upload zone */} + + + {/* Archive list */} +
+

+ Recent archives +

+ + {error ? ( + + ) : null} + {isLoading ? ( + + ) : null} + {!isLoading && archives.length === 0 ? ( + + + +

+ No archives yet +

+

+ Upload a .tar.gz archive to get started. +

+
+
+ ) : null} + +
+ {archives.map((archive) => ( + + ))} +
+
+
+ ) +} + +function UploadPanel() { + const navigate = useNavigate() + const upload = useUploadArchive() + const [dragActive, setDragActive] = useState(false) + + const handleFile = (file?: File) => { + if (!file) return + upload.mutate(file, { + onSuccess: (data) => navigate(data.redirect_url), + }) + } + + return ( +
{ + e.preventDefault() + setDragActive(true) + }} + onDragLeave={() => setDragActive(false)} + onDrop={(e) => { + e.preventDefault() + setDragActive(false) + handleFile(e.dataTransfer.files[0]) + }} + > +
+ +

+ {upload.isPending + ? "Uploading..." + : "Drop a .tar.gz archive here, or"} +

+ + {upload.error ? ( + + ) : null} +
+
+ ) +} + +function Stat({ label, value }: { label: string; value: string }) { + return ( +
+

{label}

+

+ {value} +

+
+ ) +} + +function Notice({ tone, text }: { tone: "neutral" | "danger"; text: string }) { + return ( +
+ {text} +
+ ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/index.css b/customers/vm-troubleshooting-dashboard/frontend/src/index.css new file mode 100644 index 0000000..6bc8c23 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/index.css @@ -0,0 +1,103 @@ +@import "tailwindcss"; +@import "tw-animate-css"; +@import "shadcn/tailwind.css"; + +@custom-variant dark (&:is(.dark *)); + +@theme { + --color-severity-critical: #dc2626; + --color-severity-warning: #d97706; + --color-severity-info: #2563eb; + --color-severity-critical-muted: #fef2f2; + --color-severity-warning-muted: #fffbeb; + --color-severity-info-muted: #eff6ff; + --color-success: #059669; + --color-success-muted: #ecfdf5; +} + +@theme inline { + --font-sans: 'Outfit Variable', 'Outfit', system-ui, -apple-system, sans-serif; + --font-mono: 'JetBrains Mono', 'Fira Code', ui-monospace, monospace; + --color-sidebar-ring: var(--sidebar-ring); + --color-sidebar-border: var(--sidebar-border); + --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); + --color-sidebar-accent: var(--sidebar-accent); + --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); + --color-sidebar-primary: var(--sidebar-primary); + --color-sidebar-foreground: var(--sidebar-foreground); + --color-sidebar: var(--sidebar); + --color-chart-5: var(--chart-5); + --color-chart-4: var(--chart-4); + --color-chart-3: var(--chart-3); + --color-chart-2: var(--chart-2); + --color-chart-1: var(--chart-1); + --color-ring: var(--ring); + --color-input: var(--input); + --color-border: var(--border); + --color-destructive: var(--destructive); + --color-accent-foreground: var(--accent-foreground); + --color-accent: var(--accent); + --color-muted-foreground: var(--muted-foreground); + --color-muted: var(--muted); + --color-secondary-foreground: var(--secondary-foreground); + --color-secondary: var(--secondary); + --color-primary-foreground: var(--primary-foreground); + --color-primary: 
var(--primary); + --color-popover-foreground: var(--popover-foreground); + --color-popover: var(--popover); + --color-card-foreground: var(--card-foreground); + --color-card: var(--card); + --color-foreground: var(--foreground); + --color-background: var(--background); + --radius-sm: calc(var(--radius) * 0.6); + --radius-md: calc(var(--radius) * 0.8); + --radius-lg: var(--radius); + --radius-xl: calc(var(--radius) * 1.4); + --radius-2xl: calc(var(--radius) * 1.8); + --radius-3xl: calc(var(--radius) * 2.2); + --radius-4xl: calc(var(--radius) * 2.6); +} + +:root { + --background: #f0f2f6; + --foreground: #0f172a; + --card: #ffffff; + --card-foreground: #0f172a; + --popover: #ffffff; + --popover-foreground: #0f172a; + --primary: #0f172a; + --primary-foreground: #ffffff; + --secondary: #f1f5f9; + --secondary-foreground: #475569; + --muted: #f1f5f9; + --muted-foreground: #64748b; + --accent: #f1f5f9; + --accent-foreground: #0f172a; + --destructive: #dc2626; + --border: #e2e8f0; + --input: #e2e8f0; + --ring: #0f172a; + --chart-1: #dc2626; + --chart-2: #d97706; + --chart-3: #2563eb; + --chart-4: #059669; + --chart-5: #7c3aed; + --radius: 0.625rem; + --sidebar: #f1f5f9; + --sidebar-foreground: #0f172a; + --sidebar-primary: #0f172a; + --sidebar-primary-foreground: #ffffff; + --sidebar-accent: #e2e8f0; + --sidebar-accent-foreground: #0f172a; + --sidebar-border: #e2e8f0; + --sidebar-ring: #0f172a; +} + +@layer base { + body { + @apply antialiased bg-background text-foreground font-sans; + } + * { + @apply border-border outline-ring/50; + } +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts new file mode 100644 index 0000000..9f7e13f --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts @@ -0,0 +1,175 @@ +import { clsx, type ClassValue } from "clsx" +import { twMerge } from "tailwind-merge" +import type { ArchiveSummary, TriageFinding } from "@/types" 
+ +export function cn(...inputs: ClassValue[]) { + return twMerge(clsx(inputs)) +} + +export function formatBytes(bytes: number) { + if (!Number.isFinite(bytes) || bytes < 0) return "0 B" + const units = ["B", "KB", "MB", "GB"] + let value = bytes + let unit = 0 + while (value >= 1024 && unit < units.length - 1) { + value /= 1024 + unit += 1 + } + return `${value.toFixed(value >= 10 || unit === 0 ? 0 : 1)} ${units[unit]}` +} + +export function formatDuration(ms: number) { + if (ms < 1000) return `${ms} ms` + return `${(ms / 1000).toFixed(1)} s` +} + +const dateFormatter = new Intl.DateTimeFormat(undefined, { + dateStyle: "medium", + timeStyle: "short", +}) + +export function formatDate(value: string) { + const date = new Date(value) + if (Number.isNaN(date.getTime())) return value + return dateFormatter.format(date) +} + +export function shortFingerprint(value: string) { + return `${value.slice(0, 8)}\u2026` +} + +export function severityColor(severity: string) { + switch (severity.toLowerCase()) { + case "critical": + return "text-severity-critical" + case "warning": + return "text-severity-warning" + case "info": + return "text-severity-info" + default: + return "text-muted-foreground" + } +} + +export function severityBorder(severity: string) { + switch (severity.toLowerCase()) { + case "critical": + return "border-severity-critical/40" + case "warning": + return "border-severity-warning/40" + case "info": + return "border-severity-info/40" + default: + return "border-border" + } +} + +export function plainLanguageSummary(summary: ArchiveSummary) { + if (summary.issue_counts.critical > 0) { + return `${summary.issue_counts.critical} critical issue${summary.issue_counts.critical === 1 ? "" : "s"} need attention.` + } + if (summary.issue_counts.warning > 0) { + return `${summary.issue_counts.warning} warning${summary.issue_counts.warning === 1 ? "" : "s"} worth a follow-up.` + } + if (summary.issue_counts.total > 0) { + return "No critical issues. 
Review the informational findings when you have time." + } + return "No issues were recorded for this archive." +} + +export function nextStep(summary: ArchiveSummary) { + if (summary.issue_counts.critical > 0) { + return "Start with the Issues tab and work down from the critical findings." + } + if (summary.issue_counts.warning > 0) { + return "Review the warnings, then open artifacts if you need more proof." + } + return "The archive looks quiet, so the artifact browser is the best place to confirm details." +} + +export function overallSeverity(summary: ArchiveSummary) { + if (summary.issue_counts.critical > 0) return "critical" + if (summary.issue_counts.warning > 0) return "warning" + return "info" +} + +export function encodeSegment(value: string) { + return encodeURIComponent(value) +} + +export function encodePath(value: string) { + return value + .split("/") + .map((s) => encodeURIComponent(s)) + .join("/") +} + +const categoryLabels: Record = { + DISK: "Disk", + SVC: "Services", + MEM: "Memory", + FW: "Firewall", + GPU: "GPU", + KERN: "Kernel", + HW: "Hardware", + ERR: "System Logs", + PROC: "Process", + TIMEOUT: "Timeout", + DATA: "Data Quality", +} + +/** Map a short category code to a human-readable label. */ +export function categoryLabel(category: string): string { + return categoryLabels[category] ?? category +} + +const severityRank: Record = { + critical: 3, + warning: 2, + info: 1, +} + +/** + * Sort findings for deterministic primary selection: + * 1. Analyzer origin: purpose-built analyzers beat "issue_enrichment" + * 2. Confidence: "high" before "low" + * 3. Severity: critical > warning > info + * 4. Lexical tiebreaker on code then title + */ +/** Negative if a should sort before b (same order as sortFindings). */ +function compareFindingPrimaryOrder(a: TriageFinding, b: TriageFinding): number { + const aEnrich = a.analyzer === "issue_enrichment" ? 1 : 0 + const bEnrich = b.analyzer === "issue_enrichment" ? 
1 : 0 + if (aEnrich !== bEnrich) return aEnrich - bEnrich + + const aConf = a.confidence === "high" ? 0 : 1 + const bConf = b.confidence === "high" ? 0 : 1 + if (aConf !== bConf) return aConf - bConf + + const aSev = severityRank[a.severity] ?? 0 + const bSev = severityRank[b.severity] ?? 0 + if (aSev !== bSev) return bSev - aSev + + const codeCmp = a.code.localeCompare(b.code) + if (codeCmp !== 0) return codeCmp + return a.title.localeCompare(b.title) +} + +export function sortFindings(findings: TriageFinding[]) { + return [...findings].sort(compareFindingPrimaryOrder) +} + +/** Returns the title of the highest-priority finding, or undefined. */ +export function primaryFindingTitle( + findings?: TriageFinding[], +): string | undefined { + if (!findings?.length) return undefined + let best = findings[0] + for (let i = 1; i < findings.length; i++) { + const cur = findings[i] + if (compareFindingPrimaryOrder(cur, best) < 0) { + best = cur + } + } + return best.title +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/main.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/main.tsx new file mode 100644 index 0000000..833c1d9 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/main.tsx @@ -0,0 +1,26 @@ +import { StrictMode } from "react" +import { createRoot } from "react-dom/client" +import { RouterProvider } from "react-router" +import { QueryClient, QueryClientProvider } from "@tanstack/react-query" +import { router } from "./router" +import "@fontsource-variable/outfit/index.css" +import "./index.css" +import { ApiError } from "./api/client" + +const queryClient = new QueryClient({ + defaultOptions: { + queries: { + staleTime: 30_000, + retry: (count, error) => + count < 1 && !(error instanceof ApiError && error.status < 500), + }, + }, +}) + +createRoot(document.getElementById("root")!).render( + + + + + , +) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/router.tsx 
b/customers/vm-troubleshooting-dashboard/frontend/src/router.tsx new file mode 100644 index 0000000..f6dc6ec --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/router.tsx @@ -0,0 +1,23 @@ +import { createBrowserRouter } from "react-router" +import { AppShell } from "@/components/layout/AppShell" +import { HomePage } from "@/components/upload/HomePage" +import { DashboardPage } from "@/components/dashboard/DashboardPage" +import { IssuesPage } from "@/components/issues/IssuesPage" +import { IssueDetailPage } from "@/components/issue-detail/IssueDetailPage" +import { ArtifactBrowserPage } from "@/components/artifacts/ArtifactBrowserPage" + +export const router = createBrowserRouter([ + { + element: , + children: [ + { index: true, element: }, + { path: "archives/:archiveId", element: }, + { path: "archives/:archiveId/issues", element: }, + { + path: "archives/:archiveId/issues/:issueId", + element: , + }, + { path: "archives/:archiveId/artifacts/*", element: }, + ], + }, +]) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/types.ts b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts new file mode 100644 index 0000000..e1fde0b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts @@ -0,0 +1,125 @@ +export type IssueCounts = { + critical: number + warning: number + info: number + total: number +} + +export type ManifestPlatform = { + os?: string + kernel?: string +} + +export type ArchiveSummary = { + archive_id: string + schema_version: string + version?: string + commit?: string + generated_at: string + hostname: string + platform: ManifestPlatform + uploaded_at: string + uploaded_by: string + issue_counts: IssueCounts + collector_count: number + artifact_count: number + triage_finding_count: number + status: string + error_reason?: string + compressed_size: number +} + +export type CollectorRecord = { + collector_id: string + status: string + duration_ms: number + artifact_count: number + 
skipped_count: number + error_count: number + facts?: Record +} + +export type TriageFinding = { + code: string + severity: string + confidence: string + category: string + title: string + description: string + action?: string + evidence?: string[] + source_artifacts?: string[] + issue_fingerprint?: string + analyzer?: string +} + +export type IssueRecord = { + id: string + collector: string + code: string + severity: string + confidence: string + category: string + message: string + issue_fingerprint?: string + related_artifact_paths?: string[] + suggested_artifact_paths?: string[] + unresolved_artifact_paths?: string[] + triage_findings?: TriageFinding[] +} + +export type ArtifactRecord = { + path: string + collector: string + type: string + command?: string + source?: string + exit_code: number + status: string + ignored_exit: boolean + timed_out: boolean + sanitized?: boolean + truncated: boolean + duration_ms: number + size_bytes: number + sha256: string + content_type: string + parser_hint?: string + tags: string[] + exists_on_disk: boolean +} + +export type ArchiveDetail = { + summary: ArchiveSummary + collectors: CollectorRecord[] + issues: IssueRecord[] + artifacts: ArtifactRecord[] + triage_findings?: TriageFinding[] +} + +export type ArchiveListResponse = { + items: ArchiveSummary[] + total: number + page: number + page_size: number + total_storage_bytes: number +} + +export type IssuesResponse = { + items: IssueRecord[] + total: number +} + +export type ArtifactsResponse = { + items: ArtifactRecord[] + total: number +} + +export type ArtifactPreviewResponse = { + path: string + size_bytes: number + binary: boolean + truncated: boolean + content?: string + download_url: string +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/tsconfig.app.json b/customers/vm-troubleshooting-dashboard/frontend/tsconfig.app.json new file mode 100644 index 0000000..56d43db --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/tsconfig.app.json @@ 
-0,0 +1,32 @@ +{ + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", + "target": "es2023", + "lib": ["ES2023", "DOM", "DOM.Iterable"], + "module": "esnext", + "types": ["vite/client"], + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "moduleDetection": "force", + "noEmit": true, + "jsx": "react-jsx", + + /* Paths */ + "ignoreDeprecations": "6.0", + "baseUrl": ".", + "paths": { + "@/*": ["./src/*"] + }, + + /* Linting */ + "noUnusedLocals": true, + "noUnusedParameters": true, + "erasableSyntaxOnly": true, + "noFallthroughCasesInSwitch": true + }, + "include": ["src"] +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/tsconfig.json b/customers/vm-troubleshooting-dashboard/frontend/tsconfig.json new file mode 100644 index 0000000..aa3c04f --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "baseUrl": ".", + "paths": { + "@/*": ["./src/*"] + } + }, + "files": [], + "references": [ + { "path": "./tsconfig.app.json" }, + { "path": "./tsconfig.node.json" } + ] +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/tsconfig.node.json b/customers/vm-troubleshooting-dashboard/frontend/tsconfig.node.json new file mode 100644 index 0000000..d3c52ea --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/tsconfig.node.json @@ -0,0 +1,24 @@ +{ + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo", + "target": "es2023", + "lib": ["ES2023"], + "module": "esnext", + "types": ["node"], + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "moduleDetection": "force", + "noEmit": true, + + /* Linting */ + "noUnusedLocals": true, + "noUnusedParameters": true, + "erasableSyntaxOnly": true, + 
"noFallthroughCasesInSwitch": true + }, + "include": ["vite.config.ts"] +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/vite.config.ts b/customers/vm-troubleshooting-dashboard/frontend/vite.config.ts new file mode 100644 index 0000000..746a68a --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/vite.config.ts @@ -0,0 +1,25 @@ +import path from 'path' +import { defineConfig } from 'vite' +import react from '@vitejs/plugin-react' +import tailwindcss from '@tailwindcss/vite' + +export default defineConfig({ + plugins: [react(), tailwindcss()], + resolve: { + alias: { + '@': path.resolve(__dirname, './src'), + }, + }, + server: { + port: 5173, + proxy: { + '/api': { + target: 'http://127.0.0.1:8080', + changeOrigin: true, + }, + }, + }, + build: { + outDir: 'dist', + }, +}) diff --git a/customers/vm-troubleshooting-dashboard/go.mod b/customers/vm-troubleshooting-dashboard/go.mod new file mode 100644 index 0000000..38526a4 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/go.mod @@ -0,0 +1,20 @@ +module github.com/NexGenCloud/diagnostic-dashboard + +go 1.25.0 + +require ( + golang.org/x/time v0.14.0 + modernc.org/sqlite v1.48.2 +) + +require ( + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + golang.org/x/sys v0.42.0 // indirect + modernc.org/libc v1.70.0 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect +) diff --git a/customers/vm-troubleshooting-dashboard/go.sum b/customers/vm-troubleshooting-dashboard/go.sum new file mode 100644 index 0000000..e920f4d --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/go.sum @@ -0,0 +1,53 @@ +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize 
v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/tools v0.42.0 
h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= +modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.32.0 h1:hjG66bI/kqIPX1b2yT6fr/jt+QedtP2fqojG2VrFuVw= +modernc.org/ccgo/v4 v4.32.0/go.mod h1:6F08EBCx5uQc38kMGl+0Nm0oWczoo1c7cgpzEry7Uc0= +modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM= +modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU= +modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= +modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= +modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw= +modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= +modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.48.2 h1:5CnW4uP8joZtA0LedVqLbZV5GD7F/0x91AXeSyjoh5c= +modernc.org/sqlite v1.48.2/go.mod 
h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= diff --git a/customers/vm-troubleshooting-dashboard/internal/api/server.go b/customers/vm-troubleshooting-dashboard/internal/api/server.go new file mode 100644 index 0000000..4f258b2 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/api/server.go @@ -0,0 +1,625 @@ +package api + +import ( + "crypto/subtle" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "mime/multipart" + "net" + "net/http" + "net/url" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "time" + "unicode/utf8" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/ingest" + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" + "github.com/NexGenCloud/diagnostic-dashboard/internal/pathutil" + "github.com/NexGenCloud/diagnostic-dashboard/internal/store" + "golang.org/x/time/rate" +) + +const ( + ingestLimit = 100 * 1024 * 1024 + ingestPreviewLimit = 15 * 1024 * 1024 +) + +// Options configures authentication, upload limits, and listen exposure. +type Options struct { + AuthSharedToken string + TrustForwardedUser bool + MaxArchives int + // RequireAPIAuth is true when listening on a non-loopback address. 
+ RequireAPIAuth bool +} + +type Server struct { + store *store.Store + webRoot string + opts Options + mux *http.ServeMux + uploadLimiterMu sync.Mutex + uploadLimiters map[string]*rate.Limiter +} + +func New(st *store.Store, webRoot string, opts Options) *Server { + if opts.MaxArchives <= 0 { + opts.MaxArchives = 10000 + } + s := &Server{ + store: st, + webRoot: webRoot, + opts: opts, + mux: http.NewServeMux(), + uploadLimiters: make(map[string]*rate.Limiter), + } + s.routes() + return s +} + +func (s *Server) Handler() http.Handler { + return s.withHeaders(s.withSecurity(s.mux)) +} + +func (s *Server) withSecurity(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !strings.HasPrefix(r.URL.Path, "/api/") { + next.ServeHTTP(w, r) + return + } + if !s.opts.RequireAPIAuth { + next.ServeHTTP(w, r) + return + } + if s.opts.AuthSharedToken != "" { + if !s.checkBearer(w, r) { + return + } + } else if !s.opts.TrustForwardedUser { + writeError(w, http.StatusForbidden, "server misconfiguration") + return + } + next.ServeHTTP(w, r) + }) +} + +func (s *Server) checkBearer(w http.ResponseWriter, r *http.Request) bool { + parts := strings.SplitN(strings.TrimSpace(r.Header.Get("Authorization")), " ", 2) + if len(parts) != 2 || !strings.EqualFold(parts[0], "Bearer") { + writeError(w, http.StatusUnauthorized, "unauthorized") + return false + } + got := strings.TrimSpace(parts[1]) + tok := s.opts.AuthSharedToken + if len(got) != len(tok) { + writeError(w, http.StatusUnauthorized, "unauthorized") + return false + } + if subtle.ConstantTimeCompare([]byte(got), []byte(tok)) != 1 { + writeError(w, http.StatusUnauthorized, "unauthorized") + return false + } + return true +} + +func clientIP(r *http.Request, trustForwarded bool) string { + if trustForwarded { + if xff := r.Header.Get("X-Forwarded-For"); xff != "" { + parts := strings.Split(xff, ",") + if host := strings.TrimSpace(parts[0]); host != "" { + return host + } + } + 
} + host, _, err := net.SplitHostPort(r.RemoteAddr) + if err != nil { + return r.RemoteAddr + } + return host +} + +func (s *Server) uploadLimiterAllow(r *http.Request) bool { + if !s.opts.RequireAPIAuth { + return true + } + key := clientIP(r, s.opts.TrustForwardedUser) + s.uploadLimiterMu.Lock() + defer s.uploadLimiterMu.Unlock() + lim, ok := s.uploadLimiters[key] + if !ok { + lim = rate.NewLimiter(rate.Limit(5.0/60.0), 2) + s.uploadLimiters[key] = lim + } + return lim.Allow() +} + +func (s *Server) routes() { + s.mux.HandleFunc("POST /api/v1/archives", s.handleUpload) + s.mux.HandleFunc("GET /api/v1/archives", s.handleListArchives) + s.mux.HandleFunc("GET /api/v1/archives/{archiveID}", s.handleGetArchive) + s.mux.HandleFunc("DELETE /api/v1/archives/{archiveID}", s.handleDeleteArchive) + s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/issues", s.handleListIssues) + s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/issues/{issueID}", s.handleGetIssue) + s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/artifacts", s.handleListArtifacts) + s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/artifacts/view/{path...}", s.handleViewArtifact) + s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/artifacts/download/{path...}", s.handleDownloadArtifact) + s.mux.HandleFunc("/", s.handleWeb) +} + +func (s *Server) withHeaders(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-Content-Type-Options", "nosniff") + w.Header().Set("Referrer-Policy", "no-referrer") + + // Log API requests for dev visibility. 
+ if strings.HasPrefix(r.URL.Path, "/api/") { + start := time.Now() + rw := &statusWriter{ResponseWriter: w} + next.ServeHTTP(rw, r) + log.Printf("%s %s %d %s", r.Method, r.URL.Path, rw.status, time.Since(start).Round(time.Millisecond)) + return + } + next.ServeHTTP(w, r) + }) +} + +type statusWriter struct { + http.ResponseWriter + status int +} + +func (w *statusWriter) WriteHeader(code int) { + w.status = code + w.ResponseWriter.WriteHeader(code) +} + +func (w *statusWriter) Write(b []byte) (int, error) { + if w.status == 0 { + w.status = http.StatusOK + } + return w.ResponseWriter.Write(b) +} + +func (s *Server) handleUpload(w http.ResponseWriter, r *http.Request) { + if s.opts.RequireAPIAuth && !s.uploadLimiterAllow(r) { + writeError(w, http.StatusTooManyRequests, "upload rate limit exceeded") + return + } + nArchives, err := s.store.ArchiveCount() + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + if int(nArchives) >= s.opts.MaxArchives { + writeError(w, http.StatusInsufficientStorage, "archive limit reached; prune old archives or raise --max-archives") + return + } + if err := r.ParseMultipartForm(ingestLimit); err != nil { + writeError(w, http.StatusBadRequest, "invalid multipart upload") + return + } + file, header, err := openUploadFile(r) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + defer file.Close() + + tmpDir := filepath.Join(s.store.RootDir(), "tmp") + if err := os.MkdirAll(tmpDir, 0o755); err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + tmpFile, err := os.CreateTemp(tmpDir, "upload-*.tar.gz") + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + tmpPath := tmpFile.Name() + defer os.Remove(tmpPath) + defer tmpFile.Close() + + written, err := io.Copy(tmpFile, io.LimitReader(file, ingestLimit+1)) + if err != nil { + writeError(w, http.StatusBadRequest, fmt.Sprintf("read upload: %v", err)) + 
return + } + if written > ingestLimit { + writeError(w, http.StatusRequestEntityTooLarge, "archive exceeds 100 MB compressed limit") + return + } + if _, err := tmpFile.Seek(0, 0); err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + if err := tmpFile.Sync(); err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + + archive, err := ingest.Ingest(r.Context(), s.store, tmpPath, s.uploadedBy(r)) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + nAfter, _ := s.store.ArchiveCount() + writeJSON(w, http.StatusCreated, map[string]any{ + "archive": archive.Summary, + "redirect_url": "/archives/" + pathEscapeArchiveID(archive.Summary.ArchiveID), + "uploaded_by": archive.Summary.UploadedBy, + "archive_count": nAfter, + "filename": header.Filename, + "compressed_size": archive.Summary.StorageBytes, + }) +} + +func (s *Server) handleListArchives(w http.ResponseWriter, r *http.Request) { + page, pageSize := pagination(r) + items := s.store.List() + total := len(items) + start := (page - 1) * pageSize + if start > total { + start = total + } + end := start + pageSize + if end > total { + end = total + } + writeJSON(w, http.StatusOK, map[string]any{ + "items": items[start:end], + "total": total, + "page": page, + "page_size": pageSize, + "total_storage_bytes": totalStorageBytes(items), + }) +} + +func (s *Server) handleGetArchive(w http.ResponseWriter, r *http.Request) { + archive, ok := s.getArchive(w, r) + if !ok { + return + } + writeJSON(w, http.StatusOK, archive) +} + +func (s *Server) handleDeleteArchive(w http.ResponseWriter, r *http.Request) { + archiveID := r.PathValue("archiveID") + if archiveID == "" { + writeError(w, http.StatusBadRequest, "missing archive id") + return + } + if err := s.store.Delete(archiveID); err != nil { + if errors.Is(err, os.ErrNotExist) { + writeError(w, http.StatusNotFound, "archive not found") + return + } + writeError(w, 
http.StatusInternalServerError, err.Error()) + return + } + w.WriteHeader(http.StatusNoContent) +} + +func (s *Server) handleListIssues(w http.ResponseWriter, r *http.Request) { + archive, ok := s.getArchive(w, r) + if !ok { + return + } + issues := filterIssues(archive.Issues, r) + writeJSON(w, http.StatusOK, map[string]any{"items": issues, "total": len(issues)}) +} + +func (s *Server) handleGetIssue(w http.ResponseWriter, r *http.Request) { + archive, ok := s.getArchive(w, r) + if !ok { + return + } + issueID := r.PathValue("issueID") + for _, issue := range archive.Issues { + if issue.ID == issueID { + writeJSON(w, http.StatusOK, map[string]any{"archive": archive.Summary, "issue": issue}) + return + } + } + writeError(w, http.StatusNotFound, "issue not found") +} + +func (s *Server) handleListArtifacts(w http.ResponseWriter, r *http.Request) { + archive, ok := s.getArchive(w, r) + if !ok { + return + } + writeJSON(w, http.StatusOK, map[string]any{"items": archive.Artifacts, "total": len(archive.Artifacts)}) +} + +func (s *Server) handleViewArtifact(w http.ResponseWriter, r *http.Request) { + archive, ok := s.getArchive(w, r) + if !ok { + return + } + resp, ok, err := readArtifactPreview(archive.StorageDir, r.PathValue("path"), archive.Summary.ArchiveID) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + if !ok { + writeError(w, http.StatusNotFound, "artifact not found") + return + } + writeJSON(w, http.StatusOK, resp) +} + +func (s *Server) handleDownloadArtifact(w http.ResponseWriter, r *http.Request) { + archive, ok := s.getArchive(w, r) + if !ok { + return + } + rel := r.PathValue("path") + fullPath, err := pathutil.SafeJoin(archive.StorageDir, rel) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + if _, err := os.Stat(fullPath); err != nil { + if errors.Is(err, os.ErrNotExist) { + writeError(w, http.StatusNotFound, "artifact not found") + return + } + writeError(w, 
http.StatusInternalServerError, err.Error()) + return + } + w.Header().Set("Content-Disposition", "attachment; filename="+strconv.Quote(filepath.Base(fullPath))) + http.ServeFile(w, r, fullPath) +} + +func (s *Server) handleWeb(w http.ResponseWriter, r *http.Request) { + if strings.HasPrefix(r.URL.Path, "/api/") { + writeError(w, http.StatusNotFound, "not found") + return + } + if s.webRoot == "" { + http.Error(w, "dashboard frontend is not built", http.StatusNotFound) + return + } + rel := strings.TrimPrefix(r.URL.Path, "/") + if rel == "" { + http.ServeFile(w, r, filepath.Join(s.webRoot, "index.html")) + return + } + full, err := pathutil.SafeJoin(s.webRoot, rel) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + if info, err := os.Stat(full); err == nil && !info.IsDir() { + http.ServeFile(w, r, full) + return + } + http.ServeFile(w, r, filepath.Join(s.webRoot, "index.html")) +} + +func (s *Server) getArchive(w http.ResponseWriter, r *http.Request) (*model.ArchiveDetail, bool) { + archiveID := r.PathValue("archiveID") + if archiveID == "" { + writeError(w, http.StatusBadRequest, "missing archive id") + return nil, false + } + archive, ok := s.store.Get(archiveID) + if !ok { + writeError(w, http.StatusNotFound, "archive not found") + return nil, false + } + return archive, true +} + +func openUploadFile(r *http.Request) (multipart.File, *multipart.FileHeader, error) { + for _, key := range []string{"archive", "file"} { + file, header, err := r.FormFile(key) + if err == nil { + return file, header, nil + } + } + return nil, nil, errors.New("upload field archive is required") +} + +func writeJSON(w http.ResponseWriter, status int, value any) { + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.WriteHeader(status) + enc := json.NewEncoder(w) + enc.SetEscapeHTML(true) + _ = enc.Encode(value) +} + +func writeError(w http.ResponseWriter, status int, msg string) { + writeJSON(w, status, map[string]string{"error": 
msg}) +} + +func (s *Server) uploadedBy(r *http.Request) string { + if s.opts.TrustForwardedUser { + for _, key := range []string{"X-Forwarded-User", "X-Forwarded-Email", "X-Remote-User"} { + if value := strings.TrimSpace(r.Header.Get(key)); value != "" { + return value + } + } + } + return "anonymous" +} + +func pagination(r *http.Request) (int, int) { + page := atoiDefault(r.URL.Query().Get("page"), 1) + pageSize := atoiDefault(r.URL.Query().Get("page_size"), 25) + if page < 1 { + page = 1 + } + if pageSize < 1 { + pageSize = 25 + } + if pageSize > 100 { + pageSize = 100 + } + return page, pageSize +} + +func atoiDefault(value string, fallback int) int { + if value == "" { + return fallback + } + n, err := strconv.Atoi(value) + if err != nil { + return fallback + } + return n +} + +func totalStorageBytes(items []model.ArchiveSummary) int64 { + var total int64 + for _, item := range items { + total += item.StorageBytes + } + return total +} + +func filterIssues(issues []model.IssueRecord, r *http.Request) []model.IssueRecord { + query := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("q"))) + severity := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("severity"))) + confidence := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("confidence"))) + category := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("category"))) + collector := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("collector"))) + code := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("code"))) + out := make([]model.IssueRecord, 0, len(issues)) + for _, issue := range issues { + if severity != "" && strings.ToLower(issue.Severity) != severity { + continue + } + if confidence != "" && strings.ToLower(issue.Confidence) != confidence { + continue + } + if category != "" && !strings.Contains(strings.ToLower(issue.Category), category) { + continue + } + if collector != "" && strings.ToLower(issue.Collector) != collector { + continue + } + if code != "" && 
strings.ToLower(issue.Code) != code { + continue + } + if query != "" && !issueMatches(issue, query) { + continue + } + out = append(out, issue) + } + sort.SliceStable(out, func(i, j int) bool { + si := model.SeverityScore(out[i].Severity) + sj := model.SeverityScore(out[j].Severity) + if si != sj { + return si > sj + } + if out[i].Collector != out[j].Collector { + return out[i].Collector < out[j].Collector + } + return out[i].ID < out[j].ID + }) + return out +} + +func issueMatches(issue model.IssueRecord, query string) bool { + fields := []string{issue.ID, issue.Collector, issue.Code, issue.Severity, issue.Confidence, issue.Category, issue.Message, issue.Fingerprint} + for _, finding := range issue.TriageFindings { + fields = append(fields, finding.Title, finding.Description, finding.Action, finding.Category, finding.Code, finding.Fingerprint) + } + for _, field := range fields { + if strings.Contains(strings.ToLower(field), query) { + return true + } + } + return false +} + +type artifactPreviewResponse struct { + Path string `json:"path"` + SizeBytes int64 `json:"size_bytes"` + Binary bool `json:"binary"` + Truncated bool `json:"truncated"` + Content string `json:"content,omitempty"` + DownloadURL string `json:"download_url"` +} + +func readArtifactPreview(root, relPath, archiveID string) (artifactPreviewResponse, bool, error) { + fullPath, err := pathutil.SafeJoin(root, relPath) + if err != nil { + return artifactPreviewResponse{}, false, err + } + info, err := os.Stat(fullPath) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return artifactPreviewResponse{}, false, nil + } + return artifactPreviewResponse{}, false, err + } + size := info.Size() + truncated := size > ingestPreviewLimit + + f, err := os.Open(fullPath) + if err != nil { + return artifactPreviewResponse{}, false, err + } + defer f.Close() + + readSize := size + if readSize > ingestPreviewLimit { + readSize = ingestPreviewLimit + } + data := make([]byte, readSize) + n, err := 
io.ReadFull(f, data) + if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) { + return artifactPreviewResponse{}, false, err + } + data = data[:n] + + binary := looksBinary(data) + resp := artifactPreviewResponse{ + Path: relPath, + SizeBytes: size, + Binary: binary, + Truncated: truncated, + DownloadURL: "/api/v1/archives/" + pathEscapeArchiveID(archiveID) + "/artifacts/download/" + pathEscapePath(relPath), + } + if !binary { + resp.Content = string(data) + } + return resp, true, nil +} + +func looksBinary(data []byte) bool { + if len(data) == 0 { + return false + } + if !utf8.Valid(data) { + return true + } + for _, b := range data { + if b == 0 { + return true + } + } + return false +} + +func pathEscapeArchiveID(value string) string { + return url.PathEscape(value) +} + +func pathEscapePath(value string) string { + segments := strings.Split(value, "/") + for i, segment := range segments { + segments[i] = url.PathEscape(segment) + } + return strings.Join(segments, "/") +} diff --git a/customers/vm-troubleshooting-dashboard/internal/api/server_test.go b/customers/vm-troubleshooting-dashboard/internal/api/server_test.go new file mode 100644 index 0000000..0331dad --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/api/server_test.go @@ -0,0 +1,229 @@ +package api + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "os" + "strings" + "testing" + "time" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" + "github.com/NexGenCloud/diagnostic-dashboard/internal/store" +) + +func TestAPI_NoAuthOnLoopback(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + srv := New(st, "", Options{RequireAPIAuth: false}) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + resp, err := http.Get(ts.URL + "/api/v1/archives") + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + 
if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } +} + +func TestAPI_BearerAuth(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + srv := New(st, "", Options{ + RequireAPIAuth: true, + AuthSharedToken: "secret-token", + }) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + resp, err := http.Get(ts.URL + "/api/v1/archives") + if err != nil { + t.Fatal(err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("missing bearer: want 401, got %d", resp.StatusCode) + } + + req, _ := http.NewRequest(http.MethodGet, ts.URL+"/api/v1/archives", nil) + req.Header.Set("Authorization", "Bearer wrong") + resp2, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatal(err) + } + resp2.Body.Close() + if resp2.StatusCode != http.StatusUnauthorized { + t.Fatalf("bad bearer: want 401, got %d", resp2.StatusCode) + } + + req3, _ := http.NewRequest(http.MethodGet, ts.URL+"/api/v1/archives", nil) + req3.Header.Set("Authorization", "Bearer secret-token") + resp3, err := http.DefaultClient.Do(req3) + if err != nil { + t.Fatal(err) + } + resp3.Body.Close() + if resp3.StatusCode != http.StatusOK { + t.Fatalf("good bearer: want 200, got %d", resp3.StatusCode) + } +} + +func TestAPI_ListIgnoresSpoofedForwardedUserWithoutTrust(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + storage := st.ArchiveDir("a1") + if err := os.MkdirAll(storage, 0o755); err != nil { + t.Fatal(err) + } + detail := &model.ArchiveDetail{ + Summary: model.ArchiveSummary{ + ArchiveID: "a1", + SchemaVersion: "3.0.0", + GeneratedAt: "2020-01-01T00:00:00Z", + Hostname: "h", + UploadedAt: time.Now().UTC(), + UploadedBy: "anonymous", + Status: "ready", + }, + StorageDir: storage, + } + if err := st.Save(detail); err 
!= nil { + t.Fatal(err) + } + + srv := New(st, "", Options{ + RequireAPIAuth: true, + AuthSharedToken: "tok", + TrustForwardedUser: false, + }) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + req, _ := http.NewRequest(http.MethodGet, ts.URL+"/api/v1/archives", nil) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("X-Forwarded-User", "attacker") + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + var out struct { + Items []struct { + UploadedBy string `json:"uploaded_by"` + } `json:"items"` + } + if err := json.NewDecoder(resp.Body).Decode(&out); err != nil { + t.Fatal(err) + } + if len(out.Items) != 1 || out.Items[0].UploadedBy != "anonymous" { + t.Fatalf("uploaded_by should stay anonymous, got %+v", out.Items) + } +} + +func TestAPI_UploadRateLimit(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + srv := New(st, "", Options{ + RequireAPIAuth: true, + AuthSharedToken: "tok", + }) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + var got429 bool + for i := 0; i < 20; i++ { + req, _ := http.NewRequest(http.MethodPost, ts.URL+"/api/v1/archives", strings.NewReader("x")) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "text/plain") + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatal(err) + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + if resp.StatusCode == http.StatusTooManyRequests { + got429 = true + break + } + } + if !got429 { + t.Fatal("expected at least one 429 Too Many Requests from upload rate limiter") + } +} + +func TestAPI_MaxArchives(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + storage := st.ArchiveDir("id1") + if err := os.MkdirAll(storage, 0o755); err != nil { + 
t.Fatal(err) + } + detail := &model.ArchiveDetail{ + Summary: model.ArchiveSummary{ + ArchiveID: "id1", + SchemaVersion: "3.0.0", + GeneratedAt: "2020-01-01T00:00:00Z", + Hostname: "h", + UploadedAt: time.Now().UTC(), + UploadedBy: "u", + Status: "ready", + }, + StorageDir: storage, + } + if err := st.Save(detail); err != nil { + t.Fatal(err) + } + + srv := New(st, "", Options{RequireAPIAuth: false, MaxArchives: 1}) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + req, _ := http.NewRequest(http.MethodPost, ts.URL+"/api/v1/archives", strings.NewReader("x")) + req.Header.Set("Content-Type", "text/plain") + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusInsufficientStorage { + t.Fatalf("expected 507, got %d", resp.StatusCode) + } + if ct := resp.Header.Get("Content-Type"); !strings.Contains(ct, "application/json") { + t.Fatalf("expected json error body, Content-Type=%q", ct) + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go new file mode 100644 index 0000000..4cdad6b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go @@ -0,0 +1,342 @@ +package ingest + +import ( + "archive/tar" + "compress/gzip" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "os" + "path" + "path/filepath" + "strings" + "time" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" + "github.com/NexGenCloud/diagnostic-dashboard/internal/pathutil" + "github.com/NexGenCloud/diagnostic-dashboard/internal/store" +) + +const ( + maxCompressedBytes = 100 * 1024 * 1024 + maxExtractedBytes = 500 * 1024 * 1024 + maxExtractedEntries = 10000 +) + +func validateRegularTarFileHeader(hdr *tar.Header) error { + if hdr.Size < 0 { + return fmt.Errorf("invalid tar entry %q: negative size", hdr.Name) + } + return nil +} + +func Ingest(ctx context.Context, st 
*store.Store, archivePath, uploadedBy string) (*model.ArchiveDetail, error) { + if ctx.Err() != nil { + return nil, ctx.Err() + } + compressedSize, err := fileSize(archivePath) + if err != nil { + return nil, err + } + if compressedSize <= 0 { + return nil, errors.New("archive is empty") + } + if compressedSize > maxCompressedBytes { + return nil, fmt.Errorf("archive exceeds %d MB compressed limit", maxCompressedBytes/(1024*1024)) + } + + tmpRoot := filepath.Join(st.RootDir(), "tmp") + if err := os.MkdirAll(tmpRoot, 0o755); err != nil { + return nil, err + } + extractDir, err := os.MkdirTemp(filepath.Join(st.RootDir(), "tmp"), "extract-*") + if err != nil { + return nil, err + } + defer os.RemoveAll(extractDir) + + if err := extractArchive(ctx, archivePath, extractDir); err != nil { + return nil, err + } + + // Archives produced by gather-info wrap all files under a single top-level + // directory (e.g. vm-diagnostics_host_date/manifest.json). Detect this and + // treat the inner directory as the working root. 
+ workDir := unwrapSingleDir(extractDir) + + manifestPath := filepath.Join(workDir, "manifest.json") + manifestBytes, err := os.ReadFile(manifestPath) + if err != nil { + return nil, errors.New("manifest.json missing from archive") + } + var manifest model.Manifest + if err := json.Unmarshal(manifestBytes, &manifest); err != nil { + return nil, fmt.Errorf("parse manifest.json: %w", err) + } + if !model.IsSupportedSchemaVersion(manifest.SchemaVersion) { + return nil, fmt.Errorf("unsupported schema version %q", manifest.SchemaVersion) + } + if manifest.ArchiveID == "" { + return nil, errors.New("manifest archive_id is empty") + } + if manifest.Hostname == "" || manifest.GeneratedAt == "" { + return nil, errors.New("manifest is missing required fields") + } + + detail, err := buildDetail(workDir, manifest, uploadedBy, compressedSize) + if err != nil { + return nil, err + } + + finalDir := st.ArchiveDir(manifest.ArchiveID) + if _, err := os.Stat(finalDir); err == nil { + return nil, fmt.Errorf("archive %q already exists", manifest.ArchiveID) + } else if !errors.Is(err, os.ErrNotExist) { + return nil, err + } + if err := os.Rename(workDir, finalDir); err != nil { + return nil, err + } + detail.StorageDir = finalDir + detail.Summary.UploadedBy = uploadedBy + if err := st.Save(detail); err != nil { + _ = os.RemoveAll(finalDir) + return nil, err + } + return detail, nil +} + +func buildDetail(workDir string, manifest model.Manifest, uploadedBy string, compressedSize int64) (*model.ArchiveDetail, error) { + collectorIDs := model.SortedCollectorIDs(manifest.Collectors) + collectors := make([]model.CollectorRecord, 0, len(collectorIDs)) + issues := make([]model.IssueRecord, 0) + artifacts := make([]model.ArtifactRecord, 0, len(manifest.ArtifactIndex)) + + counts := model.IssueCounts{} + for _, collectorID := range collectorIDs { + collector := manifest.Collectors[collectorID] + collectors = append(collectors, model.CollectorRecord{ + ID: collectorID, + Status: 
collector.Status, + DurationMS: collector.DurationMS, + ArtifactCount: collector.ArtifactCount, + SkippedCount: collector.SkippedCount, + ErrorCount: collector.ErrorCount, + Facts: collector.Facts, + }) + for _, issue := range collector.Issues { + record := model.IssueRecord{ + Collector: collectorID, + Code: issue.Code, + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Message: issue.Message, + Fingerprint: issue.IssueFingerprint, + RelatedArtifactPaths: append([]string(nil), issue.RelatedArtifactPaths...), + UnresolvedArtifactPaths: append([]string(nil), issue.UnresolvedArtifactPaths...), + Source: "manifest", + } + counts.Total++ + switch strings.ToLower(issue.Severity) { + case "critical": + counts.Critical++ + case "warning": + counts.Warning++ + case "info": + counts.Info++ + } + issues = append(issues, record) + } + } + + for _, artifact := range manifest.ArtifactIndex { + artifacts = append(artifacts, model.ArtifactRecord{ + Path: artifact.Path, + Collector: artifact.Collector, + Type: artifact.Type, + Command: artifact.Command, + Source: artifact.Source, + ExitCode: artifact.ExitCode, + Status: artifact.Status, + IgnoredExit: artifact.IgnoredExit, + TimedOut: artifact.TimedOut, + Sanitized: artifact.Sanitized, + Truncated: artifact.Truncated, + DurationMS: artifact.DurationMS, + SizeBytes: artifact.SizeBytes, + SHA256: artifact.SHA256, + ContentType: artifact.ContentType, + ParserHint: artifact.ParserHint, + Tags: append([]string(nil), artifact.Tags...), + ExistsOnDisk: artifactExists(workDir, artifact.Path), + }) + } + + summary := model.ArchiveSummary{ + ArchiveID: manifest.ArchiveID, + SchemaVersion: manifest.SchemaVersion, + Version: manifest.Version, + Commit: manifest.Commit, + GeneratedAt: manifest.GeneratedAt, + Hostname: manifest.Hostname, + Platform: manifest.Platform, + UploadedAt: time.Now().UTC(), + UploadedBy: uploadedBy, + IssueCounts: counts, + CollectorCount: len(collectors), + ArtifactCount: 
len(artifacts), + Status: "ready", + StorageBytes: compressedSize, + } + + return &model.ArchiveDetail{ + Summary: summary, + Collectors: collectors, + Issues: issues, + Artifacts: artifacts, + }, nil +} + +func extractArchive(ctx context.Context, archivePath, dst string) error { + file, err := os.Open(archivePath) + if err != nil { + return err + } + defer file.Close() + + gzr, err := gzip.NewReader(file) + if err != nil { + return fmt.Errorf("open gzip: %w", err) + } + defer gzr.Close() + + tr := tar.NewReader(gzr) + var totalBytes int64 + entries := 0 + seen := make(map[string]struct{}) + + for { + if ctx.Err() != nil { + return ctx.Err() + } + hdr, err := tr.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return fmt.Errorf("read tar: %w", err) + } + if hdr == nil { + continue + } + name, err := safeTarPath(hdr.Name) + if err != nil { + return err + } + if name == "" { + continue + } + if _, ok := seen[name]; ok { + return fmt.Errorf("duplicate archive path %q", name) + } + seen[name] = struct{}{} + entries++ + if entries > maxExtractedEntries { + return fmt.Errorf("archive contains more than %d files", maxExtractedEntries) + } + + target := filepath.Join(dst, filepath.FromSlash(name)) + if !strings.HasPrefix(filepath.Clean(target), filepath.Clean(dst)+string(os.PathSeparator)) && filepath.Clean(target) != filepath.Clean(dst) { + return fmt.Errorf("archive path escapes extraction root: %q", hdr.Name) + } + + switch hdr.Typeflag { + case tar.TypeDir: + if err := os.MkdirAll(target, 0o755); err != nil { + return err + } + case tar.TypeReg, tar.TypeRegA: + if err := validateRegularTarFileHeader(hdr); err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil { + return err + } + if totalBytes+hdr.Size > maxExtractedBytes { + return fmt.Errorf("archive exceeds %d MB extracted limit", maxExtractedBytes/(1024*1024)) + } + out, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644) + if err != nil 
{ + return err + } + written, copyErr := io.CopyN(out, tr, hdr.Size) + closeErr := out.Close() + if copyErr != nil && !errors.Is(copyErr, io.EOF) { + return copyErr + } + if closeErr != nil { + return closeErr + } + totalBytes += written + default: + return fmt.Errorf("unsupported tar entry type %q for %q", hdr.Typeflag, hdr.Name) + } + } + return nil +} + +func safeTarPath(name string) (string, error) { + cleaned := path.Clean("/" + strings.TrimSpace(name)) + cleaned = strings.TrimPrefix(cleaned, "/") + if cleaned == "." || cleaned == "" { + return "", nil + } + if strings.HasPrefix(cleaned, "../") || cleaned == ".." || strings.Contains(cleaned, "/../") { + return "", fmt.Errorf("archive path escapes extraction root: %q", name) + } + return cleaned, nil +} + +// unwrapSingleDir checks if dir contains exactly one entry and that entry is a +// directory. If so, it returns the path to that inner directory. This handles +// tar.gz archives that wrap everything under a single top-level directory +// (e.g. vm-diagnostics_host_20260409_120000/). +func unwrapSingleDir(dir string) string { + entries, err := os.ReadDir(dir) + if err != nil || len(entries) != 1 { + return dir + } + if !entries[0].IsDir() { + return dir + } + inner := filepath.Join(dir, entries[0].Name()) + // Sanity check: the inner directory should contain manifest.json. 
+ if _, err := os.Stat(filepath.Join(inner, "manifest.json")); err != nil { + return dir + } + return inner +} + +func artifactExists(workDir, relPath string) bool { + rel, err := safeTarPath(relPath) + if err != nil || rel == "" { + return false + } + fullPath, err := pathutil.SafeJoin(workDir, rel) + if err != nil { + return false + } + _, err = os.Stat(fullPath) + return err == nil +} + +func fileSize(path string) (int64, error) { + info, err := os.Stat(path) + if err != nil { + return 0, err + } + return info.Size(), nil +} diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go new file mode 100644 index 0000000..e849ac2 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go @@ -0,0 +1,225 @@ +package ingest + +import ( + "archive/tar" + "compress/gzip" + "context" + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" + "github.com/NexGenCloud/diagnostic-dashboard/internal/store" +) + +func TestIngestArchive(t *testing.T) { + root := t.TempDir() + st, err := store.New(root) + if err != nil { + t.Fatalf("store.New: %v", err) + } + archivePath := createArchive(t, root, func(tw *tar.Writer) { + writeTarFile(t, tw, "manifest.json", mustJSON(t, model.Manifest{ + SchemaVersion: "3.1.0", + ArchiveID: "archive-123", + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Hostname: "test-host", + Platform: model.ManifestPlatform{OS: "Ubuntu 24.04", Kernel: "6.8.0"}, + ArtifactIndex: []model.ManifestArtifact{{ + Path: "logs/journal.txt", + Collector: "critical-events", + Type: "file", + Status: "ok", + DurationMS: 12, + SizeBytes: 18, + SHA256: "abc", + ContentType: "text/plain", + Tags: []string{"journal"}, + }}, + Collectors: map[string]model.ManifestCollector{ + "critical-events": { + Status: "ok", + DurationMS: 12, + ArtifactCount: 1, + Issues: 
[]model.ManifestIssue{{ + Code: "critical_log", + Severity: "critical", + Confidence: "high", + Category: "journal", + Message: "GPU driver reset detected", + IssueFingerprint: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + RelatedArtifactPaths: []string{"logs/journal.txt"}, + }}, + }, + }, + })) + writeTarFile(t, tw, "logs/journal.txt", []byte("kernel: reset detected\n")) + writeTarFile(t, tw, "triage/_data/critical_events.json", mustJSON(t, model.TriageEnvelope{ + Kind: "triage_result", + SchemaVersion: "3.1.0", + ArchiveID: "archive-123", + Analyzer: "critical-events", + Findings: []model.TriageFinding{{ + Code: "critical_log", + Severity: "critical", + Confidence: "high", + Category: "journal", + Title: "Kernel reset", + Description: "The journal shows a driver reset.", + Action: "Investigate the GPU path.", + Evidence: []string{"kernel: reset detected"}, + SourceArtifacts: []string{"logs/journal.txt"}, + Fingerprint: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + }}, + })) + }) + + detail, err := Ingest(context.Background(), st, archivePath, "tester@example.com") + if err != nil { + t.Fatalf("Ingest: %v", err) + } + if got, want := detail.Summary.ArchiveID, "archive-123"; got != want { + t.Fatalf("archive id = %q, want %q", got, want) + } + if got, want := detail.Summary.IssueCounts.Critical, 1; got != want { + t.Fatalf("critical count = %d, want %d", got, want) + } + if len(detail.Issues) != 1 { + t.Fatalf("issues = %d, want 1", len(detail.Issues)) + } + if !detail.Artifacts[0].ExistsOnDisk { + t.Fatalf("artifact should exist on disk") + } + + // Triage findings are joined on-demand via store.Get(), not during ingest. 
+ loaded, ok := st.Get("archive-123") + if !ok { + t.Fatalf("expected archive in store") + } + if loaded.Summary.UploadedBy != "tester@example.com" { + t.Fatalf("uploaded_by = %q", loaded.Summary.UploadedBy) + } + if len(loaded.Issues) != 1 { + t.Fatalf("loaded issues = %d, want 1", len(loaded.Issues)) + } + if len(loaded.Issues[0].TriageFindings) != 1 { + t.Fatalf("triage findings = %d, want 1", len(loaded.Issues[0].TriageFindings)) + } +} + +func TestIngestWrappedArchive(t *testing.T) { + // Real archives produced by gather-info wrap files under a top-level directory. + root := t.TempDir() + st, err := store.New(root) + if err != nil { + t.Fatalf("store.New: %v", err) + } + prefix := "vm-diagnostics_test-host_20260409_120000" + archivePath := createArchive(t, root, func(tw *tar.Writer) { + writeTarDir(t, tw, prefix+"/") + writeTarFile(t, tw, prefix+"/manifest.json", mustJSON(t, model.Manifest{ + SchemaVersion: "3.1.0", + ArchiveID: "wrapped-archive", + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Hostname: "test-host", + Platform: model.ManifestPlatform{OS: "Ubuntu 24.04", Kernel: "6.8.0"}, + ArtifactIndex: []model.ManifestArtifact{{ + Path: "logs/dmesg.txt", Collector: "dmesg", Type: "file", + Status: "ok", SizeBytes: 5, SHA256: "def", ContentType: "text/plain", + }}, + Collectors: map[string]model.ManifestCollector{ + "dmesg": {Status: "ok", DurationMS: 10, ArtifactCount: 1}, + }, + })) + writeTarFile(t, tw, prefix+"/logs/dmesg.txt", []byte("hello")) + }) + + detail, err := Ingest(context.Background(), st, archivePath, "tester@example.com") + if err != nil { + t.Fatalf("Ingest wrapped archive: %v", err) + } + if got := detail.Summary.ArchiveID; got != "wrapped-archive" { + t.Fatalf("archive id = %q, want %q", got, "wrapped-archive") + } + if !detail.Artifacts[0].ExistsOnDisk { + t.Fatal("artifact should exist on disk") + } +} + +func TestValidateRegularTarFileHeaderRejectsNegativeSize(t *testing.T) { + t.Parallel() + err := 
validateRegularTarFileHeader(&tar.Header{Name: "evil.bin", Size: -1}) + if err == nil || !strings.Contains(err.Error(), "negative size") { + t.Fatalf("expected negative size error, got %v", err) + } + if validateRegularTarFileHeader(&tar.Header{Name: "ok.bin", Size: 0}) != nil { + t.Fatal("expected nil for non-negative size") + } +} + +func TestIngestRejectsTraversal(t *testing.T) { + root := t.TempDir() + st, err := store.New(root) + if err != nil { + t.Fatalf("store.New: %v", err) + } + archivePath := createArchive(t, root, func(tw *tar.Writer) { + writeTarFile(t, tw, "../evil.txt", []byte("bad")) + }) + if _, err := Ingest(context.Background(), st, archivePath, "tester@example.com"); err == nil { + t.Fatal("expected traversal error") + } +} + +func createArchive(t *testing.T, root string, fn func(*tar.Writer)) string { + t.Helper() + path := filepath.Join(root, "archive.tar.gz") + file, err := os.Create(path) + if err != nil { + t.Fatalf("create archive: %v", err) + } + gw := gzip.NewWriter(file) + tw := tar.NewWriter(gw) + fn(tw) + if err := tw.Close(); err != nil { + t.Fatalf("close tar: %v", err) + } + if err := gw.Close(); err != nil { + t.Fatalf("close gzip: %v", err) + } + if err := file.Close(); err != nil { + t.Fatalf("close file: %v", err) + } + return path +} + +func writeTarDir(t *testing.T, tw *tar.Writer, name string) { + t.Helper() + hdr := &tar.Header{Name: name, Typeflag: tar.TypeDir, Mode: 0o755} + if err := tw.WriteHeader(hdr); err != nil { + t.Fatalf("write dir header %q: %v", name, err) + } +} + +func writeTarFile(t *testing.T, tw *tar.Writer, name string, data []byte) { + t.Helper() + hdr := &tar.Header{Name: name, Mode: 0o644, Size: int64(len(data))} + if err := tw.WriteHeader(hdr); err != nil { + t.Fatalf("write header %q: %v", name, err) + } + if _, err := tw.Write(data); err != nil { + t.Fatalf("write data %q: %v", name, err) + } +} + +func mustJSON(t *testing.T, value any) []byte { + t.Helper() + data, err := json.Marshal(value) + if 
err != nil { + t.Fatalf("json marshal: %v", err) + } + return data +} diff --git a/customers/vm-troubleshooting-dashboard/internal/model/types.go b/customers/vm-troubleshooting-dashboard/internal/model/types.go new file mode 100644 index 0000000..b568012 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/model/types.go @@ -0,0 +1,241 @@ +package model + +import ( + "fmt" + "sort" + "strconv" + "strings" + "time" +) + +const SupportedSchemaMajor = 3 + +type Manifest struct { + SchemaVersion string `json:"schema_version"` + SchemaRef string `json:"schema_ref,omitempty"` + ArchiveID string `json:"archive_id"` + Version string `json:"version,omitempty"` + Commit string `json:"commit,omitempty"` + GeneratedAt string `json:"generated_at"` + Hostname string `json:"hostname"` + Platform ManifestPlatform `json:"platform"` + ArtifactIndex []ManifestArtifact `json:"artifact_index"` + Collectors map[string]ManifestCollector `json:"collectors"` +} + +type ManifestPlatform struct { + OS string `json:"os,omitempty"` + Kernel string `json:"kernel,omitempty"` +} + +type ManifestArtifact struct { + Path string `json:"path"` + Collector string `json:"collector"` + Type string `json:"type"` + Command string `json:"command,omitempty"` + Source string `json:"source,omitempty"` + ExitCode int `json:"exit_code"` + Status string `json:"status"` + IgnoredExit bool `json:"ignored_exit"` + TimedOut bool `json:"timed_out"` + Sanitized bool `json:"sanitized,omitempty"` + Truncated bool `json:"truncated"` + DurationMS int64 `json:"duration_ms"` + SizeBytes int64 `json:"size_bytes"` + SHA256 string `json:"sha256"` + ContentType string `json:"content_type"` + ParserHint string `json:"parser_hint,omitempty"` + Tags []string `json:"tags"` +} + +type ManifestIssue struct { + Code string `json:"code"` + Severity string `json:"severity"` + Confidence string `json:"confidence"` + Category string `json:"category"` + Message string `json:"message"` + IssueFingerprint string 
`json:"issue_fingerprint,omitempty"` + RelatedArtifactPaths []string `json:"related_artifact_paths,omitempty"` + UnresolvedArtifactPaths []string `json:"unresolved_artifact_paths,omitempty"` +} + +type ManifestCollector struct { + Status string `json:"status"` + DurationMS int64 `json:"duration_ms"` + ArtifactCount int `json:"artifact_count"` + SkippedCount int `json:"skipped_count"` + ErrorCount int `json:"error_count"` + Facts map[string]any `json:"facts,omitempty"` + Issues []ManifestIssue `json:"issues,omitempty"` + SkipReasons []ManifestSkipReason `json:"skip_reasons,omitempty"` + Errors []ManifestStructuredError `json:"structured_errors,omitempty"` +} + +type ManifestSkipReason struct { + Reason string `json:"reason"` + Detail string `json:"detail"` + ArtifactPath string `json:"artifact_path,omitempty"` +} + +type ManifestStructuredError struct { + Code string `json:"code"` + Message string `json:"message"` + ArtifactPath string `json:"artifact_path,omitempty"` +} + +type TriageEnvelope struct { + Kind string `json:"kind"` + SchemaVersion string `json:"schema_version"` + ArchiveID string `json:"archive_id"` + Analyzer string `json:"analyzer"` + Findings []TriageFinding `json:"findings"` + Facts map[string]any `json:"facts,omitempty"` +} + +type TriageFinding struct { + Code string `json:"code"` + Severity string `json:"severity"` + Confidence string `json:"confidence"` + Category string `json:"category"` + Title string `json:"title"` + Description string `json:"description"` + Action string `json:"action,omitempty"` + Evidence []string `json:"evidence,omitempty"` + SourceArtifacts []string `json:"source_artifacts,omitempty"` + Fingerprint string `json:"issue_fingerprint,omitempty"` + Analyzer string `json:"analyzer,omitempty"` // populated from envelope during loading +} + +type IssueRecord struct { + ID string `json:"id"` + Collector string `json:"collector"` + Code string `json:"code"` + Severity string `json:"severity"` + Confidence string 
`json:"confidence"` + Category string `json:"category"` + Message string `json:"message"` + Fingerprint string `json:"issue_fingerprint,omitempty"` + RelatedArtifactPaths []string `json:"related_artifact_paths,omitempty"` + SuggestedArtifactPaths []string `json:"suggested_artifact_paths,omitempty"` + UnresolvedArtifactPaths []string `json:"unresolved_artifact_paths,omitempty"` + TriageFindings []TriageFinding `json:"triage_findings,omitempty"` + Source string `json:"source,omitempty"` +} + +type ArtifactRecord struct { + Path string `json:"path"` + Collector string `json:"collector"` + Type string `json:"type"` + Command string `json:"command,omitempty"` + Source string `json:"source,omitempty"` + ExitCode int `json:"exit_code"` + Status string `json:"status"` + IgnoredExit bool `json:"ignored_exit"` + TimedOut bool `json:"timed_out"` + Sanitized bool `json:"sanitized,omitempty"` + Truncated bool `json:"truncated"` + DurationMS int64 `json:"duration_ms"` + SizeBytes int64 `json:"size_bytes"` + SHA256 string `json:"sha256"` + ContentType string `json:"content_type"` + ParserHint string `json:"parser_hint,omitempty"` + Tags []string `json:"tags"` + ExistsOnDisk bool `json:"exists_on_disk"` +} + +type CollectorRecord struct { + ID string `json:"collector_id"` + Status string `json:"status"` + DurationMS int64 `json:"duration_ms"` + ArtifactCount int `json:"artifact_count"` + SkippedCount int `json:"skipped_count"` + ErrorCount int `json:"error_count"` + Facts map[string]any `json:"facts,omitempty"` +} + +type IssueCounts struct { + Critical int `json:"critical"` + Warning int `json:"warning"` + Info int `json:"info"` + Total int `json:"total"` +} + +type ArchiveSummary struct { + ArchiveID string `json:"archive_id"` + SchemaVersion string `json:"schema_version"` + Version string `json:"version,omitempty"` + Commit string `json:"commit,omitempty"` + GeneratedAt string `json:"generated_at"` + Hostname string `json:"hostname"` + Platform ManifestPlatform 
`json:"platform"` + UploadedAt time.Time `json:"uploaded_at"` + UploadedBy string `json:"uploaded_by"` + IssueCounts IssueCounts `json:"issue_counts"` + CollectorCount int `json:"collector_count"` + ArtifactCount int `json:"artifact_count"` + TriageFindingCount int `json:"triage_finding_count"` + Status string `json:"status"` + ErrorReason string `json:"error_reason,omitempty"` + StorageBytes int64 `json:"compressed_size"` +} + +type ArchiveDetail struct { + Summary ArchiveSummary `json:"summary"` + Collectors []CollectorRecord `json:"collectors"` + Issues []IssueRecord `json:"issues"` + Artifacts []ArtifactRecord `json:"artifacts"` + StorageDir string `json:"-"` +} + +func IsSupportedSchemaVersion(version string) bool { + major, _, ok := ParseSchemaVersion(version) + return ok && major == SupportedSchemaMajor +} + +func ParseSchemaVersion(version string) (int, int, bool) { + parts := strings.Split(version, ".") + if len(parts) != 3 { + return 0, 0, false + } + major, err := strconv.Atoi(parts[0]) + if err != nil { + return 0, 0, false + } + minor, err := strconv.Atoi(parts[1]) + if err != nil { + return 0, 0, false + } + if _, err := strconv.Atoi(parts[2]); err != nil { + return 0, 0, false + } + return major, minor, true +} + +func SortedCollectorIDs(m map[string]ManifestCollector) []string { + ids := make([]string, 0, len(m)) + for id := range m { + ids = append(ids, id) + } + sort.Strings(ids) + return ids +} + +func SeverityScore(severity string) int { + switch strings.ToLower(severity) { + case "critical": + return 3 + case "warning": + return 2 + case "info": + return 1 + default: + return 0 + } +} + +func IssueLabel(issue IssueRecord) string { + if issue.Code != "" { + return fmt.Sprintf("%s · %s", issue.Code, strings.ToLower(issue.Severity)) + } + return strings.ToLower(issue.Severity) +} diff --git a/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go new file mode 
100644 index 0000000..06cb587 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go @@ -0,0 +1,56 @@ +package pathutil + +import ( + "fmt" + "os" + "path/filepath" + "strings" +) + +// SafeJoin resolves rel under root and rejects path traversal. +func SafeJoin(root, relPath string) (string, error) { + rel, err := safeRelativePath(relPath) + if err != nil { + return "", err + } + full := filepath.Join(root, filepath.FromSlash(rel)) + cleanRoot := filepath.Clean(root) + cleanFull := filepath.Clean(full) + if cleanFull != cleanRoot && !strings.HasPrefix(cleanFull, cleanRoot+string(os.PathSeparator)) { + return "", fmt.Errorf("path escapes root: %q", relPath) + } + return cleanFull, nil +} + +func safeRelativePath(value string) (string, error) { + value = strings.TrimSpace(value) + if value == "" { + return "", fmt.Errorf("path is empty") + } + if filepath.IsAbs(value) { + return "", fmt.Errorf("path must be relative: %q", value) + } + s := filepath.ToSlash(value) + var depth int + for _, seg := range strings.Split(s, "/") { + if seg == "" || seg == "." { + continue + } + if seg == ".." { + depth-- + if depth < 0 { + return "", fmt.Errorf("path escapes root: %q", value) + } + continue + } + depth++ + } + cleaned := filepath.ToSlash(filepath.Clean(value)) + if cleaned == "." || cleaned == "" { + return "", fmt.Errorf("path is empty") + } + if strings.HasPrefix(cleaned, "../") || cleaned == ".." 
|| strings.Contains(cleaned, "/../") { + return "", fmt.Errorf("path escapes root: %q", value) + } + return cleaned, nil +} diff --git a/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go new file mode 100644 index 0000000..b457083 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go @@ -0,0 +1,50 @@ +package pathutil + +import ( + "os" + "path/filepath" + "testing" +) + +func TestSafeJoin_AllowsNormalRelative(t *testing.T) { + t.Parallel() + root := t.TempDir() + sub := filepath.Join(root, "a", "b") + if err := os.MkdirAll(sub, 0o755); err != nil { + t.Fatal(err) + } + full, err := SafeJoin(root, "a/b/file.txt") + if err != nil { + t.Fatalf("SafeJoin: %v", err) + } + if filepath.Dir(full) != sub { + t.Fatalf("got %q want dir %q", full, sub) + } +} + +func TestSafeJoin_RejectsDotDot(t *testing.T) { + t.Parallel() + root := t.TempDir() + _, err := SafeJoin(root, "../etc/passwd") + if err == nil { + t.Fatal("expected error for .. 
escape") + } +} + +func TestSafeJoin_RejectsAbsoluteInput(t *testing.T) { + t.Parallel() + root := t.TempDir() + _, err := SafeJoin(root, "/etc/passwd") + if err == nil { + t.Fatal("expected error for absolute-looking path") + } +} + +func TestSafeJoin_RejectsEmpty(t *testing.T) { + t.Parallel() + root := t.TempDir() + _, err := SafeJoin(root, "") + if err == nil { + t.Fatal("expected error for empty path") + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/evidence.go b/customers/vm-troubleshooting-dashboard/internal/store/evidence.go new file mode 100644 index 0000000..2effde1 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/store/evidence.go @@ -0,0 +1,251 @@ +package store + +import ( + "sort" + "strings" + "unicode" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" +) + +const defaultSuggestedArtifactLimit = 6 + +type evidenceResolver struct { + maxSuggestions int +} + +type rankedArtifact struct { + path string + score int + collector bool + locality int + tokenHits int + pathLength int +} + +type featureSet struct { + tokens map[string]int +} + +func suggestIssueArtifacts(issue model.IssueRecord, artifacts []model.ArtifactRecord) []string { + resolver := evidenceResolver{maxSuggestions: defaultSuggestedArtifactLimit} + return resolver.suggest(issue, artifacts) +} + +func (r evidenceResolver) suggest(issue model.IssueRecord, artifacts []model.ArtifactRecord) []string { + if r.maxSuggestions <= 0 { + r.maxSuggestions = defaultSuggestedArtifactLimit + } + if len(artifacts) == 0 { + return nil + } + + issueFeatures := buildIssueFeatures(issue) + candidates := make([]rankedArtifact, 0, len(artifacts)) + for _, artifact := range artifacts { + score, tokenHits, locality := scoreArtifact(issueFeatures, issue.Collector, artifact) + if score <= 0 { + continue + } + candidates = append(candidates, rankedArtifact{ + path: artifact.Path, + score: score, + collector: normalize(artifact.Collector) != "" && 
normalize(artifact.Collector) == normalize(issue.Collector), + locality: locality, + tokenHits: tokenHits, + pathLength: len(artifact.Path), + }) + } + + if len(candidates) == 0 { + appendFallbackCollectorArtifacts(&candidates, issue.Collector, artifacts) + } else if len(candidates) < r.maxSuggestions { + appendFallbackCollectorArtifacts(&candidates, issue.Collector, artifacts) + } + + sort.SliceStable(candidates, func(i, j int) bool { + if candidates[i].score != candidates[j].score { + return candidates[i].score > candidates[j].score + } + if candidates[i].collector != candidates[j].collector { + return candidates[i].collector + } + if candidates[i].locality != candidates[j].locality { + return candidates[i].locality > candidates[j].locality + } + if candidates[i].tokenHits != candidates[j].tokenHits { + return candidates[i].tokenHits > candidates[j].tokenHits + } + if candidates[i].pathLength != candidates[j].pathLength { + return candidates[i].pathLength < candidates[j].pathLength + } + return candidates[i].path < candidates[j].path + }) + + out := make([]string, 0, r.maxSuggestions) + seen := make(map[string]struct{}, r.maxSuggestions) + for _, candidate := range candidates { + if _, ok := seen[candidate.path]; ok { + continue + } + seen[candidate.path] = struct{}{} + out = append(out, candidate.path) + if len(out) >= r.maxSuggestions { + break + } + } + return out +} + +func appendFallbackCollectorArtifacts(candidates *[]rankedArtifact, issueCollector string, artifacts []model.ArtifactRecord) { + seen := make(map[string]struct{}, len(*candidates)) + for _, candidate := range *candidates { + seen[candidate.path] = struct{}{} + } + for _, artifact := range artifacts { + if _, ok := seen[artifact.Path]; ok { + continue + } + if normalize(artifact.Collector) != normalize(issueCollector) && !pathPrefixMatchesCollector(artifact.Path, issueCollector) { + continue + } + *candidates = append(*candidates, rankedArtifact{ + path: artifact.Path, + score: 1, + collector: 
true, + locality: 1, + pathLength: len(artifact.Path), + }) + } +} + +func buildIssueFeatures(issue model.IssueRecord) featureSet { + weights := make(map[string]int) + add := func(value string, weight int) { + for _, token := range tokenize(value) { + if token == "" { + continue + } + if current := weights[token]; weight > current { + weights[token] = weight + } + } + } + + add(issue.Collector, 8) + add(issue.Code, 7) + add(issue.Category, 6) + add(issue.Message, 3) + + return featureSet{tokens: weights} +} + +func scoreArtifact(issue featureSet, issueCollector string, artifact model.ArtifactRecord) (score int, tokenHits int, locality int) { + artifactFeatures := make(map[string]int) + add := func(value string, weight int) { + for _, token := range tokenize(value) { + if token == "" { + continue + } + if current := artifactFeatures[token]; weight > current { + artifactFeatures[token] = weight + } + } + } + + add(artifact.Collector, 10) + add(artifact.Path, 6) + add(strings.Join(artifact.Tags, " "), 8) + add(artifact.ParserHint, 4) + add(artifact.ContentType, 2) + + issueCollector = normalize(issueCollector) + artifactCollector := normalize(artifact.Collector) + if issueCollector != "" && artifactCollector != "" && issueCollector == artifactCollector { + score += 40 + locality += 3 + } + if issueCollector != "" && pathPrefixMatchesCollector(artifact.Path, issueCollector) { + score += 18 + locality += 2 + } + + for token, issueWeight := range issue.tokens { + if artifactWeight, ok := artifactFeatures[token]; ok { + score += issueWeight * artifactWeight + tokenHits++ + } + } + + // Keep weak same-collector suggestions available even when tokens are sparse. 
+ if score == 0 && issueCollector != "" && artifactCollector == issueCollector { + score = 1 + locality = 1 + } + return score, tokenHits, locality +} + +func pathPrefixMatchesCollector(path, collector string) bool { + collector = normalize(collector) + if collector == "" || path == "" { + return false + } + segments := strings.Split(path, "/") + if len(segments) == 0 { + return false + } + return normalize(segments[0]) == collector || strings.Contains(normalize(path), collector) +} + +func normalize(value string) string { + var b strings.Builder + b.Grow(len(value)) + lastSep := false + for _, r := range strings.ToLower(strings.TrimSpace(value)) { + switch { + case unicode.IsLetter(r), unicode.IsDigit(r): + b.WriteRune(r) + lastSep = false + case r == '/' || r == '_' || r == '-' || unicode.IsSpace(r) || r == '.': + if !lastSep { + b.WriteByte(' ') + lastSep = true + } + default: + if !lastSep { + b.WriteByte(' ') + lastSep = true + } + } + } + return strings.TrimSpace(b.String()) +} + +func tokenize(value string) []string { + if value == "" { + return nil + } + parts := strings.FieldsFunc(strings.ToLower(value), func(r rune) bool { + return !unicode.IsLetter(r) && !unicode.IsDigit(r) + }) + if len(parts) == 0 { + return nil + } + out := make([]string, 0, len(parts)) + for _, part := range parts { + if part == "" || isStopToken(part) { + continue + } + out = append(out, part) + } + return out +} + +func isStopToken(token string) bool { + switch token { + case "a", "an", "and", "error", "errors", "fail", "failed", "failure", "info", "issue", "issues", "log", "logs", "message", "of", "on", "or", "status", "the", "to", "warning", "critical", "txt", "text": + return true + default: + return false + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/schema.sql b/customers/vm-troubleshooting-dashboard/internal/store/schema.sql new file mode 100644 index 0000000..be59599 --- /dev/null +++ 
b/customers/vm-troubleshooting-dashboard/internal/store/schema.sql
@@ -0,0 +1,55 @@
-- One row per uploaded diagnostic archive; manifest_json caches the artifact
-- index so the artifact list endpoint avoids re-reading the extracted files.
CREATE TABLE IF NOT EXISTS archives (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    archive_id TEXT NOT NULL UNIQUE,
    storage_path TEXT NOT NULL,
    uploaded_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')),
    uploaded_by TEXT NOT NULL DEFAULT '',
    status TEXT NOT NULL DEFAULT 'processing',
    error_reason TEXT NOT NULL DEFAULT '',
    compressed_size INTEGER NOT NULL DEFAULT 0,
    hostname TEXT,
    generated_at TEXT,
    version TEXT,
    commit_hash TEXT,
    platform_os TEXT,
    platform_kernel TEXT,
    schema_version TEXT,
    manifest_json BLOB,
    triage_finding_count INTEGER
);

-- Per-archive collector summaries; cascade-deleted with the parent archive.
CREATE TABLE IF NOT EXISTS collectors (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    archive_id TEXT NOT NULL REFERENCES archives(archive_id) ON DELETE CASCADE,
    collector_id TEXT NOT NULL,
    status TEXT NOT NULL,
    duration_ms INTEGER NOT NULL,
    artifact_count INTEGER NOT NULL DEFAULT 0,
    skipped_count INTEGER NOT NULL DEFAULT 0,
    error_count INTEGER NOT NULL DEFAULT 0,
    facts_json TEXT,
    UNIQUE(archive_id, collector_id)
);

-- Flattened issues; JSON columns hold artifact-path arrays.
CREATE TABLE IF NOT EXISTS issues (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    archive_id TEXT NOT NULL REFERENCES archives(archive_id) ON DELETE CASCADE,
    collector_id TEXT NOT NULL,
    code TEXT NOT NULL,
    severity TEXT NOT NULL,
    confidence TEXT NOT NULL,
    category TEXT NOT NULL,
    message TEXT NOT NULL,
    issue_fingerprint TEXT,
    related_artifacts_json TEXT,
    unresolved_artifacts_json TEXT
);

CREATE INDEX IF NOT EXISTS idx_issues_archive ON issues(archive_id);
CREATE INDEX IF NOT EXISTS idx_issues_severity ON issues(archive_id, severity);
CREATE INDEX IF NOT EXISTS idx_issues_confidence ON issues(archive_id, confidence);
CREATE INDEX IF NOT EXISTS idx_issues_category ON issues(archive_id, category);
CREATE INDEX IF NOT EXISTS idx_issues_collector ON issues(archive_id, collector_id);
CREATE INDEX IF NOT EXISTS idx_issues_code ON issues(archive_id, code);
CREATE INDEX IF NOT EXISTS idx_issues_fingerprint ON issues(issue_fingerprint) WHERE issue_fingerprint IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_collectors_archive ON collectors(archive_id);
diff --git a/customers/vm-troubleshooting-dashboard/internal/store/store.go b/customers/vm-troubleshooting-dashboard/internal/store/store.go
new file mode 100644
index 0000000..b2022af
--- /dev/null
+++ b/customers/vm-troubleshooting-dashboard/internal/store/store.go
@@ -0,0 +1,613 @@
package store

import (
	"database/sql"
	_ "embed"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/NexGenCloud/diagnostic-dashboard/internal/model"

	_ "modernc.org/sqlite"
)

//go:embed schema.sql
var ddl string

// Store provides SQLite-backed persistence for diagnostic archives.
type Store struct {
	db      *sql.DB
	rootDir string
}

// New opens (or creates) a SQLite database at rootDir/dashboard.db, applies the
// schema, and returns a ready Store. The rootDir is also used as the parent for
// extracted archive files on disk.
+func New(rootDir string) (*Store, error) { + if err := os.MkdirAll(filepath.Join(rootDir, "archives"), 0o755); err != nil { + return nil, err + } + + dbPath := filepath.Join(rootDir, "dashboard.db") + dsn := dbPath + "?_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)&_pragma=busy_timeout(5000)" + + db, err := sql.Open("sqlite", dsn) + if err != nil { + return nil, fmt.Errorf("open database: %w", err) + } + db.SetMaxOpenConns(4) + + if _, err := db.Exec(ddl); err != nil { + db.Close() + return nil, fmt.Errorf("apply schema: %w", err) + } + + if err := migrateArchivesTriageCount(db); err != nil { + db.Close() + return nil, err + } + + st := &Store{db: db, rootDir: rootDir} + if err := st.backfillTriageFindingCounts(); err != nil { + db.Close() + return nil, fmt.Errorf("backfill triage_finding_count: %w", err) + } + + return st, nil +} + +func migrateArchivesTriageCount(db *sql.DB) error { + var n int + err := db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('archives') WHERE name='triage_finding_count'`).Scan(&n) + if err != nil { + return fmt.Errorf("pragma archives: %w", err) + } + if n > 0 { + return nil + } + if _, err := db.Exec(`ALTER TABLE archives ADD COLUMN triage_finding_count INTEGER`); err != nil { + return fmt.Errorf("add triage_finding_count: %w", err) + } + return nil +} + +func (s *Store) backfillTriageFindingCounts() error { + rows, err := s.db.Query(`SELECT archive_id, storage_path FROM archives WHERE triage_finding_count IS NULL`) + if err != nil { + return err + } + defer rows.Close() + for rows.Next() { + var archiveID, storagePath string + if err := rows.Scan(&archiveID, &storagePath); err != nil { + return err + } + c := countTriageFindingsOnDisk(storagePath) + if _, err := s.db.Exec(`UPDATE archives SET triage_finding_count = ? 
WHERE archive_id = ?`, c, archiveID); err != nil { + return err + } + } + return rows.Err() +} + +func countTriageFindingsOnDisk(storagePath string) int { + m := loadTriageMap(storagePath) + n := 0 + for _, findings := range m { + n += len(findings) + } + return n +} + +// Close closes the underlying database connection. +func (s *Store) Close() error { + return s.db.Close() +} + +// RootDir returns the data root directory. +func (s *Store) RootDir() string { + return s.rootDir +} + +// ArchiveDir returns the filesystem path for a given archive's extracted files. +func (s *Store) ArchiveDir(archiveID string) string { + return filepath.Join(s.rootDir, "archives", sanitizeName(archiveID)) +} + +// HasArchive returns true if an archive with the given ID exists in the database. +func (s *Store) HasArchive(archiveID string) bool { + var exists bool + _ = s.db.QueryRow("SELECT EXISTS(SELECT 1 FROM archives WHERE archive_id = ?)", archiveID).Scan(&exists) + return exists +} + +// Save persists a fully built ArchiveDetail into the database. It inserts the +// archive, all collectors, and all issues in a single transaction. +func (s *Store) Save(detail *model.ArchiveDetail) error { + if detail == nil { + return errors.New("archive detail is nil") + } + archiveID := detail.Summary.ArchiveID + if archiveID == "" { + return errors.New("archive id is empty") + } + + // Serialize manifest_json for the artifact list endpoint. 
+ manifestJSON, err := json.Marshal(detail.Artifacts) + if err != nil { + return fmt.Errorf("marshal artifacts: %w", err) + } + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() + + storagePath := detail.StorageDir + if storagePath == "" { + storagePath = s.ArchiveDir(archiveID) + } + + triageCount := countTriageFindingsOnDisk(storagePath) + + _, err = tx.Exec(`INSERT INTO archives ( + archive_id, storage_path, uploaded_at, uploaded_by, status, error_reason, + compressed_size, hostname, generated_at, version, commit_hash, + platform_os, platform_kernel, schema_version, manifest_json, triage_finding_count + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + archiveID, + storagePath, + detail.Summary.UploadedAt.UTC().Format("2006-01-02T15:04:05Z"), + detail.Summary.UploadedBy, + detail.Summary.Status, + detail.Summary.ErrorReason, + detail.Summary.StorageBytes, + detail.Summary.Hostname, + detail.Summary.GeneratedAt, + detail.Summary.Version, + detail.Summary.Commit, + detail.Summary.Platform.OS, + detail.Summary.Platform.Kernel, + detail.Summary.SchemaVersion, + manifestJSON, + triageCount, + ) + if err != nil { + return fmt.Errorf("insert archive: %w", err) + } + + // Insert collectors. + for _, c := range detail.Collectors { + var factsJSON *string + if c.Facts != nil { + data, err := json.Marshal(c.Facts) + if err != nil { + return fmt.Errorf("marshal facts: %w", err) + } + s := string(data) + factsJSON = &s + } + _, err := tx.Exec(`INSERT INTO collectors ( + archive_id, collector_id, status, duration_ms, + artifact_count, skipped_count, error_count, facts_json + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, + archiveID, c.ID, c.Status, c.DurationMS, + c.ArtifactCount, c.SkippedCount, c.ErrorCount, factsJSON, + ) + if err != nil { + return fmt.Errorf("insert collector %q: %w", c.ID, err) + } + } + + // Insert issues. 
+ for _, issue := range detail.Issues { + var relJSON, unresJSON *string + if len(issue.RelatedArtifactPaths) > 0 { + data, _ := json.Marshal(issue.RelatedArtifactPaths) + s := string(data) + relJSON = &s + } + if len(issue.UnresolvedArtifactPaths) > 0 { + data, _ := json.Marshal(issue.UnresolvedArtifactPaths) + s := string(data) + unresJSON = &s + } + var fp *string + if issue.Fingerprint != "" { + fp = &issue.Fingerprint + } + _, err := tx.Exec(`INSERT INTO issues ( + archive_id, collector_id, code, severity, confidence, + category, message, issue_fingerprint, + related_artifacts_json, unresolved_artifacts_json + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + archiveID, issue.Collector, issue.Code, issue.Severity, + issue.Confidence, issue.Category, issue.Message, fp, + relJSON, unresJSON, + ) + if err != nil { + return fmt.Errorf("insert issue: %w", err) + } + } + + return tx.Commit() +} + +// Delete removes an archive's database rows (cascading to collectors and issues) +// and its extracted files from disk. 
+func (s *Store) Delete(archiveID string) error { + var storagePath string + err := s.db.QueryRow("SELECT storage_path FROM archives WHERE archive_id = ?", archiveID).Scan(&storagePath) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return os.ErrNotExist + } + return err + } + + trashPath := storagePath + ".deleting" + if err := os.Rename(storagePath, trashPath); err != nil { + if errors.Is(err, os.ErrNotExist) { + trashPath = storagePath + } else { + return err + } + } + + tx, err := s.db.Begin() + if err != nil { + if trashPath != storagePath { + _ = os.Rename(trashPath, storagePath) + } + return err + } + defer tx.Rollback() + + if _, err := tx.Exec("DELETE FROM archives WHERE archive_id = ?", archiveID); err != nil { + if trashPath != storagePath { + _ = os.Rename(trashPath, storagePath) + } + return err + } + if err := tx.Commit(); err != nil { + if trashPath != storagePath { + _ = os.Rename(trashPath, storagePath) + } + return err + } + if trashPath != storagePath { + return os.RemoveAll(trashPath) + } + return os.RemoveAll(storagePath) +} + +// List returns all archive summaries, sorted by uploaded_at descending. 
+func (s *Store) List() []model.ArchiveSummary { + rows, err := s.db.Query(` +SELECT + a.archive_id, + COALESCE(a.hostname, ''), + COALESCE(a.generated_at, ''), + a.uploaded_at, + a.uploaded_by, + a.status, + a.error_reason, + a.compressed_size, + COALESCE(a.version, ''), + COALESCE(a.commit_hash, ''), + COALESCE(a.platform_os, ''), + COALESCE(a.platform_kernel, ''), + COALESCE(a.schema_version, ''), + COALESCE(iss.critical, 0), + COALESCE(iss.warning, 0), + COALESCE(iss.info, 0), + COALESCE(iss.total, 0), + COALESCE(cc.collector_count, 0), + a.triage_finding_count +FROM archives a +LEFT JOIN ( + SELECT archive_id, + SUM(CASE WHEN LOWER(severity) = 'critical' THEN 1 ELSE 0 END) AS critical, + SUM(CASE WHEN LOWER(severity) = 'warning' THEN 1 ELSE 0 END) AS warning, + SUM(CASE WHEN LOWER(severity) = 'info' THEN 1 ELSE 0 END) AS info, + COUNT(*) AS total + FROM issues GROUP BY archive_id +) iss ON iss.archive_id = a.archive_id +LEFT JOIN ( + SELECT archive_id, COUNT(*) AS collector_count FROM collectors GROUP BY archive_id +) cc ON cc.archive_id = a.archive_id +ORDER BY a.uploaded_at DESC`) + if err != nil { + return nil + } + defer rows.Close() + + var items []model.ArchiveSummary + for rows.Next() { + var a model.ArchiveSummary + var uploadedAt string + var triage sql.NullInt64 + if err := rows.Scan( + &a.ArchiveID, &a.Hostname, &a.GeneratedAt, + &uploadedAt, &a.UploadedBy, &a.Status, &a.ErrorReason, + &a.StorageBytes, &a.Version, &a.Commit, + &a.Platform.OS, &a.Platform.Kernel, &a.SchemaVersion, + &a.IssueCounts.Critical, &a.IssueCounts.Warning, &a.IssueCounts.Info, &a.IssueCounts.Total, + &a.CollectorCount, + &triage, + ); err != nil { + continue + } + a.UploadedAt, _ = parseTime(uploadedAt) + if triage.Valid { + a.TriageFindingCount = int(triage.Int64) + } + items = append(items, a) + } + return items +} + +// ArchiveCount returns the number of rows in archives. 
+func (s *Store) ArchiveCount() (int64, error) { + var n int64 + err := s.db.QueryRow(`SELECT COUNT(*) FROM archives`).Scan(&n) + return n, err +} + +// Get reconstructs a full ArchiveDetail from the database and filesystem. +func (s *Store) Get(archiveID string) (*model.ArchiveDetail, bool) { + var a model.ArchiveSummary + var storagePath, uploadedAt string + var manifestJSON []byte + + err := s.db.QueryRow(`SELECT + archive_id, storage_path, COALESCE(hostname, ''), COALESCE(generated_at, ''), + uploaded_at, uploaded_by, status, error_reason, compressed_size, + COALESCE(version, ''), COALESCE(commit_hash, ''), + COALESCE(platform_os, ''), COALESCE(platform_kernel, ''), + COALESCE(schema_version, ''), manifest_json + FROM archives WHERE archive_id = ?`, archiveID).Scan( + &a.ArchiveID, &storagePath, &a.Hostname, &a.GeneratedAt, + &uploadedAt, &a.UploadedBy, &a.Status, &a.ErrorReason, + &a.StorageBytes, &a.Version, &a.Commit, + &a.Platform.OS, &a.Platform.Kernel, &a.SchemaVersion, + &manifestJSON, + ) + if err != nil { + return nil, false + } + a.UploadedAt, _ = parseTime(uploadedAt) + a.IssueCounts = s.issueCounts(archiveID) + + // Load collectors. + collectors := s.loadCollectors(archiveID) + a.CollectorCount = len(collectors) + + // Load issues with triage findings from disk. + issues := s.loadIssues(archiveID, storagePath) + a.TriageFindingCount = 0 + for _, issue := range issues { + a.TriageFindingCount += len(issue.TriageFindings) + } + + // Deserialize artifact list from manifest_json. + var artifacts []model.ArtifactRecord + if manifestJSON != nil { + _ = json.Unmarshal(manifestJSON, &artifacts) + } + a.ArtifactCount = len(artifacts) + issues = s.enrichIssueEvidence(issues, artifacts) + + return &model.ArchiveDetail{ + Summary: a, + Collectors: collectors, + Issues: issues, + Artifacts: artifacts, + StorageDir: storagePath, + }, true +} + +// issueCounts computes severity counts for an archive from the issues table. 
+func (s *Store) issueCounts(archiveID string) model.IssueCounts { + var counts model.IssueCounts + rows, err := s.db.Query("SELECT severity, COUNT(*) FROM issues WHERE archive_id = ? GROUP BY severity", archiveID) + if err != nil { + return counts + } + defer rows.Close() + for rows.Next() { + var sev string + var n int + if err := rows.Scan(&sev, &n); err != nil { + continue + } + counts.Total += n + switch strings.ToLower(sev) { + case "critical": + counts.Critical = n + case "warning": + counts.Warning = n + case "info": + counts.Info = n + } + } + return counts +} + +// loadCollectors reads all collectors for an archive from the database. +func (s *Store) loadCollectors(archiveID string) []model.CollectorRecord { + rows, err := s.db.Query(`SELECT + collector_id, status, duration_ms, + artifact_count, skipped_count, error_count, facts_json + FROM collectors WHERE archive_id = ? ORDER BY collector_id`, archiveID) + if err != nil { + return nil + } + defer rows.Close() + + var collectors []model.CollectorRecord + for rows.Next() { + var c model.CollectorRecord + var factsJSON *string + if err := rows.Scan( + &c.ID, &c.Status, &c.DurationMS, + &c.ArtifactCount, &c.SkippedCount, &c.ErrorCount, &factsJSON, + ); err != nil { + continue + } + if factsJSON != nil { + _ = json.Unmarshal([]byte(*factsJSON), &c.Facts) + } + collectors = append(collectors, c) + } + return collectors +} + +// loadIssues reads all issues for an archive and joins triage findings from disk. +func (s *Store) loadIssues(archiveID, storagePath string) []model.IssueRecord { + rows, err := s.db.Query(`SELECT + id, collector_id, code, severity, confidence, + category, message, issue_fingerprint, + related_artifacts_json, unresolved_artifacts_json + FROM issues WHERE archive_id = ? 
+ ORDER BY CASE severity + WHEN 'critical' THEN 0 + WHEN 'warning' THEN 1 + WHEN 'info' THEN 2 + ELSE 3 + END, collector_id, id`, archiveID) + if err != nil { + return nil + } + defer rows.Close() + + // Load triage findings lazily. + var triageLoaded bool + var triageByFP map[string][]model.TriageFinding + + var issues []model.IssueRecord + for rows.Next() { + var issue model.IssueRecord + var dbID int64 + var fp, relJSON, unresJSON *string + if err := rows.Scan( + &dbID, &issue.Collector, &issue.Code, &issue.Severity, + &issue.Confidence, &issue.Category, &issue.Message, &fp, + &relJSON, &unresJSON, + ); err != nil { + continue + } + issue.ID = strconv.FormatInt(dbID, 10) + if fp != nil { + issue.Fingerprint = *fp + } + if relJSON != nil { + _ = json.Unmarshal([]byte(*relJSON), &issue.RelatedArtifactPaths) + } + if unresJSON != nil { + _ = json.Unmarshal([]byte(*unresJSON), &issue.UnresolvedArtifactPaths) + } + issue.Source = "manifest" + + // Join triage findings by fingerprint. + if issue.Fingerprint != "" { + if !triageLoaded { + triageByFP = loadTriageMap(storagePath) + triageLoaded = true + } + if matched := triageByFP[issue.Fingerprint]; len(matched) > 0 { + issue.TriageFindings = matched + } + } + + issues = append(issues, issue) + } + return issues +} + +// enrichIssueEvidence adds fallback artifact suggestions for issues that do not +// already have explicit related artifact paths and have no triage match. +func (s *Store) enrichIssueEvidence(issues []model.IssueRecord, artifacts []model.ArtifactRecord) []model.IssueRecord { + if len(issues) == 0 || len(artifacts) == 0 { + return issues + } + + for i := range issues { + issue := &issues[i] + if len(issue.RelatedArtifactPaths) > 0 || len(issue.TriageFindings) > 0 { + continue + } + issue.SuggestedArtifactPaths = suggestIssueArtifacts(*issue, artifacts) + } + return issues +} + +// loadTriageMap reads triage findings from triage/_data/*.json on disk and +// indexes them by fingerprint. 
+func loadTriageMap(storagePath string) map[string][]model.TriageFinding { + result := make(map[string][]model.TriageFinding) + root := filepath.Join(storagePath, "triage", "_data") + entries, err := os.ReadDir(root) + if err != nil { + return result + } + for _, entry := range entries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".json") { + continue + } + data, err := os.ReadFile(filepath.Join(root, entry.Name())) + if err != nil { + continue + } + var envelope model.TriageEnvelope + if err := json.Unmarshal(data, &envelope); err != nil { + continue + } + for _, f := range envelope.Findings { + if f.Fingerprint != "" { + f.Analyzer = envelope.Analyzer + result[f.Fingerprint] = append(result[f.Fingerprint], f) + } + } + } + return result +} + +// parseTime parses an ISO 8601 timestamp. +func parseTime(value string) (time.Time, error) { + return time.Parse("2006-01-02T15:04:05Z", value) +} + +func sanitizeName(value string) string { + if value == "" { + return "archive" + } + var b strings.Builder + b.Grow(len(value)) + for _, r := range value { + switch { + case r >= 'a' && r <= 'z': + b.WriteRune(r) + case r >= 'A' && r <= 'Z': + b.WriteRune(r) + case r >= '0' && r <= '9': + b.WriteRune(r) + case r == '-' || r == '_' || r == '.': + b.WriteRune(r) + default: + b.WriteByte('_') + } + } + name := strings.Trim(b.String(), "._") + if name == "" { + return "archive" + } + return name +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/store_test.go b/customers/vm-troubleshooting-dashboard/internal/store/store_test.go new file mode 100644 index 0000000..0209b66 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/store/store_test.go @@ -0,0 +1,136 @@ +package store + +import ( + "testing" + "time" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" +) + +func TestGetAddsFallbackEvidence(t *testing.T) { + t.Parallel() + + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + 
	defer st.Close()

	// Build a detail whose issues have no explicit related artifacts, so Get
	// must fill SuggestedArtifactPaths via the fallback evidence ranker.
	archiveID := "archive-1"
	archiveDir := st.ArchiveDir(archiveID)
	detail := &model.ArchiveDetail{
		Summary: model.ArchiveSummary{
			ArchiveID:     archiveID,
			SchemaVersion: "3.1.0",
			GeneratedAt:   time.Now().UTC().Format(time.RFC3339),
			Hostname:      "host",
			Platform:      model.ManifestPlatform{OS: "Ubuntu", Kernel: "6.6"},
			UploadedAt:    time.Now().UTC(),
			UploadedBy:    "tester",
			Status:        "ready",
			StorageBytes:  123,
		},
		Collectors: []model.CollectorRecord{{ID: "services", Status: "ok"}, {ID: "journal", Status: "ok"}},
		Issues: []model.IssueRecord{
			{
				Collector:  "services",
				Code:       "svc_failed",
				Severity:   "critical",
				Confidence: "high",
				Category:   "SVC",
				Message:    "1 failed systemd service(s)",
			},
			{
				Collector:  "journal",
				Code:       "critical_log",
				Severity:   "info",
				Confidence: "low",
				Category:   "ERR",
				Message:    "Error/Fail: test",
			},
			{
				// Issue collector ("triage") owns no artifacts; suggestions
				// must come from token overlap with the firewall-tagged ones.
				Collector:  "triage",
				Code:       "firewall_posture",
				Severity:   "info",
				Confidence: "high",
				Category:   "FW",
				Message:    "Firewall inactive (ufw)",
			},
			{
				Collector:  "journal",
				Code:       "custom_journal",
				Severity:   "warning",
				Confidence: "low",
				Category:   "ERR",
				Message:    "Journal fallback needed",
			},
		},
		Artifacts: []model.ArtifactRecord{
			{Path: "services/failed_services.txt", Collector: "services", Tags: []string{"services"}},
			{Path: "services/key_services.txt", Collector: "services", Tags: []string{"services"}},
			{Path: "services/status_nvidia-dcgm.txt", Collector: "services", Tags: []string{"services"}},
			{Path: "logs/journal_errors.ndjson", Collector: "journal", Tags: []string{"journal"}},
			{Path: "logs/dmesg.txt", Collector: "journal", Tags: []string{"journal"}},
			{Path: "network/ufw_status.txt", Collector: "network", Tags: []string{"firewall", "network"}},
			{Path: "network/iptables.txt", Collector: "network", Tags: []string{"firewall", "network"}},
		},
		StorageDir: archiveDir,
	}

	if err := st.Save(detail); err != nil {
		t.Fatalf("Save: %v", err)
	}

	loaded, ok := st.Get(archiveID)
	if !ok {
		t.Fatal("Get returned not found")
	}
	if got := issueSuggestionsByCode(loaded.Issues, "svc_failed"); !containsAll(got, []string{
		"services/failed_services.txt",
		"services/key_services.txt",
		"services/status_nvidia-dcgm.txt",
	}) {
		t.Fatalf("svc_failed suggestions = %v", got)
	}
	if got := issueSuggestionsByCode(loaded.Issues, "critical_log"); !containsAll(got, []string{
		"logs/journal_errors.ndjson",
		"logs/dmesg.txt",
	}) {
		t.Fatalf("critical_log suggestions = %v", got)
	}
	if got := issueSuggestionsByCode(loaded.Issues, "firewall_posture"); !containsAll(got, []string{
		"network/ufw_status.txt",
		"network/iptables.txt",
	}) {
		t.Fatalf("firewall_posture suggestions = %v", got)
	}
	if got := issueSuggestionsByCode(loaded.Issues, "custom_journal"); !containsAll(got, []string{
		"logs/journal_errors.ndjson",
		"logs/dmesg.txt",
	}) {
		t.Fatalf("custom_journal suggestions = %v", got)
	}
}

// issueSuggestionsByCode returns the suggested artifact paths of the first
// issue with the given code, or nil if no issue matches.
func issueSuggestionsByCode(issues []model.IssueRecord, code string) []string {
	for _, issue := range issues {
		if issue.Code == code {
			return issue.SuggestedArtifactPaths
		}
	}
	return nil
}

// containsAll reports whether every element of want is present in have
// (order-insensitive).
func containsAll(have []string, want []string) bool {
	set := make(map[string]struct{}, len(have))
	for _, item := range have {
		set[item] = struct{}{}
	}
	for _, item := range want {
		if _, ok := set[item]; !ok {
			return false
		}
	}
	return true
}
diff --git a/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json b/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json
new file mode 100644
index 0000000..b5b060d
--- /dev/null
+++ b/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json
@@ -0,0 +1,143 @@
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/manifest/v3",
  "title": "VM Diagnostics Manifest",
  "description": "Machine-readable index of a vm-diagnostics archive. 
artifact_index covers collector-produced payload and derived diagnostic files; framework control files (manifest.json, report.ndjson, SUMMARY.txt, metadata.json, transfer_commands.txt, schemas/*) are excluded. Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", + "type": "object", + "required": ["schema_version", "archive_id", "generated_at", "hostname", "artifact_index", "collectors"], + "properties": { + "schema_version": { "type": "string", "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$" }, + "schema_ref": { "type": "string" }, + "archive_id": { "type": "string" }, + "version": { "type": "string" }, + "commit": { "type": "string" }, + "generated_at": { "type": "string", "format": "date-time" }, + "hostname": { "type": "string" }, + "platform": { + "type": "object", + "properties": { + "os": { "type": "string" }, + "kernel": { "type": "string" } + } + }, + "artifact_index": { + "type": "array", + "items": { "$ref": "#/$defs/artifact" } + }, + "collectors": { + "type": "object", + "additionalProperties": { "$ref": "#/$defs/collector_summary" } + } + }, + "$defs": { + "artifact": { + "type": "object", + "required": ["path", "collector", "type", "status", "tags"], + "properties": { + "path": { "type": "string" }, + "collector": { "type": "string" }, + "type": { "type": "string", "enum": ["command", "file", "probe"] }, + "command": { "type": "string" }, + "source": { "type": "string" }, + "exit_code": { "type": "integer" }, + "status": { "type": "string", "enum": ["ok", "skipped", "error"] }, + "ignored_exit": { "type": "boolean" }, + "timed_out": { "type": "boolean" }, + "sanitized": { "type": "boolean" }, + "truncated": { "type": "boolean" }, + "duration_ms": { "type": "integer" }, + "size_bytes": { "type": "integer" }, + "sha256": { "type": "string" }, + "content_type": { "type": "string" }, + "parser_hint": { + "type": "string", + "enum": [ + "lscpu", "free", "df", "lsblk", "lspci", 
"nvidia-smi", "nvidia-smi-csv", "dcgmi", + "journalctl", "dmesg", "systemctl", "dpkg", "rpm", "smartctl", "nvme", "json", + "procfs", "netlink", "sysctl", "ps", "top", "text", "binary", "ss", "mount", + "lsmod", "pip", "docker", "nmcli", "networkctl", "resolvectl", "bridge", "netplan", + "iptables", "nft", "ufw", "firewall-cmd", "ibstat", "ibstatus", "ibv_devinfo", "rdma", "apt-mark", "sh", + "hostname", "date", "uptime", "uname", "csv" + ] + }, + "tags": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", + "network", "firewall", "docker", "docker-security", "services", "journal", "oom", + "packages", "storage", "infiniband", "processes", "config", "triage" + ] + } + } + } + }, + "collector_summary": { + "type": "object", + "required": ["status", "duration_ms"], + "properties": { + "status": { "type": "string", "enum": ["ok", "partial", "failed", "skipped"] }, + "duration_ms": { "type": "integer" }, + "artifact_count": { "type": "integer" }, + "skipped_count": { "type": "integer" }, + "error_count": { "type": "integer" }, + "facts": { "type": "object" }, + "issues": { "type": "array", "items": { "$ref": "#/$defs/issue" } }, + "skip_reasons": { "type": "array", "items": { "$ref": "#/$defs/skip_reason" } }, + "structured_errors": { "type": "array", "items": { "$ref": "#/$defs/structured_error" } } + } + }, + "issue": { + "type": "object", + "required": ["code", "severity", "confidence", "category", "message"], + "properties": { + "code": { + "type": "string", + "enum": [ + "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", + "gpu_unreachable", + "xid", "sxid", "firewall_posture", "critical_log", "data_quality" + ] + }, + "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, + "confidence": { "type": "string", "enum": ["high", "low"] }, + "category": { "type": "string" }, + "message": { "type": "string" }, + 
"issue_fingerprint": { "type": "string", "pattern": "^[0-9a-f]{32}$" }, + "related_artifact_paths": { + "type": "array", + "items": { "type": "string" } + }, + "unresolved_artifact_paths": { + "type": "array", + "items": { "type": "string" } + } + } + }, + "skip_reason": { + "type": "object", + "required": ["reason", "detail"], + "properties": { + "reason": { + "type": "string", + "enum": ["disabled_by_flag", "command_unavailable", "source_unavailable", "not_applicable", "permission_or_access", "daemon_unavailable"] + }, + "detail": { "type": "string" }, + "artifact_path": { "type": "string" } + } + }, + "structured_error": { + "type": "object", + "required": ["code", "message"], + "properties": { + "code": { + "type": "string", + "enum": ["command_failed", "command_timed_out", "probe_failed", "artifact_validation_failed", "artifact_reserve_failed", "artifact_write_failed", "enumeration_failed"] + }, + "message": { "type": "string" }, + "artifact_path": { "type": "string" } + } + } + } +} diff --git a/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json b/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json new file mode 100644 index 0000000..4786d84 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json @@ -0,0 +1,84 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/report-record/v3", + "title": "VM Diagnostics Report Record", + "description": "Schema for each NDJSON line in report.ndjson. Discriminated by 'type' field. Wire rules: UTF-8 encoding, each line is one complete JSON object followed by \\n (0x0A), optionally preceded by \\r (0x0D). JSON texts must not contain literal newlines or carriage returns. Parsers may silently ignore empty lines. 
Parser contract: unknown fields must be ignored; field types never change within a major version; new fields are additive only.", + "type": "object", + "required": ["schema_version", "type", "ts", "collector"], + "properties": { + "schema_version": { "type": "string" }, + "type": { "type": "string", "enum": ["artifact", "issue", "fact", "collector_summary"] }, + "ts": { "type": "string", "format": "date-time" }, + "collector": { "type": "string" } + }, + "oneOf": [ + { + "properties": { + "type": { "const": "artifact" }, + "path": { "type": "string" }, + "command": { "type": "string" }, + "exit_code": { "type": "integer" }, + "status": { "type": "string", "enum": ["ok", "skipped", "error"] }, + "tags": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", + "network", "firewall", "docker", "docker-security", "services", "journal", "oom", + "packages", "storage", "infiniband", "processes", "config", "triage" + ] + } + }, + "duration_ms": { "type": "integer" } + }, + "required": ["type", "path", "exit_code", "status", "duration_ms"] + }, + { + "properties": { + "type": { "const": "issue" }, + "code": { + "type": "string", + "enum": [ + "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", + "gpu_unreachable", + "xid", "sxid", "firewall_posture", "critical_log", "data_quality" + ] + }, + "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, + "confidence": { "type": "string", "enum": ["high", "low"] }, + "category": { "type": "string" }, + "message": { "type": "string" }, + "issue_fingerprint": { "type": "string", "pattern": "^[0-9a-f]{32}$" }, + "related_artifact_paths": { + "type": "array", + "items": { "type": "string" } + }, + "unresolved_artifact_paths": { + "type": "array", + "items": { "type": "string" } + } + }, + "required": ["type", "code", "severity", "confidence", "category", "message"] + }, + { + 
"properties": { + "type": { "const": "fact" }, + "key": { "type": "string" }, + "value": {} + }, + "required": ["type", "key"] + }, + { + "properties": { + "type": { "const": "collector_summary" }, + "status": { "type": "string", "enum": ["ok", "partial", "failed", "skipped"] }, + "artifact_count": { "type": "integer" }, + "skip_count": { "type": "integer" }, + "error_count": { "type": "integer" }, + "duration_ms": { "type": "integer" } + }, + "required": ["type", "status"] + } + ] +} diff --git a/customers/vm-troubleshooting-dashboard/schemas/triage-result.schema.json b/customers/vm-troubleshooting-dashboard/schemas/triage-result.schema.json new file mode 100644 index 0000000..e1e657d --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/schemas/triage-result.schema.json @@ -0,0 +1,48 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://hyperstack.cloud/schemas/vm-diagnostics/triage-result/v3", + "title": "VM Diagnostics Triage Result", + "description": "Schema for triage/_data/*.json files. Each file is a per-analyzer result envelope containing classified findings and typed facts. The finding object shape is closed; facts remain open for additive analyzer growth.", + "type": "object", + "required": ["kind", "schema_version", "archive_id", "analyzer", "findings"], + "additionalProperties": false, + "properties": { + "kind": { "type": "string", "const": "triage_result" }, + "schema_version": { "type": "string", "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$" }, + "archive_id": { "type": "string" }, + "analyzer": { "type": "string" }, + "findings": { + "type": "array", + "items": { "$ref": "#/$defs/finding" } + }, + "facts": { + "type": "object", + "description": "Typed analyzer facts. Integer-keyed facts are JSON numbers; 'unavailable' maps to null. Open for additive growth." 
+ } + }, + "$defs": { + "finding": { + "type": "object", + "required": ["code", "severity", "confidence", "category", "title", "description"], + "additionalProperties": false, + "properties": { + "code": { + "type": "string", + "enum": ["xid", "sxid", "firewall_posture", "critical_log", "data_quality", "service_state", "disk_capacity", "memory_pressure"] + }, + "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, + "confidence": { "type": "string", "enum": ["high", "low"] }, + "category": { "type": "string" }, + "title": { "type": "string" }, + "description": { "type": "string" }, + "action": { "type": "string" }, + "evidence": { "type": "array", "items": { "type": "string" } }, + "source_artifacts": { + "type": "array", + "items": { "type": "string" } + }, + "issue_fingerprint": { "type": "string", "pattern": "^[0-9a-f]{32}$" } + } + } + } +} diff --git a/customers/vm-troubleshooting/.gitignore b/customers/vm-troubleshooting/.gitignore index c15005a..bd33b85 100644 --- a/customers/vm-troubleshooting/.gitignore +++ b/customers/vm-troubleshooting/.gitignore @@ -1,3 +1,17 @@ +# Local binary / output dir bin/ gather-info + +# Go test & coverage +*.test +*.out +coverage.out +coverage.html +*.coverprofile + +# Python (if any helper scripts) __pycache__/ +*.py[cod] +.Python +venv/ +.venv/ diff --git a/customers/vm-troubleshooting/AGENTS.md b/customers/vm-troubleshooting/AGENTS.md index 89df119..449c0b0 100644 --- a/customers/vm-troubleshooting/AGENTS.md +++ b/customers/vm-troubleshooting/AGENTS.md @@ -5,6 +5,16 @@ This file applies to everything under `customers/vm-troubleshooting/`. See `customers/vm-troubleshooting/CODEMAP.md` for the current code layout, runtime flow, collector map, and artifact structure. +See `customers/vm-troubleshooting/ARCHITECTURE.md` for a longer reference (execution pipeline, core types, machine-readable outputs, collection modes). 
+ +## Monorepo boundary +- This project **owns** the `gather-info` **producer** contract: archive layout, schemas, manifest/report/triage outputs, and customer-safe collection behavior. +- **`customers/vm-troubleshooting-dashboard/`** consumes those archives. Breaking or major contract changes need coordinated updates (including `SCHEMA-COMPATIBILITY.md` and mirrored `schemas/` when applicable). + +## Simplicity (KISS) +- Prefer small, explicit changes over new frameworks or cross-collector abstractions unless they remove real duplication. +- Preserve support value on partially broken systems and keep forward compatibility rules in `SCHEMA-COMPATIBILITY.md` authoritative. + ## Project goals - Produce a single customer-distributable `gather-info` binary. - Favor reliability, readability, and maintainability over cleverness. @@ -49,6 +59,9 @@ See `customers/vm-troubleshooting/CODEMAP.md` for the current code layout, runti - Shell-out artifacts should retain command metadata headers. ## Structured data layer +For backward-compatibility rules, schema versioning, and the full list of files +to update on a schema change, see `SCHEMA-COMPATIBILITY.md`. + The archive contains three complementary output files for different consumers: - `metadata.json` — stable backward-compatible summary. **Do not change field types or remove fields.** - `manifest.json` — rich machine-readable index with per-artifact records, SHA-256 checksums, typed facts, and tags. This is the primary file for automated parsing tools. Schema: `schemas/manifest.schema.json`. @@ -58,9 +71,9 @@ The archive contains three complementary output files for different consumers: Rules for the structured layer: - Schema files live in `schemas/` in-repo and are included in every archive. -- `schema_version` follows semver: minor adds fields, major changes types. Current: `3.0.0`. +- `schema_version` follows semver: minor adds fields, major changes types. Current: `3.2.0`. See `SCHEMA-COMPATIBILITY.md` for bump rules. 
- Issues and findings are identity-bearing records: keep `code`, `severity`, `confidence`, and deterministic fingerprints populated. -- Facts with integer keys (`cpu_cores`, `gpu_count`, `memory_total`, `oom_event_count`, `xid_classified_count`, `critical_event_count`, `container_count`, `vllm_container_count`, `failed_service_count`) are typed as integers in manifest/report/triage. `"unavailable"` maps to `null`. +- Facts with integer keys (`cpu_cores`, `gpu_count`, `gpu_unreachable_count`, `gpu_total_count`, `memory_total`, `oom_event_count`, `xid_classified_count`, `critical_event_count`, `container_count`, `vllm_container_count`, `failed_service_count`) are typed as integers in manifest/report/triage. `"unavailable"` maps to `null`. - All other facts remain strings. ## Triage ownership and policy boundaries @@ -116,4 +129,4 @@ When changing artifact tags, parser hints, or the structured output format, veri - Do not silently reduce support value for partially broken systems. - Do not change `metadata.json` field types — it is the stable backward-compatible contract. - New tags or parser hints must be added to `ValidTags`/`ValidParserHints` in `collector.go` and to the JSON Schema files in `schemas/`. -- Bump `schema_version` in `manifest.go` and `report.go` when adding fields (minor) or changing types (major). +- Bump `schema_version` per `SCHEMA-COMPATIBILITY.md` when adding fields (minor) or changing types (major). Sites: `internal/runner/runner.go`, `internal/output/report.go`, and `internal/triage/triage.go` (only if triage schema changes). diff --git a/customers/vm-troubleshooting/ARCHITECTURE.md b/customers/vm-troubleshooting/ARCHITECTURE.md new file mode 100644 index 0000000..ebbbaca --- /dev/null +++ b/customers/vm-troubleshooting/ARCHITECTURE.md @@ -0,0 +1,235 @@ +# gather-info — architecture (extended reference) + +This document is a **narrative deep-dive** into the diagnostics collector under `customers/vm-troubleshooting/`. 
+ +For a short, skimmable map (entrypoints, package ownership, collector table, safe-change guide), use **[CODEMAP.md](./CODEMAP.md)** in this directory. + +--- + +## Overview + +`gather-info` is a static Linux binary that collects VM diagnostics into a self-contained `.tar.gz` archive. It is designed for customer and support engineer use on machines we do not control and cannot access directly. + +The binary runs 11 domain collectors, performs automated triage analysis, and produces structured machine-readable output alongside human-readable summaries. + +## Execution Pipeline + +``` +main.go + | + v +cli/root.go Cobra flags, signal handling, context creation + | + v +runner/runner.go Orchestration hub + | + |-- privilege/ Detect root/sudo availability + |-- platform/ Detect distro, GPU, DCGM, WSL + |-- install/ Optional interactive DCGM installation + | + |-- collector/ 11 domain collectors (sequential, skip-aware) + | |-- system CPU, memory, disk, processes, hardware + | |-- network Links, routes, neighbors, firewall rules, netplan + | |-- nvidia nvidia-smi, dmesg Xid extraction, driver params + | |-- dcgm dcgmi discovery, health, stats, optional level-2 diag + | |-- docker docker info/ps, sanitized container inspect + | |-- services Batch D-Bus service status, failed services, fabricmanager + | |-- journal dmesg, journalctl (kernel, errors, OOM), optional full journal + | |-- packages dpkg/rpm nvidia packages, pip, held packages + | |-- additional Limits, sysctl, LVM, sensors, mounts + | |-- storage nvme, smartctl + | '-- infiniband ibstat, rdma tools + | + |-- triage/ Post-collection analysis (after collectors, before output) + | |-- xid NVIDIA Xid/SXid classification (catalog + local policy overrides) + | |-- xidcatalog Local Xid catalog/parsing boundary (neutral data) + | |-- firewall Firewall posture detection (iptables/ufw/nft/firewalld) + | '-- critical Critical log extraction (panic, HW error, fallen off bus, timeout) + | + |-- output/ Generate 
structured output files + | |-- metadata.json Lightweight execution summary + | |-- SUMMARY.txt Human-readable report + | |-- manifest.json Full machine-readable archive index + | |-- report.ndjson Streaming event log (1 JSON line per record) + | '-- schemas/ Embedded JSON schemas (self-describing archive) + | + |-- transfer/ Floating IP detection, SCP command generation + | + '-- archive tar.gz the work directory +``` + +## Package Map + +| Package | Purpose | Depends on | +|---------|---------|------------| +| `cmd/gather-info` | Binary entrypoint | cli, config | +| `internal/cli` | Cobra command, flags, signal handling | config, executor, runner, ui | +| `internal/config` | Config struct, modes, timeouts, exit codes, build metadata | stdlib only | +| `internal/runner` | Orchestration: detect → collect → triage → output → archive | all internal packages | +| `internal/collector` | Collector interface, registry, core types (Issue, Severity, ArtifactRecord) | executor, output, ui, platform, probe, sanitize | +| `internal/triage` | Post-collection analysis (Xid, firewall, critical logs) | collector (types), identity, output (writer), ui | +| `internal/triage/xidcatalog` | Neutral Xid catalog lookup + kernel line parsers | stdlib only | +| `internal/identity` | Stable issue fingerprint helper (normalized tuple hashing) | stdlib only | +| `internal/output` | Writer, manifest, report, summary, archive creation | executor, schemas | +| `internal/transfer` | IP discovery, floating IP detection, SCP commands | netlink | +| `internal/executor` | Subprocess execution: timeouts, process groups, capture limits | stdlib only | +| `internal/ui` | TTY-aware terminal output (pterm), spinners, prompts | pterm, isatty, runewidth | +| `internal/probe` | Go-native probes: systemd D-Bus, procfs, netlink, GHW | go-systemd, ghw, procfs, netlink | +| `internal/platform` | Distro, GPU, DCGM, WSL detection | executor | +| `internal/sanitize` | Redaction of secrets from configs, process 
lists, Docker inspect | stdlib only | +| `internal/privilege` | Root/sudo detection and interactive acquisition | stdlib only | +| `internal/install` | Optional DCGM installation and daemon enablement (Ubuntu 22.04/24.04) | config, executor, platform, ui | + +## Core Types + +### `collector.Severity` (int enum) + +``` +SeverityUnspecified = 0 // sentinel, catches uninitialized Issue{} +SeverityInfo = 1 +SeverityWarning = 2 +SeverityCritical = 3 +``` + +Explicit integer values (not iota). MarshalJSON/UnmarshalJSON serialize as strings (`"info"`, `"warning"`, `"critical"`). + +### `collector.Issue` + +```go +Code IssueCode +Severity Severity +Confidence Confidence // "high" or "low" +Category string // "GPU", "SVC", "MEM", "DISK", "FW", etc. +Message string +Fingerprint string +RelatedArtifactPaths []string +UnresolvedArtifactPaths []string +``` + +### `collector.ArtifactRecord` + +Every collected file has structured metadata: path, type (`command`/`file`/`probe`), command string, exit code, status (`ok`/`skipped`/`error`), timing, SHA-256, content type, parser hint, and semantic tags. + +### `collector.CollectorResult` + +Aggregated per-collector output: ID, name, issues, facts (`map[string]string`), artifacts, skipped reasons, errors, duration. + +### `triage.Finding` + +Richer than Issue: includes `code`, `severity`, `confidence`, title/description/action, evidence, source artifact paths, and issue fingerprint. Findings are converted to synthetic issues by the runner bridge. + +## Machine-Readable Output + +### `manifest.json` — Archive Index + +The primary machine-readable file. 
Contains: + +- **`artifact_index[]`** — flat list of every collector-produced file with SHA-256, size, parser hint, tags +- **`collectors{}`** — per-collector summary with status, duration, facts (typed), issues, skipped reasons, errors +- issue records include `code`, `severity`, `confidence`, `message`, `issue_fingerprint`, and path linkage (`related_artifact_paths`, `unresolved_artifact_paths`) +- **`platform{}`** — OS and kernel +- Schema version, archive ID, tool version, generation timestamp + +Control files (manifest.json itself, report.ndjson, SUMMARY.txt, metadata.json, schemas/) are excluded from `artifact_index`. + +### `report.ndjson` — Event Stream + +Same data as manifest in streaming NDJSON format. Four record types discriminated by `type`: + +- `artifact` — file was collected +- `issue` — problem detected +- `fact` — key-value observation +- `collector_summary` — collector finished + +Order: per collector (registration order) → artifacts → issues → facts (alphabetical) → summary. + +Wire rules (per NDJSON spec v1.0.0): UTF-8, `\n` delimited, no internal newlines, parsers may ignore empty lines. + +### `triage/_data/*.json` — Analysis Detail + +Three JSON files with rich finding detail: + +- `gpu_health.json` — Xid/SXid findings with `code`, `confidence`, fingerprint, source artifacts, and typed facts +- `firewall_posture.json` — posture classification, per-tool results +- `critical_events.json` — critical log findings with explicit pattern metadata and deterministic fingerprints + +### `metadata.json` — Execution Summary + +Lightweight backward-compatible summary: version, flags, per-collector counts (artifacts, skipped, errors, duration). Does not duplicate manifest detail. + +### `SUMMARY.txt` — Human Report + +Text report with issues grouped by severity (CRITICAL → WARNING → INFO), system/hardware/GPU summaries, collector status table, and archive contents listing. 
+Only `confidence=high` issues are shown in `SUMMARY.txt`; low-confidence issues remain available in machine-readable outputs. + +## Controlled Vocabularies + +**Tags** (on artifacts): identity, cpu, memory, disk, hardware, gpu, gpu-errors, gpu-health, network, firewall, docker, docker-security, services, journal, oom, packages, storage, infiniband, processes, config, triage + +**Parser hints** (on artifacts): ~40 values identifying the tool/format that produced the content (e.g., `nvidia-smi`, `dmesg`, `systemctl`, `json`, `text`) + +**Issue categories**: GPU, SVC, MEM, DISK, FW, KERN, HW, TIMEOUT, ERR + +## Collection Modes + +`--mode=safe|quick|standard|deep` + +| Collector | safe | quick | standard | deep | +|-----------|------|-------|----------|------| +| System | run | run | run | run | +| Network | run | run | run | run | +| NVIDIA | skip | run | run | run | +| DCGM | skip | skip | run | run + level-2 diag | +| Docker | skip | skip | run | run + container logs | +| Services | run | run | run | run | +| Journal | skip | skip | run | run + full journal | +| Packages | run | skip | run | run | +| Additional | run | run | run | run | +| Storage | run | run | run | run | +| InfiniBand | run | run | run | run | + +Explicit CLI flags (`--skip-*`, `--include-*`) always override mode defaults. + +## Key Invariants + +1. **Path reservation** — all artifact paths globally unique via `Writer.ReservePath()` +2. **Atomic writes** — all files written via temp → rename (no partial files) +3. **Process isolation** — all subprocesses in process groups for clean cleanup +4. **Context propagation** — cancellation flows through all layers +5. **TTY awareness** — stderr for progress, stdout reserved for archive path +6. **Fail per section** — collector errors don't stop other collectors +7. 
**Triage timing** — runs after collection, before summary generation + +## External Dependencies + +| Library | Purpose | +|---------|---------| +| `github.com/spf13/cobra` | CLI framework | +| `github.com/pterm/pterm` | Terminal UI (spinners, styled output) | +| `github.com/coreos/go-systemd/v22/dbus` | Systemd D-Bus for batch service status | +| `github.com/jaypipes/ghw` | Hardware detection (CPU, memory, PCI) | +| `github.com/prometheus/procfs` | /proc parsing | +| `github.com/vishvananda/netlink` | Netlink route/interface queries | +| `github.com/mattn/go-isatty` | TTY detection | +| `golang.org/x/sys` | Unix syscalls (disk space, process groups) | + +## Build + +From `customers/vm-troubleshooting/`: + +```bash +CGO_ENABLED=0 go build -trimpath -o gather-info ./cmd/gather-info +``` + +For release builds, set `-ldflags` with version/commit/date as in your release process (see `internal/config` for `-X` variable names). + +Static binary, no CGO. Target: common Ubuntu LTS on amd64; see `AGENTS.md` for portability expectations. + +## Exit Codes + +| Code | Meaning | +|------|---------| +| 0 | Archive created, no errors | +| 1 | Fatal error, no archive produced | +| 2 | Archive created, some collectors had errors | +| 3 | Interrupted (SIGINT/SIGTERM), partial work directory preserved | diff --git a/customers/vm-troubleshooting/CLAUDE.md b/customers/vm-troubleshooting/CLAUDE.md new file mode 100644 index 0000000..43c994c --- /dev/null +++ b/customers/vm-troubleshooting/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/customers/vm-troubleshooting/CODEMAP.md b/customers/vm-troubleshooting/CODEMAP.md index a0ae39d..9111fd0 100644 --- a/customers/vm-troubleshooting/CODEMAP.md +++ b/customers/vm-troubleshooting/CODEMAP.md @@ -12,6 +12,7 @@ This map is intentionally short, skimmable, and current. It should answer: Keep this file updated in the same change as architecture or collector changes. 
## Read This First +- Extended narrative (types, modes, outputs): `customers/vm-troubleshooting/ARCHITECTURE.md` - CLI entrypoint: `customers/vm-troubleshooting/cmd/gather-info/main.go` - Command wiring: `customers/vm-troubleshooting/internal/cli/root.go` - Runtime orchestration: `customers/vm-troubleshooting/internal/runner/runner.go` diff --git a/customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md b/customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md new file mode 100644 index 0000000..eceab93 --- /dev/null +++ b/customers/vm-troubleshooting/SCHEMA-COMPATIBILITY.md @@ -0,0 +1,104 @@ +# Schema Compatibility + +## Why this exists +The collector emits structured data (`manifest.json`, `report.ndjson`, +`triage/_data/*.json`) consumed by the dashboard and downstream tools. Old +archives must remain viewable forever. This file is the rule of the road for +evolving the schemas without breaking historical data. + +Applies to: `customers/vm-troubleshooting/schemas/*.schema.json` and the matching +mirror files in `customers/vm-troubleshooting-dashboard/schemas/`. + +## Versioning contract +- Schemas use semver: `MAJOR.MINOR.PATCH`. Pattern: `^[0-9]+\.[0-9]+\.[0-9]+$`. +- Current major: `v3` (encoded in each schema's `$id`, e.g. `.../manifest/v3`). +- The dashboard accepts any archive whose major matches `SupportedSchemaMajor` + (`internal/model/types.go`). Minor and patch are ignored by the version gate. +- Schemas declare: *"unknown fields must be ignored; field types never change + within a major version; new fields are additive only."* Treat that as binding. + +## Bump rules +| Change | Bump | Notes | +|----------------------------------------------|-------|------------------------------------------------------------------| +| Add a new field | minor | Optional by default. Old archives still validate. | +| Add a new enum value (issue code, tag, etc.) | minor | Old archives won't emit it. Consumers treat codes as opaque. 
| +| Add a new collector / analyzer / artifact | minor | Pure addition. | +| Documentation / description text | patch | No structural change. | +| Rename a field | major | Always breaking. Add new + deprecate old instead if possible. | +| Remove a field | major | Always breaking. | +| Change a field's type | major | Always breaking. | +| Tighten a constraint (optional → required) | major | Old archives may fail validation. | +| Reuse an enum value with new semantics | major | Silent corruption of historical archives. Never do this quietly. | +| Reuse a fact key with new semantics | major | Same as above. | + +## Forbidden without a major bump +- Renaming or removing any field, code, tag, parser hint, or enum value. +- Changing the meaning of an existing field (`gpu_count`, `archive_id`, + fingerprints, severity, confidence, etc.). +- Changing a fact's type (e.g. string → integer, integer → string). +- Tightening JSON Schema constraints (adding `required`, narrowing `pattern`, + shrinking enums). +- Reusing a deprecated identifier for a different concept. + +## Where to update for a minor bump +When adding a new enum value or field, every site below must be updated in the +same change so the contract test passes: + +1. **Go source** (`customers/vm-troubleshooting/`): + - Add the constant or field in the relevant collector / triage package. + - For new issue codes: add to `internal/collector/collector.go` + `CollectorIssueCodes` map. + - For new tags / parser hints: add to `ValidTags` / `ValidParserHints` in + `internal/collector/collector.go`. +2. **Schema files (collector authoritative):** + - `customers/vm-troubleshooting/schemas/manifest.schema.json` + - `customers/vm-troubleshooting/schemas/report-record.schema.json` + - `customers/vm-troubleshooting/schemas/triage-result.schema.json` + (only if triage is changed) +3. 
**Schema files (dashboard mirror — must match collector exactly):** + - `customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json` + - `customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json` + - `customers/vm-troubleshooting-dashboard/schemas/triage-result.schema.json` +4. **Version constants:** + - `internal/runner/runner.go` — manifest `SchemaVersion` in `manifestMeta` + - `internal/output/report.go` — `reportSchemaVersion` + - `internal/triage/triage.go` — `triageSchemaVersion` (only if triage + schema is touched) + +## Major bumps require a migration plan +Bumping major (e.g. `3.x.y → 4.0.0`) is a breaking change. Before bumping major: +- Document why a minor bump cannot achieve the goal. +- Decide the dashboard's strategy: dual-major support, one-time re-ingest, or + drop old archives. Today the dashboard only accepts one major at a time + (`SupportedSchemaMajor` is exact-match — extend the check first if you need + overlap). +- Bump every `$id` (e.g. `/v3` → `/v4`) and every version constant in the same + change. + +## Verification +The contract test `TestSchemaIssueCodeEnumsMatchGoConstants` +(`internal/output/contract_test.go`) bidirectionally validates Go constants +against schema enums. It must pass before merging any schema change: + +```bash +cd customers/vm-troubleshooting +go test -run TestSchemaIssueCodeEnumsMatchGoConstants ./internal/output/ +go test ./... +``` + +For a dashboard-side change, also run dashboard tests: + +```bash +cd customers/vm-troubleshooting-dashboard +go test ./... +``` + +## Quick checklist for any schema-touching PR +- [ ] Change is purely additive (or major bump is justified in PR description). +- [ ] Both collector and dashboard mirror schema files updated identically. +- [ ] Go constants / `Valid*` lists updated. +- [ ] Version constants bumped in `runner.go` and `report.go` (and + `triage.go` if applicable). +- [ ] `TestSchemaIssueCodeEnumsMatchGoConstants` passes. 
+- [ ] If a new field, fact, or enum value: confirmed old archives still + ingest into the dashboard without error. diff --git a/customers/vm-troubleshooting/internal/collector/collector.go b/customers/vm-troubleshooting/internal/collector/collector.go index 7a22a0b..fdc8410 100644 --- a/customers/vm-troubleshooting/internal/collector/collector.go +++ b/customers/vm-troubleshooting/internal/collector/collector.go @@ -220,12 +220,17 @@ func (r *CollectorResult) SetFact(key, value string) { } } func (r *CollectorResult) AddIssue(code IssueCode, sev Severity, confidence Confidence, cat, msg string, fingerprintParts ...string) { + r.AddIssueWithArtifacts(code, sev, confidence, cat, msg, nil, fingerprintParts...) +} + +func (r *CollectorResult) AddIssueWithArtifacts(code IssueCode, sev Severity, confidence Confidence, cat, msg string, artifactPaths []string, fingerprintParts ...string) { issue := Issue{ - Code: code, - Severity: sev, - Confidence: confidence, - Category: cat, - Message: msg, + Code: code, + Severity: sev, + Confidence: confidence, + Category: cat, + Message: msg, + RelatedArtifactPaths: artifactPaths, } if len(fingerprintParts) > 0 { issue.Fingerprint = identity.Fingerprint(fingerprintParts...) @@ -241,6 +246,7 @@ const ( IssueDiskCritical IssueCode = "disk_critical" IssueSvcFailed IssueCode = "svc_failed" IssueSvcFabricmanagerBenign IssueCode = "svc_fabricmanager_benign" + IssueGPUUnreachable IssueCode = "gpu_unreachable" ) // CollectorIssueCodes enumerates collector-owned issue codes. 
@@ -250,6 +256,7 @@ var CollectorIssueCodes = map[string]bool{ string(IssueDiskCritical): true, string(IssueSvcFailed): true, string(IssueSvcFabricmanagerBenign): true, + string(IssueGPUUnreachable): true, } type Confidence string diff --git a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index d4e78b6..d7d690a 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -268,7 +268,7 @@ func TestCountOOMIncidents(t *testing.T) { "Apr 04 12:10:01 host kernel: node invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", "Apr 04 12:10:02 host kernel: Memory cgroup out of memory: Killed process 222 (python3) total-vm:5678kB", } - if got := countOOMIncidents(lines); got != 2 { + if got := countOOMIncidents(lines, false); got != 2 { t.Fatalf("expected 2 incidents from timestamped OOM bursts, got %d", got) } @@ -276,11 +276,22 @@ func TestCountOOMIncidents(t *testing.T) { "kernel: invoked oom-killer: gfp_mask=0x100cca", "kernel: invoked oom-killer: gfp_mask=0x100cca", } - if got := countOOMIncidents(fallbackOnly); got != 2 { + if got := countOOMIncidents(fallbackOnly, false); got != 2 { t.Fatalf("expected fallback count=2 from invoked-only lines, got %d", got) } } +func TestCountOOMIncidentsShortISOPreservesYear(t *testing.T) { + t.Parallel() + lines := []string{ + "2025-12-31T23:59:50Z host kernel: invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", + "2026-01-01T00:00:10Z host kernel: invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", + } + if got := countOOMIncidents(lines, true); got != 2 { + t.Fatalf("expected 2 incidents across year boundary with short-iso timestamps, got %d", got) + } +} + func TestJournalCollectorOOMTimeoutSetsUnavailable(t *testing.T) { t.Parallel() @@ -409,6 +420,12 @@ func 
TestNvidiaCollectorParsesMixedRowsOnExitZero(t *testing.T) { if got := res.Facts["driver_version"]; got != "575.57.08" { t.Fatalf("expected driver_version from parsed row, got %q", got) } + if got := res.Facts["gpu_unreachable_count"]; got != "0" { + t.Fatalf("expected gpu_unreachable_count=0, got %q", got) + } + if got := res.Facts["gpu_total_count"]; got != "2" { + t.Fatalf("expected gpu_total_count=2, got %q", got) + } } func TestNvidiaCollectorParsesMixedRowsOnNonZeroExit(t *testing.T) { @@ -431,6 +448,12 @@ func TestNvidiaCollectorParsesMixedRowsOnNonZeroExit(t *testing.T) { if got := res.Facts["gpu_count"]; got != "2" { t.Fatalf("expected gpu_count=2 from mixed rows on non-zero exit, got %q", got) } + if got := res.Facts["gpu_unreachable_count"]; got != "0" { + t.Fatalf("expected gpu_unreachable_count=0, got %q", got) + } + if got := res.Facts["gpu_total_count"]; got != "2" { + t.Fatalf("expected gpu_total_count=2, got %q", got) + } } func TestNvidiaCollectorLeavesFactsUnavailableOnTimedOutCSV(t *testing.T) { @@ -449,6 +472,12 @@ func TestNvidiaCollectorLeavesFactsUnavailableOnTimedOutCSV(t *testing.T) { if got := res.Facts["gpu_count"]; got != "unavailable" { t.Fatalf("expected gpu_count=unavailable on timeout, got %q", got) } + if got := res.Facts["gpu_unreachable_count"]; got != "unavailable" { + t.Fatalf("expected gpu_unreachable_count=unavailable on timeout, got %q", got) + } + if got := res.Facts["gpu_total_count"]; got != "unavailable" { + t.Fatalf("expected gpu_total_count=unavailable on timeout, got %q", got) + } } func TestNvidiaCollectorLeavesFactsUnavailableWhenNoValidCSVRows(t *testing.T) { @@ -462,8 +491,155 @@ func TestNvidiaCollectorLeavesFactsUnavailableWhenNoValidCSVRows(t *testing.T) { if err != nil { t.Fatalf("Collect failed: %v", err) } - if got := res.Facts["gpu_count"]; got != "unavailable" { - t.Fatalf("expected gpu_count=unavailable, got %q", got) + if got := res.Facts["gpu_count"]; got != "0" { + t.Fatalf("expected gpu_count=0, got 
%q", got) + } + if got := res.Facts["gpu_unreachable_count"]; got != "0" { + t.Fatalf("expected gpu_unreachable_count=0, got %q", got) + } + if got := res.Facts["gpu_total_count"]; got != "0" { + t.Fatalf("expected gpu_total_count=0, got %q", got) + } + if len(res.Issues) != 0 { + t.Fatalf("expected no issues when no unreachable GPUs detected, got %+v", res.Issues) + } +} + +func TestDetectUnreachableGPUDetails(t *testing.T) { + t.Parallel() + stdout := strings.Join([]string{ + "Unable to determine the device handle for GPU1: 0000:00:08.0: Unknown Error", + "Unable to determine the device handle for GPU3: 0000:00:0a.0: Unknown Error", + "Unable to determine the device handle for GPU3: 0000:00:0A.0: Unknown Error", + "Unable to determine the device handle for GPU9: Unknown Error", + }, "\n") + stderr := strings.Join([]string{ + "Unable to determine the device handle for GPU1: 0000:00:08.0: Unknown Error", + "some unrelated line", + }, "\n") + + bdfs, unknownCount := detectUnreachableGPUDetails(stdout, stderr) + if unknownCount != 1 { + t.Fatalf("expected unknownCount=1, got %d", unknownCount) + } + if len(bdfs) != 2 || bdfs[0] != "0000:00:08.0" || bdfs[1] != "0000:00:0a.0" { + t.Fatalf("unexpected sorted bdfs: %v", bdfs) + } +} + +func TestDetectUnreachableGPUDetails_StderrOnly(t *testing.T) { + t.Parallel() + stderr := strings.Join([]string{ + "Unable to determine the device handle for GPU1: 0000:00:08.0: Unknown Error", + "Unable to determine the device handle for GPU5: Unknown Error", + }, "\n") + + bdfs, unknownCount := detectUnreachableGPUDetails("", stderr) + if unknownCount != 1 { + t.Fatalf("expected unknownCount=1, got %d", unknownCount) + } + if len(bdfs) != 1 || bdfs[0] != "0000:00:08.0" { + t.Fatalf("unexpected sorted bdfs: %v", bdfs) + } +} + +func TestDetectUnreachableGPUDetails_DedupesSameGPUAcrossStreams(t *testing.T) { + t.Parallel() + // Same GPU index, no BDF, with slightly different trailing text across + // stdout and stderr — must not inflate 
the unknown count. + stdout := "Unable to determine the device handle for GPU5: Unknown Error." + stderr := "Unable to determine the device handle for GPU5: Unknown Error" + + bdfs, unknownCount := detectUnreachableGPUDetails(stdout, stderr) + if unknownCount != 1 { + t.Fatalf("expected unknownCount=1 (same GPU, different wording), got %d", unknownCount) + } + if len(bdfs) != 0 { + t.Fatalf("expected no BDFs, got %v", bdfs) + } +} + +func TestNvidiaCollectorEmitsGPUUnreachableIssueWithBDFFingerprint(t *testing.T) { + t.Parallel() + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte("NVIDIA RTX PRO 6000, 97871 MiB, 575.57.08\n"), + Stderr: []byte("Unable to determine the device handle for GPU1: 0000:00:08.0: Unknown Error\n"), + }) + root := t.TempDir() + c := NewNvidiaCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if got := res.Facts["gpu_count"]; got != "1" { + t.Fatalf("expected gpu_count=1, got %q", got) + } + if got := res.Facts["gpu_unreachable_count"]; got != "1" { + t.Fatalf("expected gpu_unreachable_count=1, got %q", got) + } + if got := res.Facts["gpu_total_count"]; got != "2" { + t.Fatalf("expected gpu_total_count=2, got %q", got) + } + if len(res.Issues) != 1 { + t.Fatalf("expected exactly one issue, got %d (%+v)", len(res.Issues), res.Issues) + } + issue := res.Issues[0] + if issue.Code != IssueGPUUnreachable { + t.Fatalf("expected issue code %q, got %q", IssueGPUUnreachable, issue.Code) + } + if issue.Severity != SeverityCritical || issue.Confidence != ConfidenceHigh { + t.Fatalf("unexpected issue severity/confidence: %+v", issue) + } + if want := identity.Fingerprint("nvidia", string(IssueGPUUnreachable), "0000:00:08.0"); issue.Fingerprint != want { + t.Fatalf("unexpected fingerprint: got %q want %q", issue.Fingerprint, want) + } +} + +func TestNvidiaCollectorGPUUnreachableFingerprintStableAcrossCountChanges(t 
*testing.T) { + t.Parallel() + run := func(stderr string) string { + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte("NVIDIA RTX PRO 6000, 97871 MiB, 575.57.08\n"), + Stderr: []byte(stderr), + }) + root := t.TempDir() + c := NewNvidiaCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if len(res.Issues) != 1 { + t.Fatalf("expected one issue, got %d", len(res.Issues)) + } + return res.Issues[0].Fingerprint + } + + base := run("Unable to determine the device handle for GPU1: 0000:00:08.0: Unknown Error\n") + withExtraUnknown := run(strings.Join([]string{ + "Unable to determine the device handle for GPU1: 0000:00:08.0: Unknown Error", + "Unable to determine the device handle for GPU9: Unknown Error", + }, "\n")) + if base != withExtraUnknown { + t.Fatalf("expected fingerprint stability for same BDF set, got %q vs %q", base, withExtraUnknown) + } +} + +func TestNvidiaCollectorGPUUnreachableFallbackFingerprintWithoutBDF(t *testing.T) { + t.Parallel() + fake := newNvidiaCollectorFake(executor.FakeResponse{ + Stdout: []byte("Unable to determine the device handle for GPU9: Unknown Error\n"), + }) + root := t.TempDir() + c := NewNvidiaCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + if len(res.Issues) != 1 { + t.Fatalf("expected one issue, got %d", len(res.Issues)) + } + if want := identity.Fingerprint("nvidia", string(IssueGPUUnreachable)); res.Issues[0].Fingerprint != want { + t.Fatalf("unexpected fallback fingerprint: got %q want %q", res.Issues[0].Fingerprint, want) } } @@ -593,6 +769,61 @@ func TestAddIssueComputesFingerprint(t *testing.T) { } } +func TestAddIssueWithArtifactsPopulatesRelatedPaths(t *testing.T) { + t.Parallel() + + r := NewResult() + paths := []string{"hardware/disk.txt"} + r.AddIssueWithArtifacts( + 
IssueDiskCritical, + SeverityCritical, + ConfidenceHigh, + "DISK", + "/ at 96% capacity", + paths, + "sys", + string(IssueDiskCritical), + "/", + ) + if len(r.Issues) != 1 { + t.Fatalf("expected 1 issue, got %d", len(r.Issues)) + } + issue := r.Issues[0] + if len(issue.RelatedArtifactPaths) != 1 || issue.RelatedArtifactPaths[0] != "hardware/disk.txt" { + t.Fatalf("expected RelatedArtifactPaths=[hardware/disk.txt], got %v", issue.RelatedArtifactPaths) + } + want := identity.Fingerprint("sys", string(IssueDiskCritical), "/") + if issue.Fingerprint != want { + t.Fatalf("unexpected fingerprint: got %q want %q", issue.Fingerprint, want) + } +} + +func TestAddIssueDelegatesToAddIssueWithArtifacts(t *testing.T) { + t.Parallel() + + r := NewResult() + r.AddIssue( + IssueDiskWarning, + SeverityWarning, + ConfidenceHigh, + "DISK", + "/var at 87% capacity", + "sys", + string(IssueDiskWarning), + "/var", + ) + if len(r.Issues) != 1 { + t.Fatalf("expected 1 issue, got %d", len(r.Issues)) + } + issue := r.Issues[0] + if issue.RelatedArtifactPaths != nil { + t.Fatalf("expected nil RelatedArtifactPaths from AddIssue, got %v", issue.RelatedArtifactPaths) + } + if issue.Code != IssueDiskWarning { + t.Fatalf("expected code %q, got %q", IssueDiskWarning, issue.Code) + } +} + func TestSeverityJSON(t *testing.T) { t.Parallel() type sample struct { @@ -675,3 +906,60 @@ func TestDCGMCollectorRunsDiagWhenEnabled(t *testing.T) { t.Error("diag should run when activeGPUDiag=true") } } + +func TestSystemCollectorCollectsKernelTaintedArtifact(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + root := t.TempDir() + c := NewSystemCollector(fake, output.NewWriter(root), ui.NoopUI{}) + res, err := c.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + found := false + for _, artifact := range res.Artifacts { + if artifact.Path != "hardware/kernel_tainted.txt" { + continue + } + found = true + hasConfig := false + for _, tag := range 
artifact.Tags { + if tag == "config" { + hasConfig = true + break + } + } + if !hasConfig { + t.Fatalf("expected config tag on kernel taint artifact, got %v", artifact.Tags) + } + } + if !found { + t.Fatal("expected hardware/kernel_tainted.txt artifact") + } + if _, err := os.Stat(filepath.Join(root, "hardware/kernel_tainted.txt")); err != nil { + t.Fatalf("expected kernel taint artifact to be written: %v", err) + } +} + +func TestSaveFileMissingSourceRecordsSkip(t *testing.T) { + t.Parallel() + + root := t.TempDir() + base := Base{ + Writer: output.NewWriter(root), + UI: ui.NoopUI{}, + } + res := NewResult() + base.saveFile(res, "hardware/kernel_tainted.txt", filepath.Join(root, "missing"), nil, "config") + if len(res.Skipped) != 1 { + t.Fatalf("expected one skip reason, got %+v", res.Skipped) + } + if res.Skipped[0].Reason != SkipSourceUnavailable { + t.Fatalf("expected SkipSourceUnavailable, got %q", res.Skipped[0].Reason) + } + if res.Skipped[0].ArtifactPath != "hardware/kernel_tainted.txt" { + t.Fatalf("unexpected skipped artifact path: %+v", res.Skipped[0]) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/docker.go b/customers/vm-troubleshooting/internal/collector/docker.go index eb00e99..687b2c0 100644 --- a/customers/vm-troubleshooting/internal/collector/docker.go +++ b/customers/vm-troubleshooting/internal/collector/docker.go @@ -17,6 +17,18 @@ type DockerCollector struct { IncludeLogs bool } +var diagnosticContainerPatterns = []string{ + "dcgm-exporter", + "dcgm_exporter", + "cadvisor", + "node-exporter", + "node_exporter", + "gpu-operator", + "vmagent", + "prometheus", + "telegraf", +} + func NewDockerCollector(exec executor.Executor, writer *output.Writer, ui ui.UI, includeLogs bool) *DockerCollector { return &DockerCollector{Base: Base{Exec: exec, Writer: writer, UI: ui}, IncludeLogs: includeLogs} } @@ -24,6 +36,20 @@ func NewDockerCollector(exec executor.Executor, writer *output.Writer, ui ui.UI, func (c *DockerCollector) Name() 
string { return "Docker" } func (c *DockerCollector) ID() string { return "docker" } +func isVLLMContainer(name, image string) bool { + return strings.Contains(strings.ToLower(name+" "+image), "vllm") +} + +func isDiagnosticContainer(name, image string) bool { + haystack := strings.ToLower(name + " " + image) + for _, pattern := range diagnosticContainerPatterns { + if strings.Contains(haystack, pattern) { + return true + } + } + return false +} + func (c *DockerCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() if !c.Exec.CommandExists("docker") { @@ -79,9 +105,6 @@ func (c *DockerCollector) Collect(ctx context.Context) (*CollectorResult, error) name string image string } - isVLLM := func(row containerRow) bool { - return strings.Contains(strings.ToLower(row.name+" "+row.image), "vllm") - } listResult, list, _ := c.Exec.Capture(ctx, executor.CommandSpec{Name: "docker", Args: []string{"ps", "-a", "--format", "{{.ID}}|{{.Names}}|{{.Image}}"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, 2*1024*1024) var rows []containerRow @@ -124,7 +147,7 @@ func (c *DockerCollector) Collect(ctx context.Context) (*CollectorResult, error) r.SetFact("vllm_container_count", "unavailable") } else { for _, row := range rows { - if isVLLM(row) { + if isVLLMContainer(row.name, row.image) { vllmCount++ safeName := SanitizePathComponent(row.name) inspectSpec := executor.CommandSpec{Name: "docker", Args: []string{"inspect", row.id}, NeedsRoot: true, Timeout: config.TimeoutMedium} @@ -136,6 +159,17 @@ func (c *DockerCollector) Collect(ctx context.Context) (*CollectorResult, error) c.saveCommand(ctx, r, fmt.Sprintf("docker/vllm_logs/%s_logs.txt", safeName), executor.CommandSpec{Name: "docker", Args: []string{"logs", "--tail", "10000", "--timestamps", row.id}, NeedsRoot: true, Timeout: config.TimeoutSlow}, "docker", "docker") } } + + if isDiagnosticContainer(row.name, row.image) { + safeName := SanitizePathComponent(row.name) + inspectSpec := 
executor.CommandSpec{Name: "docker", Args: []string{"inspect", row.id}, NeedsRoot: true, Timeout: config.TimeoutMedium} + inspectResult, out, _ := c.Exec.Capture(ctx, inspectSpec, 2*1024*1024) + path := fmt.Sprintf("docker/diagnostic/%s_inspect.txt", safeName) + c.saveCapturedProbe(r, path, inspectSpec, inspectResult, sanitize.DockerInspect(string(out))+"\n", "", "docker", []string{"docker", "docker-security"}, "Environment variable values and common secret fields have been redacted") + if c.IncludeLogs { + c.saveCommand(ctx, r, fmt.Sprintf("docker/diagnostic/%s_logs.txt", safeName), executor.CommandSpec{Name: "docker", Args: []string{"logs", "--tail", "2000", "--timestamps", row.id}, NeedsRoot: true, Timeout: config.TimeoutSlow}, "docker", "docker") + } + } } r.SetFact("vllm_container_count", fmt.Sprintf("%d", vllmCount)) } @@ -146,7 +180,7 @@ func (c *DockerCollector) Collect(ctx context.Context) (*CollectorResult, error) if count >= 20 { break } - if isVLLM(row) { + if isVLLMContainer(row.name, row.image) || isDiagnosticContainer(row.name, row.image) { continue } count++ diff --git a/customers/vm-troubleshooting/internal/collector/docker_test.go b/customers/vm-troubleshooting/internal/collector/docker_test.go index 67e7b67..c0eeb70 100644 --- a/customers/vm-troubleshooting/internal/collector/docker_test.go +++ b/customers/vm-troubleshooting/internal/collector/docker_test.go @@ -91,3 +91,126 @@ func TestDockerCollectorMarksCountsUnavailableOnMalformedContainerList(t *testin t.Fatalf("expected vllm_container_count=unavailable, got %q", got) } } + +func TestIsDiagnosticContainer(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + container string + image string + want bool + }{ + {name: "dcgm exporter name", container: "safe_runpod_dcgm_exporter", image: "anything", want: true}, + {name: "dcgm exporter image", container: "monitor", image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5", want: true}, + {name: "dcgm underscore", container: 
"safe_runpod_dcgm_exporter", image: "anything", want: true}, + {name: "cadvisor", container: "safe_runpod_cadvisor", image: "gcr.io/cadvisor/cadvisor:v0.49.1", want: true}, + {name: "node exporter", container: "node-exporter", image: "prom/node-exporter:v1.8.1", want: true}, + {name: "node exporter underscore", container: "node_exporter", image: "prom/node-exporter:v1.8.1", want: true}, + {name: "gpu operator", container: "gpu-operator", image: "nvcr.io/nvidia/cloud-native/gpu-operator:latest", want: true}, + {name: "vmagent", container: "VMAgent", image: "victoriametrics/vmagent:v1", want: true}, + {name: "prometheus", container: "PROMETHEUS", image: "prom/prometheus:v2", want: true}, + {name: "telegraf", container: "tele", image: "telegraf:1.29", want: true}, + {name: "random app", container: "customer-app", image: "ghcr.io/example/api:latest", want: false}, + {name: "vllm remains separate", container: "vllm-main", image: "vllm/vllm-openai:latest", want: false}, + } + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if got := isDiagnosticContainer(tc.container, tc.image); got != tc.want { + t.Fatalf("isDiagnosticContainer(%q, %q)=%t want %t", tc.container, tc.image, got, tc.want) + } + }) + } +} + +func TestDockerCollectorDiagnosticInspectAlwaysCollected(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["docker"] = true + fake.Commands["docker info"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["docker version"] = executor.FakeResponse{Stdout: []byte("version\n")} + fake.Commands["docker version --format {{.Server.Version}}"] = executor.FakeResponse{Stdout: []byte("26.1.0\n")} + fake.Commands[`docker ps -a --format table {{.ID}}`+"\t"+`{{.Image}}`+"\t"+`{{.Status}}`+"\t"+`{{.Names}}`+"\t"+`{{.Ports}}`] = executor.FakeResponse{Stdout: []byte("table\n")} + fake.Commands["docker system df -v"] = executor.FakeResponse{Stdout: []byte("df\n")} + 
fake.Commands["docker network ls"] = executor.FakeResponse{Stdout: []byte("bridge\n")} + fake.Commands["docker ps -a --format {{.ID}}|{{.Names}}|{{.Image}}"] = executor.FakeResponse{Stdout: []byte("abc|safe_runpod_dcgm_exporter|nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5\n")} + fake.Commands["docker inspect abc"] = executor.FakeResponse{Stdout: []byte(`[{"Config":{"Env":["TOKEN=supersecret"]}}]`)} + + root := t.TempDir() + collector := NewDockerCollector(fake, output.NewWriter(root), ui.NoopUI{}, false) + res, err := collector.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + inspectPath := filepath.Join(root, "docker/diagnostic/safe_runpod_dcgm_exporter_inspect.txt") + data, err := os.ReadFile(inspectPath) + if err != nil { + t.Fatalf("reading inspect artifact: %v", err) + } + text := string(data) + if strings.Contains(text, "supersecret") || !strings.Contains(text, "[REDACTED]") { + t.Fatalf("inspect artifact was not sanitized: %s", text) + } + if !strings.Contains(text, "Environment variable values and common secret fields have been redacted") { + t.Fatalf("inspect artifact missing disclosure note: %s", text) + } + if _, err := os.Stat(filepath.Join(root, "docker/diagnostic/safe_runpod_dcgm_exporter_logs.txt")); !os.IsNotExist(err) { + t.Fatalf("expected no diagnostic logs when IncludeLogs=false, err=%v", err) + } + + foundInspect := false + for _, artifact := range res.Artifacts { + if artifact.Path != "docker/diagnostic/safe_runpod_dcgm_exporter_inspect.txt" { + continue + } + foundInspect = true + hasDockerTag := false + hasSecurityTag := false + for _, tag := range artifact.Tags { + if tag == "docker" { + hasDockerTag = true + } + if tag == "docker-security" { + hasSecurityTag = true + } + } + if !hasDockerTag || !hasSecurityTag { + t.Fatalf("inspect tags missing expected values: %v", artifact.Tags) + } + } + if !foundInspect { + t.Fatal("expected diagnostic inspect artifact in collector metadata") + } +} + +func 
TestDockerCollectorDiagnosticLogsWhenEnabled(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["docker"] = true + fake.Commands["docker info"] = executor.FakeResponse{Stdout: []byte("ok\n")} + fake.Commands["docker version"] = executor.FakeResponse{Stdout: []byte("version\n")} + fake.Commands["docker version --format {{.Server.Version}}"] = executor.FakeResponse{Stdout: []byte("26.1.0\n")} + fake.Commands[`docker ps -a --format table {{.ID}}`+"\t"+`{{.Image}}`+"\t"+`{{.Status}}`+"\t"+`{{.Names}}`+"\t"+`{{.Ports}}`] = executor.FakeResponse{Stdout: []byte("table\n")} + fake.Commands["docker system df -v"] = executor.FakeResponse{Stdout: []byte("df\n")} + fake.Commands["docker network ls"] = executor.FakeResponse{Stdout: []byte("bridge\n")} + fake.Commands["docker ps -a --format {{.ID}}|{{.Names}}|{{.Image}}"] = executor.FakeResponse{Stdout: []byte("abc|safe_runpod_dcgm_exporter|nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5\n")} + fake.Commands["docker inspect abc"] = executor.FakeResponse{Stdout: []byte(`[{"Config":{"Env":["TOKEN=supersecret"]}}]`)} + fake.Commands["docker logs --tail 2000 --timestamps abc"] = executor.FakeResponse{Stdout: []byte("diagnostic log line\n")} + + root := t.TempDir() + collector := NewDockerCollector(fake, output.NewWriter(root), ui.NoopUI{}, true) + if _, err := collector.Collect(context.Background()); err != nil { + t.Fatalf("Collect failed: %v", err) + } + + if _, err := os.Stat(filepath.Join(root, "docker/diagnostic/safe_runpod_dcgm_exporter_logs.txt")); err != nil { + t.Fatalf("expected diagnostic logs artifact when IncludeLogs=true: %v", err) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index 5f0a21e..3875712 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -1,6 +1,8 @@ package collector import ( + 
"bufio" + "bytes" "context" "encoding/json" "fmt" @@ -73,10 +75,18 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCommand(ctx, r, "logs/dmesg.txt", executor.CommandSpec{Name: "dmesg", Args: []string{"-T"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, "dmesg", "journal") } - // Grep journal for OOM events + // Grep journal for OOM events (short-iso keeps the year on the timestamp). + useShortISO := c.journalShortISOAvailable(ctx) + if !useShortISO { + c.UI.Warn("journalctl -o short-iso is not available; OOM event timestamps may be inaccurate across calendar years") + } + oomJournalArgs := append(append([]string{}, journalArgs...), "-k") + if useShortISO { + oomJournalArgs = append([]string{"-o", "short-iso"}, oomJournalArgs...) + } oomSpec := executor.CommandSpec{ Name: "journalctl", - Args: append(append(append([]string{}, journalArgs...), "-k"), "--grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm", "--case-sensitive=false"), + Args: append(append(oomJournalArgs, "--grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm"), "--case-sensitive=false"), NeedsRoot: true, Timeout: config.TimeoutMedium, } @@ -113,7 +123,7 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error } } - oomIncidentCount := countOOMIncidents(oom) + oomIncidentCount := countOOMIncidents(oom, useShortISO) // journalctl --grep returns exit 1 when no entries match (like grep). // Only treat exit codes >= 2 as real errors. 
oomRealErr := oomResult.Err != nil && oomResult.ExitCode >= 2 @@ -128,12 +138,13 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan incomplete") } else if oomIncidentCount > 0 { r.SetFact("oom_event_count", fmt.Sprintf("%d", oomIncidentCount)) - r.AddIssue( + r.AddIssueWithArtifacts( IssueOOMEvents, SeverityCritical, ConfidenceHigh, "MEM", fmt.Sprintf("%d OOM killer event(s)", oomIncidentCount), + []string{oomPath}, "journal", string(IssueOOMEvents), ) @@ -150,12 +161,29 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } -func countOOMIncidents(lines []string) int { +func (c *JournalCollector) journalShortISOAvailable(ctx context.Context) bool { + spec := executor.CommandSpec{ + Name: "journalctl", + Args: []string{"--no-pager", "-o", "short-iso", "-n", "0"}, + NeedsRoot: true, + Timeout: config.TimeoutQuick, + } + res, _, _ := c.Exec.Capture(ctx, spec, 2048) + if res.Skipped { + return false + } + if res.Err != nil && res.ExitCode >= 2 { + return false + } + return true +} + +func countOOMIncidents(lines []string, useShortISO bool) int { count := 0 var lastIncidentAt time.Time haveLastIncidentAt := false for _, line := range lines { - ts, ok := oomLineTimestamp(line) + ts, ok := oomLineTimestamp(line, useShortISO) if !ok { // Unexpected formatting: preserve support value by counting the line. 
count++ @@ -170,8 +198,14 @@ func countOOMIncidents(lines []string) int { return count } -func oomLineTimestamp(line string) (time.Time, bool) { +func oomLineTimestamp(line string, useShortISO bool) (time.Time, bool) { fields := strings.Fields(strings.TrimSpace(line)) + if len(fields) < 1 { + return time.Time{}, false + } + if useShortISO { + return parseJournalShortISOTimestamp(fields[0]) + } if len(fields) < 4 { return time.Time{}, false } @@ -183,6 +217,30 @@ func oomLineTimestamp(line string) (time.Time, bool) { return time.Date(time.Now().Year(), ts.Month(), ts.Day(), ts.Hour(), ts.Minute(), ts.Second(), 0, time.Local), true } +func parseJournalShortISOTimestamp(s string) (time.Time, bool) { + layouts := []string{ + time.RFC3339Nano, + time.RFC3339, + "2006-01-02T15:04:05.999999999Z07:00", + "2006-01-02T15:04:05.999999Z07:00", + "2006-01-02T15:04:05Z07:00", + "2006-01-02 15:04:05", + "2006-01-02T15:04:05", + } + for _, layout := range layouts { + if t, err := time.Parse(layout, s); err == nil { + return t, true + } + } + if t, err := time.ParseInLocation("2006-01-02T15:04:05.999999999", s, time.Local); err == nil { + return t, true + } + if t, err := time.ParseInLocation("2006-01-02T15:04:05", s, time.Local); err == nil { + return t, true + } + return time.Time{}, false +} + func (c *JournalCollector) journalBaseArgs(until string) []string { args := []string{"--no-pager", "--until=" + until} if c.Since == "" || c.Since == "boot" { @@ -249,15 +307,16 @@ func (c *JournalCollector) saveStructuredJournalNDJSON(ctx context.Context, r *C } func buildJournalNDJSONContent(raw []byte, forceByteTruncated bool) (string, int, bool, string) { - lines := strings.Split(string(raw), "\n") - written := make([]string, 0, len(lines)) + sc := bufio.NewScanner(bytes.NewReader(raw)) + sc.Buffer(make([]byte, 0, 64*1024), 1024*1024) + written := make([]string, 0, 1024) recordsWritten := 0 usedBytes := 0 truncated := false truncationReason := "" - for _, line := range lines { - line = 
strings.TrimSpace(line) + for sc.Scan() { + line := strings.TrimSpace(sc.Text()) if line == "" { continue } @@ -288,6 +347,12 @@ func buildJournalNDJSONContent(raw []byte, forceByteTruncated bool) (string, int recordsWritten++ usedBytes += needed } + if err := sc.Err(); err != nil { + truncated = true + if truncationReason == "" { + truncationReason = "byte_limit" + } + } if forceByteTruncated { truncated = true diff --git a/customers/vm-troubleshooting/internal/collector/journal_ndjson_build_test.go b/customers/vm-troubleshooting/internal/collector/journal_ndjson_build_test.go new file mode 100644 index 0000000..ae0fd6e --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/journal_ndjson_build_test.go @@ -0,0 +1,105 @@ +package collector + +import ( + "encoding/json" + "strings" + "testing" +) + +// buildJournalNDJSONContentSplit mirrors the pre-scanner implementation for regression tests. +func buildJournalNDJSONContentSplit(raw []byte, forceByteTruncated bool) (string, int, bool, string) { + lines := strings.Split(string(raw), "\n") + written := make([]string, 0, len(lines)) + recordsWritten := 0 + usedBytes := 0 + truncated := false + truncationReason := "" + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + rec := map[string]any{} + if err := json.Unmarshal([]byte(line), &rec); err != nil { + continue + } + normalized := normalizeJournalRecord(rec) + encoded, err := json.Marshal(normalized) + if err != nil { + continue + } + + if recordsWritten >= journalNDJSONRecordLimit { + truncated = true + truncationReason = "record_limit" + break + } + needed := len(encoded) + 1 + if usedBytes+needed+journalNDJSONSentinelReserve > journalNDJSONByteLimit { + truncated = true + truncationReason = "byte_limit" + break + } + written = append(written, string(encoded)) + recordsWritten++ + usedBytes += needed + } + + if forceByteTruncated { + truncated = true + if truncationReason == "" { + truncationReason = 
"byte_limit" + } + } + + if truncated { + worstSentinel, _ := json.Marshal(map[string]any{ + "_truncated": true, + "records_written": recordsWritten, + "reason": truncationReason, + }) + for len(written) > 0 && usedBytes+len(worstSentinel)+1 > journalNDJSONByteLimit { + last := written[len(written)-1] + written = written[:len(written)-1] + usedBytes -= len(last) + 1 + recordsWritten-- + } + sentinel, _ := json.Marshal(map[string]any{ + "_truncated": true, + "records_written": recordsWritten, + "reason": truncationReason, + }) + if usedBytes+len(sentinel)+1 <= journalNDJSONByteLimit { + written = append(written, string(sentinel)) + } + } + + if len(written) == 0 { + return "", recordsWritten, truncated, truncationReason + } + return strings.Join(written, "\n") + "\n", recordsWritten, truncated, truncationReason +} + +func TestBuildJournalNDJSONContentMatchesSplitImplementation(t *testing.T) { + t.Parallel() + raw := []byte(`{"MESSAGE":"hello","PRIORITY":"6"} +{"not": "json on purpose"} +{"MESSAGE":"world","PRIORITY":"3","_BOOT_ID":"abc"} +`) + gotS, gotN, gotTrunc, gotReason := buildJournalNDJSONContent(raw, false) + wantS, wantN, wantTrunc, wantReason := buildJournalNDJSONContentSplit(raw, false) + if gotS != wantS || gotN != wantN || gotTrunc != wantTrunc || gotReason != wantReason { + t.Fatalf("scanner vs split mismatch:\n got: n=%d trunc=%v reason=%q\n s=%q\n want: n=%d trunc=%v reason=%q\n s=%q", + gotN, gotTrunc, gotReason, gotS, wantN, wantTrunc, wantReason, wantS) + } +} + +func TestBuildJournalNDJSONContentScannerEmptyInput(t *testing.T) { + t.Parallel() + s, n, trunc, reason := buildJournalNDJSONContent([]byte("\n\n \n"), false) + if s != "" || n != 0 || trunc || reason != "" { + t.Fatalf("unexpected: %q n=%d trunc=%v reason=%q", s, n, trunc, reason) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/nvidia.go b/customers/vm-troubleshooting/internal/collector/nvidia.go index 5778dc8..d4bc9d5 100644 --- 
a/customers/vm-troubleshooting/internal/collector/nvidia.go +++ b/customers/vm-troubleshooting/internal/collector/nvidia.go @@ -7,6 +7,8 @@ import ( "io" "os" "path/filepath" + "regexp" + "sort" "strings" "github.com/NexGenCloud/vm-diagnostics/internal/config" @@ -21,6 +23,14 @@ type NvidiaCollector struct { Enabled bool } +var ( + unreachableGPURe = regexp.MustCompile(`(?i)unable to determine the device handle`) + pciBDFRe = regexp.MustCompile(`(?i)\b([0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f])\b`) + // Secondary identity for error lines that lack a BDF. Matches both + // "GPU1" / "GPU 1" and "GPU-" forms nvidia-smi may emit. + gpuIdentRe = regexp.MustCompile(`(?i)\bgpu[\s-]?([0-9a-f\-]+)`) +) + func NewNvidiaCollector(exec executor.Executor, writer *output.Writer, ui ui.UI, enabled bool) *NvidiaCollector { return &NvidiaCollector{Base: Base{Exec: exec, Writer: writer, UI: ui}, Enabled: enabled} } @@ -81,6 +91,8 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) } gpuSpec := executor.CommandSpec{Name: "nvidia-smi", Args: []string{"--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"}, Timeout: config.TimeoutMedium} r.SetFact("gpu_count", "unavailable") + r.SetFact("gpu_unreachable_count", "unavailable") + r.SetFact("gpu_total_count", "unavailable") r.SetFact("gpu_model", "unavailable") r.SetFact("driver_version", "unavailable") gpuResult, gpuCsv, gpuErr := c.Exec.Capture(ctx, gpuSpec, 512*1024) @@ -90,8 +102,13 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) } if !gpuResult.Skipped && !gpuResult.TimedOut { rows := parseGPUCSVRows(string(gpuCsv)) + unreachableBDFs, unreachableUnknownCount := detectUnreachableGPUDetails(string(gpuCsv), string(gpuErr)) + unreachableCount := len(unreachableBDFs) + unreachableUnknownCount + totalCount := len(rows) + unreachableCount + r.SetFact("gpu_count", fmt.Sprintf("%d", len(rows))) + r.SetFact("gpu_unreachable_count", fmt.Sprintf("%d", 
unreachableCount)) + r.SetFact("gpu_total_count", fmt.Sprintf("%d", totalCount)) if len(rows) > 0 { - r.SetFact("gpu_count", fmt.Sprintf("%d", len(rows))) if model := strings.TrimSpace(rows[0][0]); model != "" { r.SetFact("gpu_model", model) } @@ -99,6 +116,21 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) r.SetFact("driver_version", driver) } } + if unreachableCount > 0 { + fingerprintParts := []string{"nvidia", string(IssueGPUUnreachable)} + if len(unreachableBDFs) > 0 { + fingerprintParts = append(fingerprintParts, unreachableBDFs...) + } + r.AddIssueWithArtifacts( + IssueGPUUnreachable, + SeverityCritical, + ConfidenceHigh, + "GPU", + fmt.Sprintf("%d of %d GPUs unreachable (device handle error)", unreachableCount, totalCount), + []string{"nvidia/gpu_summary.txt", "nvidia/xid_errors.txt"}, + fingerprintParts..., + ) + } } // Capture raw dmesg for the archive. Xid classification is handled by the // triage layer (internal/triage/xid.go) which runs after all collectors. @@ -171,6 +203,38 @@ func parseGPUCSVRows(raw string) [][]string { return rows } +func detectUnreachableGPUDetails(outputs ...string) ([]string, int) { + bdfSet := make(map[string]struct{}) + unknownSet := make(map[string]struct{}) + for _, raw := range outputs { + for _, line := range strings.Split(raw, "\n") { + line = strings.TrimSpace(line) + if line == "" || !unreachableGPURe.MatchString(line) { + continue + } + if matches := pciBDFRe.FindStringSubmatch(line); len(matches) == 2 { + bdfSet[strings.ToLower(matches[1])] = struct{}{} + continue + } + // No BDF: fall back to a GPU-index/UUID identity so the same + // GPU reported with slightly different wording across stdout + // and stderr is not double-counted. Final fallback is the + // whole line, which covers unexpected driver output shapes. 
+ if idMatch := gpuIdentRe.FindStringSubmatch(line); len(idMatch) == 2 { + unknownSet["gpu:"+strings.ToLower(idMatch[1])] = struct{}{} + continue + } + unknownSet[strings.ToLower(line)] = struct{}{} + } + } + sortedBDFs := make([]string, 0, len(bdfSet)) + for bdf := range bdfSet { + sortedBDFs = append(sortedBDFs, bdf) + } + sort.Strings(sortedBDFs) + return sortedBDFs, len(unknownSet) +} + func stripANSIArtifact(path string) error { data, err := os.ReadFile(path) if err != nil { diff --git a/customers/vm-troubleshooting/internal/collector/services.go b/customers/vm-troubleshooting/internal/collector/services.go index f4dfefc..b626f0a 100644 --- a/customers/vm-troubleshooting/internal/collector/services.go +++ b/customers/vm-troubleshooting/internal/collector/services.go @@ -222,12 +222,13 @@ func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context } if fmIdx >= 0 && c.isFabricManagerBenign(ctx) { - r.AddIssue( + r.AddIssueWithArtifacts( IssueSvcFabricmanagerBenign, SeverityWarning, ConfidenceHigh, "SVC", "nvidia-fabricmanager not running (expected: no NVSwitch/SXM detected)", + []string{"services/status_nvidia-fabricmanager.txt"}, "svc", string(IssueSvcFabricmanagerBenign), ) @@ -254,12 +255,13 @@ func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context } slices.Sort(deduped) fingerprintParts := append([]string{"svc", string(IssueSvcFailed)}, deduped...) 
- r.AddIssue( + r.AddIssueWithArtifacts( IssueSvcFailed, SeverityCritical, ConfidenceHigh, "SVC", fmt.Sprintf("%d failed systemd service(s)", len(failedNames)), + []string{"services/failed_services.txt"}, fingerprintParts..., ) } diff --git a/customers/vm-troubleshooting/internal/collector/system.go b/customers/vm-troubleshooting/internal/collector/system.go index 7f05aab..6229c30 100644 --- a/customers/vm-troubleshooting/internal/collector/system.go +++ b/customers/vm-troubleshooting/internal/collector/system.go @@ -58,6 +58,7 @@ func (c *SystemCollector) Collect(ctx context.Context) (*CollectorResult, error) c.saveCommand(ctx, r, spec.path, spec.cmd, spec.hint, spec.tags...) } c.saveFile(r, "hardware/memory_detailed.txt", "/proc/meminfo", nil, "memory") + c.saveFile(r, "hardware/kernel_tainted.txt", "/proc/sys/kernel/tainted", nil, "config") if c.Exec.CommandExists("lspci") { c.saveCommand(ctx, r, "hardware/pci_devices.txt", executor.CommandSpec{Name: "lspci", Args: []string{"-nn"}, Timeout: config.TimeoutQuick}, "lspci", "hardware") } @@ -66,23 +67,25 @@ func (c *SystemCollector) Collect(ctx context.Context) (*CollectorResult, error) continue } if d.UsedPct >= 95 { - r.AddIssue( + r.AddIssueWithArtifacts( IssueDiskCritical, SeverityCritical, ConfidenceHigh, "DISK", fmt.Sprintf("%s at %.0f%% capacity", d.Mountpoint, d.UsedPct), + []string{"hardware/disk.txt"}, "sys", string(IssueDiskCritical), d.Mountpoint, ) } else if d.UsedPct >= 85 { - r.AddIssue( + r.AddIssueWithArtifacts( IssueDiskWarning, SeverityWarning, ConfidenceHigh, "DISK", fmt.Sprintf("%s at %.0f%% capacity", d.Mountpoint, d.UsedPct), + []string{"hardware/disk.txt"}, "sys", string(IssueDiskWarning), d.Mountpoint, diff --git a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go index d789230..116c7ed 100644 --- a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go +++ 
b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go @@ -24,7 +24,7 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "3.1.0", + SchemaVersion: "3.2.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", diff --git a/customers/vm-troubleshooting/internal/output/contract_test.go b/customers/vm-troubleshooting/internal/output/contract_test.go index ebf5fbc..5ff7d35 100644 --- a/customers/vm-troubleshooting/internal/output/contract_test.go +++ b/customers/vm-troubleshooting/internal/output/contract_test.go @@ -180,11 +180,13 @@ func TestSchemaErrorCodeEnumsMatchGoConstants(t *testing.T) { func TestFactTypingConsistency(t *testing.T) { t.Parallel() facts := map[string]string{ - "cpu_cores": "8", - "gpu_count": "2", - "xid_classified_count": "3", - "firewall_posture": "restrictive", - "oom_event_count": "unavailable", + "cpu_cores": "8", + "gpu_count": "2", + "gpu_unreachable_count": "1", + "gpu_total_count": "unavailable", + "xid_classified_count": "3", + "firewall_posture": "restrictive", + "oom_event_count": "unavailable", } result := output.ConvertFacts(facts) // Integer keys should be numbers @@ -194,6 +196,12 @@ func TestFactTypingConsistency(t *testing.T) { if v, ok := result["gpu_count"].(int64); !ok || v != 2 { t.Errorf("gpu_count should be int64(2), got %T(%v)", result["gpu_count"], result["gpu_count"]) } + if v, ok := result["gpu_unreachable_count"].(int64); !ok || v != 1 { + t.Errorf("gpu_unreachable_count should be int64(1), got %T(%v)", result["gpu_unreachable_count"], result["gpu_unreachable_count"]) + } + if result["gpu_total_count"] != nil { + t.Errorf("gpu_total_count with 'unavailable' should be nil, got %T(%v)", result["gpu_total_count"], result["gpu_total_count"]) + } // String keys should stay strings if v, ok := result["firewall_posture"].(string); !ok || v != "restrictive" { t.Errorf("firewall_posture should be 
string, got %T(%v)", result["firewall_posture"], result["firewall_posture"]) @@ -273,7 +281,7 @@ func TestSchemaIssueCodeEnumsMatchGoConstants(t *testing.T) { checkEnumMatchesMap(t, "report issue code", reportCodes, expected) triageCodes := extractSchemaEnumAtPath(t, "triage-result.schema.json", "$defs", "finding", "properties", "code", "enum") - checkEnumMatchesMap(t, "triage finding code", triageCodes, triage.FindingCodes) + checkEnumMatchesMap(t, "triage finding code", triageCodes, triage.AllFindingCodes) } func TestSchemaConfidenceEnumsMatchGoConstants(t *testing.T) { diff --git a/customers/vm-troubleshooting/internal/output/manifest.go b/customers/vm-troubleshooting/internal/output/manifest.go index 98747f4..00b4b7d 100644 --- a/customers/vm-troubleshooting/internal/output/manifest.go +++ b/customers/vm-troubleshooting/internal/output/manifest.go @@ -105,6 +105,7 @@ type ManifestJSON struct { // "unavailable" maps to null (json omit or explicit null). var integerFactKeys = map[string]bool{ "cpu_cores": true, "gpu_count": true, "memory_total": true, + "gpu_unreachable_count": true, "gpu_total_count": true, "container_count": true, "vllm_container_count": true, "failed_service_count": true, "xid_classified_count": true, "critical_event_count": true, "oom_event_count": true, } diff --git a/customers/vm-troubleshooting/internal/output/manifest_test.go b/customers/vm-troubleshooting/internal/output/manifest_test.go index aa9d3da..2e6f12a 100644 --- a/customers/vm-troubleshooting/internal/output/manifest_test.go +++ b/customers/vm-troubleshooting/internal/output/manifest_test.go @@ -37,7 +37,7 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "3.1.0", + SchemaVersion: "3.2.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", @@ -58,7 +58,12 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { ParserHint: "lscpu", Tags: 
[]string{"cpu", "hardware"}, }}, - map[string]string{"cpu_cores": "16", "oom_event_count": "unavailable"}, + map[string]string{ + "cpu_cores": "16", + "oom_event_count": "unavailable", + "gpu_unreachable_count": "1", + "gpu_total_count": "unavailable", + }, []ManifestIssueInput{{ Code: "disk_warning", Severity: "warning", @@ -108,6 +113,12 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { if _, ok := facts["cpu_cores"].(float64); !ok { t.Fatalf("expected cpu_cores to be numeric, got %#v", facts["cpu_cores"]) } + if v, ok := facts["gpu_unreachable_count"].(float64); !ok || v != 1 { + t.Fatalf("expected gpu_unreachable_count=1 as number, got %#v", facts["gpu_unreachable_count"]) + } + if facts["gpu_total_count"] != nil { + t.Fatalf("expected gpu_total_count to map to null, got %#v", facts["gpu_total_count"]) + } if facts["oom_event_count"] != nil { t.Fatalf("expected oom_event_count to map to null, got %#v", facts["oom_event_count"]) } diff --git a/customers/vm-troubleshooting/internal/output/report.go b/customers/vm-troubleshooting/internal/output/report.go index ae5fea1..31e8938 100644 --- a/customers/vm-troubleshooting/internal/output/report.go +++ b/customers/vm-troubleshooting/internal/output/report.go @@ -45,7 +45,7 @@ type ReportRecord struct { ErrorCount int `json:"error_count,omitempty"` } -const reportSchemaVersion = "3.1.0" +const reportSchemaVersion = "3.2.0" // WriteReport writes report.ndjson from manifest input data. // Order is deterministic: per collector (registration order) → artifacts → issues → facts (sorted) → summary. 
diff --git a/customers/vm-troubleshooting/internal/output/summary.go b/customers/vm-troubleshooting/internal/output/summary.go index 2cf7c46..eb83c54 100644 --- a/customers/vm-troubleshooting/internal/output/summary.go +++ b/customers/vm-troubleshooting/internal/output/summary.go @@ -136,12 +136,21 @@ func WriteSummary(w *Writer, hostname, version string, results []SummaryResult) b.WriteString(fmt.Sprintf("Memory: %s\n", formatBytes(v))) } b.WriteString("\n") - if firstFact(facts, "gpu_model", "gpu_count", "driver_version") != "" { + if firstFact(facts, "gpu_model", "gpu_count", "gpu_total_count", "gpu_unreachable_count", "driver_version") != "" { b.WriteString("-----------------------------------------------------------------------\n") b.WriteString("GPU Summary\n") b.WriteString("-----------------------------------------------------------------------\n") - if v := firstFact(facts, "gpu_count"); v != "" { - b.WriteString(fmt.Sprintf("GPU Count: %s\n", v)) + gpuCount := firstFact(facts, "gpu_count") + gpuTotal := firstFact(facts, "gpu_total_count") + if gpuCount != "" { + if gpuTotal != "" && gpuCount != "unavailable" && gpuTotal != "unavailable" { + b.WriteString(fmt.Sprintf("GPU Count: %s of %s\n", gpuCount, gpuTotal)) + } else { + b.WriteString(fmt.Sprintf("GPU Count: %s\n", gpuCount)) + } + } + if v := firstFact(facts, "gpu_unreachable_count"); v != "" && v != "unavailable" && v != "0" { + b.WriteString(fmt.Sprintf("Unreachable GPUs: %s\n", v)) } if v := firstFact(facts, "gpu_model"); v != "" { b.WriteString(fmt.Sprintf("GPU Model: %s\n", v)) diff --git a/customers/vm-troubleshooting/internal/output/summary_test.go b/customers/vm-troubleshooting/internal/output/summary_test.go index 68ac87c..009dedc 100644 --- a/customers/vm-troubleshooting/internal/output/summary_test.go +++ b/customers/vm-troubleshooting/internal/output/summary_test.go @@ -40,3 +40,71 @@ func TestWriteSummary_ShowsOnlyHighConfidenceIssues(t *testing.T) { t.Fatal("did not expect low-confidence 
issue in SUMMARY.txt") } } + +func TestWriteSummary_ShowsUnreachableGPUCounts(t *testing.T) { + t.Parallel() + + root := t.TempDir() + w := NewWriter(root) + results := []SummaryResult{ + { + Name: "nvidia", + Facts: map[string]string{ + "gpu_count": "7", + "gpu_total_count": "8", + "gpu_unreachable_count": "1", + "gpu_model": "RTX 5090", + "driver_version": "575.57.08", + }, + }, + } + if err := WriteSummary(w, "node-1", "dev", results); err != nil { + t.Fatalf("WriteSummary failed: %v", err) + } + + data, err := os.ReadFile(filepath.Join(root, "SUMMARY.txt")) + if err != nil { + t.Fatalf("reading SUMMARY.txt: %v", err) + } + text := string(data) + if !strings.Contains(text, "GPU Count: 7 of 8") { + t.Fatalf("expected combined GPU count in summary, got: %s", text) + } + if !strings.Contains(text, "Unreachable GPUs: 1") { + t.Fatalf("expected unreachable GPU line in summary, got: %s", text) + } +} + +func TestWriteSummary_HealthyGPUsOmitUnreachableLine(t *testing.T) { + t.Parallel() + + root := t.TempDir() + w := NewWriter(root) + results := []SummaryResult{ + { + Name: "nvidia", + Facts: map[string]string{ + "gpu_count": "8", + "gpu_total_count": "8", + "gpu_unreachable_count": "0", + "gpu_model": "RTX 5090", + "driver_version": "575.57.08", + }, + }, + } + if err := WriteSummary(w, "node-1", "dev", results); err != nil { + t.Fatalf("WriteSummary failed: %v", err) + } + + data, err := os.ReadFile(filepath.Join(root, "SUMMARY.txt")) + if err != nil { + t.Fatalf("reading SUMMARY.txt: %v", err) + } + text := string(data) + if strings.Contains(text, "Unreachable GPUs") { + t.Fatalf("did not expect unreachable GPU line on healthy system, got: %s", text) + } + if !strings.Contains(text, "GPU Count: 8 of 8") { + t.Fatalf("expected GPU Count: 8 of 8 on healthy system, got: %s", text) + } +} diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index b4ec71a..9ef68c5 100644 --- 
a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -2,6 +2,7 @@ package runner import ( "context" + "encoding/json" "fmt" "os" "path/filepath" @@ -199,6 +200,21 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { results = append(results, triageResult) } + // Run issue enrichment — produces metadata-only findings for collector-owned issues. + // These land in triage/_data/ for the dashboard but are NOT converted to manifest issues. + enrichInputs := buildEnrichmentInputs(results) + if len(enrichInputs) > 0 { + enrichResult, enrichErr := triage.EnrichIssues(workDir, enrichInputs) + if enrichErr != nil { + r.UI.Warn("Issue enrichment failed: " + enrichErr.Error()) + } else if len(enrichResult.Findings) > 0 || len(enrichResult.Facts) > 0 { + // Write enrichment findings via the same RunAllAnalyzers JSON path + writeEnrichmentResult(enrichResult, archiveName, writer, r.UI) + // Track artifacts on existing triage collector result (or synthetic enrichment) + results = appendEnrichmentArtifacts(results, enrichResult) + } + } + // Build summary and metadata r.UI.Section("Finalizing") sp := r.UI.StartSpinner("Writing report metadata...") @@ -338,7 +354,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } manifestMeta := output.ManifestMeta{ - SchemaVersion: "3.1.0", + SchemaVersion: "3.2.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: archiveName, Version: config.Version, @@ -494,3 +510,110 @@ func validateOutputDir(dir string) error { _ = os.Remove(test) return nil } + +// buildEnrichmentInputs iterates collector results and extracts issues suitable for enrichment. 
+func buildEnrichmentInputs(results []*collector.CollectorResult) []triage.EnrichmentInput { + var inputs []triage.EnrichmentInput + for _, res := range results { + if res.ID == "triage" { + continue // triage-owned issues are already rich + } + for _, issue := range res.Issues { + if issue.Fingerprint == "" { + continue + } + if _, mapped := triage.EnrichmentFamilyFor(issue.Code); !mapped { + continue + } + inputs = append(inputs, triage.EnrichmentInput{ + CollectorID: res.ID, + Code: issue.Code, + Fingerprint: issue.Fingerprint, + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Message: issue.Message, + RelatedArtifactPaths: issue.RelatedArtifactPaths, + Facts: res.Facts, + }) + } + } + return inputs +} + +// writeEnrichmentResult writes the enrichment triage result to triage/_data/issue_enrichment.json. +func writeEnrichmentResult(tr *triage.TriageResult, archiveID string, writer *output.Writer, u ui.UI) { + jsonPath := "triage/_data/" + tr.Name + ".json" + typedFacts := output.ConvertFacts(tr.Facts) + findings := tr.Findings + if findings == nil { + findings = []triage.Finding{} + } + + type envelope struct { + Kind string `json:"kind"` + SchemaVersion string `json:"schema_version"` + ArchiveID string `json:"archive_id"` + Analyzer string `json:"analyzer"` + Findings []triage.Finding `json:"findings"` + Facts map[string]any `json:"facts,omitempty"` + } + + jsonData, jErr := json.MarshalIndent(envelope{ + Kind: "triage_result", + SchemaVersion: "3.1.0", // NOTE(review): manifest/report schemas were bumped to 3.2.0 in this commit — confirm the triage envelope is meant to stay at 3.1.0 (must agree with the RunAllAnalyzers JSON path and triage-result.schema.json) + ArchiveID: archiveID, + Analyzer: tr.Name, + Findings: findings, + Facts: typedFacts, + }, "", " ") + if jErr != nil { + u.Warn("Failed to marshal enrichment result: " + jErr.Error()) + return + } + if rErr := writer.ReservePath(jsonPath); rErr != nil { + u.Warn("Failed to reserve " + jsonPath + ": " + rErr.Error()) + return + } + if wErr := writer.SaveOutput(jsonPath, string(jsonData)+"\n"); wErr != nil { + writer.ReleasePath(jsonPath) + u.Warn("Failed to write " + 
jsonPath + ": " + wErr.Error()) + return + } + tr.Artifacts = append(tr.Artifacts, jsonPath) +} + +// appendEnrichmentArtifacts adds enrichment artifacts to an existing triage collector result, +// or appends a synthetic "enrichment" collector so manifest.json stays consistent when triage +// did not run. +func appendEnrichmentArtifacts(results []*collector.CollectorResult, enrichResult *triage.TriageResult) []*collector.CollectorResult { + for _, res := range results { + if res.ID == "triage" { + for _, path := range enrichResult.Artifacts { + hint := "text" + if strings.HasSuffix(path, ".json") { + hint = "json" + } + res.AddProbeArtifact(path, hint, "triage") + } + for k, v := range enrichResult.Facts { + res.SetFact("enrichment_"+k, v) + } + return results + } + } + syn := collector.NewResult() + syn.ID = "enrichment" + syn.Name = "Enrichment" + for _, path := range enrichResult.Artifacts { + hint := "text" + if strings.HasSuffix(path, ".json") { + hint = "json" + } + syn.AddProbeArtifact(path, hint, "enrichment") + } + for k, v := range enrichResult.Facts { + syn.SetFact("enrichment_"+k, v) + } + return append(results, syn) +} diff --git a/customers/vm-troubleshooting/internal/runner/runner_enrichment_test.go b/customers/vm-troubleshooting/internal/runner/runner_enrichment_test.go new file mode 100644 index 0000000..2489d27 --- /dev/null +++ b/customers/vm-troubleshooting/internal/runner/runner_enrichment_test.go @@ -0,0 +1,50 @@ +package runner + +import ( + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/triage" +) + +func TestAppendEnrichmentArtifactsSyntheticCollector(t *testing.T) { + t.Parallel() + enrich := &triage.TriageResult{ + Name: "issue_enrichment", + Findings: nil, + Facts: map[string]string{"enriched_count": "0"}, + Artifacts: []string{"triage/_data/issue_enrichment.json"}, + } + base := []*collector.CollectorResult{collector.NewResult()} + out := 
appendEnrichmentArtifacts(base, enrich) + if len(out) != 2 { + t.Fatalf("expected synthetic collector appended, len=%d", len(out)) + } + syn := out[1] + if syn.ID != "enrichment" { + t.Fatalf("expected enrichment id, got %q", syn.ID) + } + if len(syn.Artifacts) != 1 || syn.Artifacts[0].Path != "triage/_data/issue_enrichment.json" { + t.Fatalf("unexpected artifacts: %+v", syn.Artifacts) + } +} + +func TestAppendEnrichmentArtifactsMergesIntoTriage(t *testing.T) { + t.Parallel() + tr := collector.NewResult() + tr.ID = "triage" + enrich := &triage.TriageResult{ + Artifacts: []string{"triage/_data/issue_enrichment.json"}, + Facts: map[string]string{"k": "v"}, + } + out := appendEnrichmentArtifacts([]*collector.CollectorResult{tr}, enrich) + if len(out) != 1 { + t.Fatalf("expected same slice length, got %d", len(out)) + } + if len(tr.Artifacts) != 1 { + t.Fatalf("expected artifact on triage, got %+v", tr.Artifacts) + } + if tr.Facts["enrichment_k"] != "v" { + t.Fatalf("expected merged fact, facts=%v", tr.Facts) + } +} diff --git a/customers/vm-troubleshooting/internal/sanitize/sanitize.go b/customers/vm-troubleshooting/internal/sanitize/sanitize.go index f59c747..4416195 100644 --- a/customers/vm-troubleshooting/internal/sanitize/sanitize.go +++ b/customers/vm-troubleshooting/internal/sanitize/sanitize.go @@ -15,6 +15,14 @@ var ( authorizationRE = regexp.MustCompile(`(?i)(Authorization:\s*)(Bearer|Basic)\s+[^\s]+`) commonEnvRE = regexp.MustCompile(`(?i)\b(AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|OPENAI_API_KEY|ANTHROPIC_API_KEY|HF_TOKEN|HUGGINGFACE_API_KEY|DATABASE_URL|PGPASSWORD|MYSQL_PWD|REDIS_PASSWORD)=[^\s]+`) ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;?]*[ -/]*[@-~]`) + + // dockerEnvSecretKeyRE matches environment variable names that should always be redacted. 
+ dockerEnvSecretKeyRE = regexp.MustCompile(`(?i)^(TOKEN|AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|GITHUB_TOKEN|GH_TOKEN|SLACK_TOKEN|VAULT_TOKEN|NPM_TOKEN|PYPI_TOKEN|CARGO_REGISTRY_TOKEN|OPENAI_API_KEY|ANTHROPIC_API_KEY|HF_TOKEN|HUGGINGFACE_TOKEN|HUGGINGFACE_API_KEY|DATABASE_URL|PGPASSWORD|MYSQL_PWD|REDIS_PASSWORD|.*(?:PASSWORD|PASSWD|PSK|SECRET|CREDENTIAL|API[_-]?KEY|AUTH[_-]?TOKEN|PRIVATE[_-]?KEY))$`) + + dockerEnvSkValueRE = regexp.MustCompile(`sk-[A-Za-z0-9_-]{20,}`) + dockerEnvPEMRE = regexp.MustCompile(`-----BEGIN (?:RSA |EC |OPENSSH |DSA |)?PRIVATE KEY-----`) + dockerEnvGitHubPATRE = regexp.MustCompile(`\bgh[opusr]_[A-Za-z0-9]+\b`) + dockerEnvSlackTokenRE = regexp.MustCompile(`xox[baprs]-[A-Za-z0-9-]+`) ) func DockerInspect(input string) string { @@ -30,6 +38,15 @@ func DockerInspect(input string) string { return string(out) } +// isCLIArgField returns true for JSON keys whose values contain CLI arguments. +func isCLIArgField(key string) bool { + switch strings.ToLower(key) { + case "cmd", "entrypoint", "args": + return true + } + return false +} + func sanitizeJSON(v any, key string) any { switch x := v.(type) { case map[string]any: @@ -53,12 +70,11 @@ func sanitizeJSON(v any, key string) any { } return out case []any: + if isCLIArgField(key) { + return redactArgSlice(x) + } out := make([]any, 0, len(x)) for _, item := range x { - if s, ok := item.(string); ok && (strings.EqualFold(key, "Cmd") || strings.EqualFold(key, "Entrypoint")) { - out = append(out, redactArgLikeSecret(s)) - continue - } out = append(out, sanitizeJSON(item, key)) } return out @@ -66,7 +82,7 @@ func sanitizeJSON(v any, key string) any { if configSecretRE.MatchString(key) { return "[REDACTED]" } - if strings.EqualFold(key, "Cmd") || strings.EqualFold(key, "Entrypoint") { + if isCLIArgField(key) { return redactArgLikeSecret(x) } return x @@ -75,12 +91,74 @@ func sanitizeJSON(v any, key string) any { } } +// redactArgSlice sanitizes a CLI argument array (e.g. Docker Cmd/Entrypoint). 
+// It handles both "--flag=value" (inline) and "--flag" "value" (separate elements), +// and detects standalone credential values like JWTs. +func redactArgSlice(args []any) []any { + out := make([]any, 0, len(args)) + redactNext := false + for _, item := range args { + s, ok := item.(string) + if !ok { + out = append(out, item) + redactNext = false + continue + } + if redactNext { + out = append(out, "[REDACTED]") + redactNext = false + continue + } + // key=value: redact value if key matches a secret keyword + if eqIdx := strings.IndexRune(s, '='); eqIdx >= 0 { + if configSecretRE.MatchString(s[:eqIdx]) { + out = append(out, s[:eqIdx+1]+"[REDACTED]") + continue + } + } + // CLI flag without =: if it matches a secret keyword, redact the next element + if strings.HasPrefix(s, "-") && configSecretRE.MatchString(s) { + out = append(out, s) + redactNext = true + continue + } + // Standalone credential value (e.g. JWT passed as positional arg) + if looksLikeToken(s) { + out = append(out, "[REDACTED]") + continue + } + out = append(out, s) + } + return out +} + +// looksLikeToken detects standalone credential values by format. +// Intentionally limited to high-confidence patterns to avoid false positives. 
+func looksLikeToken(s string) bool { + // JWT: base64url header.payload[.signature] — always starts with {" + return len(s) > 20 && strings.HasPrefix(s, "eyJ") && strings.Count(s, ".") >= 1 +} + func redactEnvString(s string) string { parts := strings.SplitN(s, "=", 2) if len(parts) != 2 { return s } - return parts[0] + "=[REDACTED]" + key, val := parts[0], parts[1] + if dockerEnvSecretKeyRE.MatchString(strings.TrimSpace(key)) { + return key + "=[REDACTED]" + } + return key + "=" + redactDockerEnvValue(val) +} + +func redactDockerEnvValue(val string) string { + out := dockerEnvSkValueRE.ReplaceAllString(val, "[REDACTED]") + out = dockerEnvGitHubPATRE.ReplaceAllString(out, "[REDACTED]") + out = dockerEnvSlackTokenRE.ReplaceAllString(out, "[REDACTED]") + if dockerEnvPEMRE.MatchString(out) { + return "[REDACTED]" + } + return out } func redactArgLikeSecret(s string) string { diff --git a/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go b/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go index 35c9833..da1e85c 100644 --- a/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go +++ b/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go @@ -70,17 +70,29 @@ func TestDockerInspect(t *testing.T) { check func(t *testing.T, out string) }{ { - "env vars redacted", - `[{"Config":{"Env":["TOKEN=abc","PATH=/usr/bin"]}}]`, + "env selective redaction preserves PATH", + `[{"Config":{"Env":["GITHUB_TOKEN=ghp_abc123456789012345678901234567890abcd","PATH=/usr/bin:/bin","HOME=/root","OPENAI_API_KEY=sk-proj-abcdefghijklmnopqrstuvwxyz1234567890","NOT_A_SECRET=-----BEGIN RSA PRIVATE KEY-----\nMIIB\n-----END RSA PRIVATE KEY-----"]}}]`, func(t *testing.T, out string) { - if strings.Contains(out, "abc") { - t.Error("TOKEN value should be redacted") + if strings.Contains(out, "ghp_") { + t.Error("GitHub PAT value should be redacted") } - if !strings.Contains(out, "TOKEN=[REDACTED]") { - t.Error("expected TOKEN=[REDACTED]") + if 
!strings.Contains(out, "GITHUB_TOKEN=[REDACTED]") { + t.Error("expected GITHUB_TOKEN=[REDACTED]") } - if !strings.Contains(out, "PATH=[REDACTED]") { - // All env values are redacted + if !strings.Contains(out, "PATH=/usr/bin:/bin") { + t.Error("PATH should be preserved") + } + if !strings.Contains(out, "HOME=/root") { + t.Error("HOME should be preserved") + } + if strings.Contains(out, "sk-proj-") { + t.Error("OpenAI key value should be redacted") + } + if !strings.Contains(out, "OPENAI_API_KEY=[REDACTED]") { + t.Error("expected OPENAI_API_KEY=[REDACTED]") + } + if !strings.Contains(out, "NOT_A_SECRET=[REDACTED]") { + t.Error("PEM in value should be value-redacted to single [REDACTED]") } }, }, @@ -97,14 +109,88 @@ func TestDockerInspect(t *testing.T) { }, }, { - "cmd arrays redacted", + "cmd inline flag=value redacted", `{"Config":{"Cmd":["--token=secret123","serve"]}}`, func(t *testing.T, out string) { if strings.Contains(out, "secret123") { t.Error("Cmd token should be redacted") } - if !strings.Contains(out, "[REDACTED]") { - t.Error("expected redacted output") + if !strings.Contains(out, "--token=[REDACTED]") { + t.Error("expected --token=[REDACTED]") + } + if !strings.Contains(out, "serve") { + t.Error("non-secret arg should be preserved") + } + }, + }, + { + "cmd separate flag value redacted", + `[{"Config":{"Cmd":["--api-key","secret_jwt_value","--port","8080"]}}]`, + func(t *testing.T, out string) { + if strings.Contains(out, "secret_jwt_value") { + t.Error("api-key value in separate element should be redacted") + } + if !strings.Contains(out, "--api-key") { + t.Error("flag name should be preserved") + } + if !strings.Contains(out, "8080") { + t.Error("non-secret --port value should be preserved") + } + }, + }, + { + "cmd standalone JWT redacted", + `[{"Config":{"Cmd":["serve","eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoiYWxpY2UifQ.sig"]}}]`, + func(t *testing.T, out string) { + if strings.Contains(out, "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9") { + 
t.Error("standalone JWT should be redacted") + } + if !strings.Contains(out, "serve") { + t.Error("non-secret positional arg should be preserved") + } + }, + }, + { + "entrypoint separate flag value redacted", + `{"Config":{"Entrypoint":["myapp","--password","hunter2"]}}`, + func(t *testing.T, out string) { + if strings.Contains(out, "hunter2") { + t.Error("password value should be redacted") + } + if !strings.Contains(out, "--password") { + t.Error("flag name should be preserved") + } + if !strings.Contains(out, "myapp") { + t.Error("program name should be preserved") + } + }, + }, + { + "args field separate flag value redacted", + `[{"Args":["serve","--api-key","eyJhbGciOiJIUzI1NiJ9.eyJ1c2VyIjoiYWxpY2UifQ.sig","--port","8000"]}]`, + func(t *testing.T, out string) { + if strings.Contains(out, "eyJhbGciOiJIUzI1NiJ9") { + t.Error("Args api-key JWT should be redacted") + } + if !strings.Contains(out, "--api-key") { + t.Error("flag name should be preserved") + } + if !strings.Contains(out, "8000") { + t.Error("non-secret --port value should be preserved") + } + if !strings.Contains(out, "serve") { + t.Error("program command should be preserved") + } + }, + }, + { + "cmd no false positives on normal args", + `{"Config":{"Cmd":["python","app.py","--host","0.0.0.0","--workers","4"]}}`, + func(t *testing.T, out string) { + for _, want := range []string{"python", "app.py", "--host", "0.0.0.0", "--workers", "4"} { + if !strings.Contains(out, want) { + t.Errorf("expected %q to be preserved, got: %s", want, out) + } } }, }, @@ -136,6 +222,31 @@ func TestDockerInspect(t *testing.T) { } } +func TestLooksLikeToken(t *testing.T) { + tests := []struct { + input string + want bool + }{ + {"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoiYWxpY2UifQ.signature", true}, + {"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0", true}, + {"eyJ", false}, // too short + {"secret123", false}, // no JWT structure + {"8080", false}, // port number + {"/usr/bin/python", false}, // file path + 
{"", false}, // empty + {"serve", false}, // normal arg + {"--api-key", false}, // flag name, not a value + {"PYTHONPATH=/usr/lib", false}, // env-style, not a token + } + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + if got := looksLikeToken(tt.input); got != tt.want { + t.Errorf("looksLikeToken(%q) = %v, want %v", tt.input, got, tt.want) + } + }) + } +} + func TestGrepLines(t *testing.T) { input := "line1 foo\nline2 bar\nline3 foo bar\n" got := GrepLines(input, "foo") diff --git a/customers/vm-troubleshooting/internal/triage/enrichment.go b/customers/vm-troubleshooting/internal/triage/enrichment.go new file mode 100644 index 0000000..939d63d --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/enrichment.go @@ -0,0 +1,431 @@ +package triage + +import ( + "fmt" + "regexp" + "sort" + "strings" + + "github.com/NexGenCloud/vm-diagnostics/internal/collector" +) + +// EnrichmentInput carries the issue fields needed for enrichment. +type EnrichmentInput struct { + CollectorID string + Code collector.IssueCode + Fingerprint string + Severity collector.Severity + Confidence collector.Confidence + Category string + Message string + RelatedArtifactPaths []string + Facts map[string]string +} + +// enrichmentFamily maps collector issue codes to enrichment finding families. +var enrichmentFamily = map[collector.IssueCode]FindingCode{ + collector.IssueSvcFailed: FindingServiceState, + collector.IssueSvcFabricmanagerBenign: FindingServiceState, + collector.IssueDiskWarning: FindingDiskCapacity, + collector.IssueDiskCritical: FindingDiskCapacity, + collector.IssueOOMEvents: FindingMemoryPressure, +} + +// familyEnricher is a function that produces a rich finding for a mapped issue code. +type familyEnricher func(issue EnrichmentInput, workDir string, resolver *EvidenceResolver) *Finding + +// familyEnrichers maps finding family codes to their enricher implementations. 
+var familyEnrichers = map[FindingCode]familyEnricher{ + FindingServiceState: enrichServiceState, + FindingDiskCapacity: enrichDiskCapacity, + FindingMemoryPressure: enrichMemoryPressure, +} + +// EnrichIssues produces enrichment findings for collector-owned issues. +// Returns at most one finding per input issue. Findings are metadata-only: +// they are written to triage/_data/ but NOT converted to manifest issues. +func EnrichIssues(workDir string, issues []EnrichmentInput) (*TriageResult, error) { + result := &TriageResult{ + Name: "issue_enrichment", + Facts: make(map[string]string), + } + + resolver := NewEvidenceResolver(workDir) + enrichedCount := 0 + genericCount := 0 + unavailableCount := 0 + + for _, issue := range issues { + family, mapped := enrichmentFamily[issue.Code] + if !mapped { + continue + } + + var finding *Finding + + // Try family enricher first + if enricher, ok := familyEnrichers[family]; ok { + finding = enricher(issue, workDir, resolver) + } + + // Fall back to generic if family enricher returned nil or doesn't exist + if finding == nil { + finding = enrichGeneric(issue, family, resolver) + } + + if finding == nil { + unavailableCount++ + continue + } + + // Ensure fingerprint matches the collector issue + finding.Fingerprint = issue.Fingerprint + finding.Code = family + + result.Findings = append(result.Findings, *finding) + + if finding.Title != humanizeCode(string(issue.Code)) { + enrichedCount++ + } else { + genericCount++ + } + } + + result.Facts["enriched_count"] = fmt.Sprintf("%d", enrichedCount) + result.Facts["generic_count"] = fmt.Sprintf("%d", genericCount) + result.Facts["unavailable_count"] = fmt.Sprintf("%d", unavailableCount) + + return result, nil +} + +// enrichGeneric produces a stub finding using evidence from related artifacts. 
+func enrichGeneric(issue EnrichmentInput, family FindingCode, resolver *EvidenceResolver) *Finding { + artifacts := resolver.Resolve(issue.RelatedArtifactPaths) + evidence := ExtractLines(artifacts, 10) + + if len(evidence) == 0 && len(issue.RelatedArtifactPaths) == 0 { + return nil + } + + title := humanizeCode(string(issue.Code)) + + var action string + switch issue.Severity { + case collector.SeverityCritical: + action = "Investigate immediately. Review the related artifacts for details." + case collector.SeverityWarning: + action = "Monitor the situation. Review the related artifacts for details." + default: + action = "Review the related artifacts for details." + } + + return &Finding{ + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Title: title, + Description: issue.Message, + Action: action, + Evidence: evidence, + SourceArtifacts: issue.RelatedArtifactPaths, + } +} + +// humanizeCode converts an issue code like "oom_events" to "OOM Events". +func humanizeCode(code string) string { + // Known acronyms to uppercase + acronyms := map[string]string{ + "oom": "OOM", "svc": "Service", "gpu": "GPU", "cpu": "CPU", + } + parts := strings.Split(code, "_") + for i, p := range parts { + if upper, ok := acronyms[p]; ok { + parts[i] = upper + } else if len(p) > 0 { + parts[i] = strings.ToUpper(p[:1]) + p[1:] + } + } + return strings.Join(parts, " ") +} + +// --- Family enrichers --- + +// enrichServiceState handles svc_failed and svc_fabricmanager_benign. 
+func enrichServiceState(issue EnrichmentInput, workDir string, resolver *EvidenceResolver) *Finding { + switch issue.Code { + case collector.IssueSvcFabricmanagerBenign: + return enrichFabricmanagerBenign(issue, resolver) + case collector.IssueSvcFailed: + return enrichSvcFailed(issue, workDir, resolver) + default: + return nil + } +} + +func enrichFabricmanagerBenign(issue EnrichmentInput, resolver *EvidenceResolver) *Finding { + artifacts := resolver.Resolve(issue.RelatedArtifactPaths) + evidence := ExtractLines(artifacts, 5) + + return &Finding{ + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Title: "NVIDIA Fabric Manager Not Running (Expected)", + Description: "nvidia-fabricmanager.service is not running, but this is expected because no NVSwitch " + + "or SXM topology was detected on this system. Fabric Manager is only required for multi-GPU " + + "NVLink/NVSwitch configurations.", + Action: "No action needed. This is informational — the service would only be required if NVSwitch hardware were present.", + Evidence: evidence, + SourceArtifacts: issue.RelatedArtifactPaths, + } +} + +func enrichSvcFailed(issue EnrichmentInput, workDir string, resolver *EvidenceResolver) *Finding { + // Read failed services artifact to extract service names + artifacts := resolver.Resolve(issue.RelatedArtifactPaths) + failedLines := ExtractLines(artifacts, 20) + + // Parse service names from failed_services.txt lines + var serviceNames []string + for _, line := range failedLines { + fields := strings.Fields(line) + if len(fields) > 0 { + name := strings.TrimSuffix(fields[0], ".service") + if name != "" { + serviceNames = append(serviceNames, name) + } + } + } + + // Collect per-service status artifacts for additional evidence + var statusPaths []string + for _, svc := range serviceNames { + statusPath := fmt.Sprintf("services/status_%s.txt", svc) + statusPaths = append(statusPaths, statusPath) + } + statusArtifacts := 
resolver.Resolve(statusPaths) + statusEvidence := ExtractLines(statusArtifacts, 10) + + allEvidence := append(failedLines, statusEvidence...) + if len(allEvidence) > 15 { + allEvidence = allEvidence[:15] + } + + allSourcePaths := append([]string(nil), issue.RelatedArtifactPaths...) + for _, a := range statusArtifacts { + if !a.Missing { + allSourcePaths = append(allSourcePaths, a.Path) + } + } + + sort.Strings(serviceNames) + nameList := strings.Join(serviceNames, ", ") + count := len(serviceNames) + if count == 0 { + count = 1 + nameList = "(unknown)" + } + + return &Finding{ + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Title: "Failed Systemd Services", + Description: fmt.Sprintf("%d systemd service(s) in failed state: %s. "+ + "Failed services may indicate configuration errors, missing dependencies, "+ + "or runtime crashes that need investigation.", count, nameList), + Action: "For each failed service, run 'systemctl status ' and " + + "'journalctl -u --no-pager -n 50' to identify the failure cause. " + + "Restart with 'systemctl restart ' after resolving the underlying issue.", + Evidence: allEvidence, + SourceArtifacts: allSourcePaths, + } +} + +// diskMessageRe parses the mountpoint from the collector's disk issue message format. +// Format: "%s at %.0f%% capacity" (system.go) +var diskMessageRe = regexp.MustCompile(`^(.+) at (\d+)% capacity$`) + +// enrichDiskCapacity handles disk_warning and disk_critical. 
+func enrichDiskCapacity(issue EnrichmentInput, workDir string, resolver *EvidenceResolver) *Finding { + artifacts := resolver.Resolve(issue.RelatedArtifactPaths) + allLines := ExtractLines(artifacts, 30) + + // Parse mountpoint and usage from issue message + mountpoint := "" + usedPct := "" + if m := diskMessageRe.FindStringSubmatch(issue.Message); len(m) == 3 { + mountpoint = m[1] + usedPct = m[2] + } + + // Find relevant df line for this mountpoint + var evidence []string + if len(allLines) > 0 { + // First line is usually the header + evidence = append(evidence, allLines[0]) + } + for _, line := range allLines[1:] { + if mountpoint != "" && strings.Contains(line, mountpoint) { + evidence = append(evidence, line) + } + } + if len(evidence) <= 1 && len(allLines) > 1 { + // Couldn't find specific mountpoint — include a few lines + end := len(allLines) + if end > 6 { + end = 6 + } + evidence = allLines[:end] + } + + var title, description, action string + if issue.Code == collector.IssueDiskCritical { + title = "Disk Space Critical" + description = fmt.Sprintf("%s is at %s%% capacity.", mountpoint, usedPct) + if mountpoint == "" { + description = issue.Message + } + description += " Running out of disk space can cause service failures, failed writes, and system instability." + action = "Immediately free disk space. Check for large log files (du -sh /var/log/*), " + + "package caches (apt clean), and temporary files. Consider expanding the volume if cleanup is insufficient." + } else { + title = "Disk Space Warning" + description = fmt.Sprintf("%s is at %s%% capacity.", mountpoint, usedPct) + if mountpoint == "" { + description = issue.Message + } + description += " Monitor disk usage to prevent the situation from becoming critical." + action = "Monitor disk usage. Consider cleanup of old logs, package caches, and " + + "temporary files before the situation becomes critical." 
+ } + + return &Finding{ + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Title: title, + Description: description, + Action: action, + Evidence: evidence, + SourceArtifacts: issue.RelatedArtifactPaths, + } +} + +// enrichMemoryPressure handles oom_events. +func enrichMemoryPressure(issue EnrichmentInput, workDir string, resolver *EvidenceResolver) *Finding { + artifacts := resolver.Resolve(issue.RelatedArtifactPaths) + evidence := ExtractLines(artifacts, 10) + + // Use oom_event_count from facts if available, otherwise fall back to issue message + eventCount := issue.Facts["oom_event_count"] + if eventCount == "" || eventCount == "unavailable" { + eventCount = "multiple" + } + + description := fmt.Sprintf( + "The kernel OOM killer was invoked %s time(s). "+ + "OOM kills indicate the system ran out of available memory, "+ + "forcing the kernel to kill processes to prevent a complete system hang.", + eventCount, + ) + + // Try to extract killed process names from evidence + var killedProcesses []string + for _, line := range evidence { + lower := strings.ToLower(line) + if strings.Contains(lower, "killed process") { + killedProcesses = append(killedProcesses, line) + } + } + if len(killedProcesses) > 0 { + description += " Affected processes: " + summarizeKilledProcesses(killedProcesses) + } + + return &Finding{ + Severity: issue.Severity, + Confidence: issue.Confidence, + Category: issue.Category, + Title: "OOM Killer Activated", + Description: description, + Action: "Check application memory usage and limits. Consider increasing system RAM, " + + "enabling or expanding swap, or reducing workload memory footprint. " + + "Review the killed processes in logs/oom_events.txt to identify the memory-hungry application.", + Evidence: evidence, + SourceArtifacts: issue.RelatedArtifactPaths, + } +} + +// summarizeKilledProcesses extracts process names from OOM kill lines for the description. 
+func summarizeKilledProcesses(lines []string) string { + seen := make(map[string]struct{}) + var names []string + re := regexp.MustCompile(`[Kk]illed process \d+ \(([^)]+)\)`) + for _, line := range lines { + if m := re.FindStringSubmatch(line); len(m) == 2 { + name := m[1] + if _, ok := seen[name]; !ok { + seen[name] = struct{}{} + names = append(names, name) + } + } + } + if len(names) == 0 { + return "(see evidence)" + } + return strings.Join(names, ", ") +} + +// EnrichmentFamilyFor returns the finding family code for a given issue code, if mapped. +func EnrichmentFamilyFor(code collector.IssueCode) (FindingCode, bool) { + fc, ok := enrichmentFamily[code] + return fc, ok +} + +// ValidateEnrichmentFamily checks that every enrichmentFamily entry maps to a valid FindingCode. +func ValidateEnrichmentFamily() error { + for code, family := range enrichmentFamily { + if !EnrichmentFindingCodes[string(family)] { + return fmt.Errorf("enrichmentFamily[%q] maps to %q which is not in EnrichmentFindingCodes", code, family) + } + } + // Check disjointness: collector issue codes and classifier finding codes must not overlap. + for code := range collector.CollectorIssueCodes { + if ClassifierFindingCodes[code] { + return fmt.Errorf("issue code %q appears in both CollectorIssueCodes and ClassifierFindingCodes", code) + } + } + return nil +} + +// VerifyFingerprintMatch checks that an enrichment finding's fingerprint matches a collector issue. +// This is exposed for contract testing. +func VerifyFingerprintMatch(finding Finding, issueFingerprints map[string]bool) bool { + return issueFingerprints[finding.Fingerprint] +} + +// init validates the enrichment family map at program start. +func init() { + if err := ValidateEnrichmentFamily(); err != nil { + panic("enrichment configuration error: " + err.Error()) + } +} + +// ---- Helpers for test access ---- + +// EnrichmentFamilyMap returns a copy of the enrichmentFamily map for testing. 
+func EnrichmentFamilyMap() map[collector.IssueCode]FindingCode { + m := make(map[collector.IssueCode]FindingCode, len(enrichmentFamily)) + for k, v := range enrichmentFamily { + m[k] = v + } + return m +} + +// HumanizeCode is exported for testing the generic fallback title generation. +func HumanizeCode(code string) string { + return humanizeCode(code) +} + +// DiskMessagePattern is exported for testing that the disk message format is parseable. +var DiskMessagePattern = diskMessageRe diff --git a/customers/vm-troubleshooting/internal/triage/enrichment_test.go b/customers/vm-troubleshooting/internal/triage/enrichment_test.go new file mode 100644 index 0000000..db06681 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/enrichment_test.go @@ -0,0 +1,477 @@ +package triage + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/collector" + "github.com/NexGenCloud/vm-diagnostics/internal/identity" +) + +func TestEnrichIssues_OOMEvents(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + writeArtifact(t, workDir, "logs/oom_events.txt", + "Jun 10 14:22:01 host kernel: Out of memory: Killed process 1234 (python3), total-vm:8192000kB\n"+ + "Jun 10 14:22:01 host kernel: oom-kill: constraint=CONSTRAINT_NONE\n") + + inputs := []EnrichmentInput{{ + CollectorID: "journal", + Code: collector.IssueOOMEvents, + Fingerprint: identity.Fingerprint("journal", string(collector.IssueOOMEvents)), + Severity: collector.SeverityCritical, + Confidence: collector.ConfidenceHigh, + Category: "MEM", + Message: "1 OOM killer event(s)", + RelatedArtifactPaths: []string{"logs/oom_events.txt"}, + Facts: map[string]string{"oom_event_count": "1"}, + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(result.Findings)) + } + + f := result.Findings[0] + if f.Code != FindingMemoryPressure { + t.Errorf("code = 
%q, want %q", f.Code, FindingMemoryPressure) + } + if f.Title != "OOM Killer Activated" { + t.Errorf("title = %q, want OOM Killer Activated", f.Title) + } + if f.Fingerprint != inputs[0].Fingerprint { + t.Errorf("fingerprint mismatch: got %q, want %q", f.Fingerprint, inputs[0].Fingerprint) + } + if !strings.Contains(f.Description, "python3") { + t.Errorf("description should mention killed process, got: %s", f.Description) + } + if len(f.Evidence) == 0 { + t.Error("expected non-empty evidence") + } + if len(f.SourceArtifacts) == 0 { + t.Error("expected non-empty source_artifacts") + } +} + +func TestEnrichIssues_DiskCritical(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + writeArtifact(t, workDir, "hardware/disk.txt", + "Filesystem Size Used Avail Use% Mounted on\n"+ + "/dev/sda1 100G 96G 4.0G 96% /\n"+ + "/dev/sdb1 50G 20G 30G 40% /data\n") + + inputs := []EnrichmentInput{{ + CollectorID: "system", + Code: collector.IssueDiskCritical, + Fingerprint: identity.Fingerprint("sys", string(collector.IssueDiskCritical), "/"), + Severity: collector.SeverityCritical, + Confidence: collector.ConfidenceHigh, + Category: "DISK", + Message: "/ at 96% capacity", + RelatedArtifactPaths: []string{"hardware/disk.txt"}, + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(result.Findings)) + } + + f := result.Findings[0] + if f.Code != FindingDiskCapacity { + t.Errorf("code = %q, want %q", f.Code, FindingDiskCapacity) + } + if f.Title != "Disk Space Critical" { + t.Errorf("title = %q, want Disk Space Critical", f.Title) + } + if !strings.Contains(f.Description, "96%") { + t.Errorf("description should mention 96%%, got: %s", f.Description) + } +} + +func TestEnrichIssues_DiskWarning(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + writeArtifact(t, workDir, "hardware/disk.txt", + "Filesystem Size Used Avail Use% Mounted on\n"+ + "/dev/sda1 100G 87G 13G 
87% /var\n") + + inputs := []EnrichmentInput{{ + CollectorID: "system", + Code: collector.IssueDiskWarning, + Fingerprint: identity.Fingerprint("sys", string(collector.IssueDiskWarning), "/var"), + Severity: collector.SeverityWarning, + Confidence: collector.ConfidenceHigh, + Category: "DISK", + Message: "/var at 87% capacity", + RelatedArtifactPaths: []string{"hardware/disk.txt"}, + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(result.Findings)) + } + + f := result.Findings[0] + if f.Title != "Disk Space Warning" { + t.Errorf("title = %q, want Disk Space Warning", f.Title) + } +} + +func TestEnrichIssues_SvcFailed(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + writeArtifact(t, workDir, "services/failed_services.txt", + "docker.service loaded failed failed Docker Application Container Engine\n"+ + "kubelet.service loaded failed failed Kubernetes Kubelet\n") + writeArtifact(t, workDir, "services/status_docker.txt", + "docker.service - Docker Application Container Engine\n Loaded: loaded\n Active: failed\n") + + inputs := []EnrichmentInput{{ + CollectorID: "services", + Code: collector.IssueSvcFailed, + Fingerprint: identity.Fingerprint("svc", string(collector.IssueSvcFailed), "docker.service", "kubelet.service"), + Severity: collector.SeverityCritical, + Confidence: collector.ConfidenceHigh, + Category: "SVC", + Message: "2 failed systemd service(s)", + RelatedArtifactPaths: []string{"services/failed_services.txt"}, + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(result.Findings)) + } + + f := result.Findings[0] + if f.Code != FindingServiceState { + t.Errorf("code = %q, want %q", f.Code, FindingServiceState) + } + if f.Title != "Failed Systemd Services" { + t.Errorf("title = %q, want Failed Systemd Services", 
f.Title) + } + if !strings.Contains(f.Description, "docker") { + t.Errorf("description should mention docker, got: %s", f.Description) + } + if !strings.Contains(f.Description, "kubelet") { + t.Errorf("description should mention kubelet, got: %s", f.Description) + } +} + +func TestEnrichIssues_FabricmanagerBenign(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + writeArtifact(t, workDir, "services/status_nvidia-fabricmanager.txt", + "nvidia-fabricmanager.service - NVIDIA Fabric Manager\n Active: failed (Result: exit-code)\n NV_WARN_NOTHING_TO_DO\n") + + inputs := []EnrichmentInput{{ + CollectorID: "services", + Code: collector.IssueSvcFabricmanagerBenign, + Fingerprint: identity.Fingerprint("svc", string(collector.IssueSvcFabricmanagerBenign)), + Severity: collector.SeverityWarning, + Confidence: collector.ConfidenceHigh, + Category: "SVC", + Message: "nvidia-fabricmanager not running (expected: no NVSwitch/SXM detected)", + RelatedArtifactPaths: []string{"services/status_nvidia-fabricmanager.txt"}, + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(result.Findings)) + } + + f := result.Findings[0] + if f.Title != "NVIDIA Fabric Manager Not Running (Expected)" { + t.Errorf("title = %q", f.Title) + } + if !strings.Contains(f.Description, "NVSwitch") { + t.Errorf("description should mention NVSwitch, got: %s", f.Description) + } + if !strings.Contains(f.Action, "No action needed") { + t.Errorf("action should say no action needed, got: %s", f.Action) + } +} + +func TestEnrichIssues_MissingArtifact_FallsBackToGeneric(t *testing.T) { + t.Parallel() + workDir := t.TempDir() // no artifacts at all + + inputs := []EnrichmentInput{{ + CollectorID: "journal", + Code: collector.IssueOOMEvents, + Fingerprint: identity.Fingerprint("journal", string(collector.IssueOOMEvents)), + Severity: collector.SeverityCritical, + Confidence: collector.ConfidenceHigh, 
+ Category: "MEM", + Message: "3 OOM killer event(s)", + RelatedArtifactPaths: []string{"logs/oom_events.txt"}, + Facts: map[string]string{"oom_event_count": "3"}, + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + // Should still produce a finding (family enricher falls back gracefully) + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding even with missing artifact, got %d", len(result.Findings)) + } + f := result.Findings[0] + if f.Title != "OOM Killer Activated" { + t.Errorf("title = %q, expected OOM Killer Activated", f.Title) + } +} + +func TestEnrichIssues_UnmappedCode_Skipped(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + + inputs := []EnrichmentInput{{ + CollectorID: "gpu", + Code: "gpu_temperature_warning", // not in enrichmentFamily + Fingerprint: "abc123", + Severity: collector.SeverityWarning, + Confidence: collector.ConfidenceHigh, + Category: "GPU", + Message: "GPU temperature above threshold", + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + if len(result.Findings) != 0 { + t.Errorf("expected 0 findings for unmapped code, got %d", len(result.Findings)) + } +} + +func TestEnrichIssues_Facts(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + writeArtifact(t, workDir, "logs/oom_events.txt", "Out of memory: Killed process 1 (a)\n") + + inputs := []EnrichmentInput{{ + CollectorID: "journal", + Code: collector.IssueOOMEvents, + Fingerprint: "fp1", + Severity: collector.SeverityCritical, + Confidence: collector.ConfidenceHigh, + Category: "MEM", + Message: "1 OOM killer event(s)", + RelatedArtifactPaths: []string{"logs/oom_events.txt"}, + Facts: map[string]string{"oom_event_count": "1"}, + }} + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + if result.Facts["enriched_count"] != "1" { + t.Errorf("enriched_count = %q, want 1", result.Facts["enriched_count"]) + } + if result.Facts["unavailable_count"] != "0" { 
+ t.Errorf("unavailable_count = %q, want 0", result.Facts["unavailable_count"]) + } +} + +func TestDiskMessagePattern_Parseable(t *testing.T) { + t.Parallel() + // These are the exact message formats produced by system.go + cases := []struct { + msg string + mountpoint string + pct string + }{ + {"/ at 96% capacity", "/", "96"}, + {"/var at 87% capacity", "/var", "87"}, + {"/home/user at 100% capacity", "/home/user", "100"}, + } + for _, tc := range cases { + m := DiskMessagePattern.FindStringSubmatch(tc.msg) + if len(m) != 3 { + t.Errorf("failed to parse %q", tc.msg) + continue + } + if m[1] != tc.mountpoint { + t.Errorf("mountpoint = %q, want %q for %q", m[1], tc.mountpoint, tc.msg) + } + if m[2] != tc.pct { + t.Errorf("pct = %q, want %q for %q", m[2], tc.pct, tc.msg) + } + } +} + +func TestHumanizeCode(t *testing.T) { + t.Parallel() + cases := map[string]string{ + "oom_events": "OOM Events", + "svc_failed": "Service Failed", + "disk_critical": "Disk Critical", + "svc_fabricmanager_benign": "Service Fabricmanager Benign", + } + for input, want := range cases { + got := HumanizeCode(input) + if got != want { + t.Errorf("HumanizeCode(%q) = %q, want %q", input, got, want) + } + } +} + +// --- Contract / governance tests --- + +func TestEnrichmentFamilyMapsToValidFindingCodes(t *testing.T) { + t.Parallel() + if err := ValidateEnrichmentFamily(); err != nil { + t.Fatal(err) + } +} + +func TestCollectorAndClassifierCodesAreDisjoint(t *testing.T) { + t.Parallel() + for code := range collector.CollectorIssueCodes { + if ClassifierFindingCodes[code] { + t.Errorf("code %q appears in both CollectorIssueCodes and ClassifierFindingCodes", code) + } + } +} + +func TestEnrichmentAndClassifierCodesAreDisjoint(t *testing.T) { + t.Parallel() + for code := range EnrichmentFindingCodes { + if ClassifierFindingCodes[code] { + t.Errorf("code %q appears in both EnrichmentFindingCodes and ClassifierFindingCodes", code) + } + } +} + +func TestAllFindingCodesIsUnion(t *testing.T) { + 
t.Parallel() + expected := len(ClassifierFindingCodes) + len(EnrichmentFindingCodes) + if len(AllFindingCodes) != expected { + t.Errorf("AllFindingCodes has %d entries, expected %d (classifier=%d + enrichment=%d)", + len(AllFindingCodes), expected, len(ClassifierFindingCodes), len(EnrichmentFindingCodes)) + } + for code := range ClassifierFindingCodes { + if !AllFindingCodes[code] { + t.Errorf("ClassifierFindingCode %q missing from AllFindingCodes", code) + } + } + for code := range EnrichmentFindingCodes { + if !AllFindingCodes[code] { + t.Errorf("EnrichmentFindingCode %q missing from AllFindingCodes", code) + } + } +} + +func TestEveryEnrichmentFamilyEntryMapsToValidFindingCode(t *testing.T) { + t.Parallel() + for issueCode, findingCode := range EnrichmentFamilyMap() { + if !EnrichmentFindingCodes[string(findingCode)] { + t.Errorf("enrichmentFamily[%q] = %q, not in EnrichmentFindingCodes", issueCode, findingCode) + } + } +} + +func TestFingerprintMatchBetweenCollectorAndEnrichment(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + writeArtifact(t, workDir, "logs/oom_events.txt", "Out of memory: Killed process 1 (test)\n") + writeArtifact(t, workDir, "hardware/disk.txt", "Filesystem Size Used Avail Use% Mounted on\n/dev/sda1 100G 96G 4G 96% /\n") + writeArtifact(t, workDir, "services/failed_services.txt", "docker.service loaded failed\n") + writeArtifact(t, workDir, "services/status_nvidia-fabricmanager.txt", "NV_WARN_NOTHING_TO_DO\n") + + // These fingerprints match what the collectors produce + cases := []struct { + code collector.IssueCode + fingerprint string + msg string + paths []string + facts map[string]string + }{ + { + collector.IssueOOMEvents, + identity.Fingerprint("journal", string(collector.IssueOOMEvents)), + "1 OOM killer event(s)", + []string{"logs/oom_events.txt"}, + map[string]string{"oom_event_count": "1"}, + }, + { + collector.IssueDiskCritical, + identity.Fingerprint("sys", string(collector.IssueDiskCritical), "/"), + "/ at 96% 
capacity", + []string{"hardware/disk.txt"}, + nil, + }, + { + collector.IssueSvcFailed, + identity.Fingerprint("svc", string(collector.IssueSvcFailed), "docker.service"), + "1 failed systemd service(s)", + []string{"services/failed_services.txt"}, + nil, + }, + { + collector.IssueSvcFabricmanagerBenign, + identity.Fingerprint("svc", string(collector.IssueSvcFabricmanagerBenign)), + "nvidia-fabricmanager not running", + []string{"services/status_nvidia-fabricmanager.txt"}, + nil, + }, + } + + var inputs []EnrichmentInput + issueFingerprints := make(map[string]bool) + for _, tc := range cases { + issueFingerprints[tc.fingerprint] = true + inputs = append(inputs, EnrichmentInput{ + CollectorID: "test", + Code: tc.code, + Fingerprint: tc.fingerprint, + Severity: collector.SeverityCritical, + Confidence: collector.ConfidenceHigh, + Category: "TEST", + Message: tc.msg, + RelatedArtifactPaths: tc.paths, + Facts: tc.facts, + }) + } + + result, err := EnrichIssues(workDir, inputs) + if err != nil { + t.Fatal(err) + } + + for _, f := range result.Findings { + if !VerifyFingerprintMatch(f, issueFingerprints) { + t.Errorf("enrichment finding fingerprint %q does not match any collector issue", f.Fingerprint) + } + } +} + +// writeArtifact creates a file under workDir with the given content. 
+func writeArtifact(t *testing.T, workDir, relPath, content string) { + t.Helper() + absPath := filepath.Join(workDir, relPath) + if err := os.MkdirAll(filepath.Dir(absPath), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(absPath, []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/evidence.go b/customers/vm-troubleshooting/internal/triage/evidence.go new file mode 100644 index 0000000..92d31fc --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/evidence.go @@ -0,0 +1,107 @@ +package triage + +import ( + "os" + "path/filepath" + "strings" +) + +// maxEvidenceBytesPerIssue caps the total bytes read across all artifact files for a single issue. +const maxEvidenceBytesPerIssue = 64 * 1024 + +// EvidenceResolver reads bounded content from artifact files for evidence extraction. +// It does NOT interpret what the evidence means — it answers "what content is at these paths?". +type EvidenceResolver struct { + workDir string +} + +// NewEvidenceResolver creates a resolver rooted at workDir. +func NewEvidenceResolver(workDir string) *EvidenceResolver { + return &EvidenceResolver{workDir: workDir} +} + +// ResolvedArtifact holds the bounded content from a single artifact file. +type ResolvedArtifact struct { + Path string + Content string + Truncated bool + Missing bool +} + +// Resolve reads artifact files listed in paths, returning bounded content. +// Missing files are annotated but not fatal. Total bytes across all files is capped. 
+func (r *EvidenceResolver) Resolve(paths []string) []ResolvedArtifact {
+	if len(paths) == 0 {
+		return nil
+	}
+
+	var results []ResolvedArtifact
+	// remaining is a byte budget shared across ALL files for this issue;
+	// each successful read shrinks it.
+	remaining := maxEvidenceBytesPerIssue
+
+	for _, p := range paths {
+		if remaining <= 0 {
+			// Budget exhausted: flag the artifact as truncated without reading it.
+			// NOTE(review): this reports Truncated even if the file does not
+			// exist — Missing is only detected for files we actually open.
+			results = append(results, ResolvedArtifact{Path: p, Truncated: true})
+			continue
+		}
+
+		data, err := os.ReadFile(filepath.Join(r.workDir, p))
+		if err != nil {
+			// Any read error (not just ENOENT) is surfaced as Missing.
+			results = append(results, ResolvedArtifact{Path: p, Missing: true})
+			continue
+		}
+
+		content := string(data)
+		truncated := false
+		if len(content) > remaining {
+			// Byte-wise cut: may split a multi-byte UTF-8 sequence at the boundary.
+			content = content[:remaining]
+			truncated = true
+		}
+		remaining -= len(content)
+
+		results = append(results, ResolvedArtifact{
+			Path:      p,
+			Content:   content,
+			Truncated: truncated,
+		})
+	}
+
+	return results
+}
+
+// ExtractLines returns deduplicated non-empty lines from resolved artifacts,
+// capped at maxLines. Skips artifact header lines (# metadata + --- separator).
+// Returns nil when no payload lines are found, e.g. when every artifact is
+// missing or empty — callers must tolerate an empty result.
+func ExtractLines(artifacts []ResolvedArtifact, maxLines int) []string {
+	// Deduplication is global across all artifacts, keyed on the
+	// whitespace-trimmed line text; first occurrence wins.
+	seen := make(map[string]struct{})
+	var lines []string
+
+	for _, a := range artifacts {
+		if a.Missing || a.Content == "" {
+			continue
+		}
+		payload := skipArtifactHeader(a.Content)
+		for _, raw := range strings.Split(payload, "\n") {
+			line := strings.TrimSpace(raw)
+			if line == "" {
+				continue
+			}
+			if _, ok := seen[line]; ok {
+				continue
+			}
+			seen[line] = struct{}{}
+			lines = append(lines, line)
+			// Early return as soon as the cap is hit.
+			if len(lines) >= maxLines {
+				return lines
+			}
+		}
+	}
+	return lines
+}
+
+// skipArtifactHeader strips the artifact header (lines before and including "---" separator).
+func skipArtifactHeader(content string) string { + idx := strings.Index(content, "\n---\n") + if idx >= 0 { + return content[idx+5:] + } + return content +} diff --git a/customers/vm-troubleshooting/internal/triage/evidence_test.go b/customers/vm-troubleshooting/internal/triage/evidence_test.go new file mode 100644 index 0000000..93b2fa2 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/evidence_test.go @@ -0,0 +1,155 @@ +package triage + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestEvidenceResolver_Resolve_HappyPath(t *testing.T) { + t.Parallel() + dir := t.TempDir() + writeFile(t, dir, "a.txt", "line1\nline2\nline3\n") + + r := NewEvidenceResolver(dir) + results := r.Resolve([]string{"a.txt"}) + + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if results[0].Missing { + t.Error("expected file to be found") + } + if results[0].Truncated { + t.Error("expected no truncation") + } + if !strings.Contains(results[0].Content, "line1") { + t.Errorf("expected content to contain line1, got: %s", results[0].Content) + } +} + +func TestEvidenceResolver_Resolve_MissingFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + r := NewEvidenceResolver(dir) + results := r.Resolve([]string{"nonexistent.txt"}) + + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if !results[0].Missing { + t.Error("expected missing=true for nonexistent file") + } +} + +func TestEvidenceResolver_Resolve_BoundedReading(t *testing.T) { + t.Parallel() + dir := t.TempDir() + // Create a file larger than the byte limit + big := strings.Repeat("x", maxEvidenceBytesPerIssue+1000) + writeFile(t, dir, "big.txt", big) + + r := NewEvidenceResolver(dir) + results := r.Resolve([]string{"big.txt"}) + + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if !results[0].Truncated { + t.Error("expected truncated=true for large file") + } + if len(results[0].Content) > 
maxEvidenceBytesPerIssue { + t.Errorf("content length %d exceeds limit %d", len(results[0].Content), maxEvidenceBytesPerIssue) + } +} + +func TestEvidenceResolver_Resolve_MultipleFiles_BudgetShared(t *testing.T) { + t.Parallel() + dir := t.TempDir() + half := strings.Repeat("y", maxEvidenceBytesPerIssue/2+1) + writeFile(t, dir, "a.txt", half) + writeFile(t, dir, "b.txt", half) + writeFile(t, dir, "c.txt", "should be truncated marker") + + r := NewEvidenceResolver(dir) + results := r.Resolve([]string{"a.txt", "b.txt", "c.txt"}) + + if len(results) != 3 { + t.Fatalf("expected 3 results, got %d", len(results)) + } + totalBytes := 0 + for _, res := range results { + totalBytes += len(res.Content) + } + if totalBytes > maxEvidenceBytesPerIssue { + t.Errorf("total bytes %d exceeds limit %d", totalBytes, maxEvidenceBytesPerIssue) + } +} + +func TestEvidenceResolver_Resolve_EmptyPaths(t *testing.T) { + t.Parallel() + dir := t.TempDir() + r := NewEvidenceResolver(dir) + results := r.Resolve(nil) + if results != nil { + t.Errorf("expected nil for empty paths, got %v", results) + } +} + +func TestExtractLines_Deduplication(t *testing.T) { + t.Parallel() + artifacts := []ResolvedArtifact{ + {Content: "alpha\nbeta\nalpha\ngamma\nbeta\n"}, + } + lines := ExtractLines(artifacts, 10) + if len(lines) != 3 { + t.Errorf("expected 3 unique lines, got %d: %v", len(lines), lines) + } +} + +func TestExtractLines_MaxLines(t *testing.T) { + t.Parallel() + artifacts := []ResolvedArtifact{ + {Content: "a\nb\nc\nd\ne\nf\n"}, + } + lines := ExtractLines(artifacts, 3) + if len(lines) != 3 { + t.Errorf("expected 3 lines, got %d", len(lines)) + } +} + +func TestExtractLines_SkipsArtifactHeader(t *testing.T) { + t.Parallel() + artifacts := []ResolvedArtifact{ + {Content: "# Command: systemctl list-units\n# Exit: 0\n---\nactual content\n"}, + } + lines := ExtractLines(artifacts, 10) + if len(lines) != 1 || lines[0] != "actual content" { + t.Errorf("expected [actual content], got %v", lines) + } +} 
+ +func TestExtractLines_SkipsMissing(t *testing.T) { + t.Parallel() + artifacts := []ResolvedArtifact{ + {Missing: true}, + {Content: "real data\n"}, + } + lines := ExtractLines(artifacts, 10) + if len(lines) != 1 || lines[0] != "real data" { + t.Errorf("expected [real data], got %v", lines) + } +} + +func writeFile(t *testing.T, dir, name, content string) { + t.Helper() + path := filepath.Join(dir, name) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} diff --git a/customers/vm-troubleshooting/internal/triage/triage.go b/customers/vm-troubleshooting/internal/triage/triage.go index 9715b7d..f061253 100644 --- a/customers/vm-troubleshooting/internal/triage/triage.go +++ b/customers/vm-troubleshooting/internal/triage/triage.go @@ -32,6 +32,7 @@ type Finding struct { type FindingCode string +// Classifier finding codes — findings that become synthetic "triage" issues in manifest/report. const ( FindingXid FindingCode = "xid" FindingSXid FindingCode = "sxid" @@ -40,8 +41,15 @@ const ( FindingDataQuality FindingCode = "data_quality" ) -// FindingCodes enumerates triage-owned finding codes. -var FindingCodes = map[string]bool{ +// Enrichment finding codes — findings that live only in triage/_data/, never in manifest/report. +const ( + FindingServiceState FindingCode = "service_state" + FindingDiskCapacity FindingCode = "disk_capacity" + FindingMemoryPressure FindingCode = "memory_pressure" +) + +// ClassifierFindingCodes are findings that become synthetic "triage" issues in manifest/report. +var ClassifierFindingCodes = map[string]bool{ string(FindingXid): true, string(FindingSXid): true, string(FindingFirewallPosture): true, @@ -49,6 +57,29 @@ var FindingCodes = map[string]bool{ string(FindingDataQuality): true, } +// EnrichmentFindingCodes are findings that live only in triage/_data/, never in manifest/report. 
+var EnrichmentFindingCodes = map[string]bool{ + string(FindingServiceState): true, + string(FindingDiskCapacity): true, + string(FindingMemoryPressure): true, +} + +// AllFindingCodes is ClassifierFindingCodes ∪ EnrichmentFindingCodes (for triage-result schema). +var AllFindingCodes = func() map[string]bool { + m := make(map[string]bool, len(ClassifierFindingCodes)+len(EnrichmentFindingCodes)) + for k, v := range ClassifierFindingCodes { + m[k] = v + } + for k, v := range EnrichmentFindingCodes { + m[k] = v + } + return m +}() + +// FindingCodes is an alias for ClassifierFindingCodes, kept for backward compatibility +// with contract tests that reference it. New code should use the specific set. +var FindingCodes = ClassifierFindingCodes + // TriageResult holds the output of a single analyzer. type TriageResult struct { Name string // analyzer name for spinner/logging diff --git a/customers/vm-troubleshooting/schemas/manifest.schema.json b/customers/vm-troubleshooting/schemas/manifest.schema.json index b2745eb..b5b060d 100644 --- a/customers/vm-troubleshooting/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting/schemas/manifest.schema.json @@ -96,6 +96,7 @@ "type": "string", "enum": [ "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", + "gpu_unreachable", "xid", "sxid", "firewall_posture", "critical_log", "data_quality" ] }, diff --git a/customers/vm-troubleshooting/schemas/report-record.schema.json b/customers/vm-troubleshooting/schemas/report-record.schema.json index 24e3451..4786d84 100644 --- a/customers/vm-troubleshooting/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting/schemas/report-record.schema.json @@ -41,6 +41,7 @@ "type": "string", "enum": [ "oom_events", "disk_warning", "disk_critical", "svc_failed", "svc_fabricmanager_benign", + "gpu_unreachable", "xid", "sxid", "firewall_posture", "critical_log", "data_quality" ] }, diff --git 
a/customers/vm-troubleshooting/schemas/triage-result.schema.json b/customers/vm-troubleshooting/schemas/triage-result.schema.json index d036273..e1e657d 100644 --- a/customers/vm-troubleshooting/schemas/triage-result.schema.json +++ b/customers/vm-troubleshooting/schemas/triage-result.schema.json @@ -28,7 +28,7 @@ "properties": { "code": { "type": "string", - "enum": ["xid", "sxid", "firewall_posture", "critical_log", "data_quality"] + "enum": ["xid", "sxid", "firewall_posture", "critical_log", "data_quality", "service_state", "disk_capacity", "memory_pressure"] }, "severity": { "type": "string", "enum": ["critical", "warning", "info"] }, "confidence": { "type": "string", "enum": ["high", "low"] }, diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..019caa3 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,19 @@ +# Documentation index + +This folder holds **plans and pointers**. Authoritative engineering rules for the repo are in the root [`AGENTS.md`](../AGENTS.md) and per-project `AGENTS.md` files. Authoritative layout and data-flow descriptions are in each project’s **CODEMAP** (and the collector’s **ARCHITECTURE** deep-dive). + +## Architecture + +- [Architecture entrypoint](architecture.md) — links to project-local maps. + +## Plans (`docs/plans/`) + +These files describe **workstreams, goals, and review outcomes**. Treat them as specifications and history: confirm current behavior in code and tests before assuming every item is implemented. + +| Document | Topic | +|----------|--------| +| [`post-audit-hardening.md`](plans/post-audit-hardening.md) | Follow-on hardening after collector audit work (dashboard auth, ingest guards, store/list behavior, collector hygiene, etc.). | +| [`collector-audit-improvements.md`](plans/collector-audit-improvements.md) | Prior audit-driven collector/schema improvements. | +| [`triage-improvements.md`](plans/triage-improvements.md) | Triage-related improvements plan. 
| + +Additional follow-up tracking may live under `.cursor/plans/` for agent workflows; those are optional for human readers. diff --git a/docs/architecture.md b/docs/architecture.md index 06487ae..2738df8 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,224 +1,10 @@ -# gather-info Architecture +# Architecture documentation -## Overview +Detailed architecture lives **next to each project** so it stays accurate when code changes. -`gather-info` is a static Linux binary that collects VM diagnostics into a self-contained `.tar.gz` archive. It is designed for customer and support engineer use on machines we do not control and cannot access directly. +| Project | Quick map | Extended reference | +|---------|-----------|-------------------| +| Diagnostics collector (`gather-info`) | [`customers/vm-troubleshooting/CODEMAP.md`](../customers/vm-troubleshooting/CODEMAP.md) | [`customers/vm-troubleshooting/ARCHITECTURE.md`](../customers/vm-troubleshooting/ARCHITECTURE.md) | +| Support dashboard | [`customers/vm-troubleshooting-dashboard/CODEMAP.md`](../customers/vm-troubleshooting-dashboard/CODEMAP.md) | (see CODEMAP + [`AGENTS.md`](../customers/vm-troubleshooting-dashboard/AGENTS.md)) | -The binary runs 11 domain collectors, performs automated triage analysis, and produces structured machine-readable output alongside human-readable summaries. 
- -## Execution Pipeline - -``` -main.go - | - v -cli/root.go Cobra flags, signal handling, context creation - | - v -runner/runner.go Orchestration hub - | - |-- privilege/ Detect root/sudo availability - |-- platform/ Detect distro, GPU, DCGM, WSL - |-- install/ Optional interactive DCGM installation - | - |-- collector/ 11 domain collectors (sequential, skip-aware) - | |-- system CPU, memory, disk, processes, hardware - | |-- network Links, routes, neighbors, firewall rules, netplan - | |-- nvidia nvidia-smi, dmesg Xid extraction, driver params - | |-- dcgm dcgmi discovery, health, stats, optional level-2 diag - | |-- docker docker info/ps, sanitized container inspect - | |-- services Batch D-Bus service status, failed services, fabricmanager - | |-- journal dmesg, journalctl (kernel, errors, OOM), optional full journal - | |-- packages dpkg/rpm nvidia packages, pip, held packages - | |-- additional Limits, sysctl, LVM, sensors, mounts - | |-- storage nvme, smartctl - | '-- infiniband ibstat, rdma tools - | - |-- triage/ Post-collection analysis (after collectors, before output) - | |-- xid NVIDIA Xid/SXid classification (catalog + local policy overrides) - | |-- xidcatalog Local Xid catalog/parsing boundary (neutral data) - | |-- firewall Firewall posture detection (iptables/ufw/nft/firewalld) - | '-- critical Critical log extraction (panic, HW error, fallen off bus, timeout) - | - |-- output/ Generate structured output files - | |-- metadata.json Lightweight execution summary - | |-- SUMMARY.txt Human-readable report - | |-- manifest.json Full machine-readable archive index - | |-- report.ndjson Streaming event log (1 JSON line per record) - | '-- schemas/ Embedded JSON schemas (self-describing archive) - | - |-- transfer/ Floating IP detection, SCP command generation - | - '-- archive tar.gz the work directory -``` - -## Package Map - -| Package | Purpose | Depends on | -|---------|---------|------------| -| `cmd/gather-info` | Binary entrypoint | cli, config 
| -| `internal/cli` | Cobra command, flags, signal handling | config, executor, runner, ui | -| `internal/config` | Config struct, modes, timeouts, exit codes, build metadata | stdlib only | -| `internal/runner` | Orchestration: detect → collect → triage → output → archive | all internal packages | -| `internal/collector` | Collector interface, registry, core types (Issue, Severity, ArtifactRecord) | executor, output, ui, platform, probe, sanitize | -| `internal/triage` | Post-collection analysis (Xid, firewall, critical logs) | collector (types), identity, output (writer), ui | -| `internal/triage/xidcatalog` | Neutral Xid catalog lookup + kernel line parsers | stdlib only | -| `internal/identity` | Stable issue fingerprint helper (normalized tuple hashing) | stdlib only | -| `internal/output` | Writer, manifest, report, summary, archive creation | executor, schemas | -| `internal/transfer` | IP discovery, floating IP detection, SCP commands | netlink | -| `internal/executor` | Subprocess execution: timeouts, process groups, capture limits | stdlib only | -| `internal/ui` | TTY-aware terminal output (pterm), spinners, prompts | pterm, isatty, runewidth | -| `internal/probe` | Go-native probes: systemd D-Bus, procfs, netlink, GHW | go-systemd, ghw, procfs, netlink | -| `internal/platform` | Distro, GPU, DCGM, WSL detection | executor | -| `internal/sanitize` | Redaction of secrets from configs, process lists, Docker inspect | stdlib only | -| `internal/privilege` | Root/sudo detection and interactive acquisition | stdlib only | -| `internal/install` | Optional DCGM installation and daemon enablement (Ubuntu 22.04/24.04) | config, executor, platform, ui | - -## Core Types - -### `collector.Severity` (int enum) - -``` -SeverityUnspecified = 0 // sentinel, catches uninitialized Issue{} -SeverityInfo = 1 -SeverityWarning = 2 -SeverityCritical = 3 -``` - -Explicit integer values (not iota). 
MarshalJSON/UnmarshalJSON serialize as strings (`"info"`, `"warning"`, `"critical"`). - -### `collector.Issue` - -```go -Code IssueCode -Severity Severity -Confidence Confidence // "high" or "low" -Category string // "GPU", "SVC", "MEM", "DISK", "FW", etc. -Message string -Fingerprint string -RelatedArtifactPaths []string -UnresolvedArtifactPaths []string -``` - -### `collector.ArtifactRecord` - -Every collected file has structured metadata: path, type (`command`/`file`/`probe`), command string, exit code, status (`ok`/`skipped`/`error`), timing, SHA-256, content type, parser hint, and semantic tags. - -### `collector.CollectorResult` - -Aggregated per-collector output: ID, name, issues, facts (`map[string]string`), artifacts, skipped reasons, errors, duration. - -### `triage.Finding` - -Richer than Issue: includes `code`, `severity`, `confidence`, title/description/action, evidence, source artifact paths, and issue fingerprint. Findings are converted to synthetic issues by the runner bridge. - -## Machine-Readable Output - -### `manifest.json` — Archive Index - -The primary machine-readable file. Contains: -- **`artifact_index[]`** — flat list of every collector-produced file with SHA-256, size, parser hint, tags -- **`collectors{}`** — per-collector summary with status, duration, facts (typed), issues, skipped reasons, errors -- issue records include `code`, `severity`, `confidence`, `message`, `issue_fingerprint`, and path linkage (`related_artifact_paths`, `unresolved_artifact_paths`) -- **`platform{}`** — OS and kernel -- Schema version, archive ID, tool version, generation timestamp - -Control files (manifest.json itself, report.ndjson, SUMMARY.txt, metadata.json, schemas/) are excluded from `artifact_index`. - -### `report.ndjson` — Event Stream - -Same data as manifest in streaming NDJSON format. 
Four record types discriminated by `type`: -- `artifact` — file was collected -- `issue` — problem detected -- `fact` — key-value observation -- `collector_summary` — collector finished - -Order: per collector (registration order) → artifacts → issues → facts (alphabetical) → summary. - -Wire rules (per NDJSON spec v1.0.0): UTF-8, `\n` delimited, no internal newlines, parsers may ignore empty lines. - -### `triage/_data/*.json` — Analysis Detail - -Three JSON files with rich finding detail: -- `gpu_health.json` — Xid/SXid findings with `code`, `confidence`, fingerprint, source artifacts, and typed facts -- `firewall_posture.json` — posture classification, per-tool results -- `critical_events.json` — critical log findings with explicit pattern metadata and deterministic fingerprints - -### `metadata.json` — Execution Summary - -Lightweight backward-compatible summary: version, flags, per-collector counts (artifacts, skipped, errors, duration). Does not duplicate manifest detail. - -### `SUMMARY.txt` — Human Report - -Text report with issues grouped by severity (CRITICAL → WARNING → INFO), system/hardware/GPU summaries, collector status table, and archive contents listing. -Only `confidence=high` issues are shown in `SUMMARY.txt`; low-confidence issues remain available in machine-readable outputs. 
- -## Controlled Vocabularies - -**Tags** (on artifacts): identity, cpu, memory, disk, hardware, gpu, gpu-errors, gpu-health, network, firewall, docker, docker-security, services, journal, oom, packages, storage, infiniband, processes, config, triage - -**Parser hints** (on artifacts): ~40 values identifying the tool/format that produced the content (e.g., `nvidia-smi`, `dmesg`, `systemctl`, `json`, `text`) - -**Issue categories**: GPU, SVC, MEM, DISK, FW, KERN, HW, TIMEOUT, ERR - -## Collection Modes - -`--mode=safe|quick|standard|deep` - -| Collector | safe | quick | standard | deep | -|-----------|------|-------|----------|------| -| System | run | run | run | run | -| Network | run | run | run | run | -| NVIDIA | skip | run | run | run | -| DCGM | skip | skip | run | run + level-2 diag | -| Docker | skip | skip | run | run + container logs | -| Services | run | run | run | run | -| Journal | skip | skip | run | run + full journal | -| Packages | run | skip | run | run | -| Additional | run | run | run | run | -| Storage | run | run | run | run | -| InfiniBand | run | run | run | run | - -Explicit CLI flags (`--skip-*`, `--include-*`) always override mode defaults. - -## Key Invariants - -1. **Path reservation** — all artifact paths globally unique via `Writer.ReservePath()` -2. **Atomic writes** — all files written via temp → rename (no partial files) -3. **Process isolation** — all subprocesses in process groups for clean cleanup -4. **Context propagation** — cancellation flows through all layers -5. **TTY awareness** — stderr for progress, stdout reserved for archive path -6. **Fail per section** — collector errors don't stop other collectors -7. 
**Triage timing** — runs after collection, before summary generation - -## External Dependencies - -| Library | Purpose | -|---------|---------| -| `github.com/spf13/cobra` | CLI framework | -| `github.com/pterm/pterm` | Terminal UI (spinners, styled output) | -| `github.com/coreos/go-systemd/v22/dbus` | Systemd D-Bus for batch service status | -| `github.com/jaypipes/ghw` | Hardware detection (CPU, memory, PCI) | -| `github.com/prometheus/procfs` | /proc parsing | -| `github.com/vishvananda/netlink` | Netlink route/interface queries | -| `github.com/mattn/go-isatty` | TTY detection | -| `golang.org/x/sys` | Unix syscalls (disk space, process groups) | - -## Build - -```bash -CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -trimpath \ - -ldflags="-s -w -X .../config.Version=v1.2.0 -X .../config.Commit=$(git rev-parse --short HEAD) -X .../config.BuildDate=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - -o gather-info ./cmd/gather-info -``` - -Static binary, no CGO, ~15MB. Runs on Ubuntu 20.04/22.04/24.04 with no external dependencies. - -## Exit Codes - -| Code | Meaning | -|------|---------| -| 0 | Archive created, no errors | -| 1 | Fatal error, no archive produced | -| 2 | Archive created, some collectors had errors | -| 3 | Interrupted (SIGINT/SIGTERM), partial work directory preserved | +For planning documents and indexes, see [`docs/README.md`](README.md). From 5b2195bd961acd4d68575f3fab26cae89d024b86 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 13 Apr 2026 17:18:32 +0200 Subject: [PATCH 10/23] feat: harden dashboard ingest/API and improve gather-info journal & enrichment Require identity headers when using trust-forwarded auth; cap and evict upload rate limiters; symlink-safe SafeJoin; SaveBounded + ingest cap; ListPage with SQL pagination; nullable triage_finding_count (API + UI). Journal NDJSON via bytes.Buffer; enrichment errors recorded on collector results; sanitize tests; triage/integration/version and CODEMAP updates. 
--- .../vm-troubleshooting-dashboard/CODEMAP.md | 4 +- .../vm-troubleshooting-dashboard/Makefile | 2 +- .../frontend/src/types.ts | 9 +- .../internal/api/server.go | 142 ++++++++--- .../internal/api/server_test.go | 112 +++++++++ .../internal/ingest/ingest.go | 7 +- .../internal/ingest/ingest_test.go | 6 +- .../internal/model/types.go | 2 +- .../internal/pathutil/safejoin.go | 41 +++- .../internal/pathutil/safejoin_test.go | 71 ++++++ .../internal/store/store.go | 157 +++++++++--- .../internal/store/store_test.go | 227 ++++++++++++++++++ customers/vm-troubleshooting/CODEMAP.md | 4 + .../internal/collector/journal.go | 44 ++-- .../internal/config/version.go | 2 +- .../internal/runner/runner.go | 69 ++++-- .../internal/sanitize/sanitize_test.go | 12 + .../internal/triage/integration_test.go | 4 +- .../internal/triage/triage.go | 6 +- 19 files changed, 799 insertions(+), 122 deletions(-) diff --git a/customers/vm-troubleshooting-dashboard/CODEMAP.md b/customers/vm-troubleshooting-dashboard/CODEMAP.md index 5797529..7175e9b 100644 --- a/customers/vm-troubleshooting-dashboard/CODEMAP.md +++ b/customers/vm-troubleshooting-dashboard/CODEMAP.md @@ -33,7 +33,7 @@ This file is the skimmable map. Schema compatibility rules and consumer contract ### `internal/api/` -- Routing, middleware (security headers, optional API auth), JSON helpers. +- Routing, middleware (security headers, optional API auth, per-IP upload rate limiting), JSON helpers. - Handlers for archives, issues, artifacts, upload, delete. - Pagination and query parsing for list endpoints. @@ -47,6 +47,8 @@ This file is the skimmable map. Schema compatibility rules and consumer contract - SQLite schema, migrations, CRUD for archives/issues/collectors. - Denormalized fields where needed for list performance (see migrations in `store.go`). +- `SaveBounded`: cap-aware insert that rejects over-limit archives with `ErrArchiveCapExceeded`. 
+- `ListPage(pageSize, offset)`: paginated listing returning items, total count, and total storage bytes. - `evidence.go`: heuristic suggestions linking issues to artifacts (scoring); keep tolerant of unknown issue shapes. ### `internal/model/` diff --git a/customers/vm-troubleshooting-dashboard/Makefile b/customers/vm-troubleshooting-dashboard/Makefile index 986a6a3..216ad52 100644 --- a/customers/vm-troubleshooting-dashboard/Makefile +++ b/customers/vm-troubleshooting-dashboard/Makefile @@ -20,7 +20,7 @@ dev: wait dev-backend: - go run ./cmd/dashboard -addr :8080 -data-dir ./dashboard-data -web-root "" + go run ./cmd/dashboard -listen 127.0.0.1:8080 -data-dir ./dashboard-data -web-root "" dev-frontend: cd frontend && pnpm dev diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/types.ts b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts index e1fde0b..6384118 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/types.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts @@ -23,7 +23,14 @@ export type ArchiveSummary = { issue_counts: IssueCounts collector_count: number artifact_count: number - triage_finding_count: number + /** + * Number of triage findings attached to this archive. + * + * Denormalised onto the archives table at ingest time. May be null for + * archives ingested before the column was added (the startup backfill + * eventually fills these in — render "—" for null in the meantime). 
+ */ + triage_finding_count: number | null status: string error_reason?: string compressed_size: number diff --git a/customers/vm-troubleshooting-dashboard/internal/api/server.go b/customers/vm-troubleshooting-dashboard/internal/api/server.go index 4f258b2..5dc1340 100644 --- a/customers/vm-troubleshooting-dashboard/internal/api/server.go +++ b/customers/vm-troubleshooting-dashboard/internal/api/server.go @@ -12,6 +12,7 @@ import ( "net/http" "net/url" "os" + "path" "path/filepath" "sort" "strconv" @@ -41,13 +42,32 @@ type Options struct { RequireAPIAuth bool } +// uploadLimiterTTL is how long a per-IP limiter entry lives without use before +// it is eligible for eviction. Picked long enough to preserve bucket state +// across realistic upload cadence but short enough to bound memory. +const uploadLimiterTTL = 15 * time.Minute + +// uploadLimiterMaxEntries caps the per-IP limiter map. When exceeded we +// perform a sweep; if that does not free enough slots we also drop the oldest +// entry to keep the map bounded even under header-spoofed amplification. +const uploadLimiterMaxEntries = 4096 + +// forwardedUserHeaders are the identity headers trusted when the operator +// opts in to --trust-forwarded-user. 
+var forwardedUserHeaders = []string{"X-Forwarded-User", "X-Forwarded-Email", "X-Remote-User"} + +type limiterEntry struct { + limiter *rate.Limiter + lastSeen time.Time +} + type Server struct { store *store.Store webRoot string opts Options mux *http.ServeMux uploadLimiterMu sync.Mutex - uploadLimiters map[string]*rate.Limiter + uploadLimiters map[string]*limiterEntry } func New(st *store.Store, webRoot string, opts Options) *Server { @@ -59,7 +79,7 @@ func New(st *store.Store, webRoot string, opts Options) *Server { webRoot: webRoot, opts: opts, mux: http.NewServeMux(), - uploadLimiters: make(map[string]*rate.Limiter), + uploadLimiters: make(map[string]*limiterEntry), } s.routes() return s @@ -83,7 +103,16 @@ func (s *Server) withSecurity(next http.Handler) http.Handler { if !s.checkBearer(w, r) { return } - } else if !s.opts.TrustForwardedUser { + } else if s.opts.TrustForwardedUser { + // Proxy-trust mode without a shared token is only safe when a + // trusted upstream actually attaches an identity header. Refuse + // requests that arrive without one — otherwise a direct hit on + // the dashboard would pass as open-auth. + if forwardedUser(r) == "" { + writeError(w, http.StatusUnauthorized, "unauthorized") + return + } + } else { writeError(w, http.StatusForbidden, "server misconfiguration") return } @@ -91,6 +120,17 @@ func (s *Server) withSecurity(next http.Handler) http.Handler { }) } +// forwardedUser returns the first non-empty value from the trusted identity +// headers, or "" if none are present. 
+func forwardedUser(r *http.Request) string { + for _, key := range forwardedUserHeaders { + if value := strings.TrimSpace(r.Header.Get(key)); value != "" { + return value + } + } + return "" +} + func (s *Server) checkBearer(w http.ResponseWriter, r *http.Request) bool { parts := strings.SplitN(strings.TrimSpace(r.Header.Get("Authorization")), " ", 2) if len(parts) != 2 || !strings.EqualFold(parts[0], "Bearer") { @@ -131,14 +171,45 @@ func (s *Server) uploadLimiterAllow(r *http.Request) bool { return true } key := clientIP(r, s.opts.TrustForwardedUser) + now := time.Now() s.uploadLimiterMu.Lock() defer s.uploadLimiterMu.Unlock() - lim, ok := s.uploadLimiters[key] + entry, ok := s.uploadLimiters[key] if !ok { - lim = rate.NewLimiter(rate.Limit(5.0/60.0), 2) - s.uploadLimiters[key] = lim + s.evictStaleLimitersLocked(now) + entry = &limiterEntry{limiter: rate.NewLimiter(rate.Limit(5.0/60.0), 2)} + s.uploadLimiters[key] = entry + } + entry.lastSeen = now + return entry.limiter.Allow() +} + +// evictStaleLimitersLocked sweeps expired entries. If the map is still at or +// above the hard cap after a sweep, drop the single oldest entry. Cheap +// amortised O(n) sweep is acceptable here: inserts are rate-limited themselves +// and the map is bounded by the cap. +// +// Caller must hold uploadLimiterMu. 
+func (s *Server) evictStaleLimitersLocked(now time.Time) { + for k, e := range s.uploadLimiters { + if now.Sub(e.lastSeen) > uploadLimiterTTL { + delete(s.uploadLimiters, k) + } + } + if len(s.uploadLimiters) < uploadLimiterMaxEntries { + return + } + var oldestKey string + var oldestTime time.Time + for k, e := range s.uploadLimiters { + if oldestKey == "" || e.lastSeen.Before(oldestTime) { + oldestKey = k + oldestTime = e.lastSeen + } + } + if oldestKey != "" { + delete(s.uploadLimiters, oldestKey) } - return lim.Allow() } func (s *Server) routes() { @@ -193,12 +264,10 @@ func (s *Server) handleUpload(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusTooManyRequests, "upload rate limit exceeded") return } - nArchives, err := s.store.ArchiveCount() - if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - if int(nArchives) >= s.opts.MaxArchives { + // Fast-path cap rejection to avoid wasting an extraction cycle on clients + // that are already over the cap. The authoritative, race-free check still + // happens inside ingest.Ingest -> store.SaveBounded. 
+ if nArchives, err := s.store.ArchiveCount(); err == nil && int(nArchives) >= s.opts.MaxArchives { writeError(w, http.StatusInsufficientStorage, "archive limit reached; prune old archives or raise --max-archives") return } @@ -245,8 +314,12 @@ func (s *Server) handleUpload(w http.ResponseWriter, r *http.Request) { return } - archive, err := ingest.Ingest(r.Context(), s.store, tmpPath, s.uploadedBy(r)) + archive, err := ingest.Ingest(r.Context(), s.store, tmpPath, s.uploadedBy(r), s.opts.MaxArchives) if err != nil { + if errors.Is(err, store.ErrArchiveCapExceeded) { + writeError(w, http.StatusInsufficientStorage, "archive limit reached; prune old archives or raise --max-archives") + return + } writeError(w, http.StatusBadRequest, err.Error()) return } @@ -263,22 +336,17 @@ func (s *Server) handleUpload(w http.ResponseWriter, r *http.Request) { func (s *Server) handleListArchives(w http.ResponseWriter, r *http.Request) { page, pageSize := pagination(r) - items := s.store.List() - total := len(items) - start := (page - 1) * pageSize - if start > total { - start = total - } - end := start + pageSize - if end > total { - end = total + items, total, totalBytes, err := s.store.ListPage(pageSize, (page-1)*pageSize) + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return } writeJSON(w, http.StatusOK, map[string]any{ - "items": items[start:end], + "items": items, "total": total, "page": page, "page_size": pageSize, - "total_storage_bytes": totalStorageBytes(items), + "total_storage_bytes": totalBytes, }) } @@ -379,6 +447,9 @@ func (s *Server) handleDownloadArtifact(w http.ResponseWriter, r *http.Request) http.ServeFile(w, r, fullPath) } +// handleWeb serves the built SPA: static files when they exist on disk, +// index.html for the root and all unknown paths (so client-side routing works). +// A single serve path — http.ServeFile handles Content-Type, Range, ETags, etc. 
func (s *Server) handleWeb(w http.ResponseWriter, r *http.Request) { if strings.HasPrefix(r.URL.Path, "/api/") { writeError(w, http.StatusNotFound, "not found") @@ -388,9 +459,10 @@ func (s *Server) handleWeb(w http.ResponseWriter, r *http.Request) { http.Error(w, "dashboard frontend is not built", http.StatusNotFound) return } - rel := strings.TrimPrefix(r.URL.Path, "/") + indexPath := filepath.Join(s.webRoot, "index.html") + rel := strings.TrimPrefix(path.Clean("/"+r.URL.Path), "/") if rel == "" { - http.ServeFile(w, r, filepath.Join(s.webRoot, "index.html")) + http.ServeFile(w, r, indexPath) return } full, err := pathutil.SafeJoin(s.webRoot, rel) @@ -402,7 +474,7 @@ func (s *Server) handleWeb(w http.ResponseWriter, r *http.Request) { http.ServeFile(w, r, full) return } - http.ServeFile(w, r, filepath.Join(s.webRoot, "index.html")) + http.ServeFile(w, r, indexPath) } func (s *Server) getArchive(w http.ResponseWriter, r *http.Request) (*model.ArchiveDetail, bool) { @@ -443,10 +515,8 @@ func writeError(w http.ResponseWriter, status int, msg string) { func (s *Server) uploadedBy(r *http.Request) string { if s.opts.TrustForwardedUser { - for _, key := range []string{"X-Forwarded-User", "X-Forwarded-Email", "X-Remote-User"} { - if value := strings.TrimSpace(r.Header.Get(key)); value != "" { - return value - } + if value := forwardedUser(r); value != "" { + return value } } return "anonymous" @@ -478,14 +548,6 @@ func atoiDefault(value string, fallback int) int { return n } -func totalStorageBytes(items []model.ArchiveSummary) int64 { - var total int64 - for _, item := range items { - total += item.StorageBytes - } - return total -} - func filterIssues(issues []model.IssueRecord, r *http.Request) []model.IssueRecord { query := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("q"))) severity := strings.ToLower(strings.TrimSpace(r.URL.Query().Get("severity"))) diff --git a/customers/vm-troubleshooting-dashboard/internal/api/server_test.go 
b/customers/vm-troubleshooting-dashboard/internal/api/server_test.go index 0331dad..e686ba1 100644 --- a/customers/vm-troubleshooting-dashboard/internal/api/server_test.go +++ b/customers/vm-troubleshooting-dashboard/internal/api/server_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "os" + "path/filepath" "strings" "testing" "time" @@ -143,6 +144,45 @@ func TestAPI_ListIgnoresSpoofedForwardedUserWithoutTrust(t *testing.T) { } } +func TestAPI_TrustForwardedRequiresIdentityHeader(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + srv := New(st, "", Options{ + RequireAPIAuth: true, + TrustForwardedUser: true, + }) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + // No identity header → reject. Without this guard, --trust-forwarded-user + // on a non-loopback listener would behave as open-auth. + resp, err := http.Get(ts.URL + "/api/v1/archives") + if err != nil { + t.Fatal(err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("missing identity header: want 401, got %d", resp.StatusCode) + } + + req, _ := http.NewRequest(http.MethodGet, ts.URL+"/api/v1/archives", nil) + req.Header.Set("X-Forwarded-User", "alice@example.com") + resp2, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatal(err) + } + resp2.Body.Close() + if resp2.StatusCode != http.StatusOK { + t.Fatalf("trusted identity header: want 200, got %d", resp2.StatusCode) + } +} + func TestAPI_UploadRateLimit(t *testing.T) { t.Parallel() dir := t.TempDir() @@ -227,3 +267,75 @@ func TestAPI_MaxArchives(t *testing.T) { t.Fatalf("expected json error body, Content-Type=%q", ct) } } + +func TestHandleWeb_ServesStaticViaFileServer(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + web := t.TempDir() + if err := 
os.MkdirAll(filepath.Join(web, "assets"), 0o755); err != nil { + t.Fatal(err) + } + const wantBody = "hello-asset" + if err := os.WriteFile(filepath.Join(web, "assets", "app.js"), []byte(wantBody), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(web, "index.html"), []byte(""), 0o644); err != nil { + t.Fatal(err) + } + + srv := New(st, web, Options{RequireAPIAuth: false}) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + resp, err := http.Get(ts.URL + "/assets/app.js") + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("GET static: want 200, got %d", resp.StatusCode) + } + b, _ := io.ReadAll(resp.Body) + if string(b) != wantBody { + t.Fatalf("body: want %q, got %q", wantBody, string(b)) + } +} + +func TestHandleWeb_SPANonFileFallsBackToIndex(t *testing.T) { + t.Parallel() + dir := t.TempDir() + st, err := store.New(dir) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + web := t.TempDir() + const indexHTML = "spa" + if err := os.WriteFile(filepath.Join(web, "index.html"), []byte(indexHTML), 0o644); err != nil { + t.Fatal(err) + } + + srv := New(st, web, Options{RequireAPIAuth: false}) + ts := httptest.NewServer(srv.Handler()) + t.Cleanup(ts.Close) + + resp, err := http.Get(ts.URL + "/dashboard/reports/123") + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("SPA fallback: want 200, got %d", resp.StatusCode) + } + b, _ := io.ReadAll(resp.Body) + if string(b) != indexHTML { + t.Fatalf("body: want index.html, got %q", string(b)) + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go index 4cdad6b..eb5eac2 100644 --- a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go @@ -32,7 +32,10 @@ 
func validateRegularTarFileHeader(hdr *tar.Header) error { return nil } -func Ingest(ctx context.Context, st *store.Store, archivePath, uploadedBy string) (*model.ArchiveDetail, error) { +// Ingest extracts and persists an archive. maxArchives > 0 enforces an +// atomic cap: if the store already holds that many archives, the insert is +// rejected with store.ErrArchiveCapExceeded. +func Ingest(ctx context.Context, st *store.Store, archivePath, uploadedBy string, maxArchives int) (*model.ArchiveDetail, error) { if ctx.Err() != nil { return nil, ctx.Err() } @@ -101,7 +104,7 @@ func Ingest(ctx context.Context, st *store.Store, archivePath, uploadedBy string } detail.StorageDir = finalDir detail.Summary.UploadedBy = uploadedBy - if err := st.Save(detail); err != nil { + if err := st.SaveBounded(detail, maxArchives); err != nil { _ = os.RemoveAll(finalDir) return nil, err } diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go index e849ac2..16a2a93 100644 --- a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go @@ -77,7 +77,7 @@ func TestIngestArchive(t *testing.T) { })) }) - detail, err := Ingest(context.Background(), st, archivePath, "tester@example.com") + detail, err := Ingest(context.Background(), st, archivePath, "tester@example.com", 0) if err != nil { t.Fatalf("Ingest: %v", err) } @@ -137,7 +137,7 @@ func TestIngestWrappedArchive(t *testing.T) { writeTarFile(t, tw, prefix+"/logs/dmesg.txt", []byte("hello")) }) - detail, err := Ingest(context.Background(), st, archivePath, "tester@example.com") + detail, err := Ingest(context.Background(), st, archivePath, "tester@example.com", 0) if err != nil { t.Fatalf("Ingest wrapped archive: %v", err) } @@ -169,7 +169,7 @@ func TestIngestRejectsTraversal(t *testing.T) { archivePath := createArchive(t, root, func(tw *tar.Writer) { 
writeTarFile(t, tw, "../evil.txt", []byte("bad")) }) - if _, err := Ingest(context.Background(), st, archivePath, "tester@example.com"); err == nil { + if _, err := Ingest(context.Background(), st, archivePath, "tester@example.com", 0); err == nil { t.Fatal("expected traversal error") } } diff --git a/customers/vm-troubleshooting-dashboard/internal/model/types.go b/customers/vm-troubleshooting-dashboard/internal/model/types.go index b568012..0519fa2 100644 --- a/customers/vm-troubleshooting-dashboard/internal/model/types.go +++ b/customers/vm-troubleshooting-dashboard/internal/model/types.go @@ -173,7 +173,7 @@ type ArchiveSummary struct { IssueCounts IssueCounts `json:"issue_counts"` CollectorCount int `json:"collector_count"` ArtifactCount int `json:"artifact_count"` - TriageFindingCount int `json:"triage_finding_count"` + TriageFindingCount *int `json:"triage_finding_count"` Status string `json:"status"` ErrorReason string `json:"error_reason,omitempty"` StorageBytes int64 `json:"compressed_size"` diff --git a/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go index 06cb587..52c86e0 100644 --- a/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go +++ b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin.go @@ -1,27 +1,64 @@ package pathutil import ( + "errors" "fmt" + "io/fs" "os" "path/filepath" "strings" ) -// SafeJoin resolves rel under root and rejects path traversal. +// SafeJoin resolves rel under root and rejects path traversal, including +// escapes through symlinks. The root must exist; symlinks inside root that +// resolve outside root are rejected even when the lexical path is contained. 
func SafeJoin(root, relPath string) (string, error) { rel, err := safeRelativePath(relPath) if err != nil { return "", err } - full := filepath.Join(root, filepath.FromSlash(rel)) cleanRoot := filepath.Clean(root) + full := filepath.Join(cleanRoot, filepath.FromSlash(rel)) cleanFull := filepath.Clean(full) if cleanFull != cleanRoot && !strings.HasPrefix(cleanFull, cleanRoot+string(os.PathSeparator)) { return "", fmt.Errorf("path escapes root: %q", relPath) } + if err := verifyNoSymlinkEscape(cleanRoot, cleanFull); err != nil { + return "", err + } return cleanFull, nil } +// verifyNoSymlinkEscape ensures cleanFull does not resolve outside cleanRoot +// via symlinks. It walks toward the deepest existing ancestor of cleanFull +// and requires its real path stay under the root's real path. +func verifyNoSymlinkEscape(cleanRoot, cleanFull string) error { + realRoot, err := filepath.EvalSymlinks(cleanRoot) + if err != nil { + return fmt.Errorf("resolve root %q: %w", cleanRoot, err) + } + candidate := cleanFull + for { + resolved, err := filepath.EvalSymlinks(candidate) + if err == nil { + if resolved != realRoot && !strings.HasPrefix(resolved, realRoot+string(os.PathSeparator)) { + return fmt.Errorf("path escapes root via symlink: %q", cleanFull) + } + return nil + } + if !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("resolve %q: %w", candidate, err) + } + parent := filepath.Dir(candidate) + if parent == candidate || parent == cleanRoot { + // Walked up to the root (which resolved cleanly above) without + // encountering an existing ancestor whose resolution escapes. 
+ return nil + } + candidate = parent + } +} + func safeRelativePath(value string) (string, error) { value = strings.TrimSpace(value) if value == "" { diff --git a/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go index b457083..f9708cc 100644 --- a/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go +++ b/customers/vm-troubleshooting-dashboard/internal/pathutil/safejoin_test.go @@ -3,6 +3,8 @@ package pathutil import ( "os" "path/filepath" + "runtime" + "strings" "testing" ) @@ -48,3 +50,72 @@ func TestSafeJoin_RejectsEmpty(t *testing.T) { t.Fatal("expected error for empty path") } } + +// TestSafeJoin_RejectsSymlinkEscape verifies that a symlink inside the root +// pointing outside the root is rejected (even though the lexical path is +// contained). This is the containment claim the post-audit plan asks for. +func TestSafeJoin_RejectsSymlinkEscape(t *testing.T) { + t.Parallel() + if runtime.GOOS == "windows" { + t.Skip("symlinks require elevation on Windows CI") + } + outside := t.TempDir() + victim := filepath.Join(outside, "secret.txt") + if err := os.WriteFile(victim, []byte("top secret"), 0o600); err != nil { + t.Fatal(err) + } + root := t.TempDir() + linkPath := filepath.Join(root, "link") + if err := os.Symlink(outside, linkPath); err != nil { + t.Skipf("cannot create symlink (likely unsupported filesystem): %v", err) + } + _, err := SafeJoin(root, "link/secret.txt") + if err == nil { + t.Fatal("expected symlink escape error") + } + if !strings.Contains(err.Error(), "symlink") { + t.Fatalf("expected symlink error, got %v", err) + } +} + +// TestSafeJoin_AllowsSymlinkWithinRoot verifies that a symlink pointing to a +// location still inside the root resolves successfully. 
+func TestSafeJoin_AllowsSymlinkWithinRoot(t *testing.T) { + t.Parallel() + if runtime.GOOS == "windows" { + t.Skip("symlinks require elevation on Windows CI") + } + root := t.TempDir() + real := filepath.Join(root, "real") + if err := os.MkdirAll(real, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(real, "ok.txt"), []byte("fine"), 0o600); err != nil { + t.Fatal(err) + } + if err := os.Symlink(real, filepath.Join(root, "alias")); err != nil { + t.Skipf("cannot create symlink: %v", err) + } + if _, err := SafeJoin(root, "alias/ok.txt"); err != nil { + t.Fatalf("symlink-within-root should be allowed, got %v", err) + } +} + +// TestSafeJoin_RejectsDotDotThroughSymlink verifies a symlink that points to a +// legitimate directory cannot be used to step out of root via "..". +func TestSafeJoin_RejectsDotDotThroughSymlink(t *testing.T) { + t.Parallel() + if runtime.GOOS == "windows" { + t.Skip("symlinks require elevation on Windows CI") + } + root := t.TempDir() + // "../" is rejected lexically before symlink resolution, but ensure the + // error comes through even if a symlink tries to disguise the escape. 
+ outside := t.TempDir() + if err := os.Symlink(outside, filepath.Join(root, "out")); err != nil { + t.Skipf("cannot create symlink: %v", err) + } + if _, err := SafeJoin(root, "out/../../etc/passwd"); err == nil { + t.Fatal("expected escape rejection") + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/store.go b/customers/vm-troubleshooting-dashboard/internal/store/store.go index b2022af..cfca26f 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/store.go +++ b/customers/vm-troubleshooting-dashboard/internal/store/store.go @@ -8,8 +8,10 @@ import ( "fmt" "os" "path/filepath" + "sort" "strconv" "strings" + "sync" "time" "github.com/NexGenCloud/diagnostic-dashboard/internal/model" @@ -20,10 +22,19 @@ import ( //go:embed schema.sql var ddl string +// ErrArchiveCapExceeded is returned by SaveBounded when the configured +// maximum is reached. Callers map this to HTTP 507. +var ErrArchiveCapExceeded = errors.New("archive cap exceeded") + // Store provides SQLite-backed persistence for diagnostic archives. type Store struct { db *sql.DB rootDir string + // writeMu serialises all write paths (Save / Delete) so the in-process + // "count then insert" check in SaveBounded is race-free against + // concurrent uploads. SQLite WAL only allows one writer at a time, so + // this also matches the underlying engine's behaviour. + writeMu sync.Mutex } // New opens (or creates) a SQLite database at rootDir/dashboard.db, applies the @@ -77,23 +88,55 @@ func migrateArchivesTriageCount(db *sql.DB) error { return nil } +// backfillTriageFindingCounts populates triage_finding_count for archives +// whose value is NULL. Rows are read fully into memory before any UPDATE is +// issued, then updates run inside a single transaction. This avoids +// interleaving an open SELECT cursor with per-row writes (a fragile pattern +// on SQLite and a correctness hazard on stricter engines). 
func (s *Store) backfillTriageFindingCounts() error { + type pending struct { + archiveID string + storagePath string + } + rows, err := s.db.Query(`SELECT archive_id, storage_path FROM archives WHERE triage_finding_count IS NULL`) if err != nil { return err } - defer rows.Close() + var todo []pending for rows.Next() { - var archiveID, storagePath string - if err := rows.Scan(&archiveID, &storagePath); err != nil { + var p pending + if err := rows.Scan(&p.archiveID, &p.storagePath); err != nil { + rows.Close() return err } - c := countTriageFindingsOnDisk(storagePath) - if _, err := s.db.Exec(`UPDATE archives SET triage_finding_count = ? WHERE archive_id = ?`, c, archiveID); err != nil { + todo = append(todo, p) + } + if err := rows.Err(); err != nil { + rows.Close() + return err + } + rows.Close() + if len(todo) == 0 { + return nil + } + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() + stmt, err := tx.Prepare(`UPDATE archives SET triage_finding_count = ? WHERE archive_id = ?`) + if err != nil { + return err + } + defer stmt.Close() + for _, p := range todo { + if _, err := stmt.Exec(countTriageFindingsOnDisk(p.storagePath), p.archiveID); err != nil { return err } } - return rows.Err() + return tx.Commit() } func countTriageFindingsOnDisk(storagePath string) int { @@ -127,9 +170,17 @@ func (s *Store) HasArchive(archiveID string) bool { return exists } -// Save persists a fully built ArchiveDetail into the database. It inserts the -// archive, all collectors, and all issues in a single transaction. +// Save persists a fully built ArchiveDetail. Equivalent to SaveBounded with +// no cap. func (s *Store) Save(detail *model.ArchiveDetail) error { + return s.SaveBounded(detail, 0) +} + +// SaveBounded persists detail. When maxArchives > 0, returns +// ErrArchiveCapExceeded if the archive count is already at or above the cap. 
+// The check and the insert are serialized via writeMu so concurrent uploads +// cannot both pass an off-by-one cap check. +func (s *Store) SaveBounded(detail *model.ArchiveDetail, maxArchives int) error { if detail == nil { return errors.New("archive detail is nil") } @@ -138,7 +189,19 @@ func (s *Store) Save(detail *model.ArchiveDetail) error { return errors.New("archive id is empty") } - // Serialize manifest_json for the artifact list endpoint. + s.writeMu.Lock() + defer s.writeMu.Unlock() + + if maxArchives > 0 { + var n int64 + if err := s.db.QueryRow(`SELECT COUNT(*) FROM archives`).Scan(&n); err != nil { + return fmt.Errorf("count archives: %w", err) + } + if n >= int64(maxArchives) { + return ErrArchiveCapExceeded + } + } + manifestJSON, err := json.Marshal(detail.Artifacts) if err != nil { return fmt.Errorf("marshal artifacts: %w", err) @@ -183,7 +246,6 @@ func (s *Store) Save(detail *model.ArchiveDetail) error { return fmt.Errorf("insert archive: %w", err) } - // Insert collectors. for _, c := range detail.Collectors { var factsJSON *string if c.Facts != nil { @@ -206,7 +268,6 @@ func (s *Store) Save(detail *model.ArchiveDetail) error { } } - // Insert issues. for _, issue := range detail.Issues { var relJSON, unresJSON *string if len(issue.RelatedArtifactPaths) > 0 { @@ -243,6 +304,9 @@ func (s *Store) Save(detail *model.ArchiveDetail) error { // Delete removes an archive's database rows (cascading to collectors and issues) // and its extracted files from disk. func (s *Store) Delete(archiveID string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + var storagePath string err := s.db.QueryRow("SELECT storage_path FROM archives WHERE archive_id = ?", archiveID).Scan(&storagePath) if err != nil { @@ -288,9 +352,27 @@ func (s *Store) Delete(archiveID string) error { return os.RemoveAll(storagePath) } -// List returns all archive summaries, sorted by uploaded_at descending. 
-func (s *Store) List() []model.ArchiveSummary { - rows, err := s.db.Query(` +// ListPage returns a single page of archive summaries, the unfiltered total +// row count, and the unfiltered total compressed_size. Pagination happens at +// the DB layer (LIMIT/OFFSET) so list responses do not materialize the entire +// archives table per request. +// +// If pageSize <= 0 the page is unbounded (used by the legacy List wrapper). +// Negative offset is clamped to 0. +func (s *Store) ListPage(pageSize, offset int) ([]model.ArchiveSummary, int, int64, error) { + if pageSize < 0 { + pageSize = 0 + } + if offset < 0 { + offset = 0 + } + var total int + var totalBytes sql.NullInt64 + if err := s.db.QueryRow(`SELECT COUNT(*), COALESCE(SUM(compressed_size), 0) FROM archives`).Scan(&total, &totalBytes); err != nil { + return nil, 0, 0, fmt.Errorf("count archives: %w", err) + } + + q := ` SELECT a.archive_id, COALESCE(a.hostname, ''), @@ -323,13 +405,21 @@ LEFT JOIN ( LEFT JOIN ( SELECT archive_id, COUNT(*) AS collector_count FROM collectors GROUP BY archive_id ) cc ON cc.archive_id = a.archive_id -ORDER BY a.uploaded_at DESC`) +ORDER BY a.uploaded_at DESC` + + args := []any{} + if pageSize > 0 { + q += ` LIMIT ? OFFSET ?` + args = append(args, pageSize, offset) + } + + rows, err := s.db.Query(q, args...) 
if err != nil { - return nil + return nil, 0, 0, fmt.Errorf("query archives: %w", err) } defer rows.Close() - var items []model.ArchiveSummary + items := make([]model.ArchiveSummary, 0, pageSize) for rows.Next() { var a model.ArchiveSummary var uploadedAt string @@ -343,14 +433,29 @@ ORDER BY a.uploaded_at DESC`) &a.CollectorCount, &triage, ); err != nil { - continue + return nil, 0, 0, fmt.Errorf("scan archive row: %w", err) } a.UploadedAt, _ = parseTime(uploadedAt) if triage.Valid { - a.TriageFindingCount = int(triage.Int64) + n := int(triage.Int64) + a.TriageFindingCount = &n } items = append(items, a) } + if err := rows.Err(); err != nil { + return nil, 0, 0, fmt.Errorf("iterate archive rows: %w", err) + } + return items, total, totalBytes.Int64, nil +} + +// List returns all archive summaries (no pagination), sorted by uploaded_at +// descending. Prefer ListPage in API code; this exists for tests and +// internal callers that genuinely need the full set. +func (s *Store) List() []model.ArchiveSummary { + items, _, _, err := s.ListPage(0, 0) + if err != nil { + return nil + } return items } @@ -386,18 +491,16 @@ func (s *Store) Get(archiveID string) (*model.ArchiveDetail, bool) { a.UploadedAt, _ = parseTime(uploadedAt) a.IssueCounts = s.issueCounts(archiveID) - // Load collectors. collectors := s.loadCollectors(archiveID) a.CollectorCount = len(collectors) - // Load issues with triage findings from disk. issues := s.loadIssues(archiveID, storagePath) - a.TriageFindingCount = 0 + triageTotal := 0 for _, issue := range issues { - a.TriageFindingCount += len(issue.TriageFindings) + triageTotal += len(issue.TriageFindings) } + a.TriageFindingCount = &triageTotal - // Deserialize artifact list from manifest_json. 
var artifacts []model.ArtifactRecord if manifestJSON != nil { _ = json.Unmarshal(manifestJSON, &artifacts) @@ -488,7 +591,6 @@ func (s *Store) loadIssues(archiveID, storagePath string) []model.IssueRecord { } defer rows.Close() - // Load triage findings lazily. var triageLoaded bool var triageByFP map[string][]model.TriageFinding @@ -516,7 +618,6 @@ func (s *Store) loadIssues(archiveID, storagePath string) []model.IssueRecord { } issue.Source = "manifest" - // Join triage findings by fingerprint. if issue.Fingerprint != "" { if !triageLoaded { triageByFP = loadTriageMap(storagePath) @@ -550,7 +651,8 @@ func (s *Store) enrichIssueEvidence(issues []model.IssueRecord, artifacts []mode } // loadTriageMap reads triage findings from triage/_data/*.json on disk and -// indexes them by fingerprint. +// indexes them by fingerprint. Files are read in lexical order so multiple +// matching findings have a deterministic post-condition. func loadTriageMap(storagePath string) map[string][]model.TriageFinding { result := make(map[string][]model.TriageFinding) root := filepath.Join(storagePath, "triage", "_data") @@ -558,6 +660,7 @@ func loadTriageMap(storagePath string) map[string][]model.TriageFinding { if err != nil { return result } + sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() }) for _, entry := range entries { if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".json") { continue diff --git a/customers/vm-troubleshooting-dashboard/internal/store/store_test.go b/customers/vm-troubleshooting-dashboard/internal/store/store_test.go index 0209b66..30dffe2 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/store_test.go +++ b/customers/vm-troubleshooting-dashboard/internal/store/store_test.go @@ -1,12 +1,36 @@ package store import ( + "encoding/json" + "errors" + "os" + "path/filepath" + "strings" "testing" "time" "github.com/NexGenCloud/diagnostic-dashboard/internal/model" ) +func 
TestNewEnsuresTriageFindingCountColumn(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + defer st.Close() + + var n int + q := `SELECT COUNT(*) FROM pragma_table_info('archives') WHERE name='triage_finding_count'` + if err := st.db.QueryRow(q).Scan(&n); err != nil { + t.Fatalf("pragma: %v", err) + } + if n != 1 { + t.Fatalf("expected triage_finding_count column, pragma count=%d", n) + } +} + func TestGetAddsFallbackEvidence(t *testing.T) { t.Parallel() @@ -134,3 +158,206 @@ func containsAll(have []string, want []string) bool { } return true } + +// minimalDetail returns an ArchiveDetail with just enough fields to satisfy +// Save. Used by tests that don't care about issue/artifact content. +func minimalDetail(id string, storage string) *model.ArchiveDetail { + return &model.ArchiveDetail{ + Summary: model.ArchiveSummary{ + ArchiveID: id, + SchemaVersion: "3.1.0", + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Hostname: "host", + UploadedAt: time.Now().UTC(), + UploadedBy: "tester", + Status: "ready", + StorageBytes: 100, + }, + StorageDir: storage, + } +} + +func TestSaveBounded_RejectsOverCap(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatal(err) + } + defer st.Close() + + for i, id := range []string{"a", "b"} { + dir := st.ArchiveDir(id) + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + if err := st.SaveBounded(minimalDetail(id, dir), 3); err != nil { + t.Fatalf("unexpected cap rejection at i=%d: %v", i, err) + } + } + // Third fits. + { + id := "c" + dir := st.ArchiveDir(id) + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + if err := st.SaveBounded(minimalDetail(id, dir), 3); err != nil { + t.Fatalf("third archive should fit under cap=3: %v", err) + } + } + // Fourth rejected. 
+ { + id := "d" + dir := st.ArchiveDir(id) + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + if err := st.SaveBounded(minimalDetail(id, dir), 3); !errors.Is(err, ErrArchiveCapExceeded) { + t.Fatalf("fourth archive: want ErrArchiveCapExceeded, got %v", err) + } + } +} + +func TestListPage_PaginatesAndTotals(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatal(err) + } + defer st.Close() + + // Seed five archives with distinct storage bytes so totals are checkable. + sizes := []int64{10, 20, 30, 40, 50} + for i, sz := range sizes { + id := "id-" + strings.Repeat("x", i+1) + dir := st.ArchiveDir(id) + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + d := minimalDetail(id, dir) + d.Summary.StorageBytes = sz + if err := st.Save(d); err != nil { + t.Fatalf("Save %s: %v", id, err) + } + } + + items, total, totalBytes, err := st.ListPage(2, 0) + if err != nil { + t.Fatalf("ListPage: %v", err) + } + if total != 5 { + t.Fatalf("total: want 5, got %d", total) + } + if totalBytes != 150 { + t.Fatalf("totalBytes: want 150, got %d", totalBytes) + } + if len(items) != 2 { + t.Fatalf("page size 2: got %d items", len(items)) + } + + // Second page still reports the same totals. + items2, total2, totalBytes2, err := st.ListPage(2, 2) + if err != nil { + t.Fatal(err) + } + if total2 != 5 || totalBytes2 != 150 { + t.Fatalf("second page totals drifted: total=%d bytes=%d", total2, totalBytes2) + } + if len(items2) != 2 { + t.Fatalf("second page: got %d items", len(items2)) + } + // Pages should be disjoint. 
+ seen := map[string]bool{items[0].ArchiveID: true, items[1].ArchiveID: true} + for _, it := range items2 { + if seen[it.ArchiveID] { + t.Fatalf("duplicate archive across pages: %s", it.ArchiveID) + } + } +} + +func TestListPage_NegativeArgsClampedNotPanic(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatal(err) + } + defer st.Close() + + dir := st.ArchiveDir("only") + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + if err := st.Save(minimalDetail("only", dir)); err != nil { + t.Fatal(err) + } + + // Negative pageSize must not panic on make([]T, 0, cap); the contract + // treats <=0 as unbounded. Negative offset clamps to 0. + items, total, _, err := st.ListPage(-5, -10) + if err != nil { + t.Fatalf("ListPage: %v", err) + } + if total != 1 || len(items) != 1 { + t.Fatalf("expected 1 item / total=1, got items=%d total=%d", len(items), total) + } +} + +func TestBackfillTriageFindingCounts(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatal(err) + } + defer st.Close() + + // Seed an archive, then NULL out its triage_finding_count to simulate a + // pre-migration row, and drop two findings on disk. + id := "bf-1" + dir := st.ArchiveDir(id) + if err := os.MkdirAll(filepath.Join(dir, "triage", "_data"), 0o755); err != nil { + t.Fatal(err) + } + for i, name := range []string{"a.json", "b.json"} { + // Findings are counted via loadTriageMap which only keeps entries + // with a non-empty fingerprint. Use distinct fingerprints per file so + // the map accumulates four findings total. 
+ body, _ := json.Marshal(map[string]any{ + "analyzer": "test", + "findings": []map[string]any{ + {"code": "x", "issue_fingerprint": "fp-" + string(rune('a'+i*2))}, + {"code": "y", "issue_fingerprint": "fp-" + string(rune('a'+i*2+1))}, + }, + }) + if err := os.WriteFile(filepath.Join(dir, "triage", "_data", name), body, 0o600); err != nil { + t.Fatal(err) + } + } + if err := st.Save(minimalDetail(id, dir)); err != nil { + t.Fatal(err) + } + if _, err := st.db.Exec(`UPDATE archives SET triage_finding_count = NULL WHERE archive_id = ?`, id); err != nil { + t.Fatal(err) + } + + // Re-open to trigger backfill on startup. + if err := st.Close(); err != nil { + t.Fatal(err) + } + st2, err := New(root) + if err != nil { + t.Fatal(err) + } + defer st2.Close() + + var count int + if err := st2.db.QueryRow(`SELECT triage_finding_count FROM archives WHERE archive_id = ?`, id).Scan(&count); err != nil { + t.Fatal(err) + } + if count != 4 { // 2 files × 2 findings each + t.Fatalf("backfill count: want 4, got %d", count) + } +} diff --git a/customers/vm-troubleshooting/CODEMAP.md b/customers/vm-troubleshooting/CODEMAP.md index 9111fd0..31a7679 100644 --- a/customers/vm-troubleshooting/CODEMAP.md +++ b/customers/vm-troubleshooting/CODEMAP.md @@ -66,6 +66,10 @@ Keep this file updated in the same change as architecture or collector changes. - Owns artifact writing, command/file metadata headers, `manifest.json`, `report.ndjson`, `SUMMARY.txt`, `metadata.json`, and archive creation. - If support needs a new machine-readable field or archive invariant, change this package. +### `internal/identity/` +- `Fingerprint(parts ...string)` — stable 128-bit hex fingerprint (SHA-256 truncated) from an ordered tuple of strings. +- Used by analyzers and collectors to produce deterministic `issue_fingerprint` values. + ### `internal/privilege/` - Detects root/sudo availability and optionally acquires sudo for interactive runs. 
diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index 3875712..e0b0490 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -309,20 +309,25 @@ func (c *JournalCollector) saveStructuredJournalNDJSON(ctx context.Context, r *C func buildJournalNDJSONContent(raw []byte, forceByteTruncated bool) (string, int, bool, string) { sc := bufio.NewScanner(bytes.NewReader(raw)) sc.Buffer(make([]byte, 0, 64*1024), 1024*1024) - written := make([]string, 0, 1024) + + // out is the single output buffer; records are encoded once and written + // straight through. offsets[i] is the byte offset at which record i starts, + // so post-write eviction truncates in O(1) without re-materialising the + // accepted records as individual strings. + var out bytes.Buffer + offsets := make([]int, 0, 1024) recordsWritten := 0 - usedBytes := 0 truncated := false truncationReason := "" for sc.Scan() { - line := strings.TrimSpace(sc.Text()) - if line == "" { + line := bytes.TrimSpace(sc.Bytes()) + if len(line) == 0 { continue } rec := map[string]any{} - if err := json.Unmarshal([]byte(line), &rec); err != nil { + if err := json.Unmarshal(line, &rec); err != nil { // Keep output parseable NDJSON by skipping malformed lines. 
continue } @@ -338,14 +343,15 @@ func buildJournalNDJSONContent(raw []byte, forceByteTruncated bool) (string, int break } needed := len(encoded) + 1 // include newline - if usedBytes+needed+journalNDJSONSentinelReserve > journalNDJSONByteLimit { + if out.Len()+needed+journalNDJSONSentinelReserve > journalNDJSONByteLimit { truncated = true truncationReason = "byte_limit" break } - written = append(written, string(encoded)) + offsets = append(offsets, out.Len()) + out.Write(encoded) + out.WriteByte('\n') recordsWritten++ - usedBytes += needed } if err := sc.Err(); err != nil { truncated = true @@ -362,34 +368,34 @@ func buildJournalNDJSONContent(raw []byte, forceByteTruncated bool) (string, int } if truncated { - // Use a worst-case sentinel size for the eviction loop (the actual - // sentinel can only be smaller after recordsWritten decreases). + // Reserve enough room for the worst-case sentinel. The actual sentinel + // can only get smaller once recordsWritten drops during eviction. worstSentinel, _ := json.Marshal(map[string]any{ "_truncated": true, "records_written": recordsWritten, "reason": truncationReason, }) - for len(written) > 0 && usedBytes+len(worstSentinel)+1 > journalNDJSONByteLimit { - last := written[len(written)-1] - written = written[:len(written)-1] - usedBytes -= len(last) + 1 + for len(offsets) > 0 && out.Len()+len(worstSentinel)+1 > journalNDJSONByteLimit { + last := offsets[len(offsets)-1] + offsets = offsets[:len(offsets)-1] + out.Truncate(last) recordsWritten-- } - // Re-marshal with the actual post-eviction count. 
sentinel, _ := json.Marshal(map[string]any{ "_truncated": true, "records_written": recordsWritten, "reason": truncationReason, }) - if usedBytes+len(sentinel)+1 <= journalNDJSONByteLimit { - written = append(written, string(sentinel)) + if out.Len()+len(sentinel)+1 <= journalNDJSONByteLimit { + out.Write(sentinel) + out.WriteByte('\n') } } - if len(written) == 0 { + if out.Len() == 0 { return "", recordsWritten, truncated, truncationReason } - return strings.Join(written, "\n") + "\n", recordsWritten, truncated, truncationReason + return out.String(), recordsWritten, truncated, truncationReason } func normalizeJournalRecord(rec map[string]any) map[string]any { diff --git a/customers/vm-troubleshooting/internal/config/version.go b/customers/vm-troubleshooting/internal/config/version.go index 788a997..b7b8069 100644 --- a/customers/vm-troubleshooting/internal/config/version.go +++ b/customers/vm-troubleshooting/internal/config/version.go @@ -1,7 +1,7 @@ package config var ( - Version = "0.2.1" + Version = "0.3.0" Commit = "unknown" BuildDate = "unknown" ) diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index 9ef68c5..aa4fa9b 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -205,13 +205,23 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { enrichInputs := buildEnrichmentInputs(results) if len(enrichInputs) > 0 { enrichResult, enrichErr := triage.EnrichIssues(workDir, enrichInputs) - if enrichErr != nil { + switch { + case enrichErr != nil: r.UI.Warn("Issue enrichment failed: " + enrichErr.Error()) - } else if len(enrichResult.Findings) > 0 || len(enrichResult.Facts) > 0 { - // Write enrichment findings via the same RunAllAnalyzers JSON path - writeEnrichmentResult(enrichResult, archiveName, writer, r.UI) - // Track artifacts on existing triage collector result (or synthetic enrichment) - results 
= appendEnrichmentArtifacts(results, enrichResult)
+			// Record the failure on a synthetic enrichment collector result so
+			// the manifest/metadata surface the attempt instead of silently
+			// dropping it. Errors in this stage used to be swallowed.
+			results = appendEnrichmentFailure(results, enrichErr)
+		case len(enrichResult.Findings) > 0 || len(enrichResult.Facts) > 0 || len(enrichResult.Artifacts) > 0:
+			// Only register enrichment artifacts once the envelope write
+			// succeeds — otherwise the manifest would point at files that
+			// never landed on disk.
+			if werr := writeEnrichmentResult(enrichResult, archiveName, writer); werr != nil {
+				r.UI.Warn("Issue enrichment write failed: " + werr.Error())
+				results = appendEnrichmentFailure(results, werr)
+			} else {
+				results = appendEnrichmentArtifacts(results, enrichResult)
+			}
 		}
 	}
 
@@ -541,8 +551,11 @@ func buildEnrichmentInputs(results []*collector.CollectorResult) []triage.Enrich
 	return inputs
 }
 
-// writeEnrichmentResult writes the enrichment triage result to triage/_data/issue_enrichment.json.
-func writeEnrichmentResult(tr *triage.TriageResult, archiveID string, writer *output.Writer, u ui.UI) {
+// writeEnrichmentResult writes the enrichment triage result to
+// triage/_data/<name>.json. It returns an error on any failure so the
+// caller can skip manifest-level registration — otherwise the manifest would
+// claim enrichment artifacts that are not actually on disk.
+func writeEnrichmentResult(tr *triage.TriageResult, archiveID string, writer *output.Writer) error { jsonPath := "triage/_data/" + tr.Name + ".json" typedFacts := output.ConvertFacts(tr.Facts) findings := tr.Findings @@ -559,28 +572,46 @@ func writeEnrichmentResult(tr *triage.TriageResult, archiveID string, writer *ou Facts map[string]any `json:"facts,omitempty"` } - jsonData, jErr := json.MarshalIndent(envelope{ + jsonData, err := json.MarshalIndent(envelope{ Kind: "triage_result", - SchemaVersion: "3.1.0", + SchemaVersion: triage.TriageSchemaVersion, ArchiveID: archiveID, Analyzer: tr.Name, Findings: findings, Facts: typedFacts, }, "", " ") - if jErr != nil { - u.Warn("Failed to marshal enrichment result: " + jErr.Error()) - return + if err != nil { + return fmt.Errorf("marshal enrichment result: %w", err) } - if rErr := writer.ReservePath(jsonPath); rErr != nil { - u.Warn("Failed to reserve " + jsonPath + ": " + rErr.Error()) - return + if err := writer.ReservePath(jsonPath); err != nil { + return fmt.Errorf("reserve %s: %w", jsonPath, err) } - if wErr := writer.SaveOutput(jsonPath, string(jsonData)+"\n"); wErr != nil { + if err := writer.SaveOutput(jsonPath, string(jsonData)+"\n"); err != nil { writer.ReleasePath(jsonPath) - u.Warn("Failed to write " + jsonPath + ": " + wErr.Error()) - return + return fmt.Errorf("write %s: %w", jsonPath, err) } tr.Artifacts = append(tr.Artifacts, jsonPath) + return nil +} + +// appendEnrichmentFailure records an enrichment failure on a synthetic +// "enrichment" collector result. This keeps the manifest invariant — every +// stage is accounted for — when enrichment fails outright and produces no +// findings or artifacts. If a triage collector result already exists, the +// error is attached to it instead (same identity) so we don't duplicate +// stage rows. 
+func appendEnrichmentFailure(results []*collector.CollectorResult, enrichErr error) []*collector.CollectorResult { + for _, res := range results { + if res.ID == "triage" { + res.RecordError(collector.ErrProbeFailed, "enrichment failed: "+enrichErr.Error()) + return results + } + } + syn := collector.NewResult() + syn.ID = "enrichment" + syn.Name = "Enrichment" + syn.RecordError(collector.ErrProbeFailed, "enrichment failed: "+enrichErr.Error()) + return append(results, syn) } // appendEnrichmentArtifacts adds enrichment artifacts to an existing triage collector result, diff --git a/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go b/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go index da1e85c..32c7c12 100644 --- a/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go +++ b/customers/vm-troubleshooting/internal/sanitize/sanitize_test.go @@ -96,6 +96,18 @@ func TestDockerInspect(t *testing.T) { } }, }, + { + "benign env key preserved when value matches sk- pattern", + `[{"Config":{"Env":["MY_SETTING=sk-proj-abcdefghijklmnopqrstuvwxyz1234567890"]}}]`, + func(t *testing.T, out string) { + if strings.Contains(out, "sk-proj-") { + t.Error("OpenAI-style sk- value should be redacted") + } + if !strings.Contains(out, "MY_SETTING=[REDACTED]") { + t.Error("expected MY_SETTING=[REDACTED] for value-only redaction") + } + }, + }, { "sensitive field keys redacted", `{"Config":{"password":"secret123","hostname":"myhost"}}`, diff --git a/customers/vm-troubleshooting/internal/triage/integration_test.go b/customers/vm-troubleshooting/internal/triage/integration_test.go index c73d267..790f644 100644 --- a/customers/vm-troubleshooting/internal/triage/integration_test.go +++ b/customers/vm-troubleshooting/internal/triage/integration_test.go @@ -206,8 +206,8 @@ func TestTriagePipeline_EndToEnd(t *testing.T) { if envelope.Kind != "triage_result" { t.Errorf("%s: kind=%q, want triage_result", name, envelope.Kind) } - if envelope.SchemaVersion != 
triageSchemaVersion { - t.Errorf("%s: schema_version=%q, want %s", name, envelope.SchemaVersion, triageSchemaVersion) + if envelope.SchemaVersion != TriageSchemaVersion { + t.Errorf("%s: schema_version=%q, want %s", name, envelope.SchemaVersion, TriageSchemaVersion) } if envelope.ArchiveID != "vm-diagnostics-test" { t.Errorf("%s: archive_id=%q, want vm-diagnostics-test", name, envelope.ArchiveID) diff --git a/customers/vm-troubleshooting/internal/triage/triage.go b/customers/vm-troubleshooting/internal/triage/triage.go index f061253..810de80 100644 --- a/customers/vm-troubleshooting/internal/triage/triage.go +++ b/customers/vm-troubleshooting/internal/triage/triage.go @@ -93,8 +93,8 @@ type TriageResult struct { // produces a triage result. It should be resilient to missing files. type Analyzer func(ctx context.Context, workDir string) (*TriageResult, error) -// triageSchemaVersion is the schema version emitted in triage result JSON files. -const triageSchemaVersion = "3.1.0" +// TriageSchemaVersion is the schema version emitted in triage result JSON files. +const TriageSchemaVersion = "3.2.0" // RunAllAnalyzers executes all registered analyzers with spinner feedback. // Missing artifacts are handled gracefully — analyzers skip what isn't there. 
@@ -161,7 +161,7 @@ func RunAllAnalyzers(ctx context.Context, workDir string, archiveID string, writ Facts map[string]any `json:"facts,omitempty"` }{ Kind: "triage_result", - SchemaVersion: triageSchemaVersion, + SchemaVersion: TriageSchemaVersion, ArchiveID: archiveID, Analyzer: tr.Name, Findings: findings, From 44c7627e211f90240b58c740b5b05ddc01de8a76 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Thu, 16 Apr 2026 15:27:55 +0200 Subject: [PATCH 11/23] feat: harden gather-info for containerized hypervisors and refresh dashboard --- .../artifacts/ArtifactBrowserPage.tsx | 6 +- .../issue-detail/IssueDetailPage.tsx | 21 +- .../src/components/issues/IssuesPage.tsx | 108 +++- .../src/components/layout/AppShell.tsx | 4 +- .../frontend/src/index.css | 121 +++++ .../schemas/manifest.schema.json | 3 +- .../schemas/report-record.schema.json | 3 +- .../internal/collector/collector.go | 1 + .../internal/collector/collector_test.go | 64 ++- .../internal/collector/container.go | 80 +++ .../internal/collector/container_test.go | 58 +++ .../internal/collector/hypervisor.go | 471 ++++++++++++++++++ .../internal/collector/journal.go | 119 +---- .../internal/collector/network.go | 128 +++++ .../internal/collector/ovs.go | 210 ++++++++ .../internal/collector/ovs_test.go | 29 ++ .../internal/collector/services.go | 142 +++++- .../internal/collector/system.go | 199 ++++++++ .../internal/executor/executor.go | 39 +- .../internal/executor/executor_test.go | 31 ++ .../output/archive_consistency_test.go | 2 +- .../internal/output/manifest.go | 15 + .../internal/output/manifest_test.go | 2 +- .../internal/output/report.go | 2 +- .../internal/runner/runner.go | 4 +- .../internal/sanitize/sanitize.go | 27 + .../internal/triage/critical.go | 32 +- .../internal/triage/critical_test.go | 2 +- .../schemas/manifest.schema.json | 3 +- .../schemas/report-record.schema.json | 3 +- 30 files changed, 1739 insertions(+), 190 deletions(-) create mode 100644 
customers/vm-troubleshooting/internal/collector/container.go create mode 100644 customers/vm-troubleshooting/internal/collector/container_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/hypervisor.go create mode 100644 customers/vm-troubleshooting/internal/collector/ovs.go create mode 100644 customers/vm-troubleshooting/internal/collector/ovs_test.go diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx index 6f309d8..16c0810 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx @@ -69,16 +69,16 @@ export function ArtifactBrowserPage() { } return ( -
+
{archive ? ( ) : null} -
+
{/* File tree */} -

+

Files

diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index 1197655..488d1f3 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -53,7 +53,7 @@ export function IssueDetailPage() { : "Suggested evidence" return ( -
+
{archive ? ( ) : null} @@ -76,13 +76,13 @@ export function IssueDetailPage() {
-
+
{/* Main content */}
{/* What happened */} -

+

What happened

{primaryFinding ? ( @@ -111,7 +111,7 @@ export function IssueDetailPage() { {/* How serious */} -

+

Classification

@@ -130,7 +130,7 @@ export function IssueDetailPage() { {primaryFinding?.action ? ( -

+

Recommended action

@@ -146,9 +146,9 @@ export function IssueDetailPage() { {/* Supporting findings */} {findings.length > 1 ? (
-

- Supporting findings -

+

+ Supporting findings +

{findings.slice(1).map((finding, i) => ( @@ -214,7 +214,7 @@ export function IssueDetailPage() {
-

+

Metadata

@@ -237,7 +237,7 @@ export function IssueDetailPage() { {/* Related artifacts */} -

+

{evidenceLabel}

{relatedArtifacts.length === 0 && triageArtifacts.length === 0 && suggestedArtifacts.length > 0 ? ( @@ -280,4 +280,3 @@ export function IssueDetailPage() { function uniquePaths(paths: string[]) { return Array.from(new Set(paths.filter(Boolean))) } - diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx index b33215b..feaa46c 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx @@ -13,11 +13,24 @@ import { TableRow, } from "@/components/ui/table" import { SeverityBadge } from "@/components/ui/severity-badge" +import { Pill } from "@/components/ui/pill" import { ArchiveHeader } from "@/components/layout/ArchiveHeader" import { useArchive } from "@/api/archives" import { useIssues, type IssueFilters } from "@/api/issues" -import { encodeSegment, categoryLabel, primaryFindingTitle } from "@/lib/utils" -import { Search, X } from "lucide-react" +import { encodeSegment, categoryLabel, primaryFindingTitle, cn } from "@/lib/utils" +import { Search, X, ArrowDown, ArrowUp, ArrowUpDown } from "lucide-react" + +type SortField = "severity" | "confidence" | "source" +type SortDir = "asc" | "desc" + +const severityRank: Record = { critical: 3, warning: 2, info: 1 } +const confidenceRank: Record = { high: 2, low: 1 } + +const confidenceOptions: SelectOption[] = [ + { value: "", label: "All confidence" }, + { value: "high", label: "High confidence" }, + { value: "low", label: "Low confidence" }, +] export function IssuesPage() { const { archiveId } = useParams<{ archiveId: string }>() @@ -34,6 +47,8 @@ export function IssuesPage() { } const [filters, setFilters] = useState(committedFilters) + const [sortField, setSortField] = useState("severity") + const [sortDir, setSortDir] = useState("desc") const { data: archive, 
isLoading: archiveLoading } = useArchive(archiveId!) const { @@ -50,6 +65,7 @@ export function IssuesPage() { const severityOptions = useMemo(() => { const filtered = allIssues.filter((i) => { if (filters.category && i.category !== filters.category) return false + if (filters.confidence && i.confidence.toLowerCase() !== filters.confidence.toLowerCase()) return false if (filters.q) { const q = filters.q.toLowerCase() const title = primaryFindingTitle(i.triage_findings) ?? "" @@ -66,11 +82,12 @@ export function IssuesPage() { } } return opts - }, [allIssues, filters.category, filters.q]) + }, [allIssues, filters.category, filters.confidence, filters.q]) const sourceOptions = useMemo(() => { const filtered = allIssues.filter((i) => { if (filters.severity && i.severity.toLowerCase() !== filters.severity.toLowerCase()) return false + if (filters.confidence && i.confidence.toLowerCase() !== filters.confidence.toLowerCase()) return false if (filters.q) { const q = filters.q.toLowerCase() const title = primaryFindingTitle(i.triage_findings) ?? "" @@ -91,7 +108,7 @@ export function IssuesPage() { { value: "", label: "All sources" }, ...sorted.map(([value, label]) => ({ value, label })), ] - }, [allIssues, filters.severity, filters.q]) + }, [allIssues, filters.severity, filters.confidence, filters.q]) // Auto-reset a dropdown if its selected value was eliminated by the other filters. useEffect(() => { @@ -142,6 +159,42 @@ export function IssuesPage() { const hasActiveFilters = Object.values(filters).some(Boolean) + // Client-side sort + const toggleSort = (field: SortField) => { + if (sortField === field) { + setSortDir((d) => (d === "desc" ? "asc" : "desc")) + } else { + setSortField(field) + setSortDir("desc") + } + } + + const sortedIssues = useMemo(() => { + const dir = sortDir === "desc" ? -1 : 1 + return [...issues].sort((a, b) => { + let cmp = 0 + switch (sortField) { + case "severity": + cmp = (severityRank[a.severity] ?? 0) - (severityRank[b.severity] ?? 
0) + break + case "confidence": + cmp = (confidenceRank[a.confidence] ?? 0) - (confidenceRank[b.confidence] ?? 0) + break + case "source": + cmp = categoryLabel(a.category).localeCompare(categoryLabel(b.category)) + break + } + return cmp * dir + }) + }, [issues, sortField, sortDir]) + + const sortIcon = (field: SortField) => { + if (sortField !== field) return + return sortDir === "desc" + ? + : + } + if (isLoading) { return (
@@ -181,6 +234,16 @@ export function IssuesPage() { }} className="w-[140px] border-border bg-card focus-visible:border-ring" /> + - Severity - Source + toggleSort("severity")} + > + + Severity + {sortIcon("severity")} + + + toggleSort("confidence")} + > + + Confidence + {sortIcon("confidence")} + + + toggleSort("source")} + > + + Source + {sortIcon("source")} + + Issue - {issues.map((issue) => { + {sortedIssues.map((issue) => { const title = primaryFindingTitle(issue.triage_findings) + const isLow = issue.confidence.toLowerCase() === "low" return ( navigate( `/archives/${encodeSegment(archiveId!)}/issues/${encodeSegment(issue.id)}`, @@ -248,6 +337,9 @@ export function IssuesPage() { + + {issue.confidence} + {categoryLabel(issue.category)} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx index 0b7b238..62b184b 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/layout/AppShell.tsx @@ -8,7 +8,7 @@ export function AppShell() { return (
-
+
Dx @@ -25,7 +25,7 @@ export function AppShell() {
-
+
diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/index.css b/customers/vm-troubleshooting-dashboard/frontend/src/index.css index 6bc8c23..004b39e 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/index.css +++ b/customers/vm-troubleshooting-dashboard/frontend/src/index.css @@ -5,6 +5,25 @@ @custom-variant dark (&:is(.dark *)); @theme { + --breakpoint-sm: 40rem; + --breakpoint-md: 48rem; + --breakpoint-lg: 64rem; + --breakpoint-xl: 80rem; + --breakpoint-2xl: 96rem; + + --grid-template-columns-1: repeat(1, minmax(0, 1fr)); + --grid-template-columns-2: repeat(2, minmax(0, 1fr)); + --grid-template-columns-3: repeat(3, minmax(0, 1fr)); + --grid-template-columns-4: repeat(4, minmax(0, 1fr)); + --grid-template-columns-5: repeat(5, minmax(0, 1fr)); + --grid-template-columns-6: repeat(6, minmax(0, 1fr)); + --grid-template-columns-7: repeat(7, minmax(0, 1fr)); + --grid-template-columns-8: repeat(8, minmax(0, 1fr)); + --grid-template-columns-9: repeat(9, minmax(0, 1fr)); + --grid-template-columns-10: repeat(10, minmax(0, 1fr)); + --grid-template-columns-11: repeat(11, minmax(0, 1fr)); + --grid-template-columns-12: repeat(12, minmax(0, 1fr)); + --color-severity-critical: #dc2626; --color-severity-warning: #d97706; --color-severity-info: #2563eb; @@ -101,3 +120,105 @@ @apply border-border outline-ring/50; } } + +@layer components { + .section-label { + font-size: 11px; + font-weight: 600; + line-height: 1rem; + letter-spacing: 0.08em; + text-transform: uppercase; + color: var(--muted-foreground); + } + + .overview-stats-grid { + display: grid; + gap: 1rem; + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + + .overview-panels-grid { + display: grid; + gap: 1.25rem; + grid-template-columns: minmax(0, 1fr); + } + + .kv-two-col { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + column-gap: 1.5rem; + row-gap: 0.75rem; + } + + .collectors-grid { + display: grid; + gap: 1rem; + grid-template-columns: minmax(0, 1fr); + } + + 
.issue-detail-grid { + display: grid; + gap: 1.5rem; + grid-template-columns: minmax(0, 1fr); + } + + .artifact-browser-grid { + display: grid; + gap: 1.25rem; + grid-template-columns: minmax(0, 1fr); + } + + .top-issue-list { + display: grid; + gap: 0.5rem; + } + + .top-issue-card { + border: 1px solid rgba(148, 163, 184, 0.28); + box-shadow: 0 1px 2px rgba(15, 23, 42, 0.04); + } + + .page-shell { + width: min(100%, 76rem); + } + + .page-shell-wide { + width: min(100%, 82rem); + } + + @media (min-width: 48rem) { + .overview-stats-grid { + grid-template-columns: repeat(4, minmax(0, 1fr)); + } + + .collectors-grid { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + } + + @media (min-width: 64rem) { + .overview-panels-grid { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + + .issue-detail-grid { + grid-template-columns: minmax(0, 1fr) 280px; + } + + .artifact-browser-grid { + grid-template-columns: 300px minmax(0, 1fr); + } + + .collectors-grid { + grid-template-columns: repeat(3, minmax(0, 1fr)); + } + + .page-shell { + width: min(100%, 78rem); + } + + .page-shell-wide { + width: min(100%, 84rem); + } + } +} diff --git a/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json b/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json index b5b060d..16cd7ca 100644 --- a/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json @@ -67,7 +67,8 @@ "enum": [ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", - "packages", "storage", "infiniband", "processes", "config", "triage" + "packages", "storage", "infiniband", "processes", "config", "triage", + "hypervisor", "ovs" ] } } diff --git a/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json b/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json 
index 4786d84..ff0ca0e 100644 --- a/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json @@ -26,7 +26,8 @@ "enum": [ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", - "packages", "storage", "infiniband", "processes", "config", "triage" + "packages", "storage", "infiniband", "processes", "config", "triage", + "hypervisor", "ovs" ] } }, diff --git a/customers/vm-troubleshooting/internal/collector/collector.go b/customers/vm-troubleshooting/internal/collector/collector.go index fdc8410..7dbedf3 100644 --- a/customers/vm-troubleshooting/internal/collector/collector.go +++ b/customers/vm-troubleshooting/internal/collector/collector.go @@ -46,6 +46,7 @@ var ValidTags = map[string]bool{ "services": true, "journal": true, "oom": true, "packages": true, "storage": true, "infiniband": true, "processes": true, "config": true, "triage": true, + "hypervisor": true, "ovs": true, } // ValidParserHints is the controlled vocabulary for parser hints. 
diff --git a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index d7d690a..680f421 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -259,36 +259,56 @@ func TestJournalCollectorOOMFilterMatchesRealEvents(t *testing.T) { } } -func TestCountOOMIncidents(t *testing.T) { +func TestOOManchorCountTwoInvocations(t *testing.T) { t.Parallel() - lines := []string{ - "Apr 04 12:00:01 host kernel: python3 invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", - "Apr 04 12:00:02 host kernel: Memory cgroup out of memory: Killed process 111 (python3) total-vm:1234kB", - "Apr 04 12:10:01 host kernel: node invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", - "Apr 04 12:10:02 host kernel: Memory cgroup out of memory: Killed process 222 (python3) total-vm:5678kB", + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["journalctl"] = true + fake.Binaries["dmesg"] = true + const until = "2026-04-08 13:00:00" + base := "-b --no-pager --until=" + until + for _, args := range []string{ + "journalctl " + base + " -k", + "journalctl " + base + " -p err", + "journalctl " + base + " -p warning", + "journalctl " + base + " -k -o json --output-fields=" + journalOutputFields + " --lines=50001", + "journalctl " + base + " -p err -o json --output-fields=" + journalOutputFields + " --lines=50001", + } { + fake.Commands[args] = executor.FakeResponse{Stdout: []byte("log line\n")} } - if got := countOOMIncidents(lines, false); got != 2 { - t.Fatalf("expected 2 incidents from timestamped OOM bursts, got %d", got) + fake.Commands["dmesg -T"] = executor.FakeResponse{Stdout: []byte("ok\n")} + // Two canonical anchor lines → count must be exactly 2. 
+ fake.Commands["journalctl "+base+" -k --grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm --case-sensitive=false"] = executor.FakeResponse{ + Stdout: []byte( + "Apr 04 12:00:01 host kernel: python3 invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0\n" + + "Apr 04 12:00:02 host kernel: Out of memory: Killed process 111 (python3) total-vm:1234kB\n" + + "Apr 04 12:10:01 host kernel: node invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0\n" + + "Apr 04 12:10:02 host kernel: Out of memory: Killed process 222 (node) total-vm:5678kB\n", + ), } + for _, svc := range journalServiceUnits { + fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} + } + root := t.TempDir() - fallbackOnly := []string{ - "kernel: invoked oom-killer: gfp_mask=0x100cca", - "kernel: invoked oom-killer: gfp_mask=0x100cca", + collector := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) + collector.nowFunc = func() time.Time { return time.Date(2026, 4, 8, 13, 0, 0, 0, time.UTC) } + res, err := collector.Collect(context.Background()) + if err != nil { + t.Fatalf("Collect failed: %v", err) } - if got := countOOMIncidents(fallbackOnly, false); got != 2 { - t.Fatalf("expected fallback count=2 from invoked-only lines, got %d", got) + if got := res.Facts["oom_event_count"]; got != "2" { + t.Fatalf("expected oom_event_count=2 for two anchor lines, got %q", got) } -} - -func TestCountOOMIncidentsShortISOPreservesYear(t *testing.T) { - t.Parallel() - lines := []string{ - "2025-12-31T23:59:50Z host kernel: invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", - "2026-01-01T00:00:10Z host kernel: invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0", + foundOOMIssue := false + for _, iss := range res.Issues { + if iss.Code == IssueOOMEvents { + foundOOMIssue = true + } } - if got := countOOMIncidents(lines, true); got != 2 { - 
t.Fatalf("expected 2 incidents across year boundary with short-iso timestamps, got %d", got) + if !foundOOMIssue { + t.Fatal("expected OOM issue for two invocations") } } diff --git a/customers/vm-troubleshooting/internal/collector/container.go b/customers/vm-troubleshooting/internal/collector/container.go new file mode 100644 index 0000000..6da7b28 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/container.go @@ -0,0 +1,80 @@ +package collector + +import ( + "context" + "strings" + "time" + + "github.com/NexGenCloud/vm-diagnostics/internal/config" + "github.com/NexGenCloud/vm-diagnostics/internal/executor" +) + +type containerHit struct { + Name string + Image string +} + +// findRunningContainer looks for a running container whose name/image contains +// all tokens from one of the candidate sets. It returns the first match. +func findRunningContainer(ctx context.Context, exec executor.Executor, candidates ...[]string) (containerHit, bool) { + if !exec.CommandExists("docker") { + return containerHit{}, false + } + spec := executor.CommandSpec{ + Name: "docker", + Args: []string{"ps", "--format", "{{.Names}}|{{.Image}}"}, + NeedsRoot: true, + Timeout: config.TimeoutQuick, + } + res, out, _ := exec.Capture(ctx, spec, 256*1024) + if res.Skipped || res.TimedOut || res.Err != nil { + return containerHit{}, false + } + + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + parts := strings.SplitN(line, "|", 2) + if len(parts) != 2 { + continue + } + hit := containerHit{Name: strings.TrimSpace(parts[0]), Image: strings.TrimSpace(parts[1])} + if hit.Name == "" { + continue + } + if containerMatches(hit, candidates...) 
{ + return hit, true + } + } + return containerHit{}, false +} + +func containerMatches(hit containerHit, candidates ...[]string) bool { + haystack := strings.ToLower(hit.Name + " " + hit.Image) + for _, candidate := range candidates { + if len(candidate) == 0 { + continue + } + matched := true + for _, token := range candidate { + if !strings.Contains(haystack, strings.ToLower(token)) { + matched = false + break + } + } + if matched { + return true + } + } + return false +} + +func hostOrContainerCommandSpec(container containerHit, binary string, args []string, timeout time.Duration, needsRoot, inheritProcGroup bool) executor.CommandSpec { + if container.Name == "" { + return executor.CommandSpec{Name: binary, Args: args, NeedsRoot: needsRoot, Timeout: timeout, InheritProcGroup: inheritProcGroup} + } + containerArgs := append([]string{"exec", container.Name, binary}, args...) + return executor.CommandSpec{Name: "docker", Args: containerArgs, NeedsRoot: needsRoot, Timeout: timeout, InheritProcGroup: inheritProcGroup} +} diff --git a/customers/vm-troubleshooting/internal/collector/container_test.go b/customers/vm-troubleshooting/internal/collector/container_test.go new file mode 100644 index 0000000..69b43ad --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/container_test.go @@ -0,0 +1,58 @@ +package collector + +import ( + "context" + "testing" + "time" + + "github.com/NexGenCloud/vm-diagnostics/internal/executor" +) + +func TestFindRunningContainer(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["docker"] = true + fake.Commands["docker ps --format {{.Names}}|{{.Image}}"] = executor.FakeResponse{Stdout: []byte( + "octoserver_psu_exporter|docker-registry.example/infra/octoserver_psu_exporter:latest\n" + + "nova_libvirt|docker-registry.example/kolla/nova-libvirt:2024.2\n" + + "openvswitch_vswitchd|docker-registry.example/kolla/openvswitch-vswitchd:2024.2\n", + )} + + hit, ok := 
findRunningContainer(context.Background(), fake, []string{"nova", "libvirt"}) + if !ok { + t.Fatal("expected nova libvirt container to be detected") + } + if hit.Name != "nova_libvirt" { + t.Fatalf("expected nova_libvirt, got %q", hit.Name) + } + + hit, ok = findRunningContainer(context.Background(), fake, []string{"openvswitch", "vswitchd"}, []string{"openvswitch", "db"}) + if !ok { + t.Fatal("expected openvswitch container to be detected") + } + if hit.Name != "openvswitch_vswitchd" { + t.Fatalf("expected openvswitch_vswitchd, got %q", hit.Name) + } +} + +func TestHostOrContainerCommandSpec(t *testing.T) { + t.Parallel() + + host := hostOrContainerCommandSpec(containerHit{}, "virsh", []string{"--readonly", "version"}, time.Second, true, true) + if host.Name != "virsh" { + t.Fatalf("expected host command, got %q", host.Name) + } + if !host.InheritProcGroup { + t.Fatal("expected host virsh command to inherit process group") + } + + container := hostOrContainerCommandSpec(containerHit{Name: "nova_libvirt", Image: "kolla/nova-libvirt"}, "virsh", []string{"--readonly", "version"}, time.Second, true, true) + if container.Name != "docker" { + t.Fatalf("expected docker exec, got %q", container.Name) + } + if got := container.String(); got != "docker exec nova_libvirt virsh --readonly version" { + t.Fatalf("unexpected container command string: %q", got) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/hypervisor.go b/customers/vm-troubleshooting/internal/collector/hypervisor.go new file mode 100644 index 0000000..b20ac89 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/hypervisor.go @@ -0,0 +1,471 @@ +package collector + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/NexGenCloud/vm-diagnostics/internal/config" + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + 
"github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +type HypervisorCollector struct{ Base } + +func NewHypervisorCollector(exec executor.Executor, writer *output.Writer, ui ui.UI) *HypervisorCollector { + return &HypervisorCollector{Base{Exec: exec, Writer: writer, UI: ui}} +} + +func (c *HypervisorCollector) Name() string { return "Hypervisor" } +func (c *HypervisorCollector) ID() string { return "hypervisor" } + +func (c *HypervisorCollector) Collect(ctx context.Context) (*CollectorResult, error) { + r := NewResult() + libvirtContainer, hasLibvirtContainer := findRunningContainer(ctx, c.Exec, []string{"nova", "libvirt"}) + + if ok, evidence := hypervisorEvidence(libvirtContainer); !ok { + r.RecordSkip(SkipNotApplicable, "no KVM/libvirt/VFIO evidence detected") + return r, nil + } else { + for _, line := range evidence { + c.UI.Verbose(" " + line) + } + } + + c.saveKVMModules(ctx, r) + c.saveDirConcat(r, "hypervisor/kvm_module_params.txt", "/sys/module/kvm/parameters", + nil, "hypervisor", "config") + c.saveIOMMUGroups(r) + c.saveVFIOBindings(r) + c.savePCIInventory(r) + c.saveHugepageFacts(r) + r.SetFact("numa.nodes", strconv.Itoa(countNumaNodes())) + + if ctx.Err() == nil { + c.saveLibvirt(ctx, r, libvirtContainer, hasLibvirtContainer) + } + + return r, nil +} + +// hypervisorEvidence returns true when at least one indicator of KVM, libvirt, +// or VFIO usage is present on this host. 
+func hypervisorEvidence(libvirtContainer containerHit) (bool, []string) { + var evidence []string + if _, err := os.Stat("/dev/kvm"); err == nil { + evidence = append(evidence, "evidence: /dev/kvm present") + } + for _, sock := range []string{ + "/run/libvirt/libvirt-sock", + "/var/run/libvirt/libvirt-sock", + } { + if _, err := os.Stat(sock); err == nil { + evidence = append(evidence, fmt.Sprintf("evidence: libvirt socket present (%s)", sock)) + } + } + if entries, err := os.ReadDir("/sys/bus/pci/drivers/vfio-pci"); err == nil { + bound := 0 + for _, e := range entries { + if strings.Contains(e.Name(), ":") { + bound++ + } + } + if bound > 0 { + evidence = append(evidence, fmt.Sprintf("evidence: vfio-pci bound devices (%d)", bound)) + } + } + if libvirtContainer.Name != "" { + evidence = append(evidence, fmt.Sprintf("evidence: libvirt container present (%s, %s)", libvirtContainer.Name, libvirtContainer.Image)) + } + return len(evidence) > 0, evidence +} + +func (c *HypervisorCollector) saveKVMModules(ctx context.Context, r *CollectorResult) { + if !c.Exec.CommandExists("lsmod") { + c.saveSkippedArtifact(r, "hypervisor/kvm_modules.txt", "command", "lsmod", + "", SkipCommandUnavailable, "lsmod: unavailable", "hypervisor") + return + } + result, out, _ := c.Exec.Capture(ctx, executor.CommandSpec{ + Name: "lsmod", Timeout: config.TimeoutQuick, + }, 1*1024*1024) + var lines []string + lines = append(lines, "Module Size Used by") + for _, line := range strings.Split(string(out), "\n") { + lower := strings.ToLower(line) + if strings.Contains(lower, "kvm") || strings.Contains(lower, "vfio") || strings.Contains(lower, "vhost") { + lines = append(lines, line) + } + } + c.saveCapturedProbe(r, "hypervisor/kvm_modules.txt", + executor.CommandSpec{Name: "lsmod", Timeout: config.TimeoutQuick}, + result, strings.Join(lines, "\n")+"\n", "", "lsmod", []string{"hypervisor"}) +} + +func (c *HypervisorCollector) saveIOMMUGroups(r *CollectorResult) { + entries, err := 
os.ReadDir("/sys/kernel/iommu_groups") + if err != nil { + c.saveSkippedArtifact(r, "hypervisor/iommu_groups.txt", "file", "", + "/sys/kernel/iommu_groups", SkipSourceUnavailable, + "/sys/kernel/iommu_groups: unavailable (IOMMU may be disabled)", "hypervisor") + return + } + var buf strings.Builder + for _, entry := range entries { + if !entry.IsDir() { + continue + } + groupPath := filepath.Join("/sys/kernel/iommu_groups", entry.Name(), "devices") + devices, readErr := os.ReadDir(groupPath) + if readErr != nil { + continue + } + var devNames []string + for _, d := range devices { + devNames = append(devNames, d.Name()) + } + buf.WriteString(fmt.Sprintf("group %s: %s\n", entry.Name(), strings.Join(devNames, ", "))) + } + if buf.Len() == 0 { + c.saveSkippedArtifact(r, "hypervisor/iommu_groups.txt", "file", "", + "/sys/kernel/iommu_groups", SkipSourceUnavailable, "no IOMMU groups found", "hypervisor") + return + } + c.saveProbeOutput(r, "hypervisor/iommu_groups.txt", buf.String(), "text", "hypervisor") +} + +func (c *HypervisorCollector) saveVFIOBindings(r *CollectorResult) { + entries, err := os.ReadDir("/sys/bus/pci/drivers/vfio-pci") + if err != nil { + c.saveSkippedArtifact(r, "hypervisor/vfio_bindings.txt", "file", "", + "/sys/bus/pci/drivers/vfio-pci", SkipSourceUnavailable, + "/sys/bus/pci/drivers/vfio-pci: unavailable", "hypervisor") + r.SetFact("vfio.bound_devices", "0") + return + } + var buf strings.Builder + count := 0 + for _, entry := range entries { + if !strings.Contains(entry.Name(), ":") { + continue + } + count++ + base := filepath.Join("/sys/bus/pci/drivers/vfio-pci", entry.Name()) + vendor := readSysfsTrimmed(filepath.Join(base, "vendor")) + device := readSysfsTrimmed(filepath.Join(base, "device")) + class := readSysfsTrimmed(filepath.Join(base, "class")) + buf.WriteString(fmt.Sprintf("%s vendor=%s device=%s class=%s\n", + entry.Name(), vendor, device, class)) + } + r.SetFact("vfio.bound_devices", strconv.Itoa(count)) + if buf.Len() == 0 { + 
buf.WriteString("(no devices bound to vfio-pci)\n") + } + c.saveProbeOutput(r, "hypervisor/vfio_bindings.txt", buf.String(), "text", "hypervisor") +} + +// savePCIInventory scans /sys/bus/pci/devices to count NVIDIA GPUs and NVSwitches. +// GPUs: vendor 10de + display class (0x0300xx / 0x0302xx) +// NVSwitches: vendor 10de + bridge class (0x0680xx) +func (c *HypervisorCollector) savePCIInventory(r *CollectorResult) { + entries, err := os.ReadDir("/sys/bus/pci/devices") + if err != nil { + r.SetFact("gpu.pci_nvidia_count", "unavailable") + r.SetFact("gpu.vfio_bound_count", "unavailable") + r.SetFact("gpu.host_driver_count", "unavailable") + r.SetFact("nvswitch.pci_count", "unavailable") + return + } + + gpuTotal := 0 + gpuVFIO := 0 + gpuHost := 0 + nvSwitchTotal := 0 + + for _, entry := range entries { + base := filepath.Join("/sys/bus/pci/devices", entry.Name()) + vendor := readSysfsTrimmed(filepath.Join(base, "vendor")) + if strings.ToLower(vendor) != "0x10de" { + continue + } + cls := strings.ToLower(readSysfsTrimmed(filepath.Join(base, "class"))) + driverLink, _ := os.Readlink(filepath.Join(base, "driver")) + isVFIO := strings.HasSuffix(driverLink, "/vfio-pci") + hasDriver := driverLink != "" && !isVFIO + + switch { + case strings.HasPrefix(cls, "0x0300") || strings.HasPrefix(cls, "0x0302"): + gpuTotal++ + if isVFIO { + gpuVFIO++ + } else if hasDriver { + gpuHost++ + } + case strings.HasPrefix(cls, "0x0680"): + nvSwitchTotal++ + } + } + + r.SetFact("gpu.pci_nvidia_count", strconv.Itoa(gpuTotal)) + r.SetFact("gpu.vfio_bound_count", strconv.Itoa(gpuVFIO)) + r.SetFact("gpu.host_driver_count", strconv.Itoa(gpuHost)) + r.SetFact("nvswitch.pci_count", strconv.Itoa(nvSwitchTotal)) +} + +func (c *HypervisorCollector) saveHugepageFacts(r *CollectorResult) { + entries, err := os.ReadDir("/sys/kernel/mm/hugepages") + if err != nil { + r.SetFact("hugepages.total", "unavailable") + r.SetFact("hugepages.free", "unavailable") + r.SetFact("hugepages.reserved", "unavailable") + 
r.SetFact("hugepages.size_kb", "unavailable") + return + } + + bestDir := "" + bestSize := int64(0) + bestTotal := int64(0) + for _, entry := range entries { + if !entry.IsDir() { + continue + } + sizeStr := strings.TrimPrefix(entry.Name(), "hugepages-") + sizeStr = strings.TrimSuffix(sizeStr, "kB") + sizeKB, parseErr := strconv.ParseInt(sizeStr, 10, 64) + if parseErr != nil { + continue + } + totalStr := readSysfsTrimmed(filepath.Join("/sys/kernel/mm/hugepages", entry.Name(), "nr_hugepages")) + total, parseErr := strconv.ParseInt(totalStr, 10, 64) + if parseErr != nil { + continue + } + if total > 0 && sizeKB > bestSize { + bestSize = sizeKB + bestTotal = total + bestDir = entry.Name() + } + } + + if bestDir == "" { + r.SetFact("hugepages.total", "0") + r.SetFact("hugepages.free", "0") + r.SetFact("hugepages.reserved", "0") + r.SetFact("hugepages.size_kb", "0") + return + } + + base := filepath.Join("/sys/kernel/mm/hugepages", bestDir) + r.SetFact("hugepages.total", strconv.FormatInt(bestTotal, 10)) + r.SetFact("hugepages.size_kb", strconv.FormatInt(bestSize, 10)) + + if free, err := strconv.ParseInt(readSysfsTrimmed(filepath.Join(base, "free_hugepages")), 10, 64); err == nil { + r.SetFact("hugepages.free", strconv.FormatInt(free, 10)) + } else { + r.SetFact("hugepages.free", "unavailable") + } + if resv, err := strconv.ParseInt(readSysfsTrimmed(filepath.Join(base, "resv_hugepages")), 10, 64); err == nil { + r.SetFact("hugepages.reserved", strconv.FormatInt(resv, 10)) + } else { + r.SetFact("hugepages.reserved", "unavailable") + } +} + +func (c *HypervisorCollector) setLibvirtUnavailable(r *CollectorResult) { + r.SetFact("libvirt.running_domains", "unavailable") + r.SetFact("libvirt.total_domains", "unavailable") + r.SetFact("libvirt.vcpu_total", "unavailable") + r.SetFact("libvirt.memory_total_kib", "unavailable") +} + +func (c *HypervisorCollector) saveLibvirt(ctx context.Context, r *CollectorResult, libvirtContainer containerHit, hasContainer bool) { + if 
!hasContainer && !c.Exec.CommandExists("virsh") { + r.RecordSkip(SkipCommandUnavailable, "virsh unavailable; libvirt artifacts skipped") + c.setLibvirtUnavailable(r) + return + } + + // Log diagnostic context for runtime troubleshooting. + if hasContainer { + c.UI.Verbose(fmt.Sprintf(" libvirt container: %s (%s)", libvirtContainer.Name, libvirtContainer.Image)) + } else if path, err := exec.LookPath("virsh"); err == nil { + c.UI.Verbose(fmt.Sprintf(" virsh path: %s", path)) + } + for _, key := range []string{"LIBVIRT_DEFAULT_URI", "VIRSH_DEFAULT_CONNECT_URI", "LIBVIRT_AUTH_FILE"} { + if v, ok := os.LookupEnv(key); ok { + c.UI.Verbose(fmt.Sprintf(" env %s=%s", key, v)) + } + } + for _, sock := range []string{ + "/run/libvirt/libvirt-sock-ro", "/var/run/libvirt/libvirt-sock-ro", + "/run/libvirt/libvirt-sock", "/var/run/libvirt/libvirt-sock", + } { + if fi, err := os.Stat(sock); err == nil { + c.UI.Verbose(fmt.Sprintf(" socket: %s (mode=%s)", sock, fi.Mode())) + } + } + c.UI.Verbose(fmt.Sprintf(" euid=%d has_root=%t", os.Geteuid(), c.Exec.HasRoot())) + + // Explicit connection URI avoids ambiguity between session/system daemons. + baseArgs := []string{"--readonly", "-c", "qemu:///system"} + + // Gate on virsh version: if the libvirt connection cannot be established + // within 10s, skip remaining probes rather than cascading timeouts. + // InheritProcGroup avoids a hang where virsh's D-Bus/polkit interaction + // blocks indefinitely when the process is in a separate process group. 
+ if !c.saveVirshCapture(ctx, r, "hypervisor/virsh_version.txt", hostOrContainerCommandSpec(libvirtContainer, "virsh", append(baseArgs, "version"), 10*time.Second, true, true), 1*1024*1024) { + c.UI.Verbose(" virsh connection failed; skipping remaining libvirt probes") + c.setLibvirtUnavailable(r) + return + } + + c.saveVirshCapture(ctx, r, "hypervisor/virsh_nodeinfo.txt", hostOrContainerCommandSpec(libvirtContainer, "virsh", append(baseArgs, "nodeinfo"), 10*time.Second, true, true), 1*1024*1024) + + c.saveVirshCapture(ctx, r, "hypervisor/virsh_capabilities.xml", hostOrContainerCommandSpec(libvirtContainer, "virsh", append(baseArgs, "capabilities"), 15*time.Second, true, true), 2*1024*1024) + + c.saveVirshCapture(ctx, r, "hypervisor/virsh_net_list.txt", hostOrContainerCommandSpec(libvirtContainer, "virsh", append(baseArgs, "net-list", "--all"), 10*time.Second, true, true), 1*1024*1024) + + c.saveVirshCapture(ctx, r, "hypervisor/virsh_pool_list.txt", hostOrContainerCommandSpec(libvirtContainer, "virsh", append(baseArgs, "pool-list", "--all"), 10*time.Second, true, true), 1*1024*1024) + + // virsh list --all: captures running + defined domains and yields total count. + listSpec := hostOrContainerCommandSpec(libvirtContainer, "virsh", append(baseArgs, "list", "--all"), 10*time.Second, true, true) + listResult, listOut, _ := c.Exec.Capture(ctx, listSpec, 1*1024*1024) + c.saveCapturedProbe(r, "hypervisor/virsh_list.txt", listSpec, listResult, + string(listOut), "", "text", []string{"hypervisor"}) + + running, total := parseVirshList(string(listOut)) + r.SetFact("libvirt.running_domains", strconv.Itoa(running)) + r.SetFact("libvirt.total_domains", strconv.Itoa(total)) + + // virsh domstats: aggregate vCPU and memory across all domains. 
+ statsSpec := hostOrContainerCommandSpec(libvirtContainer, "virsh", append(baseArgs, "domstats", "--state", "--vcpu", "--balloon"), 20*time.Second, true, true) + statsSpec.IgnoreExit = true + statsResult, statsOut, _ := c.Exec.Capture(ctx, statsSpec, 4*1024*1024) + c.saveCapturedProbe(r, "hypervisor/virsh_domstats.txt", statsSpec, statsResult, + string(statsOut), "", "text", []string{"hypervisor"}) + + vcpuTotal, memKIB := parseVirshDomStats(string(statsOut)) + if vcpuTotal >= 0 { + r.SetFact("libvirt.vcpu_total", strconv.Itoa(vcpuTotal)) + } else { + r.SetFact("libvirt.vcpu_total", "unavailable") + } + if memKIB >= 0 { + r.SetFact("libvirt.memory_total_kib", strconv.FormatInt(memKIB, 10)) + } else { + r.SetFact("libvirt.memory_total_kib", "unavailable") + } +} + +// saveVirshCapture runs a virsh command via Capture and saves the result. +// Returns true if the command completed successfully. +func (c *HypervisorCollector) saveVirshCapture(ctx context.Context, r *CollectorResult, path string, spec executor.CommandSpec, limitBytes int64) bool { + result, stdout, stderr := c.Exec.Capture(ctx, spec, limitBytes) + _ = c.saveCapturedProbe(r, path, spec, result, string(stdout), string(stderr), "text", []string{"hypervisor"}) + if result.Skipped { + r.RecordSkipForArtifact(SkipPermissionOrAccess, spec.String(), path) + return false + } + if result.TimedOut { + c.UI.Verbose(fmt.Sprintf(" timed out: %s (%s)", spec.String(), result.Duration.Round(time.Millisecond))) + r.RecordErrorForArtifact(ErrCommandTimedOut, fmt.Sprintf("%s: timed out after %s", spec.String(), result.Duration.Round(time.Millisecond)), path) + return false + } + if result.Err != nil && !spec.IgnoreExit { + c.UI.Verbose(fmt.Sprintf(" error: %s: %v (exit %d)", spec.String(), result.Err, result.ExitCode)) + r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", spec.String(), result.Err), path) + return false + } + return true +} + +// parseVirshList counts running and total domain entries from 
"virsh list --all" output. +func parseVirshList(out string) (running, total int) { + for _, line := range strings.Split(out, "\n") { + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + if fields[0] != "-" { + if _, err := strconv.Atoi(fields[0]); err != nil { + continue + } + } + total++ + if strings.EqualFold(fields[len(fields)-1], "running") { + running++ + } + } + return running, total +} + +// parseVirshDomStats sums vcpu.current and balloon.current (KiB) across all domains. +// Returns -1 for each value if no data could be parsed. +func parseVirshDomStats(out string) (vcpuTotal int, memKIB int64) { + vcpuTotal = -1 + memKIB = -1 + vcpuSum := 0 + memSum := int64(0) + anyVCPU := false + anyMem := false + for _, line := range strings.Split(out, "\n") { + line = strings.TrimSpace(line) + switch { + case strings.HasPrefix(line, "vcpu.current="): + v, err := strconv.Atoi(strings.TrimPrefix(line, "vcpu.current=")) + if err == nil { + vcpuSum += v + anyVCPU = true + } + case strings.HasPrefix(line, "balloon.current="): + v, err := strconv.ParseInt(strings.TrimPrefix(line, "balloon.current="), 10, 64) + if err == nil { + memSum += v + anyMem = true + } + } + } + if anyVCPU { + vcpuTotal = vcpuSum + } + if anyMem { + memKIB = memSum + } + return vcpuTotal, memKIB +} + +// countNumaNodes counts /sys/devices/system/node/nodeN directories. +func countNumaNodes() int { + entries, err := os.ReadDir("/sys/devices/system/node") + if err != nil { + return 0 + } + count := 0 + for _, e := range entries { + if e.IsDir() && strings.HasPrefix(e.Name(), "node") { + if _, parseErr := strconv.Atoi(e.Name()[4:]); parseErr == nil { + count++ + } + } + } + if count == 0 { + return 1 + } + return count +} + +// readSysfsTrimmed reads a sysfs file and returns its trimmed content, +// or an empty string on error. 
+func readSysfsTrimmed(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + return strings.TrimSpace(string(data)) +} diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index e0b0490..42ff284 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -17,6 +17,10 @@ import ( "github.com/NexGenCloud/vm-diagnostics/internal/ui" ) +// oomAnchorSubstr is the canonical kernel OOM invocation anchor. +// Each occurrence is a distinct OOM killer call, regardless of timing proximity. +const oomAnchorSubstr = "invoked oom-killer" + type JournalCollector struct { Base Since string @@ -40,7 +44,6 @@ const ( journalNDJSONRecordLimit = 50000 journalNDJSONByteLimit = 10 * 1024 * 1024 journalNDJSONSentinelReserve = 256 - oomIncidentGap = 5 * time.Second ) var journalServiceUnits = []string{ @@ -75,15 +78,9 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCommand(ctx, r, "logs/dmesg.txt", executor.CommandSpec{Name: "dmesg", Args: []string{"-T"}, NeedsRoot: true, Timeout: config.TimeoutMedium}, "dmesg", "journal") } - // Grep journal for OOM events (short-iso keeps the year on the timestamp). - useShortISO := c.journalShortISOAvailable(ctx) - if !useShortISO { - c.UI.Warn("journalctl -o short-iso is not available; OOM event timestamps may be inaccurate across calendar years") - } + // Grep journal for OOM events. The artifact retains all matching context lines; + // the count uses only the canonical "invoked oom-killer" anchor (one per kernel call). oomJournalArgs := append(append([]string{}, journalArgs...), "-k") - if useShortISO { - oomJournalArgs = append([]string{"-o", "short-iso"}, oomJournalArgs...) 
- } oomSpec := executor.CommandSpec{ Name: "journalctl", Args: append(append(oomJournalArgs, "--grep=invoked oom-killer|oom-kill|Out of memory: Killed|killed process .* total-vm"), "--case-sensitive=false"), @@ -103,14 +100,14 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } - var oom []string + var oom []string // all context lines kept in the artifact + oomAnchorCount := 0 for _, line := range strings.Split(string(stdout), "\n") { line = strings.TrimSpace(line) if line == "" { continue } - // Skip sudo audit lines that log our own journalctl command - // (they contain the grep pattern as a literal substring of the command) + // Skip sudo audit lines that contain the grep pattern as a literal command substring. if strings.Contains(line, "COMMAND=") { continue } @@ -121,9 +118,12 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error (strings.Contains(l, "killed process") && strings.Contains(l, "total-vm")) { oom = append(oom, line) } + // Count only canonical anchor lines — one per distinct kernel OOM invocation. + if strings.Contains(l, oomAnchorSubstr) { + oomAnchorCount++ + } } - oomIncidentCount := countOOMIncidents(oom, useShortISO) // journalctl --grep returns exit 1 when no entries match (like grep). // Only treat exit codes >= 2 as real errors. 
oomRealErr := oomResult.Err != nil && oomResult.ExitCode >= 2 @@ -136,14 +136,14 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error r.SetFact("oom_event_count", "unavailable") r.RecordErrorForArtifact(ErrCommandFailed, fmt.Sprintf("%s: %v", oomSpec.String(), oomResult.Err), oomPath) c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, string(stdout), string(stderr), "journalctl", []string{"oom"}, "OOM scan incomplete") - } else if oomIncidentCount > 0 { - r.SetFact("oom_event_count", fmt.Sprintf("%d", oomIncidentCount)) + } else if oomAnchorCount > 0 { + r.SetFact("oom_event_count", fmt.Sprintf("%d", oomAnchorCount)) r.AddIssueWithArtifacts( IssueOOMEvents, SeverityCritical, ConfidenceHigh, "MEM", - fmt.Sprintf("%d OOM killer event(s)", oomIncidentCount), + fmt.Sprintf("%d OOM killer invocation(s)", oomAnchorCount), []string{oomPath}, "journal", string(IssueOOMEvents), @@ -151,7 +151,12 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, strings.Join(oom, "\n")+"\n", "", "journalctl", []string{"oom"}) } else { r.SetFact("oom_event_count", "0") - c.saveCapturedProbe(r, oomPath, oomSpec, oomResult, "No OOM events found\n", "", "journalctl", []string{"oom"}) + // journalctl --grep exits 1 on no match (like grep); neutralize so + // saveCapturedProbe records status=ok instead of error. 
+ noMatchResult := oomResult + noMatchResult.Err = nil + noMatchResult.ExitCode = 0 + c.saveCapturedProbe(r, oomPath, oomSpec, noMatchResult, "No OOM events found\n", "", "journalctl", []string{"oom"}, "No OOM events found") } for _, svc := range journalServiceUnits { @@ -161,86 +166,6 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } -func (c *JournalCollector) journalShortISOAvailable(ctx context.Context) bool { - spec := executor.CommandSpec{ - Name: "journalctl", - Args: []string{"--no-pager", "-o", "short-iso", "-n", "0"}, - NeedsRoot: true, - Timeout: config.TimeoutQuick, - } - res, _, _ := c.Exec.Capture(ctx, spec, 2048) - if res.Skipped { - return false - } - if res.Err != nil && res.ExitCode >= 2 { - return false - } - return true -} - -func countOOMIncidents(lines []string, useShortISO bool) int { - count := 0 - var lastIncidentAt time.Time - haveLastIncidentAt := false - for _, line := range lines { - ts, ok := oomLineTimestamp(line, useShortISO) - if !ok { - // Unexpected formatting: preserve support value by counting the line. 
- count++ - continue - } - if !haveLastIncidentAt || ts.Sub(lastIncidentAt) > oomIncidentGap { - count++ - lastIncidentAt = ts - haveLastIncidentAt = true - } - } - return count -} - -func oomLineTimestamp(line string, useShortISO bool) (time.Time, bool) { - fields := strings.Fields(strings.TrimSpace(line)) - if len(fields) < 1 { - return time.Time{}, false - } - if useShortISO { - return parseJournalShortISOTimestamp(fields[0]) - } - if len(fields) < 4 { - return time.Time{}, false - } - tsText := strings.Join(fields[:3], " ") - ts, err := time.ParseInLocation("Jan 02 15:04:05", tsText, time.Local) - if err != nil { - return time.Time{}, false - } - return time.Date(time.Now().Year(), ts.Month(), ts.Day(), ts.Hour(), ts.Minute(), ts.Second(), 0, time.Local), true -} - -func parseJournalShortISOTimestamp(s string) (time.Time, bool) { - layouts := []string{ - time.RFC3339Nano, - time.RFC3339, - "2006-01-02T15:04:05.999999999Z07:00", - "2006-01-02T15:04:05.999999Z07:00", - "2006-01-02T15:04:05Z07:00", - "2006-01-02 15:04:05", - "2006-01-02T15:04:05", - } - for _, layout := range layouts { - if t, err := time.Parse(layout, s); err == nil { - return t, true - } - } - if t, err := time.ParseInLocation("2006-01-02T15:04:05.999999999", s, time.Local); err == nil { - return t, true - } - if t, err := time.ParseInLocation("2006-01-02T15:04:05", s, time.Local); err == nil { - return t, true - } - return time.Time{}, false -} - func (c *JournalCollector) journalBaseArgs(until string) []string { args := []string{"--no-pager", "--until=" + until} if c.Since == "" || c.Since == "boot" { diff --git a/customers/vm-troubleshooting/internal/collector/network.go b/customers/vm-troubleshooting/internal/collector/network.go index a005a37..724ff3b 100644 --- a/customers/vm-troubleshooting/internal/collector/network.go +++ b/customers/vm-troubleshooting/internal/collector/network.go @@ -2,7 +2,9 @@ package collector import ( "context" + "fmt" "os" + "strings" 
"github.com/NexGenCloud/vm-diagnostics/internal/config" "github.com/NexGenCloud/vm-diagnostics/internal/executor" @@ -23,6 +25,12 @@ func (c *NetworkCollector) ID() string { return "network" } func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() + + // Detect which network manager owns interfaces and emit the fact + artifact. + manager, managerReport := detectNetworkManager(ctx, c.Exec) + r.SetFact("network.manager", manager) + c.saveProbeOutput(r, "network/manager_detection.txt", managerReport, "text", "network") + for _, native := range []struct { path string content string @@ -123,3 +131,123 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error } return r, nil } + +// detectNetworkManager determines which network manager owns the host's interfaces. +// It returns a canonical manager name and a human-readable detection report. +// +// Priority: +// 1. Runtime ownership (ifupdown state file, networkctl managed check, NM) +// 2. Netplan renderer intent +// 3. Config-file evidence +// +// Returns one of: ifupdown | networkd | NetworkManager | netplan+networkd | +// netplan+NetworkManager | mixed | none | unknown +func detectNetworkManager(ctx context.Context, exec executor.Executor) (string, string) { + var report strings.Builder + report.WriteString("Network manager detection\n") + report.WriteString("=========================\n") + + // --- Runtime signals --- + + // ifupdown: /run/network/ifstate is written at runtime. 
+ for _, statePath := range []string{"/run/network/ifstate", "/var/run/network/ifstate"} { + if data, err := os.ReadFile(statePath); err == nil && len(strings.TrimSpace(string(data))) > 0 { + report.WriteString(fmt.Sprintf("ifupdown runtime state: %s (present)\n", statePath)) + ifState := strings.TrimSpace(string(data)) + if len(strings.Fields(ifState)) > 0 { + report.WriteString("Result: ifupdown (runtime state file has active interfaces)\n") + return "ifupdown", report.String() + } + } + } + + // NetworkManager: check if NM is managing any interfaces via nmcli. + if exec.CommandExists("nmcli") { + spec := executor.CommandSpec{Name: "nmcli", Args: []string{"device", "status"}, Timeout: config.TimeoutQuick} + res, out, _ := exec.Capture(ctx, spec, 128*1024) + if res.Err == nil && !res.Skipped { + managedCount := 0 + for _, line := range strings.Split(string(out), "\n") { + fields := strings.Fields(line) + // nmcli device status columns: DEVICE TYPE STATE CONNECTION + if len(fields) >= 3 && fields[2] != "unmanaged" && fields[2] != "STATE" { + managedCount++ + } + } + if managedCount > 0 { + // Check whether netplan is the renderer frontend. + if _, err := os.Stat("/etc/netplan"); err == nil { + report.WriteString("netplan + NetworkManager detected (nmcli has managed devices, /etc/netplan exists)\n") + return "netplan+NetworkManager", report.String() + } + report.WriteString(fmt.Sprintf("NetworkManager detected (nmcli shows %d managed device(s))\n", managedCount)) + return "NetworkManager", report.String() + } + } + } + + // systemd-networkd: check via networkctl. 
+ if exec.CommandExists("networkctl") { + spec := executor.CommandSpec{Name: "networkctl", Args: []string{"list", "--no-pager"}, Timeout: config.TimeoutQuick} + res, out, _ := exec.Capture(ctx, spec, 128*1024) + if res.Err == nil && !res.Skipped { + managedCount := 0 + for _, line := range strings.Split(string(out), "\n") { + low := strings.ToLower(line) + if strings.Contains(low, "configured") || strings.Contains(low, "routable") || strings.Contains(low, "degraded") { + managedCount++ + } + } + if managedCount > 0 { + if _, err := os.Stat("/etc/netplan"); err == nil { + report.WriteString("netplan + networkd detected (networkctl shows managed interfaces, /etc/netplan exists)\n") + return "netplan+networkd", report.String() + } + report.WriteString(fmt.Sprintf("systemd-networkd detected (networkctl shows %d managed interface(s))\n", managedCount)) + return "networkd", report.String() + } + } + } + + // --- Config-file evidence (weaker signal) --- + + if _, err := os.Stat("/etc/netplan"); err == nil { + // Read netplan config to determine renderer. 
+ renderer := "networkd" // netplan default + if entries, readErr := os.ReadDir("/etc/netplan"); readErr == nil { + for _, entry := range entries { + if entry.IsDir() || (!strings.HasSuffix(entry.Name(), ".yaml") && !strings.HasSuffix(entry.Name(), ".yml")) { + continue + } + if data, fileErr := os.ReadFile("/etc/netplan/" + entry.Name()); fileErr == nil { + if strings.Contains(strings.ToLower(string(data)), "renderer: networkmanager") { + renderer = "NetworkManager" + } else if strings.Contains(strings.ToLower(string(data)), "renderer: networkd") { + renderer = "networkd" + } + } + } + } + result := "netplan+" + renderer + report.WriteString(fmt.Sprintf("netplan config found, inferred renderer: %s\n", result)) + return result, report.String() + } + + if _, err := os.Stat("/etc/network/interfaces"); err == nil { + report.WriteString("/etc/network/interfaces found: likely ifupdown (runtime state absent)\n") + return "ifupdown", report.String() + } + + if _, err := os.Stat("/etc/NetworkManager"); err == nil { + report.WriteString("/etc/NetworkManager found: likely NetworkManager\n") + return "NetworkManager", report.String() + } + + if _, err := os.Stat("/etc/systemd/network"); err == nil { + report.WriteString("/etc/systemd/network found: likely networkd\n") + return "networkd", report.String() + } + + report.WriteString("No definitive signal found\n") + return "unknown", report.String() +} diff --git a/customers/vm-troubleshooting/internal/collector/ovs.go b/customers/vm-troubleshooting/internal/collector/ovs.go new file mode 100644 index 0000000..d48197a --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/ovs.go @@ -0,0 +1,210 @@ +package collector + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/NexGenCloud/vm-diagnostics/internal/config" + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + 
"github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +type OVSCollector struct{ Base } + +func NewOVSCollector(exec executor.Executor, writer *output.Writer, ui ui.UI) *OVSCollector { + return &OVSCollector{Base{Exec: exec, Writer: writer, UI: ui}} +} + +func (c *OVSCollector) Name() string { return "OVS" } +func (c *OVSCollector) ID() string { return "ovs" } + +func (c *OVSCollector) Collect(ctx context.Context) (*CollectorResult, error) { + r := NewResult() + ovsContainer, hasContainer := findRunningContainer(ctx, c.Exec, []string{"openvswitch", "vswitchd"}, []string{"openvswitch", "db"}) + + if !ovsEvidence(c.Exec, hasContainer) { + r.RecordSkip(SkipNotApplicable, "no OVS evidence detected") + return r, nil + } + if hasContainer { + c.UI.Verbose(fmt.Sprintf(" ovs container: %s (%s)", ovsContainer.Name, ovsContainer.Image)) + } + + // --- Version --- + versionSpec := hostOrContainerCommandSpec(ovsContainer, "ovs-vsctl", []string{"--version"}, config.TimeoutQuick, true, false) + if c.Exec.CommandExists("ovs-vsctl") || hasContainer { + c.saveCommand(ctx, r, "ovs/ovs_version.txt", versionSpec, "text", "ovs") + } + + // --- Bridge and port topology --- + bridgeCount := -1 + portCount := -1 + if c.Exec.CommandExists("ovs-vsctl") || hasContainer { + // Bridge list + bridgesSpec := hostOrContainerCommandSpec(ovsContainer, "ovs-vsctl", []string{"list-br"}, config.TimeoutQuick, true, false) + bridgesResult, bridgesOut, _ := c.Exec.Capture(ctx, bridgesSpec, 256*1024) + c.saveCapturedProbe(r, "ovs/ovs_bridges.txt", bridgesSpec, bridgesResult, + string(bridgesOut), "", "text", []string{"ovs"}) + bridgeCount = countNonEmptyLines(string(bridgesOut)) + + // show: full topology overview + c.saveCommand(ctx, r, "ovs/ovs_show.txt", hostOrContainerCommandSpec(ovsContainer, "ovs-vsctl", []string{"show"}, config.TimeoutMedium, true, false), "text", "ovs") + + // Port count: sum ports across all bridges + portSum := 0 + if bridgeCount > 0 { + for _, bridge := range 
strings.Split(strings.TrimSpace(string(bridgesOut)), "\n") { + bridge = strings.TrimSpace(bridge) + if bridge == "" { + continue + } + portsSpec := hostOrContainerCommandSpec(ovsContainer, "ovs-vsctl", []string{"list-ports", bridge}, config.TimeoutQuick, true, false) + _, portsOut, _ := c.Exec.Capture(ctx, portsSpec, 256*1024) + portSum += countNonEmptyLines(string(portsOut)) + } + portCount = portSum + } else { + portCount = 0 + } + } + + if bridgeCount >= 0 { + r.SetFact("ovs.bridge_count", strconv.Itoa(bridgeCount)) + } else { + r.SetFact("ovs.bridge_count", "unavailable") + } + if portCount >= 0 { + r.SetFact("ovs.port_count", strconv.Itoa(portCount)) + } else { + r.SetFact("ovs.port_count", "unavailable") + } + + // --- Datapath info --- + if c.Exec.CommandExists("ovs-dpctl") || hasContainer { + c.saveCommand(ctx, r, "ovs/ovs_datapath.txt", hostOrContainerCommandSpec(ovsContainer, "ovs-dpctl", []string{"show"}, config.TimeoutQuick, true, false), "text", "ovs") + // Datapath kind: hw-offload (tc) or kernel (ovs-native). 
+ dpkindSpec := hostOrContainerCommandSpec(ovsContainer, "ovs-dpctl", []string{"dump-dps"}, config.TimeoutQuick, true, false) + dpkindResult, dpkindOut, _ := c.Exec.Capture(ctx, dpkindSpec, 64*1024) + if dpkindResult.Err == nil && !dpkindResult.Skipped { + kind := "kernel" + if strings.Contains(strings.ToLower(string(dpkindOut)), "tc") { + kind = "offload" + } + r.SetFact("ovs.datapath_kind", kind) + } else { + r.SetFact("ovs.datapath_kind", "unknown") + } + } else { + r.SetFact("ovs.datapath_kind", "unknown") + } + + // --- appctl stats (optional) --- + if c.Exec.CommandExists("ovs-appctl") || hasContainer { + c.saveCommand(ctx, r, "ovs/ovs_coverage.txt", hostOrContainerCommandSpec(ovsContainer, "ovs-appctl", []string{"coverage/show"}, config.TimeoutQuick, true, false), "text", "ovs") + c.saveCommand(ctx, r, "ovs/ovs_memory.txt", hostOrContainerCommandSpec(ovsContainer, "ovs-appctl", []string{"memory/show"}, config.TimeoutQuick, true, false), "text", "ovs") + } + + // --- Stale OVS sockets --- + staleCount := c.saveOVSStale(r) + r.SetFact("ovs.stale_socket_count", strconv.Itoa(staleCount)) + + return r, nil +} + +// ovsEvidence returns true when there is runtime evidence that OVS is present. 
+func ovsEvidence(exec executor.Executor, hasContainer bool) bool {
+	if exec.CommandExists("ovs-vsctl") || hasContainer {
+		return true
+	}
+	// Runtime socket check
+	for _, dir := range ovsRuntimeDirs() {
+		if entries, err := os.ReadDir(dir); err == nil && len(entries) > 0 {
+			return true
+		}
+	}
+	return false
+}
+
+func ovsRuntimeDirs() []string {
+	return dedupeDirsByTarget([]string{"/var/run/openvswitch", "/run/openvswitch"})
+}
+
+func dedupeDirsByTarget(paths []string) []string {
+	seen := make(map[string]struct{}, len(paths))
+	out := make([]string, 0, len(paths))
+	for _, dir := range paths {
+		canonical := dir
+		if resolved, err := filepath.EvalSymlinks(dir); err == nil {
+			canonical = resolved
+		}
+		if _, ok := seen[canonical]; ok {
+			continue
+		}
+		seen[canonical] = struct{}{}
+		out = append(out, dir)
+	}
+	return out
+}
+
+// saveOVSStale lists stale OVS control sockets (files matching *.ctl or ndu-sock.*)
+// that do not correspond to a live process. Returns the stale count.
+func (c *OVSCollector) saveOVSStale(r *CollectorResult) int {
+	var stale []string
+	for _, dir := range ovsRuntimeDirs() {
+		entries, err := os.ReadDir(dir)
+		if err != nil {
+			continue
+		}
+		for _, entry := range entries {
+			name := entry.Name()
+			if !strings.HasSuffix(name, ".ctl") && !strings.HasPrefix(name, "ndu-sock.") {
+				continue
+			}
+			// OVS socket names embed the PID: "ovs-vswitchd.<pid>.ctl" / "ndu-sock.<pid>".
+			// Check whether the PID is still alive.
+			parts := strings.Split(name, ".")
+			if len(parts) >= 2 {
+				// "<daemon>.<pid>.ctl": PID is the second-to-last dot component.
+				pidStr := parts[len(parts)-2]
+				if !strings.HasSuffix(name, ".ctl") {
+					// "ndu-sock.<pid>": PID is the last component.
+					pidStr = parts[len(parts)-1]
+				}
+				if pid, parseErr := strconv.Atoi(pidStr); parseErr == nil {
+					if _, statErr := os.Stat(filepath.Join("/proc", strconv.Itoa(pid))); statErr != nil {
+						stale = append(stale, filepath.Join(dir, name))
+					}
+					// PID was extractable: a live /proc/<pid> means the socket is NOT stale,
+					// so never fall through to the "(no PID)" branch below.
+					continue
+				}
+			}
+			// No PID extractable — report as potentially stale.
+ stale = append(stale, filepath.Join(dir, name)+" (no PID)") + } + } + + content := strings.Join(stale, "\n") + if content == "" { + content = "(none)\n" + } else { + content += "\n" + } + c.saveProbeOutput(r, "ovs/ovs_stale_sockets.txt", content, "text", "ovs") + return len(stale) +} + +// countNonEmptyLines counts non-blank lines in a string. +func countNonEmptyLines(s string) int { + count := 0 + for _, line := range strings.Split(s, "\n") { + if strings.TrimSpace(line) != "" { + count++ + } + } + return count +} diff --git a/customers/vm-troubleshooting/internal/collector/ovs_test.go b/customers/vm-troubleshooting/internal/collector/ovs_test.go new file mode 100644 index 0000000..36e0e74 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/ovs_test.go @@ -0,0 +1,29 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" +) + +func TestDedupeDirsByTarget(t *testing.T) { + t.Parallel() + + root := t.TempDir() + realDir := filepath.Join(root, "openvswitch-real") + if err := os.Mkdir(realDir, 0o755); err != nil { + t.Fatalf("mkdir real dir: %v", err) + } + aliasDir := filepath.Join(root, "openvswitch-alias") + if err := os.Symlink(realDir, aliasDir); err != nil { + t.Fatalf("create symlink: %v", err) + } + + dirs := dedupeDirsByTarget([]string{aliasDir, realDir}) + if len(dirs) != 1 { + t.Fatalf("expected 1 deduped dir, got %d: %#v", len(dirs), dirs) + } + if dirs[0] != aliasDir { + t.Fatalf("expected first unique path to win, got %q", dirs[0]) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/services.go b/customers/vm-troubleshooting/internal/collector/services.go index b626f0a..0892fc2 100644 --- a/customers/vm-troubleshooting/internal/collector/services.go +++ b/customers/vm-troubleshooting/internal/collector/services.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "path/filepath" "slices" "strings" @@ -203,24 +204,123 @@ func (c *ServicesCollector) detectNVSwitch(ctx context.Context) nvSwitchPresence return 
nvSwitchAbsent } +// ibLinkLayer reads /sys/class/infiniband/*/ports/*/link_layer to determine what +// transport the host's InfiniBand ports actually carry. +// Returns one of: "none", "ethernet", "infiniband", "mixed", "unknown". +func ibLinkLayer() string { + glob, err := filepath.Glob("/sys/class/infiniband/*/ports/*/link_layer") + if err != nil || len(glob) == 0 { + return "none" + } + hasIB := false + hasEth := false + for _, path := range glob { + data, readErr := os.ReadFile(path) + if readErr != nil { + continue + } + switch strings.TrimSpace(strings.ToLower(string(data))) { + case "infiniband": + hasIB = true + case "ethernet": + hasEth = true + } + } + switch { + case hasIB && hasEth: + return "mixed" + case hasIB: + return "infiniband" + case hasEth: + return "ethernet" + default: + return "unknown" + } +} + +// isIfupdownActive returns true when ifupdown (not systemd-networkd) manages the +// host's network interfaces. +// Primary signal: /run/network/ifstate, written by ifupdown at runtime. +// Secondary: /etc/network/interfaces exists (config-level evidence). +func isIfupdownActive() bool { + for _, statePath := range []string{"/run/network/ifstate", "/var/run/network/ifstate"} { + if data, err := os.ReadFile(statePath); err == nil && len(strings.TrimSpace(string(data))) > 0 { + return true + } + } + _, err := os.Stat("/etc/network/interfaces") + return err == nil +} + +// isNVSwitchVFIOBound returns true when NVIDIA bridge-class PCI devices +// (vendor 10de, class 0x0680xx — i.e. NVSwitch) are present and all of them +// are bound to the vfio-pci driver. +// This is the physical-passthrough fingerprint on HGX hypervisors: NVSwitch +// hardware exists in the PCI tree but host-side FM is not applicable. 
+func isNVSwitchVFIOBound() bool { + entries, err := os.ReadDir("/sys/bus/pci/devices") + if err != nil { + return false + } + nvSwitchCount := 0 + vfioBound := 0 + for _, entry := range entries { + base := filepath.Join("/sys/bus/pci/devices", entry.Name()) + vendorBytes, err := os.ReadFile(filepath.Join(base, "vendor")) + if err != nil || strings.TrimSpace(string(vendorBytes)) != "0x10de" { + continue + } + classBytes, err := os.ReadFile(filepath.Join(base, "class")) + if err != nil { + continue + } + // PCI class 0x0680xx: PCI_BASE_CLASS_BRIDGE (0x06) + subclass 0x80 (Other Bridge) + if !strings.HasPrefix(strings.ToLower(strings.TrimSpace(string(classBytes))), "0x0680") { + continue + } + nvSwitchCount++ + driverLink, readErr := os.Readlink(filepath.Join(base, "driver")) + if readErr == nil && strings.HasSuffix(driverLink, "/vfio-pci") { + vfioBound++ + } + } + return nvSwitchCount > 0 && nvSwitchCount == vfioBound +} + // reportFailedServices classifies failed services and sets the failed_service_count fact. -// Handles fabricmanager false positive: requires BOTH NVSwitch positively absent -// AND benign failure text (NV_WARN_NOTHING_TO_DO) to downgrade. +// It filters out known-benign failures before counting, so the fact and any svc_failed +// issue reflect only real problems. func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context.Context, failedNames []string) { if len(failedNames) == 0 { r.SetFact("failed_service_count", "0") return } - // Check for fabricmanager false positive - fmIdx := -1 - for i, name := range failedNames { - if name == "nvidia-fabricmanager.service" { - fmIdx = i - break - } + // --- Filter 1: systemd-networkd-wait-online on ifupdown hosts --- + // This service waits for interfaces managed by systemd-networkd. On hosts + // using ifupdown, it will always "fail" because no interfaces are networkd-managed. 
+ if isIfupdownActive() { + failedNames = slices.DeleteFunc(failedNames, func(name string) bool { + return name == "systemd-networkd-wait-online.service" + }) + } + + // --- Filter 2: ibacm / opensm when no real InfiniBand fabric is active --- + // Only suppress when the kernel sysfs confirms no IB-transport ports exist. + // Hosts with iWARP/RoCE (Ethernet-only RDMA) do not need these services. + linkLayer := ibLinkLayer() + if linkLayer != "infiniband" && linkLayer != "mixed" { + failedNames = slices.DeleteFunc(failedNames, func(name string) bool { + return name == "ibacm.service" || name == "opensm.service" + }) } + // --- Filter 3: fabricmanager false positive --- + // Suppress FM failure when NVSwitch is absent or all NVSwitch hardware is + // passed through to VMs (host FM is not applicable in either case). + fmIdx := slices.IndexFunc(failedNames, func(name string) bool { + return name == "nvidia-fabricmanager.service" + }) if fmIdx >= 0 && c.isFabricManagerBenign(ctx) { r.AddIssueWithArtifacts( IssueSvcFabricmanagerBenign, @@ -232,11 +332,10 @@ func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context "svc", string(IssueSvcFabricmanagerBenign), ) - // Remove from failed list for accurate counting - failedNames = append(failedNames[:fmIdx], failedNames[fmIdx+1:]...) + failedNames = slices.Delete(failedNames, fmIdx, fmIdx+1) } - // Set fact AFTER filtering — count reflects real failures only + // Set fact AFTER all filtering — count reflects real failures only r.SetFact("failed_service_count", fmt.Sprintf("%d", len(failedNames))) if len(failedNames) > 0 { @@ -267,13 +366,18 @@ func (c *ServicesCollector) reportFailedServices(r *CollectorResult, ctx context } } -// isFabricManagerBenign returns true only when BOTH conditions are met: -// 1. NVSwitch is positively absent (not unknown) -// 2. 
fabricmanager failure text contains the known benign signature +// isFabricManagerBenign returns true when the FM failure is expected and harmless: +// +// 1. NVSwitch is positively absent (no SXM hardware on this host), OR all +// NVSwitch PCI devices are bound to vfio-pci (passed through to VMs). +// 2. The fabricmanager service text contains the known benign signature, +// confirming FM itself agrees there is nothing to do. func (c *ServicesCollector) isFabricManagerBenign(ctx context.Context) bool { - if c.detectNVSwitch(ctx) != nvSwitchAbsent { + // Condition 1: no NVSwitch hardware, or all NVSwitch is VFIO-bound. + if c.detectNVSwitch(ctx) != nvSwitchAbsent && !isNVSwitchVFIOBound() { return false } + // Condition 2: FM service text confirms benign exit. if !c.Exec.CommandExists("systemctl") { return false } @@ -284,7 +388,7 @@ func (c *ServicesCollector) isFabricManagerBenign(ctx context.Context) bool { if result.Err == nil { return false // service is actually running, not a failure } - output := strings.ToLower(string(stdout)) - return strings.Contains(output, "nv_warn_nothing_to_do") || - strings.Contains(output, "nothing to do") + out := strings.ToLower(string(stdout)) + return strings.Contains(out, "nv_warn_nothing_to_do") || + strings.Contains(out, "nothing to do") } diff --git a/customers/vm-troubleshooting/internal/collector/system.go b/customers/vm-troubleshooting/internal/collector/system.go index 6229c30..9b0bb32 100644 --- a/customers/vm-troubleshooting/internal/collector/system.go +++ b/customers/vm-troubleshooting/internal/collector/system.go @@ -3,6 +3,8 @@ package collector import ( "context" "fmt" + "os" + "strconv" "strings" "github.com/NexGenCloud/vm-diagnostics/internal/config" @@ -101,9 +103,206 @@ func (c *SystemCollector) Collect(ctx context.Context) (*CollectorResult, error) c.saveFile(r, "system/hostname_file.txt", "/etc/hostname", nil, "identity") c.saveProbeOutput(r, "system/ulimits.txt", probe.Ulimits(), "procfs", "config") + + // 
--- Kernel cmdline (sanitized) --- + c.saveKernelCmdline(r) + + // --- NUMA topology --- + if c.Exec.CommandExists("numactl") { + c.saveCommand(ctx, r, "hardware/numa_topology.txt", executor.CommandSpec{ + Name: "numactl", Args: []string{"--hardware"}, Timeout: config.TimeoutQuick, + }, "text", "hardware") + } else { + c.saveNodeSysfs(r) + } + + // --- Hugepages (sysfs per-size view) --- + c.saveHugepages(r) + return r, nil } +// saveKernelCmdline reads /proc/cmdline, sanitizes it, saves the artifact, and +// emits kernel.iommu_enabled and kernel.transparent_hugepage facts. +func (c *SystemCollector) saveKernelCmdline(r *CollectorResult) { + data, err := os.ReadFile("/proc/cmdline") + if err != nil { + c.saveSkippedArtifact(r, "system/kernel_cmdline.txt", "file", "", "/proc/cmdline", + SkipSourceUnavailable, "/proc/cmdline: unavailable", "config") + r.SetFact("kernel.iommu_enabled", "unknown") + r.SetFact("kernel.transparent_hugepage", "unknown") + return + } + raw := strings.TrimSpace(string(data)) + sanitized := sanitize.KernelCmdline(raw) + + if err := c.Writer.ReservePath("system/kernel_cmdline.txt"); err != nil { + r.RecordErrorForArtifact(ErrArtifactReserve, "system/kernel_cmdline.txt: "+err.Error(), "system/kernel_cmdline.txt") + } else if err := c.Writer.SaveReadFile("system/kernel_cmdline.txt", "/proc/cmdline", sanitized+"\n", true); err != nil { + c.Writer.ReleasePath("system/kernel_cmdline.txt") + r.RecordErrorForArtifact(ErrArtifactWrite, "system/kernel_cmdline.txt: "+err.Error(), "system/kernel_cmdline.txt") + } else { + r.AddFileArtifact("system/kernel_cmdline.txt", "/proc/cmdline", true, "config") + } + + // IOMMU enabled fact — inferred from kernel parameters. 
+ lower := strings.ToLower(raw) + switch { + case strings.Contains(lower, "intel_iommu=on"), + strings.Contains(lower, "amd_iommu=on"), + strings.Contains(lower, "iommu=pt"), + strings.Contains(lower, "iommu=1"): + r.SetFact("kernel.iommu_enabled", "true") + default: + r.SetFact("kernel.iommu_enabled", "false") + } + + // Transparent hugepage setting from cmdline. + thpFact := "unknown" + for _, param := range strings.Fields(raw) { + if strings.HasPrefix(strings.ToLower(param), "transparent_hugepage=") { + val := strings.ToLower(strings.SplitN(param, "=", 2)[1]) + switch val { + case "always", "madvise", "never": + thpFact = val + } + break + } + } + // Fall back to sysfs if not set on cmdline. + if thpFact == "unknown" { + if sysData, readErr := os.ReadFile("/sys/kernel/mm/transparent_hugepage/enabled"); readErr == nil { + // Format: "always [madvise] never" — active mode is in brackets. + content := string(sysData) + for _, mode := range []string{"always", "madvise", "never"} { + if strings.Contains(content, "["+mode+"]") { + thpFact = mode + break + } + } + } + } + r.SetFact("kernel.transparent_hugepage", thpFact) +} + +// saveNodeSysfs writes a simple NUMA node summary from /sys/devices/system/node/ +// when numactl is not available. +func (c *SystemCollector) saveNodeSysfs(r *CollectorResult) { + entries, err := os.ReadDir("/sys/devices/system/node") + if err != nil { + c.saveSkippedArtifact(r, "hardware/numa_topology.txt", "file", "", "/sys/devices/system/node", + SkipSourceUnavailable, "/sys/devices/system/node: unavailable", "hardware") + return + } + var buf strings.Builder + for _, entry := range entries { + if !strings.HasPrefix(entry.Name(), "node") { + continue + } + buf.WriteString(entry.Name() + ":\n") + // Emit cpulist if present. + if data, err := os.ReadFile("/sys/devices/system/node/" + entry.Name() + "/cpulist"); err == nil { + buf.WriteString(" cpulist: " + strings.TrimSpace(string(data)) + "\n") + } + // Emit MemTotal from per-node meminfo. 
+ if data, err := os.ReadFile("/sys/devices/system/node/" + entry.Name() + "/meminfo"); err == nil { + for _, line := range strings.Split(string(data), "\n") { + if strings.Contains(line, "MemTotal") { + buf.WriteString(" " + strings.TrimSpace(line) + "\n") + } + } + } + } + if buf.Len() == 0 { + c.saveSkippedArtifact(r, "hardware/numa_topology.txt", "file", "", "/sys/devices/system/node", + SkipSourceUnavailable, "no NUMA nodes found in sysfs", "hardware") + return + } + c.saveProbeOutput(r, "hardware/numa_topology.txt", buf.String(), "text", "hardware") +} + +// saveHugepages writes a hugepages summary from /proc/meminfo (HugeTLB lines) +// and /sys/kernel/mm/hugepages/ (per-size breakdown), then emits meminfo facts. +func (c *SystemCollector) saveHugepages(r *CollectorResult) { + meminfoData, err := os.ReadFile("/proc/meminfo") + if err != nil { + r.SetFact("meminfo.mem_available_kib", "unavailable") + r.SetFact("meminfo.hugetlb_kib", "unavailable") + r.SetFact("meminfo.swap_total_kib", "unavailable") + r.SetFact("meminfo.swap_free_kib", "unavailable") + return + } + meminfo := parseMeminfoPairs(string(meminfoData)) + + setMemFact := func(factKey, meminfoKey string) { + if v, ok := meminfo[meminfoKey]; ok { + r.SetFact(factKey, strconv.FormatInt(v, 10)) + } else { + r.SetFact(factKey, "unavailable") + } + } + setMemFact("meminfo.mem_available_kib", "MemAvailable") + // Note: HugePages_Total × Hugepagesize = total kB reserved. + // Only publish the fact when the product is known; otherwise leave it unavailable. + hugepageReserved := "unavailable" + if total, ok1 := meminfo["HugePages_Total"]; ok1 { + if size, ok2 := meminfo["Hugepagesize"]; ok2 { + hugepageReserved = strconv.FormatInt(total*size, 10) + } + } + r.SetFact("meminfo.hugetlb_kib", hugepageReserved) + setMemFact("meminfo.swap_total_kib", "SwapTotal") + setMemFact("meminfo.swap_free_kib", "SwapFree") + + // Build the hugepages artifact: relevant /proc/meminfo lines + sysfs breakdown. 
+ var buf strings.Builder + buf.WriteString("=== /proc/meminfo (hugepage lines) ===\n") + for _, line := range strings.Split(string(meminfoData), "\n") { + low := strings.ToLower(line) + if strings.Contains(low, "huge") || strings.Contains(low, "hugetlb") { + buf.WriteString(line + "\n") + } + } + buf.WriteString("\n=== /sys/kernel/mm/hugepages ===\n") + if entries, readErr := os.ReadDir("/sys/kernel/mm/hugepages"); readErr == nil { + for _, entry := range entries { + buf.WriteString(entry.Name() + ":\n") + for _, subfile := range []string{"nr_hugepages", "free_hugepages", "resv_hugepages", "surplus_hugepages"} { + path := fmt.Sprintf("/sys/kernel/mm/hugepages/%s/%s", entry.Name(), subfile) + if data, fileErr := os.ReadFile(path); fileErr == nil { + buf.WriteString(fmt.Sprintf(" %s: %s\n", subfile, strings.TrimSpace(string(data)))) + } + } + } + } + c.saveProbeOutput(r, "hardware/hugepages.txt", buf.String(), "text", "memory") +} + +// parseMeminfoPairs parses /proc/meminfo into a key→kB map. 
+// Lines have the format: "MemTotal: 16384000 kB" +func parseMeminfoPairs(content string) map[string]int64 { + out := make(map[string]int64) + for _, line := range strings.Split(content, "\n") { + colon := strings.IndexByte(line, ':') + if colon < 0 { + continue + } + key := strings.TrimSpace(line[:colon]) + rest := strings.TrimSpace(line[colon+1:]) + // Strip trailing unit (usually "kB") + val := strings.Fields(rest) + if len(val) == 0 { + continue + } + n, err := strconv.ParseInt(val[0], 10, 64) + if err != nil { + continue + } + out[key] = n + } + return out +} + func (c *SystemCollector) saveProcess(ctx context.Context, r *CollectorResult, path string, spec executor.CommandSpec) { if err := ValidateTagsAndHint("ps", []string{"processes"}); err != nil { r.RecordErrorForArtifact(ErrArtifactValidation, fmt.Sprintf("%s: %v", path, err), path) diff --git a/customers/vm-troubleshooting/internal/executor/executor.go b/customers/vm-troubleshooting/internal/executor/executor.go index 9ff0e64..ceb0211 100644 --- a/customers/vm-troubleshooting/internal/executor/executor.go +++ b/customers/vm-troubleshooting/internal/executor/executor.go @@ -16,12 +16,13 @@ import ( // CommandSpec describes a command to execute. type CommandSpec struct { - Name string - Args []string - NeedsRoot bool - Timeout time.Duration // 0 = use parent context deadline - Env []string // additional env vars (nil = inherit) - IgnoreExit bool // useful for status/info commands that may return non-zero but still produce support value + Name string + Args []string + NeedsRoot bool + Timeout time.Duration // 0 = use parent context deadline + Env []string // additional env vars (nil = inherit) + IgnoreExit bool // useful for status/info commands that may return non-zero but still produce support value + InheritProcGroup bool // stay in parent's process group (needed for commands that use D-Bus/polkit) } // String returns a human-readable representation of the command. 
@@ -104,19 +105,23 @@ func (e *RealExecutor) buildCmd(ctx context.Context, spec CommandSpec) (*exec.Cm cmd := exec.CommandContext(ctx, name, args...) - // Process group isolation for clean subprocess tree cleanup - cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} - - // WaitDelay bounds pipe cleanup after process kill (Go 1.20+) - cmd.WaitDelay = 5 * time.Second - - // Custom cancel: send SIGTERM to process group instead of just the process - cmd.Cancel = func() error { - if cmd.Process == nil { - return nil + if !spec.InheritProcGroup { + // Process group isolation for clean subprocess tree cleanup. + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + // WaitDelay bounds pipe cleanup after process kill (Go 1.20+). + cmd.WaitDelay = 5 * time.Second + // Custom cancel: send SIGTERM to process group instead of just the process. + cmd.Cancel = func() error { + if cmd.Process == nil { + return nil + } + return syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM) } - return syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM) } + // When InheritProcGroup is true, the child stays in the parent's process + // group. On timeout Go sends SIGKILL directly (no SIGTERM grace period). + // Use this for commands that interact with D-Bus or polkit, which can hang + // when the caller is in a separate process group. 
// Additional env vars if len(spec.Env) > 0 { diff --git a/customers/vm-troubleshooting/internal/executor/executor_test.go b/customers/vm-troubleshooting/internal/executor/executor_test.go index f6f6694..49e31f7 100644 --- a/customers/vm-troubleshooting/internal/executor/executor_test.go +++ b/customers/vm-troubleshooting/internal/executor/executor_test.go @@ -44,3 +44,34 @@ func TestRealExecutorSkipRoot(t *testing.T) { t.Fatalf("expected skip message, got %q", stdout) } } + +func TestRealExecutorInheritProcGroup(t *testing.T) { + t.Parallel() + + exec := NewReal(false) + + cmd, skipped := exec.buildCmd(context.Background(), CommandSpec{Name: "/bin/true"}) + if skipped { + t.Fatal("expected non-root command to build") + } + if cmd.SysProcAttr == nil || !cmd.SysProcAttr.Setpgid { + t.Fatalf("expected default command to isolate process group, got %#v", cmd.SysProcAttr) + } + if cmd.WaitDelay != 5*time.Second { + t.Fatalf("expected default WaitDelay=5s, got %s", cmd.WaitDelay) + } + if cmd.Cancel == nil { + t.Fatal("expected default command to install custom Cancel") + } + + cmd, skipped = exec.buildCmd(context.Background(), CommandSpec{Name: "/bin/true", InheritProcGroup: true}) + if skipped { + t.Fatal("expected inherited process-group command to build") + } + if cmd.SysProcAttr != nil { + t.Fatalf("expected no SysProcAttr when inheriting process group, got %#v", cmd.SysProcAttr) + } + if cmd.WaitDelay != 0 { + t.Fatalf("expected no WaitDelay when inheriting process group, got %s", cmd.WaitDelay) + } +} diff --git a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go index 116c7ed..bb01dbb 100644 --- a/customers/vm-troubleshooting/internal/output/archive_consistency_test.go +++ b/customers/vm-troubleshooting/internal/output/archive_consistency_test.go @@ -24,7 +24,7 @@ func TestStructuredArchiveContainsSchemasAndConsistentIndexes(t *testing.T) { } meta := ManifestMeta{ - 
SchemaVersion: "3.2.0", + SchemaVersion: "3.3.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", diff --git a/customers/vm-troubleshooting/internal/output/manifest.go b/customers/vm-troubleshooting/internal/output/manifest.go index 00b4b7d..5555336 100644 --- a/customers/vm-troubleshooting/internal/output/manifest.go +++ b/customers/vm-troubleshooting/internal/output/manifest.go @@ -104,10 +104,25 @@ type ManifestJSON struct { // Integer fact keys — values are converted from string to int in the manifest. // "unavailable" maps to null (json omit or explicit null). var integerFactKeys = map[string]bool{ + // Existing keys "cpu_cores": true, "gpu_count": true, "memory_total": true, "gpu_unreachable_count": true, "gpu_total_count": true, "container_count": true, "vllm_container_count": true, "failed_service_count": true, "xid_classified_count": true, "critical_event_count": true, "oom_event_count": true, + // P1.1 HypervisorCollector + "libvirt.running_domains": true, "libvirt.total_domains": true, + "libvirt.vcpu_total": true, "libvirt.memory_total_kib": true, + "vfio.bound_devices": true, + "gpu.pci_nvidia_count": true, "gpu.vfio_bound_count": true, "gpu.host_driver_count": true, + "nvswitch.pci_count": true, + "hugepages.total": true, "hugepages.free": true, + "hugepages.reserved": true, "hugepages.size_kb": true, + "numa.nodes": true, + // P1.2 OVSCollector + "ovs.bridge_count": true, "ovs.port_count": true, "ovs.stale_socket_count": true, + // P1.3 SystemCollector + "meminfo.mem_available_kib": true, "meminfo.hugetlb_kib": true, + "meminfo.swap_total_kib": true, "meminfo.swap_free_kib": true, } // ConvertFacts converts string facts to typed JSON values per the explicit allowlist. 
diff --git a/customers/vm-troubleshooting/internal/output/manifest_test.go b/customers/vm-troubleshooting/internal/output/manifest_test.go index 2e6f12a..d287c83 100644 --- a/customers/vm-troubleshooting/internal/output/manifest_test.go +++ b/customers/vm-troubleshooting/internal/output/manifest_test.go @@ -37,7 +37,7 @@ func TestWriteManifestFromResultsStagesSchemasAndValidates(t *testing.T) { } meta := ManifestMeta{ - SchemaVersion: "3.2.0", + SchemaVersion: "3.3.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: "vm-diagnostics-test", Version: "dev", diff --git a/customers/vm-troubleshooting/internal/output/report.go b/customers/vm-troubleshooting/internal/output/report.go index 31e8938..5620fc6 100644 --- a/customers/vm-troubleshooting/internal/output/report.go +++ b/customers/vm-troubleshooting/internal/output/report.go @@ -45,7 +45,7 @@ type ReportRecord struct { ErrorCount int `json:"error_count,omitempty"` } -const reportSchemaVersion = "3.2.0" +const reportSchemaVersion = "3.3.0" // WriteReport writes report.ndjson from manifest input data. // Order is deterministic: per collector (registration order) → artifacts → issues → facts (sorted) → summary. 
diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index aa4fa9b..ad2ccd7 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -155,6 +155,8 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { registry.Register(collector.NewAdditionalCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewStorageCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewInfiniBandCollector(r.Exec, writer, r.UI)) + registry.Register(collector.NewHypervisorCollector(r.Exec, writer, r.UI)) + registry.Register(collector.NewOVSCollector(r.Exec, writer, r.UI)) results, err := registry.RunAll(ctx, r.Config.SkipSet(), r.UI) if err != nil { @@ -364,7 +366,7 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { } manifestMeta := output.ManifestMeta{ - SchemaVersion: "3.2.0", + SchemaVersion: "3.3.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: archiveName, Version: config.Version, diff --git a/customers/vm-troubleshooting/internal/sanitize/sanitize.go b/customers/vm-troubleshooting/internal/sanitize/sanitize.go index 4416195..e2a2ce5 100644 --- a/customers/vm-troubleshooting/internal/sanitize/sanitize.go +++ b/customers/vm-troubleshooting/internal/sanitize/sanitize.go @@ -16,6 +16,12 @@ var ( commonEnvRE = regexp.MustCompile(`(?i)\b(AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|OPENAI_API_KEY|ANTHROPIC_API_KEY|HF_TOKEN|HUGGINGFACE_API_KEY|DATABASE_URL|PGPASSWORD|MYSQL_PWD|REDIS_PASSWORD)=[^\s]+`) ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;?]*[ -/]*[@-~]`) + // cmdlineURLUserinfoRE matches URL userinfo (user:password@) in kernel cmdline values. + // Cloud-init NoCloud seedfrom URIs may carry ftp://user:pass@host/path credentials. 
+ cmdlineURLUserinfoRE = regexp.MustCompile(`([a-zA-Z][a-zA-Z0-9+\-.]*://)([^/@:]+:[^/@]+)@`) + // cmdlineSecretParamRE matches key=value parameters whose key names suggest credentials. + cmdlineSecretParamRE = regexp.MustCompile(`(?i)\b(password|passwd|secret|token|credential|api[_-]?key|auth[_-]?token|private[_-]?key|seedfrom)=(\S+)`) + // dockerEnvSecretKeyRE matches environment variable names that should always be redacted. dockerEnvSecretKeyRE = regexp.MustCompile(`(?i)^(TOKEN|AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|GITHUB_TOKEN|GH_TOKEN|SLACK_TOKEN|VAULT_TOKEN|NPM_TOKEN|PYPI_TOKEN|CARGO_REGISTRY_TOKEN|OPENAI_API_KEY|ANTHROPIC_API_KEY|HF_TOKEN|HUGGINGFACE_TOKEN|HUGGINGFACE_API_KEY|DATABASE_URL|PGPASSWORD|MYSQL_PWD|REDIS_PASSWORD|.*(?:PASSWORD|PASSWD|PSK|SECRET|CREDENTIAL|API[_-]?KEY|AUTH[_-]?TOKEN|PRIVATE[_-]?KEY))$`) @@ -216,3 +222,24 @@ func GrepLines(input, pattern string) []string { func StripANSI(input string) string { return ansiEscapeRE.ReplaceAllString(input, "") } + +// KernelCmdline sanitizes /proc/cmdline content. +// It preserves flag names and non-sensitive tuning values, but redacts: +// - URL userinfo (user:password@ in seedfrom= URIs and similar) +// - values for key= parameters whose names suggest credentials or provisioning URIs +// +// The full flag string (key= prefix) is always preserved so support engineers +// can still identify which parameters are configured. +func KernelCmdline(cmdline string) string { + // Redact URL userinfo: scheme://user:pass@host → scheme://[REDACTED]@host + out := cmdlineURLUserinfoRE.ReplaceAllString(cmdline, "${1}[REDACTED]@") + // Redact credential-bearing key=value parameters. 
+ out = cmdlineSecretParamRE.ReplaceAllStringFunc(out, func(m string) string { + eqIdx := strings.IndexByte(m, '=') + if eqIdx < 0 { + return m + } + return m[:eqIdx+1] + "[REDACTED]" + }) + return out +} diff --git a/customers/vm-troubleshooting/internal/triage/critical.go b/customers/vm-troubleshooting/internal/triage/critical.go index 9e46ea6..41fa390 100644 --- a/customers/vm-troubleshooting/internal/triage/critical.go +++ b/customers/vm-troubleshooting/internal/triage/critical.go @@ -62,10 +62,13 @@ var criticalPatterns = []CriticalPattern{ KernelOnly: true, }, { + // Match the kernel's canonical MCE format: "mce: [Hardware Error]: ..." + // This excludes the boot-time banner "mce: CPU N supports N MCE banks" + // which does not contain "[Hardware Error]". Name: "Hardware Error", Code: FindingCriticalLog, FingerprintKey: "hardware_error", - Pattern: regexp.MustCompile(`(?i)(hardware error|machine check|mce:)`), + Pattern: regexp.MustCompile(`(?i)mce: \[Hardware Error\]`), Severity: collector.SeverityCritical, Category: "HW", Confidence: collector.ConfidenceHigh, @@ -138,19 +141,44 @@ const maxEvents = 100 var ( pidBracketRe = regexp.MustCompile(`\[\s*\d+\]`) kernelTsRe = regexp.MustCompile(`\[\s*\d+\.\d+\]`) + // Dmesg human-readable timestamps, e.g. "[Fri Feb 6 02:31:46 2026]" + dmesgHumanTsRe = regexp.MustCompile(`\[(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}\]`) + // "(handlerNNN)" patterns in OVS/kernel messages. + handlerRe = regexp.MustCompile(`\(handler\d+\)`) // Matches "from port " patterns in SSH/syslog messages. - ipPortRe = regexp.MustCompile(`from \d+\.\d+\.\d+\.\d+ port \d+`) + ipPortRe = regexp.MustCompile(`from \d+\.\d+\.\d+\.\d+ port \d+`) + // PCI BDF: "0000:05:00.0" — used only for targeted family normalization. 
+ pciBDFNormRe = regexp.MustCompile(`\b[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]\b`) + issuedMsecRe = regexp.MustCompile(`\bissued\s+\d+\s+msec\s+ago\b`) explicitHexRe = regexp.MustCompile(`\b0x[0-9a-fA-F]+\b`) bareHexTokenRe = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`) ) +// isPCIHotplugLine reports whether a log line belongs to a known repetitive +// PCI hotplug message family (pciehp / DOE). BDF addresses and timing values +// in these messages are volatile per-slot identifiers, not diagnostically +// distinct information — they should be normalized for dedup. +func isPCIHotplugLine(line string) bool { + lower := strings.ToLower(line) + return strings.Contains(lower, "pciehp") || + strings.Contains(lower, "data object exchange") +} + // normalizeCriticalLine strips high-cardinality tokens for deduplication. // The original line is preserved in evidence; the normalized form is only // used for dedup keys and fingerprint generation. func normalizeCriticalLine(line string) string { n := pidBracketRe.ReplaceAllString(line, "[_]") n = kernelTsRe.ReplaceAllString(n, "[_]") + n = dmesgHumanTsRe.ReplaceAllString(n, "[_]") + n = handlerRe.ReplaceAllString(n, "(handler_)") n = ipPortRe.ReplaceAllString(n, "_._._._:_") + // Targeted: PCI BDF and timing normalization for known repetitive hotplug families. + // NOT applied globally — a BDF in e.g. "fallen off the bus" is diagnostically meaningful. 
+ if isPCIHotplugLine(n) { + n = pciBDFNormRe.ReplaceAllString(n, "BDF") + n = issuedMsecRe.ReplaceAllString(n, "issued _ msec ago") + } n = explicitHexRe.ReplaceAllString(n, "0xHEX") n = bareHexTokenRe.ReplaceAllStringFunc(n, func(token string) string { if !strings.ContainsAny(strings.ToLower(token), "abcdef") { diff --git a/customers/vm-troubleshooting/internal/triage/critical_test.go b/customers/vm-troubleshooting/internal/triage/critical_test.go index 0126f73..9e2e69f 100644 --- a/customers/vm-troubleshooting/internal/triage/critical_test.go +++ b/customers/vm-troubleshooting/internal/triage/critical_test.go @@ -225,7 +225,7 @@ func TestAnalyzeCriticalLogs_NDJSONPrimarySource(t *testing.T) { // Provide structured NDJSON as the primary source (no text fallback needed). ndjson := `{"MESSAGE":"kernel panic - not syncing: Fatal exception","PRIORITY":"0","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1000","_BOOT_ID":"b1"}` + "\n" + - `{"MESSAGE":"hardware error detected on CPU 0","PRIORITY":"2","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1001","_BOOT_ID":"b1"}` + "\n" + `{"MESSAGE":"mce: [Hardware Error]: Machine check events logged","PRIORITY":"2","SYSLOG_IDENTIFIER":"kernel","_SYSTEMD_UNIT":"","_TRANSPORT":"kernel","__REALTIME_TIMESTAMP":"1001","_BOOT_ID":"b1"}` + "\n" os.WriteFile(filepath.Join(workDir, "logs/journal_kernel.ndjson"), []byte(ndjson), 0o644) tr, err := AnalyzeCriticalLogs(context.Background(), workDir) diff --git a/customers/vm-troubleshooting/schemas/manifest.schema.json b/customers/vm-troubleshooting/schemas/manifest.schema.json index b5b060d..16cd7ca 100644 --- a/customers/vm-troubleshooting/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting/schemas/manifest.schema.json @@ -67,7 +67,8 @@ "enum": [ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", 
"services", "journal", "oom", - "packages", "storage", "infiniband", "processes", "config", "triage" + "packages", "storage", "infiniband", "processes", "config", "triage", + "hypervisor", "ovs" ] } } diff --git a/customers/vm-troubleshooting/schemas/report-record.schema.json b/customers/vm-troubleshooting/schemas/report-record.schema.json index 4786d84..ff0ca0e 100644 --- a/customers/vm-troubleshooting/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting/schemas/report-record.schema.json @@ -26,7 +26,8 @@ "enum": [ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", - "packages", "storage", "infiniband", "processes", "config", "triage" + "packages", "storage", "infiniband", "processes", "config", "triage", + "hypervisor", "ovs" ] } }, From 3f4f5026687e3ee8c77307f1107a51d126e99de0 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Fri, 17 Apr 2026 10:33:17 +0200 Subject: [PATCH 12/23] feat(vm-diagnostics): HW telemetry collectors, deeper net/IB/journal, triage Register EDAC, PCIe AER, IPMI, and thermal collectors; add shared sysfs/PCI helpers, saveJSONProbe, and isPhysicalPort for consistent device classification. Network: skip resolvectl when systemd-resolved is absent; collect physical NIC error counters (sysfs, ethtool, devlink). InfiniBand: gate IB-only tools on infiniband link_layer, keep rdma link on RoCE-only hosts, add perfquery. Journal: boot/previous-boot views, bounded previous-boot errors, /var/crash listing with crash_dump_count and manifest integer-fact typing. Triage: broaden critical kernel/hardware/disk/net patterns; strip dmesg-style timestamps in normalizeCriticalLine so journal vs dmesg fingerprints align. Extend manifest/report schemas (parser hints, tags) for vm-troubleshooting and the dashboard copy; refresh AGENTS.md and CODEMAP.md accordingly. 
--- .../schemas/manifest.schema.json | 6 +- .../schemas/report-record.schema.json | 3 +- customers/vm-troubleshooting/AGENTS.md | 42 +- customers/vm-troubleshooting/CODEMAP.md | 23 +- .../internal/collector/collector.go | 2 + .../internal/collector/collector_test.go | 3 + .../internal/collector/common.go | 14 + .../internal/collector/edac.go | 136 ++++++ .../internal/collector/hypervisor.go | 62 +-- .../internal/collector/infiniband.go | 83 +++- .../internal/collector/infiniband_test.go | 83 ++++ .../internal/collector/ipmi.go | 79 ++++ .../internal/collector/ipmi_test.go | 82 ++++ .../internal/collector/journal.go | 65 +++ .../internal/collector/journal_bounds_test.go | 115 +++++ .../internal/collector/journal_phase3_test.go | 3 + .../internal/collector/network.go | 228 ++++++++++ .../internal/collector/network_iface.go | 50 +++ .../internal/collector/network_test.go | 220 ++++++++++ .../internal/collector/nvidia.go | 114 ++++- .../internal/collector/pci.go | 84 ++++ .../internal/collector/pci_test.go | 135 ++++++ .../internal/collector/pcie.go | 278 +++++++++++++ .../internal/collector/pcie_test.go | 178 ++++++++ .../internal/collector/services.go | 42 +- .../internal/collector/services_test.go | 44 +- .../internal/collector/storage.go | 40 +- .../internal/collector/sysfs.go | 42 ++ .../internal/collector/sysfs_test.go | 77 ++++ .../internal/collector/system.go | 31 ++ .../internal/collector/thermal.go | 322 ++++++++++++++ .../internal/collector/thermal_test.go | 144 +++++++ .../internal/executor/fake.go | 7 + .../internal/output/manifest.go | 12 + .../internal/output/report.go | 2 +- .../internal/runner/runner.go | 6 +- .../internal/triage/critical.go | 393 +++++++++++++++++- .../internal/triage/critical_hw_test.go | 374 +++++++++++++++++ .../schemas/manifest.schema.json | 6 +- .../schemas/report-record.schema.json | 3 +- 40 files changed, 3517 insertions(+), 116 deletions(-) create mode 100644 customers/vm-troubleshooting/internal/collector/edac.go create mode 
100644 customers/vm-troubleshooting/internal/collector/infiniband_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/ipmi.go create mode 100644 customers/vm-troubleshooting/internal/collector/ipmi_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/journal_bounds_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/network_iface.go create mode 100644 customers/vm-troubleshooting/internal/collector/network_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/pci.go create mode 100644 customers/vm-troubleshooting/internal/collector/pci_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/pcie.go create mode 100644 customers/vm-troubleshooting/internal/collector/pcie_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/sysfs.go create mode 100644 customers/vm-troubleshooting/internal/collector/sysfs_test.go create mode 100644 customers/vm-troubleshooting/internal/collector/thermal.go create mode 100644 customers/vm-troubleshooting/internal/collector/thermal_test.go create mode 100644 customers/vm-troubleshooting/internal/triage/critical_hw_test.go diff --git a/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json b/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json index 16cd7ca..3d507fe 100644 --- a/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting-dashboard/schemas/manifest.schema.json @@ -57,7 +57,8 @@ "procfs", "netlink", "sysctl", "ps", "top", "text", "binary", "ss", "mount", "lsmod", "pip", "docker", "nmcli", "networkctl", "resolvectl", "bridge", "netplan", "iptables", "nft", "ufw", "firewall-cmd", "ibstat", "ibstatus", "ibv_devinfo", "rdma", "apt-mark", "sh", - "hostname", "date", "uptime", "uname", "csv" + "hostname", "date", "uptime", "uname", "csv", + "ipmitool", "dmidecode", "ethtool", "devlink", "perfquery" ] }, "tags": { @@ -68,7 +69,8 @@ 
"identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", "packages", "storage", "infiniband", "processes", "config", "triage", - "hypervisor", "ovs" + "hypervisor", "ovs", + "edac", "ipmi", "pcie", "thermal" ] } } diff --git a/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json b/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json index ff0ca0e..f3222a8 100644 --- a/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting-dashboard/schemas/report-record.schema.json @@ -27,7 +27,8 @@ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", "packages", "storage", "infiniband", "processes", "config", "triage", - "hypervisor", "ovs" + "hypervisor", "ovs", + "edac", "ipmi", "pcie", "thermal" ] } }, diff --git a/customers/vm-troubleshooting/AGENTS.md b/customers/vm-troubleshooting/AGENTS.md index 449c0b0..4e9f522 100644 --- a/customers/vm-troubleshooting/AGENTS.md +++ b/customers/vm-troubleshooting/AGENTS.md @@ -83,12 +83,52 @@ Rules for the structured layer: - Preserve deterministic sorting and explicit fingerprint inputs in analyzers. ## Artifact registration -- Every artifact must go through `saveCommand`, `saveFile`, `saveCapturedProbe`, `saveProbeOutput`, or `saveDirConcat` in `common.go`, or use `Writer.ReservePath` + one of the `Add*Artifact` helpers for custom flows. +- Every artifact must go through `saveCommand`, `saveFile`, `saveCapturedProbe`, `saveProbeOutput`, `saveJSONProbe`, or `saveDirConcat` in `common.go`, or use `Writer.ReservePath` + one of the `Add*Artifact` helpers for custom flows. - Every artifact requires a `parserHint` (from `ValidParserHints`) and 1-3 `tags` (from `ValidTags`). 
Both are validated before write — invalid values record an error and skip the write. - Artifact paths are globally unique across all collectors, enforced by `Writer.ReservePath`. Duplicate paths are rejected before write. - Framework-owned paths (`metadata.json`, `manifest.json`, `report.ndjson`, `SUMMARY.txt`, `schemas/*`) are reserved before collectors run. - Writes use atomic temp-file-then-rename. On failure, the reservation is released and no partial file is left on disk. +## Shared collector primitives +The collector package exposes a small, deliberately minimal set of shared +helpers. Before inlining a new version of one of these patterns, use (or +extend) the existing primitive. These exist because the same pattern was +duplicated 2–4 times across collectors; reinventing it creates drift bugs +(HGX audit found `nvidia.nvswitch_present=false` and +`hypervisor.nvswitch.pci_count=4` on the same host — two different NVSwitch +detection paths, two different answers). + +**Sysfs scalar reads** — `internal/collector/sysfs.go`: +- `readSysfsInt(path) int64` — missing/unparsable → 0. Use for error counters and similar "missing is zero" semantics. +- `readSysfsIntOK(path) (int64, bool)` — use when "file absent" and "legitimate zero" must be distinguishable (PCIe link width, CPU topology IDs). +- `readSysfsString(path) string` — trimmed, empty on error. +- Do NOT inline `os.ReadFile` + `strings.TrimSpace` + `strconv.ParseInt` on a sysfs path. Do NOT add a 4th variant of these readers (e.g. `readSysfsTrimmed`). + +**PCI device enumeration** — `internal/collector/pci.go`: +- `iteratePCIDevices() ([]pciDevice, error)` walks `/sys/bus/pci/devices` once, returning BDF/Vendor/Device/Class/Driver for every device. +- `classifyPCI(vendor, class) string` returns `"gpu" | "nvswitch" | "nic" | "storage" | ""` — the canonical kind mapping. +- Do NOT walk `/sys/bus/pci/devices` by hand. 
Do NOT run `lspci` and grep for vendor/class text to classify devices — `lspci` output changes across versions and `classifyPCI` is the single source of truth. `lspci -nn` for a raw customer-facing inventory artifact is fine; classification must go through `classifyPCI`. +- Tests can use `iteratePCIDevicesAt(tmpDir)` with a fake `/sys` tree (see `pci.go`'s doc comment on the indirection). + +**JSON probe artifacts** — `Base.saveJSONProbe(r, path, data, tags...)` in `common.go`: +- Marshals `data` with `json.MarshalIndent`, records `ErrProbeFailed` on marshal failure, writes via `saveProbeOutput` with `hint="json"`. +- Do NOT hand-roll the `json.MarshalIndent` + `RecordError` + `saveProbeOutput` triad. It's one call. + +**Physical NIC filtering** — `isPhysicalPort(iface) bool` in `internal/collector/network_iface.go`: +- Single predicate: `/device` exists AND no `physfn` symlink AND not a `pf*` representor AND not `rndis_host`. +- Do NOT inline `/device` stat checks. Add new NIC-filtering criteria by extending this predicate, not by duplicating it. + +**When NOT to consolidate.** Some patterns look like duplication but aren't: +- Service-active checks. `services.go` intentionally uses three approaches (D-Bus batch, `systemctl list-units` shell fallback, inline `systemctl status` for text parsing) because each has a distinct fallback policy. Don't collapse them into a generic `IsServiceActive` helper. Inline one-line `os.Stat` on a known runtime path (e.g. `/run/systemd/resolve` for `systemd-resolved`) is the right gate. +- Domain-specific block-device enumeration (`/dev/nvme*n*`, `/dev/sd*`) lives in `storage.go` and is not shared — each device family has distinct follow-up commands. +- Test file setup. Use the project's existing `t.TempDir()` + `os.WriteFile` idiom. Do not introduce a `buildFakeSysfs`-style map-literal helper; the inline form reads more clearly. + +**Before extracting a new cross-collector primitive:** +1. 
List every confirmed duplicate call site (file:line). +2. Confirm they can be served by one signature without awkward knobs. +3. If it's 2 callsites and the abstraction needs flags/variants, keep them inline. +4. If callsites have intentional policy variance, document the variance and keep them separate. + ## Coding style - Prefer small structs and explicit dependencies. - Favor straightforward code over framework-heavy patterns. diff --git a/customers/vm-troubleshooting/CODEMAP.md b/customers/vm-troubleshooting/CODEMAP.md index 31a7679..d60a7b9 100644 --- a/customers/vm-troubleshooting/CODEMAP.md +++ b/customers/vm-troubleshooting/CODEMAP.md @@ -113,6 +113,12 @@ Keep this file updated in the same change as architecture or collector changes. ### `internal/collector/` - Owns domain collectors and shared collector helpers. - Keep collectors narrow and artifact-oriented. +- **Shared primitives** — use these instead of inlining new variants: + - `common.go` — `Base.saveCommand`, `saveFile`, `saveProbeOutput`, `saveJSONProbe`, `saveCapturedProbe`, `saveDirConcat`, `saveSkippedArtifact`. + - `sysfs.go` — `readSysfsInt`, `readSysfsIntOK` (presence-distinguishing), `readSysfsString`. + - `pci.go` — `iteratePCIDevices`, `iteratePCIDevicesAt` (test-injectable), `classifyPCI`. + - `network_iface.go` — `isPhysicalPort` (physical NIC predicate). +- Rules for adding new cross-collector helpers: see "Shared collector primitives" in `AGENTS.md`. ## Collector Map @@ -127,8 +133,12 @@ Keep this file updated in the same change as architecture or collector changes. 
| `JournalCollector` | `logs/` | No | `journalctl`/`dmesg` are authoritative; emits text plus bounded sanitized NDJSON (`journal_kernel.ndjson`, `journal_errors.ndjson`) using one fixed `--until` bound per run | | `PackagesCollector` | `packages/` | No | Package managers remain distro authority | | `AdditionalCollector` | `system/`, `hardware/` | Mixed | Limits, sysctl, LVM, sensors, mounts | -| `StorageCollector` | `hardware/` | No | `nvme` and `smartctl` | -| `InfiniBandCollector` | `network/` | No | `ib*` and `rdma` tools | +| `StorageCollector` | `hardware/` | Mixed | `nvme`, `smartctl`, NVMe controller state (sysfs) | +| `InfiniBandCollector` | `network/` | No | `ib*`, `rdma`, and `perfquery` tools | +| `EDACCollector` | `hardware/` | Yes (sysfs) | EDAC memory ECC error counts, optional `ras-mc-ctl` | +| `PCIeCollector` | `hardware/` | Yes (sysfs) | PCIe AER error counters (fatal/non-fatal/correctable) | +| `IPMICollector` | `ipmi/` | No | `ipmitool` BMC/sensor/SEL/chassis data | +| `ThermalCollector` | `hardware/` | Yes (sysfs) | hwmon fans/temps/voltages, thermal zones | ## Native vs Shell Rules - Use native Go for stable kernel/file-backed state. @@ -161,7 +171,14 @@ Keep this file updated in the same change as architecture or collector changes. 1. Prefer an existing helper in `internal/collector/common.go`. 2. Use `saveCommand` for subprocess output. 3. Use `saveFile` or `SaveReadFile` for file-backed content. -4. Record skipped vs errors correctly. +4. Use `saveJSONProbe` for structured JSON — do not hand-roll `json.MarshalIndent` + `saveProbeOutput`. +5. Record skipped vs errors correctly. + +### Probe a sysfs or PCI surface +1. Sysfs scalar reads → `readSysfsInt` / `readSysfsIntOK` / `readSysfsString` (`sysfs.go`). +2. PCI enumeration → `iteratePCIDevices` + `classifyPCI` (`pci.go`). Never grep `lspci` for classification. +3. Physical NIC predicate → `isPhysicalPort` (`network_iface.go`). +4. 
See "Shared collector primitives" in `AGENTS.md` before inventing a new helper. ### Change sanitization 1. Update `internal/sanitize/`. diff --git a/customers/vm-troubleshooting/internal/collector/collector.go b/customers/vm-troubleshooting/internal/collector/collector.go index 7dbedf3..9999d2b 100644 --- a/customers/vm-troubleshooting/internal/collector/collector.go +++ b/customers/vm-troubleshooting/internal/collector/collector.go @@ -47,6 +47,7 @@ var ValidTags = map[string]bool{ "packages": true, "storage": true, "infiniband": true, "processes": true, "config": true, "triage": true, "hypervisor": true, "ovs": true, + "edac": true, "ipmi": true, "pcie": true, "thermal": true, } // ValidParserHints is the controlled vocabulary for parser hints. @@ -64,6 +65,7 @@ var ValidParserHints = map[string]bool{ "ibv_devinfo": true, "rdma": true, "apt-mark": true, "sh": true, "hostname": true, "date": true, "uptime": true, "uname": true, "csv": true, + "ipmitool": true, "dmidecode": true, "ethtool": true, "devlink": true, "perfquery": true, } // ValidateTagsAndHint validates that all tags and the parser hint come from the controlled vocabularies. 
diff --git a/customers/vm-troubleshooting/internal/collector/collector_test.go b/customers/vm-troubleshooting/internal/collector/collector_test.go index 680f421..ae7cc36 100644 --- a/customers/vm-troubleshooting/internal/collector/collector_test.go +++ b/customers/vm-troubleshooting/internal/collector/collector_test.go @@ -139,6 +139,9 @@ func TestJournalCollectorTreatsGrepExitCode1AsNoMatches(t *testing.T) { for _, svc := range []string{"docker", "containerd", "nvidia-persistenced", "nvidia-fabricmanager", "kubelet", "NetworkManager", "systemd-networkd", "systemd-resolved"} { fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} } + // Previous boot and boot history (P2 additions) + fake.Commands["journalctl --list-boots --no-pager"] = executor.FakeResponse{Stdout: []byte("0 boot\n")} + fake.Commands["journalctl -b -1 -p err --no-pager"] = executor.FakeResponse{Stdout: []byte("ok\n")} root := t.TempDir() collector := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) diff --git a/customers/vm-troubleshooting/internal/collector/common.go b/customers/vm-troubleshooting/internal/collector/common.go index dceb66b..cc5c100 100644 --- a/customers/vm-troubleshooting/internal/collector/common.go +++ b/customers/vm-troubleshooting/internal/collector/common.go @@ -2,6 +2,7 @@ package collector import ( "context" + "encoding/json" "fmt" "os" "path/filepath" @@ -67,6 +68,19 @@ func (b Base) saveProbeOutput(r *CollectorResult, path, content, hint string, ta return path } +// saveJSONProbe marshals data as pretty JSON and writes it as a probe +// artifact with hint="json". On marshal failure, records ErrProbeFailed +// against the artifact path (no partial file) and returns "". All write- +// path validation and error recording is delegated to saveProbeOutput. 
+func (b Base) saveJSONProbe(r *CollectorResult, path string, data any, tags ...string) string { + jsonData, err := json.MarshalIndent(data, "", " ") + if err != nil { + r.RecordErrorForArtifact(ErrProbeFailed, fmt.Sprintf("marshal %s: %v", path, err), path) + return "" + } + return b.saveProbeOutput(r, path, string(jsonData)+"\n", "json", tags...) +} + func (b Base) saveCapturedProbe(r *CollectorResult, path string, spec executor.CommandSpec, cmdResult executor.CommandResult, stdout, stderr, hint string, tags []string, notes ...string) string { b.UI.Verbose(fmt.Sprintf(" probe-capture: %s -> %s", spec.String(), path)) if err := ValidateTagsAndHint(hint, tags); err != nil { diff --git a/customers/vm-troubleshooting/internal/collector/edac.go b/customers/vm-troubleshooting/internal/collector/edac.go new file mode 100644 index 0000000..4b6e6e3 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/edac.go @@ -0,0 +1,136 @@ +package collector + +import ( + "context" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/NexGenCloud/vm-diagnostics/internal/config" + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +type EDACCollector struct{ Base } + +func NewEDACCollector(exec executor.Executor, writer *output.Writer, ui ui.UI) *EDACCollector { + return &EDACCollector{Base{Exec: exec, Writer: writer, UI: ui}} +} + +func (c *EDACCollector) Name() string { return "EDAC" } +func (c *EDACCollector) ID() string { return "edac" } + +// edacMCData holds per-memory-controller error counts. 
+type edacMCData struct { + ID string `json:"id"` + UECount int64 `json:"ue_count"` + UENoInfoCount int64 `json:"ue_noinfo_count"` + CECount int64 `json:"ce_count"` + CENoInfoCount int64 `json:"ce_noinfo_count"` + DIMMs []edacDIMMData `json:"dimms,omitempty"` +} + +type edacDIMMData struct { + ID string `json:"id"` + Label string `json:"label,omitempty"` + Location string `json:"location,omitempty"` + UECount int64 `json:"ue_count"` + CECount int64 `json:"ce_count"` +} + +type edacStatus struct { + Present bool `json:"present"` + MCs []edacMCData `json:"memory_controllers"` +} + +func (c *EDACCollector) Collect(ctx context.Context) (*CollectorResult, error) { + r := NewResult() + + edacBase := "/sys/devices/system/edac/mc" + entries, err := os.ReadDir(edacBase) + if err != nil { + r.SetFact("edac.present", "false") + r.RecordSkip(SkipSourceUnavailable, "EDAC sysfs not present") + return r, nil + } + + r.SetFact("edac.present", "true") + status := edacStatus{Present: true} + var totalUE, totalCE int64 + dimmCount := 0 + + for _, entry := range entries { + if !strings.HasPrefix(entry.Name(), "mc") { + continue + } + mcPath := filepath.Join(edacBase, entry.Name()) + mc := edacMCData{ID: entry.Name()} + + mc.UECount = readSysfsInt(filepath.Join(mcPath, "ue_count")) + mc.UENoInfoCount = readSysfsInt(filepath.Join(mcPath, "ue_noinfo_count")) + mc.CECount = readSysfsInt(filepath.Join(mcPath, "ce_count")) + mc.CENoInfoCount = readSysfsInt(filepath.Join(mcPath, "ce_noinfo_count")) + + totalUE += mc.UECount + totalCE += mc.CECount + + // Per-DIMM data: ordered fallback — dimm* first, else rank*, else + // csrow* (below). A controller exposes exactly ONE of these layouts; + // do NOT union them or dimm_count will double-count. 
+ dimmEntries, _ := filepath.Glob(filepath.Join(mcPath, "dimm*")) + if len(dimmEntries) == 0 { + // rank* layout used by skx_edac, i10nm_edac on newer Intel systems + dimmEntries, _ = filepath.Glob(filepath.Join(mcPath, "rank*")) + } + for _, dimmPath := range dimmEntries { + dimmName := filepath.Base(dimmPath) + dimm := edacDIMMData{ + ID: dimmName, + Label: readSysfsString(filepath.Join(dimmPath, "dimm_label")), + Location: readSysfsString(filepath.Join(dimmPath, "dimm_location")), + UECount: readSysfsInt(filepath.Join(dimmPath, "dimm_ue_count")), + CECount: readSysfsInt(filepath.Join(dimmPath, "dimm_ce_count")), + } + mc.DIMMs = append(mc.DIMMs, dimm) + dimmCount++ + } + + // Legacy csrow layout fallback (if no dimm*/rank* entries found) + if len(mc.DIMMs) == 0 { + csrowEntries, _ := filepath.Glob(filepath.Join(mcPath, "csrow*")) + for _, csrowPath := range csrowEntries { + csrowName := filepath.Base(csrowPath) + dimm := edacDIMMData{ + ID: csrowName, + UECount: readSysfsInt(filepath.Join(csrowPath, "ue_count")), + CECount: readSysfsInt(filepath.Join(csrowPath, "ce_count")), + } + mc.DIMMs = append(mc.DIMMs, dimm) + dimmCount++ + } + } + + status.MCs = append(status.MCs, mc) + } + + r.SetFact("edac.ue_total", strconv.FormatInt(totalUE, 10)) + r.SetFact("edac.ce_total", strconv.FormatInt(totalCE, 10)) + r.SetFact("edac.dimm_count", strconv.Itoa(dimmCount)) + + // Write structured JSON artifact + c.saveJSONProbe(r, "hardware/edac_status.json", status, "edac", "memory") + + // Optional: ras-mc-ctl if available + if c.Exec.CommandExists("ras-mc-ctl") { + c.saveCommand(ctx, r, "hardware/rasdaemon_summary.txt", executor.CommandSpec{ + Name: "ras-mc-ctl", Args: []string{"--summary"}, NeedsRoot: true, Timeout: config.TimeoutQuick, + }, "text", "edac", "memory") + c.saveCommand(ctx, r, "hardware/rasdaemon_errors.txt", executor.CommandSpec{ + Name: "ras-mc-ctl", Args: []string{"--errors"}, NeedsRoot: true, Timeout: config.TimeoutQuick, + }, "text", "edac", "memory") + } + 
+ return r, nil +} diff --git a/customers/vm-troubleshooting/internal/collector/hypervisor.go b/customers/vm-troubleshooting/internal/collector/hypervisor.go index b20ac89..79d0423 100644 --- a/customers/vm-troubleshooting/internal/collector/hypervisor.go +++ b/customers/vm-troubleshooting/internal/collector/hypervisor.go @@ -141,27 +141,30 @@ func (c *HypervisorCollector) saveIOMMUGroups(r *CollectorResult) { } func (c *HypervisorCollector) saveVFIOBindings(r *CollectorResult) { - entries, err := os.ReadDir("/sys/bus/pci/drivers/vfio-pci") - if err != nil { + if _, err := os.Stat("/sys/bus/pci/drivers/vfio-pci"); err != nil { c.saveSkippedArtifact(r, "hypervisor/vfio_bindings.txt", "file", "", "/sys/bus/pci/drivers/vfio-pci", SkipSourceUnavailable, "/sys/bus/pci/drivers/vfio-pci: unavailable", "hypervisor") r.SetFact("vfio.bound_devices", "0") return } + devs, err := iteratePCIDevices() + if err != nil { + c.saveSkippedArtifact(r, "hypervisor/vfio_bindings.txt", "file", "", + "/sys/bus/pci/devices", SkipSourceUnavailable, + "/sys/bus/pci/devices: unavailable", "hypervisor") + r.SetFact("vfio.bound_devices", "0") + return + } var buf strings.Builder count := 0 - for _, entry := range entries { - if !strings.Contains(entry.Name(), ":") { + for _, d := range devs { + if d.Driver != "vfio-pci" { continue } count++ - base := filepath.Join("/sys/bus/pci/drivers/vfio-pci", entry.Name()) - vendor := readSysfsTrimmed(filepath.Join(base, "vendor")) - device := readSysfsTrimmed(filepath.Join(base, "device")) - class := readSysfsTrimmed(filepath.Join(base, "class")) buf.WriteString(fmt.Sprintf("%s vendor=%s device=%s class=%s\n", - entry.Name(), vendor, device, class)) + d.BDF, d.Vendor, d.Device, d.Class)) } r.SetFact("vfio.bound_devices", strconv.Itoa(count)) if buf.Len() == 0 { @@ -174,7 +177,7 @@ func (c *HypervisorCollector) saveVFIOBindings(r *CollectorResult) { // GPUs: vendor 10de + display class (0x0300xx / 0x0302xx) // NVSwitches: vendor 10de + bridge class 
(0x0680xx) func (c *HypervisorCollector) savePCIInventory(r *CollectorResult) { - entries, err := os.ReadDir("/sys/bus/pci/devices") + devs, err := iteratePCIDevices() if err != nil { r.SetFact("gpu.pci_nvidia_count", "unavailable") r.SetFact("gpu.vfio_bound_count", "unavailable") @@ -188,26 +191,17 @@ func (c *HypervisorCollector) savePCIInventory(r *CollectorResult) { gpuHost := 0 nvSwitchTotal := 0 - for _, entry := range entries { - base := filepath.Join("/sys/bus/pci/devices", entry.Name()) - vendor := readSysfsTrimmed(filepath.Join(base, "vendor")) - if strings.ToLower(vendor) != "0x10de" { - continue - } - cls := strings.ToLower(readSysfsTrimmed(filepath.Join(base, "class"))) - driverLink, _ := os.Readlink(filepath.Join(base, "driver")) - isVFIO := strings.HasSuffix(driverLink, "/vfio-pci") - hasDriver := driverLink != "" && !isVFIO - - switch { - case strings.HasPrefix(cls, "0x0300") || strings.HasPrefix(cls, "0x0302"): + for _, d := range devs { + switch classifyPCI(d.Vendor, d.Class) { + case "gpu": gpuTotal++ - if isVFIO { + switch { + case d.Driver == "vfio-pci": gpuVFIO++ - } else if hasDriver { + case d.Driver != "": gpuHost++ } - case strings.HasPrefix(cls, "0x0680"): + case "nvswitch": nvSwitchTotal++ } } @@ -241,7 +235,7 @@ func (c *HypervisorCollector) saveHugepageFacts(r *CollectorResult) { if parseErr != nil { continue } - totalStr := readSysfsTrimmed(filepath.Join("/sys/kernel/mm/hugepages", entry.Name(), "nr_hugepages")) + totalStr := readSysfsString(filepath.Join("/sys/kernel/mm/hugepages", entry.Name(), "nr_hugepages")) total, parseErr := strconv.ParseInt(totalStr, 10, 64) if parseErr != nil { continue @@ -265,12 +259,12 @@ func (c *HypervisorCollector) saveHugepageFacts(r *CollectorResult) { r.SetFact("hugepages.total", strconv.FormatInt(bestTotal, 10)) r.SetFact("hugepages.size_kb", strconv.FormatInt(bestSize, 10)) - if free, err := strconv.ParseInt(readSysfsTrimmed(filepath.Join(base, "free_hugepages")), 10, 64); err == nil { + if free, 
err := strconv.ParseInt(readSysfsString(filepath.Join(base, "free_hugepages")), 10, 64); err == nil { r.SetFact("hugepages.free", strconv.FormatInt(free, 10)) } else { r.SetFact("hugepages.free", "unavailable") } - if resv, err := strconv.ParseInt(readSysfsTrimmed(filepath.Join(base, "resv_hugepages")), 10, 64); err == nil { + if resv, err := strconv.ParseInt(readSysfsString(filepath.Join(base, "resv_hugepages")), 10, 64); err == nil { r.SetFact("hugepages.reserved", strconv.FormatInt(resv, 10)) } else { r.SetFact("hugepages.reserved", "unavailable") @@ -459,13 +453,3 @@ func countNumaNodes() int { } return count } - -// readSysfsTrimmed reads a sysfs file and returns its trimmed content, -// or an empty string on error. -func readSysfsTrimmed(path string) string { - data, err := os.ReadFile(path) - if err != nil { - return "" - } - return strings.TrimSpace(string(data)) -} diff --git a/customers/vm-troubleshooting/internal/collector/infiniband.go b/customers/vm-troubleshooting/internal/collector/infiniband.go index 6f14a39..2ea2f29 100644 --- a/customers/vm-troubleshooting/internal/collector/infiniband.go +++ b/customers/vm-troubleshooting/internal/collector/infiniband.go @@ -2,6 +2,9 @@ package collector import ( "context" + "os" + "path/filepath" + "strings" "github.com/NexGenCloud/vm-diagnostics/internal/config" "github.com/NexGenCloud/vm-diagnostics/internal/executor" @@ -18,24 +21,90 @@ func NewInfiniBandCollector(exec executor.Executor, writer *output.Writer, ui ui func (c *InfiniBandCollector) Name() string { return "InfiniBand" } func (c *InfiniBandCollector) ID() string { return "infiniband" } +// ibPresence inspects /sys/class/infiniband/ to determine what RDMA surface +// this host exposes. Returns two booleans: +// - hasAny: at least one entry under /sys/class/infiniband/ +// - hasIB: at least one port with link_layer == "infiniband" +// +// Ethernet-only RDMA devices (mlx5 in RoCE mode) return (true, false). 
+// Hosts without an RDMA stack return (false, false). +func ibPresence() (hasAny, hasIB bool) { + return ibPresenceAt("/sys/class/infiniband") +} + +// ibPresenceAt is the path-injectable core of ibPresence. +func ibPresenceAt(ibRoot string) (hasAny, hasIB bool) { + entries, err := os.ReadDir(ibRoot) + if err != nil || len(entries) == 0 { + return false, false + } + hasAny = true + for _, dev := range entries { + portsDir := filepath.Join(ibRoot, dev.Name(), "ports") + ports, err := os.ReadDir(portsDir) + if err != nil { + continue + } + for _, port := range ports { + layer := strings.ToLower(readSysfsString(filepath.Join(portsDir, port.Name(), "link_layer"))) + if layer == "infiniband" { + return hasAny, true + } + } + } + return hasAny, false +} + func (c *InfiniBandCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() - collected := false - for _, spec := range []struct { + + // Presence matrix: + // !hasAny → skip the whole collector (no RDMA stack at all) + // hasAny && !hasIB → only `rdma link show` (ethernet-mode mlx5 / RoCE) + // else → run everything (native IB or mixed IB+RoCE) + hasAny, hasIB := ibPresence() + if !hasAny { + r.RecordSkip(SkipNotApplicable, "no RDMA devices in /sys/class/infiniband") + return r, nil + } + + specs := []struct { path, name string args []string + ibOnly bool // requires an infiniband-mode port to be meaningful }{ - {"network/ibstat.txt", "ibstat", nil}, - {"network/ibstatus.txt", "ibstatus", nil}, - {"network/ibv_devinfo.txt", "ibv_devinfo", []string{"-v"}}, - {"network/rdma_link.txt", "rdma", []string{"link", "show"}}, - } { + {"network/ibstat.txt", "ibstat", nil, true}, + {"network/ibstatus.txt", "ibstatus", nil, true}, + {"network/ibv_devinfo.txt", "ibv_devinfo", []string{"-v"}, true}, + {"network/rdma_link.txt", "rdma", []string{"link", "show"}, false}, + } + + collected := false + for _, spec := range specs { + if spec.ibOnly && !hasIB { + // Ethernet-mode mlx5 / RoCE: the IB tools would 
exit non-zero. + c.saveSkippedArtifact(r, spec.path, "command", spec.name, "/sys/class/infiniband", + SkipNotApplicable, "no infiniband-mode ports (RoCE or ethernet-mode only)", "infiniband") + continue + } if !c.Exec.CommandExists(spec.name) { + c.saveSkippedArtifact(r, spec.path, "command", spec.name, "", + SkipCommandUnavailable, spec.name+": unavailable", "infiniband") + continue + } collected = true c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, Timeout: config.TimeoutQuick}, spec.name, "infiniband") } + + // perfquery: per-port error counters (package: infiniband-diags). Only + // meaningful when at least one IB port is active. + if hasIB && c.Exec.CommandExists("perfquery") { + collected = true + c.saveCommand(ctx, r, "network/perfquery.txt", executor.CommandSpec{ + Name: "perfquery", NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true, + }, "perfquery", "infiniband") + } + if !collected { r.RecordSkip(SkipCommandUnavailable, "InfiniBand tools unavailable") } diff --git a/customers/vm-troubleshooting/internal/collector/infiniband_test.go b/customers/vm-troubleshooting/internal/collector/infiniband_test.go new file mode 100644 index 0000000..d68e72e --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/infiniband_test.go @@ -0,0 +1,83 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" +) + +// stageIBPort writes {ibRoot}/{dev}/ports/{port}/link_layer = {layer}.
+func stageIBPort(t *testing.T, ibRoot, dev, port, layer string) { + t.Helper() + portDir := filepath.Join(ibRoot, dev, "ports", port) + if err := os.MkdirAll(portDir, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(portDir, "link_layer"), []byte(layer+"\n"), 0o644); err != nil { + t.Fatal(err) + } +} + +// TestIBPresenceAt covers the four decision scenarios that drive collector +// behavior (R20): no RDMA → skip; ethernet-only RDMA (RoCE) → only +// `rdma link show`; infiniband-mode → run all tools; mixed → run all tools. +// Each scenario is staged against a temp fake sysfs. +func TestIBPresenceAt(t *testing.T) { + t.Parallel() + + t.Run("no /sys/class/infiniband at all", func(t *testing.T) { + t.Parallel() + root := filepath.Join(t.TempDir(), "does_not_exist") + hasAny, hasIB := ibPresenceAt(root) + if hasAny || hasIB { + t.Errorf("want (false,false), got (%v,%v)", hasAny, hasIB) + } + }) + + t.Run("directory exists but empty", func(t *testing.T) { + t.Parallel() + root := t.TempDir() + if err := os.MkdirAll(root, 0o755); err != nil { + t.Fatal(err) + } + hasAny, hasIB := ibPresenceAt(root) + if hasAny || hasIB { + t.Errorf("want (false,false), got (%v,%v)", hasAny, hasIB) + } + }) + + t.Run("ethernet-only (RoCE) mlx5", func(t *testing.T) { + t.Parallel() + root := t.TempDir() + stageIBPort(t, root, "mlx5_0", "1", "Ethernet") + stageIBPort(t, root, "mlx5_0", "2", "Ethernet") + hasAny, hasIB := ibPresenceAt(root) + if !hasAny { + t.Error("hasAny: want true (device exists)") + } + if hasIB { + t.Error("hasIB: want false (no infiniband-mode ports)") + } + }) + + t.Run("infiniband-mode", func(t *testing.T) { + t.Parallel() + root := t.TempDir() + stageIBPort(t, root, "mlx5_0", "1", "InfiniBand") + hasAny, hasIB := ibPresenceAt(root) + if !hasAny || !hasIB { + t.Errorf("want (true,true), got (%v,%v)", hasAny, hasIB) + } + }) + + t.Run("mixed IB + Ethernet", func(t *testing.T) { + t.Parallel() + root := t.TempDir() + stageIBPort(t, root, 
"mlx5_0", "1", "InfiniBand") + stageIBPort(t, root, "mlx5_1", "1", "Ethernet") + hasAny, hasIB := ibPresenceAt(root) + if !hasAny || !hasIB { + t.Errorf("want (true,true), got (%v,%v)", hasAny, hasIB) + } + }) +} diff --git a/customers/vm-troubleshooting/internal/collector/ipmi.go b/customers/vm-troubleshooting/internal/collector/ipmi.go new file mode 100644 index 0000000..d9218e2 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/ipmi.go @@ -0,0 +1,79 @@ +package collector + +import ( + "context" + "time" + + "github.com/NexGenCloud/vm-diagnostics/internal/config" + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +type IPMICollector struct{ Base } + +func NewIPMICollector(exec executor.Executor, writer *output.Writer, ui ui.UI) *IPMICollector { + return &IPMICollector{Base{Exec: exec, Writer: writer, UI: ui}} +} + +func (c *IPMICollector) Name() string { return "IPMI" } +func (c *IPMICollector) ID() string { return "ipmi" } + +// Collect runs ipmitool sub-commands and records their raw output as +// artifacts. Structured parsing of SEL events, SDR sensor readings, and +// chassis fields (e.g. ipmi.critical_sensor_count, ipmi.sel_critical_event_count) +// is intentionally deferred to the dashboard layer: SEL/SDR semantics vary +// across BMC vendors and kernel versions, and an evolving parser belongs +// with the consumer, not the collector. This boundary keeps the archive's +// raw data stable and lets the dashboard iterate on interpretation without +// forcing a gather-info re-release. +func (c *IPMICollector) Collect(ctx context.Context) (*CollectorResult, error) { + r := NewResult() + + if !c.Exec.CommandExists("ipmitool") { + r.SetFact("ipmi.present", "false") + r.RecordSkip(SkipCommandUnavailable, "ipmitool unavailable") + return r, nil + } + + // Probe: does the BMC respond? 
A busy BMC on a large chassis can take + // 10-20s to return mc info; TimeoutQuick would misreport "no BMC" on + // exactly the hosts most likely to need BMC triage. + probeSpec := executor.CommandSpec{ + Name: "ipmitool", Args: []string{"mc", "info"}, + NeedsRoot: true, Timeout: config.TimeoutMedium, + } + probeResult, _, _ := c.Exec.Capture(ctx, probeSpec, 64*1024) + if probeResult.Err != nil || probeResult.Skipped { + r.SetFact("ipmi.present", "false") + r.RecordSkip(SkipSourceUnavailable, "IPMI/BMC not accessible") + return r, nil + } + + r.SetFact("ipmi.present", "true") + + // Large chassis (300+ sensors, months of SEL) can take 2-3 minutes for + // full sdr/sel enumeration. TimeoutMedium would silently truncate the + // output on hosts most likely to need thorough BMC triage, so allow the + // slow tier for those two commands specifically. The rest remain quick. + for _, spec := range []struct { + path string + args []string + timeout time.Duration + }{ + {"ipmi/bmc_info.txt", []string{"mc", "info"}, config.TimeoutMedium}, + {"ipmi/bmc_selftest.txt", []string{"mc", "selftest"}, config.TimeoutMedium}, + {"ipmi/sdr_list.txt", []string{"sdr", "list", "full"}, config.TimeoutSlow}, + {"ipmi/sensor_list.txt", []string{"sensor", "list"}, config.TimeoutMedium}, + {"ipmi/sel_events.txt", []string{"sel", "elist"}, config.TimeoutSlow}, + {"ipmi/sel_info.txt", []string{"sel", "info"}, config.TimeoutMedium}, + {"ipmi/chassis_status.txt", []string{"chassis", "status"}, config.TimeoutMedium}, + } { + c.saveCommand(ctx, r, spec.path, executor.CommandSpec{ + Name: "ipmitool", Args: spec.args, + NeedsRoot: true, Timeout: spec.timeout, + }, "ipmitool", "ipmi", "hardware") + } + + return r, nil +} diff --git a/customers/vm-troubleshooting/internal/collector/ipmi_test.go b/customers/vm-troubleshooting/internal/collector/ipmi_test.go new file mode 100644 index 0000000..f408016 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/ipmi_test.go @@ -0,0 +1,82 @@ 
+package collector + +import ( + "context" + "testing" + "time" + + "github.com/NexGenCloud/vm-diagnostics/internal/config" + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +// TestIPMITimeouts locks in R22: `sdr list full` and `sel elist` must use +// TimeoutSlow (300s) because large chassis (300+ sensors, months of SEL) +// legitimately take 2-3 minutes. TimeoutMedium (60s) would silently +// truncate on exactly the hosts most likely to need thorough BMC triage. +func TestIPMITimeouts(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["ipmitool"] = true + // BMC probe succeeds so the full command set runs. + fake.Commands["ipmitool mc info"] = executor.FakeResponse{Stdout: []byte("Device ID: 23\n")} + // All others can return nothing; we only care about the spec recorded. + + root := t.TempDir() + c := NewIPMICollector(fake, output.NewWriter(root), ui.NoopUI{}) + if _, err := c.Collect(context.Background()); err != nil { + t.Fatal(err) + } + + // Check each artifact spec in the loop. The pre-flight `mc info` probe + // runs with TimeoutQuick and is intentionally not asserted here (it's + // a single-shot reachability check, distinct from the artifact spec). + checkArtifactTimeout := func(args []string, want time.Duration) { + t.Helper() + found := false + for _, got := range fake.Calls { + if got.Name != "ipmitool" { + continue + } + if got.Timeout == config.TimeoutQuick { + continue // skip the reachability probe + } + if argsEqual(got.Args, args) { + found = true + if got.Timeout != want { + t.Errorf("%v: timeout want %v, got %v", args, want, got.Timeout) + } + } + } + if !found { + t.Errorf("expected artifact spec with args %v, never saw one", args) + } + } + + // The slow tier applies only to the two heavyweight commands. 
+ checkArtifactTimeout([]string{"sdr", "list", "full"}, config.TimeoutSlow) + checkArtifactTimeout([]string{"sel", "elist"}, config.TimeoutSlow) + + // The rest stay at medium — bumping everything to slow would waste + // real runtime on healthy hosts with light BMCs. + checkArtifactTimeout([]string{"mc", "info"}, config.TimeoutMedium) + checkArtifactTimeout([]string{"mc", "selftest"}, config.TimeoutMedium) + checkArtifactTimeout([]string{"sensor", "list"}, config.TimeoutMedium) + checkArtifactTimeout([]string{"sel", "info"}, config.TimeoutMedium) + checkArtifactTimeout([]string{"chassis", "status"}, config.TimeoutMedium) +} + +func argsEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} diff --git a/customers/vm-troubleshooting/internal/collector/journal.go b/customers/vm-troubleshooting/internal/collector/journal.go index 42ff284..ae19e17 100644 --- a/customers/vm-troubleshooting/internal/collector/journal.go +++ b/customers/vm-troubleshooting/internal/collector/journal.go @@ -6,6 +6,7 @@ import ( "context" "encoding/json" "fmt" + "os" "strconv" "strings" "time" @@ -163,9 +164,73 @@ func (c *JournalCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveCommand(ctx, r, fmt.Sprintf("logs/journal_%s.txt", svc), executor.CommandSpec{Name: "journalctl", Args: append(append([]string{}, journalArgs...), "-u", svc), NeedsRoot: true, Timeout: config.TimeoutMedium}, "journalctl", "journal", "services") } + // Previous boot and crash history. + c.saveCommand(ctx, r, "logs/boot_history.txt", executor.CommandSpec{ + Name: "journalctl", Args: []string{"--list-boots", "--no-pager"}, + NeedsRoot: true, Timeout: config.TimeoutQuick, + }, "journalctl", "journal") + // --lines=1000: long-uptime hosts (100+ days) can accumulate months of + // previous-boot errors. 
1000 lines is enough for triage and bounds the + // archive; unbounded output risks bloat without adding signal. + c.saveCommand(ctx, r, "logs/previous_boot_errors.txt", executor.CommandSpec{ + Name: "journalctl", Args: []string{"-b", "-1", "-p", "err", "--lines=1000", "--no-pager"}, + NeedsRoot: true, Timeout: config.TimeoutMedium, IgnoreExit: true, + }, "journalctl", "journal") + + // /var/crash/ listing (crash dumps presence) + c.saveCrashDumpListing(r) + return r, nil } +// saveCrashDumpListing lists /var/crash/ contents (names only, no vmcore data). +// +// Emits the crash_dump_count fact with three distinct states — the dashboard +// needs to distinguish "not checked" from "checked, zero": +// +// - "unavailable" — /var/crash absent (no kdump configured, most common), +// or present but unreadable. A read error also records ErrProbeFailed +// so the condition surfaces in the artifact error log. +// - "0" — directory exists and is empty. Genuine clean state. +// - "N" — N entries present (vmcore dumps and siblings). +// +// The count is added to output.integerFactKeys so "unavailable" maps to +// JSON null in the typed manifest. +func (c *JournalCollector) saveCrashDumpListing(r *CollectorResult) { + c.saveCrashDumpListingAt(r, "/var/crash") +} + +// saveCrashDumpListingAt is the path-injectable core of saveCrashDumpListing. +// Production callers use /var/crash; tests point at a TempDir. +func (c *JournalCollector) saveCrashDumpListingAt(r *CollectorResult, crashDir string) { + entries, err := os.ReadDir(crashDir) + if err != nil { + // Absent and permission-denied both map to unavailable. Only record + // an error for the permission case so "no kdump" doesn't spam the + // error log on every minimal system. 
+ r.SetFact("crash_dump_count", "unavailable") + if !os.IsNotExist(err) { + r.RecordError(ErrProbeFailed, fmt.Sprintf("read %s: %v", crashDir, err)) + } + return + } + r.SetFact("crash_dump_count", fmt.Sprintf("%d", len(entries))) + if len(entries) == 0 { + return + } + var buf strings.Builder + buf.WriteString(fmt.Sprintf("Contents of %s/:\n", crashDir)) + for _, entry := range entries { + info, err := entry.Info() + if err != nil { + buf.WriteString(fmt.Sprintf(" %s (stat error)\n", entry.Name())) + continue + } + buf.WriteString(fmt.Sprintf(" %s %d bytes %s\n", entry.Name(), info.Size(), info.ModTime().Format("2006-01-02 15:04:05"))) + } + c.saveProbeOutput(r, "system/crash_dumps.txt", buf.String(), "text", "journal") +} + func (c *JournalCollector) journalBaseArgs(until string) []string { args := []string{"--no-pager", "--until=" + until} if c.Since == "" || c.Since == "boot" { diff --git a/customers/vm-troubleshooting/internal/collector/journal_bounds_test.go b/customers/vm-troubleshooting/internal/collector/journal_bounds_test.go new file mode 100644 index 0000000..e703ac5 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/journal_bounds_test.go @@ -0,0 +1,115 @@ +package collector + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +// TestJournalPreviousBootBounded locks in R23: the previous-boot errors +// journalctl call must carry --lines=1000. Long-uptime hosts (100+ days) +// accumulate months of previous-boot errors; unbounded output bloats the +// archive without adding signal. This test ensures a refactor or a plan +// reviewer can't silently drop the bound. 
+func TestJournalPreviousBootBounded(t *testing.T) { + t.Parallel() + + fake := executor.NewFake() + fake.RootAccess = true + fake.Binaries["journalctl"] = true + fake.Binaries["dmesg"] = true + + root := t.TempDir() + c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "1 hour ago", false) + if _, err := c.Collect(context.Background()); err != nil { + t.Fatal(err) + } + + // Find the journalctl spec that requests previous-boot (-b -1) with + // priority=err. That's the specific call R23 guards. + var prev *executor.CommandSpec + for i := range fake.Calls { + spec := fake.Calls[i] + if spec.Name != "journalctl" { + continue + } + joined := strings.Join(spec.Args, " ") + if strings.Contains(joined, "-b -1") && strings.Contains(joined, "-p err") { + prev = &spec + break + } + } + if prev == nil { + t.Fatal("no previous-boot journalctl spec found among recorded calls") + } + + foundLines := false + for _, a := range prev.Args { + if a == "--lines=1000" { + foundLines = true + break + } + } + if !foundLines { + t.Errorf("previous-boot spec missing --lines=1000; args=%v", prev.Args) + } +} + +// TestCrashDumpCountThreeState locks in R32. The dashboard draws different +// panels for "checked, zero" vs "never checked"; conflating them silently +// misrepresents minimal systems without kdump as clean-rebooted machines. +func TestCrashDumpCountThreeState(t *testing.T) { + t.Parallel() + + collectorFor := func() *JournalCollector { + return NewJournalCollector(executor.NewFake(), output.NewWriter(t.TempDir()), ui.NoopUI{}, "1 hour ago", false) + } + + t.Run("absent directory reports unavailable", func(t *testing.T) { + t.Parallel() + c := collectorFor() + r := NewResult() + c.saveCrashDumpListingAt(r, filepath.Join(t.TempDir(), "does_not_exist")) + if got := r.Facts["crash_dump_count"]; got != "unavailable" { + t.Errorf("absent: want %q, got %q", "unavailable", got) + } + // Absent is the normal non-kdump case; must NOT record an error. 
+ for _, e := range r.Errors { + if strings.Contains(e.Message, "does_not_exist") { + t.Errorf("absent should not record error, got: %+v", e) + } + } + }) + + t.Run("empty directory reports zero", func(t *testing.T) { + t.Parallel() + c := collectorFor() + r := NewResult() + c.saveCrashDumpListingAt(r, t.TempDir()) + if got := r.Facts["crash_dump_count"]; got != "0" { + t.Errorf("empty: want %q, got %q", "0", got) + } + }) + + t.Run("populated directory reports count", func(t *testing.T) { + t.Parallel() + dir := t.TempDir() + for _, name := range []string{"202601010101", "202601020202", "README"} { + if err := os.Mkdir(filepath.Join(dir, name), 0o755); err != nil { + t.Fatal(err) + } + } + c := collectorFor() + r := NewResult() + c.saveCrashDumpListingAt(r, dir) + if got := r.Facts["crash_dump_count"]; got != "3" { + t.Errorf("populated: want %q, got %q", "3", got) + } + }) +} diff --git a/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go index 5b927ff..4f82bf4 100644 --- a/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go +++ b/customers/vm-troubleshooting/internal/collector/journal_phase3_test.go @@ -43,6 +43,9 @@ func TestJournalCollectorWritesParseableStructuredNDJSON(t *testing.T) { for _, svc := range journalServiceUnits { fake.Commands["journalctl "+base+" -u "+svc] = executor.FakeResponse{Stdout: []byte("ok\n")} } + // Previous boot and boot history (P2 additions) + fake.Commands["journalctl --list-boots --no-pager"] = executor.FakeResponse{Stdout: []byte("0 boot\n")} + fake.Commands["journalctl -b -1 -p err --no-pager"] = executor.FakeResponse{Stdout: []byte("ok\n")} root := t.TempDir() c := NewJournalCollector(fake, output.NewWriter(root), ui.NoopUI{}, "boot", false) diff --git a/customers/vm-troubleshooting/internal/collector/network.go b/customers/vm-troubleshooting/internal/collector/network.go index 724ff3b..d59928a 100644 --- 
a/customers/vm-troubleshooting/internal/collector/network.go +++ b/customers/vm-troubleshooting/internal/collector/network.go @@ -2,8 +2,10 @@ package collector import ( "context" + "encoding/json" "fmt" "os" + "strconv" "strings" "github.com/NexGenCloud/vm-diagnostics/internal/config" @@ -68,6 +70,15 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error } { c.saveDirConcat(r, dir.dest, dir.path, sanitize.SensitiveConfig, "network", "config") } + // systemd-resolved creates /run/systemd/resolve/ while active and removes + // it via ExecStopPost. Its presence is the authoritative runtime signal + // for "resolvectl will have something to talk to." When absent, the + // resolvectl commands would exit non-zero with noise — record as a + // daemon-unavailable skip instead. + resolvedActive := false + if _, err := os.Stat("/run/systemd/resolve"); err == nil { + resolvedActive = true + } for _, spec := range []struct { path string cmd executor.CommandSpec @@ -89,6 +100,11 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error c.saveSkippedArtifact(r, spec.path, "command", spec.cmd.String(), "", SkipCommandUnavailable, spec.cmd.Name+": unavailable", "network") continue } + if spec.cmd.Name == "resolvectl" && !resolvedActive { + c.saveSkippedArtifact(r, spec.path, "command", spec.cmd.String(), "systemd-resolved", + SkipDaemonUnavailable, "systemd-resolved not running (no /run/systemd/resolve)", "network") + continue + } c.saveCommand(ctx, r, spec.path, spec.cmd, spec.hint, "network") } if _, err := os.Stat("/etc/NetworkManager/system-connections"); err == nil && c.Exec.CommandExists("ls") { @@ -129,9 +145,221 @@ func (c *NetworkCollector) Collect(ctx context.Context) (*CollectorResult, error } c.saveCommand(ctx, r, spec.path, executor.CommandSpec{Name: spec.name, Args: spec.args, NeedsRoot: spec.root, IgnoreExit: spec.ignoreExit, Timeout: config.TimeoutMedium}, spec.hint, spec.tags...) 
} + // NIC hardware error counters (sysfs + ethtool + devlink) + c.collectNICErrors(ctx, r) + return r, nil } +// collectNICErrors reads physical NIC error counters from sysfs, runs ethtool +// and devlink health for hardware fault evidence. +func (c *NetworkCollector) collectNICErrors(ctx context.Context, r *CollectorResult) { + netEntries, err := os.ReadDir("/sys/class/net") + if err != nil { + return + } + + hwErrorIfaces := 0 + linkFlapIfaces := 0 + var nicReport strings.Builder + nicReport.WriteString("NIC Hardware Error Counters\n===========================\n\n") + + for _, entry := range netEntries { + iface := entry.Name() + // Skip non-physical interfaces (no /device symlink, SR-IOV VFs, + // switchdev representors, USB management interfaces). Centralized + // in isPhysicalPort so the ethtool loop below uses the same filter. + if !isPhysicalPort(iface) { + continue + } + ifPath := "/sys/class/net/" + iface + + crcErrors := readSysfsInt(ifPath + "/statistics/rx_crc_errors") + frameErrors := readSysfsInt(ifPath + "/statistics/rx_frame_errors") + carrierErrors := readSysfsInt(ifPath + "/statistics/tx_carrier_errors") + carrierChanges := readSysfsInt(ifPath + "/statistics/carrier_changes") + carrierDowncount := readSysfsInt(ifPath + "/statistics/carrier_downcount") + + hasErrors := crcErrors > 0 || frameErrors > 0 || carrierErrors > 0 + if hasErrors { + hwErrorIfaces++ + } + if carrierChanges > 10 { + linkFlapIfaces++ + } + + if hasErrors || carrierChanges > 0 { + nicReport.WriteString(fmt.Sprintf("%s:\n", iface)) + nicReport.WriteString(fmt.Sprintf(" rx_crc_errors: %d\n", crcErrors)) + nicReport.WriteString(fmt.Sprintf(" rx_frame_errors: %d\n", frameErrors)) + nicReport.WriteString(fmt.Sprintf(" tx_carrier_errors: %d\n", carrierErrors)) + nicReport.WriteString(fmt.Sprintf(" carrier_changes: %d\n", carrierChanges)) + nicReport.WriteString(fmt.Sprintf(" carrier_downcount: %d\n", carrierDowncount)) + nicReport.WriteString("\n") + } + } + + 
r.SetFact("nic.hw_error_interfaces", fmt.Sprintf("%d", hwErrorIfaces)) + r.SetFact("nic.link_flap_interfaces", fmt.Sprintf("%d", linkFlapIfaces)) + + if hwErrorIfaces > 0 || linkFlapIfaces > 0 { + c.saveProbeOutput(r, "network/nic_hw_errors.txt", nicReport.String(), "text", "network", "hardware") + } + + // ethtool -S for physical interfaces (driver-specific counters) + if c.Exec.CommandExists("ethtool") { + for _, entry := range netEntries { + iface := entry.Name() + if !isPhysicalPort(iface) { + continue + } + c.saveCommand(ctx, r, "network/ethtool_stats_"+iface+".txt", executor.CommandSpec{ + Name: "ethtool", Args: []string{"-S", iface}, + NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true, + }, "ethtool", "network", "hardware") + c.saveCommand(ctx, r, "network/ethtool_module_"+iface+".txt", executor.CommandSpec{ + Name: "ethtool", Args: []string{"-m", iface}, + NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true, + }, "ethtool", "network", "hardware") + } + } + + // devlink health: capture once (JSON), save, then parse the same buffer. + // On parse failure fall back to text-mode; on both failures set fact to + // "unavailable" + record ErrProbeFailed — never silently emit 0. + c.collectDevlinkHealth(ctx, r) +} + +// collectDevlinkHealth runs `devlink -j health show` once, persists it as an +// artifact, and parses it for fw_fatal reporter state. The JSON mode is +// preferred; a text-mode fallback covers older iproute2 releases. When both +// paths fail the fact is set to "unavailable" (not silently zero). 
+func (c *NetworkCollector) collectDevlinkHealth(ctx context.Context, r *CollectorResult) { + const factKey = "devlink.fw_fatal_count" + const artifactPath = "network/devlink_health.json" + + if !c.Exec.CommandExists("devlink") { + r.SetFact(factKey, "unavailable") + c.saveSkippedArtifact(r, artifactPath, "command", "devlink", "devlink", + SkipCommandUnavailable, "devlink: unavailable", "network", "hardware") + return + } + + spec := executor.CommandSpec{ + Name: "devlink", Args: []string{"-j", "health", "show"}, + NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true, + } + result, stdout, stderr := c.Exec.Capture(ctx, spec, 256*1024) + c.saveCapturedProbe(r, artifactPath, spec, result, string(stdout), string(stderr), + "devlink", []string{"network", "hardware"}) + + if result.Skipped { + r.SetFact(factKey, "unavailable") + return + } + if result.Err != nil { + r.SetFact(factKey, "unavailable") + r.RecordErrorForArtifact(ErrCommandFailed, + fmt.Sprintf("%s: %v", spec.String(), result.Err), artifactPath) + return + } + + if count, ok := parseDevlinkFWFatalJSON(stdout); ok { + r.SetFact(factKey, fmt.Sprintf("%d", count)) + return + } + if count, ok := parseDevlinkFWFatalText(stdout); ok { + r.SetFact(factKey, fmt.Sprintf("%d", count)) + return + } + r.SetFact(factKey, "unavailable") + r.RecordErrorForArtifact(ErrProbeFailed, + "devlink health output unparseable (neither JSON nor text form recognized)", artifactPath) +} + +// parseDevlinkFWFatalJSON parses `devlink -j health show` output. Returns +// (count, true) on success; (0, false) if the input is not valid JSON of the +// expected shape. +// +// Shape: { "health": { "/": [ { "reporter": "fw_fatal", +// "state": "healthy", "error": 0, ... }, ... ], ... 
} } +func parseDevlinkFWFatalJSON(data []byte) (int, bool) { + var root struct { + Health map[string][]struct { + Reporter string `json:"reporter"` + State string `json:"state"` + Error int `json:"error"` + } `json:"health"` + } + if err := json.Unmarshal(data, &root); err != nil || root.Health == nil { + return 0, false + } + count := 0 + for _, reporters := range root.Health { + for _, rep := range reporters { + if rep.Reporter != "fw_fatal" { + continue + } + if rep.Error > 0 || (rep.State != "" && rep.State != "healthy") { + count++ + } + } + } + return count, true +} + +// parseDevlinkFWFatalText parses the non-JSON form of `devlink health show` +// where reporter name and state/error counters live on adjacent lines: +// +// pci/0000:05:00.0: +// name fw_fatal +// state healthy error 0 recover 0 ... +// +// Returns (count, true) on success; (0, false) if no recognizable reporter +// entries were found (indicating the input is something else entirely — +// a command error page, an empty string, etc.). +func parseDevlinkFWFatalText(data []byte) (int, bool) { + currentReporter := "" + count := 0 + sawReporter := false + for _, line := range strings.Split(string(data), "\n") { + trim := strings.TrimSpace(line) + if strings.HasPrefix(trim, "name ") { + currentReporter = strings.TrimSpace(strings.TrimPrefix(trim, "name")) + sawReporter = true + continue + } + if currentReporter != "fw_fatal" { + continue + } + if !strings.HasPrefix(trim, "state ") { + continue + } + // Tokens: state error recover ... 
+ fields := strings.Fields(trim) + state := "" + errCount := 0 + for i := 0; i+1 < len(fields); i++ { + switch fields[i] { + case "state": + state = fields[i+1] + case "error": + n, err := strconv.Atoi(fields[i+1]) + if err == nil { + errCount = n + } + } + } + if errCount > 0 || (state != "" && state != "healthy") { + count++ + } + } + if !sawReporter { + return 0, false + } + return count, true +} + // detectNetworkManager determines which network manager owns the host's interfaces. // It returns a canonical manager name and a human-readable detection report. // diff --git a/customers/vm-troubleshooting/internal/collector/network_iface.go b/customers/vm-troubleshooting/internal/collector/network_iface.go new file mode 100644 index 0000000..6a41359 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/network_iface.go @@ -0,0 +1,50 @@ +package collector + +import ( + "os" + "path/filepath" + "strings" +) + +// isPhysicalPort reports whether the named network interface should be +// treated as a physical NIC for the purposes of hardware-error reporting. +// +// An interface qualifies when: +// - /sys/class/net//device exists (rules out pure virtual devices +// like bonds, bridges, tunnels, loopback) +// - /sys/class/net//device/physfn does NOT exist (rules out +// SR-IOV virtual functions — the physical function is the authoritative +// source for hardware stats) +// - phys_port_name does not start with "pf" (rules out switchdev-mode +// representor ports, which are software-side artifacts of a PF) +// - the bound driver is not rndis_host (rules out USB-ethernet management +// interfaces commonly exposed by BMCs) +// +// VM callers: an SR-IOV VF passed through to a guest typically has no +// /device/physfn symlink from inside the VM, so it is treated as physical +// (correct — the VM has no visibility into the host PF). 
+func isPhysicalPort(iface string) bool { + return isPhysicalPortAt("/sys/class/net", iface) +} + +// isPhysicalPortAt is the path-injectable core of isPhysicalPort; production +// callers use /sys/class/net, tests point at a fake tree. +func isPhysicalPortAt(netRoot, iface string) bool { + ifPath := filepath.Join(netRoot, iface) + devPath := filepath.Join(ifPath, "device") + if _, err := os.Stat(devPath); err != nil { + return false + } + if _, err := os.Stat(filepath.Join(devPath, "physfn")); err == nil { + return false + } + if name := readSysfsString(filepath.Join(ifPath, "phys_port_name")); strings.HasPrefix(name, "pf") { + return false + } + if link, err := os.Readlink(filepath.Join(devPath, "driver")); err == nil { + if filepath.Base(link) == "rndis_host" { + return false + } + } + return true +} diff --git a/customers/vm-troubleshooting/internal/collector/network_test.go b/customers/vm-troubleshooting/internal/collector/network_test.go new file mode 100644 index 0000000..9a55bd9 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/network_test.go @@ -0,0 +1,220 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" +) + +// TestParseDevlinkFWFatalJSON exercises the primary (JSON) devlink parser. +// The dashboard needs a typed fw_fatal_count fact; the parser must never +// silently report zero when the input is unparseable — callers handle that +// by setting the fact to "unavailable" (see collectDevlinkHealth). 
+func TestParseDevlinkFWFatalJSON(t *testing.T) { + t.Parallel() + cases := []struct { + name string + input string + wantCount int + wantOK bool + }{ + { + name: "healthy fw_fatal reporter", + input: `{"health":{"pci/0000:05:00.0":[ + {"reporter":"fw","state":"healthy","error":0}, + {"reporter":"fw_fatal","state":"healthy","error":0} + ]}}`, + wantCount: 0, + wantOK: true, + }, + { + name: "fw_fatal with error count", + input: `{"health":{"pci/0000:05:00.0":[ + {"reporter":"fw_fatal","state":"error","error":1} + ]}}`, + wantCount: 1, + wantOK: true, + }, + { + name: "fw_fatal with non-healthy state but zero errors", + input: `{"health":{"pci/0000:05:00.0":[ + {"reporter":"fw_fatal","state":"error","error":0} + ]}}`, + wantCount: 1, + wantOK: true, + }, + { + name: "multiple buses, one unhealthy fw_fatal", + input: `{"health":{ + "pci/0000:05:00.0":[{"reporter":"fw_fatal","state":"healthy","error":0}], + "pci/0000:06:00.0":[{"reporter":"fw_fatal","state":"error","error":3}] + }}`, + wantCount: 1, + wantOK: true, + }, + { + name: "non-fw_fatal reporters ignored", + input: `{"health":{"pci/0000:05:00.0":[ + {"reporter":"tx","state":"error","error":5} + ]}}`, + wantCount: 0, + wantOK: true, + }, + { + name: "malformed JSON", + input: `not json at all`, + wantCount: 0, + wantOK: false, + }, + { + name: "JSON without health key", + input: `{"other":"data"}`, + wantCount: 0, + wantOK: false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + count, ok := parseDevlinkFWFatalJSON([]byte(tc.input)) + if ok != tc.wantOK { + t.Errorf("ok: want %v, got %v", tc.wantOK, ok) + } + if count != tc.wantCount { + t.Errorf("count: want %d, got %d", tc.wantCount, count) + } + }) + } +} + +// TestParseDevlinkFWFatalText exercises the fallback text parser. 
The real +// reason this exists: the original grep-based parser required "fw_fatal" and +// "error N" on the SAME line, but devlink's text output splits them across +// adjacent lines — so the original parser never fired on real input. This +// test locks in the multi-line handling. +func TestParseDevlinkFWFatalText(t *testing.T) { + t.Parallel() + cases := []struct { + name string + input string + wantCount int + wantOK bool + }{ + { + name: "healthy reporters", + input: `pci/0000:05:00.0: + name fw + state healthy error 0 recover 0 + name fw_fatal + state healthy error 0 recover 0 +`, + wantCount: 0, + wantOK: true, + }, + { + name: "fw_fatal with error", + input: `pci/0000:05:00.0: + name fw_fatal + state error error 3 recover 0 +`, + wantCount: 1, + wantOK: true, + }, + { + name: "fw_fatal healthy, tx unhealthy — tx ignored", + input: `pci/0000:05:00.0: + name fw_fatal + state healthy error 0 recover 0 + name tx + state error error 5 recover 0 +`, + wantCount: 0, + wantOK: true, + }, + { + name: "empty input", + input: "", + wantCount: 0, + wantOK: false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + count, ok := parseDevlinkFWFatalText([]byte(tc.input)) + if ok != tc.wantOK { + t.Errorf("ok: want %v, got %v", tc.wantOK, ok) + } + if count != tc.wantCount { + t.Errorf("count: want %d, got %d", tc.wantCount, count) + } + }) + } +} + +// TestIsPhysicalPortAt covers the four rejection cases plus the happy +// path. Each subtest builds a fake /sys/class/net tree and asserts the +// predicate. The production wrapper (`isPhysicalPort`) just delegates with +// the real root, so this exercises the full decision surface. 
+func TestIsPhysicalPortAt(t *testing.T) { + t.Parallel() + + type stage struct { + iface string + deviceExists bool + physfnExists bool + physPortName string + driverBasename string // "" = no driver link + } + cases := []struct { + name string + stage stage + want bool + }{ + {"physical NIC", stage{"eth0", true, false, "", "mlx5_core"}, true}, + {"no /device (pure virtual, e.g. bridge)", stage{"br0", false, false, "", ""}, false}, + {"SR-IOV VF (has physfn)", stage{"eth1", true, true, "", "mlx5_core"}, false}, + {"switchdev representor (pf0vf0)", stage{"enp0s0_pf0vf0", true, false, "pf0vf0", "mlx5_core"}, false}, + {"USB mgmt (rndis_host)", stage{"usb0", true, false, "", "rndis_host"}, false}, + {"physical NIC with phys_port_name p0 (not a representor)", stage{"enp0s0", true, false, "p0", "mlx5_core"}, true}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + root := t.TempDir() + if !tc.stage.deviceExists { + // Interface exists but has no /device — stage the parent dir + // so os.Stat on the interface itself succeeds but /device fails. 
+ if err := os.MkdirAll(filepath.Join(root, tc.stage.iface), 0o755); err != nil { + t.Fatal(err) + } + } else { + devPath := filepath.Join(root, tc.stage.iface, "device") + if err := os.MkdirAll(devPath, 0o755); err != nil { + t.Fatal(err) + } + if tc.stage.physfnExists { + if err := os.Symlink("/dev/null", filepath.Join(devPath, "physfn")); err != nil { + t.Fatal(err) + } + } + if tc.stage.driverBasename != "" { + if err := os.Symlink("/fake/path/"+tc.stage.driverBasename, filepath.Join(devPath, "driver")); err != nil { + t.Fatal(err) + } + } + if tc.stage.physPortName != "" { + if err := os.WriteFile(filepath.Join(root, tc.stage.iface, "phys_port_name"), + []byte(tc.stage.physPortName+"\n"), 0o644); err != nil { + t.Fatal(err) + } + } + } + + if got := isPhysicalPortAt(root, tc.stage.iface); got != tc.want { + t.Errorf("isPhysicalPortAt(%s): want %v, got %v", tc.stage.iface, tc.want, got) + } + }) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/nvidia.go b/customers/vm-troubleshooting/internal/collector/nvidia.go index d4bc9d5..0714e90 100644 --- a/customers/vm-troubleshooting/internal/collector/nvidia.go +++ b/customers/vm-troubleshooting/internal/collector/nvidia.go @@ -44,25 +44,30 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) r.RecordSkip(SkipNotApplicable, "no NVIDIA hardware detected") return r, nil } + // NVSwitch presence is derived from PCI sysfs (class 0x0680xx on vendor + // 0x10de), not from lspci text. On VFIO-passthrough hosts the host has + // no NVIDIA driver loaded, so lspci emits "Bridge: ... [rev ..]" instead + // of a "NVSwitch" string — the sysfs path is the authoritative source. 
+ // + // Both fact keys are emitted from the same source: + // - nvswitch_present (legacy, retained for dashboard panel stability) + // - nvswitch.present (canonical dotted namespace — matches edac.*, + // pcie.*, thermal.*, nic.*, gpu.* conventions) + nvSwitch := nvSwitchPresenceFact() + r.SetFact("nvswitch_present", nvSwitch) + r.SetFact("nvswitch.present", nvSwitch) if c.Exec.CommandExists("lspci") { lspciSpec := executor.CommandSpec{Name: "lspci", Timeout: config.TimeoutQuick} lspciResult, out, _ := c.Exec.Capture(ctx, lspciSpec, 512*1024) if lspciResult.Err == nil && !lspciResult.Skipped { var lines []string - hasNVSwitch := false for _, line := range strings.Split(string(out), "\n") { - lower := strings.ToLower(line) - if strings.Contains(lower, "nvidia") { + if strings.Contains(strings.ToLower(line), "nvidia") { lines = append(lines, line) } - if strings.Contains(lower, "nvswitch") { - hasNVSwitch = true - } } - r.SetFact("nvswitch_present", fmt.Sprintf("%t", hasNVSwitch)) c.saveCapturedProbe(r, "nvidia/pci_devices.txt", lspciSpec, lspciResult, strings.Join(lines, "\n")+"\n", "", "lspci", []string{"gpu", "hardware"}) } else { - r.SetFact("nvswitch_present", "unavailable") c.saveCapturedProbe(r, "nvidia/pci_devices.txt", lspciSpec, lspciResult, "", "", "lspci", []string{"gpu", "hardware"}) } } @@ -132,6 +137,9 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) ) } } + // GPU health fact extraction from extended queries. + c.collectGPUHealthFacts(ctx, r) + // Capture raw dmesg for the archive. Xid classification is handled by the // triage layer (internal/triage/xid.go) which runs after all collectors. // The collector only saves the raw artifact — no issue classification here. 
@@ -163,9 +171,13 @@ func (c *NvidiaCollector) Collect(ctx context.Context) (*CollectorResult, error) } for _, args := range [][]string{ {"--query-gpu=index,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total", "--format=csv"}, + {"--query-gpu=index,ecc.errors.corrected.aggregate.total,ecc.errors.uncorrected.aggregate.total", "--format=csv"}, {"--query-gpu=index,clocks_throttle_reasons.active", "--format=csv"}, {"--query-gpu=index,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max", "--format=csv"}, {"--query-gpu=index,power.draw,power.limit,temperature.gpu", "--format=csv"}, + {"--query-gpu=index,gpu_recovery_action,inforom.checksum_validation", "--format=csv"}, + {"--query-gpu=index,row_remapper.correctable,row_remapper.uncorrectable,row_remapper.pending,row_remapper.remapping_failure_occurred", "--format=csv"}, + {"--query-gpu=index,retired_pages.sbe,retired_pages.dbe,retired_pages.pending", "--format=csv"}, } { // Derive filename from the query parameter (e.g. "--query-gpu=index,power.draw" → "power_draw") query := args[0] @@ -235,6 +247,92 @@ func detectUnreachableGPUDetails(outputs ...string) ([]string, int) { return sortedBDFs, len(unknownSet) } +// collectGPUHealthFacts queries nvidia-smi for hardware health indicators +// and sets summary facts for dashboard consumption. Each sub-query follows +// the same shape — run `nvidia-smi --query-gpu= --format=csv,noheader`, +// count rows satisfying a predicate — so the work delegates to countGPUField. +func (c *NvidiaCollector) collectGPUHealthFacts(ctx context.Context, r *CollectorResult) { + // gpu_recovery_action: any value other than "None" (and not an unavailable + // token) means the GPU reports a pending reset/drain action. 
+ recoveryCount := c.countGPUField(ctx, "gpu_recovery_action", func(v string) bool { + return v != "" && !isNvidiaSmiUnavailable(v) && !strings.EqualFold(v, "none") + }) + r.SetFact("gpu.recovery_action_required", fmt.Sprintf("%d", recoveryCount)) + + // inforom.checksum_validation: "Invalid" indicates the on-GPU ROM image + // failed its integrity check. + invalidCount := c.countGPUField(ctx, "inforom.checksum_validation", func(v string) bool { + return strings.EqualFold(v, "invalid") + }) + r.SetFact("gpu.inforom_invalid_count", fmt.Sprintf("%d", invalidCount)) + + // row_remapper.remapping_failure_occurred: "Yes" indicates the GPU ran + // out of spare memory rows to remap bad addresses. + remapFailCount := c.countGPUField(ctx, "row_remapper.remapping_failure_occurred", func(v string) bool { + return strings.EqualFold(v, "yes") + }) + r.SetFact("gpu.row_remap_failure_count", fmt.Sprintf("%d", remapFailCount)) +} + +// countGPUField runs `nvidia-smi --query-gpu= --format=csv,noheader` +// and counts rows whose trimmed value satisfies `match`. Returns 0 if the +// command is skipped or errors (the caller converts that to a zero fact, +// which is correct — we genuinely observed no rows matching). +func (c *NvidiaCollector) countGPUField(ctx context.Context, field string, match func(string) bool) int { + spec := executor.CommandSpec{ + Name: "nvidia-smi", + Args: []string{"--query-gpu=" + field, "--format=csv,noheader"}, + Timeout: config.TimeoutMedium, + } + result, out, _ := c.Exec.Capture(ctx, spec, 64*1024) + if result.Err != nil || result.Skipped { + return 0 + } + count := 0 + for _, line := range strings.Split(string(out), "\n") { + if match(strings.TrimSpace(line)) { + count++ + } + } + return count +} + +// isNvidiaSmiUnavailable reports whether a nvidia-smi CSV field value indicates +// the field is unsupported or unavailable. Older drivers return "N/A", +// "[N/A]", or "[Not Supported]" for query fields they don't recognize. 
+// isNvidiaSmiUnavailable matches the explicit tokens nvidia-smi uses for +// absent/unsupported/inaccessible values. An earlier version used +// strings.HasPrefix(lower, "[") as a catch-all, which would mask any valid +// nvidia-smi field wrapped in brackets (e.g. a future `gpu_recovery_action` +// value). Keep this list explicit. +func isNvidiaSmiUnavailable(v string) bool { + lower := strings.ToLower(v) + switch lower { + case "n/a", "[n/a]", + "not supported", "[not supported]", + "[unknown error]", + "[insufficient permissions]": + return true + } + return false +} + +// nvSwitchPresenceFact returns "true"/"false" based on PCI sysfs classification. +// On hosts where sysfs is unreadable the fact reports "unavailable" — this is +// semantically distinct from "checked, none present". +func nvSwitchPresenceFact() string { + devs, err := iteratePCIDevices() + if err != nil { + return "unavailable" + } + for _, d := range devs { + if classifyPCI(d.Vendor, d.Class) == "nvswitch" { + return "true" + } + } + return "false" +} + func stripANSIArtifact(path string) error { data, err := os.ReadFile(path) if err != nil { diff --git a/customers/vm-troubleshooting/internal/collector/pci.go b/customers/vm-troubleshooting/internal/collector/pci.go new file mode 100644 index 0000000..7e37903 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/pci.go @@ -0,0 +1,84 @@ +package collector + +import ( + "os" + "path/filepath" + "strings" +) + +// pciDevice describes one entry under /sys/bus/pci/devices//. +// Fields are raw sysfs strings (vendor/device/class in "0xNNNN"/"0xNNNNNN" +// form, lower-case). Driver is the basename of /driver symlink, or "" when +// no driver is bound. +type pciDevice struct { + BDF string + Vendor string + Device string + Class string + Driver string +} + +// iteratePCIDevices reads /sys/bus/pci/devices. 
Returns nil+error only when +// the base directory is unreadable; per-device read errors are silently +// tolerated (unreadable devices simply return partial fields). +// +// Calls route through pciIterator so tests can swap in fake device lists +// without needing to stage a sysfs tree on disk. +func iteratePCIDevices() ([]pciDevice, error) { + return pciIterator() +} + +// pciIterator is the swappable backing for iteratePCIDevices. Tests replace +// it via a deferred restore; production code never touches it. +var pciIterator = func() ([]pciDevice, error) { + return iteratePCIDevicesAt("") +} + +// iteratePCIDevicesAt enumerates PCI devices rooted at /sys/bus/pci/devices. +// A root of "" means the real /sys tree. This indirection exists for tests; +// production code calls iteratePCIDevices(). +func iteratePCIDevicesAt(root string) ([]pciDevice, error) { + base := filepath.Join(root, "sys", "bus", "pci", "devices") + if root == "" { + base = "/sys/bus/pci/devices" + } + entries, err := os.ReadDir(base) + if err != nil { + return nil, err + } + out := make([]pciDevice, 0, len(entries)) + for _, entry := range entries { + devPath := filepath.Join(base, entry.Name()) + dev := pciDevice{ + BDF: entry.Name(), + Vendor: strings.ToLower(readSysfsString(filepath.Join(devPath, "vendor"))), + Device: strings.ToLower(readSysfsString(filepath.Join(devPath, "device"))), + Class: strings.ToLower(readSysfsString(filepath.Join(devPath, "class"))), + } + if link, readErr := os.Readlink(filepath.Join(devPath, "driver")); readErr == nil { + dev.Driver = filepath.Base(link) + } + out = append(out, dev) + } + return out, nil +} + +// classifyPCI returns a coarse device kind for the canonical (vendor, class) +// pair. Returns "" for unrecognized combinations. Callers that need finer +// distinctions (e.g. NIC vs. wireless) should inspect Class directly. +// +// Vendor/class are expected in "0x...." form (lower-case); this is what +// iteratePCIDevicesAt writes. 
Inputs in other forms won't match. +func classifyPCI(vendor, class string) string { + switch { + case vendor == "0x10de" && (strings.HasPrefix(class, "0x0300") || strings.HasPrefix(class, "0x0302")): + return "gpu" + case vendor == "0x10de" && strings.HasPrefix(class, "0x0680"): + return "nvswitch" + case strings.HasPrefix(class, "0x0200") || strings.HasPrefix(class, "0x0207"): + return "nic" + case strings.HasPrefix(class, "0x0106") || strings.HasPrefix(class, "0x0108"): + return "storage" + } + return "" +} diff --git a/customers/vm-troubleshooting/internal/collector/pci_test.go b/customers/vm-troubleshooting/internal/collector/pci_test.go new file mode 100644 index 0000000..f6f61b6 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/pci_test.go @@ -0,0 +1,135 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" +) + +// stageFakePCIDevice writes a /sys/bus/pci/devices// directory under +// root, populated with vendor/device/class files and (optionally) a driver +// symlink. The caller passes the values the iterator expects to read back. +func stageFakePCIDevice(t *testing.T, root, bdf, vendor, device, class, driver string) { + t.Helper() + base := filepath.Join(root, "sys", "bus", "pci", "devices", bdf) + if err := os.MkdirAll(base, 0o755); err != nil { + t.Fatal(err) + } + for name, content := range map[string]string{ + "vendor": vendor, + "device": device, + "class": class, + } { + if err := os.WriteFile(filepath.Join(base, name), []byte(content+"\n"), 0o644); err != nil { + t.Fatal(err) + } + } + if driver != "" { + // Mimic the real kernel symlink shape: /sys/bus/pci/devices//driver + // points at /sys/bus/pci/drivers/. The iterator only uses + // filepath.Base on the link target, so the target need not exist. 
+ driverPath := filepath.Join(root, "sys", "bus", "pci", "drivers", driver) + if err := os.Symlink(driverPath, filepath.Join(base, "driver")); err != nil { + t.Fatal(err) + } + } +} + +// TestClassifyPCI is the single source of truth for "what kind of device is +// this?" Callers (nvidia.go, services.go, hypervisor.go) rely on this +// classification being consistent across the codebase. The plan's +// motivating incident: HGX audit showed nvidia.nvswitch_present=false +// while hypervisor.nvswitch.pci_count=4, because two callsites used +// different heuristics. classifyPCI is now that single heuristic. +func TestClassifyPCI(t *testing.T) { + t.Parallel() + cases := []struct { + name string + vendor string + class string + want string + }{ + {"NVIDIA GPU (VGA)", "0x10de", "0x030000", "gpu"}, + {"NVIDIA GPU (3D)", "0x10de", "0x030200", "gpu"}, + {"NVIDIA NVSwitch", "0x10de", "0x068000", "nvswitch"}, + {"Mellanox NIC", "0x15b3", "0x020000", "nic"}, + {"Intel InfiniBand class", "0x8086", "0x020700", "nic"}, + {"NVMe storage", "0x144d", "0x010802", "storage"}, + {"SCSI/SATA storage", "0x8086", "0x010600", "storage"}, + {"unclassified bridge", "0x8086", "0x060400", ""}, + {"NVIDIA chipset (not GPU)", "0x10de", "0x040300", ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if got := classifyPCI(tc.vendor, tc.class); got != tc.want { + t.Errorf("classifyPCI(%q,%q): want %q, got %q", tc.vendor, tc.class, tc.want, got) + } + }) + } +} + +// TestIteratePCIDevicesAt verifies the sysfs walk against a controlled fake +// tree. It proves the iterator returns the expected fields, lowercases +// vendor/device/class (classifyPCI depends on that), and picks up the +// driver basename from the /driver symlink. 
+func TestIteratePCIDevicesAt(t *testing.T) { + t.Parallel() + root := t.TempDir() + + stageFakePCIDevice(t, root, "0000:00:00.0", "0x8086", "0x1234", "0x060400", "") + stageFakePCIDevice(t, root, "0000:01:00.0", "0x10de", "0x2330", "0x030200", "nvidia") + stageFakePCIDevice(t, root, "0000:05:00.0", "0x10de", "0x22a3", "0x068000", "vfio-pci") + stageFakePCIDevice(t, root, "0000:06:00.0", "0x15b3", "0x1017", "0x020000", "mlx5_core") + + devs, err := iteratePCIDevicesAt(root) + if err != nil { + t.Fatal(err) + } + if len(devs) != 4 { + t.Fatalf("expected 4 devices, got %d", len(devs)) + } + + // Index by BDF for easier assertions regardless of dir-read order. + byBDF := map[string]pciDevice{} + for _, d := range devs { + byBDF[d.BDF] = d + } + + check := func(bdf, vendor, class, driver, wantClass string) { + t.Helper() + d, ok := byBDF[bdf] + if !ok { + t.Errorf("expected device %s", bdf) + return + } + if d.Vendor != vendor { + t.Errorf("%s: vendor want %q, got %q", bdf, vendor, d.Vendor) + } + if d.Class != class { + t.Errorf("%s: class want %q, got %q", bdf, class, d.Class) + } + if d.Driver != driver { + t.Errorf("%s: driver want %q, got %q", bdf, driver, d.Driver) + } + if kind := classifyPCI(d.Vendor, d.Class); kind != wantClass { + t.Errorf("%s: classifyPCI want %q, got %q", bdf, wantClass, kind) + } + } + + check("0000:00:00.0", "0x8086", "0x060400", "", "") + check("0000:01:00.0", "0x10de", "0x030200", "nvidia", "gpu") + check("0000:05:00.0", "0x10de", "0x068000", "vfio-pci", "nvswitch") + check("0000:06:00.0", "0x15b3", "0x020000", "mlx5_core", "nic") +} + +// TestIteratePCIDevicesAt_MissingRoot mirrors the "no PCI sysfs" guest-VM +// case. The iterator must return an error (not an empty slice that callers +// confuse with "scanned and found nothing"). 
+func TestIteratePCIDevicesAt_MissingRoot(t *testing.T) { + t.Parallel() + _, err := iteratePCIDevicesAt(filepath.Join(t.TempDir(), "does_not_exist")) + if err == nil { + t.Error("expected error for missing /sys/bus/pci/devices") + } +} diff --git a/customers/vm-troubleshooting/internal/collector/pcie.go b/customers/vm-troubleshooting/internal/collector/pcie.go new file mode 100644 index 0000000..ec9e2dd --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/pcie.go @@ -0,0 +1,278 @@ +package collector + +import ( + "context" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +type PCIeCollector struct{ Base } + +func NewPCIeCollector(exec executor.Executor, writer *output.Writer, ui ui.UI) *PCIeCollector { + return &PCIeCollector{Base{Exec: exec, Writer: writer, UI: ui}} +} + +func (c *PCIeCollector) Name() string { return "PCIe AER" } +func (c *PCIeCollector) ID() string { return "pcie" } + +type pcieDeviceErrors struct { + BDF string `json:"bdf"` + Vendor string `json:"vendor,omitempty"` + Device string `json:"device,omitempty"` + Class string `json:"class,omitempty"` + Fatal map[string]int64 `json:"fatal,omitempty"` + NonFatal map[string]int64 `json:"nonfatal,omitempty"` + Correctable map[string]int64 `json:"correctable,omitempty"` +} + +type pcieAERStatus struct { + Devices []pcieDeviceErrors `json:"devices"` +} + +// pcieLinkDevice describes a PCI device running at a degraded link. +// Only devices where current < max on speed or width are recorded; healthy +// devices are omitted from the JSON artifact to keep it small. Facts report +// totals (always set). 
+type pcieLinkDevice struct { + BDF string `json:"bdf"` + Vendor string `json:"vendor,omitempty"` + Device string `json:"device,omitempty"` + Class string `json:"class,omitempty"` + CurrentSpeedGTS float64 `json:"current_speed_gts,omitempty"` + MaxSpeedGTS float64 `json:"max_speed_gts,omitempty"` + CurrentWidth int64 `json:"current_width,omitempty"` + MaxWidth int64 `json:"max_width,omitempty"` + SpeedDegraded bool `json:"speed_degraded"` + WidthDegraded bool `json:"width_degraded"` +} + +type pcieLinkStatus struct { + Devices []pcieLinkDevice `json:"degraded_devices"` +} + +func (c *PCIeCollector) Collect(ctx context.Context) (*CollectorResult, error) { + r := NewResult() + + devs, err := iteratePCIDevices() + if err != nil { + // Facts are authoritatively "unavailable" — distinct from zero. We + // cannot emit counts without reading sysfs. + r.SetFact("pcie.aer_fatal_total", "unavailable") + r.SetFact("pcie.aer_nonfatal_total", "unavailable") + r.SetFact("pcie.aer_devices_with_errors", "unavailable") + r.SetFact("pcie.link_speed_degraded_count", "unavailable") + r.SetFact("pcie.link_width_degraded_count", "unavailable") + r.RecordSkip(SkipSourceUnavailable, "PCI sysfs not present") + return r, nil + } + + pciBase := "/sys/bus/pci/devices" + aerStatus := pcieAERStatus{} + linkStatus := pcieLinkStatus{} + var fatalTotal, nonfatalTotal int64 + devicesWithErrors := 0 + speedDegraded := 0 + widthDegraded := 0 + // Track whether ANY device exposed link speed/width sysfs files. Guest + // VMs with only virtio devices, containers with minimal sysfs, and very + // old kernels may not expose these — in which case "0 degraded" would + // be a false negative masquerading as a clean scan. 
+ sawAnySpeed := false + sawAnyWidth := false + + for _, d := range devs { + devPath := filepath.Join(pciBase, d.BDF) + + // --- AER counters (as before) --- + fatal := parseAERFile(filepath.Join(devPath, "aer_dev_fatal")) + nonfatal := parseAERFile(filepath.Join(devPath, "aer_dev_nonfatal")) + correctable := parseAERFile(filepath.Join(devPath, "aer_dev_correctable")) + if !(fatal == nil && nonfatal == nil && correctable == nil) { + fatalCount := aerTotal(fatal) + nonfatalCount := aerTotal(nonfatal) + if !(fatalCount == 0 && nonfatalCount == 0 && aerTotal(correctable) == 0) { + aerStatus.Devices = append(aerStatus.Devices, pcieDeviceErrors{ + BDF: d.BDF, + Vendor: d.Vendor, + Device: d.Device, + Class: d.Class, + Fatal: fatal, + NonFatal: nonfatal, + Correctable: correctable, + }) + fatalTotal += fatalCount + nonfatalTotal += nonfatalCount + if fatalCount > 0 || nonfatalCount > 0 { + devicesWithErrors++ + } + } + } + + // Link speed/width read from the same iteration — no second PCI walk. + link := readDeviceLinkStatus(devPath) + if link.hadSpeed { + sawAnySpeed = true + } + if link.hadWidth { + sawAnyWidth = true + } + speedDegradedDev := link.speedDegraded + widthDegradedDev := link.widthDegraded + + if speedDegradedDev { + speedDegraded++ + } + if widthDegradedDev { + widthDegraded++ + } + if speedDegradedDev || widthDegradedDev { + linkStatus.Devices = append(linkStatus.Devices, pcieLinkDevice{ + BDF: d.BDF, + Vendor: d.Vendor, + Device: d.Device, + Class: d.Class, + CurrentSpeedGTS: link.curSpeed, + MaxSpeedGTS: link.maxSpeed, + CurrentWidth: link.curWidth, + MaxWidth: link.maxWidth, + SpeedDegraded: speedDegradedDev, + WidthDegraded: widthDegradedDev, + }) + } + } + + // AER facts: always emitted. A clean scan is 0, not skipped. 
+ r.SetFact("pcie.aer_fatal_total", strconv.FormatInt(fatalTotal, 10)) + r.SetFact("pcie.aer_nonfatal_total", strconv.FormatInt(nonfatalTotal, 10)) + r.SetFact("pcie.aer_devices_with_errors", strconv.Itoa(devicesWithErrors)) + + // Link degrade facts: "unavailable" when no device exposed the sysfs + // files, so dashboards don't confuse it with "checked, all healthy". + if sawAnySpeed { + r.SetFact("pcie.link_speed_degraded_count", strconv.Itoa(speedDegraded)) + } else { + r.SetFact("pcie.link_speed_degraded_count", "unavailable") + } + if sawAnyWidth { + r.SetFact("pcie.link_width_degraded_count", strconv.Itoa(widthDegraded)) + } else { + r.SetFact("pcie.link_width_degraded_count", "unavailable") + } + + // Artifacts. AER JSON only when devices with errors exist (preserves + // existing behavior; triage can still find everything via facts). + if len(aerStatus.Devices) > 0 { + c.saveJSONProbe(r, "hardware/pcie_aer_errors.json", aerStatus, "pcie", "hardware") + } + // Link status JSON: always emitted (even if empty) so dashboards can + // distinguish "checked, clean" from "never probed". Degraded devices + // only; healthy devices omitted to keep the artifact small. + c.saveJSONProbe(r, "hardware/pcie_link_status.json", linkStatus, "pcie", "hardware") + + return r, nil +} + +// linkStatus is the result of reading a single device's PCIe link sysfs +// files. hadSpeed / hadWidth report whether both current AND max of each +// pair were successfully read; the collector uses these to distinguish +// "checked, healthy" from "not checked". +type linkStatus struct { + curSpeed, maxSpeed float64 + curWidth, maxWidth int64 + hadSpeed, hadWidth bool + speedDegraded bool + widthDegraded bool +} + +// readDeviceLinkStatus reads PCIe link speed and width from +// /{current,max}_link_{speed,width}. Returns a struct with all +// four values plus presence flags and the computed degrade flags. 
A +// speed or width comparison is only meaningful when both current and +// max were successfully read AND non-zero (kernel reports width==0 when +// the link is down). +func readDeviceLinkStatus(devPath string) linkStatus { + ls := linkStatus{} + cur, curOK := parseLinkSpeedGTS(readSysfsString(filepath.Join(devPath, "current_link_speed"))) + max, maxOK := parseLinkSpeedGTS(readSysfsString(filepath.Join(devPath, "max_link_speed"))) + if curOK && maxOK { + ls.hadSpeed = true + ls.curSpeed = cur + ls.maxSpeed = max + if cur > 0 && max > 0 && cur < max { + ls.speedDegraded = true + } + } + cw, cwOK := readSysfsIntOK(filepath.Join(devPath, "current_link_width")) + mw, mwOK := readSysfsIntOK(filepath.Join(devPath, "max_link_width")) + if cwOK && mwOK { + ls.hadWidth = true + ls.curWidth = cw + ls.maxWidth = mw + if cw > 0 && mw > 0 && cw < mw { + ls.widthDegraded = true + } + } + return ls +} + +// parseLinkSpeedGTS extracts the leading float value in GT/s from a PCIe +// link-speed sysfs string. Accepts "32.0 GT/s PCIe", "8.0 GT/s", +// "2.5 GT/s PCIe". Returns (0, false) for "Unknown", empty, or unparseable. +func parseLinkSpeedGTS(s string) (float64, bool) { + fields := strings.Fields(s) + if len(fields) < 2 || !strings.EqualFold(fields[1], "GT/s") { + return 0, false + } + v, err := strconv.ParseFloat(fields[0], 64) + if err != nil || v <= 0 { + return 0, false + } + return v, true +} + +// parseAERFile parses an AER counter file (aer_dev_fatal, etc.). +// Format: "ErrorType \n" with a TOTAL line. +// Returns nil if the file doesn't exist. 
+func parseAERFile(path string) map[string]int64 { + data, err := os.ReadFile(path) + if err != nil { + return nil + } + result := make(map[string]int64) + for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + // Find the last space-separated token as the count + fields := strings.Fields(line) + if len(fields) < 2 { + continue + } + countStr := fields[len(fields)-1] + name := strings.Join(fields[:len(fields)-1], " ") + n, err := strconv.ParseInt(countStr, 10, 64) + if err != nil { + continue + } + result[name] = n + } + return result +} + +// aerTotal extracts the TOTAL value from an AER counter map. +func aerTotal(counters map[string]int64) int64 { + for k, v := range counters { + upper := strings.ToUpper(k) + if strings.HasPrefix(upper, "TOTAL") { + return v + } + } + return 0 +} diff --git a/customers/vm-troubleshooting/internal/collector/pcie_test.go b/customers/vm-troubleshooting/internal/collector/pcie_test.go new file mode 100644 index 0000000..b9a922b --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/pcie_test.go @@ -0,0 +1,178 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" +) + +func TestParseAERFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + // Valid AER file with counts + content := "BadTLP 0\nBadDLLP 0\nRollover 0\nTimeout 0\nNonFatalErr 0\nCorrIntErr 0\nHeaderOF 0\nTOTAL_ERR_COR 0\n" + os.WriteFile(filepath.Join(dir, "aer_dev_correctable"), []byte(content), 0o644) + result := parseAERFile(filepath.Join(dir, "aer_dev_correctable")) + if result == nil { + t.Fatal("expected non-nil result for valid file") + } + if aerTotal(result) != 0 { + t.Errorf("expected total 0, got %d", aerTotal(result)) + } + + // File with non-zero total + content2 := "Undefined 0\nDLProt 0\nTOTAL_ERR_FATAL 3\n" + os.WriteFile(filepath.Join(dir, "aer_dev_fatal"), []byte(content2), 0o644) + result2 := parseAERFile(filepath.Join(dir, 
"aer_dev_fatal")) + if aerTotal(result2) != 3 { + t.Errorf("expected fatal total 3, got %d", aerTotal(result2)) + } + + // Missing file + result3 := parseAERFile(filepath.Join(dir, "nonexistent")) + if result3 != nil { + t.Error("expected nil for missing file") + } +} + +// TestReadDeviceLinkStatus exercises the per-device link reader across the +// real sysfs shapes a collector will encounter: +// - Gen5 healthy (cur==max) — not degraded, both present +// - Gen3 in Gen5 slot — speed degraded, width healthy +// - x4 NVMe in x16 slot — width degraded, speed healthy +// - Down link (width==0) — NOT reported as degraded (kernel signals +// link-down with width 0; that's a separate condition) +// - Missing files — hadSpeed/hadWidth false, used by the collector to +// emit "unavailable" rather than a misleading "0 degraded" +func TestReadDeviceLinkStatus(t *testing.T) { + t.Parallel() + + write := func(t *testing.T, dir, name, content string) { + t.Helper() + if err := os.WriteFile(filepath.Join(dir, name), []byte(content), 0o644); err != nil { + t.Fatal(err) + } + } + + t.Run("gen5 healthy", func(t *testing.T) { + t.Parallel() + d := t.TempDir() + write(t, d, "current_link_speed", "32.0 GT/s PCIe\n") + write(t, d, "max_link_speed", "32.0 GT/s PCIe\n") + write(t, d, "current_link_width", "16\n") + write(t, d, "max_link_width", "16\n") + got := readDeviceLinkStatus(d) + if !got.hadSpeed || !got.hadWidth { + t.Fatalf("want hadSpeed && hadWidth, got %+v", got) + } + if got.speedDegraded || got.widthDegraded { + t.Errorf("want no degrade, got %+v", got) + } + }) + + t.Run("gen3 in gen5 slot", func(t *testing.T) { + t.Parallel() + d := t.TempDir() + write(t, d, "current_link_speed", "8.0 GT/s\n") + write(t, d, "max_link_speed", "32.0 GT/s PCIe\n") + write(t, d, "current_link_width", "16\n") + write(t, d, "max_link_width", "16\n") + got := readDeviceLinkStatus(d) + if !got.speedDegraded { + t.Errorf("want speedDegraded, got %+v", got) + } + if got.widthDegraded { + 
t.Errorf("want no widthDegraded, got %+v", got) + } + }) + + t.Run("x4 in x16 slot", func(t *testing.T) { + t.Parallel() + d := t.TempDir() + write(t, d, "current_link_speed", "32.0 GT/s PCIe\n") + write(t, d, "max_link_speed", "32.0 GT/s PCIe\n") + write(t, d, "current_link_width", "4\n") + write(t, d, "max_link_width", "16\n") + got := readDeviceLinkStatus(d) + if got.speedDegraded { + t.Errorf("want no speedDegraded, got %+v", got) + } + if !got.widthDegraded { + t.Errorf("want widthDegraded, got %+v", got) + } + }) + + t.Run("link down width 0 not a degrade", func(t *testing.T) { + t.Parallel() + d := t.TempDir() + write(t, d, "current_link_speed", "2.5 GT/s PCIe\n") + write(t, d, "max_link_speed", "32.0 GT/s PCIe\n") + write(t, d, "current_link_width", "0\n") + write(t, d, "max_link_width", "16\n") + got := readDeviceLinkStatus(d) + // Speed is still reportable; width==0 is link-down, not degrade. + if got.widthDegraded { + t.Errorf("width 0 must not be degrade, got %+v", got) + } + }) + + t.Run("missing files reported as not-had", func(t *testing.T) { + t.Parallel() + d := t.TempDir() + // No files written — common on virtio-only guest VMs. + got := readDeviceLinkStatus(d) + if got.hadSpeed || got.hadWidth { + t.Errorf("expected hadSpeed==false and hadWidth==false, got %+v", got) + } + if got.speedDegraded || got.widthDegraded { + t.Errorf("no files must not imply degrade, got %+v", got) + } + }) +} + +func TestAerTotal_NilMap(t *testing.T) { + t.Parallel() + if got := aerTotal(nil); got != 0 { + t.Errorf("expected 0 for nil map, got %d", got) + } +} + +// TestParseLinkSpeedGTS exercises the PCIe link-speed string parser. The +// kernel's current_link_speed / max_link_speed files carry "NN.N GT/s" or +// "NN.N GT/s PCIe" text — anything else (Unknown, empty, a stray digit, +// pure garbage) must return (0, false) so the collector's degrade check +// remains correct rather than poisoning the comparison with 0. 
+func TestParseLinkSpeedGTS(t *testing.T) { + t.Parallel() + cases := []struct { + name string + input string + wantVal float64 + wantOK bool + }{ + {"Gen5 with PCIe suffix", "32.0 GT/s PCIe", 32.0, true}, + {"Gen3 plain", "8.0 GT/s", 8.0, true}, + {"Gen1 with PCIe suffix", "2.5 GT/s PCIe", 2.5, true}, + {"Gen4", "16.0 GT/s", 16.0, true}, + {"unknown", "Unknown", 0, false}, + {"empty", "", 0, false}, + {"digit only", "32", 0, false}, + {"wrong unit", "32.0 MT/s", 0, false}, + {"garbage", "abc", 0, false}, + {"zero value rejected", "0 GT/s", 0, false}, + {"case insensitive GT/s", "8.0 gt/s", 8.0, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got, ok := parseLinkSpeedGTS(tc.input) + if ok != tc.wantOK { + t.Errorf("ok: want %v, got %v", tc.wantOK, ok) + } + if got != tc.wantVal { + t.Errorf("value: want %g, got %g", tc.wantVal, got) + } + }) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/services.go b/customers/vm-troubleshooting/internal/collector/services.go index 0892fc2..ae382dd 100644 --- a/customers/vm-troubleshooting/internal/collector/services.go +++ b/customers/vm-troubleshooting/internal/collector/services.go @@ -176,8 +176,10 @@ const ( // detectNVSwitch checks for NVSwitch hardware. // Primary: Go-native procfs check (no subprocess). -// Fallback: lspci (when nvidia-nvswitch module not loaded). -func (c *ServicesCollector) detectNVSwitch(ctx context.Context) nvSwitchPresence { +// Fallback: PCI sysfs classification via iteratePCIDevices + classifyPCI. +// The sysfs path works on VFIO-passthrough hosts where no NVIDIA driver is +// loaded (nvidia-nvswitch module absent, lspci emits generic "Bridge" text). +func (c *ServicesCollector) detectNVSwitch(_ context.Context) nvSwitchPresence { // Primary: /proc/driver/nvidia-nvswitch/devices/ is created by the // nvidia-nvswitch kernel module. If present with entries → NVSwitch hardware. // If present but empty → driver loaded, no hardware. 
If absent → module not loaded. @@ -188,18 +190,15 @@ func (c *ServicesCollector) detectNVSwitch(ctx context.Context) nvSwitchPresence } return nvSwitchAbsent } - // Fallback: lspci (procfs path doesn't exist when nvidia-nvswitch module not loaded) - if !c.Exec.CommandExists("lspci") { - return nvSwitchUnknown - } - result, out, _ := c.Exec.Capture(ctx, executor.CommandSpec{ - Name: "lspci", Timeout: config.TimeoutQuick, - }, 512*1024) - if result.Err != nil { + // Fallback: PCI sysfs bridge-class classification. + devs, err := iteratePCIDevices() + if err != nil { return nvSwitchUnknown } - if strings.Contains(strings.ToLower(string(out)), "nvswitch") { - return nvSwitchPresent + for _, d := range devs { + if classifyPCI(d.Vendor, d.Class) == "nvswitch" { + return nvSwitchPresent + } } return nvSwitchAbsent } @@ -258,29 +257,18 @@ func isIfupdownActive() bool { // This is the physical-passthrough fingerprint on HGX hypervisors: NVSwitch // hardware exists in the PCI tree but host-side FM is not applicable. 
func isNVSwitchVFIOBound() bool { - entries, err := os.ReadDir("/sys/bus/pci/devices") + devs, err := iteratePCIDevices() if err != nil { return false } nvSwitchCount := 0 vfioBound := 0 - for _, entry := range entries { - base := filepath.Join("/sys/bus/pci/devices", entry.Name()) - vendorBytes, err := os.ReadFile(filepath.Join(base, "vendor")) - if err != nil || strings.TrimSpace(string(vendorBytes)) != "0x10de" { - continue - } - classBytes, err := os.ReadFile(filepath.Join(base, "class")) - if err != nil { - continue - } - // PCI class 0x0680xx: PCI_BASE_CLASS_BRIDGE (0x06) + subclass 0x80 (Other Bridge) - if !strings.HasPrefix(strings.ToLower(strings.TrimSpace(string(classBytes))), "0x0680") { + for _, d := range devs { + if classifyPCI(d.Vendor, d.Class) != "nvswitch" { continue } nvSwitchCount++ - driverLink, readErr := os.Readlink(filepath.Join(base, "driver")) - if readErr == nil && strings.HasSuffix(driverLink, "/vfio-pci") { + if d.Driver == "vfio-pci" { vfioBound++ } } diff --git a/customers/vm-troubleshooting/internal/collector/services_test.go b/customers/vm-troubleshooting/internal/collector/services_test.go index 737e544..7288893 100644 --- a/customers/vm-troubleshooting/internal/collector/services_test.go +++ b/customers/vm-troubleshooting/internal/collector/services_test.go @@ -87,48 +87,47 @@ func TestResolveExistingNilOnDoubleFailure(t *testing.T) { } } -// TestFabricManagerDowngrade_RequiresBothConditions tests all 4 combinations: -// (nvswitch absent + benign = downgrade), (absent + real error = no downgrade), -// (unknown + benign = no downgrade), (present + benign = no downgrade). +// TestFabricManagerDowngrade_RequiresBothConditions tests downgrade +// combinations. NVSwitch presence is PCI-sysfs-driven (R10); tests inject +// the iterator directly instead of faking lspci text. func TestFabricManagerDowngrade_RequiresBothConditions(t *testing.T) { - t.Parallel() + // Not t.Parallel(): the subtests mutate the package-level pciIterator. 
+ + nvSwitchDev := pciDevice{BDF: "0000:01:00.0", Vendor: "0x10de", Class: "0x068000"} + nonNVSwitch := pciDevice{BDF: "0000:02:00.0", Vendor: "0x8086", Class: "0x060400"} tests := []struct { name string - hasLspci bool - lspciOutput string + pciDevs []pciDevice + iteratorFails bool // sysfs unreadable → nvSwitchUnknown fmStatusOutput string fmStatusErr error wantDowngrade bool }{ { name: "absent_and_benign", - hasLspci: true, - lspciOutput: "00:00.0 Host bridge: Intel Corporation\n01:00.0 3D controller: NVIDIA Corporation\n", + pciDevs: []pciDevice{nonNVSwitch}, fmStatusOutput: "nvidia-fabricmanager.service\n Active: failed\n NV_WARN_NOTHING_TO_DO\n", fmStatusErr: fmt.Errorf("exit status 3"), wantDowngrade: true, }, { name: "absent_but_real_error", - hasLspci: true, - lspciOutput: "00:00.0 Host bridge: Intel Corporation\n01:00.0 3D controller: NVIDIA Corporation\n", + pciDevs: []pciDevice{nonNVSwitch}, fmStatusOutput: "nvidia-fabricmanager.service\n Active: failed\n Fatal error: unable to initialize\n", fmStatusErr: fmt.Errorf("exit status 3"), wantDowngrade: false, }, { name: "unknown_and_benign", - hasLspci: false, // no lspci → unknown - lspciOutput: "", + iteratorFails: true, fmStatusOutput: "nvidia-fabricmanager.service\n Active: failed\n NV_WARN_NOTHING_TO_DO\n", fmStatusErr: fmt.Errorf("exit status 3"), wantDowngrade: false, }, { name: "present_and_benign", - hasLspci: true, - lspciOutput: "00:00.0 Host bridge: Intel\n01:00.0 Bridge: NVIDIA NVSwitch\n", + pciDevs: []pciDevice{nvSwitchDev}, fmStatusOutput: "nvidia-fabricmanager.service\n Active: failed\n NV_WARN_NOTHING_TO_DO\n", fmStatusErr: fmt.Errorf("exit status 3"), wantDowngrade: false, @@ -137,17 +136,22 @@ func TestFabricManagerDowngrade_RequiresBothConditions(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - t.Parallel() + // Swap in the fake PCI iterator for this subtest. 
+ origIter := pciIterator + defer func() { pciIterator = origIter }() + if tt.iteratorFails { + pciIterator = func() ([]pciDevice, error) { + return nil, fmt.Errorf("sysfs unreadable") + } + } else { + devs := tt.pciDevs + pciIterator = func() ([]pciDevice, error) { return devs, nil } + } fake := executor.NewFake() fake.RootAccess = true fake.Binaries["systemctl"] = true - if tt.hasLspci { - fake.Binaries["lspci"] = true - fake.Commands["lspci"] = executor.FakeResponse{Stdout: []byte(tt.lspciOutput)} - } - fake.Commands["systemctl status nvidia-fabricmanager"] = executor.FakeResponse{ Stdout: []byte(tt.fmStatusOutput), ExitCode: 3, diff --git a/customers/vm-troubleshooting/internal/collector/storage.go b/customers/vm-troubleshooting/internal/collector/storage.go index 13a6b10..f32b8db 100644 --- a/customers/vm-troubleshooting/internal/collector/storage.go +++ b/customers/vm-troubleshooting/internal/collector/storage.go @@ -2,6 +2,7 @@ package collector import ( "context" + "fmt" "os" "path/filepath" "strings" @@ -21,6 +22,12 @@ func NewStorageCollector(exec executor.Executor, writer *output.Writer, ui ui.UI func (c *StorageCollector) Name() string { return "Storage" } func (c *StorageCollector) ID() string { return "storage" } +// Collect emits raw smartctl and nvme-cli output as artifacts; one derived +// fact (NVMe controller state). Structured parsing of SMART exit-code bits +// (smartctl --health exit status), the NVMe critical_warning bitmask, and +// per-attribute SMART thresholds is intentionally deferred to the dashboard +// layer — the raw tool output is stable enough to version separately from +// the evolving interpretation rules. 
func (c *StorageCollector) Collect(ctx context.Context) (*CollectorResult, error) { r := NewResult() @@ -33,13 +40,19 @@ func (c *StorageCollector) Collect(ctx context.Context) (*CollectorResult, error if strings.Contains(filepath.Base(dev), "p") { continue } - c.saveCommand(ctx, r, "hardware/nvme_"+filepath.Base(dev)+"_smart.txt", executor.CommandSpec{Name: "nvme", Args: []string{"smart-log", dev}, NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true}, "nvme", "storage") + base := filepath.Base(dev) + c.saveCommand(ctx, r, "hardware/nvme_"+base+"_smart.txt", executor.CommandSpec{Name: "nvme", Args: []string{"smart-log", dev}, NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true}, "nvme", "storage") + c.saveCommand(ctx, r, "hardware/nvme_"+base+"_error_log.txt", executor.CommandSpec{Name: "nvme", Args: []string{"error-log", dev, "-e", "16"}, NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true}, "nvme", "storage") + c.saveCommand(ctx, r, "hardware/nvme_"+base+"_self_test_log.txt", executor.CommandSpec{Name: "nvme", Args: []string{"self-test-log", dev}, NeedsRoot: true, Timeout: config.TimeoutQuick, IgnoreExit: true}, "nvme", "storage") } } else { c.saveSkippedArtifact(r, "hardware/nvme_list.txt", "command", "nvme", "nvme", SkipCommandUnavailable, "nvme: unavailable", "storage") } + // NVMe controller state from sysfs + c.collectNVMeControllerState(r) + // SMART data for block devices (sd*, vd*, xvd*) if c.Exec.CommandExists("smartctl") { for _, pattern := range []string{"/dev/sd*", "/dev/vd*", "/dev/xvd*"} { @@ -62,3 +75,28 @@ func (c *StorageCollector) Collect(ctx context.Context) (*CollectorResult, error return r, nil } + +// collectNVMeControllerState reads /sys/class/nvme/nvme*/state for each controller. +// "dead" state is a critical hardware signal. 
+func (c *StorageCollector) collectNVMeControllerState(r *CollectorResult) { + entries, err := os.ReadDir("/sys/class/nvme") + if err != nil { + return // no NVMe controllers present + } + var buf strings.Builder + for _, entry := range entries { + if !strings.HasPrefix(entry.Name(), "nvme") { + continue + } + statePath := filepath.Join("/sys/class/nvme", entry.Name(), "state") + data, err := os.ReadFile(statePath) + if err != nil { + continue + } + state := strings.TrimSpace(string(data)) + buf.WriteString(fmt.Sprintf("%s: %s\n", entry.Name(), state)) + } + if buf.Len() > 0 { + c.saveProbeOutput(r, "hardware/nvme_controller_state.txt", buf.String(), "text", "storage") + } +} diff --git a/customers/vm-troubleshooting/internal/collector/sysfs.go b/customers/vm-troubleshooting/internal/collector/sysfs.go new file mode 100644 index 0000000..e86dd05 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/sysfs.go @@ -0,0 +1,42 @@ +package collector + +import ( + "os" + "strconv" + "strings" +) + +// readSysfsInt reads a single integer value from a sysfs file. +// Returns 0 if the file is missing or the contents cannot be parsed. +// Use readSysfsIntOK when the caller needs to distinguish "file absent" +// from "legitimate zero value". +func readSysfsInt(path string) int64 { + n, _ := readSysfsIntOK(path) + return n +} + +// readSysfsIntOK reads a single integer value from a sysfs file. +// The second return value is true iff the file existed and contained a +// parsable integer. Callers that treat presence/absence as semantically +// distinct (e.g. PCIe link width, CPU topology IDs) should use this. +func readSysfsIntOK(path string) (int64, bool) { + data, err := os.ReadFile(path) + if err != nil { + return 0, false + } + n, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64) + if err != nil { + return 0, false + } + return n, true +} + +// readSysfsString reads a trimmed string from a sysfs file. 
+// Returns "" if the file is missing or unreadable. +func readSysfsString(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + return strings.TrimSpace(string(data)) +} diff --git a/customers/vm-troubleshooting/internal/collector/sysfs_test.go b/customers/vm-troubleshooting/internal/collector/sysfs_test.go new file mode 100644 index 0000000..b7bc567 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/sysfs_test.go @@ -0,0 +1,77 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" +) + +func TestReadSysfsInt(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + if err := os.WriteFile(filepath.Join(dir, "count"), []byte("42\n"), 0o644); err != nil { + t.Fatal(err) + } + if got := readSysfsInt(filepath.Join(dir, "count")); got != 42 { + t.Errorf("valid int: want 42, got %d", got) + } + + if err := os.WriteFile(filepath.Join(dir, "bad"), []byte("not_a_number\n"), 0o644); err != nil { + t.Fatal(err) + } + if got := readSysfsInt(filepath.Join(dir, "bad")); got != 0 { + t.Errorf("unparseable: want 0, got %d", got) + } + + if got := readSysfsInt(filepath.Join(dir, "missing")); got != 0 { + t.Errorf("missing file: want 0, got %d", got) + } +} + +func TestReadSysfsIntOK(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + if err := os.WriteFile(filepath.Join(dir, "count"), []byte("42\n"), 0o644); err != nil { + t.Fatal(err) + } + if got, ok := readSysfsIntOK(filepath.Join(dir, "count")); got != 42 || !ok { + t.Errorf("valid int: want (42,true), got (%d,%v)", got, ok) + } + + // Legitimate zero must be distinguishable from missing — the whole point of readSysfsIntOK. 
+ if err := os.WriteFile(filepath.Join(dir, "zero"), []byte("0\n"), 0o644); err != nil { + t.Fatal(err) + } + if got, ok := readSysfsIntOK(filepath.Join(dir, "zero")); got != 0 || !ok { + t.Errorf("explicit zero: want (0,true), got (%d,%v)", got, ok) + } + + if err := os.WriteFile(filepath.Join(dir, "bad"), []byte("not_a_number\n"), 0o644); err != nil { + t.Fatal(err) + } + if got, ok := readSysfsIntOK(filepath.Join(dir, "bad")); got != 0 || ok { + t.Errorf("unparseable: want (0,false), got (%d,%v)", got, ok) + } + + if got, ok := readSysfsIntOK(filepath.Join(dir, "missing")); got != 0 || ok { + t.Errorf("missing file: want (0,false), got (%d,%v)", got, ok) + } +} + +func TestReadSysfsString(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + if err := os.WriteFile(filepath.Join(dir, "label"), []byte(" DIMM_A1 \n"), 0o644); err != nil { + t.Fatal(err) + } + if got := readSysfsString(filepath.Join(dir, "label")); got != "DIMM_A1" { + t.Errorf("trimmed: want %q, got %q", "DIMM_A1", got) + } + + if got := readSysfsString(filepath.Join(dir, "missing")); got != "" { + t.Errorf("missing: want empty, got %q", got) + } +} diff --git a/customers/vm-troubleshooting/internal/collector/system.go b/customers/vm-troubleshooting/internal/collector/system.go index 9b0bb32..49a0e96 100644 --- a/customers/vm-troubleshooting/internal/collector/system.go +++ b/customers/vm-troubleshooting/internal/collector/system.go @@ -107,6 +107,14 @@ func (c *SystemCollector) Collect(ctx context.Context) (*CollectorResult, error) // --- Kernel cmdline (sanitized) --- c.saveKernelCmdline(r) + // --- dmidecode (SMBIOS hardware inventory) --- + if c.Exec.CommandExists("dmidecode") { + c.saveCommand(ctx, r, "hardware/dmidecode.txt", executor.CommandSpec{ + Name: "dmidecode", NeedsRoot: true, Timeout: config.TimeoutQuick, + }, "dmidecode", "hardware") + c.collectECCType(ctx, r) + } + // --- NUMA topology --- if c.Exec.CommandExists("numactl") { c.saveCommand(ctx, r, "hardware/numa_topology.txt", 
executor.CommandSpec{ @@ -335,3 +343,26 @@ func (c *SystemCollector) saveProcess(ctx context.Context, r *CollectorResult, p } r.AddCommandArtifact(path, spec.String(), result.ExitCode, status, false, result.TimedOut, result.Truncated, result.Duration, "ps", "processes") } + +// collectECCType runs dmidecode -t 16 to extract the memory ECC type. +func (c *SystemCollector) collectECCType(ctx context.Context, r *CollectorResult) { + spec := executor.CommandSpec{ + Name: "dmidecode", Args: []string{"-t", "16"}, + NeedsRoot: true, Timeout: config.TimeoutQuick, + } + result, stdout, _ := c.Exec.Capture(ctx, spec, 64*1024) + if result.Err != nil || result.Skipped { + r.SetFact("memory.ecc_type", "unknown") + return + } + // Parse "Error Correction Type: " from dmidecode output. + for _, line := range strings.Split(string(stdout), "\n") { + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "Error Correction Type:") { + val := strings.TrimSpace(strings.TrimPrefix(trimmed, "Error Correction Type:")) + r.SetFact("memory.ecc_type", val) + return + } + } + r.SetFact("memory.ecc_type", "unknown") +} diff --git a/customers/vm-troubleshooting/internal/collector/thermal.go b/customers/vm-troubleshooting/internal/collector/thermal.go new file mode 100644 index 0000000..e0a9ed4 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/thermal.go @@ -0,0 +1,322 @@ +package collector + +import ( + "context" + "fmt" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + + "github.com/NexGenCloud/vm-diagnostics/internal/executor" + "github.com/NexGenCloud/vm-diagnostics/internal/output" + "github.com/NexGenCloud/vm-diagnostics/internal/ui" +) + +type ThermalCollector struct{ Base } + +func NewThermalCollector(exec executor.Executor, writer *output.Writer, ui ui.UI) *ThermalCollector { + return &ThermalCollector{Base{Exec: exec, Writer: writer, UI: ui}} +} + +func (c *ThermalCollector) Name() string { return "Thermal" } +func (c *ThermalCollector) ID() 
string { return "thermal" } + +type hwmonChip struct { + Name string `json:"name"` + Path string `json:"path"` + Fans []hwmonFan `json:"fans,omitempty"` + Temps []hwmonTemp `json:"temps,omitempty"` + Voltages []hwmonVolt `json:"voltages,omitempty"` +} + +type hwmonFan struct { + Label string `json:"label,omitempty"` + RPM int64 `json:"rpm"` + Alarm bool `json:"alarm"` +} + +type hwmonTemp struct { + Label string `json:"label,omitempty"` + InputMC int64 `json:"input_mc"` // millidegrees C + CritMC *int64 `json:"crit_mc,omitempty"` + Alarm bool `json:"alarm"` +} + +type hwmonVolt struct { + Label string `json:"label,omitempty"` + InputMV int64 `json:"input_mv"` // millivolts + Alarm bool `json:"alarm"` +} + +type thermalZone struct { + Type string `json:"type"` + TempMC int64 `json:"temp_mc"` + TripPoints []tripPoint `json:"trip_points,omitempty"` +} + +type tripPoint struct { + Type string `json:"type"` + TempMC int64 `json:"temp_mc"` +} + +type thermalStatus struct { + Hwmon []hwmonChip `json:"hwmon"` + Zones []thermalZone `json:"thermal_zones"` + CPUThrottle *cpuThrottle `json:"cpu_throttle,omitempty"` +} + +// cpuThrottle summarizes per-package and per-core thermal throttle counters. +// The kernel replicates each count to every CPU in the package/core, so naive +// summation across cpu*/thermal_throttle/* double-counts. This struct records +// the deduplicated totals plus per-package / per-core detail for audit. 
+type cpuThrottle struct { + PackageTotal int64 `json:"package_events_total"` + CoreTotal int64 `json:"core_events_total"` + Packages []throttlePackage `json:"packages,omitempty"` + Cores []throttleCore `json:"cores,omitempty"` +} + +type throttlePackage struct { + PackageID int `json:"package_id"` + Events int64 `json:"events"` +} + +type throttleCore struct { + PackageID int `json:"package_id"` + CoreID int `json:"core_id"` + Events int64 `json:"events"` +} + +func (c *ThermalCollector) Collect(ctx context.Context) (*CollectorResult, error) { + r := NewResult() + + status := thermalStatus{} + fanAlarms, tempAlarms, critExceeded := 0, 0, 0 + + // Enumerate /sys/class/hwmon/ + hwmonEntries, _ := os.ReadDir("/sys/class/hwmon") + for _, entry := range hwmonEntries { + hwPath := filepath.Join("/sys/class/hwmon", entry.Name()) + chip := hwmonChip{ + Name: readSysfsString(filepath.Join(hwPath, "name")), + Path: hwPath, + } + + // Fans + for i := 1; ; i++ { + inputPath := filepath.Join(hwPath, fmt.Sprintf("fan%d_input", i)) + if _, err := os.Stat(inputPath); err != nil { + break + } + fan := hwmonFan{ + Label: readSysfsString(filepath.Join(hwPath, fmt.Sprintf("fan%d_label", i))), + RPM: readSysfsInt(inputPath), + Alarm: readSysfsInt(filepath.Join(hwPath, fmt.Sprintf("fan%d_alarm", i))) == 1, + } + // Only count explicit alarm bits. RPM==0 is ambiguous: disabled + // fan bays and BMC-managed chassis with no kernel register access + // legitimately report fan*_input=0 with alarm=0. Raw RPM remains + // in the JSON artifact for human review. 
+ if fan.Alarm { + fanAlarms++ + } + chip.Fans = append(chip.Fans, fan) + } + + // Temps + for i := 1; ; i++ { + inputPath := filepath.Join(hwPath, fmt.Sprintf("temp%d_input", i)) + if _, err := os.Stat(inputPath); err != nil { + break + } + temp := hwmonTemp{ + Label: readSysfsString(filepath.Join(hwPath, fmt.Sprintf("temp%d_label", i))), + InputMC: readSysfsInt(inputPath), + Alarm: readSysfsInt(filepath.Join(hwPath, fmt.Sprintf("temp%d_alarm", i))) == 1, + } + critPath := filepath.Join(hwPath, fmt.Sprintf("temp%d_crit", i)) + if critVal := readSysfsInt(critPath); critVal > 0 { + temp.CritMC = &critVal + // Kernel temp*_alarm latches on strict greater-than; modern + // CPUs can hover at TjMax under load without an alarm. Use + // > to match the kernel's semantics. + if temp.InputMC > critVal { + critExceeded++ + } + } + if temp.Alarm { + tempAlarms++ + } + chip.Temps = append(chip.Temps, temp) + } + + // Voltages + for i := 0; ; i++ { + inputPath := filepath.Join(hwPath, fmt.Sprintf("in%d_input", i)) + if _, err := os.Stat(inputPath); err != nil { + break + } + volt := hwmonVolt{ + Label: readSysfsString(filepath.Join(hwPath, fmt.Sprintf("in%d_label", i))), + InputMV: readSysfsInt(inputPath), + Alarm: readSysfsInt(filepath.Join(hwPath, fmt.Sprintf("in%d_alarm", i))) == 1, + } + chip.Voltages = append(chip.Voltages, volt) + } + + // Only include chips that have data + if len(chip.Fans) > 0 || len(chip.Temps) > 0 || len(chip.Voltages) > 0 { + status.Hwmon = append(status.Hwmon, chip) + } + } + + // Enumerate /sys/class/thermal/thermal_zone*/ + zoneEntries, _ := os.ReadDir("/sys/class/thermal") + for _, entry := range zoneEntries { + if !strings.HasPrefix(entry.Name(), "thermal_zone") { + continue + } + zonePath := filepath.Join("/sys/class/thermal", entry.Name()) + zone := thermalZone{ + Type: readSysfsString(filepath.Join(zonePath, "type")), + TempMC: readSysfsInt(filepath.Join(zonePath, "temp")), + } + + // Trip points + for i := 0; ; i++ { + typePath := 
filepath.Join(zonePath, fmt.Sprintf("trip_point_%d_type", i)) + tempPath := filepath.Join(zonePath, fmt.Sprintf("trip_point_%d_temp", i)) + if _, err := os.Stat(typePath); err != nil { + break + } + tp := tripPoint{ + Type: readSysfsString(typePath), + TempMC: readSysfsInt(tempPath), + } + zone.TripPoints = append(zone.TripPoints, tp) + } + status.Zones = append(status.Zones, zone) + } + + // CPU throttle counters: topology-aware dedup so per-CPU replicas + // of the same package/core count are summed only once. + status.CPUThrottle = collectCPUThrottle() + + // Skip only when ALL three sources are absent. A throttle-only host + // (e.g. minimal container with CPU topology but no hwmon) must still + // surface package/core throttle facts. + if len(status.Hwmon) == 0 && len(status.Zones) == 0 && status.CPUThrottle == nil { + r.RecordSkip(SkipSourceUnavailable, "no hwmon, thermal zone, or CPU throttle data found") + return r, nil + } + + r.SetFact("thermal.fan_alarm_count", strconv.Itoa(fanAlarms)) + r.SetFact("thermal.temp_alarm_count", strconv.Itoa(tempAlarms)) + r.SetFact("thermal.critical_trip_exceeded", strconv.Itoa(critExceeded)) + + if status.CPUThrottle != nil { + r.SetFact("thermal.package_throttle_events_total", strconv.FormatInt(status.CPUThrottle.PackageTotal, 10)) + r.SetFact("thermal.core_throttle_events_total", strconv.FormatInt(status.CPUThrottle.CoreTotal, 10)) + } else { + r.SetFact("thermal.package_throttle_events_total", "unavailable") + r.SetFact("thermal.core_throttle_events_total", "unavailable") + } + + c.saveJSONProbe(r, "hardware/thermal_sensors.json", status, "thermal", "hardware") + + return r, nil +} + +// collectCPUThrottle walks /sys/devices/system/cpu/cpu[0-9]+/, reading +// thermal_throttle counters deduplicated per package and per (pkg,core). +// Returns nil when no CPU topology data is readable (e.g. containerized +// environments without the /sys mount). 
+func collectCPUThrottle() *cpuThrottle { + return collectCPUThrottleAt("/sys/devices/system/cpu") +} + +// collectCPUThrottleAt is the path-injectable core of collectCPUThrottle. +// Production calls use /sys/devices/system/cpu; tests point at a fake tree. +func collectCPUThrottleAt(cpuBase string) *cpuThrottle { + entries, err := os.ReadDir(cpuBase) + if err != nil { + return nil + } + type cpuIndex struct { + n int + path string + } + var cpus []cpuIndex + for _, e := range entries { + name := e.Name() + if !strings.HasPrefix(name, "cpu") { + continue + } + nStr := strings.TrimPrefix(name, "cpu") + if nStr == "" { + continue + } + n, err := strconv.Atoi(nStr) + if err != nil { + continue // skips cpuidle, cpufreq, cpuhotplug, etc. + } + cpus = append(cpus, cpuIndex{n: n, path: filepath.Join(cpuBase, name)}) + } + if len(cpus) == 0 { + return nil + } + sort.Slice(cpus, func(i, j int) bool { return cpus[i].n < cpus[j].n }) + + type coreKey struct{ pkg, core int } + seenPkg := map[int]bool{} + seenCore := map[coreKey]bool{} + var packages []throttlePackage + var cores []throttleCore + var pkgTotal, coreTotal int64 + readAny := false + + for _, cpu := range cpus { + pkgID, pkgOK := readSysfsIntOK(filepath.Join(cpu.path, "topology", "physical_package_id")) + coreID, coreOK := readSysfsIntOK(filepath.Join(cpu.path, "topology", "core_id")) + if !pkgOK || !coreOK { + continue + } + pkg := int(pkgID) + core := int(coreID) + if !seenPkg[pkg] { + seenPkg[pkg] = true + events, ok := readSysfsIntOK(filepath.Join(cpu.path, "thermal_throttle", "package_throttle_count")) + if ok { + readAny = true + pkgTotal += events + packages = append(packages, throttlePackage{PackageID: pkg, Events: events}) + } + } + k := coreKey{pkg: pkg, core: core} + if !seenCore[k] { + seenCore[k] = true + events, ok := readSysfsIntOK(filepath.Join(cpu.path, "thermal_throttle", "core_throttle_count")) + if ok { + readAny = true + coreTotal += events + cores = append(cores, throttleCore{PackageID: pkg, 
CoreID: core, Events: events}) + } + } + } + // Return nil when no throttle counter was actually read. Kernel exposes + // thermal_throttle only on architectures/platforms that support it + // (Intel/AMD x86 with msr driver); ARM hosts and many container mounts + // have /sys/devices/system/cpu/ but no thermal_throttle dir. Reporting + // "0" in that case would misrepresent "not probed" as "probed, clean". + if !readAny { + return nil + } + return &cpuThrottle{ + PackageTotal: pkgTotal, + CoreTotal: coreTotal, + Packages: packages, + Cores: cores, + } +} diff --git a/customers/vm-troubleshooting/internal/collector/thermal_test.go b/customers/vm-troubleshooting/internal/collector/thermal_test.go new file mode 100644 index 0000000..82e57f6 --- /dev/null +++ b/customers/vm-troubleshooting/internal/collector/thermal_test.go @@ -0,0 +1,144 @@ +package collector + +import ( + "os" + "path/filepath" + "strconv" + "testing" +) + +// writeFakeCPU stages one /sys-style cpu dir under root. +func writeFakeCPU(t *testing.T, root string, n, pkg, core int, pkgEvents, coreEvents string) { + t.Helper() + base := filepath.Join(root, "cpu"+strconv.Itoa(n)) + if err := os.MkdirAll(filepath.Join(base, "topology"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.MkdirAll(filepath.Join(base, "thermal_throttle"), 0o755); err != nil { + t.Fatal(err) + } + mustWrite(t, filepath.Join(base, "topology", "physical_package_id"), strconv.Itoa(pkg)) + mustWrite(t, filepath.Join(base, "topology", "core_id"), strconv.Itoa(core)) + if pkgEvents != "" { + mustWrite(t, filepath.Join(base, "thermal_throttle", "package_throttle_count"), pkgEvents) + } + if coreEvents != "" { + mustWrite(t, filepath.Join(base, "thermal_throttle", "core_throttle_count"), coreEvents) + } +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + if err := os.WriteFile(path, []byte(content+"\n"), 0o644); err != nil { + t.Fatal(err) + } +} + +// TestCollectCPUThrottle_Dedup is the correctness check for 
R18: hyperthread +// siblings replicate the same package and core throttle counters, and a naive +// sum would double-count. The topology we build here: +// +// cpu0: pkg=0 core=0 (pkg_events=10, core_events=3) +// cpu1: pkg=0 core=0 (HT sibling of cpu0 — same counters mirrored) +// cpu2: pkg=0 core=1 (pkg_events=10 — same package, MUST NOT re-add) +// (core_events=5) +// cpu3: pkg=0 core=1 (HT sibling of cpu2 — mirror) +// cpu4: pkg=1 core=0 (pkg_events=7, core_events=1) +// cpu5: pkg=1 core=0 (HT sibling) +// +// Expected totals: +// +// package_events_total = 10 (pkg 0) + 7 (pkg 1) = 17 +// core_events_total = 3 (pkg0 core0) + 5 (pkg0 core1) + 1 (pkg1 core0) = 9 +func TestCollectCPUThrottle_Dedup(t *testing.T) { + t.Parallel() + root := t.TempDir() + + // pkg 0, core 0 + writeFakeCPU(t, root, 0, 0, 0, "10", "3") + writeFakeCPU(t, root, 1, 0, 0, "10", "3") // HT sibling — mirror + // pkg 0, core 1 + writeFakeCPU(t, root, 2, 0, 1, "10", "5") // same pkg — must not re-add pkg count + writeFakeCPU(t, root, 3, 0, 1, "10", "5") + // pkg 1, core 0 + writeFakeCPU(t, root, 4, 1, 0, "7", "1") + writeFakeCPU(t, root, 5, 1, 0, "7", "1") + + // Decoy siblings that aren't cpu[0-9]+ — must be ignored. 
+ if err := os.MkdirAll(filepath.Join(root, "cpuidle"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.MkdirAll(filepath.Join(root, "cpufreq"), 0o755); err != nil { + t.Fatal(err) + } + + got := collectCPUThrottleAt(root) + if got == nil { + t.Fatal("expected non-nil cpuThrottle") + } + if got.PackageTotal != 17 { + t.Errorf("package_events_total: want 17, got %d", got.PackageTotal) + } + if got.CoreTotal != 9 { + t.Errorf("core_events_total: want 9, got %d", got.CoreTotal) + } + if len(got.Packages) != 2 { + t.Errorf("packages: want 2 unique, got %d (%+v)", len(got.Packages), got.Packages) + } + if len(got.Cores) != 3 { + t.Errorf("cores: want 3 unique, got %d (%+v)", len(got.Cores), got.Cores) + } +} + +// TestCollectCPUThrottle_NoTopology: containers / bare sandboxes without +// topology files must return nil, not garbage. The thermal collector uses +// the nil sentinel to drive the "unavailable" fact value. +func TestCollectCPUThrottle_NoTopology(t *testing.T) { + t.Parallel() + root := t.TempDir() + // Directory exists but no cpu entries — mirrors a minimal sandbox. + if got := collectCPUThrottleAt(root); got != nil { + t.Errorf("expected nil, got %+v", got) + } +} + +// TestCollectCPUThrottle_NoThrottleFiles: when topology is readable but +// no CPU exposes thermal_throttle counters (common on ARM, and on some +// containerized /sys mounts), the function must return nil so the fact +// is reported as "unavailable" rather than a misleading "0". +func TestCollectCPUThrottle_NoThrottleFiles(t *testing.T) { + t.Parallel() + root := t.TempDir() + // Two CPUs with topology but no thermal_throttle directory at all. 
+ for i := 0; i < 2; i++ { + base := filepath.Join(root, "cpu"+strconv.Itoa(i)) + if err := os.MkdirAll(filepath.Join(base, "topology"), 0o755); err != nil { + t.Fatal(err) + } + mustWrite(t, filepath.Join(base, "topology", "physical_package_id"), "0") + mustWrite(t, filepath.Join(base, "topology", "core_id"), strconv.Itoa(i)) + } + if got := collectCPUThrottleAt(root); got != nil { + t.Errorf("expected nil (no throttle files readable), got %+v", got) + } +} + +// TestCollectCPUThrottle_PartialTopology: when a cpuN is missing topology +// files, it must be skipped (no nil pointer, no garbage contribution). +func TestCollectCPUThrottle_PartialTopology(t *testing.T) { + t.Parallel() + root := t.TempDir() + // cpu0 with topology; cpu1 with no topology dir (must be skipped silently). + writeFakeCPU(t, root, 0, 0, 0, "4", "2") + if err := os.MkdirAll(filepath.Join(root, "cpu1"), 0o755); err != nil { + t.Fatal(err) + } + + got := collectCPUThrottleAt(root) + if got == nil { + t.Fatal("expected non-nil when at least one CPU has topology") + } + if got.PackageTotal != 4 || got.CoreTotal != 2 { + t.Errorf("partial: want (4,2), got (%d,%d)", got.PackageTotal, got.CoreTotal) + } +} diff --git a/customers/vm-troubleshooting/internal/executor/fake.go b/customers/vm-troubleshooting/internal/executor/fake.go index a9d2185..bb51804 100644 --- a/customers/vm-troubleshooting/internal/executor/fake.go +++ b/customers/vm-troubleshooting/internal/executor/fake.go @@ -23,6 +23,11 @@ type FakeExecutor struct { Commands map[string]FakeResponse Binaries map[string]bool RootAccess bool + + // Calls records every CommandSpec passed through RunToFile or Capture, + // in invocation order. Tests inspect this to assert argument sets, + // timeouts, and absence of expected-but-never-run commands. + Calls []CommandSpec } // NewFake creates a FakeExecutor with empty maps. 
@@ -47,6 +52,7 @@ func (f *FakeExecutor) commandKey(spec CommandSpec) string { } func (f *FakeExecutor) RunToFile(_ context.Context, spec CommandSpec, w io.Writer) CommandResult { + f.Calls = append(f.Calls, spec) if spec.NeedsRoot && !f.RootAccess { fmt.Fprintln(w, "[SKIPPED - requires root privileges]") return CommandResult{Skipped: true} @@ -73,6 +79,7 @@ func (f *FakeExecutor) RunToFile(_ context.Context, spec CommandSpec, w io.Write } func (f *FakeExecutor) Capture(_ context.Context, spec CommandSpec, _ int64) (CommandResult, []byte, []byte) { + f.Calls = append(f.Calls, spec) if spec.NeedsRoot && !f.RootAccess { return CommandResult{Skipped: true}, []byte("[SKIPPED - requires root privileges]"), nil diff --git a/customers/vm-troubleshooting/internal/output/manifest.go b/customers/vm-troubleshooting/internal/output/manifest.go index 5555336..d93c324 100644 --- a/customers/vm-troubleshooting/internal/output/manifest.go +++ b/customers/vm-troubleshooting/internal/output/manifest.go @@ -123,6 +123,18 @@ var integerFactKeys = map[string]bool{ // P1.3 SystemCollector "meminfo.mem_available_kib": true, "meminfo.hugetlb_kib": true, "meminfo.swap_total_kib": true, "meminfo.swap_free_kib": true, + // Hardware failure signal expansion + "edac.ue_total": true, "edac.ce_total": true, "edac.dimm_count": true, + "pcie.aer_fatal_total": true, "pcie.aer_nonfatal_total": true, "pcie.aer_devices_with_errors": true, + "pcie.link_speed_degraded_count": true, "pcie.link_width_degraded_count": true, + "nic.hw_error_interfaces": true, "nic.link_flap_interfaces": true, "devlink.fw_fatal_count": true, + "thermal.fan_alarm_count": true, "thermal.temp_alarm_count": true, "thermal.critical_trip_exceeded": true, + "thermal.package_throttle_events_total": true, "thermal.core_throttle_events_total": true, + "crash_dump_count": true, + "gpu.recovery_action_required": true, "gpu.inforom_invalid_count": true, "gpu.row_remap_failure_count": true, + // NOTE: ipmi.critical_sensor_count, 
ipmi.sel_critical_event_count, smart.failing_devices, + // nvme.critical_warning_devices are planned but not yet populated by any collector. + // Add them here when IPMI sensor parsing and smartctl exit-bit parsing are implemented. } // ConvertFacts converts string facts to typed JSON values per the explicit allowlist. diff --git a/customers/vm-troubleshooting/internal/output/report.go b/customers/vm-troubleshooting/internal/output/report.go index 5620fc6..8a2283e 100644 --- a/customers/vm-troubleshooting/internal/output/report.go +++ b/customers/vm-troubleshooting/internal/output/report.go @@ -45,7 +45,7 @@ type ReportRecord struct { ErrorCount int `json:"error_count,omitempty"` } -const reportSchemaVersion = "3.3.0" +const reportSchemaVersion = "3.4.0" // WriteReport writes report.ndjson from manifest input data. // Order is deterministic: per collector (registration order) → artifacts → issues → facts (sorted) → summary. diff --git a/customers/vm-troubleshooting/internal/runner/runner.go b/customers/vm-troubleshooting/internal/runner/runner.go index ad2ccd7..719c3ee 100644 --- a/customers/vm-troubleshooting/internal/runner/runner.go +++ b/customers/vm-troubleshooting/internal/runner/runner.go @@ -155,6 +155,10 @@ func (r *Runner) Run(ctx context.Context) (*RunResult, error) { registry.Register(collector.NewAdditionalCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewStorageCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewInfiniBandCollector(r.Exec, writer, r.UI)) + registry.Register(collector.NewEDACCollector(r.Exec, writer, r.UI)) + registry.Register(collector.NewPCIeCollector(r.Exec, writer, r.UI)) + registry.Register(collector.NewIPMICollector(r.Exec, writer, r.UI)) + registry.Register(collector.NewThermalCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewHypervisorCollector(r.Exec, writer, r.UI)) registry.Register(collector.NewOVSCollector(r.Exec, writer, r.UI)) @@ -366,7 +370,7 @@ func (r *Runner) Run(ctx 
context.Context) (*RunResult, error) { } manifestMeta := output.ManifestMeta{ - SchemaVersion: "3.3.0", + SchemaVersion: "3.4.0", SchemaRef: "schemas/manifest.schema.json", ArchiveID: archiveName, Version: config.Version, diff --git a/customers/vm-troubleshooting/internal/triage/critical.go b/customers/vm-troubleshooting/internal/triage/critical.go index 41fa390..93c073a 100644 --- a/customers/vm-troubleshooting/internal/triage/critical.go +++ b/customers/vm-troubleshooting/internal/triage/critical.go @@ -31,6 +31,7 @@ type CriticalPattern struct { // NOTE: Xid/SXid is owned by triage/xid.go; OOM is owned by collector/journal.go. // Do not add patterns here that duplicate those owners. var criticalPatterns = []CriticalPattern{ + // --- Existing patterns --- { Name: "Kernel Panic", Code: FindingCriticalLog, @@ -41,6 +42,18 @@ var criticalPatterns = []CriticalPattern{ Confidence: collector.ConfidenceHigh, KernelOnly: true, }, + { + // Soft lockup: more specific than Kernel BUG — must precede it. + // Low confidence: many non-HW causes (bugs, starvation, console slowness). + Name: "Soft Lockup", + Code: FindingCriticalLog, + FingerprintKey: "soft_lockup", + Pattern: regexp.MustCompile(`(?i)BUG: soft lockup - CPU#\d+ stuck for \d+s`), + Severity: collector.SeverityWarning, + Category: "KERN", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, { Name: "Kernel BUG", Code: FindingCriticalLog, @@ -112,6 +125,33 @@ var criticalPatterns = []CriticalPattern{ Confidence: collector.ConfidenceHigh, }, { + // NETDEV WATCHDOG must precede the generic Timeout pattern (first match wins). 
+ Name: "NETDEV Watchdog TX Hung", + Code: FindingCriticalLog, + FingerprintKey: "netdev_watchdog", + Pattern: regexp.MustCompile(`(?i)NETDEV WATCHDOG: \S+ \(\S+\): transmit queue \d+ timed out`), + Severity: collector.SeverityWarning, + Category: "NET", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + // Specific PCIe hotplug timeout — must precede the generic Timeout + // pattern. Surfaces genuine hardware-bus signals that would otherwise + // be lumped into the generic TIMEOUT bucket. + Name: "PCIe Hotplug Timeout", + Code: FindingCriticalLog, + FingerprintKey: "pcie_hotplug_timeout", + Pattern: regexp.MustCompile(`(?i)pcieport [0-9a-f:.]+: pciehp: Timeout on hotplug command`), + Severity: collector.SeverityWarning, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + // KernelOnly: userspace "timed out" messages (systemd-networkd-wait-online, + // libvirt keepalives) are not hardware signals. Kernel-emitted device + // timeouts (mlx5 cmd_work_handler, etc.) remain covered. 
Name: "Timeout", Code: FindingCriticalLog, FingerprintKey: "timeout", @@ -119,12 +159,353 @@ var criticalPatterns = []CriticalPattern{ Severity: collector.SeverityWarning, Category: "TIMEOUT", Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + + // --- Hardware failure signal expansion: high-confidence CRITICAL --- + { + Name: "PCIe AER Fatal", + Code: FindingCriticalLog, + FingerprintKey: "pcie_aer_fatal", + Pattern: regexp.MustCompile(`(?i)PCIe Bus Error: severity=Uncorrectable \(Fatal\)`), + Severity: collector.SeverityCritical, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "EDAC Uncorrectable Error", + Code: FindingCriticalLog, + FingerprintKey: "edac_ue", + Pattern: regexp.MustCompile(`(?i)EDAC MC\d+: \d+ UE`), + Severity: collector.SeverityCritical, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "EDAC MCE Memory Error", + Code: FindingCriticalLog, + FingerprintKey: "edac_mce", + Pattern: regexp.MustCompile(`(?i)EDAC \w+ MC\d+: HANDLING MCE MEMORY ERROR`), + Severity: collector.SeverityCritical, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "GHES Fatal Hardware Error", + Code: FindingCriticalLog, + FingerprintKey: "ghes_fatal", + Pattern: regexp.MustCompile(`(?i)\{[^}]*\}\[Hardware Error\]:.*event severity: fatal`), + Severity: collector.SeverityCritical, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "Memory Failure (hwpoison)", + Code: FindingCriticalLog, + FingerprintKey: "memory_failure", + Pattern: regexp.MustCompile(`(?i)Memory failure: 0x[0-9a-f]+:`), + Severity: collector.SeverityCritical, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "NVMe Controller Down", + Code: FindingCriticalLog, + FingerprintKey: "nvme_controller_down", + Pattern: regexp.MustCompile(`(?i)nvme nvme\d+: controller is down`), + 
Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "NVMe Device Not Ready", + Code: FindingCriticalLog, + FingerprintKey: "nvme_not_ready", + Pattern: regexp.MustCompile(`(?i)nvme nvme\d+: Device not ready; aborting`), + Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "SCSI Medium Error", + Code: FindingCriticalLog, + FingerprintKey: "scsi_medium_error", + Pattern: regexp.MustCompile(`(?i)Sense Key\s*:\s*Medium Error`), + Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + // Confidence: low. Optical drives, USB SD readers, and removable media + // legitimately emit "Sense Key : Hardware Error" on benign timing + // quirks. The finding is still surfaced to the dashboard; it just + // doesn't drive RMA-level triage. + Name: "SCSI Hardware Error", + Code: FindingCriticalLog, + FingerprintKey: "scsi_hw_error", + Pattern: regexp.MustCompile(`(?i)Sense Key\s*:\s*Hardware Error`), + Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + Name: "Block I/O Error", + Code: FindingCriticalLog, + FingerprintKey: "block_io_error", + Pattern: regexp.MustCompile(`(?i)blk_update_request: I/O error, dev`), + Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "Filesystem Read-Only", + Code: FindingCriticalLog, + FingerprintKey: "fs_readonly", + Pattern: regexp.MustCompile(`(?i)Remounting filesystem read-only`), + Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + // Confidence: low. "SATA link down" at boot on unconnected ports is + // normal PHY training on empty controllers. 
Real SATA death is + // covered by COMRESET, ATA Hard Reset, and Block I/O Error. + Name: "SATA Link Down", + Code: FindingCriticalLog, + FingerprintKey: "sata_link_down", + Pattern: regexp.MustCompile(`(?i)ata\d+: SATA link down`), + Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + // Accept kernel's canonical subtarget form `ata1.00: COMRESET failed` + // (dot-subtarget notation for Port-Multiplier-aware hosts). + Name: "COMRESET Failed", + Code: FindingCriticalLog, + FingerprintKey: "comreset_failed", + Pattern: regexp.MustCompile(`(?i)ata\d+(?:\.\d+)?: COMRESET failed`), + Severity: collector.SeverityCritical, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "NMI Hard Lockup", + Code: FindingCriticalLog, + FingerprintKey: "hard_lockup", + Pattern: regexp.MustCompile(`(?i)Watchdog detected hard LOCKUP on cpu \d+`), + Severity: collector.SeverityCritical, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "MLX5 Firmware Fatal", + Code: FindingCriticalLog, + FingerprintKey: "mlx5_fw_fatal", + Pattern: regexp.MustCompile(`(?i)mlx5_core [0-9a-f:.]+:.*firmware fatal error`), + Severity: collector.SeverityCritical, + Category: "NET", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + + // --- Hardware failure signal expansion: high-confidence WARNING --- + { + Name: "PCIe AER Non-Fatal", + Code: FindingCriticalLog, + FingerprintKey: "pcie_aer_nonfatal", + Pattern: regexp.MustCompile(`(?i)PCIe Bus Error: severity=Uncorrectable \(Non-Fatal\)`), + Severity: collector.SeverityWarning, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "PCIe AER Correctable", + Code: FindingCriticalLog, + FingerprintKey: "pcie_aer_correctable", + Pattern: regexp.MustCompile(`(?i)PCIe Bus Error: severity=Corrected`), + Severity: collector.SeverityWarning, + 
Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "EDAC Correctable Error", + Code: FindingCriticalLog, + FingerprintKey: "edac_ce", + Pattern: regexp.MustCompile(`(?i)EDAC MC\d+: \d+ CE`), + Severity: collector.SeverityWarning, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "GHES Correctable Error", + Code: FindingCriticalLog, + FingerprintKey: "ghes_correctable", + Pattern: regexp.MustCompile(`(?i)\{[^}]*\}\[Hardware Error\]:.*event severity: (corrected|recoverable)`), + Severity: collector.SeverityWarning, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "BERT Previous Boot Error", + Code: FindingCriticalLog, + FingerprintKey: "bert_error", + Pattern: regexp.MustCompile(`(?i)BERT: Error records from previous boot`), + Severity: collector.SeverityWarning, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + Name: "CPU Thermal Throttle", + Code: FindingCriticalLog, + FingerprintKey: "cpu_thermal", + Pattern: regexp.MustCompile(`(?i)CPU\d+: (?:Package|Core) temperature above threshold`), + Severity: collector.SeverityWarning, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + // Covers both the block-layer form (`blk_update_request: I/O error, + // dev nvme0n1`) and the native driver form (`nvme nvme0: I/O error`) + // which is emitted for controller-level I/O errors. + Name: "NVMe I/O Error", + Code: FindingCriticalLog, + FingerprintKey: "nvme_io_error", + Pattern: regexp.MustCompile(`(?i)(?:I/O error, dev nvme\d+n\d+|nvme nvme\d+: I/O error)`), + Severity: collector.SeverityWarning, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + // Covers both the timeout-driven reset and the post-abort reset path + // (`nvme nvme0: resetting controller`) which omits "timeout,". 
+ Name: "NVMe Controller Reset", + Code: FindingCriticalLog, + FingerprintKey: "nvme_reset", + Pattern: regexp.MustCompile(`(?i)nvme nvme\d+: .*(?:timeout, reset controller|resetting controller)`), + Severity: collector.SeverityWarning, + Category: "DISK", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + { + // CAVEAT: Kernel 6.9+ (Ubuntu 24.04 HWE, 25.x) enables AHCI Link Power + // Management by default, which can produce ata error/reset messages during + // normal power state transitions. Low confidence — only meaningful when + // correlated with SMART failure or I/O errors. + Name: "ATA Failed Command", + Code: FindingCriticalLog, + FingerprintKey: "ata_failed_cmd", + Pattern: regexp.MustCompile(`(?i)ata\d+\.\d+: failed command:`), + Severity: collector.SeverityWarning, + Category: "DISK", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + // See AHCI LPM caveat above. + Name: "ATA Hard Reset", + Code: FindingCriticalLog, + FingerprintKey: "ata_hard_reset", + Pattern: regexp.MustCompile(`(?i)ata\d+: hard resetting link`), + Severity: collector.SeverityWarning, + Category: "DISK", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + Name: "RAS Page Retirement", + Code: FindingCriticalLog, + FingerprintKey: "ras_page_retire", + Pattern: regexp.MustCompile(`(?i)RAS: Soft-offlining pfn:`), + Severity: collector.SeverityWarning, + Category: "HW", + Confidence: collector.ConfidenceHigh, + KernelOnly: true, + }, + + // --- Low-confidence kernel patterns --- + // These have many non-HW causes (bugs, starvation, console slowness). + // Only meaningful when correlated with Tier 1 hardware signals. + // They live in criticalPatterns (not lowConfidencePatterns) so they run + // against all kernel sources (dmesg, journal_kernel, journal_errors). 
+ { + Name: "RCU Stall", + Code: FindingCriticalLog, + FingerprintKey: "rcu_stall", + Pattern: regexp.MustCompile(`(?i)rcu:.*detected stalls on CPUs`), + Severity: collector.SeverityWarning, + Category: "KERN", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + Name: "Hung Task", + Code: FindingCriticalLog, + FingerprintKey: "hung_task", + Pattern: regexp.MustCompile(`(?i)INFO: task .+ blocked for more than \d+ seconds`), + Severity: collector.SeverityWarning, + Category: "KERN", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + Name: "EXT4 Error", + Code: FindingCriticalLog, + FingerprintKey: "ext4_error", + Pattern: regexp.MustCompile(`(?i)EXT4-fs error \(device`), + Severity: collector.SeverityWarning, + Category: "DISK", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + Name: "XFS CRC Error", + Code: FindingCriticalLog, + FingerprintKey: "xfs_crc_error", + Pattern: regexp.MustCompile(`(?i)XFS \(\S+\): Metadata CRC error`), + Severity: collector.SeverityWarning, + Category: "DISK", + Confidence: collector.ConfidenceLow, + KernelOnly: true, + }, + { + Name: "BTRFS Checksum Error", + Code: FindingCriticalLog, + FingerprintKey: "btrfs_csum", + Pattern: regexp.MustCompile(`(?i)BTRFS.*csum failed`), + Severity: collector.SeverityWarning, + Category: "DISK", + Confidence: collector.ConfidenceLow, + KernelOnly: true, }, } // lowConfidencePatterns are applied only to error-priority sources (journal_errors.txt). // They produce confidence=low findings to avoid flooding SUMMARY.txt. var lowConfidencePatterns = []CriticalPattern{ + // Generic catch-all — must be the only entry or last. { Name: "Error/Fail", Code: FindingCriticalLog, @@ -167,10 +548,18 @@ func isPCIHotplugLine(line string) bool { // normalizeCriticalLine strips high-cardinality tokens for deduplication. // The original line is preserved in evidence; the normalized form is only // used for dedup keys and fingerprint generation. 
+// +// Dmesg-only timestamp prefixes (kernel `[12.345]` form and human +// `[Fri Feb 6 ...]` form) are stripped to empty, not replaced with a +// placeholder: NDJSON MESSAGE fields never carry these prefixes, so +// matching the journal form requires removal. Without this, one kernel +// event found in both dmesg.txt and journal_kernel.ndjson would produce +// two distinct findings with different fingerprints. func normalizeCriticalLine(line string) string { n := pidBracketRe.ReplaceAllString(line, "[_]") - n = kernelTsRe.ReplaceAllString(n, "[_]") - n = dmesgHumanTsRe.ReplaceAllString(n, "[_]") + n = kernelTsRe.ReplaceAllString(n, "") + n = dmesgHumanTsRe.ReplaceAllString(n, "") + n = strings.TrimSpace(n) n = handlerRe.ReplaceAllString(n, "(handler_)") n = ipPortRe.ReplaceAllString(n, "_._._._:_") // Targeted: PCI BDF and timing normalization for known repetitive hotplug families. diff --git a/customers/vm-troubleshooting/internal/triage/critical_hw_test.go b/customers/vm-troubleshooting/internal/triage/critical_hw_test.go new file mode 100644 index 0000000..e317781 --- /dev/null +++ b/customers/vm-troubleshooting/internal/triage/critical_hw_test.go @@ -0,0 +1,374 @@ +package triage + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/NexGenCloud/vm-diagnostics/internal/collector" +) + +// TestHWPatterns_HighConfidenceCritical verifies that each new high-confidence +// CRITICAL hardware pattern matches expected log lines. 
+func TestHWPatterns_HighConfidenceCritical(t *testing.T) { + t.Parallel() + cases := []struct { + name string + line string + wantName string + severity collector.Severity + }{ + {"PCIe AER fatal", "[1.0] pcieport 0000:00:01.0: PCIe Bus Error: severity=Uncorrectable (Fatal), type=Transaction Layer", "PCIe AER Fatal", collector.SeverityCritical}, + {"EDAC UE", "[1.0] EDAC MC0: 1 UE ie31200 at DIMM 0", "EDAC Uncorrectable Error", collector.SeverityCritical}, + {"EDAC MCE", "[1.0] EDAC skx MC0: HANDLING MCE MEMORY ERROR", "EDAC MCE Memory Error", collector.SeverityCritical}, + {"GHES fatal", "[1.0] {1}[Hardware Error]: event severity: fatal", "GHES Fatal Hardware Error", collector.SeverityCritical}, + {"Memory failure", "[1.0] Memory failure: 0x1234abcd: recovery action for dirty LRU page", "Memory Failure (hwpoison)", collector.SeverityCritical}, + {"NVMe controller down", "[1.0] nvme nvme0: controller is down", "NVMe Controller Down", collector.SeverityCritical}, + {"SCSI medium error", "[1.0] sd 0:0:0:0: [sda] Sense Key : Medium Error", "SCSI Medium Error", collector.SeverityCritical}, + {"Block I/O error", "[1.0] blk_update_request: I/O error, dev sda, sector 12345", "Block I/O Error", collector.SeverityCritical}, + {"FS read-only", "[1.0] EXT4-fs (sda1): Remounting filesystem read-only", "Filesystem Read-Only", collector.SeverityCritical}, + // SATA Link Down was moved to ConfidenceLow (R9) — empty controllers + // at boot are normal PHY training, not hardware failure. The pattern + // still fires; it just doesn't assert here. Phase 3 R27 adds a + // low-confidence equivalent test. 
+ {"Hard lockup", "[1.0] Watchdog detected hard LOCKUP on cpu 3", "NMI Hard Lockup", collector.SeverityCritical}, + {"MLX5 fw fatal", "[1.0] mlx5_core 0000:3b:00.0: firmware fatal error detected", "MLX5 Firmware Fatal", collector.SeverityCritical}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("---\n"+tc.line+"\n"), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + found := false + for _, f := range tr.Findings { + if f.Title == tc.wantName { + found = true + if f.Severity != tc.severity { + t.Errorf("expected severity %s, got %s", tc.severity, f.Severity) + } + if f.Confidence != collector.ConfidenceHigh { + t.Errorf("expected high confidence, got %s", f.Confidence) + } + } + } + if !found { + titles := make([]string, len(tr.Findings)) + for i, f := range tr.Findings { + titles[i] = f.Title + } + t.Errorf("expected finding %q not found; got %v", tc.wantName, titles) + } + }) + } +} + +// TestHWPatterns_HighConfidenceWarning verifies warning-level hardware patterns. 
+func TestHWPatterns_HighConfidenceWarning(t *testing.T) { + t.Parallel() + cases := []struct { + name string + line string + wantName string + }{ + {"PCIe AER non-fatal", "[1.0] pcieport 0000:00:01.0: PCIe Bus Error: severity=Uncorrectable (Non-Fatal)", "PCIe AER Non-Fatal"}, + {"PCIe AER correctable", "[1.0] pcieport 0000:00:01.0: PCIe Bus Error: severity=Corrected", "PCIe AER Correctable"}, + {"EDAC CE", "[1.0] EDAC MC0: 1 CE ie31200 at DIMM 0", "EDAC Correctable Error"}, + {"BERT", "[1.0] BERT: Error records from previous boot", "BERT Previous Boot Error"}, + {"NETDEV watchdog", "[1.0] NETDEV WATCHDOG: eth0 (mlx5_core): transmit queue 5 timed out", "NETDEV Watchdog TX Hung"}, + {"CPU thermal", "[1.0] CPU0: Package temperature above threshold, cpu clock throttled", "CPU Thermal Throttle"}, + {"NVMe I/O error", "[1.0] I/O error, dev nvme0n1, sector 0 op 0x0", "NVMe I/O Error"}, + {"RAS page retire", "[1.0] RAS: Soft-offlining pfn: 0x12345", "RAS Page Retirement"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("---\n"+tc.line+"\n"), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + found := false + for _, f := range tr.Findings { + if f.Title == tc.wantName { + found = true + if f.Severity != collector.SeverityWarning { + t.Errorf("expected warning severity, got %s", f.Severity) + } + } + } + if !found { + titles := make([]string, len(tr.Findings)) + for i, f := range tr.Findings { + titles[i] = f.Title + } + t.Errorf("expected finding %q not found; got %v", tc.wantName, titles) + } + }) + } +} + +// TestHWPatterns_LowConfidence verifies low-confidence patterns match and produce +// low confidence. These are kernel-only patterns that now run against all kernel +// sources (dmesg, journal_kernel, journal_errors). 
+func TestHWPatterns_LowConfidence(t *testing.T) { + t.Parallel() + cases := []struct { + name string + line string + wantName string + }{ + {"soft lockup", "BUG: soft lockup - CPU#3 stuck for 22s!", "Soft Lockup"}, + {"RCU stall", "rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:", "RCU Stall"}, + {"hung task", "INFO: task kworker/0:1 blocked for more than 120 seconds.", "Hung Task"}, + {"ext4 error", "EXT4-fs error (device sda1): ext4_lookup: bad entry in directory", "EXT4 Error"}, + {"xfs crc", "XFS (sda1): Metadata CRC error detected at block 0x123", "XFS CRC Error"}, + {"btrfs csum", "BTRFS warning (device sda1): csum failed root 5 ino 256", "BTRFS Checksum Error"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + // These are kernel-only patterns in criticalPatterns — dmesg is a kernel source. + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), []byte("---\n"+tc.line+"\n"), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + found := false + for _, f := range tr.Findings { + if f.Title == tc.wantName { + found = true + if f.Confidence != collector.ConfidenceLow { + t.Errorf("expected low confidence, got %s", f.Confidence) + } + } + } + if !found { + titles := make([]string, len(tr.Findings)) + for i, f := range tr.Findings { + titles[i] = f.Title + } + t.Errorf("expected finding %q not found; got %v", tc.wantName, titles) + } + }) + } +} + +// TestHWPatterns_KernelOnlyNotMatchedFromUserspace verifies kernel-only patterns +// do not match non-kernel journal sources. 
+func TestHWPatterns_KernelOnlyNotMatchedFromUserspace(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/journal_errors.ndjson"), []byte( + `{"MESSAGE":"PCIe Bus Error: severity=Uncorrectable (Fatal)","PRIORITY":"3","SYSLOG_IDENTIFIER":"my-app","_SYSTEMD_UNIT":"my-app.service","_TRANSPORT":"stdout","__REALTIME_TIMESTAMP":"1","_BOOT_ID":"b1"}`+"\n", + ), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + for _, f := range tr.Findings { + if f.Title == "PCIe AER Fatal" { + t.Error("kernel-only PCIe AER pattern should not match userspace source") + } + } +} + +// TestCriticalPatternOrdering locks in the ordering invariants that correctness +// depends on. The matcher uses first-match-wins, so any more-specific pattern +// must precede its generic form. A refactor that re-sorts the slice would +// silently regress these relationships if the invariants weren't tested. +func TestCriticalPatternOrdering(t *testing.T) { + t.Parallel() + index := func(patterns []CriticalPattern, name string) int { + for i, p := range patterns { + if p.Name == name { + return i + } + } + return -1 + } + + cases := []struct { + before, after string + }{ + // Specific patterns that must precede the generic "Kernel BUG". + {"Soft Lockup", "Kernel BUG"}, + // Specific network / PCIe timeouts must precede the generic "Timeout". + {"NETDEV Watchdog TX Hung", "Timeout"}, + {"PCIe Hotplug Timeout", "Timeout"}, + // Block layer catches both SD and NVMe before the native NVMe form. 
+ {"Block I/O Error", "NVMe I/O Error"}, + } + for _, tc := range cases { + b := index(criticalPatterns, tc.before) + a := index(criticalPatterns, tc.after) + if b < 0 { + t.Errorf("pattern %q not found in criticalPatterns", tc.before) + continue + } + if a < 0 { + t.Errorf("pattern %q not found in criticalPatterns", tc.after) + continue + } + if b >= a { + t.Errorf("pattern ordering violated: %q (idx %d) must precede %q (idx %d)", + tc.before, b, tc.after, a) + } + } + + // Error/Fail is the generic catch-all in lowConfidencePatterns; it must + // stay last (or the only entry) so specific patterns added later still + // match first. + if n := len(lowConfidencePatterns); n == 0 || lowConfidencePatterns[n-1].Name != "Error/Fail" { + t.Errorf("Error/Fail must be last in lowConfidencePatterns; got %d entries, last=%q", + n, func() string { + if n == 0 { + return "" + } + return lowConfidencePatterns[n-1].Name + }()) + } +} + +// TestHWPatterns_MissingCoverage covers patterns and forms that weren't +// exercised by the original high-confidence test bucket, including the +// regex-expansion fixes (R5, R6, R7) and the confidence downgrades (R8, R9). +func TestHWPatterns_MissingCoverage(t *testing.T) { + t.Parallel() + cases := []struct { + name string + line string + wantName string + wantSev collector.Severity + wantConf collector.Confidence + }{ + // R5: COMRESET with the ata1.00: subtarget form. + { + name: "COMRESET with subtarget", + line: "[1.0] ata1.00: COMRESET failed (errno=-16)", + wantName: "COMRESET Failed", + wantSev: collector.SeverityCritical, + wantConf: collector.ConfidenceHigh, + }, + // R6: NVMe reset without "timeout," prefix. The legacy form + // "timeout, reset controller" intentionally routes to the generic + // Timeout finding (first-match-wins; see TestCriticalPatternOrdering). 
+ { + name: "NVMe reset post-abort", + line: "[1.0] nvme nvme0: resetting controller", + wantName: "NVMe Controller Reset", + wantSev: collector.SeverityWarning, + wantConf: collector.ConfidenceHigh, + }, + // R7: NVMe native driver I/O error form. The block-layer form + // (`blk_update_request: I/O error, dev nvme0n1`) intentionally + // routes to the generic Block I/O Error finding because that + // pattern is positioned earlier in the list (and covers sd/vd too). + { + name: "NVMe native I/O error", + line: "[1.0] nvme nvme0: I/O error 0x4/0x4", + wantName: "NVMe I/O Error", + wantSev: collector.SeverityWarning, + wantConf: collector.ConfidenceHigh, + }, + // R8: SCSI Hardware Error now low confidence. + { + name: "SCSI Hardware Error low confidence", + line: "[1.0] sd 0:0:0:0: [sda] Sense Key : Hardware Error", + wantName: "SCSI Hardware Error", + wantSev: collector.SeverityCritical, + wantConf: collector.ConfidenceLow, + }, + // R9: SATA Link Down now low confidence. + { + name: "SATA Link Down low confidence", + line: "[1.0] ata1: SATA link down (SStatus 0 SControl 300)", + wantName: "SATA Link Down", + wantSev: collector.SeverityCritical, + wantConf: collector.ConfidenceLow, + }, + // R16: new PCIe Hotplug Timeout pattern. + { + name: "PCIe Hotplug Timeout", + line: "[1.0] pcieport 0000:65:00.0: pciehp: Timeout on hotplug command 0x13e1 (issued 1528 msec ago)", + wantName: "PCIe Hotplug Timeout", + wantSev: collector.SeverityWarning, + wantConf: collector.ConfidenceHigh, + }, + // Block I/O Error (core disk-failure signal, no existing coverage). + { + name: "Block I/O Error", + line: "[1.0] blk_update_request: I/O error, dev sda, sector 12345", + wantName: "Block I/O Error", + wantSev: collector.SeverityCritical, + wantConf: collector.ConfidenceHigh, + }, + // AHCI LPM low-confidence patterns. 
+ { + name: "ATA Failed Command", + line: "[1.0] ata5.00: failed command: READ FPDMA QUEUED", + wantName: "ATA Failed Command", + wantSev: collector.SeverityWarning, + wantConf: collector.ConfidenceLow, + }, + { + name: "ATA Hard Reset", + line: "[1.0] ata5: hard resetting link", + wantName: "ATA Hard Reset", + wantSev: collector.SeverityWarning, + wantConf: collector.ConfidenceLow, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + workDir := t.TempDir() + os.MkdirAll(filepath.Join(workDir, "logs"), 0o755) + os.WriteFile(filepath.Join(workDir, "logs/dmesg.txt"), + []byte("---\n"+tc.line+"\n"), 0o644) + + tr, err := AnalyzeCriticalLogs(context.Background(), workDir) + if err != nil { + t.Fatal(err) + } + var got *Finding + for i := range tr.Findings { + if tr.Findings[i].Title == tc.wantName { + got = &tr.Findings[i] + break + } + } + if got == nil { + titles := make([]string, len(tr.Findings)) + for i, f := range tr.Findings { + titles[i] = f.Title + } + t.Fatalf("expected finding %q not found; got %v", tc.wantName, titles) + } + if got.Severity != tc.wantSev { + t.Errorf("severity: want %s, got %s", tc.wantSev, got.Severity) + } + if got.Confidence != tc.wantConf { + t.Errorf("confidence: want %s, got %s", tc.wantConf, got.Confidence) + } + }) + } +} diff --git a/customers/vm-troubleshooting/schemas/manifest.schema.json b/customers/vm-troubleshooting/schemas/manifest.schema.json index 16cd7ca..3d507fe 100644 --- a/customers/vm-troubleshooting/schemas/manifest.schema.json +++ b/customers/vm-troubleshooting/schemas/manifest.schema.json @@ -57,7 +57,8 @@ "procfs", "netlink", "sysctl", "ps", "top", "text", "binary", "ss", "mount", "lsmod", "pip", "docker", "nmcli", "networkctl", "resolvectl", "bridge", "netplan", "iptables", "nft", "ufw", "firewall-cmd", "ibstat", "ibstatus", "ibv_devinfo", "rdma", "apt-mark", "sh", - "hostname", "date", "uptime", "uname", "csv" + "hostname", "date", "uptime", "uname", "csv", + "ipmitool", 
"dmidecode", "ethtool", "devlink", "perfquery" ] }, "tags": { @@ -68,7 +69,8 @@ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", "packages", "storage", "infiniband", "processes", "config", "triage", - "hypervisor", "ovs" + "hypervisor", "ovs", + "edac", "ipmi", "pcie", "thermal" ] } } diff --git a/customers/vm-troubleshooting/schemas/report-record.schema.json b/customers/vm-troubleshooting/schemas/report-record.schema.json index ff0ca0e..f3222a8 100644 --- a/customers/vm-troubleshooting/schemas/report-record.schema.json +++ b/customers/vm-troubleshooting/schemas/report-record.schema.json @@ -27,7 +27,8 @@ "identity", "cpu", "memory", "disk", "hardware", "gpu", "gpu-errors", "gpu-health", "network", "firewall", "docker", "docker-security", "services", "journal", "oom", "packages", "storage", "infiniband", "processes", "config", "triage", - "hypervisor", "ovs" + "hypervisor", "ovs", + "edac", "ipmi", "pcie", "thermal" ] } }, From a2e066024beb135ce97384191716aa87d274eeed Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 15:15:34 +0200 Subject: [PATCH 13/23] feat(vm-troubleshooting-dashboard): triage UX, issue state, deploy stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deployment & packaging - Add Dockerfile (pnpm frontend build + CGO-free Go binary), docker-compose (Caddy → oauth2-proxy → dashboard), DEPLOYMENT.md runbook, .dockerignore, .env.example, Caddyfile.example, oauth2-proxy.cfg.example. - Add cmd/dashboard entrypoint: data/web flags, archive cap, graceful shutdown; require --auth-shared-token or --trust-forwarded-user on non-loopback listen; tests for listenRequiresAuth (incl. IPv6 loopback). 
Backend API & persistence - API: GET /api/v1/archives/{id}/issue-state, POST .../issue-state/{fp} with state ack|dismissed|; prefer X-Forwarded-Email over X-Forwarded-User for uploaded_by when trusting proxies. - SQLite: issue_state table; archives.uptime_seconds; collectors.skip_reasons_json; extra indexes on issues (confidence, category, collector, code, fingerprint). - Store: migrations, SetIssueState/LoadIssueStates, persist uptime and skip reasons; list/get hydrate them. - Ingest: attach collector SkipReasons; extract uptime from system/hypervisor facts (uptime_seconds / system.uptime_*); uptime unit tests. - Evidence suggestions: category-based path-prefix bonus (DISK/HW/NET/MEM/GPU); category tests. Issue-state store tests. Frontend — overview & issues - New DashboardPage: severity KPI grid, composed overview copy, host context, facts/signals, skips, collector grid, grouped issues preview (facts, boot, grouping, title, summary, source, units libs + Vitest coverage). - IssuesPage: URL-backed hideLowConf/grouped/hideDismissed; low-confidence hidden-count banner; compareIssues default sort; pattern groups vs flat list; confidence pills; IssueRowBody avoids duplicated title/message rendering. - IssueDetailPage: ack/dismiss/clear, copy summary, confidence pill, pattern siblings, occurrence/source callout, ranked prev/next + j/k shortcuts, severity badge icons. Frontend — artifacts & polish - ArtifactBrowser: path filter, severity dots from related_artifact_paths, JSON tree + system overview card, #L line hash highlight + share links. - SeverityBadge: shared severity metadata + icons; KV long-value wrapping. - Types: SkipReason, issue states, uptime_seconds; utils: sampleLine, cleanTitle for primary finding titles. - archives API: invalidate detail cache on re-upload; removeQueries on delete. - issues API: issue-state hooks. Tooling & docs - package.json: vitest + Testing Library + jsdom; pnpm-lock.yaml updated. - vitest.config.ts + vitest.setup.ts. 
- .gitignore: /dashboard anchor, ignore live Caddy/oauth2-proxy configs. - CODEMAP: link DEPLOYMENT.md and USER_GUIDE.md. --- .../.dockerignore | 39 + .../vm-troubleshooting-dashboard/.env.example | 24 + .../vm-troubleshooting-dashboard/.gitignore | 9 +- .../vm-troubleshooting-dashboard/CODEMAP.md | 2 + .../Caddyfile.example | 37 + .../DEPLOYMENT.md | 419 ++++++++++ .../vm-troubleshooting-dashboard/Dockerfile | 91 +++ .../cmd/dashboard/main.go | 107 +++ .../cmd/dashboard/main_test.go | 53 ++ .../docker-compose.yml | 115 +++ .../frontend/package.json | 12 +- .../frontend/pnpm-lock.yaml | 713 ++++++++++++++++++ .../frontend/src/api/archives.ts | 12 +- .../frontend/src/api/issues.ts | 51 +- .../artifacts/ArtifactBrowserPage.tsx | 243 +++++- .../src/components/artifacts/JsonTree.tsx | 140 ++++ .../artifacts/SystemIdentityCard.tsx | 73 ++ .../components/dashboard/DashboardPage.tsx | 418 ++++++++++ .../issue-detail/IssueDetailPage.tsx | 368 ++++++++- .../src/components/issues/IssuesPage.tsx | 479 ++++++++++-- .../src/components/ui/confidence-pill.tsx | 37 + .../frontend/src/components/ui/kv.tsx | 9 +- .../src/components/ui/severity-badge.tsx | 27 +- .../frontend/src/lib/boot.test.ts | 101 +++ .../frontend/src/lib/boot.ts | 61 ++ .../frontend/src/lib/clipboard.ts | 14 + .../frontend/src/lib/component.test.ts | 36 + .../frontend/src/lib/component.ts | 25 + .../frontend/src/lib/facts.test.ts | 94 +++ .../frontend/src/lib/facts.ts | 360 +++++++++ .../frontend/src/lib/facts.value.test.ts | 49 ++ .../frontend/src/lib/grouping.test.ts | 198 +++++ .../frontend/src/lib/grouping.ts | 107 +++ .../frontend/src/lib/ranking.test.ts | 105 +++ .../frontend/src/lib/ranking.ts | 56 ++ .../frontend/src/lib/severity.test.ts | 64 ++ .../frontend/src/lib/severity.ts | 147 ++++ .../frontend/src/lib/source.test.ts | 30 + .../frontend/src/lib/source.ts | 21 + .../frontend/src/lib/summary.ts | 127 ++++ .../frontend/src/lib/title.test.ts | 68 ++ .../frontend/src/lib/title.ts | 46 ++ 
.../frontend/src/lib/units.test.ts | 73 ++ .../frontend/src/lib/units.ts | 87 +++ .../frontend/src/lib/utils.test.ts | 21 + .../frontend/src/lib/utils.ts | 25 +- .../frontend/src/types.ts | 14 + .../frontend/vitest.config.ts | 18 + .../frontend/vitest.setup.ts | 1 + .../internal/api/server.go | 46 +- .../internal/ingest/ingest.go | 49 ++ .../internal/ingest/uptime_test.go | 90 +++ .../internal/model/types.go | 25 +- .../internal/store/evidence.go | 33 + .../internal/store/evidence_category_test.go | 76 ++ .../internal/store/issue_state_test.go | 125 +++ .../internal/store/schema.sql | 30 +- .../internal/store/store.go | 144 +++- .../oauth2-proxy.cfg.example | 50 ++ 59 files changed, 5919 insertions(+), 175 deletions(-) create mode 100644 customers/vm-troubleshooting-dashboard/.dockerignore create mode 100644 customers/vm-troubleshooting-dashboard/.env.example create mode 100644 customers/vm-troubleshooting-dashboard/Caddyfile.example create mode 100644 customers/vm-troubleshooting-dashboard/DEPLOYMENT.md create mode 100644 customers/vm-troubleshooting-dashboard/Dockerfile create mode 100644 customers/vm-troubleshooting-dashboard/cmd/dashboard/main.go create mode 100644 customers/vm-troubleshooting-dashboard/cmd/dashboard/main_test.go create mode 100644 customers/vm-troubleshooting-dashboard/docker-compose.yml create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/JsonTree.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/SystemIdentityCard.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/components/ui/confidence-pill.tsx create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.ts create mode 100644 
customers/vm-troubleshooting-dashboard/frontend/src/lib/clipboard.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.value.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/summary.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/title.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/title.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/units.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/units.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/vitest.config.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/vitest.setup.ts create mode 100644 
customers/vm-troubleshooting-dashboard/internal/ingest/uptime_test.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/evidence_category_test.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/issue_state_test.go create mode 100644 customers/vm-troubleshooting-dashboard/oauth2-proxy.cfg.example diff --git a/customers/vm-troubleshooting-dashboard/.dockerignore b/customers/vm-troubleshooting-dashboard/.dockerignore new file mode 100644 index 0000000..bc5195a --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/.dockerignore @@ -0,0 +1,39 @@ +.git +.gitignore +.dockerignore +Dockerfile +docker-compose.yml +docker-compose.*.yml +*.md + +# Build artefacts +bin/ +dashboard +frontend/dist/ +frontend/node_modules/ +node_modules/ + +# Runtime state +dashboard-data/ +*.db +*.db-journal +*.db-wal +*.db-shm + +# Local dev / editor +.env +.env.* +!.env.example +.idea/ +.vscode/ +.DS_Store +*.swp + +# Test output +*.test +*.out +coverage.* +*.coverprofile + +# CI / tooling +.github/ diff --git a/customers/vm-troubleshooting-dashboard/.env.example b/customers/vm-troubleshooting-dashboard/.env.example new file mode 100644 index 0000000..41d1c2a --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/.env.example @@ -0,0 +1,24 @@ +# Copy to .env and fill in. Do not commit the real .env. + +# Public hostname that Caddy serves on. Must resolve to this host and +# have :80/:443 reachable from the internet for Let's Encrypt to issue. +TRIAGE_HOSTNAME=triage.ngbackend.cloud + +# Address Let's Encrypt registers the ACME account under; used for +# expiry warnings. Keep it a real, monitored inbox. 
+ACME_EMAIL=cx-ops@example.com + +# From Authentik provider (DEPLOYMENT.md §8a) +OAUTH2_PROXY_CLIENT_ID= +OAUTH2_PROXY_CLIENT_SECRET= + +# Generate with: openssl rand -base64 32 | tr -- '+/' '-_' +OAUTH2_PROXY_COOKIE_SECRET= + +# Optional build metadata (surfaces in /api/v1/version if wired in main.go) +TRIAGE_VERSION=dev +TRIAGE_COMMIT=unknown +TRIAGE_BUILD_DATE=unknown + +# Container timezone (affects log timestamps only; DB stores UTC) +TZ=UTC diff --git a/customers/vm-troubleshooting-dashboard/.gitignore b/customers/vm-troubleshooting-dashboard/.gitignore index c2a37ec..bd6672d 100644 --- a/customers/vm-troubleshooting-dashboard/.gitignore +++ b/customers/vm-troubleshooting-dashboard/.gitignore @@ -1,6 +1,8 @@ # Build output bin/ -dashboard +# Leading slash anchors to this project's root only. A bare `dashboard` +# rule would match the `frontend/src/components/dashboard/` directory too. +/dashboard # Runtime / local data dashboard-data/ @@ -35,3 +37,8 @@ Thumbs.db .env .env.* !.env.example + +# Live oauth2-proxy + Caddy configs (hostnames are deployment-specific). +# The .example templates ARE committed. +oauth2-proxy.cfg +Caddyfile diff --git a/customers/vm-troubleshooting-dashboard/CODEMAP.md b/customers/vm-troubleshooting-dashboard/CODEMAP.md index 7175e9b..83c6bae 100644 --- a/customers/vm-troubleshooting-dashboard/CODEMAP.md +++ b/customers/vm-troubleshooting-dashboard/CODEMAP.md @@ -94,5 +94,7 @@ cd frontend && pnpm build ## Related docs - [`AGENTS.md`](./AGENTS.md) — dashboard rules, schema compat, tests. +- [`DEPLOYMENT.md`](./DEPLOYMENT.md) — hosting the service: systemd, backup, oauth2-proxy + Authentik. +- [`../../docs/USER_GUIDE.md`](../../docs/USER_GUIDE.md) — CX-agent triage workflow. - [`../vm-troubleshooting/CODEMAP.md`](../vm-troubleshooting/CODEMAP.md) — collector output the dashboard consumes. - [`../../docs/plans/post-audit-hardening.md`](../../docs/plans/post-audit-hardening.md) — planned hardening goals (verify in code). 
diff --git a/customers/vm-troubleshooting-dashboard/Caddyfile.example b/customers/vm-troubleshooting-dashboard/Caddyfile.example new file mode 100644 index 0000000..8d63d03 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/Caddyfile.example @@ -0,0 +1,37 @@ +# Copy to Caddyfile (no suffix). Caddy watches this file on startup only, +# so `docker compose restart caddy` after edits. +# +# Globals: email is used for Let's Encrypt account registration + expiry +# notifications. Auto-HTTPS: Caddy gets and renews certs automatically via +# ACME HTTP-01 (needs :80 reachable from the internet) or TLS-ALPN-01 +# (needs :443). No manual renewals, no cron. + +{ + email {$ACME_EMAIL} + # Uncomment while testing to hit Let's Encrypt's staging CA and avoid + # burning through the production rate limit (5 failures / hour / account). + # acme_ca https://acme-staging-v02.api.letsencrypt.org/directory +} + +{$TRIAGE_HOSTNAME} { + encode zstd gzip + + # Diagnostic archives can be large; match the ingest limit. + request_body { + max_size 256MB + } + + # Reasonable HSTS for an internal tool on its own hostname. + header { + Strict-Transport-Security "max-age=31536000; includeSubDomains" + X-Content-Type-Options "nosniff" + Referrer-Policy "same-origin" + } + + reverse_proxy oauth2-proxy:4180 { + # Caddy sets X-Forwarded-For / X-Forwarded-Proto by default. + # oauth2-proxy's trusted_proxy_ips (covers 172.16.0.0/12) must + # include this service's address on the triage-internal network. + header_up Host {host} + } +} diff --git a/customers/vm-troubleshooting-dashboard/DEPLOYMENT.md b/customers/vm-troubleshooting-dashboard/DEPLOYMENT.md new file mode 100644 index 0000000..f885976 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/DEPLOYMENT.md @@ -0,0 +1,419 @@ +# Dashboard — Deployment & Operations + +Operational runbook for the CX team hosting the Triage dashboard. +Audience: whoever owns the internal Linux host running the service. 
+
+End users (CX agents triaging archives) should read
+[`docs/USER_GUIDE.md`](../../docs/USER_GUIDE.md) instead.
+
+**Two deployment paths.** Pick one and stop reading the other:
+
+- **[Docker Compose](#0-docker-compose-recommended)** — recommended.
+  One `docker compose up -d` starts Caddy (auto-HTTPS via Let's Encrypt),
+  oauth2-proxy, and the dashboard on an isolated network. §0 covers it
+  end-to-end. Skip §2–§5 and §8d entirely.
+- **Bare metal systemd** — §2 onward. Same topology, assembled by hand.
+  Prefer this only if you can't run Docker.
+
+§1 (what's running), §6 (backup), §7 (concurrency), §8a–§8c (Authentik
++ oauth2-proxy config) apply to both paths.
+
+---
+
+## 0. Docker Compose (recommended)
+
+The repo ships a complete stack: `Dockerfile` + `docker-compose.yml` +
+templates for every config file. Topology:
+
+```
+internet → caddy :80, :443 TLS + auto-renew (Let's Encrypt)
+ → oauth2-proxy :4180 Authentik OIDC; sets X-Forwarded-User
+ → dashboard :8080 Go backend + embedded SPA
+```
+
+Only Caddy exposes host ports; the other two live on a private
+`triage-internal` bridge network.
+
+**Prerequisites on the host:**
+
+- Docker Engine 24+ with `docker compose` (v2) plugin.
+- Public DNS: `triage.ngbackend.cloud` resolves to this host.
+- Firewall: TCP `:80` and TCP+UDP `:443` reachable from the internet
+  (Let's Encrypt's ACME HTTP-01 / TLS-ALPN-01 challenge; HTTP/3 on UDP).
+  If the host is VPN-only, see the DNS-01 / `tls internal` notes in
+  `Caddyfile.example`.
+- Authentik set up per §8a / §8b, with a Provider whose redirect URI is
+  `https://<TRIAGE_HOSTNAME>/oauth2/callback`.
+
+**First-time setup:**
+
+```bash
+cd customers/vm-troubleshooting-dashboard
+
+cp .env.example .env # fill in hostname, ACME email, OIDC secrets
+cp oauth2-proxy.cfg.example oauth2-proxy.cfg # edit hostnames; see §8c
+cp Caddyfile.example Caddyfile # usually unchanged — hostname comes from .env
+
+docker compose build
+docker compose up -d
+docker compose logs -f
+```
+
+First Caddy start takes 10–30 s while it provisions the TLS cert. You
+should see `certificate obtained successfully` in the Caddy logs. If
+Let's Encrypt is rate-limiting you during testing, uncomment the
+`acme_ca` staging line in the Caddyfile.
+
+**Day two:**
+
+```bash
+docker compose pull # pull oauth2-proxy / caddy updates
+docker compose build --pull dashboard # rebuild dashboard with latest base images
+docker compose up -d # rolling replace
+docker compose logs -f
+```
+
+Persistent state lives in three named volumes: `triage_triage-data`
+(dashboard SQLite + archives), `triage_caddy-data` (certs — losing this
+forces a reissue + you'll hit the LE rate limit), `triage_caddy-config`
+(runtime Caddy state).
+
+Back up `triage_triage-data` per §6 and `triage_caddy-data`
+opportunistically. Dump a named volume with
+`docker run --rm -v <volume>:/src -v $(pwd):/dst alpine tar -C /src -czf /dst/<volume>.tgz .`.
+
+---
+
+## 1. What you're running
+
+Two artifacts, one process:
+
+- **`dashboard`** — a single Go binary (built from `cmd/dashboard`).
+- **`frontend/dist/`** — the built SPA. The binary serves it from disk via
+  the `-web-root` flag. If the directory is missing, non-API routes return
+  `404 dashboard frontend is not built`. Ship both together.
+
+State lives in one directory (`-data-dir`, default `./dashboard-data/`):
+
+- `dashboard.db` + `dashboard.db-wal` + `dashboard.db-shm` — SQLite in WAL mode.
+- `archives/<id>/…` — extracted archive trees.
+
+Nothing is written outside `-data-dir`. The dashboard is read-only w.r.t.
+archive contents after ingest.
+
+## 2.
Build & install + +From `customers/vm-troubleshooting-dashboard/`: + +```bash +cd frontend && pnpm install && pnpm build && cd .. +go build -o dashboard ./cmd/dashboard + +sudo install -d /opt/vm-dashboard /var/lib/vm-dashboard +sudo install -m 0755 dashboard /opt/vm-dashboard/dashboard +sudo cp -r frontend/dist /opt/vm-dashboard/web +sudo useradd --system --home /var/lib/vm-dashboard --shell /usr/sbin/nologin vm-dashboard || true +sudo chown -R vm-dashboard:vm-dashboard /var/lib/vm-dashboard +``` + +## 3. Run as a systemd service + +`/etc/systemd/system/vm-dashboard.service`: + +```ini +[Unit] +Description=VM Troubleshooting Dashboard +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=vm-dashboard +Group=vm-dashboard +WorkingDirectory=/var/lib/vm-dashboard +ExecStart=/opt/vm-dashboard/dashboard \ + -listen 127.0.0.1:8080 \ + -data-dir /var/lib/vm-dashboard \ + -web-root /opt/vm-dashboard/web \ + -trust-forwarded-user \ + -max-archives 5000 +Restart=on-failure +RestartSec=5s + +# Hardening +NoNewPrivileges=yes +ProtectSystem=strict +ProtectHome=yes +PrivateTmp=yes +ReadWritePaths=/var/lib/vm-dashboard +CapabilityBoundingSet= +AmbientCapabilities= + +[Install] +WantedBy=multi-user.target +``` + +```bash +sudo systemctl daemon-reload +sudo systemctl enable --now vm-dashboard +journalctl -u vm-dashboard -f +``` + +## 4. 
Flags that matter + +| Flag | Default | When to change | +|---|---|---| +| `-listen` | `127.0.0.1:8080` | Keep on loopback; let oauth2-proxy / nginx front it | +| `-data-dir` | `./dashboard-data` | Point at a persisted volume with room for archives | +| `-web-root` | `./frontend/dist` | Absolute path to the shipped `dist/` | +| `-trust-forwarded-user` | off | **Required** when fronted by oauth2-proxy | +| `-auth-shared-token` | empty | Alternative to oauth2-proxy; single bearer token | +| `-max-archives` | `10000` | Lower it — see sizing below | + +Binding to non-loopback without `-trust-forwarded-user` **or** +`-auth-shared-token` is refused at startup. That is intentional. + +## 5. Storage sizing & the `-max-archives` cap + +A typical archive is **1–20 MB extracted**. `-max-archives` caps the row +count; uploads beyond it return HTTP `507 Insufficient Storage`. Rough +sizing for `-data-dir`: + +| Archives | Budget | +|---|---| +| 1,000 | ~20 GB | +| 5,000 | ~100 GB | +| 10,000 | ~200 GB | + +Start with `-max-archives 5000` on a 100 GB volume and raise once you have +a feel for real archive sizes. There is no auto-prune — oldest archives +do not get evicted. When you hit the cap, delete archives from the UI +(or `DELETE /api/v1/archives/{id}`) to free the slot. + +## 6. Backup + +The whole service state is `-data-dir`. Stop-and-copy works but isn't +required — SQLite in WAL mode supports hot backup via the online backup +API, and `sqlite3 .backup` wraps it safely. + +`/etc/cron.daily/vm-dashboard-backup`: + +```bash +#!/bin/sh +set -eu +SRC=/var/lib/vm-dashboard +DST=/var/backups/vm-dashboard +STAMP=$(date -u +%Y%m%dT%H%M%SZ) +mkdir -p "$DST" + +# Hot-consistent copy of the DB (works while the dashboard is running). +sqlite3 "$SRC/dashboard.db" ".backup '$DST/dashboard-$STAMP.db'" + +# Archive blobs are immutable once ingested; rsync is fine. +rsync -a --delete "$SRC/archives/" "$DST/archives/" + +# Retention: keep 14 days of DB snapshots. 
+find "$DST" -maxdepth 1 -name 'dashboard-*.db' -mtime +14 -delete +``` + +Restore: stop the service, replace `dashboard.db` with the snapshot, +rsync `archives/` back in, start. + +**Do not** `cp dashboard.db` while the service is running — the `-wal` +file may be mid-commit. Use `sqlite3 .backup` or stop the service first. + +## 7. Concurrency & scaling + +**Run a single instance per `-data-dir`.** The process serialises writes +through an in-process mutex (`internal/store/store.go`), and SQLite WAL +permits a single writer at a time. Two dashboards pointed at the same +database will race on the "count-then-insert" cap check and may corrupt +concurrent ingests. If you need HA, front a single instance with a +restart-on-failure supervisor (systemd already does this); don't scale +horizontally. + +Throughput envelope on a small VM (~2 vCPU, 4 GB RAM): dozens of +concurrent browsers, one ingest at a time, archive uploads complete in +2–5 s for typical sizes. This is sized for a CX team, not a fleet. + +## 8. Auth — oauth2-proxy + Authentik (recommended) + +The dashboard has two built-in auth modes and no user directory of its +own. For a team of ~10 behind the VPN, front it with oauth2-proxy +talking to Authentik over OIDC. The dashboard reads the authenticated +user from `X-Forwarded-User`, which oauth2-proxy sets when you enable +`pass_user_headers`. + +Topology: + +``` +browser → TLS terminator (nginx/Caddy, :443) + → oauth2-proxy (127.0.0.1:4180) ← owns cookies + OIDC dance + → dashboard (127.0.0.1:8080) ← trusts X-Forwarded-User +``` + +Chosen over nginx `auth_request` because it's one process for one app and +`pass_user_headers` puts the right header on the upstream request +directly — no `auth_request_set` header-copy dance to get wrong. +([oauth2-proxy OIDC provider docs](https://oauth2-proxy.github.io/oauth2-proxy/configuration/providers/openid_connect)). + +### 8a. 
Authentik — create the OAuth2/OpenID Provider + +Admin → **Applications → Providers → Create → OAuth2/OpenID Provider**: + +| Field | Value | +|---|---| +| Name | `vm-dashboard-provider` | +| Authentication flow | `default-authentication-flow` | +| Authorization flow | `default-provider-authorization-implicit-consent` | +| Client type | `Confidential` | +| Client ID | *(auto — copy)* | +| Client Secret | *(auto — copy)* | +| Redirect URIs | `https://dashboard.internal.example.com/oauth2/callback` (Strict mode) | +| Signing Key | `authentik Self-signed Certificate` (enables RS256 + JWKS) | +| **Encryption Key** | **Leave blank.** If set, Authentik issues JWE-encrypted id_tokens (5 parts) which oauth2-proxy cannot decrypt — login fails with "compact JWS format must have three parts". | +| Include claims in id_token | **enabled** | +| Subject mode | `Based on the User's hashed ID` | +| Issuer mode | `Each provider has a different issuer, based on the application slug` | +| Scopes | `authentik default OAuth Mapping: OpenID 'openid' / 'email' / 'profile' / 'offline_access'` + `authentik default OAuth Mapping: Proxy outpost` (emits `groups`) | +| Access code validity | `minutes=1` | +| Access token validity | `minutes=5` | +| Refresh token validity | `days=30` | + +The `/oauth2/callback` path is oauth2-proxy's fixed callback — it is not +configurable, so the redirect URI must match exactly. +([Authentik OAuth2 provider reference](https://docs.goauthentik.io/add-secure-apps/providers/oauth2/)). + +### 8b. 
Authentik — create the Application
+
+Admin → **Applications → Applications → Create**:
+
+| Field | Value |
+|---|---|
+| Name | `VM Troubleshooting Dashboard` |
+| Slug | `vm-dashboard` (appears in the issuer URL — don't change later) |
+| Provider | `vm-dashboard-provider` |
+| Launch URL | `https://dashboard.internal.example.com/` |
+
+Then **Applications → `VM Troubleshooting Dashboard` → Policy / Group /
+User Bindings** → bind the Authentik group `CX-Team` with Order `0`,
+Enabled, not negated. This is the enforcement point for who can log in.
+
+Discovery URL Authentik will publish:
+`https://<authentik-host>/application/o/vm-dashboard/.well-known/openid-configuration`.
+Use that prefix (minus the well-known suffix) as `oidc_issuer_url` below.
+
+### 8c. oauth2-proxy config
+
+Generate a cookie secret once:
+
+```bash
+openssl rand -base64 32 | tr -- '+/' '-_'
+```
+
+`/etc/oauth2-proxy/oauth2-proxy.cfg`:
+
+```toml
+# ---- Provider ----
+provider = "oidc"
+provider_display_name = "Authentik"
+oidc_issuer_url = "https://authentik.internal.example.com/application/o/vm-dashboard/"
+client_id = "REPLACE_WITH_AUTHENTIK_CLIENT_ID"
+client_secret = "REPLACE_WITH_AUTHENTIK_CLIENT_SECRET"
+redirect_url = "https://dashboard.internal.example.com/oauth2/callback"
+scope = "openid profile email groups offline_access"
+code_challenge_method = "S256"
+skip_provider_button = true
+insecure_oidc_allow_unverified_email = false
+
+# ---- Listener & upstream ----
+http_address = "127.0.0.1:4180"
+reverse_proxy = true
+trusted_ip = ["127.0.0.1/32"]
+upstreams = ["http://127.0.0.1:8080"]
+
+# ---- Who is allowed in ----
+email_domains = ["*"]
+oidc_groups_claim = "groups"
+allowed_groups = ["CX-Team"]
+
+# ---- Headers to the Go dashboard ----
+pass_user_headers = true
+pass_access_token = false
+pass_authorization_header = false
+set_xauthrequest = false
+
+# ---- Session / cookie hardening ----
+cookie_secret = "REPLACE_WITH_32_BYTE_BASE64_SECRET"
+cookie_name = "_oauth2_proxy"
+cookie_secure = true
+cookie_httponly = true
+cookie_samesite = "lax"
+cookie_expire = "168h"
+cookie_refresh = "1h"
+cookie_domains = ["dashboard.internal.example.com"]
+whitelist_domains = ["dashboard.internal.example.com"]
+
+# ---- Misc ----
+request_logging = true
+show_debug_on_error = false
+```
+
+`pass_user_headers = true` is what puts `X-Forwarded-User` on the
+upstream request, which `-trust-forwarded-user` consumes. Keep the
+dashboard on loopback so nothing but oauth2-proxy can reach it.
+
+### 8d. nginx (TLS terminator in front of oauth2-proxy)
+
+```nginx
+server {
+    listen 443 ssl http2;
+    server_name dashboard.internal.example.com;
+    ssl_certificate /etc/ssl/certs/dashboard.pem;
+    ssl_certificate_key /etc/ssl/private/dashboard.key;
+
+    client_max_body_size 256m; # archive uploads
+
+    location / {
+        proxy_pass http://127.0.0.1:4180;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 60s;
+    }
+}
+```
+
+## 9. Alternative: shared bearer token (no SSO)
+
+For a quick lab setup, skip oauth2-proxy and use the built-in bearer:
+
+```bash
+# systemd drop-in
+ExecStart=/opt/vm-dashboard/dashboard \
+  -listen 0.0.0.0:8080 \
+  -data-dir /var/lib/vm-dashboard \
+  -web-root /opt/vm-dashboard/web \
+  -auth-shared-token "$(openssl rand -hex 32)"
+```
+
+Every `/api` request must carry `Authorization: Bearer <token>`.
+The frontend has no UI for this — use for scripts/curl only, not for
+the CX team's daily driver.
+
+## 10. Upgrades
+
+Stop the service, replace `dashboard` and `web/`, start it again.
+Database migrations are additive and run on startup. Keep a DB snapshot
+from step 6 within arm's reach before upgrading.
+
+## 11.
Troubleshooting + +| Symptom | Likely cause | +|---|---| +| `404 dashboard frontend is not built` | `-web-root` points at an empty/missing directory | +| Startup: `non-loopback --listen requires …` | Bound to `0.0.0.0` or a LAN IP without `-trust-forwarded-user` / `-auth-shared-token` | +| Every request 401 via oauth2-proxy | `X-Forwarded-User` missing → confirm `pass_user_headers = true` and dashboard started with `-trust-forwarded-user` | +| `HTTP 507` on upload | `-max-archives` cap hit; delete old archives or raise the cap | +| Disk full, DB locked | Another process holding `dashboard.db-wal`; confirm single instance per `-data-dir` | diff --git a/customers/vm-troubleshooting-dashboard/Dockerfile b/customers/vm-troubleshooting-dashboard/Dockerfile new file mode 100644 index 0000000..995c7ac --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/Dockerfile @@ -0,0 +1,91 @@ +# syntax=docker/dockerfile:1.9 + +# --------------------------------------------------------------------------- +# Stage 1: build the frontend (Vite + React, pnpm pinned via npm) +# --------------------------------------------------------------------------- +# Node pinned to a current LTS minor. We install pnpm directly with npm +# rather than through corepack: some older node:22.x minors ship a corepack +# with stale signing keys that fails to fetch pnpm. npm-install is +# deterministic and avoids that moving part entirely. 
+FROM node:22.21-bookworm-slim AS frontend +WORKDIR /src/frontend + +ARG PNPM_VERSION=10.15.1 +ENV CI=1 \ + PNPM_HOME=/pnpm \ + PATH=/pnpm:$PATH + +RUN --mount=type=cache,target=/root/.npm \ + npm install -g pnpm@${PNPM_VERSION} + +COPY frontend/package.json frontend/pnpm-lock.yaml ./ +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ + pnpm config set store-dir /pnpm/store && \ + pnpm install --frozen-lockfile + +COPY frontend/ ./ +RUN pnpm build + +# --------------------------------------------------------------------------- +# Stage 2: build the Go binary (pure-Go SQLite → CGO disabled, fully static) +# --------------------------------------------------------------------------- +FROM golang:1.25-bookworm AS backend +WORKDIR /src + +ENV CGO_ENABLED=0 \ + GOFLAGS=-trimpath \ + GOTOOLCHAIN=local + +COPY go.mod go.sum ./ +RUN --mount=type=cache,target=/go/pkg/mod \ + go mod download + +COPY . . + +ARG VERSION=dev +ARG COMMIT=unknown +ARG BUILD_DATE=unknown +RUN --mount=type=cache,target=/go/pkg/mod \ + --mount=type=cache,target=/root/.cache/go-build \ + go build \ + -ldflags "-s -w -X main.version=${VERSION} -X main.commit=${COMMIT} -X main.buildDate=${BUILD_DATE}" \ + -o /out/dashboard \ + ./cmd/dashboard + +# --------------------------------------------------------------------------- +# Helper stage: holds an empty /rootfs/data we can COPY into the distroless +# runtime with `--chown` (distroless has no shell/mkdir/chown). Docker copies +# the image's /data ownership onto a fresh named volume on first mount. 
+# --------------------------------------------------------------------------- +FROM busybox:1.37 AS dirs +RUN mkdir -p /rootfs/data + +# --------------------------------------------------------------------------- +# Stage 3: runtime — distroless static, non-root +# --------------------------------------------------------------------------- +FROM gcr.io/distroless/static-debian12:nonroot AS runtime + +LABEL org.opencontainers.image.title="Triage" \ + org.opencontainers.image.description="VM/host diagnostic archive triage dashboard" \ + org.opencontainers.image.source="https://github.com/NexGenCloud/support-scripts" \ + org.opencontainers.image.licenses="Proprietary" \ + org.opencontainers.image.vendor="NexGen Cloud" + +WORKDIR /app + +COPY --from=backend /out/dashboard /app/dashboard +COPY --from=frontend /src/frontend/dist /app/web +COPY --from=dirs --chown=65532:65532 /rootfs/data /data + +# distroless nonroot = uid/gid 65532 +USER 65532:65532 + +EXPOSE 8080 +VOLUME ["/data"] + +ENTRYPOINT ["/app/dashboard"] +CMD ["-listen", "0.0.0.0:8080", \ + "-data-dir", "/data", \ + "-web-root", "/app/web", \ + "-trust-forwarded-user", \ + "-max-archives", "5000"] diff --git a/customers/vm-troubleshooting-dashboard/cmd/dashboard/main.go b/customers/vm-troubleshooting-dashboard/cmd/dashboard/main.go new file mode 100644 index 0000000..3c2637f --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/cmd/dashboard/main.go @@ -0,0 +1,107 @@ +package main + +import ( + "context" + "flag" + "fmt" + "log" + "net" + "net/http" + "os" + "os/signal" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/api" + "github.com/NexGenCloud/diagnostic-dashboard/internal/store" +) + +// listenRequiresAuth reports whether the listen address may be reached off-box +// (non-loopback), requiring --auth-shared-token or --trust-forwarded-user. 
+func listenRequiresAuth(addr string) bool { + host, _, err := net.SplitHostPort(addr) + if err != nil { + if strings.HasPrefix(addr, ":") { + return true + } + return true + } + if host == "" { + return true + } + if host == "0.0.0.0" || host == "[::]" { + return true + } + ip := net.ParseIP(strings.Trim(host, "[]")) + if ip != nil { + return !ip.IsLoopback() + } + ips, err := net.LookupIP(host) + if err != nil { + return true + } + for _, ip := range ips { + if !ip.IsLoopback() { + return true + } + } + return false +} + +func main() { + var ( + listen = flag.String("listen", "127.0.0.1:8080", "HTTP listen address host:port") + dataDir = flag.String("data-dir", "./dashboard-data", "directory for uploaded archives") + webRoot = flag.String("web-root", "./frontend/dist", "directory containing the built frontend") + authToken = flag.String("auth-shared-token", "", "require Authorization: Bearer for /api when listening on non-loopback") + trustForwarded = flag.Bool("trust-forwarded-user", false, "trust X-Forwarded-User and X-Forwarded-For (use only behind a trusted reverse proxy)") + maxArchives = flag.Int("max-archives", 10000, "soft cap on stored archives (uploads return 507 when exceeded)") + ) + flag.Parse() + + if listenRequiresAuth(*listen) && *authToken == "" && !*trustForwarded { + log.Fatal("non-loopback --listen requires --auth-shared-token or --trust-forwarded-user (see docs/plans/post-audit-hardening.md)") + } + + if err := os.MkdirAll(*dataDir, 0o755); err != nil { + log.Fatal(err) + } + st, err := store.New(*dataDir) + if err != nil { + log.Fatal(err) + } + defer st.Close() + server := api.New(st, *webRoot, api.Options{ + AuthSharedToken: *authToken, + TrustForwardedUser: *trustForwarded, + MaxArchives: *maxArchives, + RequireAPIAuth: listenRequiresAuth(*listen), + }) + httpServer := &http.Server{ + Addr: *listen, + Handler: server.Handler(), + ReadHeaderTimeout: 5 * time.Second, + } + + ctx, stop := signal.NotifyContext(context.Background(), 
os.Interrupt, syscall.SIGTERM) + defer stop() + + go func() { + <-ctx.Done() + shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _ = httpServer.Shutdown(shutdownCtx) + }() + + absDataDir, _ := filepath.Abs(*dataDir) + absWebRoot, _ := filepath.Abs(*webRoot) + fmt.Fprintf(os.Stderr, "dashboard listening on %s\n", *listen) + fmt.Fprintf(os.Stderr, "data dir: %s\n", absDataDir) + fmt.Fprintf(os.Stderr, "web root: %s\n", absWebRoot) + + if err := httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatal(err) + } +} diff --git a/customers/vm-troubleshooting-dashboard/cmd/dashboard/main_test.go b/customers/vm-troubleshooting-dashboard/cmd/dashboard/main_test.go new file mode 100644 index 0000000..56c3a37 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/cmd/dashboard/main_test.go @@ -0,0 +1,53 @@ +package main + +import ( + "net" + "strings" + "testing" +) + +func TestListenRequiresAuth(t *testing.T) { + t.Parallel() + tests := []struct { + addr string + want bool + }{ + {"127.0.0.1:8080", false}, + {"localhost:8080", false}, + {":8080", true}, + {"0.0.0.0:8080", true}, + {"[::]:8080", true}, + {"192.168.1.1:8080", true}, + } + for _, tt := range tests { + tt := tt + t.Run(tt.addr, func(t *testing.T) { + t.Parallel() + if got := listenRequiresAuth(tt.addr); got != tt.want { + t.Fatalf("listenRequiresAuth(%q) = %v, want %v", tt.addr, got, tt.want) + } + }) + } +} + +func TestListenRequiresAuth_InvalidAddrRequiresAuth(t *testing.T) { + t.Parallel() + if !listenRequiresAuth("not-a-valid-hostport") { + t.Fatal("invalid listen address should require auth") + } +} + +func TestListenRequiresAuth_IPv6Loopback(t *testing.T) { + t.Parallel() + host, _, err := net.SplitHostPort("[::1]:8080") + if err != nil { + t.Fatal(err) + } + ip := net.ParseIP(strings.Trim(host, "[]")) + if ip == nil || !ip.IsLoopback() { + t.Fatalf("expected [::1] loopback, host=%q ip=%v", host, ip) + } + if 
listenRequiresAuth("[::1]:8080") { + t.Fatal("IPv6 loopback should not require auth") + } +} diff --git a/customers/vm-troubleshooting-dashboard/docker-compose.yml b/customers/vm-troubleshooting-dashboard/docker-compose.yml new file mode 100644 index 0000000..665b5ba --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/docker-compose.yml @@ -0,0 +1,115 @@ +# Compose deployment for Triage. Full topology: +# +# internet → caddy (:80, :443) TLS + auto-renew via Let's Encrypt +# → oauth2-proxy (:4180) Authentik OIDC, sets X-Forwarded-User +# → dashboard (:8080) Go backend + embedded SPA +# +# Only caddy is exposed on host ports; oauth2-proxy and dashboard are on +# the private `triage-internal` network. +# +# Setup on a fresh host: +# cp .env.example .env # OIDC secrets, hostname, ACME email +# cp oauth2-proxy.cfg.example oauth2-proxy.cfg +# cp Caddyfile.example Caddyfile # edit hostname if not env-driven +# docker compose up -d +# +# Requirements for auto-HTTPS: +# - The hostname in $TRIAGE_HOSTNAME resolves publicly to this host. +# - TCP :80 and :443 are reachable from the internet (Let's Encrypt +# HTTP-01 / TLS-ALPN-01 challenge). If the host is VPN-only, either +# use Caddy's DNS-01 challenge (requires a DNS provider plugin) or +# `tls internal` for a self-signed cert — see Caddyfile.example. + +name: triage + +services: + dashboard: + build: + context: . 
+ dockerfile: Dockerfile + args: + VERSION: ${TRIAGE_VERSION:-dev} + COMMIT: ${TRIAGE_COMMIT:-unknown} + BUILD_DATE: ${TRIAGE_BUILD_DATE:-unknown} + image: triage/dashboard:${TRIAGE_VERSION:-dev} + restart: unless-stopped + networks: + - triage-internal + volumes: + - triage-data:/data + environment: + TZ: ${TZ:-UTC} + read_only: true + tmpfs: + - /tmp:size=64m,mode=1777 + security_opt: + - no-new-privileges:true + cap_drop: + - ALL + deploy: + resources: + limits: + cpus: "2.0" + memory: 1g + + oauth2-proxy: + image: quay.io/oauth2-proxy/oauth2-proxy:v7.15.2 + restart: unless-stopped + depends_on: + dashboard: + condition: service_started + networks: + - triage-internal + # No `ports:` — only Caddy is exposed to the host. + command: + - --config=/etc/oauth2-proxy/oauth2-proxy.cfg + environment: + # Secrets from .env. `:-` defaults silence interpolation warnings at + # build time; oauth2-proxy refuses to start with empty values so + # misconfiguration fails loudly at `up` time. + OAUTH2_PROXY_CLIENT_ID: ${OAUTH2_PROXY_CLIENT_ID:-} + OAUTH2_PROXY_CLIENT_SECRET: ${OAUTH2_PROXY_CLIENT_SECRET:-} + OAUTH2_PROXY_COOKIE_SECRET: ${OAUTH2_PROXY_COOKIE_SECRET:-} + volumes: + - ./oauth2-proxy.cfg:/etc/oauth2-proxy/oauth2-proxy.cfg:ro + read_only: true + security_opt: + - no-new-privileges:true + cap_drop: + - ALL + + caddy: + image: caddy:2.11.2-alpine + restart: unless-stopped + depends_on: + oauth2-proxy: + condition: service_started + networks: + - triage-internal + ports: + - "80:80" + - "443:443" + - "443:443/udp" # HTTP/3 (QUIC) + environment: + # `:-` default keeps `docker compose build` working before .env + # exists. Caddy fails loudly at startup if the hostname is empty. 
+ TRIAGE_HOSTNAME: ${TRIAGE_HOSTNAME:-} + ACME_EMAIL: ${ACME_EMAIL:-} + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile:ro + - caddy-data:/data # certificates + ACME account (critical to persist) + - caddy-config:/config # autosaved runtime config + security_opt: + - no-new-privileges:true + +volumes: + triage-data: + driver: local + caddy-data: + driver: local + caddy-config: + driver: local + +networks: + triage-internal: + driver: bridge diff --git a/customers/vm-troubleshooting-dashboard/frontend/package.json b/customers/vm-troubleshooting-dashboard/frontend/package.json index 0b2bab4..d8e1009 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/package.json +++ b/customers/vm-troubleshooting-dashboard/frontend/package.json @@ -7,7 +7,9 @@ "dev": "vite", "build": "tsc -b && vite build", "lint": "eslint .", - "preview": "vite preview" + "preview": "vite preview", + "test": "vitest run", + "test:watch": "vitest" }, "dependencies": { "@base-ui/react": "^1.3.0", @@ -29,6 +31,10 @@ }, "devDependencies": { "@eslint/js": "^9.39.4", + "@testing-library/dom": "^10.4.1", + "@testing-library/jest-dom": "^6.9.1", + "@testing-library/react": "^16.3.2", + "@testing-library/user-event": "^14.6.1", "@types/node": "^24.12.2", "@types/react": "^19.2.14", "@types/react-dom": "^19.2.3", @@ -37,8 +43,10 @@ "eslint-plugin-react-hooks": "^7.0.1", "eslint-plugin-react-refresh": "^0.5.2", "globals": "^17.4.0", + "jsdom": "^29.0.2", "typescript": "~6.0.2", "typescript-eslint": "^8.58.0", - "vite": "^8.0.4" + "vite": "^8.0.4", + "vitest": "^4.1.4" } } diff --git a/customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml b/customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml index 776761b..58413dd 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml +++ b/customers/vm-troubleshooting-dashboard/frontend/pnpm-lock.yaml @@ -60,6 +60,18 @@ importers: '@eslint/js': specifier: ^9.39.4 version: 9.39.4 + '@testing-library/dom': + specifier: ^10.4.1 + 
version: 10.4.1 + '@testing-library/jest-dom': + specifier: ^6.9.1 + version: 6.9.1 + '@testing-library/react': + specifier: ^16.3.2 + version: 16.3.2(@testing-library/dom@10.4.1)(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.5(react@19.2.5))(react@19.2.5) + '@testing-library/user-event': + specifier: ^14.6.1 + version: 14.6.1(@testing-library/dom@10.4.1) '@types/node': specifier: ^24.12.2 version: 24.12.2 @@ -84,6 +96,9 @@ importers: globals: specifier: ^17.4.0 version: 17.4.0 + jsdom: + specifier: ^29.0.2 + version: 29.0.2(@noble/hashes@1.8.0) typescript: specifier: ~6.0.2 version: 6.0.2 @@ -93,9 +108,30 @@ importers: vite: specifier: ^8.0.4 version: 8.0.8(@types/node@24.12.2)(jiti@2.6.1) + vitest: + specifier: ^4.1.4 + version: 4.1.4(@types/node@24.12.2)(jsdom@29.0.2(@noble/hashes@1.8.0))(msw@2.13.2(@types/node@24.12.2)(typescript@6.0.2))(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1)) packages: + '@adobe/css-tools@4.4.4': + resolution: {integrity: sha512-Elp+iwUx5rN5+Y8xLt5/GRoG20WGoDCQ/1Fb+1LiGtvwbDavuSk0jhD/eZdckHAuzcDzccnkv+rEjyWfRx18gg==} + + '@asamuzakjp/css-color@5.1.11': + resolution: {integrity: sha512-KVw6qIiCTUQhByfTd78h2yD1/00waTmm9uy/R7Ck/ctUyAPj+AEDLkQIdJW0T8+qGgj3j5bpNKK7Q3G+LedJWg==} + engines: {node: ^20.19.0 || ^22.12.0 || >=24.0.0} + + '@asamuzakjp/dom-selector@7.0.10': + resolution: {integrity: sha512-KyOb19eytNSELkmdqzZZUXWCU25byIlOld5qVFg0RYdS0T3tt7jeDByxk9hIAC73frclD8GKrHttr0SUjKCCdQ==} + engines: {node: ^20.19.0 || ^22.12.0 || >=24.0.0} + + '@asamuzakjp/generational-cache@1.0.1': + resolution: {integrity: sha512-wajfB8KqzMCN2KGNFdLkReeHncd0AslUSrvHVvvYWuU8ghncRJoA50kT3zP9MVL0+9g4/67H+cdvBskj9THPzg==} + engines: {node: ^20.19.0 || ^22.12.0 || >=24.0.0} + + '@asamuzakjp/nwsapi@2.3.9': + resolution: {integrity: sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q==} + '@babel/code-frame@7.29.0': resolution: {integrity: 
sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==} engines: {node: '>=6.9.0'} @@ -250,6 +286,46 @@ packages: '@types/react': optional: true + '@bramus/specificity@2.4.2': + resolution: {integrity: sha512-ctxtJ/eA+t+6q2++vj5j7FYX3nRu311q1wfYH3xjlLOsczhlhxAg2FWNUXhpGvAw3BWo1xBcvOV6/YLc2r5FJw==} + hasBin: true + + '@csstools/color-helpers@6.0.2': + resolution: {integrity: sha512-LMGQLS9EuADloEFkcTBR3BwV/CGHV7zyDxVRtVDTwdI2Ca4it0CCVTT9wCkxSgokjE5Ho41hEPgb8OEUwoXr6Q==} + engines: {node: '>=20.19.0'} + + '@csstools/css-calc@3.2.0': + resolution: {integrity: sha512-bR9e6o2BDB12jzN/gIbjHa5wLJ4UjD1CB9pM7ehlc0ddk6EBz+yYS1EV2MF55/HUxrHcB/hehAyt5vhsA3hx7w==} + engines: {node: '>=20.19.0'} + peerDependencies: + '@csstools/css-parser-algorithms': ^4.0.0 + '@csstools/css-tokenizer': ^4.0.0 + + '@csstools/css-color-parser@4.1.0': + resolution: {integrity: sha512-U0KhLYmy2GVj6q4T3WaAe6NPuFYCPQoE3b0dRGxejWDgcPp8TP7S5rVdM5ZrFaqu4N67X8YaPBw14dQSYx3IyQ==} + engines: {node: '>=20.19.0'} + peerDependencies: + '@csstools/css-parser-algorithms': ^4.0.0 + '@csstools/css-tokenizer': ^4.0.0 + + '@csstools/css-parser-algorithms@4.0.0': + resolution: {integrity: sha512-+B87qS7fIG3L5h3qwJ/IFbjoVoOe/bpOdh9hAjXbvx0o8ImEmUsGXN0inFOnk2ChCFgqkkGFQ+TpM5rbhkKe4w==} + engines: {node: '>=20.19.0'} + peerDependencies: + '@csstools/css-tokenizer': ^4.0.0 + + '@csstools/css-syntax-patches-for-csstree@1.1.3': + resolution: {integrity: sha512-SH60bMfrRCJF3morcdk57WklujF4Jr/EsQUzqkarfHXEFcAR1gg7fS/chAE922Sehgzc1/+Tz5H3Ypa1HiEKrg==} + peerDependencies: + css-tree: ^3.2.1 + peerDependenciesMeta: + css-tree: + optional: true + + '@csstools/css-tokenizer@4.0.0': + resolution: {integrity: sha512-QxULHAm7cNu72w97JUNCBFODFaXpbDg+dP8b/oWFAZ2MTRppA3U00Y2L1HqaS4J6yBqxwa/Y3nMBaxVKbB/NsA==} + engines: {node: '>=20.19.0'} + '@dotenvx/dotenvx@1.61.0': resolution: {integrity: sha512-utL3cpZoFzflyqUkjYbxYujI6STBTmO5LFn4bbin/NZnRWN6wQ7eErhr3/Vpa5h/jicPFC6kTa42r940mQftJQ==} hasBin: 
true @@ -307,6 +383,15 @@ packages: resolution: {integrity: sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + '@exodus/bytes@1.15.0': + resolution: {integrity: sha512-UY0nlA+feH81UGSHv92sLEPLCeZFjXOuHhrIo0HQydScuQc8s0A7kL/UdgwgDq8g8ilksmuoF35YVTNphV2aBQ==} + engines: {node: ^20.19.0 || ^22.12.0 || >=24.0.0} + peerDependencies: + '@noble/hashes': ^1.8.0 || ^2.0.0 + peerDependenciesMeta: + '@noble/hashes': + optional: true + '@floating-ui/core@1.7.5': resolution: {integrity: sha512-1Ih4WTWyw0+lKyFMcBHGbb5U5FtuHJuujoyyr5zTaWS5EYMeT6Jb2AuDeftsCsEuchO+mM2ij5+q9crhydzLhQ==} @@ -559,6 +644,9 @@ packages: resolution: {integrity: sha512-tlqY9xq5ukxTUZBmoOp+m61cqwQD5pHJtFY3Mn8CA8ps6yghLH/Hw8UPdqg4OLmFW3IFlcXnQNmo/dh8HzXYIQ==} engines: {node: '>=18'} + '@standard-schema/spec@1.1.0': + resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==} + '@tailwindcss/node@4.2.2': resolution: {integrity: sha512-pXS+wJ2gZpVXqFaUEjojq7jzMpTGf8rU6ipJz5ovJV6PUGmlJ+jvIwGrzdHdQ80Sg+wmQxUFuoW1UAAwHNEdFA==} @@ -666,12 +754,50 @@ packages: '@tanstack/virtual-core@3.13.23': resolution: {integrity: sha512-zSz2Z2HNyLjCplANTDyl3BcdQJc2k1+yyFoKhNRmCr7V7dY8o8q5m8uFTI1/Pg1kL+Hgrz6u3Xo6eFUB7l66cg==} + '@testing-library/dom@10.4.1': + resolution: {integrity: sha512-o4PXJQidqJl82ckFaXUeoAW+XysPLauYI43Abki5hABd853iMhitooc6znOnczgbTYmEP6U6/y1ZyKAIsvMKGg==} + engines: {node: '>=18'} + + '@testing-library/jest-dom@6.9.1': + resolution: {integrity: sha512-zIcONa+hVtVSSep9UT3jZ5rizo2BsxgyDYU7WFD5eICBE7no3881HGeb/QkGfsJs6JTkY1aQhT7rIPC7e+0nnA==} + engines: {node: '>=14', npm: '>=6', yarn: '>=1'} + + '@testing-library/react@16.3.2': + resolution: {integrity: sha512-XU5/SytQM+ykqMnAnvB2umaJNIOsLF3PVv//1Ew4CTcpz0/BRyy/af40qqrt7SjKpDdT1saBMc42CUok5gaw+g==} + engines: {node: '>=18'} + peerDependencies: + '@testing-library/dom': ^10.0.0 + '@types/react': 
^18.0.0 || ^19.0.0 + '@types/react-dom': ^18.0.0 || ^19.0.0 + react: ^18.0.0 || ^19.0.0 + react-dom: ^18.0.0 || ^19.0.0 + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + + '@testing-library/user-event@14.6.1': + resolution: {integrity: sha512-vq7fv0rnt+QTXgPxr5Hjc210p6YKq2kmdziLgnsZGgLJ9e6VAShx1pACLuRjd/AS/sr7phAR58OIIpf0LlmQNw==} + engines: {node: '>=12', npm: '>=6'} + peerDependencies: + '@testing-library/dom': '>=7.21.4' + '@ts-morph/common@0.27.0': resolution: {integrity: sha512-Wf29UqxWDpc+i61k3oIOzcUfQt79PIT9y/MWfAGlrkjg6lBC1hwDECLXPVJAhWjiGbfBCxZd65F/LIZF3+jeJQ==} '@tybys/wasm-util@0.10.1': resolution: {integrity: sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==} + '@types/aria-query@5.0.4': + resolution: {integrity: sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==} + + '@types/chai@5.2.3': + resolution: {integrity: sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA==} + + '@types/deep-eql@4.0.2': + resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==} + '@types/estree@1.0.8': resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==} @@ -767,6 +893,35 @@ packages: babel-plugin-react-compiler: optional: true + '@vitest/expect@4.1.4': + resolution: {integrity: sha512-iPBpra+VDuXmBFI3FMKHSFXp3Gx5HfmSCE8X67Dn+bwephCnQCaB7qWK2ldHa+8ncN8hJU8VTMcxjPpyMkUjww==} + + '@vitest/mocker@4.1.4': + resolution: {integrity: sha512-R9HTZBhW6yCSGbGQnDnH3QHfJxokKN4KB+Yvk9Q1le7eQNYwiCyKxmLmurSpFy6BzJanSLuEUDrD+j97Q+ZLPg==} + peerDependencies: + msw: ^2.4.9 + vite: ^6.0.0 || ^7.0.0 || ^8.0.0 + peerDependenciesMeta: + msw: + optional: true + vite: + optional: true + + '@vitest/pretty-format@4.1.4': + resolution: {integrity: 
sha512-ddmDHU0gjEUyEVLxtZa7xamrpIefdEETu3nZjWtHeZX4QxqJ7tRxSteHVXJOcr8jhiLoGAhkK4WJ3WqBpjx42A==} + + '@vitest/runner@4.1.4': + resolution: {integrity: sha512-xTp7VZ5aXP5ZJrn15UtJUWlx6qXLnGtF6jNxHepdPHpMfz/aVPx+htHtgcAL2mDXJgKhpoo2e9/hVJsIeFbytQ==} + + '@vitest/snapshot@4.1.4': + resolution: {integrity: sha512-MCjCFgaS8aZz+m5nTcEcgk/xhWv0rEH4Yl53PPlMXOZ1/Ka2VcZU6CJ+MgYCZbcJvzGhQRjVrGQNZqkGPttIKw==} + + '@vitest/spy@4.1.4': + resolution: {integrity: sha512-XxNdAsKW7C+FLydqFJLb5KhJtl3PGCMmYwFRfhvIgxJvLSXhhVI1zM8f1qD3Zg7RCjTSzDVyct6sghs9UEgBEQ==} + + '@vitest/utils@4.1.4': + resolution: {integrity: sha512-13QMT+eysM5uVGa1rG4kegGYNp6cnQcsTc67ELFbhNLQO+vgsygtYJx2khvdt4gVQqSSpC/KT5FZZxUpP3Oatw==} + accepts@2.0.0: resolution: {integrity: sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==} engines: {node: '>= 0.6'} @@ -811,9 +966,24 @@ packages: resolution: {integrity: sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==} engines: {node: '>=8'} + ansi-styles@5.2.0: + resolution: {integrity: sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==} + engines: {node: '>=10'} + argparse@2.0.1: resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==} + aria-query@5.3.0: + resolution: {integrity: sha512-b0P0sZPKtyu8HkeRAfCq0IfURZK+SuwMjY1UXGBU27wpAiTwQAIlq56IbIO+ytk/JjS1fMR14ee5WBBfKi5J6A==} + + aria-query@5.3.2: + resolution: {integrity: sha512-COROpnaoap1E2F000S62r6A60uHZnmlvomhfyT2DlTcrY1OrBKn2UhH7qn5wTC9zMvD0AY7csdPSNwKP+7WiQw==} + engines: {node: '>= 0.4'} + + assertion-error@2.0.1: + resolution: {integrity: sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==} + engines: {node: '>=12'} + ast-types@0.16.1: resolution: {integrity: sha512-6t10qk83GOG8p0vKmaCr8eiilZwO171AvbROMtvvNiwrTly62t+7XkA8RdIIVbpMhCASAsxgAzdRSwh6nw/5Dg==} engines: {node: 
'>=4'} @@ -830,6 +1000,9 @@ packages: engines: {node: '>=6.0.0'} hasBin: true + bidi-js@1.0.3: + resolution: {integrity: sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==} + body-parser@2.2.2: resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} engines: {node: '>=18'} @@ -873,6 +1046,10 @@ packages: caniuse-lite@1.0.30001787: resolution: {integrity: sha512-mNcrMN9KeI68u7muanUpEejSLghOKlVhRqS/Za2IeyGllJ9I9otGpR9g3nsw7n4W378TE/LyIteA0+/FOZm4Kg==} + chai@6.2.2: + resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==} + engines: {node: '>=18'} + chalk@4.1.2: resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} engines: {node: '>=10'} @@ -965,6 +1142,13 @@ packages: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} + css-tree@3.2.1: + resolution: {integrity: sha512-X7sjQzceUhu1u7Y/ylrRZFU2FS6LRiFVp6rKLPg23y3x3c3DOKAwuXGDp+PAGjh6CSnCjYeAul8pcT8bAl+lSA==} + engines: {node: ^10 || ^12.20.0 || ^14.13.0 || >=15.0.0} + + css.escape@1.5.1: + resolution: {integrity: sha512-YUifsXXuknHlUsmlgyY0PKzgPOr7/FjCePfHNt0jxm83wHZi44VDMQ7/fGNkjY3/jV1MC+1CmZbaHzugyeRtpg==} + cssesc@3.0.0: resolution: {integrity: sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==} engines: {node: '>=4'} @@ -977,6 +1161,10 @@ packages: resolution: {integrity: sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==} engines: {node: '>= 12'} + data-urls@7.0.0: + resolution: {integrity: sha512-23XHcCF+coGYevirZceTVD7NdJOqVn+49IHyxgszm+JIiHLoB2TkmPtsYkNWT1pvRSGkc35L6NHs0yHkN2SumA==} + engines: {node: ^20.19.0 || ^22.12.0 || >=24.0.0} + debug@4.4.3: resolution: {integrity: 
sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==} engines: {node: '>=6.0'} @@ -986,6 +1174,9 @@ packages: supports-color: optional: true + decimal.js@10.6.0: + resolution: {integrity: sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==} + dedent@1.7.2: resolution: {integrity: sha512-WzMx3mW98SN+zn3hgemf4OzdmyNhhhKz5Ay0pUfQiMQ3e1g+xmTJWp/pKdwKVXhdSkAEGIIzqeuWrL3mV/AXbA==} peerDependencies: @@ -1017,6 +1208,10 @@ packages: resolution: {integrity: sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==} engines: {node: '>= 0.8'} + dequal@2.0.3: + resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==} + engines: {node: '>=6'} + detect-libc@2.1.2: resolution: {integrity: sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==} engines: {node: '>=8'} @@ -1025,6 +1220,12 @@ packages: resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==} engines: {node: '>=0.3.1'} + dom-accessibility-api@0.5.16: + resolution: {integrity: sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==} + + dom-accessibility-api@0.6.3: + resolution: {integrity: sha512-7ZgogeTnjuHbo+ct10G9Ffp0mif17idi0IyWNVA/wcwcm7NPOD/WEHVP3n7n3MhXqxoIYm8d6MuZohYWIZ4T3w==} + dotenv@17.4.1: resolution: {integrity: sha512-k8DaKGP6r1G30Lx8V4+pCsLzKr8vLmV2paqEj1Y55GdAgJuIqpRp5FfajGF8KtwMxCz9qJc6wUIJnm053d/WCw==} engines: {node: '>=12'} @@ -1057,6 +1258,10 @@ packages: resolution: {integrity: sha512-Qohcme7V1inbAfvjItgw0EaxVX5q2rdVEZHRBrEQdRZTssLDGsL8Lwrznl8oQ/6kuTJONLaDcGjkNP247XEhcA==} engines: {node: '>=10.13.0'} + entities@6.0.1: + resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==} + engines: {node: '>=0.12'} + 
env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -1072,6 +1277,9 @@ packages: resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==} engines: {node: '>= 0.4'} + es-module-lexer@2.0.0: + resolution: {integrity: sha512-5POEcUuZybH7IdmGsD8wlf0AI55wMecM9rVBTI/qEAy2c1kTOm3DjFYjrBdI2K3BaJjJYfYFeRtM0t9ssnRuxw==} + es-object-atoms@1.1.1: resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==} engines: {node: '>= 0.4'} @@ -1145,6 +1353,9 @@ packages: resolution: {integrity: sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==} engines: {node: '>=4.0'} + estree-walker@3.0.3: + resolution: {integrity: sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==} + esutils@2.0.3: resolution: {integrity: sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==} engines: {node: '>=0.10.0'} @@ -1169,6 +1380,10 @@ packages: resolution: {integrity: sha512-9Be3ZoN4LmYR90tUoVu2te2BsbzHfhJyfEiAVfz7N5/zv+jduIfLrV2xdQXOHbaD6KgpGdO9PRPM1Y4Q9QkPkA==} engines: {node: ^18.19.0 || >=20.5.0} + expect-type@1.3.0: + resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==} + engines: {node: '>=12.0.0'} + express-rate-limit@8.3.2: resolution: {integrity: sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg==} engines: {node: '>= 16'} @@ -1349,6 +1564,10 @@ packages: resolution: {integrity: sha512-p1JfQMKaceuCbpJKAPKVqyqviZdS0eUxH9v82oWo1kb9xjQ5wA6iP3FNVAPDFlz5/p7d45lO+BpSk1tuSZMF4Q==} engines: {node: '>=16.9.0'} + html-encoding-sniffer@6.0.0: + resolution: {integrity: 
sha512-CV9TW3Y3f8/wT0BRFc1/KAVQ3TUHiXmaAb6VW9vtiMFf7SLoMd1PdAc4W3KFOFETBJUb90KatHqlsZMWV+R9Gg==} + engines: {node: ^20.19.0 || ^22.12.0 || >=24.0.0} + http-errors@2.0.1: resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==} engines: {node: '>= 0.8'} @@ -1385,6 +1604,10 @@ packages: resolution: {integrity: sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==} engines: {node: '>=0.8.19'} + indent-string@4.0.0: + resolution: {integrity: sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==} + engines: {node: '>=8'} + inherits@2.0.4: resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} @@ -1444,6 +1667,9 @@ packages: resolution: {integrity: sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==} engines: {node: '>=12'} + is-potential-custom-element-name@1.0.1: + resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==} + is-promise@4.0.0: resolution: {integrity: sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==} @@ -1492,6 +1718,15 @@ packages: resolution: {integrity: sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==} hasBin: true + jsdom@29.0.2: + resolution: {integrity: sha512-9VnGEBosc/ZpwyOsJBCQ/3I5p7Q5ngOY14a9bf5btenAORmZfDse1ZEheMiWcJ3h81+Fv7HmJFdS0szo/waF2w==} + engines: {node: ^20.19.0 || ^22.13.0 || >=24.0.0} + peerDependencies: + canvas: ^3.0.0 + peerDependenciesMeta: + canvas: + optional: true + jsesc@3.1.0: resolution: {integrity: sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==} engines: {node: '>=6'} @@ -1622,6 +1857,10 @@ packages: resolution: {integrity: 
sha512-i24m8rpwhmPIS4zscNzK6MSEhk0DUWa/8iYQWxhffV8jkI4Phvs3F+quL5xvS0gdQR0FyTCMMH33Y78dDTzzIw==} engines: {node: '>=18'} + lru-cache@11.3.5: + resolution: {integrity: sha512-NxVFwLAnrd9i7KUBxC4DrUhmgjzOs+1Qm50D3oF1/oL+r1NpZ4gA7xvG0/zJ8evR7zIKn4vLf7qTNduWFtCrRw==} + engines: {node: 20 || >=22} + lru-cache@5.1.1: resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==} @@ -1630,6 +1869,10 @@ packages: peerDependencies: react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0 + lz-string@1.5.0: + resolution: {integrity: sha512-h5bgJWpxJNswbU7qCrV0tIKQCaS3blPDrqKWx+QxzuzL1zGUzij9XCWLrSLsJPu5t+eWA/ycetzYAO5IOMcWAQ==} + hasBin: true + magic-string@0.30.21: resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==} @@ -1637,6 +1880,9 @@ packages: resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} engines: {node: '>= 0.4'} + mdn-data@2.27.1: + resolution: {integrity: sha512-9Yubnt3e8A0OKwxYSXyhLymGW4sCufcLG6VdiDdUGVkPhpqLxlvP5vl1983gQjJl3tqbrM731mjaZaP68AgosQ==} + media-typer@1.1.0: resolution: {integrity: sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==} engines: {node: '>= 0.8'} @@ -1672,6 +1918,10 @@ packages: resolution: {integrity: sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==} engines: {node: '>=18'} + min-indent@1.0.1: + resolution: {integrity: sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==} + engines: {node: '>=4'} + minimatch@10.2.5: resolution: {integrity: sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==} engines: {node: 18 || 20 || >=22} @@ -1743,6 +1993,9 @@ packages: resolution: {integrity: sha512-EFVjAYfzWqWsBMRHPMAXLCDIJnpMhdWAqR7xG6M6a2cs6PMFpl/+Z20w9zDW4vkxOFfddegBKq9Rehd0bxWE7A==} 
engines: {node: '>= 10'} + obug@2.1.1: + resolution: {integrity: sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==} + on-finished@2.4.1: resolution: {integrity: sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==} engines: {node: '>= 0.8'} @@ -1793,6 +2046,9 @@ packages: resolution: {integrity: sha512-TXfryirbmq34y8QBwgqCVLi+8oA3oWx2eAnSn62ITyEhEYaWRlVZ2DvMM9eZbMs/RfxPu/PK/aBLyGj4IrqMHw==} engines: {node: '>=18'} + parse5@8.0.0: + resolution: {integrity: sha512-9m4m5GSgXjL4AjumKzq1Fgfp3Z8rsvjRNbnkVwfu2ImRqE5D0LnY2QfDen18FSY9C573YU5XxSapdHZTZ2WolA==} + parseurl@1.3.3: resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==} engines: {node: '>= 0.8'} @@ -1818,6 +2074,9 @@ packages: path-to-regexp@8.4.2: resolution: {integrity: sha512-qRcuIdP69NPm4qbACK+aDogI5CBDMi1jKe0ry5rSQJz8JVLsC7jV8XpiJjGRLLol3N+R5ihGYcrPLTno6pAdBA==} + pathe@2.0.3: + resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==} + picocolors@1.1.1: resolution: {integrity: sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==} @@ -1849,6 +2108,10 @@ packages: resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==} engines: {node: '>= 0.8.0'} + pretty-format@27.5.1: + resolution: {integrity: sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ==} + engines: {node: ^10.13.0 || ^12.13.0 || ^14.15.0 || >=15.0.0} + pretty-ms@9.3.0: resolution: {integrity: sha512-gjVS5hOP+M3wMm5nmNOucbIrqudzs9v/57bWRHQWLYklXqoXKrVfYW2W9+glfGsqtPgpiz5WwyEEB+ksXIx3gQ==} engines: {node: '>=18'} @@ -1885,6 +2148,9 @@ packages: peerDependencies: react: ^19.2.5 + react-is@17.0.2: + resolution: {integrity: 
sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==} + react-router@7.14.0: resolution: {integrity: sha512-m/xR9N4LQLmAS0ZhkY2nkPA1N7gQ5TUVa5n8TgANuDTARbn1gt+zLPXEm7W0XDTbrQ2AJSJKhoa6yx1D8BcpxQ==} engines: {node: '>=20.0.0'} @@ -1903,6 +2169,10 @@ packages: resolution: {integrity: sha512-YTUo+Flmw4ZXiWfQKGcwwc11KnoRAYgzAE2E7mXKCjSviTKShtxBsN6YUUBB2gtaBzKzeKunxhUwNHQuRryhWA==} engines: {node: '>= 4'} + redent@3.0.0: + resolution: {integrity: sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==} + engines: {node: '>=8'} + require-directory@2.1.1: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: '>=0.10.0'} @@ -1948,6 +2218,10 @@ packages: safer-buffer@2.1.2: resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==} + saxes@6.0.0: + resolution: {integrity: sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==} + engines: {node: '>=v12.22.7'} + scheduler@0.27.0: resolution: {integrity: sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==} @@ -2002,6 +2276,9 @@ packages: resolution: {integrity: sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==} engines: {node: '>= 0.4'} + siginfo@2.0.0: + resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==} + signal-exit@3.0.7: resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==} @@ -2020,10 +2297,16 @@ packages: resolution: {integrity: sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==} engines: {node: '>=0.10.0'} + stackback@0.0.2: + resolution: {integrity: 
sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==} + statuses@2.0.2: resolution: {integrity: sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==} engines: {node: '>= 0.8'} + std-env@4.1.0: + resolution: {integrity: sha512-Rq7ybcX2RuC55r9oaPVEW7/xu3tj8u4GeBYHBWCychFtzMIr86A7e3PPEBPT37sHStKX3+TiX/Fr/ACmJLVlLQ==} + stdin-discarder@0.2.2: resolution: {integrity: sha512-UhDfHmA92YAlNnCfhmq0VeNL5bDbiZGg7sZ2IvPsXubGkiNa9EC+tUTsjBRsYUAz87btI6/1wf4XoVvQ3uRnmQ==} engines: {node: '>=18'} @@ -2063,6 +2346,10 @@ packages: resolution: {integrity: sha512-aulFJcD6YK8V1G7iRB5tigAP4TsHBZZrOV8pjV++zdUwmeV8uzbY7yn6h9MswN62adStNZFuCIx4haBnRuMDaw==} engines: {node: '>=18'} + strip-indent@3.0.0: + resolution: {integrity: sha512-laJTa3Jb+VQpaC6DseHhF7dXVqHTfJPCRDaEbid/drOhgitgYku/letMUqOXFoWV0zIIUbjpdH2t+tYj4bQMRQ==} + engines: {node: '>=8'} + strip-json-comments@3.1.1: resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} engines: {node: '>=8'} @@ -2071,6 +2358,9 @@ packages: resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==} engines: {node: '>=8'} + symbol-tree@3.2.4: + resolution: {integrity: sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==} + tabbable@6.4.0: resolution: {integrity: sha512-05PUHKSNE8ou2dwIxTngl4EzcnsCDZGJ/iCLtDflR/SHB/ny14rXc+qU5P4mG9JkusiV7EivzY9Mhm55AzAvCg==} @@ -2091,10 +2381,21 @@ packages: tiny-invariant@1.3.3: resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} + tinybench@2.9.0: + resolution: {integrity: sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==} + + tinyexec@1.1.1: + resolution: {integrity: 
sha512-VKS/ZaQhhkKFMANmAOhhXVoIfBXblQxGX1myCQ2faQrfmobMftXeJPcZGp0gS07ocvGJWDLZGyOZDadDBqYIJg==} + engines: {node: '>=18'} + tinyglobby@0.2.16: resolution: {integrity: sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg==} engines: {node: '>=12.0.0'} + tinyrainbow@3.1.0: + resolution: {integrity: sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==} + engines: {node: '>=14.0.0'} + tldts-core@7.0.28: resolution: {integrity: sha512-7W5Efjhsc3chVdFhqtaU0KtK32J37Zcr9RKtID54nG+tIpcY79CQK/veYPODxtD/LJ4Lue66jvrQzIX2Z2/pUQ==} @@ -2114,6 +2415,10 @@ packages: resolution: {integrity: sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw==} engines: {node: '>=16'} + tr46@6.0.0: + resolution: {integrity: sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==} + engines: {node: '>=20'} + ts-api-utils@2.5.0: resolution: {integrity: sha512-OJ/ibxhPlqrMM0UiNHJ/0CKQkoKF243/AEmplt3qpRgkW8VG7IfOS41h7V8TjITqdByHzrjcS/2si+y4lIh8NA==} engines: {node: '>=18.12'} @@ -2160,6 +2465,10 @@ packages: undici-types@7.16.0: resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==} + undici@7.25.0: + resolution: {integrity: sha512-xXnp4kTyor2Zq+J1FfPI6Eq3ew5h6Vl0F/8d9XU5zZQf1tX9s2Su1/3PiMmUANFULpmksxkClamIZcaUqryHsQ==} + engines: {node: '>=20.18.1'} + unicorn-magic@0.3.0: resolution: {integrity: sha512-+QBBXBCvifc56fsbuxZQ6Sic3wqqc3WWaqxs58gvJrcOuN83HGTCwz3oS5phzU9LthRNE9VrJCFCLUgHeeFnfA==} engines: {node: '>=18'} @@ -2243,10 +2552,67 @@ packages: yaml: optional: true + vitest@4.1.4: + resolution: {integrity: sha512-tFuJqTxKb8AvfyqMfnavXdzfy3h3sWZRWwfluGbkeR7n0HUev+FmNgZ8SDrRBTVrVCjgH5cA21qGbCffMNtWvg==} + engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0} + hasBin: true + peerDependencies: + '@edge-runtime/vm': '*' + '@opentelemetry/api': ^1.9.0 + '@types/node': ^20.0.0 || 
^22.0.0 || >=24.0.0 + '@vitest/browser-playwright': 4.1.4 + '@vitest/browser-preview': 4.1.4 + '@vitest/browser-webdriverio': 4.1.4 + '@vitest/coverage-istanbul': 4.1.4 + '@vitest/coverage-v8': 4.1.4 + '@vitest/ui': 4.1.4 + happy-dom: '*' + jsdom: '*' + vite: ^6.0.0 || ^7.0.0 || ^8.0.0 + peerDependenciesMeta: + '@edge-runtime/vm': + optional: true + '@opentelemetry/api': + optional: true + '@types/node': + optional: true + '@vitest/browser-playwright': + optional: true + '@vitest/browser-preview': + optional: true + '@vitest/browser-webdriverio': + optional: true + '@vitest/coverage-istanbul': + optional: true + '@vitest/coverage-v8': + optional: true + '@vitest/ui': + optional: true + happy-dom: + optional: true + jsdom: + optional: true + + w3c-xmlserializer@5.0.0: + resolution: {integrity: sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==} + engines: {node: '>=18'} + web-streams-polyfill@3.3.3: resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==} engines: {node: '>= 8'} + webidl-conversions@8.0.1: + resolution: {integrity: sha512-BMhLD/Sw+GbJC21C/UgyaZX41nPt8bUTg+jWyDeg7e7YN4xOM05YPSIXceACnXVtqyEw/LMClUQMtMZ+PGGpqQ==} + engines: {node: '>=20'} + + whatwg-mimetype@5.0.0: + resolution: {integrity: sha512-sXcNcHOC51uPGF0P/D4NVtrkjSU2fNsm9iog4ZvZJsL3rjoDAzXZhkm2MWt1y+PUdggKAYVoMAIYcs78wJ51Cw==} + engines: {node: '>=20'} + + whatwg-url@16.0.1: + resolution: {integrity: sha512-1to4zXBxmXHV3IiSSEInrreIlu02vUOvrhxJJH5vcxYTBDAx51cqZiKdyTxlecdKNSjj8EcxGBxNf6Vg+945gw==} + engines: {node: ^20.19.0 || ^22.12.0 || >=24.0.0} + which@2.0.2: resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} engines: {node: '>= 8'} @@ -2257,6 +2623,11 @@ packages: engines: {node: ^16.13.0 || >=18.0.0} hasBin: true + why-is-node-running@2.3.0: + resolution: {integrity: 
sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w==} + engines: {node: '>=8'} + hasBin: true + word-wrap@1.2.5: resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==} engines: {node: '>=0.10.0'} @@ -2276,6 +2647,13 @@ packages: resolution: {integrity: sha512-g/eziiSUNBSsdDJtCLB8bdYEUMj4jR7AGeUo96p/3dTafgjHhpF4RiCFPiRILwjQoDXx5MqkBr4fwWtR3Ky4Wg==} engines: {node: '>=20'} + xml-name-validator@5.0.0: + resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==} + engines: {node: '>=18'} + + xmlchars@2.2.0: + resolution: {integrity: sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==} + y18n@5.0.8: resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} engines: {node: '>=10'} @@ -2326,6 +2704,28 @@ packages: snapshots: + '@adobe/css-tools@4.4.4': {} + + '@asamuzakjp/css-color@5.1.11': + dependencies: + '@asamuzakjp/generational-cache': 1.0.1 + '@csstools/css-calc': 3.2.0(@csstools/css-parser-algorithms@4.0.0(@csstools/css-tokenizer@4.0.0))(@csstools/css-tokenizer@4.0.0) + '@csstools/css-color-parser': 4.1.0(@csstools/css-parser-algorithms@4.0.0(@csstools/css-tokenizer@4.0.0))(@csstools/css-tokenizer@4.0.0) + '@csstools/css-parser-algorithms': 4.0.0(@csstools/css-tokenizer@4.0.0) + '@csstools/css-tokenizer': 4.0.0 + + '@asamuzakjp/dom-selector@7.0.10': + dependencies: + '@asamuzakjp/generational-cache': 1.0.1 + '@asamuzakjp/nwsapi': 2.3.9 + bidi-js: 1.0.3 + css-tree: 3.2.1 + is-potential-custom-element-name: 1.0.1 + + '@asamuzakjp/generational-cache@1.0.1': {} + + '@asamuzakjp/nwsapi@2.3.9': {} + '@babel/code-frame@7.29.0': dependencies: '@babel/helper-validator-identifier': 7.28.5 @@ -2538,6 +2938,34 @@ snapshots: optionalDependencies: '@types/react': 19.2.14 + '@bramus/specificity@2.4.2': + 
dependencies: + css-tree: 3.2.1 + + '@csstools/color-helpers@6.0.2': {} + + '@csstools/css-calc@3.2.0(@csstools/css-parser-algorithms@4.0.0(@csstools/css-tokenizer@4.0.0))(@csstools/css-tokenizer@4.0.0)': + dependencies: + '@csstools/css-parser-algorithms': 4.0.0(@csstools/css-tokenizer@4.0.0) + '@csstools/css-tokenizer': 4.0.0 + + '@csstools/css-color-parser@4.1.0(@csstools/css-parser-algorithms@4.0.0(@csstools/css-tokenizer@4.0.0))(@csstools/css-tokenizer@4.0.0)': + dependencies: + '@csstools/color-helpers': 6.0.2 + '@csstools/css-calc': 3.2.0(@csstools/css-parser-algorithms@4.0.0(@csstools/css-tokenizer@4.0.0))(@csstools/css-tokenizer@4.0.0) + '@csstools/css-parser-algorithms': 4.0.0(@csstools/css-tokenizer@4.0.0) + '@csstools/css-tokenizer': 4.0.0 + + '@csstools/css-parser-algorithms@4.0.0(@csstools/css-tokenizer@4.0.0)': + dependencies: + '@csstools/css-tokenizer': 4.0.0 + + '@csstools/css-syntax-patches-for-csstree@1.1.3(css-tree@3.2.1)': + optionalDependencies: + css-tree: 3.2.1 + + '@csstools/css-tokenizer@4.0.0': {} + '@dotenvx/dotenvx@1.61.0': dependencies: commander: 11.1.0 @@ -2617,6 +3045,10 @@ snapshots: '@eslint/core': 0.17.0 levn: 0.4.1 + '@exodus/bytes@1.15.0(@noble/hashes@1.8.0)': + optionalDependencies: + '@noble/hashes': 1.8.0 + '@floating-ui/core@1.7.5': dependencies: '@floating-ui/utils': 0.2.11 @@ -2826,6 +3258,8 @@ snapshots: '@sindresorhus/merge-streams@4.0.0': {} + '@standard-schema/spec@1.1.0': {} + '@tailwindcss/node@4.2.2': dependencies: '@jridgewell/remapping': 2.3.5 @@ -2909,6 +3343,40 @@ snapshots: '@tanstack/virtual-core@3.13.23': {} + '@testing-library/dom@10.4.1': + dependencies: + '@babel/code-frame': 7.29.0 + '@babel/runtime': 7.29.2 + '@types/aria-query': 5.0.4 + aria-query: 5.3.0 + dom-accessibility-api: 0.5.16 + lz-string: 1.5.0 + picocolors: 1.1.1 + pretty-format: 27.5.1 + + '@testing-library/jest-dom@6.9.1': + dependencies: + '@adobe/css-tools': 4.4.4 + aria-query: 5.3.2 + css.escape: 1.5.1 + dom-accessibility-api: 0.6.3 + 
picocolors: 1.1.1 + redent: 3.0.0 + + '@testing-library/react@16.3.2(@testing-library/dom@10.4.1)(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.5(react@19.2.5))(react@19.2.5)': + dependencies: + '@babel/runtime': 7.29.2 + '@testing-library/dom': 10.4.1 + react: 19.2.5 + react-dom: 19.2.5(react@19.2.5) + optionalDependencies: + '@types/react': 19.2.14 + '@types/react-dom': 19.2.3(@types/react@19.2.14) + + '@testing-library/user-event@14.6.1(@testing-library/dom@10.4.1)': + dependencies: + '@testing-library/dom': 10.4.1 + '@ts-morph/common@0.27.0': dependencies: fast-glob: 3.3.3 @@ -2920,6 +3388,15 @@ snapshots: tslib: 2.8.1 optional: true + '@types/aria-query@5.0.4': {} + + '@types/chai@5.2.3': + dependencies: + '@types/deep-eql': 4.0.2 + assertion-error: 2.0.1 + + '@types/deep-eql@4.0.2': {} + '@types/estree@1.0.8': {} '@types/json-schema@7.0.15': {} @@ -3036,6 +3513,48 @@ snapshots: '@rolldown/pluginutils': 1.0.0-rc.7 vite: 8.0.8(@types/node@24.12.2)(jiti@2.6.1) + '@vitest/expect@4.1.4': + dependencies: + '@standard-schema/spec': 1.1.0 + '@types/chai': 5.2.3 + '@vitest/spy': 4.1.4 + '@vitest/utils': 4.1.4 + chai: 6.2.2 + tinyrainbow: 3.1.0 + + '@vitest/mocker@4.1.4(msw@2.13.2(@types/node@24.12.2)(typescript@6.0.2))(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1))': + dependencies: + '@vitest/spy': 4.1.4 + estree-walker: 3.0.3 + magic-string: 0.30.21 + optionalDependencies: + msw: 2.13.2(@types/node@24.12.2)(typescript@6.0.2) + vite: 8.0.8(@types/node@24.12.2)(jiti@2.6.1) + + '@vitest/pretty-format@4.1.4': + dependencies: + tinyrainbow: 3.1.0 + + '@vitest/runner@4.1.4': + dependencies: + '@vitest/utils': 4.1.4 + pathe: 2.0.3 + + '@vitest/snapshot@4.1.4': + dependencies: + '@vitest/pretty-format': 4.1.4 + '@vitest/utils': 4.1.4 + magic-string: 0.30.21 + pathe: 2.0.3 + + '@vitest/spy@4.1.4': {} + + '@vitest/utils@4.1.4': + dependencies: + '@vitest/pretty-format': 4.1.4 + convert-source-map: 2.0.0 + tinyrainbow: 3.1.0 + accepts@2.0.0: 
dependencies: mime-types: 3.0.2 @@ -3075,8 +3594,18 @@ snapshots: dependencies: color-convert: 2.0.1 + ansi-styles@5.2.0: {} + argparse@2.0.1: {} + aria-query@5.3.0: + dependencies: + dequal: 2.0.3 + + aria-query@5.3.2: {} + + assertion-error@2.0.1: {} + ast-types@0.16.1: dependencies: tslib: 2.8.1 @@ -3087,6 +3616,10 @@ snapshots: baseline-browser-mapping@2.10.17: {} + bidi-js@1.0.3: + dependencies: + require-from-string: 2.0.2 + body-parser@2.2.2: dependencies: bytes: 3.1.2 @@ -3142,6 +3675,8 @@ snapshots: caniuse-lite@1.0.30001787: {} + chai@6.2.2: {} + chalk@4.1.2: dependencies: ansi-styles: 4.3.0 @@ -3215,16 +3750,32 @@ snapshots: shebang-command: 2.0.0 which: 2.0.2 + css-tree@3.2.1: + dependencies: + mdn-data: 2.27.1 + source-map-js: 1.2.1 + + css.escape@1.5.1: {} + cssesc@3.0.0: {} csstype@3.2.3: {} data-uri-to-buffer@4.0.1: {} + data-urls@7.0.0(@noble/hashes@1.8.0): + dependencies: + whatwg-mimetype: 5.0.0 + whatwg-url: 16.0.1(@noble/hashes@1.8.0) + transitivePeerDependencies: + - '@noble/hashes' + debug@4.4.3: dependencies: ms: 2.1.3 + decimal.js@10.6.0: {} + dedent@1.7.2: {} deep-is@0.1.4: {} @@ -3242,10 +3793,16 @@ snapshots: depd@2.0.0: {} + dequal@2.0.3: {} + detect-libc@2.1.2: {} diff@8.0.4: {} + dom-accessibility-api@0.5.16: {} + + dom-accessibility-api@0.6.3: {} + dotenv@17.4.1: {} dunder-proto@1.0.1: @@ -3276,6 +3833,8 @@ snapshots: graceful-fs: 4.2.11 tapable: 2.3.2 + entities@6.0.1: {} + env-paths@2.2.1: {} error-ex@1.3.4: @@ -3286,6 +3845,8 @@ snapshots: es-errors@1.3.0: {} + es-module-lexer@2.0.0: {} + es-object-atoms@1.1.1: dependencies: es-errors: 1.3.0 @@ -3381,6 +3942,10 @@ snapshots: estraverse@5.3.0: {} + estree-walker@3.0.3: + dependencies: + '@types/estree': 1.0.8 + esutils@2.0.3: {} etag@1.8.1: {} @@ -3418,6 +3983,8 @@ snapshots: strip-final-newline: 4.0.0 yoctocolors: 2.1.2 + expect-type@1.3.0: {} + express-rate-limit@8.3.2(express@5.2.1): dependencies: express: 5.2.1 @@ -3610,6 +4177,12 @@ snapshots: hono@4.12.12: {} + 
html-encoding-sniffer@6.0.0(@noble/hashes@1.8.0): + dependencies: + '@exodus/bytes': 1.15.0(@noble/hashes@1.8.0) + transitivePeerDependencies: + - '@noble/hashes' + http-errors@2.0.1: dependencies: depd: 2.0.0 @@ -3644,6 +4217,8 @@ snapshots: imurmurhash@0.1.4: {} + indent-string@4.0.0: {} + inherits@2.0.4: {} ip-address@10.1.0: {} @@ -3678,6 +4253,8 @@ snapshots: is-plain-obj@4.1.0: {} + is-potential-custom-element-name@1.0.1: {} + is-promise@4.0.0: {} is-regexp@3.1.0: {} @@ -3708,6 +4285,32 @@ snapshots: dependencies: argparse: 2.0.1 + jsdom@29.0.2(@noble/hashes@1.8.0): + dependencies: + '@asamuzakjp/css-color': 5.1.11 + '@asamuzakjp/dom-selector': 7.0.10 + '@bramus/specificity': 2.4.2 + '@csstools/css-syntax-patches-for-csstree': 1.1.3(css-tree@3.2.1) + '@exodus/bytes': 1.15.0(@noble/hashes@1.8.0) + css-tree: 3.2.1 + data-urls: 7.0.0(@noble/hashes@1.8.0) + decimal.js: 10.6.0 + html-encoding-sniffer: 6.0.0(@noble/hashes@1.8.0) + is-potential-custom-element-name: 1.0.1 + lru-cache: 11.3.5 + parse5: 8.0.0 + saxes: 6.0.0 + symbol-tree: 3.2.4 + tough-cookie: 6.0.1 + undici: 7.25.0 + w3c-xmlserializer: 5.0.0 + webidl-conversions: 8.0.1 + whatwg-mimetype: 5.0.0 + whatwg-url: 16.0.1(@noble/hashes@1.8.0) + xml-name-validator: 5.0.0 + transitivePeerDependencies: + - '@noble/hashes' + jsesc@3.1.0: {} json-buffer@3.0.1: {} @@ -3805,6 +4408,8 @@ snapshots: chalk: 5.6.2 is-unicode-supported: 1.3.0 + lru-cache@11.3.5: {} + lru-cache@5.1.1: dependencies: yallist: 3.1.1 @@ -3813,12 +4418,16 @@ snapshots: dependencies: react: 19.2.5 + lz-string@1.5.0: {} + magic-string@0.30.21: dependencies: '@jridgewell/sourcemap-codec': 1.5.5 math-intrinsics@1.1.0: {} + mdn-data@2.27.1: {} + media-typer@1.1.0: {} merge-descriptors@2.0.0: {} @@ -3842,6 +4451,8 @@ snapshots: mimic-function@5.0.1: {} + min-indent@1.0.1: {} + minimatch@10.2.5: dependencies: brace-expansion: 5.0.5 @@ -3912,6 +4523,8 @@ snapshots: object-treeify@1.1.33: {} + obug@2.1.1: {} + on-finished@2.4.1: dependencies: ee-first: 
1.1.1 @@ -3981,6 +4594,10 @@ snapshots: parse-ms@4.0.0: {} + parse5@8.0.0: + dependencies: + entities: 6.0.1 + parseurl@1.3.3: {} path-browserify@1.0.1: {} @@ -3995,6 +4612,8 @@ snapshots: path-to-regexp@8.4.2: {} + pathe@2.0.3: {} + picocolors@1.1.1: {} picomatch@2.3.2: {} @@ -4018,6 +4637,12 @@ snapshots: prelude-ls@1.2.1: {} + pretty-format@27.5.1: + dependencies: + ansi-regex: 5.0.1 + ansi-styles: 5.2.0 + react-is: 17.0.2 + pretty-ms@9.3.0: dependencies: parse-ms: 4.0.0 @@ -4054,6 +4679,8 @@ snapshots: react: 19.2.5 scheduler: 0.27.0 + react-is@17.0.2: {} + react-router@7.14.0(react-dom@19.2.5(react@19.2.5))(react@19.2.5): dependencies: cookie: 1.1.1 @@ -4072,6 +4699,11 @@ snapshots: tiny-invariant: 1.3.3 tslib: 2.8.1 + redent@3.0.0: + dependencies: + indent-string: 4.0.0 + strip-indent: 3.0.0 + require-directory@2.1.1: {} require-from-string@2.0.2: {} @@ -4128,6 +4760,10 @@ snapshots: safer-buffer@2.1.2: {} + saxes@6.0.0: + dependencies: + xmlchars: 2.2.0 + scheduler@0.27.0: {} semver@6.3.1: {} @@ -4240,6 +4876,8 @@ snapshots: side-channel-map: 1.0.1 side-channel-weakmap: 1.0.2 + siginfo@2.0.0: {} + signal-exit@3.0.7: {} signal-exit@4.1.0: {} @@ -4250,8 +4888,12 @@ snapshots: source-map@0.6.1: {} + stackback@0.0.2: {} + statuses@2.0.2: {} + std-env@4.1.0: {} + stdin-discarder@0.2.2: {} strict-event-emitter@0.5.1: {} @@ -4288,12 +4930,18 @@ snapshots: strip-final-newline@4.0.0: {} + strip-indent@3.0.0: + dependencies: + min-indent: 1.0.1 + strip-json-comments@3.1.1: {} supports-color@7.2.0: dependencies: has-flag: 4.0.0 + symbol-tree@3.2.4: {} + tabbable@6.4.0: {} tagged-tag@1.0.0: {} @@ -4306,11 +4954,17 @@ snapshots: tiny-invariant@1.3.3: {} + tinybench@2.9.0: {} + + tinyexec@1.1.1: {} + tinyglobby@0.2.16: dependencies: fdir: 6.5.0(picomatch@4.0.4) picomatch: 4.0.4 + tinyrainbow@3.1.0: {} + tldts-core@7.0.28: {} tldts@7.0.28: @@ -4327,6 +4981,10 @@ snapshots: dependencies: tldts: 7.0.28 + tr46@6.0.0: + dependencies: + punycode: 2.3.1 + 
ts-api-utils@2.5.0(typescript@6.0.2): dependencies: typescript: 6.0.2 @@ -4375,6 +5033,8 @@ snapshots: undici-types@7.16.0: {} + undici@7.25.0: {} + unicorn-magic@0.3.0: {} universalify@2.0.1: {} @@ -4415,8 +5075,52 @@ snapshots: fsevents: 2.3.3 jiti: 2.6.1 + vitest@4.1.4(@types/node@24.12.2)(jsdom@29.0.2(@noble/hashes@1.8.0))(msw@2.13.2(@types/node@24.12.2)(typescript@6.0.2))(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1)): + dependencies: + '@vitest/expect': 4.1.4 + '@vitest/mocker': 4.1.4(msw@2.13.2(@types/node@24.12.2)(typescript@6.0.2))(vite@8.0.8(@types/node@24.12.2)(jiti@2.6.1)) + '@vitest/pretty-format': 4.1.4 + '@vitest/runner': 4.1.4 + '@vitest/snapshot': 4.1.4 + '@vitest/spy': 4.1.4 + '@vitest/utils': 4.1.4 + es-module-lexer: 2.0.0 + expect-type: 1.3.0 + magic-string: 0.30.21 + obug: 2.1.1 + pathe: 2.0.3 + picomatch: 4.0.4 + std-env: 4.1.0 + tinybench: 2.9.0 + tinyexec: 1.1.1 + tinyglobby: 0.2.16 + tinyrainbow: 3.1.0 + vite: 8.0.8(@types/node@24.12.2)(jiti@2.6.1) + why-is-node-running: 2.3.0 + optionalDependencies: + '@types/node': 24.12.2 + jsdom: 29.0.2(@noble/hashes@1.8.0) + transitivePeerDependencies: + - msw + + w3c-xmlserializer@5.0.0: + dependencies: + xml-name-validator: 5.0.0 + web-streams-polyfill@3.3.3: {} + webidl-conversions@8.0.1: {} + + whatwg-mimetype@5.0.0: {} + + whatwg-url@16.0.1(@noble/hashes@1.8.0): + dependencies: + '@exodus/bytes': 1.15.0(@noble/hashes@1.8.0) + tr46: 6.0.0 + webidl-conversions: 8.0.1 + transitivePeerDependencies: + - '@noble/hashes' + which@2.0.2: dependencies: isexe: 2.0.0 @@ -4425,6 +5129,11 @@ snapshots: dependencies: isexe: 3.1.5 + why-is-node-running@2.3.0: + dependencies: + siginfo: 2.0.0 + stackback: 0.0.2 + word-wrap@1.2.5: {} wrap-ansi@6.2.0: @@ -4446,6 +5155,10 @@ snapshots: is-wsl: 3.1.1 powershell-utils: 0.1.0 + xml-name-validator@5.0.0: {} + + xmlchars@2.2.0: {} + y18n@5.0.8: {} yallist@3.1.1: {} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts 
b/customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts index e90b8b9..8d7e3df 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/api/archives.ts @@ -42,8 +42,13 @@ export function useUploadArchive() { redirect_url: string }>(apiUrl("/archives"), { method: "POST", body: formData }) }, - onSuccess: () => { + onSuccess: (data) => { void queryClient.invalidateQueries({ queryKey: ["archives"] }) + // Re-uploads reuse the archive_id, so a stale detail cache would + // show the previous upload's values (uploaded_by, counts, etc.). + void queryClient.invalidateQueries({ + queryKey: ["archive", data.archive.archive_id], + }) }, }) } @@ -56,7 +61,7 @@ export function useDeleteArchive() { apiUrl(`/archives/${encodeURIComponent(archiveId)}`), { method: "DELETE" }, ) - if (!response.ok && response.status !== 204) { + if (!response.ok) { let message = response.statusText try { const payload = (await response.json()) as { error?: string } @@ -67,8 +72,9 @@ export function useDeleteArchive() { throw new Error(message) } }, - onSuccess: () => { + onSuccess: (_, archiveId) => { void queryClient.invalidateQueries({ queryKey: ["archives"] }) + queryClient.removeQueries({ queryKey: ["archive", archiveId] }) }, }) } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts b/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts index 29f648b..b8bb31c 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts @@ -1,5 +1,10 @@ -import { keepPreviousData, useQuery } from "@tanstack/react-query" -import type { IssueRecord, IssuesResponse } from "@/types" +import { + keepPreviousData, + useQuery, + useMutation, + useQueryClient, +} from "@tanstack/react-query" +import type { IssueRecord, IssuesResponse, IssueState } from "@/types" import type { ArchiveSummary } from 
"@/types" import { fetchJson, apiUrl } from "./client" @@ -41,3 +46,45 @@ export function useIssueDetail(archiveId: string, issueId: string) { ), }) } + +export function useIssueStates(archiveId: string) { + return useQuery({ + queryKey: ["issue-state", archiveId], + queryFn: () => + fetchJson<{ items: IssueState[] | null }>( + apiUrl( + `/archives/${encodeURIComponent(archiveId)}/issue-state`, + ), + ), + }) +} + +export function useSetIssueState(archiveId: string) { + const qc = useQueryClient() + return useMutation({ + mutationFn: async ({ + fingerprint, + state, + }: { + fingerprint: string + state: "" | "ack" | "dismissed" + }) => { + const res = await fetch( + apiUrl( + `/archives/${encodeURIComponent(archiveId)}/issue-state/${encodeURIComponent(fingerprint)}`, + ), + { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ state }), + }, + ) + if (!res.ok) { + throw new Error(`set state failed (${res.status})`) + } + }, + onSuccess: () => { + qc.invalidateQueries({ queryKey: ["issue-state", archiveId] }) + }, + }) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx index 16c0810..47c0f3e 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/ArtifactBrowserPage.tsx @@ -1,15 +1,27 @@ import { useRef, useState, useEffect, useMemo } from "react" -import { useParams, useNavigate } from "react-router" +import { useParams, useNavigate, useLocation } from "react-router" import { Card, CardContent } from "@/components/ui/card" +import { Input } from "@/components/ui/input" import { buttonVariants } from "@/components/ui/button" import { ArchiveHeader } from "@/components/layout/ArchiveHeader" import { useArchive } from "@/api/archives" 
import { useArtifacts, useArtifactContent } from "@/api/artifacts" import { formatBytes, encodeSegment, encodePath, cn } from "@/lib/utils" +import { severityBadge } from "@/lib/severity" import { apiUrl } from "@/api/client" import { useVirtualizer } from "@tanstack/react-virtual" -import { Folder, File, Download, ChevronRight, FileWarning } from "lucide-react" -import type { ArtifactRecord } from "@/types" +import { + Folder, + File, + Download, + ChevronRight, + FileWarning, + Search, + Link2, +} from "lucide-react" +import type { ArtifactRecord, IssueRecord } from "@/types" +import { JsonTree } from "./JsonTree" +import { SystemIdentityCard } from "./SystemIdentityCard" export function ArtifactBrowserPage() { const { archiveId, "*": splat } = useParams<{ @@ -17,6 +29,7 @@ export function ArtifactBrowserPage() { "*": string }>() const navigate = useNavigate() + const location = useLocation() const artifactPath = splat ?? "" const { data: archive } = useArchive(archiveId!) @@ -27,6 +40,7 @@ export function ArtifactBrowserPage() { ) const [selectedPath, setSelectedPath] = useState(artifactPath) + const [filter, setFilter] = useState("") useEffect(() => { setSelectedPath(artifactPath) @@ -53,13 +67,37 @@ export function ArtifactBrowserPage() { ) } - const tree = useMemo(() => buildTree(artifacts), [artifacts]) + // Path → highest severity of any issue that cites this path (R20). + const pathSeverity = useMemo( + () => computePathSeverity(archive?.issues ?? []), + [archive?.issues], + ) + + // Filter artifacts by name substring (R20). Client-side; list is small. + const filteredArtifacts = useMemo(() => { + const q = filter.trim().toLowerCase() + if (!q) return artifacts + return artifacts.filter((a) => a.path.toLowerCase().includes(q)) + }, [artifacts, filter]) + + const tree = useMemo( + () => buildTree(filteredArtifacts, pathSeverity), + [filteredArtifacts, pathSeverity], + ) const previewLines = useMemo( () => preview?.content?.split(/\r?\n/) ?? 
[], [preview?.content], ) + // URL fragment #L42 — highlight-and-scroll target line (R21). + const highlightLine = useMemo(() => { + const m = /^#L(\d+)$/.exec(location.hash) + if (!m) return null + const n = Number(m[1]) + return Number.isFinite(n) && n > 0 ? n : null + }, [location.hash]) + if (isLoading) { return (
@@ -68,6 +106,18 @@ export function ArtifactBrowserPage() { ) } + const isJson = + preview?.content != null && + !preview.binary && + (selectedPath.endsWith(".json") || + preview.content.trimStart().startsWith("{") || + preview.content.trimStart().startsWith("[")) + + const isSystemOverview = + preview?.content != null && + !preview.binary && + selectedPath === "system/overview.txt" + return (
{archive ? ( @@ -78,9 +128,16 @@ export function ArtifactBrowserPage() { {/* File tree */} -

- Files -

+

Files

+
+ + setFilter(e.target.value)} + placeholder="Filter files..." + className="h-8 pl-7 text-xs" + /> +
{tree.map((node) => ( ))} + {tree.length === 0 ? ( +

+ No files match "{filter}" +

+ ) : null}
@@ -141,7 +203,17 @@ export function ArtifactBrowserPage() { file.
) : null} - + {isSystemOverview ? ( + + ) : null} + {isJson ? ( + + ) : ( + + )}
) : !selectedPath ? (
@@ -166,6 +238,9 @@ type TreeNode = { kind: "file" | "folder" exists_on_disk: boolean children: TreeNode[] + /** Highest severity among issues citing this file (for files), or among + * descendants (for folders). Empty string when no citations. */ + severity: string } function TreeNodeView({ @@ -190,6 +265,7 @@ function TreeNodeView({ )} onClick={() => onSelect(node.path)} > + {node.name} {!node.exists_on_disk ? ( @@ -203,6 +279,7 @@ function TreeNodeView({
+ {node.name} @@ -220,7 +297,36 @@ function TreeNodeView({ ) } -function VirtualLogView({ lines }: { lines: string[] }) { +function SeverityDot({ severity }: { severity: string }) { + if (!severity) { + return + } + const meta = severityBadge(severity) + // Use the severity text color; a small colored dot is visually paired with + // the icon on the sibling row, so screen-reader text is the title tooltip. + return ( + + ) +} + +function VirtualLogView({ + lines, + highlightLine, +}: { + lines: string[] + highlightLine: number | null +}) { const parentRef = useRef(null) const virtualizer = useVirtualizer({ count: lines.length, @@ -229,6 +335,14 @@ function VirtualLogView({ lines }: { lines: string[] }) { overscan: 20, }) + // Scroll the highlighted line into view on mount / URL-hash change (R21). + useEffect(() => { + if (highlightLine == null) return + if (highlightLine > lines.length) return + // virtualizer row indices are 0-based; line numbers in the URL are 1-based. + virtualizer.scrollToIndex(highlightLine - 1, { align: "center" }) + }, [highlightLine, lines.length, virtualizer]) + return (
- {virtualizer.getVirtualItems().map((virtualRow) => ( -
- - {virtualRow.index + 1} - - - {lines[virtualRow.index].length > 0 - ? lines[virtualRow.index] - : " "} - -
- ))} + {virtualizer.getVirtualItems().map((virtualRow) => { + const lineNumber = virtualRow.index + 1 + const highlighted = lineNumber === highlightLine + return ( + + ) + })}
) } -function buildTree(artifacts: ArtifactRecord[]): TreeNode[] { +/** + * Map each artifact path to the highest severity of any issue citing it via + * `related_artifact_paths`. Used to render colored dots in the tree (R20). + */ +function computePathSeverity(issues: IssueRecord[]): Map { + const out = new Map() + for (const issue of issues) { + const sev = severityBadge(issue.severity) + for (const p of issue.related_artifact_paths ?? []) { + const prev = out.get(p) + if (!prev || severityBadge(prev).rank < sev.rank) { + out.set(p, sev.key) + } + } + } + return out +} + +function buildTree( + artifacts: ArtifactRecord[], + pathSeverity: Map, +): TreeNode[] { const root: TreeNode[] = [] for (const artifact of artifacts) { const segments = artifact.path.split("/").filter(Boolean) @@ -293,12 +451,14 @@ function buildTree(artifacts: ArtifactRecord[]): TreeNode[] { kind: isLeaf ? "file" : "folder", exists_on_disk: isLeaf ? artifact.exists_on_disk : true, children: [], + severity: isLeaf ? pathSeverity.get(currentPath) ?? "" : "", } current.push(node) current = node.children } } sortTree(root) + propagateSeverity(root) return root } @@ -309,3 +469,20 @@ function sortTree(nodes: TreeNode[]) { }) for (const n of nodes) sortTree(n.children) } + +/** Roll the highest descendant severity up to each folder node. 
*/ +function propagateSeverity(nodes: TreeNode[]): string { + let max = "" + for (const n of nodes) { + if (n.kind === "folder") { + n.severity = propagateSeverity(n.children) + } + if ( + n.severity && + (!max || severityBadge(n.severity).rank > severityBadge(max).rank) + ) { + max = n.severity + } + } + return max +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/JsonTree.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/JsonTree.tsx new file mode 100644 index 0000000..a629b22 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/JsonTree.tsx @@ -0,0 +1,140 @@ +import { useMemo, useState } from "react" +import { ChevronDown, ChevronRight } from "lucide-react" +import { cn } from "@/lib/utils" + +/** + * Recursive JSON tree renderer. Used for .json artifacts under the artifact + * browser preview. Falls back to raw text if the content isn't valid JSON. + * + * Implementation is ~60 lines total (excluding styling) with no new deps: + * - parse once with JSON.parse inside a try-catch + * - render nodes recursively; track an "expanded" Set keyed by path + * - leaf values render inline with a type-specific color + */ +export function JsonTree({ text }: { text: string }) { + const parsed = useMemo(() => { + try { + return { ok: true as const, value: JSON.parse(text) } + } catch (err) { + return { ok: false as const, error: err as Error } + } + }, [text]) + + const [expanded, setExpanded] = useState>(() => new Set(["$"])) + if (!parsed.ok) { + return ( +
+ Not valid JSON — showing raw text below. +
+ ) + } + + const toggle = (path: string) => { + setExpanded((prev) => { + const next = new Set(prev) + if (next.has(path)) next.delete(path) + else next.add(path) + return next + }) + } + + return ( +
+ +
+ ) +} + +interface JsonNodeProps { + value: unknown + path: string + name: string | null + expanded: Set + onToggle: (path: string) => void +} + +function JsonNode({ value, path, name, expanded, onToggle }: JsonNodeProps) { + const isObject = value !== null && typeof value === "object" + if (!isObject) return + + const isArray = Array.isArray(value) + const entries: [string, unknown][] = isArray + ? (value as unknown[]).map((v, i) => [String(i), v]) + : Object.entries(value as Record) + + const open = expanded.has(path) + const summary = isArray + ? `[${entries.length}]` + : `{${entries.length}}` + + return ( +
+ + {open ? ( +
+ {entries.map(([k, v]) => ( + + ))} +
+ ) : null} +
+ ) +} + +function Leaf({ name, value }: { name: string | null; value: unknown }) { + const text = value === null ? "null" : JSON.stringify(value) + const color = leafColor(value) + return ( +
+ {name !== null ? ( + + {JSON.stringify(name)}: + + ) : null} + {text} +
+ ) +} + +function leafColor(value: unknown): string { + if (value === null) return "text-muted-foreground" + switch (typeof value) { + case "number": + return "text-severity-info" + case "boolean": + return "text-severity-warning" + case "string": + return "text-success" + default: + return "text-foreground" + } +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/SystemIdentityCard.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/SystemIdentityCard.tsx new file mode 100644 index 0000000..bd787f2 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/artifacts/SystemIdentityCard.tsx @@ -0,0 +1,73 @@ +import { humanBytes, humanUptime } from "@/lib/units" + +/** + * Parses key-value lines from the collector's `system/overview.txt` and + * renders a compact identity card: Hostname, Kernel, OS, CPU, Cores, RAM + * (humanised), Uptime (humanised), Arch. The raw text still renders below. + * + * This is display-only; the heuristics match the collector's actual format + * (each line is `Key: Value`). + */ +export function SystemIdentityCard({ text }: { text: string }) { + const kv = parseKeyValue(text) + if (!kv.size) return null + const fields: Array<[string, string | undefined]> = [ + ["Hostname", kv.get("hostname")], + ["OS", kv.get("os")], + ["Kernel", kv.get("kernel")], + ["CPU", kv.get("cpu")], + ["CPU cores", kv.get("cpu cores")], + ["Memory", humanizeMemory(kv.get("memory"))], + ["Uptime", humanizeUptime(kv.get("uptime"))], + ["Architecture", kv.get("architecture") ?? kv.get("arch")], + ] + const visible = fields.filter(([, v]) => v && v.trim() !== "") + if (!visible.length) return null + return ( +
+

System identity

+
+ {visible.map(([label, value]) => ( +
+

+ {label} +

+

+ {value} +

+
+ ))} +
+
+ ) +} + +function parseKeyValue(text: string): Map { + const out = new Map() + for (const raw of text.split(/\r?\n/)) { + const line = raw.trim() + if (!line || line.startsWith("#")) continue + // Stop at the first blank-ish separator like `/etc/os-release` header. + const sep = line.indexOf(":") + if (sep <= 0) continue + const key = line.slice(0, sep).trim().toLowerCase() + const val = line.slice(sep + 1).trim() + // Capture the first occurrence only (overview.txt has per-key lines). + if (!out.has(key) && val) out.set(key, val) + } + return out +} + +function humanizeMemory(raw: string | undefined): string | undefined { + if (!raw) return undefined + // Collector emits memory as bytes count string. + const n = Number(raw) + if (Number.isFinite(n) && n > 0) return humanBytes(n) + return raw +} + +function humanizeUptime(raw: string | undefined): string | undefined { + if (!raw) return undefined + const formatted = humanUptime(raw) + return formatted === "unknown" ? raw : formatted +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx new file mode 100644 index 0000000..5a4c3e2 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx @@ -0,0 +1,418 @@ +import { useMemo, useState } from "react" +import { useParams, Link } from "react-router" +import { Card, CardContent } from "@/components/ui/card" +import { buttonVariants } from "@/components/ui/button" +import { SeverityBadge } from "@/components/ui/severity-badge" +import { ConfidencePill } from "@/components/ui/confidence-pill" +import { ArchiveHeader } from "@/components/layout/ArchiveHeader" +import { useArchive } from "@/api/archives" +import { KV } from "@/components/ui/kv" +import type { IssueRecord, SkipReason } from "@/types" +import { + nextStep, + formatDate, + formatDuration, + encodeSegment, + cn, + 
severityColor, + sampleLine, +} from "@/lib/utils" +import { sourceLabel } from "@/lib/source" +import { + factLabel, + factDescription, + factSignal, + factValue, + type FactSignal, +} from "@/lib/facts" +import { groupIssues, type IssueGroup } from "@/lib/grouping" +import { decoratedTitle } from "@/lib/title" +import { composeOverviewSummary } from "@/lib/summary" + +export function DashboardPage() { + const { archiveId } = useParams<{ archiveId: string }>() + const { data: detail, isLoading, error } = useArchive(archiveId!) + + if (isLoading) { + return ( +
+ Loading archive... +
+ ) + } + if (error || !detail) { + return ( +
+ {error ? String(error) : "Archive not found"} +
+ ) + } + + const summary = detail.summary + const id = encodeSegment(archiveId!) + + return ( +
+ + + {/* Severity counts. Eye-tracking research (Improvado 2026 / Tabular KPI + guide): primary KPI should read loudest. We keep the grid width + identical across all four tiles so layout doesn't shift, and use + weight/contrast to pull Critical (and to a lesser extent Warning) + forward while muting Info / Total. */} +
+ + + + +
+ + {/* Summary + Host context */} +
+ + +

+ Summary +

+
+ {composeOverviewSummary(detail.issues, detail.collectors).map( + (sentence, i) => ( +

{sentence}

+ ), + )} +
+

+ {nextStep(summary)} +

+
+ + View issues + + + Browse artifacts + +
+
+
+ + + +

+ Host context +

+
+ + + + + + +
+
+
+
+ + {/* Top issues (grouped by code to avoid duplicates) */} + {detail.issues.length > 0 ? ( + + ) : null} + + {/* Collectors */} +
+

+ Collectors +

+
+ {detail.collectors.map((c) => ( + + +
+ + {c.collector_id} + + +
+
+ {formatDuration(c.duration_ms)} + {c.artifact_count} artifacts + {c.error_count > 0 ? ( + + {c.error_count} errors + + ) : ( + {c.error_count} errors + )} +
+ {c.skip_reasons && c.skip_reasons.length > 0 ? ( + + ) : null} + {c.facts ? : null} +
+
+ ))} +
+
+
+ ) +} + +function StatCard({ + label, + value, + textClass, + emphasis, +}: { + label: string + value: number + textClass: string + emphasis: "primary" | "secondary" | "muted" +}) { + const weight = + emphasis === "primary" + ? "text-2xl font-bold" + : emphasis === "secondary" + ? "text-2xl font-semibold" + : "text-xl font-medium" + return ( + + +

{value}

+

{label}

+
+
+ ) +} + +function SkipReasonList({ reasons }: { reasons: SkipReason[] }) { + // Present the most informative reason as the tile subtitle. Collectors + // typically emit one (e.g. "no RDMA devices in sysfs" for infiniband); + // when there are multiple, show the first as the headline with a hover + // tooltip summarising the rest. + const first = reasons[0] + const extra = reasons.length > 1 ? reasons.slice(1) : [] + return ( +
0 + ? extra + .map((r) => `${r.reason}${r.detail ? ` — ${r.detail}` : ""}`) + .join("\n") + : undefined + } + > + Skipped — {first.detail || first.reason} + {extra.length > 0 ? ( + + (+{extra.length} more) + + ) : null} +
+ ) +} + +function FactList({ facts }: { facts: Record }) { + const [expanded, setExpanded] = useState(false) + // 1. Drop collector-internal enrichment_* keys. + // 2. Suppress legacy `nvswitch_present` when the dotted `nvswitch.present` + // form is also present (R5 — avoid visual duplication). + const allEntries = Object.entries(facts).filter(([key]) => { + if (key.startsWith("enrichment_")) return false + if (key === "nvswitch_present" && "nvswitch.present" in facts) return false + return true + }) + if (allEntries.length === 0) return null + const entries = expanded ? allEntries : allEntries.slice(0, 4) + const hiddenCount = allEntries.length - 4 + return ( +
+ {entries.map(([key, value]) => ( + + ))} + {hiddenCount > 0 && !expanded ? ( + + ) : null} +
+ ) +} + +function FactRow({ factKey, value }: { factKey: string; value: unknown }) { + const signal = factSignal(factKey, value) + const description = factDescription(factKey) + const display = factValue(factKey, value, signal) + return ( +
+ + + {factLabel(factKey)} + + + {display} + +
+ ) +} + +function SignalDot({ signal }: { signal: FactSignal }) { + // Always reserve the dot's horizontal space so labels align to the same + // column whether or not a row carries a signal. An empty neutral dot + // renders transparent but keeps its mr-1.5. + const tone = + signal === "bad" + ? "bg-severity-critical" + : signal === "warn" + ? "bg-severity-warning" + : signal === "unavailable" + ? "bg-muted-foreground/40" + : signal === "info" + ? "bg-severity-info" + : "bg-transparent" + return ( + + ) +} + +function factValueClass(signal: FactSignal): string { + switch (signal) { + case "bad": + return "text-severity-critical" + case "warn": + return "text-severity-warning" + case "unavailable": + return "italic text-muted-foreground" + case "info": + return "text-severity-info" + default: + return "text-foreground" + } +} + +/** + * Title + subline for a Top-Issues row. Mirrors the Issues-list fix: when + * `group.title` is just the sample's message (no triage title available), + * render the message once without decoration and without a duplicate + * subline. Also guarantees a space before the `(N patterns · M events)` + * counter using an explicit space instead of margin. + */ +function TopIssueRowBody({ + group, + singleton, +}: { + group: IssueGroup + singleton: boolean +}) { + const hasRealTitle = group.title !== group.sample.message + const displayTitle = hasRealTitle + ? decoratedTitle(group.title, group.sample.message) + : group.title + const sub = hasRealTitle + ? sampleLine(group.title, group.sample.message) + : "" + return ( + <> +

+ {displayTitle} + {!singleton ? ( + + {" "}({group.count} patterns · {group.occurrences} events) + + ) : null} +

+

+ {sourceLabel(group.sample)} + {sub ? ` · ${sub}` : ""} +

+ + ) +} + +function TopIssues({ + issues, + archiveId, +}: { + issues: IssueRecord[] + archiveId: string +}) { + // Group by the same key as the Issues list (title+code+category+collector) + // so "top issues" reflects distinct patterns, not duplicated codes. + const groups = useMemo(() => groupIssues(issues).slice(0, 5), [issues]) + + return ( +
+
+

Top issues

+ + View all ({issues.length}) + +
+
+ {groups.map((group) => { + const singleton = group.count === 1 + return ( + + + + {/* Compact left column: severity + confidence stack, sized + to the widest pill (not stretched), so title-start lines + up across rows without growing the card height. */} +
+ + +
+
+ +
+
+
+ + ) + })} +
+
+ ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index 488d1f3..b326e7f 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -1,12 +1,27 @@ +import { useEffect, useState } from "react" import { useParams, Link } from "react-router" import { Card, CardContent } from "@/components/ui/card" +import { Button } from "@/components/ui/button" import { SeverityBadge } from "@/components/ui/severity-badge" +import { ConfidencePill } from "@/components/ui/confidence-pill" import { ArchiveHeader } from "@/components/layout/ArchiveHeader" import { useArchive } from "@/api/archives" -import { useIssueDetail } from "@/api/issues" +import { useIssueDetail, useIssueStates, useSetIssueState } from "@/api/issues" import { KV } from "@/components/ui/kv" -import { shortFingerprint, encodeSegment, encodePath, sortFindings } from "@/lib/utils" -import { FileText, ArrowRight } from "lucide-react" +import { shortFingerprint, encodeSegment, encodePath, sortFindings, sampleLine } from "@/lib/utils" +import { groupIssues, groupKey } from "@/lib/grouping" +import { occurrenceCount, rankIssues } from "@/lib/ranking" +import { copyToClipboard } from "@/lib/clipboard" +import { + FileText, + ArrowRight, + Check, + Copy, + CheckCircle, + BellOff, + ChevronLeft, + ChevronRight, +} from "lucide-react" export function IssueDetailPage() { const { archiveId, issueId } = useParams<{ @@ -15,6 +30,27 @@ export function IssueDetailPage() { }>() const { data: archive } = useArchive(archiveId!) const { data, isLoading, error } = useIssueDetail(archiveId!, issueId!) + const { data: statesData } = useIssueStates(archiveId!) + const setStateMutation = useSetIssueState(archiveId!) 
+ + // Keyboard prev/next (R18). Defined at the top level so hook order stays + // stable across the early-return branches below. + useEffect(() => { + function onKey(e: KeyboardEvent) { + if (e.target instanceof HTMLInputElement || e.target instanceof HTMLTextAreaElement) { + return + } + if (e.key === "j") { + const link = document.getElementById("issue-next") as HTMLAnchorElement | null + link?.click() + } else if (e.key === "k") { + const link = document.getElementById("issue-prev") as HTMLAnchorElement | null + link?.click() + } + } + window.addEventListener("keydown", onKey) + return () => window.removeEventListener("keydown", onKey) + }, []) if (isLoading) { return ( @@ -52,12 +88,65 @@ export function IssueDetailPage() { ? "Triage evidence" : "Suggested evidence" + // Members of the same pattern group (R15). The user lands on one row; + // linking to the sibling rows closes the "where are the other 7?" loop. + // Recomputed on every render — archives carry <1k issues so this is cheap, + // and the computation has to live after the early-return guards for + // `data`, which rules out a useMemo here. + const siblings = findSiblings(archive?.issues ?? [], issue) + + // Timeline / stats (R16). The message carries `(Nx in )`; parse it. + const occCount = occurrenceCount(issue.message) + const messageSource = extractSource(issue.message) + + // Issue state (R14): Ack / Dismissed / cleared. Keyed by fingerprint. + const currentState = + issue.issue_fingerprint && statesData?.items + ? statesData.items.find( + (s) => s.issue_fingerprint === issue.issue_fingerprint, + )?.state ?? "" + : "" + + // Prev/Next navigation (R18). Operates over the ranked list of archive + // issues so the order matches the list view a CX agent used to land here. + const ranked = rankIssues(archive?.issues ?? []) + const currentIdx = ranked.findIndex((i) => i.id === issue.id) + const prevIssue = currentIdx > 0 ? 
ranked[currentIdx - 1] : undefined + const nextIssue = currentIdx >= 0 && currentIdx < ranked.length - 1 ? ranked[currentIdx + 1] : undefined + return (
{archive ? ( ) : null} + {/* Prev/Next navigation (R18) */} +
+ {prevIssue ? ( + + + Prev + + ) : } + + {currentIdx >= 0 ? `${currentIdx + 1} of ${ranked.length}` : null} + + {nextIssue ? ( + + Next + + + ) : } +
+ {/* Issue header */}
@@ -66,14 +155,42 @@ export function IssueDetailPage() { {primaryFinding ? (

- {issue.message} + {sampleLine(primaryFinding.title, issue.message)}

) : null}

{issue.collector} · {issue.code} · {issue.category} + {currentState ? ( + + {currentState} + + ) : null}

- +
+ {issue.issue_fingerprint ? ( + + setStateMutation.mutate({ + fingerprint: issue.issue_fingerprint!, + state, + }) + } + /> + ) : null} + + + +
@@ -85,6 +202,29 @@ export function IssueDetailPage() {

What happened

+ {occCount > 1 || messageSource ? ( +
+ {occCount > 1 ? ( + + + {occCount} + + {" occurrences"} + + ) : null} + {messageSource ? ( + + Source:{" "} + + {messageSource} + + + ) : null} +
+ ) : null} {primaryFinding ? ( <>

@@ -108,23 +248,8 @@ export function IssueDetailPage() { - {/* How serious */} - - -

- Classification -

-
- - - {issue.confidence} confidence - - - {issue.category} - -
- - + {/* Classification card was removed — severity / confidence are in the + header row; category lives in the Metadata sidebar. */} {/* What to do next */} {primaryFinding?.action ? ( @@ -221,19 +346,49 @@ export function IssueDetailPage() { - + {issue.issue_fingerprint ? ( + + ) : ( + + )}
+ {/* Related findings in the same pattern group */} + {siblings.length > 0 ? ( + + +

+ Other entries in this group ({siblings.length}) +

+
+ {siblings.slice(0, 8).map((s) => ( + + {s.message} + + ))} + {siblings.length > 8 ? ( + + +{siblings.length - 8} more in Issues list + + ) : null} +
+
+
+ ) : null} + {/* Related artifacts */} @@ -277,6 +432,161 @@ export function IssueDetailPage() { ) } +/** + * Find sibling issues in the same pattern group as `issue`. Empty when + * `issues` is empty or no other member shares the grouping key. + */ +function findSiblings( + issues: import("@/types").IssueRecord[], + issue: import("@/types").IssueRecord, +): import("@/types").IssueRecord[] { + if (!issues.length) return [] + const myKey = groupKey(issue) + const group = groupIssues(issues).find((g) => g.key === myKey) + if (!group) return [] + return group.members.filter((m) => m.id !== issue.id) +} + +/** + * Parse the source hint from `(Nx in path/to/file.txt)` suffix. Returns an + * empty string when the pattern isn't present. + */ +function extractSource(message: string | undefined | null): string { + if (!message) return "" + const m = /\(\d+\s*[x×]\s+in\s+([^)]+)\)\s*$/.exec(message) + return m ? m[1].trim() : "" +} + +/** + * Ack / Dismiss state buttons (R14). Clicking the same state twice toggles + * it off (clears state). Disabled while the mutation is in flight. + */ +function StateButtons({ + currentState, + disabled, + onSet, +}: { + currentState: string + disabled: boolean + onSet: (state: "" | "ack" | "dismissed") => void +}) { + const isAck = currentState === "ack" + const isDismissed = currentState === "dismissed" + return ( +
+ + +
+ ) +} + +/** + * Copy-to-ticket button (R17). Composes a multi-line block ready for pasting + * into tickets and issues; pure client-side. + */ +function CopySummaryButton({ + issue, + archiveId, + hostname, + generatedAt, + title, + evidencePaths, +}: { + issue: import("@/types").IssueRecord + archiveId: string + hostname: string + generatedAt: string + title: string + evidencePaths: string[] +}) { + const [copied, setCopied] = useState(false) + const onCopy = async () => { + const lines = [ + `Host: ${hostname || "(unknown)"}`, + `Archive: ${archiveId}${generatedAt ? ` (generated ${generatedAt})` : ""}`, + `Finding: ${title} (${issue.severity}/${issue.confidence || "unknown"})`, + issue.issue_fingerprint ? `Fingerprint: ${issue.issue_fingerprint}` : "", + evidencePaths.length > 0 ? `Evidence: ${evidencePaths.join(", ")}` : "", + `Message: ${issue.message}`, + ] + .filter(Boolean) + .join("\n") + if (await copyToClipboard(lines)) { + setCopied(true) + setTimeout(() => setCopied(false), 2000) + } + } + return ( + + ) +} + function uniquePaths(paths: string[]) { return Array.from(new Set(paths.filter(Boolean))) } + +/** + * KV row that renders a click-to-copy fingerprint. Shows the short form to + * save sidebar width; copies the full hash on click so a CX agent can paste + * it into a ticket or a grep without opening dev tools. + */ +function FingerprintCopy({ fingerprint }: { fingerprint: string }) { + const [copied, setCopied] = useState(false) + const onCopy = async () => { + if (await copyToClipboard(fingerprint)) { + setCopied(true) + setTimeout(() => setCopied(false), 1500) + } + } + return ( +
+

+ Fingerprint +

+ +
+ ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx index feaa46c..25f7305 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx @@ -13,25 +13,93 @@ import { TableRow, } from "@/components/ui/table" import { SeverityBadge } from "@/components/ui/severity-badge" -import { Pill } from "@/components/ui/pill" +import { ConfidencePill } from "@/components/ui/confidence-pill" import { ArchiveHeader } from "@/components/layout/ArchiveHeader" import { useArchive } from "@/api/archives" -import { useIssues, type IssueFilters } from "@/api/issues" -import { encodeSegment, categoryLabel, primaryFindingTitle, cn } from "@/lib/utils" -import { Search, X, ArrowDown, ArrowUp, ArrowUpDown } from "lucide-react" +import { useIssues, useIssueStates, type IssueFilters } from "@/api/issues" +import type { IssueRecord } from "@/types" +import { encodeSegment, categoryLabel, primaryFindingTitle, sampleLine, cn } from "@/lib/utils" +import { confidenceBadge, severityBadge } from "@/lib/severity" +import { compareIssues } from "@/lib/ranking" +import { groupIssues, type IssueGroup } from "@/lib/grouping" +import { decoratedTitle } from "@/lib/title" +import { bootEpochMs, isBootTimeIssue } from "@/lib/boot" +import { sourceLabel } from "@/lib/source" +import { + Search, + X, + ArrowDown, + ArrowUp, + ArrowUpDown, + Eye, + EyeOff, + ChevronRight, + ChevronDown, + Layers, + List, +} from "lucide-react" type SortField = "severity" | "confidence" | "source" type SortDir = "asc" | "desc" -const severityRank: Record = { critical: 3, warning: 2, info: 1 } -const confidenceRank: Record = { high: 2, low: 1 } - const confidenceOptions: SelectOption[] = [ { value: "", label: "All confidence" }, { value: "high", label: "High 
confidence" }, { value: "low", label: "Low confidence" }, ] +/** + * Compact issue body for table rows. + * - With a real triage `title`: decorated title line + stripped sample subline + * (subline suppressed when it collapses to the title or to an empty string). + * - Without a title (fallback): render the raw message once — no decorative + * token appending, no duplicated subline. This is the rule that kills the + * "Timeout: · 0000:05:00.0" / echoed-message render observed on expanded + * group members that have no triage finding. + */ +function IssueRowBody({ + title, + message, + compact = false, +}: { + title?: string + message: string + compact?: boolean +}) { + if (!title) { + return ( + + {message} + + ) + } + const decorated = decoratedTitle(title, message) + const sub = sampleLine(title, message) + const showSub = !!sub && sub !== decorated && sub !== title + return ( + <> + + {decorated} + + {showSub ? ( + + {sub} + + ) : null} + + ) +} + export function IssuesPage() { const { archiveId } = useParams<{ archiveId: string }>() const [searchParams] = useSearchParams() @@ -46,9 +114,20 @@ export function IssuesPage() { q: searchParams.get("q") ?? "", } + // "Hide low confidence" — defaults ON per plan R3. URL-backed so links + // preserve state. Explicitly hideLowConf=0 disables; anything else is ON. + const hideLowConf = searchParams.get("hideLowConf") !== "0" + // Pattern grouping defaults ON per plan R8. URL-backed. + const grouped = searchParams.get("grouped") !== "0" + // Dismissed-hiding defaults ON per plan R14. URL-backed. + const hideDismissed = searchParams.get("hideDismissed") !== "0" + const [filters, setFilters] = useState(committedFilters) const [sortField, setSortField] = useState("severity") const [sortDir, setSortDir] = useState("desc") + const [expandedGroups, setExpandedGroups] = useState>( + () => new Set(), + ) const { data: archive, isLoading: archiveLoading } = useArchive(archiveId!) 
const { @@ -56,9 +135,17 @@ export function IssuesPage() { isLoading: issuesLoading, error, } = useIssues(archiveId!, committedFilters) + const { data: statesData } = useIssueStates(archiveId!) const issues = issuesData?.items ?? [] const isLoading = archiveLoading || issuesLoading + // Fingerprint → state. Dismissed fingerprints hide from the default view. + const stateByFp = useMemo(() => { + const m = new Map() + for (const s of statesData?.items ?? []) m.set(s.issue_fingerprint, s.state) + return m + }, [statesData?.items]) + // Build dropdown options from archive issues, narrowed by the OTHER active filters. const allIssues = archive?.issues ?? [] @@ -127,11 +214,22 @@ export function IssuesPage() { } }, [severityOptions, sourceOptions]) - const commitFilters = (next: IssueFilters) => { + const commitFilters = ( + next: IssueFilters, + overrideHideLow?: boolean, + overrideGrouped?: boolean, + overrideHideDismissed?: boolean, + ) => { const params = new URLSearchParams() for (const [key, value] of Object.entries(next)) { if (value) params.set(key, value) } + const effectiveHide = overrideHideLow ?? hideLowConf + if (!effectiveHide) params.set("hideLowConf", "0") + const effectiveGrouped = overrideGrouped ?? grouped + if (!effectiveGrouped) params.set("grouped", "0") + const effectiveHideDismissed = overrideHideDismissed ?? hideDismissed + if (!effectiveHideDismissed) params.set("hideDismissed", "0") const suffix = params.toString() ? `?${params.toString()}` : "" navigate( `/archives/${encodeSegment(archiveId!)}/issues${suffix}`, @@ -159,7 +257,25 @@ export function IssuesPage() { const hasActiveFilters = Object.values(filters).some(Boolean) - // Client-side sort + // Count hidden issues per the low-confidence toggle. Criticals in the hidden + // set escalate the banner: per the plan, we must never silently hide a + // Critical — show the count in an amber callout. 
+ const hiddenLowConf = useMemo(() => { + if (!hideLowConf) return { total: 0, critical: 0 } + let total = 0 + let critical = 0 + for (const i of issues) { + if (confidenceBadge(i.confidence).key === "low") { + total += 1 + if (severityBadge(i.severity).key === "critical") critical += 1 + } + } + return { total, critical } + }, [issues, hideLowConf]) + + // Client-side sort. The "severity" column (default landing) uses the + // lexicographic composite from R0.D; other columns keep single-field sort + // for manual override. const toggleSort = (field: SortField) => { if (sortField === field) { setSortDir((d) => (d === "desc" ? "asc" : "desc")) @@ -169,24 +285,76 @@ export function IssuesPage() { } } + const visibleIssues = useMemo(() => { + return issues.filter((i) => { + if (hideLowConf && confidenceBadge(i.confidence).key === "low") return false + if (hideDismissed && i.issue_fingerprint && stateByFp.get(i.issue_fingerprint) === "dismissed") return false + return true + }) + }, [issues, hideLowConf, hideDismissed, stateByFp]) + + const hiddenDismissedCount = useMemo(() => { + if (!hideDismissed) return 0 + return issues.filter( + (i) => i.issue_fingerprint && stateByFp.get(i.issue_fingerprint) === "dismissed", + ).length + }, [issues, hideDismissed, stateByFp]) + const sortedIssues = useMemo(() => { + if (sortField === "severity") { + const ordered = [...visibleIssues].sort(compareIssues) + return sortDir === "desc" ? ordered : ordered.reverse() + } const dir = sortDir === "desc" ? -1 : 1 - return [...issues].sort((a, b) => { + return [...visibleIssues].sort((a, b) => { let cmp = 0 - switch (sortField) { - case "severity": - cmp = (severityRank[a.severity] ?? 0) - (severityRank[b.severity] ?? 0) - break - case "confidence": - cmp = (confidenceRank[a.confidence] ?? 0) - (confidenceRank[b.confidence] ?? 
0) - break - case "source": - cmp = categoryLabel(a.category).localeCompare(categoryLabel(b.category)) - break + if (sortField === "confidence") { + cmp = confidenceBadge(a.confidence).rank - confidenceBadge(b.confidence).rank + } else { + cmp = categoryLabel(a.category).localeCompare(categoryLabel(b.category)) } return cmp * dir }) - }, [issues, sortField, sortDir]) + }, [visibleIssues, sortField, sortDir]) + + const groups = useMemo( + () => (grouped ? groupIssues(visibleIssues) : []), + [visibleIssues, grouped], + ) + + // Apply the same sortField/sortDir to groups so the column-header sort + // buttons affect the default (grouped) view too. groupIssues already + // returns composite-desc order, so "severity desc" is a no-op; flipping + // to "asc" reverses. For confidence/source we sort fresh. + const sortedGroups = useMemo(() => { + if (!grouped) return [] + if (sortField === "severity") { + return sortDir === "desc" ? groups : [...groups].reverse() + } + const dir = sortDir === "desc" ? -1 : 1 + const arr = [...groups] + arr.sort((a, b) => { + let cmp = 0 + if (sortField === "confidence") { + cmp = confidenceBadge(a.confidence).rank - confidenceBadge(b.confidence).rank + } else { + cmp = sourceLabel(a.sample).localeCompare(sourceLabel(b.sample)) + } + return cmp * dir + }) + return arr + }, [groups, grouped, sortField, sortDir]) + + const bootMs = bootEpochMs(archive?.summary) + + const toggleGroup = (key: string) => { + setExpandedGroups((prev) => { + const next = new Set(prev) + if (next.has(key)) next.delete(key) + else next.add(key) + return next + }) + } const sortIcon = (field: SortField) => { if (sortField !== field) return @@ -203,6 +371,154 @@ export function IssuesPage() { ) } + function renderFlatRow(issue: IssueRecord) { + const title = primaryFindingTitle(issue.triage_findings) + const conf = confidenceBadge(issue.confidence) + const state = issue.issue_fingerprint + ? stateByFp.get(issue.issue_fingerprint) ?? 
"" + : "" + const dismissed = state === "dismissed" + return ( + + navigate( + `/archives/${encodeSegment(archiveId!)}/issues/${encodeSegment(issue.id)}`, + ) + } + > + + + + + + + + {sourceLabel(issue)} + + + + + + ) + } + + function renderGroupRows(group: IssueGroup): React.ReactNode[] { + const expanded = expandedGroups.has(group.key) + const confKey = confidenceBadge(group.confidence).key + const rowOpacity = confKey === "low" ? "opacity-75" : "" + const allBootTime = + bootMs != null && group.members.every((m) => isBootTimeIssue(m, bootMs)) + const rows: React.ReactNode[] = [] + + const singleton = group.count === 1 + const handleClick = () => { + if (singleton) { + navigate( + `/archives/${encodeSegment(archiveId!)}/issues/${encodeSegment(group.sample.id)}`, + ) + } else { + toggleGroup(group.key) + } + } + + // When grouping has no triage title to work with, the sample's message is + // pushed into `group.title` as a fallback (see grouping.ts). Detect that + // case so we don't decorate the message with a trailing token and don't + // repeat it as a subline. + const hasRealTitle = group.title !== group.sample.message + const displayTitle = hasRealTitle + ? decoratedTitle(group.title, group.sample.message) + : group.title + rows.push( + + + + + + + + + {sourceLabel(group.sample)} + + + + {!singleton ? ( + expanded ? ( + + ) : ( + + ) + ) : null} + {displayTitle} + {!singleton ? ( + + {" "}({group.count} patterns · {group.occurrences} events) + + ) : null} + {allBootTime ? ( + + boot + + ) : null} + + {!expanded && hasRealTitle ? 
( + + {sampleLine(group.title, group.sample.message)} + + ) : null} + + , + ) + + if (expanded && !singleton) { + for (const member of group.members) { + const memberConf = confidenceBadge(member.confidence) + const memberTitle = primaryFindingTitle(member.triage_findings) + rows.push( + + navigate( + `/archives/${encodeSegment(archiveId!)}/issues/${encodeSegment(member.id)}`, + ) + } + > + + + + {sourceLabel(member)} + + + + + , + ) + } + } + return rows + } + return (
{archive ? ( @@ -254,6 +570,40 @@ export function IssuesPage() { }} className="w-[140px] border-border bg-card focus-visible:border-ring" /> + + +
+ ) : null} + {hideDismissed && hiddenDismissedCount > 0 ? ( +
+ + {hiddenDismissedCount} dismissed finding{hiddenDismissedCount === 1 ? "" : "s"} hidden. + + +
+ ) : null}
{/* Results */} @@ -278,7 +667,7 @@ export function IssuesPage() {
{String(error)}
- ) : issues.length === 0 ? ( + ) : (grouped ? groups.length === 0 : sortedIssues.length === 0) ? (

No matching issues

@@ -291,7 +680,7 @@ export function IssuesPage() { toggleSort("severity")} > @@ -300,7 +689,7 @@ export function IssuesPage() { toggleSort("confidence")} > @@ -309,7 +698,7 @@ export function IssuesPage() { toggleSort("source")} > @@ -321,41 +710,9 @@ export function IssuesPage() { - {sortedIssues.map((issue) => { - const title = primaryFindingTitle(issue.triage_findings) - const isLow = issue.confidence.toLowerCase() === "low" - return ( - - navigate( - `/archives/${encodeSegment(archiveId!)}/issues/${encodeSegment(issue.id)}`, - ) - } - > - - - - - {issue.confidence} - - - {categoryLabel(issue.category)} - - - - {title || issue.message} - - {title ? ( - - {issue.message} - - ) : null} - - - ) - })} + {grouped + ? sortedGroups.flatMap((group) => renderGroupRows(group)) + : sortedIssues.map((issue) => renderFlatRow(issue))}

diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/confidence-pill.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/confidence-pill.tsx new file mode 100644 index 0000000..131e970 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/confidence-pill.tsx @@ -0,0 +1,37 @@ +import { cn } from "@/lib/utils" +import { confidenceBadge } from "@/lib/severity" + +/** + * Confidence pill: always renders a visible label (never an empty span) and + * pairs the label with a shape icon. Low confidence gets a dashed border; + * unknown confidence (pre-R9 archives) gets a dotted border. High confidence + * is the un-adorned default. + * + * Used in the Issues list row and the Issue detail CLASSIFICATION card. + */ +export function ConfidencePill({ + confidence, + className, + showIcon = true, +}: { + confidence: string | null | undefined + className?: string + showIcon?: boolean +}) { + const meta = confidenceBadge(confidence) + const Icon = meta.icon + return ( + + {showIcon ? : null} + {meta.label} + + ) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx index 7ae1ec8..9b4b63a 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/kv.tsx @@ -1,8 +1,13 @@ export function KV({ label, value }: { label: string; value: string }) { return ( -
+

{label}

-

{value}

+

+ {value} +

) } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx index 8ee0b5b..54822db 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/ui/severity-badge.tsx @@ -1,31 +1,36 @@ import { cn } from "@/lib/utils" +import { severityBadge } from "@/lib/severity" -const styles: Record = { - critical: "bg-severity-critical-muted text-severity-critical", - warning: "bg-severity-warning-muted text-severity-warning", - info: "bg-severity-info-muted text-severity-info", - ok: "bg-success-muted text-success", -} - +/** + * Severity badge that pairs color with a shape icon (WCAG 1.4.1 — color + * alone is not sufficient). The icon differs per severity: + * critical → AlertCircle, warning → AlertTriangle, info → Info, + * unknown → HelpCircle. + */ export function SeverityBadge({ severity, label, className, + showIcon = true, }: { severity: string label?: string className?: string + showIcon?: boolean }) { - const key = severity.toLowerCase() + const meta = severityBadge(severity) + const Icon = meta.icon return ( - {label ?? severity} + {showIcon ? : null} + {label ?? 
meta.label} ) } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.test.ts new file mode 100644 index 0000000..906ff8b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.test.ts @@ -0,0 +1,101 @@ +import { describe, expect, it } from "vitest" +import { bootEpochMs, isBootTimeIssue, parseDmesgWallClock } from "./boot" +import type { ArchiveSummary, IssueRecord } from "@/types" + +function summary(partial: Partial): ArchiveSummary { + return { + archive_id: "x", + schema_version: "3.2.0", + generated_at: "", + hostname: "", + platform: {}, + uploaded_at: "2026-04-17T09:09:00Z", + uploaded_by: "anon", + issue_counts: { critical: 0, warning: 0, info: 0, total: 0 }, + collector_count: 0, + artifact_count: 0, + triage_finding_count: null, + status: "ready", + compressed_size: 0, + ...partial, + } +} + +function issue(message: string): IssueRecord { + return { + id: "x", + collector: "triage", + code: "critical_log", + severity: "critical", + confidence: "low", + category: "DISK", + message, + } +} + +describe("parseDmesgWallClock", () => { + it("parses the kernel's [Day Mon DD HH:MM:SS YYYY] format", () => { + const ts = parseDmesgWallClock("[Tue Dec 2 03:33:25 2025] ata1: SATA link down") + expect(ts).not.toBeNull() + // Verify it's in December 2025. + const d = new Date(ts!) 
+ expect(d.getUTCFullYear()).toBe(2025) + expect(d.getUTCMonth()).toBe(11) + }) + + it("returns null for messages without a timestamp", () => { + expect(parseDmesgWallClock("firewall inactive")).toBeNull() + expect(parseDmesgWallClock("")).toBeNull() + expect(parseDmesgWallClock(undefined)).toBeNull() + }) +}) + +describe("bootEpochMs", () => { + it("computes the boot epoch from generated_at and uptime_seconds", () => { + const s = summary({ + generated_at: "2026-04-17T09:08:00Z", + uptime_seconds: 3600, + }) + const boot = bootEpochMs(s) + expect(boot).not.toBeNull() + // Boot epoch should be one hour before generated_at. + expect(new Date(boot!).toISOString()).toBe("2026-04-17T08:08:00.000Z") + }) + + it("returns null when inputs are missing", () => { + expect(bootEpochMs(undefined)).toBeNull() + expect(bootEpochMs(summary({ uptime_seconds: undefined }))).toBeNull() + expect(bootEpochMs(summary({ generated_at: "" }))).toBeNull() + }) +}) + +describe("isBootTimeIssue", () => { + it("flags issues whose timestamp falls in the first 60s of uptime", () => { + // Use dmesg-format generated_at so both sides are parsed in the same + // (local) timezone; the production comparison relies on the host running + // the dashboard sharing a TZ with the host that produced the archive — + // this is explicitly documented on the helper. 
+ const s = summary({ + generated_at: "Tue Dec 2 03:35:00 2025", + uptime_seconds: 95, + }) + const boot = bootEpochMs(s) + const sataBoot = issue("[Tue Dec 2 03:33:25 2025] ata1: SATA link down") + expect(isBootTimeIssue(sataBoot, boot)).toBe(true) + }) + + it("does not flag issues outside the boot window", () => { + const s = summary({ + generated_at: "Thu Apr 17 09:00:00 2026", + uptime_seconds: 88 * 86400, // 88 days + }) + const boot = bootEpochMs(s) + const recent = issue("[Wed Apr 16 10:00:00 2026] later event") + expect(isBootTimeIssue(recent, boot)).toBe(false) + }) + + it("returns false when inputs can't be parsed", () => { + expect(isBootTimeIssue(issue("no timestamp"), 0)).toBe(false) + expect(isBootTimeIssue(issue("[whatever]"), null)).toBe(false) + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.ts new file mode 100644 index 0000000..c8e65ec --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/boot.ts @@ -0,0 +1,61 @@ +/** + * Boot-time annotation (R12). Given an archive's `generated_at` wall-clock + * and `uptime_seconds`, compute when the system booted; then for each issue + * message bearing a dmesg wall-clock `[Day Mon DD HH:MM:SS YYYY]` stamp, + * decide whether it fell within the first minute of uptime (boot-time noise). + * + * Returns `null` whenever any input is missing or unparseable — the caller + * then degrades gracefully (no boot chip rendered, no false positives). + * + * Caveat: dmesg timestamps are emitted in the host's local timezone and + * `generated_at` is usually UTC. This helper uses `Date.parse` on both sides, + * which means an archive viewer in a different timezone can miss or + * mis-classify boot-time. Per the plan this is an advisory annotation (never + * a filter that hides data), so false negatives are acceptable and false + * positives are bounded by the 60s window. 
+ */ + +import type { IssueRecord, ArchiveSummary } from "@/types" + +const BOOT_WINDOW_MS = 60_000 + +export function bootEpochMs(summary: ArchiveSummary | undefined): number | null { + if (!summary) return null + const { generated_at, uptime_seconds } = summary + if (!generated_at || uptime_seconds == null) return null + const gen = Date.parse(generated_at) + if (!Number.isFinite(gen)) return null + const uptimeMs = uptime_seconds * 1000 + if (!Number.isFinite(uptimeMs) || uptimeMs < 0) return null + return gen - uptimeMs +} + +/** + * Parse the leading `[Tue Dec 2 03:33:25 2025]` wall-clock from a dmesg line. + * Returns epoch milliseconds or null if absent/malformed. + */ +export function parseDmesgWallClock(message: string | undefined): number | null { + if (!message) return null + // Canonical kernel format: "[Day Mon D HH:MM:SS YYYY]" or "[Day Mon DD ...]". + const m = /^\[([A-Za-z]{3} [A-Za-z]{3}\s+\d{1,2} \d{2}:\d{2}:\d{2} \d{4})\]/.exec( + message, + ) + if (!m) return null + // JavaScript's Date.parse accepts the format. + const t = Date.parse(m[1]) + return Number.isFinite(t) ? t : null +} + +/** + * True when the issue's wall-clock timestamp (if any) is within the first + * 60s of the system's uptime. Returns false on any missing input. + */ +export function isBootTimeIssue( + issue: IssueRecord, + bootMs: number | null, +): boolean { + if (bootMs == null) return false + const issueMs = parseDmesgWallClock(issue.message) + if (issueMs == null) return false + return issueMs >= bootMs && issueMs < bootMs + BOOT_WINDOW_MS +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/clipboard.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/clipboard.ts new file mode 100644 index 0000000..83e8189 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/clipboard.ts @@ -0,0 +1,14 @@ +/** + * Copy `text` to the system clipboard. 
Returns true on success, false when + * the browser denies permission or the API is unavailable. Call sites should + * treat failure quietly — the data is still visible on screen for manual copy. + */ +export async function copyToClipboard(text: string): Promise { + if (typeof navigator === "undefined" || !navigator.clipboard) return false + try { + await navigator.clipboard.writeText(text) + return true + } catch { + return false + } +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts new file mode 100644 index 0000000..132685b --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts @@ -0,0 +1,36 @@ +import { describe, it, expect } from "vitest" +import { messageComponent } from "./component" + +describe("messageComponent", () => { + it("extracts tag with PID after a dmesg-style timestamp", () => { + expect( + messageComponent( + "[Tue Dec 2 03:32:55 2025] NetworkManager[1234]: DHCPv6 lease expired", + ), + ).toBe("NetworkManager") + }) + + it("extracts kernel tag without PID", () => { + expect( + messageComponent("[Tue Dec 2 03:32:55 2025] kernel: ata1: SError: { ... 
}"), + ).toBe("kernel") + }) + + it("extracts sshd tag from a plain RFC3164 line (no timestamp prefix)", () => { + expect(messageComponent("sshd[5678]: Failed password for root from 10.0.0.1")).toBe("sshd") + }) + + it("allows hyphenated component names like systemd-logind", () => { + expect(messageComponent("systemd-logind: Removed session 42.")).toBe("systemd-logind") + }) + + it("returns null when no tag shape is present", () => { + expect(messageComponent("Firewall inactive (ufw)")).toBeNull() + }) + + it("returns null for empty and undefined input", () => { + expect(messageComponent("")).toBeNull() + expect(messageComponent(undefined)).toBeNull() + expect(messageComponent(null)).toBeNull() + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts new file mode 100644 index 0000000..ae75a74 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts @@ -0,0 +1,25 @@ +/** + * Extracts the RFC3164 / RFC5424 "tag" (appname/program) from a syslog-style + * message. The tag names the originating component — useful when the + * category bucket is too coarse (e.g. 25 distinct runtime errors all + * landing under "System Logs" because they share the ERR category). + * + * Matches: + * "[Tue Dec 2 03:32:55 2025] NetworkManager[1234]: DHCPv6 lease ..." → "NetworkManager" + * "[Tue Dec 2 03:32:55 2025] kernel: ata1: SError: { ... }" → "kernel" + * "sshd[5678]: Failed password for root ..." → "sshd" + * "systemd-logind: Removed session 42." → "systemd-logind" + * + * Returns `null` when the message doesn't carry an identifiable tag, so + * callers can gracefully fall back to the existing source label. 
+ */ +const TAG = "[A-Za-z][\\w.-]{0,31}" +const DMESG_PREFIX = /^\[[^\]]+\]\s+/ +const TAG_RE = new RegExp(`^(${TAG})(?:\\[\\d+\\])?:`) + +export function messageComponent(message: string | undefined | null): string | null { + if (!message) return null + const stripped = message.replace(DMESG_PREFIX, "") + const match = TAG_RE.exec(stripped) + return match ? match[1] : null +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.test.ts new file mode 100644 index 0000000..0eb8f5a --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.test.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from "vitest" +import { factLabel, factSignal, factDescription } from "./facts" + +describe("factLabel", () => { + it("returns the friendly label for registered keys", () => { + expect(factLabel("pcie.aer_fatal_total")).toBe("PCIe AER fatal errors") + expect(factLabel("crash_dump_count")).toBe("Crash dumps present") + expect(factLabel("nvswitch.present")).toBe("NVSwitch present") + }) + + it("returns registered labels for OVS keys", () => { + expect(factLabel("ovs.bridge_count")).toBe("OVS bridges") + expect(factLabel("ovs.stale_socket_count")).toBe("OVS stale sockets") + }) + + it("humanises unknown dotted keys using the last segment", () => { + expect(factLabel("made.up.key")).toBe("Key") + expect(factLabel("some.weird_metric_name")).toBe("Weird metric name") + }) + + it("resolves a dotted key against the registry via last-segment fallback", () => { + // Collector emits namespaced forms like `system.iommu_enabled`; we + // register the bare form once and expect both variants to hit. 
+ expect(factLabel("system.iommu_enabled")).toBe("IOMMU enabled") + expect(factLabel("system.ecc_type")).toBe("ECC type") + expect(factLabel("hypervisor.pci_nvidia_count")).toBe("NVIDIA PCI devices") + expect(factLabel("hypervisor.vfio_bound_count")).toBe("VFIO-bound GPUs") + expect(factLabel("system.hugetlb_kib")).toBe("HugeTLB allocated") + expect(factLabel("system.transparent_hugepage")).toBe("Transparent hugepages") + }) + + it("humanises unknown flat keys with underscores", () => { + expect(factLabel("brand_new_count")).toBe("Brand new count") + }) +}) + +describe("factSignal", () => { + it("flags non-zero error counters as bad", () => { + expect(factSignal("pcie.aer_fatal_total", 1)).toBe("bad") + expect(factSignal("crash_dump_count", 2)).toBe("bad") + expect(factSignal("devlink.fw_fatal_count", 5)).toBe("bad") + }) + + it("flags zero error counters as neutral (checked clean)", () => { + expect(factSignal("pcie.aer_fatal_total", 0)).toBe("neutral") + expect(factSignal("crash_dump_count", 0)).toBe("neutral") + }) + + it("flags string 'unavailable' sentinel correctly", () => { + expect(factSignal("pcie.aer_fatal_total", "unavailable")).toBe("unavailable") + expect(factSignal("crash_dump_count", "unavailable")).toBe("unavailable") + expect(factSignal("pcie.link_speed_degraded_count", "unavailable")).toBe( + "unavailable", + ) + }) + + it("warns for soft-error counters when > 0", () => { + expect(factSignal("pcie.link_speed_degraded_count", 2)).toBe("warn") + expect(factSignal("thermal.package_throttle_events_total", 1)).toBe("warn") + expect(factSignal("failed_service_count", 3)).toBe("warn") + }) + + it("parses string numerics the same as numbers", () => { + expect(factSignal("pcie.aer_fatal_total", "3")).toBe("bad") + expect(factSignal("pcie.aer_fatal_total", "0")).toBe("neutral") + }) + + it("flags firewall_posture=inactive as warn", () => { + expect(factSignal("firewall_posture", "inactive")).toBe("warn") + expect(factSignal("firewall_posture", 
"active")).toBe("neutral") + }) + + it("treats presence facts as info when true", () => { + expect(factSignal("nvswitch.present", "true")).toBe("info") + expect(factSignal("nvswitch.present", true)).toBe("info") + expect(factSignal("nvswitch.present", "false")).toBe("neutral") + }) + + it("returns neutral for unknown keys (forward-compat)", () => { + expect(factSignal("made.up.key", 42)).toBe("neutral") + }) +}) + +describe("factDescription", () => { + it("returns a description for keys that declare one", () => { + const desc = factDescription("pcie.aer_fatal_total") + expect(desc).toMatch(/fatal/i) + }) + + it("returns undefined for identity facts without descriptions", () => { + expect(factDescription("cpu_cores")).toBeUndefined() + expect(factDescription("made.up.key")).toBeUndefined() + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.ts new file mode 100644 index 0000000..acdcffb --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.ts @@ -0,0 +1,360 @@ +/** + * Fact registry: friendly label + threshold signal for facts emitted by the + * gather-info collectors. Unknown keys pass through with the raw key as label + * and a neutral signal — forward-compat is preserved (see AGENTS.md). + * + * A `bad` signal means "this is almost certainly a problem worth RMA-level + * attention"; `warn` means "worth a look"; `unavailable` means "the collector + * could not probe this on this host" (distinct from zero-checked-clean). + */ + +import { humanBytes, humanUptime } from "./units" + +export type FactSignal = "neutral" | "info" | "warn" | "bad" | "unavailable" + +export interface FactMeta { + label: string + description?: string + signal: (v: unknown) => FactSignal + /** + * Optional value formatter. Overrides the default stringification and the + * suffix-based byte/uptime auto-detection. 
Receives the raw value and must + * return a display string. + */ + format?: (v: unknown) => string +} + +/** Format a number as bytes given an input unit multiplier (e.g. 1024 for KiB). */ +function bytesFrom(mult: number) { + return (v: unknown): string => { + if (typeof v === "number" && Number.isFinite(v)) return humanBytes(v * mult) + const n = Number(v) + return Number.isFinite(n) ? humanBytes(n * mult) : String(v ?? "N/A") + } +} + +const formatKib = bytesFrom(1024) +const formatBytes = bytesFrom(1) +const formatUptime = (v: unknown): string => humanUptime(v) + +/** + * Interpret a fact value as a number when possible. + * Returns the string "unavailable" when the collector emitted that sentinel, + * a finite number when the value parses, or null otherwise. + */ +function asNumeric(v: unknown): number | "unavailable" | null { + if (v === "unavailable") return "unavailable" + if (typeof v === "number" && Number.isFinite(v)) return v + if (typeof v === "string") { + const n = Number(v) + if (Number.isFinite(n) && v.trim() !== "") return n + } + return null +} + +function badIfPositive(v: unknown): FactSignal { + const n = asNumeric(v) + if (n === "unavailable") return "unavailable" + if (typeof n === "number") return n > 0 ? "bad" : "neutral" + return "neutral" +} + +function warnIfPositive(v: unknown): FactSignal { + const n = asNumeric(v) + if (n === "unavailable") return "unavailable" + if (typeof n === "number") return n > 0 ? "warn" : "neutral" + return "neutral" +} + +function infoPresence(v: unknown): FactSignal { + // "true" string or boolean true → info; anything else → neutral. + if (v === true || v === "true") return "info" + return "neutral" +} + +export const factRegistry: Record = { + // PCIe — hardware-critical counters + "pcie.aer_fatal_total": { + label: "PCIe AER fatal errors", + description: "Fatal-class Advanced Error Reporting counts summed across PCI devices. 
Non-zero indicates a hardware fault.", + signal: badIfPositive, + }, + "pcie.aer_nonfatal_total": { + label: "PCIe AER non-fatal errors", + description: "Non-fatal AER counts summed across PCI devices. Recoverable but often an early warning.", + signal: warnIfPositive, + }, + "pcie.aer_devices_with_errors": { + label: "Devices with AER errors", + description: "Count of PCI devices reporting at least one AER error.", + signal: badIfPositive, + }, + "pcie.link_speed_degraded_count": { + label: "PCIe link-speed degraded", + description: "Devices running below their maximum negotiated link speed (e.g. Gen3 in a Gen5 slot).", + signal: warnIfPositive, + }, + "pcie.link_width_degraded_count": { + label: "PCIe link-width degraded", + description: "Devices negotiated to fewer lanes than the slot supports (e.g. x4 in an x16 slot).", + signal: warnIfPositive, + }, + + // Thermal + "thermal.fan_alarm_count": { + label: "Fan alarm bits", + description: "Number of hwmon fan sensors reporting their alarm bit set.", + signal: badIfPositive, + }, + "thermal.critical_trip_exceeded": { + label: "Thermal critical trips exceeded", + description: "Thermal zones that have exceeded their critical trip point.", + signal: badIfPositive, + }, + "thermal.package_throttle_events_total": { + label: "CPU package throttle events", + description: "Cumulative package-level thermal throttle events across unique CPU packages.", + signal: warnIfPositive, + }, + "thermal.core_throttle_events_total": { + label: "CPU core throttle events", + description: "Cumulative core-level thermal throttle events across unique CPU cores.", + signal: warnIfPositive, + }, + "thermal.sensor_count": { + label: "Thermal sensors present", + signal: () => "neutral", + }, + + // Crash dumps / journal + "crash_dump_count": { + label: "Crash dumps present", + description: "Entries under /var/crash. 
Non-zero indicates a prior kernel crash was captured.", + signal: badIfPositive, + }, + "journal.oom_event_count": { + label: "OOM events in journal", + description: "Out-of-memory kills recorded by the kernel. Non-zero means something exceeded available RAM.", + signal: warnIfPositive, + }, + "oom_event_count": { + label: "OOM events", + signal: warnIfPositive, + }, + + // Network + "devlink.fw_fatal_count": { + label: "NIC firmware fatal errors", + description: "Devlink-reported firmware fatal health events. Non-zero is a genuine NIC fault.", + signal: badIfPositive, + }, + "nic.hw_error_interfaces": { + label: "NICs with hardware errors", + description: "Interfaces with non-zero hardware error counters.", + signal: badIfPositive, + }, + "nic.link_flap_interfaces": { + label: "NICs with link flapping", + description: "Interfaces whose carrier state changed above a threshold (rough heuristic).", + signal: warnIfPositive, + }, + "network.manager": { + label: "Network manager", + signal: () => "neutral", + }, + + // Services + "failed_service_count": { + label: "Failed services", + description: "systemd units in failed state at collection time.", + signal: warnIfPositive, + }, + + // EDAC + "edac.uncorrectable_errors": { + label: "Uncorrectable ECC errors", + description: "DIMM-level uncorrectable memory errors. Non-zero means bad memory.", + signal: badIfPositive, + }, + "edac.correctable_errors": { + label: "Correctable ECC errors", + description: "DIMM-level corrected memory errors. 
Accumulates over time; a rising count is worth tracking.", + signal: warnIfPositive, + }, + "edac.dimm_count": { + label: "DIMMs detected", + signal: () => "neutral", + }, + "edac.mc_count": { + label: "Memory controllers", + signal: () => "neutral", + }, + "edac.present": { + label: "EDAC present", + signal: infoPresence, + }, + "edac_dimm_count": { label: "DIMMs detected", signal: () => "neutral" }, + "edac_mc_count": { label: "Memory controllers", signal: () => "neutral" }, + "edac_present": { label: "EDAC present", signal: infoPresence }, + + // IPMI / NVSwitch / nvidia + "ipmi.present": { + label: "IPMI present", + signal: infoPresence, + }, + "ipmi_present": { label: "IPMI present", signal: infoPresence }, + "nvswitch.present": { + label: "NVSwitch present", + signal: infoPresence, + }, + // Legacy form — suppressed by the fact tile when the dotted form exists. + "nvswitch_present": { + label: "NVSwitch present (legacy)", + signal: infoPresence, + }, + + // Triage aggregates + "critical_event_count": { + label: "Critical events detected", + description: "Count of events classified as critical by triage.", + signal: warnIfPositive, + }, + "oss_classified_count": { + label: "OSS-classified entries", + signal: () => "neutral", + }, + "firewall_posture": { + label: "Firewall posture", + signal: (v) => (v === "inactive" ? 
"warn" : "neutral"), + }, + + // System identity + "cpu_cores": { label: "CPU cores", signal: () => "neutral" }, + "cpu_model": { label: "CPU model", signal: () => "neutral" }, + "hostname": { label: "Hostname", signal: () => "neutral" }, + "kernel": { label: "Kernel", signal: () => "neutral" }, + "os": { label: "OS", signal: () => "neutral" }, + "iommu_enabled": { label: "IOMMU enabled", signal: infoPresence }, + "transparent_hugepage": { label: "Transparent hugepages", signal: () => "neutral" }, + "ecc_type": { label: "ECC type", signal: () => "info" }, + "memory_total": { label: "Memory total", signal: () => "neutral", format: formatBytes }, + "hugetlb_kib": { label: "HugeTLB allocated", signal: () => "neutral", format: formatKib }, + "mem_available_kib": { label: "Memory available", signal: () => "neutral", format: formatKib }, + "swap_free_kib": { label: "Swap free", signal: () => "neutral", format: formatKib }, + "swap_total_kib": { label: "Swap total", signal: () => "neutral", format: formatKib }, + "uptime": { label: "Uptime", signal: () => "neutral", format: formatUptime }, + "uptime_seconds": { label: "Uptime", signal: () => "neutral", format: formatUptime }, + + // EDAC extras + "edac.ce_total": { + label: "EDAC correctable errors", + description: "Cumulative corrected memory errors across all controllers.", + signal: warnIfPositive, + }, + "edac.ue_total": { + label: "EDAC uncorrectable errors", + description: "Cumulative uncorrectable memory errors. 
Non-zero = bad DIMM.", + signal: badIfPositive, + }, + + // Hypervisor + "host_driver_count": { label: "Host drivers", signal: () => "neutral" }, + "pci_nvidia_count": { label: "NVIDIA PCI devices", signal: () => "neutral" }, + "vfio_bound_count": { label: "VFIO-bound GPUs", signal: () => "neutral" }, + "vcpu_total": { label: "vCPU total", signal: () => "neutral" }, + "running_domains": { label: "Running VMs", signal: () => "neutral" }, + "total_domains": { label: "VMs defined", signal: () => "neutral" }, + + // Docker + "container_count": { label: "Containers", signal: () => "neutral" }, + "docker_version": { label: "Docker version", signal: () => "neutral" }, + "vllm_container_count": { label: "vLLM containers", signal: () => "neutral" }, + + // Triage extras + "xid_classified_count": { label: "XID classified entries", signal: () => "neutral" }, + + // Thermal extras + "temp_alarm_count": { label: "Temperature alarms", signal: warnIfPositive }, + + // Open vSwitch + "ovs.bridge_count": { label: "OVS bridges", signal: () => "neutral" }, + "ovs.port_count": { label: "OVS ports", signal: () => "neutral" }, + "ovs.datapath_kind": { label: "OVS datapath", signal: () => "info" }, + "ovs.stale_socket_count": { + label: "OVS stale sockets", + description: "Leftover control sockets under /var/run/openvswitch; non-zero suggests an unclean restart.", + signal: warnIfPositive, + }, +} + +/** + * Format a fact value for display. + * Lookup order: + * 1. Registry entry with `format` — explicit override wins. + * 2. Suffix-based auto-humanisation: `_kib`/`_mib`/`_bytes`/uptime keys. + * 3. Default — numbers stringify, booleans → Yes/No, others → String(). + * Unknown signals of "unavailable" short-circuit with a literal sentinel. 
+ */ +export function factValue(key: string, value: unknown, signal: FactSignal): string { + if (signal === "unavailable") return "unavailable" + if (value == null) return "N/A" + + const registered = resolve(key) + if (registered?.format) return registered.format(value) + + const lower = key.toLowerCase() + if (typeof value === "number" && Number.isFinite(value)) { + if (lower.endsWith("_kib") || lower.endsWith("_kb")) return humanBytes(value * 1024) + if (lower.endsWith("_mib") || lower.endsWith("_mb")) return humanBytes(value * 1024 * 1024) + if (lower.endsWith("_gib") || lower.endsWith("_gb")) return humanBytes(value * 1024 * 1024 * 1024) + if (lower.endsWith("_bytes")) return humanBytes(value) + if (lower === "uptime_seconds") return humanUptime(value) + return value.toString() + } + + if (typeof value === "string" && (lower === "uptime" || lower === "uptime_seconds")) { + return humanUptime(value) + } + + if (typeof value === "boolean") return value ? "Yes" : "No" + return String(value) +} + +/** + * Resolve a registry entry for a fact key. Tries the exact key first; on + * miss, falls back to the last dotted segment. This lets one registry entry + * cover both the flat form (`hugetlb_kib`) and the namespaced form that some + * collectors emit (`system.hugetlb_kib`) without duplicating entries. + */ +function resolve(key: string): FactMeta | undefined { + const direct = factRegistry[key] + if (direct) return direct + const last = key.split(".").pop() + if (last && last !== key) return factRegistry[last] + return undefined +} + +/** + * Human label for a fact key. + * Lookup-first (exact then last-segment); on miss, humanise the last dotted + * segment so new collector keys land with a readable label instead of raw + * dots/underscores. Explicit registry entries always win. + */ +export function factLabel(key: string): string { + const entry = resolve(key) + if (entry) return entry.label + const last = key.split(".").pop() ?? 
key + const spaced = last.replace(/_/g, " ").trim() + if (!spaced) return key + return spaced.charAt(0).toUpperCase() + spaced.slice(1) +} + +/** Threshold signal for a fact value; unknown keys return `neutral`. */ +export function factSignal(key: string, value: unknown): FactSignal { + return resolve(key)?.signal(value) ?? "neutral" +} + +/** Optional tooltip description for a fact key. */ +export function factDescription(key: string): string | undefined { + return resolve(key)?.description +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.value.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.value.test.ts new file mode 100644 index 0000000..bc35583 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/facts.value.test.ts @@ -0,0 +1,49 @@ +import { describe, it, expect } from "vitest" +import { factValue, factLabel } from "./facts" + +describe("factValue", () => { + it("humanises registered byte-count keys", () => { + expect(factValue("mem_available_kib", 1484225536, "neutral")).toBe("1.38 TiB") + expect(factValue("hugetlb_kib", 19398656000, "neutral")).toBe("18.1 TiB") + expect(factValue("swap_total_kib", 3355442528, "neutral")).toBe("3.12 TiB") + expect(factValue("memory_total", 2161124530746, "neutral")).toBe("1.97 TiB") + }) + + it("humanises uptime keys", () => { + expect(factValue("uptime", "126755m44.34s", "neutral")).toMatch(/d .*h .*m/) + expect(factValue("uptime_seconds", 3661, "neutral")).toBe("1h 1m") + }) + + it("auto-humanises unknown keys with byte-unit suffixes", () => { + // Unknown key, but suffix-driven: _kib → * 1024 + expect(factValue("future.cache_size_kib", 2048, "neutral")).toBe("2.00 MiB") + expect(factValue("future.heap_bytes", 1024 * 1024, "neutral")).toBe("1.00 MiB") + }) + + it("short-circuits on unavailable signal", () => { + expect(factValue("pcie.aer_fatal_total", 0, "unavailable")).toBe("unavailable") + }) + + it("formats booleans as Yes/No", () => { + 
expect(factValue("some_flag", true, "info")).toBe("Yes") + expect(factValue("some_flag", false, "neutral")).toBe("No") + }) + + it("passes plain numbers through unchanged", () => { + expect(factValue("cpu_cores", 112, "neutral")).toBe("112") + }) + + it("returns N/A for null values", () => { + expect(factValue("anything", null, "neutral")).toBe("N/A") + expect(factValue("anything", undefined, "neutral")).toBe("N/A") + }) +}) + +describe("factLabel (new registry entries)", () => { + it("registers acronym-heavy system facts", () => { + expect(factLabel("iommu_enabled")).toBe("IOMMU enabled") + expect(factLabel("ecc_type")).toBe("ECC type") + expect(factLabel("xid_classified_count")).toBe("XID classified entries") + expect(factLabel("vllm_container_count")).toBe("vLLM containers") + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.test.ts new file mode 100644 index 0000000..64384b0 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.test.ts @@ -0,0 +1,198 @@ +import { describe, expect, it } from "vitest" +import { groupIssues, groupKey } from "./grouping" +import type { IssueRecord } from "@/types" + +function make(partial: Partial): IssueRecord { + return { + id: "", + collector: "triage", + code: "critical_log", + severity: "info", + confidence: "", + category: "", + message: "", + ...partial, + } +} + +describe("groupKey", () => { + it("uses the primary triage finding title when available", () => { + const issue = make({ + triage_findings: [ + { + code: "x", + severity: "critical", + confidence: "low", + category: "DISK", + title: "SATA Link Down", + description: "", + }, + ], + message: "ata1: link down", + }) + expect(groupKey(issue)).toContain("SATA Link Down") + }) + + it("falls back to the message when no triage findings", () => { + const issue = make({ message: "bare message" }) + 
expect(groupKey(issue)).toContain("bare message") + }) +}) + +describe("groupIssues", () => { + it("collapses 8 SATA Link Down rows (ata1..ata8) into one group", () => { + const findings = [ + { + code: "x", + severity: "critical", + confidence: "low", + category: "DISK", + title: "SATA Link Down", + description: "", + }, + ] + const issues = [1, 2, 3, 4, 5, 6, 7, 8].map((n) => + make({ + id: `sata${n}`, + severity: "critical", + confidence: "low", + category: "DISK", + triage_findings: findings, + message: `ata${n}: SATA link down (2x in logs/dmesg.txt)`, + }), + ) + const groups = groupIssues(issues) + expect(groups).toHaveLength(1) + expect(groups[0].count).toBe(8) + expect(groups[0].occurrences).toBe(16) + expect(groups[0].title).toBe("SATA Link Down") + expect(groups[0].severity).toBe("critical") + expect(groups[0].confidence).toBe("low") + }) + + it("keeps distinct titles separate", () => { + const issues = [ + make({ + id: "a", + code: "critical_log", + category: "HW", + triage_findings: [ + { + code: "x", + severity: "warning", + confidence: "high", + category: "HW", + title: "PCIe Hotplug Timeout", + description: "", + }, + ], + message: "pcieport ...", + }), + make({ + id: "b", + code: "critical_log", + category: "DISK", + triage_findings: [ + { + code: "x", + severity: "critical", + confidence: "low", + category: "DISK", + title: "SATA Link Down", + description: "", + }, + ], + message: "ata1 ...", + }), + ] + const groups = groupIssues(issues) + expect(groups).toHaveLength(2) + }) + + it("reduces severity to MAX and confidence to MIN across members", () => { + const issues = [ + make({ + id: "hi", + severity: "warning", + confidence: "high", + triage_findings: [ + { + code: "x", + severity: "warning", + confidence: "high", + category: "", + title: "Thing", + description: "", + }, + ], + }), + make({ + id: "lo", + severity: "critical", + confidence: "low", + triage_findings: [ + { + code: "x", + severity: "critical", + confidence: "low", + category: "", 
+ title: "Thing", + description: "", + }, + ], + }), + ] + const groups = groupIssues(issues) + expect(groups).toHaveLength(1) + expect(groups[0].severity).toBe("critical") // max + expect(groups[0].confidence).toBe("low") // min + }) + + it("sorts groups by composite rank — real hardware warning above noise bucket", () => { + const pcieFinding = { + code: "x", + severity: "warning", + confidence: "high", + category: "HW", + title: "PCIe Hotplug Timeout", + description: "", + } + const sataFinding = { + code: "x", + severity: "critical", + confidence: "low", + category: "DISK", + title: "SATA Link Down", + description: "", + } + const issues: IssueRecord[] = [ + // 32 repetitions of PCIe warning — one group, occurrence count 32 + make({ + id: "pcie", + severity: "warning", + confidence: "high", + category: "HW", + triage_findings: [pcieFinding], + message: "PCIe timeout (32x in logs/dmesg.txt)", + }), + // 4 SATA criticals — one group + ...[1, 2, 3, 4].map((n) => + make({ + id: `sata${n}`, + severity: "critical", + confidence: "low", + category: "DISK", + triage_findings: [sataFinding], + message: `ata${n} (1x in logs/dmesg.txt)`, + }), + ), + ] + const groups = groupIssues(issues) + expect(groups).toHaveLength(2) + // Critical outranks Warning (lexicographic rule), so the SATA group + // still leads. Visual collapse + "hide low confidence" toggle (R3) + // are what promote PCIe to the visible top; ranking respects severity. 
+ expect(groups[0].title).toBe("SATA Link Down") + expect(groups[1].title).toBe("PCIe Hotplug Timeout") + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.ts new file mode 100644 index 0000000..0b8e1d6 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/grouping.ts @@ -0,0 +1,107 @@ +/** + * Group Issues into patterns so 8 rows of "SATA Link Down on ata1..ata8" + * collapse into one row "SATA Link Down (8 ports, 16 occurrences)". + * + * Grouping key is `code | category | collector | title`. The title is the + * primary triage finding's title when available, otherwise the raw message. + * We deliberately do NOT group on fingerprint — fingerprints differ between + * `ata1` and `ata2` (that's the kernel's port-specific signature), which is + * exactly the case we want to collapse on. + * + * Reduction rules when members disagree: + * - Severity: MAX (highest wins). A bucket with even one critical is a + * critical bucket. + * - Confidence: MIN (weakest wins). A bucket with any low-confidence member + * is collectively low-confidence. + * + * This is client-side grouping — intentional so the frontend stays + * self-contained while archives remain small (<1000 issues per archive). + * If scale changes, the same grouping key can move into the API layer. + */ + +import type { IssueRecord } from "@/types" +import { confidenceBadge, severityBadge } from "@/lib/severity" +import { occurrenceCount, compareIssues } from "@/lib/ranking" +import { primaryFindingTitle } from "@/lib/utils" + +export interface IssueGroup { + key: string + severity: string + confidence: string + title: string + /** Number of distinct issue rows in the group. */ + count: number + /** Sum of per-row "(Nx in )" occurrence counts. */ + occurrences: number + /** Representative member, chosen as the highest-ranked by compareIssues. 
*/ + sample: IssueRecord + members: IssueRecord[] +} + +export function groupKey(issue: IssueRecord): string { + const title = primaryFindingTitle(issue.triage_findings) ?? issue.message + return [issue.code ?? "", issue.category ?? "", issue.collector ?? "", title].join("|") +} + +export function groupIssues(issues: IssueRecord[]): IssueGroup[] { + const buckets = new Map() + for (const issue of issues) { + const key = groupKey(issue) + const bucket = buckets.get(key) + if (bucket) { + bucket.push(issue) + } else { + buckets.set(key, [issue]) + } + } + + const groups: IssueGroup[] = [] + for (const [key, members] of buckets) { + // Choose sample by composite rank; also use it to derive reduced severity/confidence. + const sorted = [...members].sort(compareIssues) + const sample = sorted[0] + + let maxSev = severityBadge(sample.severity) + let minConf = confidenceBadge(sample.confidence) + let occurrences = 0 + for (const m of members) { + const sev = severityBadge(m.severity) + if (sev.rank > maxSev.rank) maxSev = sev + const conf = confidenceBadge(m.confidence) + if (conf.rank < minConf.rank) minConf = conf + occurrences += occurrenceCount(m.message) + } + + groups.push({ + key, + severity: maxSev.key, + confidence: minConf.key, + title: primaryFindingTitle(sample.triage_findings) ?? sample.message, + count: members.length, + occurrences, + sample, + members, + }) + } + + // Sort groups by the same composite used for individual issues, using the + // sample with the reduced severity/confidence applied. + groups.sort((a, b) => { + const aSample: IssueRecord = { + ...a.sample, + severity: a.severity, + confidence: a.confidence, + // Ensure occurrences tiebreak works via the message suffix. 
+ message: `${a.sample.message} (${a.occurrences}x in _)`, + } + const bSample: IssueRecord = { + ...b.sample, + severity: b.severity, + confidence: b.confidence, + message: `${b.sample.message} (${b.occurrences}x in _)`, + } + return compareIssues(aSample, bSample) + }) + + return groups +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.test.ts new file mode 100644 index 0000000..0994594 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.test.ts @@ -0,0 +1,105 @@ +import { describe, expect, it } from "vitest" +import { compareIssues, occurrenceCount, rankIssues } from "./ranking" +import type { IssueRecord } from "@/types" + +function makeIssue(partial: Partial): IssueRecord { + return { + id: "x", + collector: "triage", + code: "critical_log", + severity: "info", + confidence: "", + category: "", + message: "", + ...partial, + } +} + +describe("occurrenceCount", () => { + it("parses the (Nx in ) suffix", () => { + expect(occurrenceCount("... (3x in logs/dmesg.txt)")).toBe(3) + expect(occurrenceCount("... (32x in logs/dmesg.txt)")).toBe(32) + expect(occurrenceCount("... 
(1× in logs/dmesg.txt)")).toBe(1) + }) + + it("returns 1 when no suffix is present", () => { + expect(occurrenceCount("bare message")).toBe(1) + expect(occurrenceCount("")).toBe(1) + expect(occurrenceCount(undefined)).toBe(1) + }) +}) + +describe("compareIssues", () => { + it("prefers higher severity", () => { + const warn = makeIssue({ id: "w", severity: "warning", confidence: "high" }) + const crit = makeIssue({ id: "c", severity: "critical", confidence: "low" }) + const sorted = rankIssues([warn, crit]) + expect(sorted[0].id).toBe("c") + }) + + it("breaks severity ties on confidence", () => { + const lo = makeIssue({ id: "lo", severity: "warning", confidence: "low" }) + const hi = makeIssue({ id: "hi", severity: "warning", confidence: "high" }) + const sorted = rankIssues([lo, hi]) + expect(sorted[0].id).toBe("hi") + }) + + it("treats unknown confidence as better than low", () => { + const low = makeIssue({ id: "low", severity: "critical", confidence: "low" }) + const unk = makeIssue({ id: "unk", severity: "critical", confidence: "" }) + const sorted = rankIssues([low, unk]) + expect(sorted[0].id).toBe("unk") + }) + + it("breaks severity+confidence ties on occurrence count", () => { + const few = makeIssue({ + id: "few", + severity: "warning", + confidence: "high", + message: "thing (1x in logs/a.txt)", + }) + const many = makeIssue({ + id: "many", + severity: "warning", + confidence: "high", + message: "thing (32x in logs/a.txt)", + }) + const sorted = rankIssues([few, many]) + expect(sorted[0].id).toBe("many") + }) + + it("puts a real PCIe hotplug warning above SATA Link Down boot noise (HGX scenario)", () => { + const pcie = makeIssue({ + id: "pcie", + severity: "warning", + confidence: "high", + code: "critical_log", + message: "PCIe Hotplug Timeout (32x in logs/dmesg.txt)", + }) + const sataBoot = (n: number) => + makeIssue({ + id: `sata${n}`, + severity: "critical", + confidence: "low", + code: "critical_log", + message: `SATA Link Down ata${n} (2x in 
logs/dmesg.txt)`, + }) + const issues = [sataBoot(1), sataBoot(2), sataBoot(3), sataBoot(4), pcie] + const sorted = rankIssues(issues) + // Criticals still lead severity, but within the critical bucket the + // low-confidence ones are sorted deterministically below any higher- + // confidence critical. PCIe (warning/high) ranks below all 4 criticals — + // grouping (Phase 2) then collapses the 4 SATA rows into 1. + expect(sorted[0].severity).toBe("critical") + expect(sorted[sorted.length - 1].id).toBe("pcie") + }) + + it("is a stable comparator", () => { + const a = makeIssue({ id: "a", severity: "info", confidence: "high", code: "z" }) + const b = makeIssue({ id: "b", severity: "info", confidence: "high", code: "a" }) + // b's code comes first lexicographically. + expect(compareIssues(a, b)).toBeGreaterThan(0) + expect(compareIssues(b, a)).toBeLessThan(0) + expect(compareIssues(a, a)).toBe(0) + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.ts new file mode 100644 index 0000000..52975c5 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/ranking.ts @@ -0,0 +1,56 @@ +/** + * Composite rank comparator for the Issues list default sort. + * + * Lexicographic order (each tier breaks ties on the previous): + * 1. Severity rank (critical > warning > info > unknown) + * 2. Confidence rank (high > unknown > low) + * 3. Occurrence count descending (higher count = higher priority) + * 4. Code, then message lexicographic, for deterministic output. + * + * Why lexicographic, not a single composite score? Industry consensus + * (Sentry/Datadog/PagerDuty) — a composite formula can invert the correct + * order, e.g. 54× low-confidence criticals outranking one high-confidence + * warning. Lexicographic preserves the intent: Severity is the dominant + * signal; Confidence tie-breaks it. 
+ */ + +import type { IssueRecord } from "@/types" +import { confidenceBadge, severityBadge } from "@/lib/severity" + +/** + * Parse the trailing "(Nx in )" suffix from an issue message and + * return N. Returns 1 when absent — a finding with no suffix represents a + * single occurrence. + */ +export function occurrenceCount(message: string | null | undefined): number { + if (!message) return 1 + // Matches "(3x in logs/dmesg.txt)" or "(3× in logs/dmesg.txt)". + const m = /\((\d+)\s*[x×]\s+in\s+[^)]+\)\s*$/.exec(message) + if (!m) return 1 + const n = Number(m[1]) + return Number.isFinite(n) && n > 0 ? n : 1 +} + +export function compareIssues(a: IssueRecord, b: IssueRecord): number { + // 1. Severity — higher rank first. + const sevDiff = severityBadge(b.severity).rank - severityBadge(a.severity).rank + if (sevDiff !== 0) return sevDiff + + // 2. Confidence — higher rank first. + const confDiff = confidenceBadge(b.confidence).rank - confidenceBadge(a.confidence).rank + if (confDiff !== 0) return confDiff + + // 3. Occurrence count — higher first. + const countDiff = occurrenceCount(b.message) - occurrenceCount(a.message) + if (countDiff !== 0) return countDiff + + // 4. Deterministic lexicographic tiebreak. + const codeCmp = (a.code ?? "").localeCompare(b.code ?? "") + if (codeCmp !== 0) return codeCmp + return (a.message ?? "").localeCompare(b.message ?? "") +} + +/** Returns a new array sorted by {@link compareIssues}. 
*/ +export function rankIssues(issues: IssueRecord[]): IssueRecord[] { + return [...issues].sort(compareIssues) +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.test.ts new file mode 100644 index 0000000..9830b20 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from "vitest" +import { confidenceBadge, severityBadge } from "./severity" + +describe("severityBadge", () => { + it("distinguishes the three expected severities", () => { + expect(severityBadge("critical").key).toBe("critical") + expect(severityBadge("warning").key).toBe("warning") + expect(severityBadge("info").key).toBe("info") + }) + + it("falls back to unknown for empty or unrecognised values", () => { + expect(severityBadge(null).key).toBe("unknown") + expect(severityBadge("").key).toBe("unknown") + expect(severityBadge("emergency").key).toBe("unknown") + }) + + it("ranks critical > warning > info > unknown", () => { + expect(severityBadge("critical").rank).toBeGreaterThan( + severityBadge("warning").rank, + ) + expect(severityBadge("warning").rank).toBeGreaterThan( + severityBadge("info").rank, + ) + expect(severityBadge("info").rank).toBeGreaterThan( + severityBadge("unknown").rank, + ) + }) + + it("returns a distinct icon per severity so color is not the only signal", () => { + const icons = new Set( + ["critical", "warning", "info", "unknown"].map((s) => severityBadge(s).icon), + ) + expect(icons.size).toBe(4) + }) + + it("is case-insensitive", () => { + expect(severityBadge("CRITICAL").key).toBe("critical") + expect(severityBadge(" Warning ").key).toBe("warning") + }) +}) + +describe("confidenceBadge", () => { + it("recognises high and low", () => { + expect(confidenceBadge("high").key).toBe("high") + expect(confidenceBadge("low").key).toBe("low") + }) + + it("treats missing confidence as 'unknown', 
ranked between high and low", () => { + const unknown = confidenceBadge("") + expect(unknown.key).toBe("unknown") + expect(unknown.rank).toBeGreaterThan(confidenceBadge("low").rank) + expect(unknown.rank).toBeLessThan(confidenceBadge("high").rank) + }) + + it("only de-opaques low confidence rows", () => { + expect(confidenceBadge("high").rowOpacity).toBe("") + expect(confidenceBadge("low").rowOpacity).toMatch(/opacity/) + }) + + it("uses a dashed border for low confidence", () => { + expect(confidenceBadge("low").borderStyle).toBe("border-dashed") + expect(confidenceBadge("high").borderStyle).toBe("border-solid") + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.ts new file mode 100644 index 0000000..56cc9b5 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/severity.ts @@ -0,0 +1,147 @@ +/** + * Shape + color severity and confidence badges. + * + * Per WCAG 1.4.1 and Section 508, color alone must never be the only signal. + * Every severity carries an icon shape (circle / triangle / info / square) + * that is distinguishable in grayscale and to colorblind users. + * + * Icons are chosen from `lucide-react` (already a dependency). Callers render + * the returned `icon` as a JSX element and pair it with the `label` text. + */ + +import type { LucideIcon } from "lucide-react" +import { + AlertCircle, + AlertTriangle, + Circle, + HelpCircle, + Info, +} from "lucide-react" + +export type SeverityKey = "critical" | "warning" | "info" | "unknown" +export type ConfidenceKey = "high" | "low" | "unknown" + +export interface SeverityBadge { + key: SeverityKey + label: string + icon: LucideIcon + /** Tailwind text color class */ + color: string + /** Tailwind border color class (softer, for row borders / panels). */ + border: string + /** Fill/background tint class for chip backgrounds. */ + fill: string + /** Numeric rank, high = more severe. 
Stable for sort reuse. */ + rank: number +} + +export interface ConfidenceBadge { + key: ConfidenceKey + label: string + icon: LucideIcon + /** Tailwind opacity class for the row when confidence is low. */ + rowOpacity: string + /** Tailwind border-style modifier (dashed vs solid). */ + borderStyle: string + /** Text color class. */ + color: string + /** Numeric rank, high = stronger confidence. */ + rank: number +} + +export function severityBadge(raw: string | null | undefined): SeverityBadge { + const key = normaliseSeverity(raw) + switch (key) { + case "critical": + return { + key, + label: "Critical", + icon: AlertCircle, + color: "text-severity-critical", + border: "border-severity-critical/40", + fill: "bg-severity-critical/10", + rank: 3, + } + case "warning": + return { + key, + label: "Warning", + icon: AlertTriangle, + color: "text-severity-warning", + border: "border-severity-warning/40", + fill: "bg-severity-warning/10", + rank: 2, + } + case "info": + return { + key, + label: "Info", + icon: Info, + color: "text-severity-info", + border: "border-severity-info/40", + fill: "bg-severity-info/10", + rank: 1, + } + default: + return { + key: "unknown", + label: "Unknown", + icon: HelpCircle, + color: "text-muted-foreground", + border: "border-border", + fill: "bg-muted", + rank: 0, + } + } +} + +export function confidenceBadge(raw: string | null | undefined): ConfidenceBadge { + const key = normaliseConfidence(raw) + switch (key) { + case "high": + return { + key, + label: "high confidence", + icon: Circle, + rowOpacity: "", + borderStyle: "border-solid", + color: "text-foreground", + rank: 2, + } + case "low": + return { + key, + label: "low confidence", + icon: Circle, + // 50% opacity to de-emphasise, but the value + icon still render. 
+ rowOpacity: "opacity-50", + borderStyle: "border-dashed", + color: "text-muted-foreground", + rank: 0, + } + default: + return { + key: "unknown", + label: "unknown confidence", + icon: HelpCircle, + rowOpacity: "opacity-75", + borderStyle: "border-dotted", + color: "text-muted-foreground", + // Unknown ranks between low and high so pre-R9 archives with missing + // confidence don't sort to the bottom with low-conf noise. + rank: 1, + } + } +} + +function normaliseSeverity(raw: string | null | undefined): SeverityKey { + const v = raw?.toLowerCase().trim() ?? "" + if (v === "critical" || v === "warning" || v === "info") return v + return "unknown" +} + +function normaliseConfidence(raw: string | null | undefined): ConfidenceKey { + const v = raw?.toLowerCase().trim() ?? "" + if (v === "high" || v === "low") return v + return "unknown" +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts new file mode 100644 index 0000000..1767a42 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts @@ -0,0 +1,30 @@ +import { describe, it, expect } from "vitest" +import { sourceLabel } from "./source" + +describe("sourceLabel", () => { + it("returns just the category when no syslog tag is present in the message", () => { + expect(sourceLabel({ category: "HW", message: "pcieport 0000:4c:10.0: ..." })).toBe( + "Hardware", + ) + }) + + it("appends a syslog tag when it adds information", () => { + expect( + sourceLabel({ category: "ERR", message: "NetworkManager[1234]: DHCP lease expired" }), + ).toBe("System Logs · NetworkManager") + }) + + it("drops a tag that would just repeat the category (case-insensitive)", () => { + // TIMEOUT-category messages start with "Timeout:"; the extracted tag is + // "Timeout", same as the category label — appending it is noise. + expect(sourceLabel({ category: "TIMEOUT", message: "Timeout: [Tue ...] pci ..." 
})).toBe( + "Timeout", + ) + }) + + it("passes unknown categories through (forward-compat)", () => { + expect(sourceLabel({ category: "FUTURE_BUCKET", message: "kernel: something" })).toBe( + "FUTURE_BUCKET · kernel", + ) + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts new file mode 100644 index 0000000..e9e57fd --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts @@ -0,0 +1,21 @@ +import type { IssueRecord } from "@/types" +import { categoryLabel } from "./utils" +import { messageComponent } from "./component" + +/** + * Human source label for an issue: category first, then a syslog-tag + * sub-label when the message carries one. Pairs friendly category rendering + * with component-level disambiguation so e.g. 25 rows under "System Logs" + * split into "System Logs · NetworkManager", "· sshd", "· kernel", etc. + */ +export function sourceLabel(issue: Pick<IssueRecord, "category" | "message">): string { + const category = categoryLabel(issue.category) + const component = messageComponent(issue.message) + if (!component) return category + // Don't append a sub-label that just repeats the category (e.g. the + // TIMEOUT-category messages literally start with "Timeout:" — appending + // that tag gives "Timeout · Timeout", which is useless and overflows + // the column). + if (component.toLowerCase() === category.toLowerCase()) return category + return `${category} · ${component}` +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/summary.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/summary.ts new file mode 100644 index 0000000..f29aebd --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/summary.ts @@ -0,0 +1,127 @@ +/** + * Compose a plain-language, deterministic Overview summary from the archive's + * grouped issues, collector skip reasons, and threshold-flagged facts.
+ * + * No LLM, no heuristics beyond what `facts.ts` and `grouping.ts` already + * encode. The output is a short paragraph like: + * + * "8 critical entries are all low-confidence SATA Link Down on ata1..ata8 + * (likely boot-time noise). The most significant hardware signal is PCIe + * Hotplug Timeout (32×, high confidence). Firewall is inactive." + */ + +import type { IssueRecord, CollectorRecord } from "@/types" +import { groupIssues, type IssueGroup } from "@/lib/grouping" +import { factLabel, factSignal } from "@/lib/facts" + +/** + * Build the summary sentences. + * + * Returns an array of sentences (string[]) so callers can join with spaces or + * render each in a distinct paragraph (`<p>`)
. An empty array means "archive is quiet". + */ +export function composeOverviewSummary( + issues: IssueRecord[], + collectors: CollectorRecord[], +): string[] { + const sentences: string[] = [] + + if (issues.length === 0 && !hasAnyBadFact(collectors)) { + return ["No issues recorded and no collector flagged an error."] + } + + const groups = groupIssues(issues) + + // 1. Noise: critical-severity groups with all-low confidence. + // Surface them first with "likely noise" framing so the reader doesn't + // mistake them for a real fault. + const noisyGroups = groups.filter( + (g) => g.severity === "critical" && g.confidence === "low", + ) + if (noisyGroups.length > 0) { + const entries = noisyGroups.reduce((n, g) => n + g.count, 0) + const occ = noisyGroups.reduce((n, g) => n + g.occurrences, 0) + const names = uniqueTitles(noisyGroups).slice(0, 3).join(", ") + sentences.push( + `${entries} critical ${pluralize("entry", entries, "entries")} ` + + `(${occ} total occurrences) are low-confidence ${names} — often boot-time ` + + `noise on inactive hardware, not a genuine fault.`, + ) + } + + // 2. Real hardware/signal findings: critical-high, warning-high, and any + // warning-unknown. Pick the first by composite rank — that's "the thing + // to look at first". + const realSignal = groups.find( + (g) => + (g.severity === "critical" && g.confidence !== "low") || + (g.severity === "warning" && g.confidence === "high"), + ) + if (realSignal) { + sentences.push( + `The most significant signal is ${realSignal.title} ` + + `(${realSignal.occurrences}× occurrences, ${realSignal.confidence} confidence).`, + ) + } + + // 3. Cross-collector bad facts — a per-fact bullet rolled into one sentence. + const badFacts = collectBadFacts(collectors) + if (badFacts.length > 0) { + const shown = badFacts.slice(0, 3).map((f) => `${f.label} (${f.collector})`) + sentences.push(`Non-zero error counters: ${shown.join(", ")}.`) + } + + // 4. Any "inactive firewall" fact → call it out. 
+ for (const c of collectors) { + if (c.facts && c.facts["firewall_posture"] === "inactive") { + sentences.push("Firewall is inactive.") + break + } + } + + // Fallback when we had issues but none matched any of the framings above. + if (sentences.length === 0) { + sentences.push( + `${issues.length} ${pluralize("finding", issues.length)} recorded; ` + + `review the Issues tab for detail.`, + ) + } + + return sentences +} + +function uniqueTitles(groups: IssueGroup[]): string[] { + const seen = new Set() + const titles: string[] = [] + for (const g of groups) { + if (!seen.has(g.title)) { + seen.add(g.title) + titles.push(g.title) + } + } + return titles +} + +function collectBadFacts( + collectors: CollectorRecord[], +): Array<{ key: string; label: string; collector: string }> { + const out: Array<{ key: string; label: string; collector: string }> = [] + for (const c of collectors) { + if (!c.facts) continue + for (const [key, value] of Object.entries(c.facts)) { + if (factSignal(key, value) === "bad") { + out.push({ key, label: factLabel(key), collector: c.collector_id }) + } + } + } + return out +} + +function hasAnyBadFact(collectors: CollectorRecord[]): boolean { + return collectBadFacts(collectors).length > 0 +} + +function pluralize(singular: string, n: number, plural?: string): string { + if (n === 1) return singular + return plural ?? 
`${singular}s` +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/title.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/title.test.ts new file mode 100644 index 0000000..7a23986 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/title.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from "vitest" +import { decoratedTitle, extractDistinguishingToken } from "./title" + +describe("extractDistinguishingToken", () => { + it("extracts SATA port numbers", () => { + expect(extractDistinguishingToken("ata1: SATA link down")).toBe("ata1") + expect(extractDistinguishingToken("ata1.00: COMRESET failed")).toBe("ata1.00") + }) + + it("extracts PCIe BDFs", () => { + expect(extractDistinguishingToken("pcieport 0000:4c:10.0: timeout")).toBe( + "0000:4c:10.0", + ) + expect(extractDistinguishingToken("ptr 0000:0b:00.0: DOE ABORT")).toBe( + "0000:0b:00.0", + ) + }) + + it("extracts NVMe devices", () => { + expect(extractDistinguishingToken("I/O error dev nvme0n1")).toBe("nvme0n1") + expect(extractDistinguishingToken("nvme nvme0: resetting")).toBe("nvme0") + }) + + it("extracts mlx5 ports", () => { + expect(extractDistinguishingToken("mlx5_0 cmd_work_handler")).toBe("mlx5_0") + expect(extractDistinguishingToken("mlx5_bond_0 is down")).toBe( + "mlx5_bond_0", + ) + }) + + it("extracts systemd unit names", () => { + expect( + extractDistinguishingToken("systemd-networkd-wait-online.service timed out"), + ).toBe("systemd-networkd-wait-online.service") + }) + + it("returns undefined for messages with no known tokens", () => { + expect(extractDistinguishingToken("firewall inactive")).toBeUndefined() + expect(extractDistinguishingToken("")).toBeUndefined() + expect(extractDistinguishingToken(undefined)).toBeUndefined() + }) +}) + +describe("decoratedTitle", () => { + it("appends the distinguishing token", () => { + expect(decoratedTitle("SATA Link Down", "ata3: SATA link down")).toBe( + "SATA Link Down · ata3", + ) + 
expect( + decoratedTitle( + "Timeout", + "pcieport 0000:4c:10.0: pciehp: Timeout on hotplug command", + ), + ).toBe("Timeout · 0000:4c:10.0") + }) + + it("leaves the title alone when there's no token", () => { + expect(decoratedTitle("Firewall Posture: inactive", "ufw inactive")).toBe( + "Firewall Posture: inactive", + ) + }) + + it("does not duplicate when the title already ends with the token", () => { + expect(decoratedTitle("Timeout · ata1", "ata1 timed out")).toBe( + "Timeout · ata1", + ) + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/title.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/title.ts new file mode 100644 index 0000000..189cbcf --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/title.ts @@ -0,0 +1,46 @@ +/** + * Extracts the first "distinguishing token" from an issue message so generic + * titles like "Timeout" or "SATA Link Down" don't all look identical in the + * list view. The extracted token is appended to the title as `Title · token` + * in renderers; callers can use the raw message elsewhere. + * + * Tokens recognised (in declared order; first match wins): + * - SATA / ATA port: ata3, ata1.00 + * - PCIe BDF: 0000:0b:00.0, 0000:4c:10.0 + * - NVMe device: nvme0n1, nvme0 + * - mlx5 port: mlx5_0, mlx5_bond_0 + * - systemd unit name: foo.service, bar.mount + * + * Returns undefined when no known pattern matches; callers fall back to the + * bare title. 
+ */ + +const tokenPatterns: RegExp[] = [ + /\b(ata\d+(?:\.\d+)?)\b/i, + /\b([0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f])\b/i, + /\b(nvme\d+n?\d*)\b/i, + /\b(mlx\d(?:_bond)?_\d)\b/i, + /\b([a-z0-9-]+\.(?:service|mount|socket|target|timer|slice))\b/i, +] + +export function extractDistinguishingToken( + message: string | null | undefined, +): string | undefined { + if (!message) return undefined + for (const re of tokenPatterns) { + const m = re.exec(message) + if (m) return m[1] + } + return undefined +} + +export function decoratedTitle( + title: string, + message: string | null | undefined, +): string { + const token = extractDistinguishingToken(message) + if (!token) return title + // Avoid the ugly case where title already ends with the token. + if (title.toLowerCase().endsWith(token.toLowerCase())) return title + return `${title} · ${token}` +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/units.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/units.test.ts new file mode 100644 index 0000000..c019b1e --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/units.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it } from "vitest" +import { humanBytes, humanUptime, parseUptimeSeconds } from "./units" + +describe("humanBytes", () => { + it("formats bytes with binary units", () => { + expect(humanBytes(0)).toBe("0 B") + expect(humanBytes(1023)).toBe("1023 B") + expect(humanBytes(1024)).toBe("1.00 KiB") + expect(humanBytes(1024 * 1024)).toBe("1.00 MiB") + expect(humanBytes(1024 ** 3)).toBe("1.00 GiB") + expect(humanBytes(1024 ** 4)).toBe("1.00 TiB") + }) + + it("reduces decimals as values grow", () => { + expect(humanBytes(1500)).toBe("1.46 KiB") + expect(humanBytes(15_000_000)).toBe("14.3 MiB") + expect(humanBytes(150_000_000_000)).toBe("140 GiB") + }) + + it("handles the HGX-sized memory (≈2.1 TB)", () => { + // 2146124587856 bytes → ≈1.95 TiB + const out = humanBytes(2146124587856) + 
expect(out).toMatch(/^1\.9\d TiB$/) + }) + + it("handles invalid input gracefully", () => { + expect(humanBytes(-1)).toBe("0 B") + expect(humanBytes(Number.NaN)).toBe("0 B") + expect(humanBytes(Number.POSITIVE_INFINITY)).toBe("0 B") + }) +}) + +describe("parseUptimeSeconds", () => { + it("accepts a number of seconds", () => { + expect(parseUptimeSeconds(0)).toBe(0) + expect(parseUptimeSeconds(3661)).toBe(3661) + }) + + it("accepts a plain numeric string", () => { + expect(parseUptimeSeconds("3600")).toBe(3600) + }) + + it("parses the collector's compound format", () => { + // 126755 minutes + 44.34 seconds = 126755 * 60 + 44 = 7,605,344s + expect(parseUptimeSeconds("126755m44.34s")).toBe(7605344) + }) + + it("parses canonical d/h/m/s forms", () => { + expect(parseUptimeSeconds("1d")).toBe(86400) + expect(parseUptimeSeconds("1d 2h 3m 4s")).toBe(86400 + 7200 + 180 + 4) + }) + + it("returns null for unparseable input", () => { + expect(parseUptimeSeconds("hello")).toBeNull() + expect(parseUptimeSeconds(undefined)).toBeNull() + expect(parseUptimeSeconds(null)).toBeNull() + expect(parseUptimeSeconds(-1)).toBeNull() + }) +}) + +describe("humanUptime", () => { + it("formats days/hours/minutes for long uptime", () => { + expect(humanUptime(88 * 86400 + 1 * 3600 + 45 * 60)).toBe("88d 1h 45m") + }) + + it("formats short uptimes in seconds", () => { + expect(humanUptime(30)).toBe("30s") + }) + + it("falls back to 'unknown' for bad input", () => { + expect(humanUptime("nonsense")).toBe("unknown") + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/units.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/units.ts new file mode 100644 index 0000000..8087c91 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/units.ts @@ -0,0 +1,87 @@ +/** + * Humanised units for display. 
`formatBytes` already exists in `utils.ts` but + * caps at GB — this module adds TiB + binary suffixes and a robust uptime + * formatter that accepts either an integer seconds value or the goofy + * `126755m44.34s` string the collector's overview.txt can emit. + */ + +const BINARY_UNITS = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"] as const + +/** Humanise a byte count with binary (1024) units. */ +export function humanBytes(bytes: number): string { + if (!Number.isFinite(bytes) || bytes < 0) return "0 B" + let value = bytes + let unit = 0 + while (value >= 1024 && unit < BINARY_UNITS.length - 1) { + value /= 1024 + unit += 1 + } + const decimals = value >= 100 || unit === 0 ? 0 : value >= 10 ? 1 : 2 + return `${value.toFixed(decimals)} ${BINARY_UNITS[unit]}` +} + +/** + * Humanise an uptime value. Accepts: + * - a number: seconds since boot + * - a string: "126755m44.34s" (collector's overview format), "1d 2h 3m", or + * a plain numeric string ("86400"). + * + * Returns "unknown" when the input can't be parsed. + */ +export function humanUptime(input: unknown): string { + const seconds = parseUptimeSeconds(input) + if (seconds == null) return "unknown" + return formatSeconds(seconds) +} + +/** Internal — exported for tests. */ +export function parseUptimeSeconds(input: unknown): number | null { + if (typeof input === "number" && Number.isFinite(input) && input >= 0) { + return Math.floor(input) + } + if (typeof input !== "string") return null + const trimmed = input.trim() + if (!trimmed) return null + // Plain numeric. + const asNum = Number(trimmed) + if (Number.isFinite(asNum) && asNum >= 0 && /^\d+(\.\d+)?$/.test(trimmed)) { + return Math.floor(asNum) + } + // Compound "NNdNNhNNmNNs" or variants with spaces. Also catches the + // collector's "126755m44.34s" one-off. 
+ let total = 0 + let found = false + for (const match of trimmed.matchAll(/(\d+(?:\.\d+)?)\s*([dhms])/gi)) { + const val = Number(match[1]) + if (!Number.isFinite(val)) return null + found = true + switch (match[2].toLowerCase()) { + case "d": + total += val * 86400 + break + case "h": + total += val * 3600 + break + case "m": + total += val * 60 + break + case "s": + total += val + break + } + } + if (!found) return null + return Math.floor(total) +} + +function formatSeconds(seconds: number): string { + if (seconds < 60) return `${seconds}s` + const days = Math.floor(seconds / 86400) + const hours = Math.floor((seconds % 86400) / 3600) + const minutes = Math.floor((seconds % 3600) / 60) + const parts: string[] = [] + if (days > 0) parts.push(`${days}d`) + if (hours > 0 || days > 0) parts.push(`${hours}h`) + parts.push(`${minutes}m`) + return parts.join(" ") +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts new file mode 100644 index 0000000..7b72caa --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts @@ -0,0 +1,21 @@ +import { describe, expect, it } from "vitest" +import { sampleLine } from "./utils" + +describe("sampleLine", () => { + it("strips a repeated title prefix (with ':') from the message", () => { + expect( + sampleLine("Firewall Posture: inactive", "Firewall Posture: inactive: Firewall inactive (ufw)"), + ).toBe("Firewall inactive (ufw)") + }) + + it("returns the message unchanged when it does not echo the title", () => { + expect( + sampleLine("PCIe Hotplug Timeout", "pcieport 0000:4c:10.0: Timeout on hotplug command 0x05c0"), + ).toBe("pcieport 0000:4c:10.0: Timeout on hotplug command 0x05c0") + }) + + it("returns the message unchanged when the title is falsy", () => { + expect(sampleLine("", "anything")).toBe("anything") + expect(sampleLine(undefined, "anything")).toBe("anything") + }) +}) diff --git 
a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts index 9f7e13f..ffe0e8e 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts @@ -38,6 +38,18 @@ export function shortFingerprint(value: string) { return `${value.slice(0, 8)}\u2026` } +/** + * Strip a "Title:" prefix from a message when the title has been echoed in + * the message text. Prevents renderings like "Firewall Posture: inactive: + * Firewall inactive (ufw)" where the subline re-states what the title + * already says. + */ +export function sampleLine(title: string | undefined, message: string): string { + if (!title) return message + const prefix = `${title}:` + return message.startsWith(prefix) ? message.slice(prefix.length).trimStart() : message +} + export function severityColor(severity: string) { switch (severity.toLowerCase()) { case "critical": @@ -159,6 +171,17 @@ export function sortFindings(findings: TriageFinding[]) { return [...findings].sort(compareFindingPrimaryOrder) } +/** + * Normalise a raw finding title: trim whitespace and drop a single trailing + * colon. Some collectors emit titles like `"Timeout:"` — the colon then + * leaks into `decoratedTitle()` output as `"Timeout: · 0000:05:00.0"`. + */ +function cleanTitle(raw: string | undefined | null): string | undefined { + if (!raw) return undefined + const trimmed = raw.trim().replace(/:+$/, "").trim() + return trimmed.length > 0 ? trimmed : undefined +} + /** Returns the title of the highest-priority finding, or undefined. 
*/ export function primaryFindingTitle( findings?: TriageFinding[], @@ -171,5 +194,5 @@ export function primaryFindingTitle( best = cur } } - return best.title + return cleanTitle(best.title) } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/types.ts b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts index 6384118..41683cd 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/types.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts @@ -31,11 +31,18 @@ export type ArchiveSummary = { * eventually fills these in — render "—" for null in the meantime). */ triage_finding_count: number | null + uptime_seconds?: number | null status: string error_reason?: string compressed_size: number } +export type SkipReason = { + reason: string + detail: string + artifact_path?: string +} + export type CollectorRecord = { collector_id: string status: string @@ -44,6 +51,7 @@ export type CollectorRecord = { skipped_count: number error_count: number facts?: Record + skip_reasons?: SkipReason[] } export type TriageFinding = { @@ -117,6 +125,12 @@ export type IssuesResponse = { total: number } +export type IssueState = { + issue_fingerprint: string + state: "ack" | "dismissed" + updated_at: string +} + export type ArtifactsResponse = { items: ArtifactRecord[] total: number diff --git a/customers/vm-troubleshooting-dashboard/frontend/vitest.config.ts b/customers/vm-troubleshooting-dashboard/frontend/vitest.config.ts new file mode 100644 index 0000000..9ea5526 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/vitest.config.ts @@ -0,0 +1,18 @@ +import path from "path" +import { defineConfig } from "vitest/config" +import react from "@vitejs/plugin-react" + +export default defineConfig({ + plugins: [react()], + resolve: { + alias: { + "@": path.resolve(__dirname, "./src"), + }, + }, + test: { + environment: "jsdom", + globals: true, + include: ["src/**/*.test.{ts,tsx}"], + setupFiles: ["./vitest.setup.ts"], + }, +}) 
diff --git a/customers/vm-troubleshooting-dashboard/frontend/vitest.setup.ts b/customers/vm-troubleshooting-dashboard/frontend/vitest.setup.ts new file mode 100644 index 0000000..b9e7622 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/vitest.setup.ts @@ -0,0 +1 @@ +import "@testing-library/jest-dom/vitest" diff --git a/customers/vm-troubleshooting-dashboard/internal/api/server.go b/customers/vm-troubleshooting-dashboard/internal/api/server.go index 5dc1340..a5e1e81 100644 --- a/customers/vm-troubleshooting-dashboard/internal/api/server.go +++ b/customers/vm-troubleshooting-dashboard/internal/api/server.go @@ -53,8 +53,10 @@ const uploadLimiterTTL = 15 * time.Minute const uploadLimiterMaxEntries = 4096 // forwardedUserHeaders are the identity headers trusted when the operator -// opts in to --trust-forwarded-user. -var forwardedUserHeaders = []string{"X-Forwarded-User", "X-Forwarded-Email", "X-Remote-User"} +// opts in to --trust-forwarded-user. Ordered by preference for display: +// X-Forwarded-Email is human-readable, X-Forwarded-User under Authentik's +// default "hashed ID" subject mode is an opaque hash. 
+var forwardedUserHeaders = []string{"X-Forwarded-Email", "X-Forwarded-User", "X-Remote-User"} type limiterEntry struct { limiter *rate.Limiter @@ -219,6 +221,8 @@ func (s *Server) routes() { s.mux.HandleFunc("DELETE /api/v1/archives/{archiveID}", s.handleDeleteArchive) s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/issues", s.handleListIssues) s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/issues/{issueID}", s.handleGetIssue) + s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/issue-state", s.handleListIssueState) + s.mux.HandleFunc("POST /api/v1/archives/{archiveID}/issue-state/{fingerprint}", s.handleSetIssueState) s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/artifacts", s.handleListArtifacts) s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/artifacts/view/{path...}", s.handleViewArtifact) s.mux.HandleFunc("GET /api/v1/archives/{archiveID}/artifacts/download/{path...}", s.handleDownloadArtifact) @@ -399,6 +403,44 @@ func (s *Server) handleGetIssue(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusNotFound, "issue not found") } +// handleListIssueState returns the map of fingerprint → state rows for an +// archive. Empty objects are valid ("no state set on any issue"). +func (s *Server) handleListIssueState(w http.ResponseWriter, r *http.Request) { + archiveID := r.PathValue("archiveID") + states, err := s.store.LoadIssueStates(archiveID) + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + writeJSON(w, http.StatusOK, map[string]any{"items": states}) +} + +// handleSetIssueState sets (or clears, with `{"state": ""}`) the state for +// a single issue fingerprint. Accepts JSON `{"state": "ack" | "dismissed" | ""}`. 
+func (s *Server) handleSetIssueState(w http.ResponseWriter, r *http.Request) { + archiveID := r.PathValue("archiveID") + fingerprint := r.PathValue("fingerprint") + var body struct { + State string `json:"state"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + writeError(w, http.StatusBadRequest, "invalid JSON body") + return + } + switch body.State { + case "", "ack", "dismissed": + // ok + default: + writeError(w, http.StatusBadRequest, "state must be 'ack', 'dismissed', or empty") + return + } + if err := s.store.SetIssueState(archiveID, fingerprint, body.State); err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + w.WriteHeader(http.StatusNoContent) +} + func (s *Server) handleListArtifacts(w http.ResponseWriter, r *http.Request) { archive, ok := s.getArchive(w, r) if !ok { diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go index eb5eac2..3dbd2c4 100644 --- a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go @@ -11,6 +11,7 @@ import ( "os" "path" "path/filepath" + "strconv" "strings" "time" @@ -128,6 +129,7 @@ func buildDetail(workDir string, manifest model.Manifest, uploadedBy string, com SkippedCount: collector.SkippedCount, ErrorCount: collector.ErrorCount, Facts: collector.Facts, + SkipReasons: append([]model.ManifestSkipReason(nil), collector.SkipReasons...), }) for _, issue := range collector.Issues { record := model.IssueRecord{ @@ -193,6 +195,7 @@ func buildDetail(workDir string, manifest model.Manifest, uploadedBy string, com ArtifactCount: len(artifacts), Status: "ready", StorageBytes: compressedSize, + UptimeSeconds: extractUptimeSeconds(manifest.Collectors), } return &model.ArchiveDetail{ @@ -203,6 +206,52 @@ func buildDetail(workDir string, manifest model.Manifest, uploadedBy string, com }, nil } +// 
extractUptimeSeconds looks across the system collector's facts for an +// integer "uptime_seconds"-shaped key. Returns nil when no fact matches or +// the value isn't a finite positive number. +func extractUptimeSeconds(collectors map[string]model.ManifestCollector) *int64 { + keys := []string{"uptime_seconds", "system.uptime_seconds", "system.uptime"} + for _, name := range []string{"system", "hypervisor"} { + c, ok := collectors[name] + if !ok { + continue + } + for _, k := range keys { + raw, ok := c.Facts[k] + if !ok { + continue + } + if v, ok := coerceSeconds(raw); ok { + return &v + } + } + } + return nil +} + +func coerceSeconds(v any) (int64, bool) { + switch t := v.(type) { + case float64: + if t >= 0 { + return int64(t), true + } + case int64: + if t >= 0 { + return t, true + } + case int: + if t >= 0 { + return int64(t), true + } + case string: + n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64) + if err == nil && n >= 0 { + return n, true + } + } + return 0, false +} + func extractArchive(ctx context.Context, archivePath, dst string) error { file, err := os.Open(archivePath) if err != nil { diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/uptime_test.go b/customers/vm-troubleshooting-dashboard/internal/ingest/uptime_test.go new file mode 100644 index 0000000..2b986ba --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/uptime_test.go @@ -0,0 +1,90 @@ +package ingest + +import ( + "testing" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" +) + +// Uptime is extracted from the system (or hypervisor) collector's facts at +// ingest time. The helper has to accept the handful of number-shaped types +// that land on the Go side when a JSON manifest is unmarshalled +// (float64 is the default for JSON numbers; the collector also emits +// pre-typed int64 values via integerFactKeys). 
+func TestExtractUptimeSecondsFromSystemFacts(t *testing.T) { + t.Parallel() + cases := []struct { + name string + coll map[string]model.ManifestCollector + wantOK bool + wantValue int64 + }{ + { + name: "uptime_seconds float64 (default unmarshal)", + coll: map[string]model.ManifestCollector{ + "system": {Facts: map[string]any{"uptime_seconds": float64(3600)}}, + }, + wantOK: true, wantValue: 3600, + }, + { + name: "uptime_seconds int64", + coll: map[string]model.ManifestCollector{ + "system": {Facts: map[string]any{"uptime_seconds": int64(120)}}, + }, + wantOK: true, wantValue: 120, + }, + { + name: "uptime_seconds as numeric string", + coll: map[string]model.ManifestCollector{ + "system": {Facts: map[string]any{"uptime_seconds": "7200"}}, + }, + wantOK: true, wantValue: 7200, + }, + { + name: "falls back to hypervisor when system has no uptime", + coll: map[string]model.ManifestCollector{ + "system": {Facts: map[string]any{"cpu_cores": float64(8)}}, + "hypervisor": {Facts: map[string]any{"uptime_seconds": float64(42)}}, + }, + wantOK: true, wantValue: 42, + }, + { + name: "absent across all collectors", + coll: map[string]model.ManifestCollector{ + "system": {Facts: map[string]any{"cpu_cores": float64(8)}}, + }, + wantOK: false, + }, + { + name: "string 'unavailable' sentinel → not set", + coll: map[string]model.ManifestCollector{ + "system": {Facts: map[string]any{"uptime_seconds": "unavailable"}}, + }, + wantOK: false, + }, + { + name: "negative value rejected", + coll: map[string]model.ManifestCollector{ + "system": {Facts: map[string]any{"uptime_seconds": float64(-1)}}, + }, + wantOK: false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := extractUptimeSeconds(tc.coll) + if !tc.wantOK { + if got != nil { + t.Errorf("expected nil, got %d", *got) + } + return + } + if got == nil { + t.Fatalf("expected %d, got nil", tc.wantValue) + } + if *got != tc.wantValue { + t.Errorf("expected %d, got %d", tc.wantValue, *got) + } + }) + } 
+} diff --git a/customers/vm-troubleshooting-dashboard/internal/model/types.go b/customers/vm-troubleshooting-dashboard/internal/model/types.go index 0519fa2..49882b3 100644 --- a/customers/vm-troubleshooting-dashboard/internal/model/types.go +++ b/customers/vm-troubleshooting-dashboard/internal/model/types.go @@ -143,14 +143,24 @@ type ArtifactRecord struct { ExistsOnDisk bool `json:"exists_on_disk"` } +// IssueState is the per-issue triage state (ack/dismissed). Keyed by +// archive_id + issue_fingerprint — re-uploading the same archive preserves +// state, and issues without a fingerprint cannot hold state. +type IssueState struct { + Fingerprint string `json:"issue_fingerprint"` + State string `json:"state"` // "ack" | "dismissed" + UpdatedAt string `json:"updated_at"` +} + type CollectorRecord struct { - ID string `json:"collector_id"` - Status string `json:"status"` - DurationMS int64 `json:"duration_ms"` - ArtifactCount int `json:"artifact_count"` - SkippedCount int `json:"skipped_count"` - ErrorCount int `json:"error_count"` - Facts map[string]any `json:"facts,omitempty"` + ID string `json:"collector_id"` + Status string `json:"status"` + DurationMS int64 `json:"duration_ms"` + ArtifactCount int `json:"artifact_count"` + SkippedCount int `json:"skipped_count"` + ErrorCount int `json:"error_count"` + Facts map[string]any `json:"facts,omitempty"` + SkipReasons []ManifestSkipReason `json:"skip_reasons,omitempty"` } type IssueCounts struct { @@ -177,6 +187,7 @@ type ArchiveSummary struct { Status string `json:"status"` ErrorReason string `json:"error_reason,omitempty"` StorageBytes int64 `json:"compressed_size"` + UptimeSeconds *int64 `json:"uptime_seconds,omitempty"` } type ArchiveDetail struct { diff --git a/customers/vm-troubleshooting-dashboard/internal/store/evidence.go b/customers/vm-troubleshooting-dashboard/internal/store/evidence.go index 2effde1..ceecdaf 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/evidence.go +++ 
b/customers/vm-troubleshooting-dashboard/internal/store/evidence.go @@ -10,6 +10,35 @@ import ( const defaultSuggestedArtifactLimit = 6 +// categoryArtifactPrefixes broadens evidence linking beyond strict +// parser-hint/token matches: a DISK-category finding should elevate storage +// and block-device artifacts even if its exact parser-hint isn't attached. +// The bonus is additive, so exact hint matches still outrank broadened ones. +// Prefixes are case-insensitive and matched against the normalised path. +var categoryArtifactPrefixes = map[string][]string{ + "DISK": {"storage/", "hardware/block_devices", "hardware/nvme_", "hardware/smart_"}, + "HW": {"hardware/pcie_", "hardware/lspci", "hardware/dmidecode", "hardware/edac"}, + "NET": {"network/"}, + "MEM": {"hardware/edac", "hardware/memory"}, + "GPU": {"hardware/nvidia", "hardware/gpu", "hardware/dcgm"}, +} + +const categoryArtifactBonus = 5 + +func categoryArtifactMatch(category, path string) bool { + prefixes, ok := categoryArtifactPrefixes[strings.ToUpper(strings.TrimSpace(category))] + if !ok { + return false + } + lower := strings.ToLower(path) + for _, p := range prefixes { + if strings.HasPrefix(lower, p) { + return true + } + } + return false +} + type evidenceResolver struct { maxSuggestions int } @@ -44,6 +73,10 @@ func (r evidenceResolver) suggest(issue model.IssueRecord, artifacts []model.Art candidates := make([]rankedArtifact, 0, len(artifacts)) for _, artifact := range artifacts { score, tokenHits, locality := scoreArtifact(issueFeatures, issue.Collector, artifact) + if categoryArtifactMatch(issue.Category, artifact.Path) { + score += categoryArtifactBonus + locality++ + } if score <= 0 { continue } diff --git a/customers/vm-troubleshooting-dashboard/internal/store/evidence_category_test.go b/customers/vm-troubleshooting-dashboard/internal/store/evidence_category_test.go new file mode 100644 index 0000000..76faa14 --- /dev/null +++ 
b/customers/vm-troubleshooting-dashboard/internal/store/evidence_category_test.go @@ -0,0 +1,76 @@ +package store + +import ( + "testing" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" +) + +// Category-aware evidence broadening (R13) ensures a DISK-category finding +// elevates storage/hardware artifacts even when the exact parser-hint match +// isn't attached. The broadening is additive: the exact-match path still +// wins when present. +func TestCategoryArtifactMatch(t *testing.T) { + t.Parallel() + cases := []struct { + name string + category string + path string + want bool + }{ + {"DISK matches storage/", "DISK", "storage/block_devices.txt", true}, + {"DISK matches hardware/nvme_", "DISK", "hardware/nvme_smart.txt", true}, + {"DISK matches hardware/smart_", "DISK", "hardware/smart_log.txt", true}, + {"DISK does not match network/", "DISK", "network/route.txt", false}, + {"HW matches hardware/pcie_", "HW", "hardware/pcie_aer.json", true}, + {"HW matches hardware/lspci", "HW", "hardware/lspci-nn.txt", true}, + {"HW does not match services/", "HW", "services/systemctl.txt", false}, + {"NET matches network/", "NET", "network/devlink.json", true}, + {"unknown category", "NOPE", "anywhere.txt", false}, + {"empty path", "HW", "", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := categoryArtifactMatch(tc.category, tc.path) + if got != tc.want { + t.Errorf("%s+%s: want %v got %v", tc.category, tc.path, tc.want, got) + } + }) + } +} + +// A DISK finding with no parser-hint match should still pick up +// storage/smart_* as evidence thanks to the category bonus. If a completely +// unrelated artifact exists it must rank below. 
+func TestScoreArtifactCategoryBonusLiftsStorage(t *testing.T) { + t.Parallel() + issue := model.IssueRecord{ + Collector: "journal", + Code: "critical_log", + Severity: "critical", + Confidence: "high", + Category: "DISK", + Message: "I/O error on device sda3", + } + storageArtifact := model.ArtifactRecord{ + Path: "storage/smart_sda.txt", + Collector: "storage", + ContentType: "text/plain", + ExistsOnDisk: true, + } + unrelated := model.ArtifactRecord{ + Path: "network/route.txt", + Collector: "network", + ContentType: "text/plain", + ExistsOnDisk: true, + } + + resolver := evidenceResolver{maxSuggestions: 5} + got := resolver.suggest(issue, []model.ArtifactRecord{unrelated, storageArtifact}) + if len(got) == 0 { + t.Fatalf("expected at least one suggestion") + } + if got[0] != "storage/smart_sda.txt" { + t.Errorf("category-broadened storage artifact should rank first, got %q (full: %v)", got[0], got) + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/issue_state_test.go b/customers/vm-troubleshooting-dashboard/internal/store/issue_state_test.go new file mode 100644 index 0000000..7cb21a7 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/store/issue_state_test.go @@ -0,0 +1,125 @@ +package store + +import ( + "testing" +) + +// Issue-state CRUD is the backbone of the Ack/Dismiss workflow. The tests +// here verify the contract promised in the plan: +// - set replaces previous state for the same (archive, fingerprint). +// - empty state clears the row (distinct from "ack" or "dismissed"). +// - load returns every row; idempotent list semantics. +// - state survives across re-uploads of the same archive because the +// table is keyed by archive_id + issue_fingerprint. +func TestIssueStateSetAndLoad(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + defer st.Close() + + // Insert a stub archive row so the FK allows our state rows. 
+ if _, err := st.db.Exec( + `INSERT INTO archives (archive_id, storage_path) VALUES (?, ?)`, + "arc-1", root, + ); err != nil { + t.Fatalf("seed archive: %v", err) + } + + if err := st.SetIssueState("arc-1", "fp-a", "ack"); err != nil { + t.Fatalf("set ack: %v", err) + } + if err := st.SetIssueState("arc-1", "fp-b", "dismissed"); err != nil { + t.Fatalf("set dismissed: %v", err) + } + + states, err := st.LoadIssueStates("arc-1") + if err != nil { + t.Fatalf("load: %v", err) + } + if len(states) != 2 { + t.Fatalf("expected 2 state rows, got %d", len(states)) + } + m := map[string]string{} + for _, s := range states { + m[s.Fingerprint] = s.State + } + if m["fp-a"] != "ack" { + t.Errorf("fp-a: want ack, got %q", m["fp-a"]) + } + if m["fp-b"] != "dismissed" { + t.Errorf("fp-b: want dismissed, got %q", m["fp-b"]) + } +} + +func TestIssueStateUpsertReplacesPrevious(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + defer st.Close() + + if _, err := st.db.Exec(`INSERT INTO archives (archive_id, storage_path) VALUES (?, ?)`, "a", root); err != nil { + t.Fatal(err) + } + + if err := st.SetIssueState("a", "fp", "ack"); err != nil { + t.Fatal(err) + } + if err := st.SetIssueState("a", "fp", "dismissed"); err != nil { + t.Fatal(err) + } + states, err := st.LoadIssueStates("a") + if err != nil { + t.Fatal(err) + } + if len(states) != 1 { + t.Fatalf("upsert must not duplicate; got %d rows", len(states)) + } + if states[0].State != "dismissed" { + t.Errorf("state: want dismissed, got %q", states[0].State) + } +} + +func TestIssueStateEmptyClears(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + defer st.Close() + + if _, err := st.db.Exec(`INSERT INTO archives (archive_id, storage_path) VALUES (?, ?)`, "a", root); err != nil { + t.Fatal(err) + } + _ = st.SetIssueState("a", "fp", "ack") + if err := st.SetIssueState("a", "fp", 
""); err != nil { + t.Fatalf("clear: %v", err) + } + states, err := st.LoadIssueStates("a") + if err != nil { + t.Fatal(err) + } + if len(states) != 0 { + t.Errorf("empty state must delete the row; got %d", len(states)) + } +} + +func TestIssueStateRejectsEmptyFingerprint(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + defer st.Close() + + if err := st.SetIssueState("a", "", "ack"); err == nil { + t.Fatalf("expected error for empty fingerprint") + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/schema.sql b/customers/vm-troubleshooting-dashboard/internal/store/schema.sql index be59599..7a5a3e9 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/schema.sql +++ b/customers/vm-troubleshooting-dashboard/internal/store/schema.sql @@ -15,19 +15,21 @@ CREATE TABLE IF NOT EXISTS archives ( platform_kernel TEXT, schema_version TEXT, manifest_json BLOB, - triage_finding_count INTEGER + triage_finding_count INTEGER, + uptime_seconds INTEGER ); CREATE TABLE IF NOT EXISTS collectors ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - archive_id TEXT NOT NULL REFERENCES archives(archive_id) ON DELETE CASCADE, - collector_id TEXT NOT NULL, - status TEXT NOT NULL, - duration_ms INTEGER NOT NULL, - artifact_count INTEGER NOT NULL DEFAULT 0, - skipped_count INTEGER NOT NULL DEFAULT 0, - error_count INTEGER NOT NULL DEFAULT 0, - facts_json TEXT, + id INTEGER PRIMARY KEY AUTOINCREMENT, + archive_id TEXT NOT NULL REFERENCES archives(archive_id) ON DELETE CASCADE, + collector_id TEXT NOT NULL, + status TEXT NOT NULL, + duration_ms INTEGER NOT NULL, + artifact_count INTEGER NOT NULL DEFAULT 0, + skipped_count INTEGER NOT NULL DEFAULT 0, + error_count INTEGER NOT NULL DEFAULT 0, + facts_json TEXT, + skip_reasons_json TEXT, UNIQUE(archive_id, collector_id) ); @@ -45,6 +47,14 @@ CREATE TABLE IF NOT EXISTS issues ( unresolved_artifacts_json TEXT ); +CREATE TABLE IF NOT EXISTS 
issue_state ( + archive_id TEXT NOT NULL REFERENCES archives(archive_id) ON DELETE CASCADE, + issue_fingerprint TEXT NOT NULL, + state TEXT NOT NULL, + updated_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')), + PRIMARY KEY (archive_id, issue_fingerprint) +); + CREATE INDEX IF NOT EXISTS idx_issues_archive ON issues(archive_id); CREATE INDEX IF NOT EXISTS idx_issues_severity ON issues(archive_id, severity); CREATE INDEX IF NOT EXISTS idx_issues_confidence ON issues(archive_id, confidence); diff --git a/customers/vm-troubleshooting-dashboard/internal/store/store.go b/customers/vm-troubleshooting-dashboard/internal/store/store.go index cfca26f..2b83a0b 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/store.go +++ b/customers/vm-troubleshooting-dashboard/internal/store/store.go @@ -63,6 +63,14 @@ func New(rootDir string) (*Store, error) { db.Close() return nil, err } + if err := migrateCollectorsSkipReasons(db); err != nil { + db.Close() + return nil, err + } + if err := migrateArchivesUptimeSeconds(db); err != nil { + db.Close() + return nil, err + } st := &Store{db: db, rootDir: rootDir} if err := st.backfillTriageFindingCounts(); err != nil { @@ -88,6 +96,88 @@ func migrateArchivesTriageCount(db *sql.DB) error { return nil } +// SetIssueState upserts the state row for (archive, fingerprint). Passing +// `state == ""` deletes the row (clears the state). +func (s *Store) SetIssueState(archiveID, fingerprint, state string) error { + if fingerprint == "" { + return fmt.Errorf("issue_fingerprint required") + } + if state == "" { + _, err := s.db.Exec( + `DELETE FROM issue_state WHERE archive_id = ? AND issue_fingerprint = ?`, + archiveID, fingerprint, + ) + return err + } + _, err := s.db.Exec( + `INSERT INTO issue_state (archive_id, issue_fingerprint, state) + VALUES (?, ?, ?) 
+ ON CONFLICT(archive_id, issue_fingerprint) + DO UPDATE SET state = excluded.state, + updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now')`, + archiveID, fingerprint, state, + ) + return err +} + +// LoadIssueStates returns all stored state rows for an archive. +func (s *Store) LoadIssueStates(archiveID string) ([]model.IssueState, error) { + rows, err := s.db.Query( + `SELECT issue_fingerprint, state, updated_at + FROM issue_state WHERE archive_id = ? ORDER BY updated_at DESC`, + archiveID, + ) + if err != nil { + return nil, err + } + defer rows.Close() + var out []model.IssueState + for rows.Next() { + var s model.IssueState + if err := rows.Scan(&s.Fingerprint, &s.State, &s.UpdatedAt); err != nil { + return nil, err + } + out = append(out, s) + } + return out, rows.Err() +} + +// migrateArchivesUptimeSeconds adds the uptime_seconds column to existing +// databases (R12). NULL values represent "unknown" — pre-migration archives +// or archives whose system collector didn't emit the fact. +func migrateArchivesUptimeSeconds(db *sql.DB) error { + var n int + err := db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('archives') WHERE name='uptime_seconds'`).Scan(&n) + if err != nil { + return fmt.Errorf("pragma archives uptime_seconds: %w", err) + } + if n > 0 { + return nil + } + if _, err := db.Exec(`ALTER TABLE archives ADD COLUMN uptime_seconds INTEGER`); err != nil { + return fmt.Errorf("add uptime_seconds: %w", err) + } + return nil +} + +// migrateCollectorsSkipReasons adds the skip_reasons_json column to existing +// databases. NULL values represent "unknown" (pre-migration archives); +// populated rows carry the JSON-marshalled skip reasons from the manifest. 
+func migrateCollectorsSkipReasons(db *sql.DB) error { + var n int + err := db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('collectors') WHERE name='skip_reasons_json'`).Scan(&n) + if err != nil { + return fmt.Errorf("pragma collectors: %w", err) + } + if n > 0 { + return nil + } + if _, err := db.Exec(`ALTER TABLE collectors ADD COLUMN skip_reasons_json TEXT`); err != nil { + return fmt.Errorf("add skip_reasons_json: %w", err) + } + return nil +} + // backfillTriageFindingCounts populates triage_finding_count for archives // whose value is NULL. Rows are read fully into memory before any UPDATE is // issued, then updates run inside a single transaction. This avoids @@ -220,11 +310,16 @@ func (s *Store) SaveBounded(detail *model.ArchiveDetail, maxArchives int) error triageCount := countTriageFindingsOnDisk(storagePath) + var uptimeVal any + if detail.Summary.UptimeSeconds != nil { + uptimeVal = *detail.Summary.UptimeSeconds + } _, err = tx.Exec(`INSERT INTO archives ( archive_id, storage_path, uploaded_at, uploaded_by, status, error_reason, compressed_size, hostname, generated_at, version, commit_hash, - platform_os, platform_kernel, schema_version, manifest_json, triage_finding_count - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + platform_os, platform_kernel, schema_version, manifest_json, + triage_finding_count, uptime_seconds + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, archiveID, storagePath, detail.Summary.UploadedAt.UTC().Format("2006-01-02T15:04:05Z"), @@ -241,6 +336,7 @@ func (s *Store) SaveBounded(detail *model.ArchiveDetail, maxArchives int) error detail.Summary.SchemaVersion, manifestJSON, triageCount, + uptimeVal, ) if err != nil { return fmt.Errorf("insert archive: %w", err) @@ -256,12 +352,23 @@ func (s *Store) SaveBounded(detail *model.ArchiveDetail, maxArchives int) error s := string(data) factsJSON = &s } + var skipReasonsJSON *string + if len(c.SkipReasons) > 0 { + data, err := json.Marshal(c.SkipReasons) 
+ if err != nil { + return fmt.Errorf("marshal skip reasons: %w", err) + } + s := string(data) + skipReasonsJSON = &s + } _, err := tx.Exec(`INSERT INTO collectors ( archive_id, collector_id, status, duration_ms, - artifact_count, skipped_count, error_count, facts_json - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, + artifact_count, skipped_count, error_count, facts_json, + skip_reasons_json + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, archiveID, c.ID, c.Status, c.DurationMS, c.ArtifactCount, c.SkippedCount, c.ErrorCount, factsJSON, + skipReasonsJSON, ) if err != nil { return fmt.Errorf("insert collector %q: %w", c.ID, err) @@ -392,7 +499,8 @@ SELECT COALESCE(iss.info, 0), COALESCE(iss.total, 0), COALESCE(cc.collector_count, 0), - a.triage_finding_count + a.triage_finding_count, + a.uptime_seconds FROM archives a LEFT JOIN ( SELECT archive_id, @@ -423,7 +531,7 @@ ORDER BY a.uploaded_at DESC` for rows.Next() { var a model.ArchiveSummary var uploadedAt string - var triage sql.NullInt64 + var triage, uptime sql.NullInt64 if err := rows.Scan( &a.ArchiveID, &a.Hostname, &a.GeneratedAt, &uploadedAt, &a.UploadedBy, &a.Status, &a.ErrorReason, @@ -431,7 +539,7 @@ ORDER BY a.uploaded_at DESC` &a.Platform.OS, &a.Platform.Kernel, &a.SchemaVersion, &a.IssueCounts.Critical, &a.IssueCounts.Warning, &a.IssueCounts.Info, &a.IssueCounts.Total, &a.CollectorCount, - &triage, + &triage, &uptime, ); err != nil { return nil, 0, 0, fmt.Errorf("scan archive row: %w", err) } @@ -440,6 +548,10 @@ ORDER BY a.uploaded_at DESC` n := int(triage.Int64) a.TriageFindingCount = &n } + if uptime.Valid { + v := uptime.Int64 + a.UptimeSeconds = &v + } items = append(items, a) } if err := rows.Err(); err != nil { @@ -472,23 +584,28 @@ func (s *Store) Get(archiveID string) (*model.ArchiveDetail, bool) { var storagePath, uploadedAt string var manifestJSON []byte + var uptime sql.NullInt64 err := s.db.QueryRow(`SELECT archive_id, storage_path, COALESCE(hostname, ''), COALESCE(generated_at, ''), uploaded_at, 
uploaded_by, status, error_reason, compressed_size, COALESCE(version, ''), COALESCE(commit_hash, ''), COALESCE(platform_os, ''), COALESCE(platform_kernel, ''), - COALESCE(schema_version, ''), manifest_json + COALESCE(schema_version, ''), manifest_json, uptime_seconds FROM archives WHERE archive_id = ?`, archiveID).Scan( &a.ArchiveID, &storagePath, &a.Hostname, &a.GeneratedAt, &uploadedAt, &a.UploadedBy, &a.Status, &a.ErrorReason, &a.StorageBytes, &a.Version, &a.Commit, &a.Platform.OS, &a.Platform.Kernel, &a.SchemaVersion, - &manifestJSON, + &manifestJSON, &uptime, ) if err != nil { return nil, false } a.UploadedAt, _ = parseTime(uploadedAt) + if uptime.Valid { + v := uptime.Int64 + a.UptimeSeconds = &v + } a.IssueCounts = s.issueCounts(archiveID) collectors := s.loadCollectors(archiveID) @@ -548,7 +665,8 @@ func (s *Store) issueCounts(archiveID string) model.IssueCounts { func (s *Store) loadCollectors(archiveID string) []model.CollectorRecord { rows, err := s.db.Query(`SELECT collector_id, status, duration_ms, - artifact_count, skipped_count, error_count, facts_json + artifact_count, skipped_count, error_count, facts_json, + skip_reasons_json FROM collectors WHERE archive_id = ? 
ORDER BY collector_id`, archiveID) if err != nil { return nil @@ -558,16 +676,20 @@ func (s *Store) loadCollectors(archiveID string) []model.CollectorRecord { var collectors []model.CollectorRecord for rows.Next() { var c model.CollectorRecord - var factsJSON *string + var factsJSON, skipReasonsJSON *string if err := rows.Scan( &c.ID, &c.Status, &c.DurationMS, &c.ArtifactCount, &c.SkippedCount, &c.ErrorCount, &factsJSON, + &skipReasonsJSON, ); err != nil { continue } if factsJSON != nil { _ = json.Unmarshal([]byte(*factsJSON), &c.Facts) } + if skipReasonsJSON != nil { + _ = json.Unmarshal([]byte(*skipReasonsJSON), &c.SkipReasons) + } collectors = append(collectors, c) } return collectors diff --git a/customers/vm-troubleshooting-dashboard/oauth2-proxy.cfg.example b/customers/vm-troubleshooting-dashboard/oauth2-proxy.cfg.example new file mode 100644 index 0000000..ab43ead --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/oauth2-proxy.cfg.example @@ -0,0 +1,50 @@ +# Copy to oauth2-proxy.cfg and fill in the placeholders. +# Secrets (client_id, client_secret, cookie_secret) come from .env via +# OAUTH2_PROXY_* env vars — do NOT add them here. +# +# See DEPLOYMENT.md §8 for the full walkthrough. + +# ---- Provider ---- +provider = "oidc" +provider_display_name = "Authentik" +oidc_issuer_url = "https://authentik.example.com/application/o/triage/" +redirect_url = "https://triage.ngbackend.cloud/oauth2/callback" +scope = "openid profile email groups offline_access" +code_challenge_method = "S256" +skip_provider_button = true +insecure_oidc_allow_unverified_email = false + +# ---- Listener & upstream ---- +http_address = "0.0.0.0:4180" +reverse_proxy = true +# Only trust X-Forwarded-* from the host TLS terminator and the Docker +# bridge gateway. 172.16.0.0/12 covers Docker's default bridge pool; +# tighten to your compose network gateway (see `docker network inspect +# triage_triage-internal`) once you know the exact CIDR. 
+trusted_proxy_ips = ["127.0.0.1/32", "172.16.0.0/12"] +upstreams = ["http://dashboard:8080"] + +# ---- Who is allowed in ---- +email_domains = ["*"] +oidc_groups_claim = "groups" +allowed_groups = ["CX-Team"] + +# ---- Headers to the Go dashboard ---- +pass_user_headers = true +pass_access_token = false +pass_authorization_header = false +set_xauthrequest = false + +# ---- Session / cookie hardening ---- +cookie_name = "_oauth2_proxy" +cookie_secure = true +cookie_httponly = true +cookie_samesite = "lax" +cookie_expire = "168h" +cookie_refresh = "1h" +cookie_domains = ["triage.ngbackend.cloud"] +whitelist_domains = ["triage.ngbackend.cloud"] + +# ---- Misc ---- +request_logging = true +show_debug_on_error = false From 5a8fd0cd223426fbea81d4972c5bf8a025e9c44e Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 15:56:00 +0200 Subject: [PATCH 14/23] =?UTF-8?q?feat(vm-troubleshooting-dashboard):=20sys?= =?UTF-8?q?tem-log=20digestibility=20=E2=80=94=20structured=20component=20?= =?UTF-8?q?+=20chip=20breakdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under the "System Logs" category, dozens of runtime errors collapsed into an indistinguishable list (WSL, dxgk, LSB, udev, NVIDIA, arbitrary daemons). Users had no way to pivot. This change surfaces structure: every journal row carries a component sub-label (SYSLOG_IDENTIFIER, authoritative), the Source column renders as primary + muted secondary (two lines, not a run-on string), and expanded groups get a chip-row breakdown users can filter by. Shape before detail. Backend (Go) - IssueRecord.Component (optional, omitempty) + nullable `component` column on issues table with an additive migration that also backfills existing archives on startup (mirrors the established triage_finding_count pattern; WHERE component IS NULL for re-runnability; per-archive error boundary so a missing/corrupt tarball cannot block upgrade). 
- New store/journal_component.go: extensible SourceHandler slice indexes logs/journal_errors.ndjson by normalized message → SYSLOG_IDENTIFIER (falls back to _SYSTEMD_UNIT minus .service). Majority-wins resolution across all TriageFinding evidence lines; ties return empty (frontend fallback takes over). Tactical duplication of normalizeCriticalLine from the collector (two Go modules, internal/ boundary) — release-blocking TestNormalizeParity guards drift; delete this file when the collector schema promotes the identifier upstream. - Ingest calls EnrichIssues in buildDetail(); LoadTriageMap exported so the backfill and ingest paths share it. Frontend (TypeScript) - New lib/log-component.ts owns two shared helpers — isLogSourced() gates the secondary line to log-sourced issues only (ERR/KERN or .ndjson / dmesg.txt / logs/* artifact paths) and effectiveComponent() enforces a single precedence chain (structured → ruleset → null). Both source.ts and breakdown.ts import from here so the Source cell and chips can never disagree. - lib/source.ts now returns { primary, secondary? } with a compareSourceLabels helper; old `A · B` concatenation gone. - lib/component.ts shrunk to a bounded ordered ruleset (WSL, dxgk, NVIDIA, LSB, udev + syslog fallback) with a hard "one rule per family" policy in the header and a documented success condition ("delete this file when the structured path covers the tail"). \b-anchored WSL/LSB rules tolerate leading prefix tokens. - New lib/breakdown.ts: pure buildComponentBreakdown + shouldShowBreakdown helpers. Count-desc sort with alphabetical tiebreak, 6-chip cap with +N-more overflow, "other" bucket for ruleset-miss so counts always sum to the group total. overflowAriaLabel names every hidden entry for screen readers. 
- IssuesPage: SourceCell component for the two-line cell (three call sites kept DRY); chip row above expanded members with aria-pressed toggle filtering; "other" chip rendered dashed + italic so it reads as second-class; parent group row's secondary is suppressed when the chip row will enumerate multiple components (prevents showing sample's component as if it spoke for the group). - DashboardPage TopIssues: subline now primary + muted-opacity secondary inline, not a compound string. - IssueDetailPage: Category / Confidence / Severity rows removed from the METADATA sidebar (already present as pills next to the title — sidebar keeps Code / Collector / Fingerprint only). - sampleLine is null-safe. Tests - Backend: +3 test files of coverage. TestNormalizeParity exhaustively pins every regex in the copy to the collector; lookup/majority/tie/fallback and malformed-NDJSON cases; store migration + backfill round-trip; integration ingest asserts structured Component on a journal row and empty on a dmesg row in the same archive. Go test: 84 passed. - Frontend: 33 new cases across log-component, breakdown, component (6 new family rules + a prefixed-WSL regression), source (scoping gate regression), and sampleLine null safety. Vitest: 120 passed. Explicitly not in this change - No collector / archive-schema changes. The structured-component path is implemented at the dashboard ingest boundary. - No groupKey / fingerprint-semantics changes. Chips are an in-expansion pivot only. - Heuristic auto-inference (TF-IDF, clustering) deliberately rejected — systemd's SYSLOG_IDENTIFIER is the auto-detection; the ruleset is a bounded fallback for the non-journal tail. - Track B polish (OVS facts, TopIssues pill alignment, stat-card hierarchy, fingerprint copy, group counter wording) — deferred to its own PR. 
--- .../components/dashboard/DashboardPage.tsx | 10 +- .../issue-detail/IssueDetailPage.tsx | 12 +- .../src/components/issues/IssuesPage.tsx | 142 ++++++++- .../frontend/src/lib/breakdown.test.ts | 123 ++++++++ .../frontend/src/lib/breakdown.ts | 73 +++++ .../frontend/src/lib/component.test.ts | 46 ++- .../frontend/src/lib/component.ts | 59 ++-- .../frontend/src/lib/log-component.test.ts | 72 +++++ .../frontend/src/lib/log-component.ts | 51 ++++ .../frontend/src/lib/source.test.ts | 65 ++-- .../frontend/src/lib/source.ts | 60 +++- .../frontend/src/lib/utils.test.ts | 12 + .../frontend/src/lib/utils.ts | 6 +- .../frontend/src/types.ts | 7 + .../internal/ingest/ingest.go | 7 + .../internal/ingest/ingest_test.go | 95 ++++++ .../internal/model/types.go | 6 + .../internal/store/journal_component.go | 265 ++++++++++++++++ .../internal/store/journal_component_test.go | 282 ++++++++++++++++++ .../internal/store/schema.sql | 3 +- .../internal/store/store.go | 193 +++++++++++- .../internal/store/store_test.go | 189 ++++++++++++ 22 files changed, 1699 insertions(+), 79 deletions(-) create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/breakdown.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/breakdown.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.test.ts create mode 100644 customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.ts create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/journal_component.go create mode 100644 customers/vm-troubleshooting-dashboard/internal/store/journal_component_test.go diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx index 5a4c3e2..2661572 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx +++ 
b/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx @@ -339,6 +339,7 @@ function TopIssueRowBody({ const sub = hasRealTitle ? sampleLine(group.title, group.sample.message) : "" + const source = sourceLabel(group.sample) return ( <>

@@ -349,9 +350,12 @@ function TopIssueRowBody({ ) : null}

-

- {sourceLabel(group.sample)} - {sub ? ` · ${sub}` : ""} +

+ {source.primary} + {source.secondary ? ( + {` · ${source.secondary}`} + ) : null} + {sub ? {` · ${sub}`} : null}

) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index b326e7f..f0611f7 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -342,14 +342,16 @@ export function IssueDetailPage() {

Metadata

+ {/* + Category / Confidence / Severity intentionally omitted from + this sidebar — they already render as pills next to the + issue title (see the header row). Keeping them here would + duplicate the same three fields a third time, against the + "shape before detail" polish pass. + */}
- - {issue.issue_fingerprint ? ( ) : ( diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx index 25f7305..a6d69c0 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx @@ -24,7 +24,14 @@ import { compareIssues } from "@/lib/ranking" import { groupIssues, type IssueGroup } from "@/lib/grouping" import { decoratedTitle } from "@/lib/title" import { bootEpochMs, isBootTimeIssue } from "@/lib/boot" -import { sourceLabel } from "@/lib/source" +import { compareSourceLabels, sourceLabel, type SourceLabel } from "@/lib/source" +import { effectiveComponent } from "@/lib/log-component" +import { + buildComponentBreakdown, + shouldShowBreakdown, + overflowAriaLabel, + OTHER_KEY, +} from "@/lib/breakdown" import { Search, X, @@ -100,6 +107,37 @@ function IssueRowBody({ ) } +/** + * Two-line Source cell: category on the primary line, component below it + * in smaller muted text. Renders a single line when the issue is not + * log-sourced (see lib/log-component.ts `isLogSourced`). + */ +function SourceCell({ label, compact = false }: { label: SourceLabel; compact?: boolean }) { + const title = label.secondary ? `${label.primary} · ${label.secondary}` : label.primary + return ( +
+
+ {label.primary} +
+ {label.secondary ? ( +
+ {label.secondary} +
+ ) : null} +
+ ) +} + export function IssuesPage() { const { archiveId } = useParams<{ archiveId: string }>() const [searchParams] = useSearchParams() @@ -128,6 +166,19 @@ export function IssuesPage() { const [expandedGroups, setExpandedGroups] = useState>( () => new Set(), ) + // Component chip filter, keyed by group.key → active component key (or + // undefined for "no filter"). Toggling the same chip clears the filter. + const [componentFilter, setComponentFilter] = useState>( + {}, + ) + const toggleComponentFilter = (groupKey: string, key: string) => { + setComponentFilter((prev) => { + const next = { ...prev } + if (next[groupKey] === key) delete next[groupKey] + else next[groupKey] = key + return next + }) + } const { data: archive, isLoading: archiveLoading } = useArchive(archiveId!) const { @@ -338,7 +389,7 @@ export function IssuesPage() { if (sortField === "confidence") { cmp = confidenceBadge(a.confidence).rank - confidenceBadge(b.confidence).rank } else { - cmp = sourceLabel(a.sample).localeCompare(sourceLabel(b.sample)) + cmp = compareSourceLabels(sourceLabel(a.sample), sourceLabel(b.sample)) } return cmp * dir }) @@ -398,8 +449,8 @@ export function IssuesPage() { - - {sourceLabel(issue)} + + @@ -435,6 +486,15 @@ export function IssuesPage() { const displayTitle = hasRealTitle ? decoratedTitle(group.title, group.sample.message) : group.title + // When the chip row below is going to enumerate distinct components, + // the parent's Source cell must NOT imply a single component — that + // would silently show the `group.sample`'s component as if it spoke + // for the whole group. Suppress the secondary here; the chip row is + // the authoritative breakdown. + const multiComponent = !singleton && shouldShowBreakdown(group.members) + const groupLabel: SourceLabel = multiComponent + ? 
{ primary: sourceLabel(group.sample).primary } + : sourceLabel(group.sample) rows.push( - - {sourceLabel(group.sample)} + + @@ -484,7 +544,71 @@ export function IssuesPage() { ) if (expanded && !singleton) { - for (const member of group.members) { + const activeKey = componentFilter[group.key] + if (shouldShowBreakdown(group.members)) { + const { chips, overflow } = buildComponentBreakdown(group.members) + rows.push( + + + + +
+ Components: + {chips.map((c) => { + const active = activeKey === c.key + const isOther = c.key === OTHER_KEY + return ( + + ) + })} + {overflow.length > 0 ? ( + + +{overflow.length} more + + ) : null} +
+
+
, + ) + } + const visibleMembers = activeKey + ? group.members.filter( + (m) => (effectiveComponent(m) ?? OTHER_KEY) === activeKey, + ) + : group.members + for (const member of visibleMembers) { const memberConf = confidenceBadge(member.confidence) const memberTitle = primaryFindingTitle(member.triage_findings) rows.push( @@ -502,8 +626,8 @@ export function IssuesPage() { > - - {sourceLabel(member)} + + ): IssueRecord { + return { + id: "id", + collector: "c", + code: "x", + severity: "info", + confidence: "high", + category: "ERR", + message: "", + ...partial, + } +} + +describe("shouldShowBreakdown", () => { + it("returns false when every member resolves to the same component", () => { + const members = [ + issue({ component: "NetworkManager" }), + issue({ component: "NetworkManager" }), + ] + expect(shouldShowBreakdown(members)).toBe(false) + }) + + it("returns true when ≥2 distinct components are present", () => { + const members = [issue({ component: "NetworkManager" }), issue({ component: "sshd" })] + expect(shouldShowBreakdown(members)).toBe(true) + }) + + it("counts 'other' as a distinct component (so structured + unclassified → true)", () => { + const members = [ + issue({ component: "sshd" }), + issue({ message: "nothing matchable here" }), + ] + expect(shouldShowBreakdown(members)).toBe(true) + }) + + it("returns false for an empty list", () => { + expect(shouldShowBreakdown([])).toBe(false) + }) +}) + +describe("buildComponentBreakdown", () => { + it("groups balanced counts and sorts alphabetically on tie", () => { + const { chips, overflow } = buildComponentBreakdown( + [issue({ component: "sshd" }), issue({ component: "NetworkManager" })], + 6, + ) + expect(overflow).toEqual([]) + expect(chips).toEqual([ + { key: "NetworkManager", count: 1 }, + { key: "sshd", count: 1 }, + ]) + }) + + it("sorts by count descending, then alphabetically (case-insensitive via localeCompare)", () => { + const { chips } = buildComponentBreakdown([ + issue({ component: 
"sshd" }), + issue({ component: "sshd" }), + issue({ component: "NetworkManager" }), + issue({ component: "cron" }), + ]) + expect(chips).toEqual([ + { key: "sshd", count: 2 }, + { key: "cron", count: 1 }, + { key: "NetworkManager", count: 1 }, + ]) + }) + + it("caps at cap chips and moves the rest into overflow", () => { + const members = [ + issue({ component: "a" }), + issue({ component: "b" }), + issue({ component: "c" }), + issue({ component: "d" }), + issue({ component: "e" }), + issue({ component: "f" }), + issue({ component: "g" }), + ] + const { chips, overflow } = buildComponentBreakdown(members, 6) + expect(chips.length).toBe(6) + expect(overflow.length).toBe(1) + expect(overflow[0].key).toBe("g") // a-f each count 1, tie → alphabetical; g is last + }) + + it("mixes structured, fallback, and 'other' buckets correctly", () => { + const { chips } = buildComponentBreakdown([ + issue({ component: "NetworkManager" }), // structured + issue({ message: "sshd[1234]: something" }), // ruleset → "sshd" + issue({ message: "no identifiable pattern here" }), // → other + ]) + const keys = chips.map(c => c.key).sort() + expect(keys).toEqual([OTHER_KEY, "NetworkManager", "sshd"].sort()) + }) + + it("returns empty chips and overflow for empty members", () => { + expect(buildComponentBreakdown([])).toEqual({ chips: [], overflow: [] }) + }) +}) + +describe("overflowAriaLabel", () => { + it("names every overflow entry with its count", () => { + const label = overflowAriaLabel([ + { key: "sshd", count: 2 }, + { key: "cron", count: 1 }, + { key: "NetworkManager", count: 1 }, + ]) + expect(label).toBe("3 more components: sshd (2), cron (1), NetworkManager (1)") + }) + + it("uses the singular form for a single overflow entry", () => { + expect(overflowAriaLabel([{ key: "rsyslogd", count: 4 }])).toBe( + "1 more component: rsyslogd (4)", + ) + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/breakdown.ts 
b/customers/vm-troubleshooting-dashboard/frontend/src/lib/breakdown.ts new file mode 100644 index 0000000..2b1af3e --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/breakdown.ts @@ -0,0 +1,73 @@ +/** + * Component breakdown helpers for expanded system-log groups. + * + * `buildComponentBreakdown` counts occurrences per effective component + * (backend-structured first, frontend ruleset second, "other" for misses + * so counts always sum to the group total), sorts descending by count + * with an alphabetical tiebreak, and splits beyond `cap` into an + * `overflow` list rendered as a single "+N more" chip. + * + * `shouldShowBreakdown` is true when ≥2 distinct effective components are + * present — which is the only case where the chip row adds signal. + * + * Both helpers must derive the key via `effectiveComponent` so the chips + * stay consistent with the Source cell on every row. + */ + +import type { IssueRecord } from "@/types" +import { effectiveComponent } from "./log-component" + +export interface BreakdownChip { + key: string + count: number +} + +export interface Breakdown { + chips: BreakdownChip[] + overflow: BreakdownChip[] +} + +export const OTHER_KEY = "other" +export const DEFAULT_CHIP_CAP = 6 + +function effectiveKey(issue: Pick): string { + return effectiveComponent(issue) ?? OTHER_KEY +} + +export function buildComponentBreakdown( + members: IssueRecord[], + cap: number = DEFAULT_CHIP_CAP, +): Breakdown { + const counts = new Map() + for (const m of members) { + const key = effectiveKey(m) + counts.set(key, (counts.get(key) ?? 
0) + 1) + } + const all: BreakdownChip[] = [...counts.entries()] + .map(([key, count]) => ({ key, count })) + .sort((a, b) => b.count - a.count || a.key.localeCompare(b.key)) + if (all.length <= cap) { + return { chips: all, overflow: [] } + } + return { chips: all.slice(0, cap), overflow: all.slice(cap) } +} + +export function shouldShowBreakdown(members: IssueRecord[]): boolean { + const seen = new Set() + for (const m of members) { + seen.add(effectiveKey(m)) + if (seen.size >= 2) return true + } + return false +} + +/** + * Build the accessible label for the "+N more" chip: a comma-separated + * list naming every overflow component with its count. Used both as + * `aria-label` (for screen readers and keyboard flows that don't hover) + * and as the native `title` attribute (pointer tooltip). + */ +export function overflowAriaLabel(overflow: BreakdownChip[]): string { + const parts = overflow.map(o => `${o.key} (${o.count})`).join(", ") + return `${overflow.length} more component${overflow.length === 1 ? 
"" : "s"}: ${parts}` +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts index 132685b..163f534 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.test.ts @@ -1,7 +1,7 @@ import { describe, it, expect } from "vitest" import { messageComponent } from "./component" -describe("messageComponent", () => { +describe("messageComponent — syslog-tag capture fallback", () => { it("extracts tag with PID after a dmesg-style timestamp", () => { expect( messageComponent( @@ -34,3 +34,47 @@ describe("messageComponent", () => { expect(messageComponent(null)).toBeNull() }) }) + +describe("messageComponent — bounded static rules", () => { + it("labels WSL launcher lines as WSL", () => { + expect( + messageComponent("WSL (1 - init(docker-desktop)) ERROR: ConfigApplyWindowsLibPath:2092: …"), + ).toBe("WSL") + expect(messageComponent("WSL (376) ERROR: CheckConnection: connect() failed: 101")).toBe("WSL") + }) + + it("matches WSL even with a leading token (timestamp or title prefix)", () => { + // Collector summaries sometimes prefix the raw line with the finding + // title or a bracketed timestamp. The WSL rule should still catch it. 
+ expect( + messageComponent("Error/Fail: WSL (1 - init(docker-desktop)) ERROR: …"), + ).toBe("WSL") + expect( + messageComponent("[Tue Dec 2 03:32:55 2025] WSL (376) ERROR: connect failed"), + ).toBe("WSL") + }) + + it("labels WSL DirectX/GPU paravirt lines as dxgk", () => { + expect( + messageComponent("misc dxg: dxgk: dxgkio_query_adapter_info: IoctI failed: -2"), + ).toBe("dxgk") + }) + + it("labels NVIDIA driver (NVRM) lines as NVIDIA", () => { + expect( + messageComponent("NVRM: Xid (PCI:0000:3b:00): 79, pid=1234, name=python3"), + ).toBe("NVIDIA") + }) + + it("labels LSB init-script failures as LSB", () => { + expect(messageComponent("Failed to start LSB: OpenIPMI Driver init script")).toBe("LSB") + }) + + it("labels udev-related lines as udev", () => { + expect( + messageComponent( + "internal error: Failed to get udev device for syspath '/sys/devices/virtual/dmi/id'", + ), + ).toBe("udev") + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts index ae75a74..1a7232b 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/component.ts @@ -1,25 +1,50 @@ /** - * Extracts the RFC3164 / RFC5424 "tag" (appname/program) from a syslog-style - * message. The tag names the originating component — useful when the - * category bucket is too coarse (e.g. 25 distinct runtime errors all - * landing under "System Logs" because they share the ERR category). + * FALLBACK ONLY — bounded ordered ruleset for messages the backend could + * not classify via SYSLOG_IDENTIFIER (see internal/store/journal_component.go + * on the Go side and IssueRecord.component). Callers should read + * `issue.component` first and only consult this helper on empty. * - * Matches: - * "[Tue Dec 2 03:32:55 2025] NetworkManager[1234]: DHCPv6 lease ..." 
→ "NetworkManager" - * "[Tue Dec 2 03:32:55 2025] kernel: ata1: SError: { ... }" → "kernel" - * "sshd[5678]: Failed password for root ..." → "sshd" - * "systemd-logind: Removed session 42." → "systemd-logind" + * Ownership / review policy — HARD RULE: + * One rule per family. A PR that adds a second rule for the same + * family (e.g. another `dxgk` variant, another WSL spelling) is a + * no-merge signal: the family belongs in the structured path upstream, + * not in this table. If the regex table starts growing, something + * upstream is wrong. * - * Returns `null` when the message doesn't carry an identifiable tag, so - * callers can gracefully fall back to the existing source label. + * Success condition (delete this file): + * When the collector's next schema revision promotes SYSLOG_IDENTIFIER + * to `TriageFinding.Component` / `ManifestIssue.Component`, the + * dashboard's structured path will cover the journal-sourced tail and + * only truly unstructured messages will miss. At that point this + * fallback becomes net-negative — delete the file and inline + * `effectiveComponent` to just return `issue.component ?? null`. */ -const TAG = "[A-Za-z][\\w.-]{0,31}" -const DMESG_PREFIX = /^\[[^\]]+\]\s+/ -const TAG_RE = new RegExp(`^(${TAG})(?:\\[\\d+\\])?:`) + +type Rule = + | { re: RegExp; label: string } // static: matches → fixed label + | { re: RegExp; capture: 1 } // capture: matches → group 1 + +// Ordered, most-specific-first. First match wins. +const RULES: readonly Rule[] = [ + // WSL launcher: "WSL (N - init(...)) ERROR:". Use \b not ^ so we still + // match when the message carries a leading token (timestamp, finding + // title prefix, etc.). 
+ { re: /\bWSL\s*\(/, label: "WSL" }, + { re: /(?:^|\s)(?:misc\s+)?dxg[kn]?(?:io_|:|\s)/i, label: "dxgk" }, + { re: /\bNVRM\b/, label: "NVIDIA" }, + { re: /\bFailed to start LSB:/, label: "LSB" }, + { re: /\budev\b|\/sys\/devices\//i, label: "udev" }, + // Syslog RFC3164/5424 tag — optional dmesg timestamp prefix + tag[pid]?: + { re: /^\[[^\]]+\]\s+([A-Za-z][\w.-]{0,31})(?:\[\d+\])?:/, capture: 1 }, + { re: /^([A-Za-z][\w.-]{0,31})(?:\[\d+\])?:/, capture: 1 }, +] export function messageComponent(message: string | undefined | null): string | null { if (!message) return null - const stripped = message.replace(DMESG_PREFIX, "") - const match = TAG_RE.exec(stripped) - return match ? match[1] : null + for (const rule of RULES) { + const m = rule.re.exec(message) + if (!m) continue + return "label" in rule ? rule.label : m[1] ?? null + } + return null } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.test.ts new file mode 100644 index 0000000..a6954a5 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.test.ts @@ -0,0 +1,72 @@ +import { describe, it, expect } from "vitest" +import { effectiveComponent, isLogSourced } from "./log-component" + +describe("isLogSourced", () => { + it("returns true for ERR category regardless of artifacts", () => { + expect(isLogSourced({ category: "ERR" })).toBe(true) + }) + + it("returns true for KERN category regardless of artifacts", () => { + expect(isLogSourced({ category: "KERN" })).toBe(true) + }) + + it("returns true when related_artifact_paths contains an .ndjson log", () => { + expect( + isLogSourced({ + category: "FUTURE", + related_artifact_paths: ["logs/journal_errors.ndjson"], + }), + ).toBe(true) + }) + + it("returns true when related_artifact_paths contains dmesg.txt", () => { + expect( + isLogSourced({ category: "FUTURE", related_artifact_paths: ["logs/dmesg.txt"] }), + 
).toBe(true) + }) + + it("returns true when suggested_artifact_paths starts with logs/", () => { + expect( + isLogSourced({ category: "FUTURE", suggested_artifact_paths: ["logs/custom.log"] }), + ).toBe(true) + }) + + it("returns false for non-log category with no log-like artifacts", () => { + expect( + isLogSourced({ + category: "FW", + related_artifact_paths: ["network/ufw_status.txt"], + }), + ).toBe(false) + }) + + it("returns false for non-log category with no artifacts at all", () => { + expect(isLogSourced({ category: "DISK" })).toBe(false) + }) +}) + +describe("effectiveComponent", () => { + it("prefers the structured component field when present", () => { + expect( + effectiveComponent({ + component: "NetworkManager", + message: "kernel: something that the regex would match", + }), + ).toBe("NetworkManager") + }) + + it("falls back to the message ruleset when component is missing", () => { + expect(effectiveComponent({ message: "sshd[123]: Failed password" })).toBe("sshd") + }) + + it("returns null when both inputs produce nothing", () => { + expect(effectiveComponent({ message: "Firewall inactive (ufw)" })).toBeNull() + }) + + it("returns null when component is empty-string (treated as null)", () => { + // Backend emits omitempty; empty strings should not reach here, but be + // defensive: `"" ?? fallback` in TS evaluates to "" (not fallback), so + // we document that `undefined` is the intended "not present" value. 
+ expect(effectiveComponent({ component: undefined, message: "no tag here" })).toBeNull() + }) +}) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.ts new file mode 100644 index 0000000..099a2cc --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/log-component.ts @@ -0,0 +1,51 @@ +/** + * Shared helpers for deciding whether an issue is eligible to carry a + * component sub-label, and for resolving which component to surface. + * + * Both the Source cell (lib/source.ts) and the breakdown chips + * (lib/breakdown.ts) import from here so they can never disagree about + * (a) which issues should show a component at all, and (b) what that + * component should be. + */ + +import type { IssueRecord } from "@/types" +import { messageComponent } from "./component" + +/** + * True when the issue's data shape is compatible with decorating it by + * component — i.e. it really did come from a log artifact. Prevents the + * fallback ruleset in component.ts from sprouting secondary labels on + * unrelated rows (e.g. a Firewall row whose message happens to mention + * "udev" should not gain a muted "udev" line). + */ +export function isLogSourced( + issue: Pick, +): boolean { + if (issue.category === "ERR" || issue.category === "KERN") return true + const paths = [ + ...(issue.related_artifact_paths ?? []), + ...(issue.suggested_artifact_paths ?? []), + ] + return paths.some( + p => + p.endsWith(".ndjson") || + p.endsWith("dmesg.txt") || + p.startsWith("logs/"), + ) +} + +/** + * Single precedence for "what component does this issue belong to": + * 1. Backend-structured `issue.component` (from SYSLOG_IDENTIFIER) + * 2. Frontend bounded ruleset applied to the message + * 3. null + * + * Callers must use this helper rather than duplicating the chain inline — + * the consistency between the Source cell and the breakdown chips depends + * on it. 
+ */ +export function effectiveComponent( + issue: Pick, +): string | null { + return issue.component ?? messageComponent(issue.message) ?? null +} diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts index 1767a42..9c33bd4 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.test.ts @@ -2,29 +2,60 @@ import { describe, it, expect } from "vitest" import { sourceLabel } from "./source" describe("sourceLabel", () => { - it("returns just the category when no syslog tag is present in the message", () => { - expect(sourceLabel({ category: "HW", message: "pcieport 0000:4c:10.0: ..." })).toBe( - "Hardware", - ) + it("returns only the primary when the issue is not log-sourced", () => { + expect( + sourceLabel({ category: "HW", message: "pcieport 0000:4c:10.0: ..." }), + ).toEqual({ primary: "Hardware" }) + }) + + it("returns primary + secondary for log-sourced rows with a matching message", () => { + expect( + sourceLabel({ + category: "ERR", + message: "NetworkManager[1234]: DHCP lease expired", + }), + ).toEqual({ primary: "System Logs", secondary: "NetworkManager" }) + }) + + it("prefers the structured component over the message ruleset", () => { + expect( + sourceLabel({ + category: "ERR", + message: "kernel: something the regex would match", + component: "NetworkManager", + }), + ).toEqual({ primary: "System Logs", secondary: "NetworkManager" }) + }) + + it("gate prevents non-log categories from sprouting a secondary line", () => { + // Firewall row whose message happens to contain "ufw:" — must NOT grow a + // muted secondary from the fallback ruleset. This is the regression test + // for the 9.6 gate. 
+ expect( + sourceLabel({ category: "FW", message: "ufw: blocked 10.0.0.1" }), + ).toEqual({ primary: "Firewall" }) }) - it("appends a syslog tag when it adds information", () => { + it("drops a secondary that would just repeat the primary (case-insensitive)", () => { expect( - sourceLabel({ category: "ERR", message: "NetworkManager[1234]: DHCP lease expired" }), - ).toBe("System Logs · NetworkManager") + sourceLabel({ category: "TIMEOUT", message: "Timeout: [Tue ...] pci ..." }), + ).toEqual({ primary: "Timeout" }) }) - it("drops a tag that would just repeat the category (case-insensitive)", () => { - // TIMEOUT-category messages start with "Timeout:"; the extracted tag is - // "Timeout", same as the category label — appending it is noise. - expect(sourceLabel({ category: "TIMEOUT", message: "Timeout: [Tue ...] pci ..." })).toBe( - "Timeout", - ) + it("passes unknown log-sourced categories through (forward-compat)", () => { + // Unknown category but has a log-like artifact → still log-sourced. 
+ expect( + sourceLabel({ + category: "FUTURE_BUCKET", + message: "kernel: something", + related_artifact_paths: ["logs/custom.ndjson"], + }), + ).toEqual({ primary: "FUTURE_BUCKET", secondary: "kernel" }) }) - it("passes unknown categories through (forward-compat)", () => { - expect(sourceLabel({ category: "FUTURE_BUCKET", message: "kernel: something" })).toBe( - "FUTURE_BUCKET · kernel", - ) + it("returns only primary when the ruleset misses and no structured component is present", () => { + expect( + sourceLabel({ category: "ERR", message: "Firewall inactive (ufw)" }), + ).toEqual({ primary: "System Logs" }) }) }) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts index e9e57fd..2e812d9 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/source.ts @@ -1,21 +1,51 @@ import type { IssueRecord } from "@/types" import { categoryLabel } from "./utils" -import { messageComponent } from "./component" +import { effectiveComponent, isLogSourced } from "./log-component" +export interface SourceLabel { + primary: string + secondary?: string +} + +/** + * Two-line source label for an issue: primary (category) + optional muted + * secondary (component). The secondary line is only returned for + * log-sourced issues — see `isLogSourced` in lib/log-component.ts — so the + * fallback ruleset cannot decorate non-log categories. + * + * Render with primary in normal text and secondary in muted, smaller text + * below it. This replaces the old run-on "A · B" single string so the + * eye parses the cell as two fields, not one sentence. + */ /** - * Human source label for an issue: category first, then a syslog-tag - * sub-label when the message carries one. Pairs friendly category rendering - * with component-level disambiguation so e.g. 
25 rows under "System Logs" - * split into "System Logs · NetworkManager", "· sshd", "· kernel", etc. + * Deterministic sort compare for `SourceLabel`: primary asc, then secondary + * asc (empty secondary sorts first). Use this when sorting table rows by + * the Source column, instead of stringifying and comparing — that avoids + * accidents like "System Logs" sorting next to "System Logs · X". */ -export function sourceLabel(issue: Pick): string { - const category = categoryLabel(issue.category) - const component = messageComponent(issue.message) - if (!component) return category - // Don't append a sub-label that just repeats the category (e.g. the - // TIMEOUT-category messages literally start with "Timeout:" — appending - // that tag gives "Timeout · Timeout", which is useless and overflows - // the column). - if (component.toLowerCase() === category.toLowerCase()) return category - return `${category} · ${component}` +export function compareSourceLabels(a: SourceLabel, b: SourceLabel): number { + return ( + a.primary.localeCompare(b.primary) || + (a.secondary ?? "").localeCompare(b.secondary ?? "") + ) +} + +export function sourceLabel( + issue: Pick< + IssueRecord, + | "category" + | "message" + | "component" + | "related_artifact_paths" + | "suggested_artifact_paths" + >, +): SourceLabel { + const primary = categoryLabel(issue.category) + if (!isLogSourced(issue)) return { primary } + const secondary = effectiveComponent(issue) ?? undefined + if (!secondary) return { primary } + // Don't append a secondary that just repeats the primary (e.g. Timeout / + // Timeout). This preserves the previous behaviour. 
+ if (secondary.toLowerCase() === primary.toLowerCase()) return { primary } + return { primary, secondary } } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts index 7b72caa..77af1ba 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.test.ts @@ -17,5 +17,17 @@ describe("sampleLine", () => { it("returns the message unchanged when the title is falsy", () => { expect(sampleLine("", "anything")).toBe("anything") expect(sampleLine(undefined, "anything")).toBe("anything") + expect(sampleLine(null, "anything")).toBe("anything") + }) + + it("returns an empty string when the message is missing", () => { + expect(sampleLine("Title", "")).toBe("") + expect(sampleLine("Title", undefined)).toBe("") + expect(sampleLine("Title", null)).toBe("") + expect(sampleLine(null, null)).toBe("") + }) + + it("trims leading whitespace after stripping the title prefix", () => { + expect(sampleLine("T", "T: padded")).toBe("padded") }) }) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts index ffe0e8e..223a2fa 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/lib/utils.ts @@ -44,7 +44,11 @@ export function shortFingerprint(value: string) { * Firewall inactive (ufw)" where the subline re-states what the title * already says. */ -export function sampleLine(title: string | undefined, message: string): string { +export function sampleLine( + title: string | null | undefined, + message: string | null | undefined, +): string { + if (!message) return "" if (!title) return message const prefix = `${title}:` return message.startsWith(prefix) ? 
message.slice(prefix.length).trimStart() : message diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/types.ts b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts index 41683cd..ec0ed54 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/types.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/types.ts @@ -81,6 +81,13 @@ export type IssueRecord = { suggested_artifact_paths?: string[] unresolved_artifact_paths?: string[] triage_findings?: TriageFinding[] + /** + * Structured component label (e.g. "NetworkManager", "sshd") derived at + * ingest from SYSLOG_IDENTIFIER in logs/journal_errors.ndjson. Empty when + * the issue is not journal-sourced or lookup failed — the UI falls back + * to the bounded ruleset in lib/component.ts via lib/log-component.ts. + */ + component?: string } export type ArtifactRecord = { diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go index 3dbd2c4..b02fc0f 100644 --- a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest.go @@ -157,6 +157,13 @@ func buildDetail(workDir string, manifest model.Manifest, uploadedBy string, com } } + // Enrich IssueRecord.Component from structured logs (currently + // SYSLOG_IDENTIFIER in journal_errors.ndjson). Requires triage findings + // because that's where source_artifact paths + per-line evidence live. 
+ if triageByFP := store.LoadTriageMap(workDir); len(triageByFP) > 0 { + issues = store.EnrichIssues(workDir, issues, triageByFP) + } + for _, artifact := range manifest.ArtifactIndex { artifacts = append(artifacts, model.ArtifactRecord{ Path: artifact.Path, diff --git a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go index 16a2a93..6917d31 100644 --- a/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go +++ b/customers/vm-troubleshooting-dashboard/internal/ingest/ingest_test.go @@ -215,6 +215,101 @@ func writeTarFile(t *testing.T, tw *tar.Writer, name string, data []byte) { } } +// TestIngestEnrichesComponentFromJournal ingests an archive that contains +// logs/journal_errors.ndjson and asserts the resulting IssueRecord carries +// a structured Component derived from SYSLOG_IDENTIFIER. +func TestIngestEnrichesComponentFromJournal(t *testing.T) { + root := t.TempDir() + st, err := store.New(root) + if err != nil { + t.Fatalf("store.New: %v", err) + } + fp := "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + fpDmesg := "cccccccccccccccccccccccccccccccc" + archivePath := createArchive(t, root, func(tw *tar.Writer) { + writeTarFile(t, tw, "manifest.json", mustJSON(t, model.Manifest{ + SchemaVersion: "3.1.0", + ArchiveID: "archive-enrich", + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Hostname: "test-host", + Platform: model.ManifestPlatform{OS: "Ubuntu 24.04", Kernel: "6.8.0"}, + ArtifactIndex: []model.ManifestArtifact{}, + Collectors: map[string]model.ManifestCollector{ + "critical-events": { + Status: "ok", + Issues: []model.ManifestIssue{ + { + Code: "critical_log", + Severity: "info", + Confidence: "low", + Category: "ERR", + Message: "Error/Fail (1x in logs/journal_errors.ndjson)", + IssueFingerprint: fp, + }, + { + Code: "critical_log", + Severity: "info", + Confidence: "low", + Category: "ERR", + Message: "some dmesg event", + IssueFingerprint: fpDmesg, 
+ }, + }, + }, + }, + })) + writeTarFile(t, tw, "logs/journal_errors.ndjson", []byte( + `{"MESSAGE":"DHCPv6 lease expired","SYSLOG_IDENTIFIER":"NetworkManager"}`+"\n", + )) + writeTarFile(t, tw, "triage/_data/critical_events.json", mustJSON(t, model.TriageEnvelope{ + Kind: "triage_result", + SchemaVersion: "3.1.0", + ArchiveID: "archive-enrich", + Analyzer: "critical-events", + Findings: []model.TriageFinding{ + { + Code: "critical_log", + Severity: "info", + Confidence: "low", + Category: "ERR", + Title: "Error/Fail", + Evidence: []string{"DHCPv6 lease expired"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + Fingerprint: fp, + }, + { + Code: "critical_log", + Severity: "info", + Confidence: "low", + Category: "ERR", + Title: "dmesg line", + Evidence: []string{"some dmesg event"}, + SourceArtifacts: []string{"logs/dmesg.txt"}, + Fingerprint: fpDmesg, + }, + }, + })) + }) + + if _, err := Ingest(context.Background(), st, archivePath, "tester@example.com", 0); err != nil { + t.Fatalf("Ingest: %v", err) + } + loaded, ok := st.Get("archive-enrich") + if !ok { + t.Fatalf("expected archive in store") + } + byFP := map[string]model.IssueRecord{} + for _, is := range loaded.Issues { + byFP[is.Fingerprint] = is + } + if got := byFP[fp].Component; got != "NetworkManager" { + t.Errorf("journal-sourced Component = %q, want NetworkManager", got) + } + if got := byFP[fpDmesg].Component; got != "" { + t.Errorf("dmesg-sourced Component = %q, want empty", got) + } +} + func mustJSON(t *testing.T, value any) []byte { t.Helper() data, err := json.Marshal(value) diff --git a/customers/vm-troubleshooting-dashboard/internal/model/types.go b/customers/vm-troubleshooting-dashboard/internal/model/types.go index 49882b3..8c594e1 100644 --- a/customers/vm-troubleshooting-dashboard/internal/model/types.go +++ b/customers/vm-troubleshooting-dashboard/internal/model/types.go @@ -120,6 +120,12 @@ type IssueRecord struct { UnresolvedArtifactPaths []string 
`json:"unresolved_artifact_paths,omitempty"` TriageFindings []TriageFinding `json:"triage_findings,omitempty"` Source string `json:"source,omitempty"` + // Component is a structured "what component produced this issue" label, + // derived at ingest from SYSLOG_IDENTIFIER in logs/journal_errors.ndjson + // (fallback to _SYSTEMD_UNIT). Empty when the issue is not journal-sourced + // or lookup failed; the frontend falls back to a bounded regex ruleset. + // See internal/store/journal_component.go for the enrichment pipeline. + Component string `json:"component,omitempty"` } type ArtifactRecord struct { diff --git a/customers/vm-troubleshooting-dashboard/internal/store/journal_component.go b/customers/vm-troubleshooting-dashboard/internal/store/journal_component.go new file mode 100644 index 0000000..69e7a54 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/store/journal_component.go @@ -0,0 +1,265 @@ +// Package-local enrichment of IssueRecord.Component from structured log +// artifacts (currently logs/journal_errors.ndjson). This code exists as +// tactical duplication — the normalizeCriticalLine function below is a +// verbatim copy of customers/vm-troubleshooting/internal/triage/critical.go. +// +// Why duplicated: the collector and dashboard are separate Go modules +// (github.com/NexGenCloud/vm-diagnostics vs github.com/NexGenCloud/diagnostic-dashboard) +// plus a monorepo internal/ boundary that forbids cross-project import. +// When the collector schema next revisits its contract, SYSLOG_IDENTIFIER +// should be promoted to TriageFinding.Component or ManifestIssue.Component +// and this whole file — enrichment, normalization, and parity tests — can +// be deleted. Track that deletion in the follow-up. +// +// Any change to the collector's normalizeCriticalLine MUST update the +// copy below AND the TestNormalizeParity table in journal_component_test.go +// in the same change — the test is release-blocking. 
+ +package store + +import ( + "bufio" + "encoding/json" + "io" + "os" + "path/filepath" + "regexp" + "strings" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" +) + +// SourceHandler describes how to index one kind of structured log artifact +// for component lookup. Add an entry to sourceHandlers to register a new +// structured source (e.g. container stdout NDJSON) without touching the +// core flow. +type SourceHandler struct { + Path string + Index func(io.Reader) (LookupIndex, error) +} + +// LookupIndex maps a normalized message to the set of identifiers +// observed on source lines that normalize to that key, with per-identifier +// occurrence counts. The count map enables majority-wins resolution when +// multiple source lines share a normalized form but carry different +// identifiers (rare, but defensive). +type LookupIndex map[string]map[string]int + +var sourceHandlers = []SourceHandler{ + {Path: "logs/journal_errors.ndjson", Index: buildJournalIndex}, +} + +// EnrichIssues populates IssueRecord.Component for log-sourced issues. +// storageDir is the directory containing the extracted archive +// (must contain logs/ and triage/_data/). triageByFP maps issue +// fingerprints to triage findings — load it via LoadTriageMap(storageDir). +// +// An issue is enriched iff it has a fingerprint AND at least one matching +// triage finding whose SourceArtifacts contains a registered handler path. +// Each finding's Evidence lines are normalized and looked up; identifier +// counts are summed across all hits. The winning component is the +// majority identifier; ties or zero hits leave Component empty. +// +// The returned slice is the same slice as the input (mutated in place). 
+func EnrichIssues( + storageDir string, + issues []model.IssueRecord, + triageByFP map[string][]model.TriageFinding, +) []model.IssueRecord { + if storageDir == "" || len(issues) == 0 || len(triageByFP) == 0 { + return issues + } + // Lazily build one LookupIndex per handler path — don't open files we + // don't need. nil means "tried and failed, don't retry". + indices := make(map[string]LookupIndex, len(sourceHandlers)) + loadIndex := func(h SourceHandler) LookupIndex { + if idx, ok := indices[h.Path]; ok { + return idx + } + idx := readIndex(filepath.Join(storageDir, h.Path), h.Index) + indices[h.Path] = idx + return idx + } + + for i := range issues { + fp := issues[i].Fingerprint + if fp == "" { + continue + } + findings := triageByFP[fp] + if len(findings) == 0 { + continue + } + if c := resolveComponent(findings, loadIndex); c != "" { + issues[i].Component = c + } + } + return issues +} + +func readIndex(path string, parse func(io.Reader) (LookupIndex, error)) LookupIndex { + f, err := os.Open(path) + if err != nil { + return nil + } + defer f.Close() + idx, err := parse(f) + if err != nil { + // Best-effort: a partial index (some lines parsed before the error) + // is still better than nothing. parse returns what it built. + return idx + } + return idx +} + +// resolveComponent aggregates identifier hits across every finding and +// every handler matching the finding's SourceArtifacts. Majority wins; +// tie (or zero hits) returns "". 
+func resolveComponent( + findings []model.TriageFinding, + loadIndex func(SourceHandler) LookupIndex, +) string { + counts := make(map[string]int) + for _, f := range findings { + for _, h := range sourceHandlers { + if !sliceContains(f.SourceArtifacts, h.Path) { + continue + } + idx := loadIndex(h) + if idx == nil { + continue + } + for _, ev := range f.Evidence { + hits := idx[normalizeCriticalLine(ev)] + for ident, c := range hits { + counts[ident] += c + } + } + } + } + return majorityWin(counts) +} + +func majorityWin(counts map[string]int) string { + best := "" + bestCount := 0 + tie := false + for ident, c := range counts { + switch { + case c > bestCount: + best = ident + bestCount = c + tie = false + case c == bestCount && ident != best: + tie = true + } + } + if tie || bestCount == 0 { + return "" + } + return best +} + +// buildJournalIndex parses logs/journal_errors.ndjson. Each record's MESSAGE +// is normalized and indexed to its SYSLOG_IDENTIFIER (or _SYSTEMD_UNIT +// minus ".service" suffix as fallback). Malformed lines are skipped; +// a scan error is returned alongside the partial index. +func buildJournalIndex(r io.Reader) (LookupIndex, error) { + idx := make(LookupIndex) + scanner := bufio.NewScanner(r) + // Journal MESSAGEs can be long. Match the collector's tolerance. 
+ scanner.Buffer(make([]byte, 0, 64*1024), 16*1024*1024) + var rec struct { + Message string `json:"MESSAGE"` + SyslogIdentifier string `json:"SYSLOG_IDENTIFIER"` + SystemdUnit string `json:"_SYSTEMD_UNIT"` + } + for scanner.Scan() { + line := scanner.Bytes() + if len(line) == 0 { + continue + } + rec = struct { + Message string `json:"MESSAGE"` + SyslogIdentifier string `json:"SYSLOG_IDENTIFIER"` + SystemdUnit string `json:"_SYSTEMD_UNIT"` + }{} + if err := json.Unmarshal(line, &rec); err != nil { + continue // skip malformed + } + if rec.Message == "" { + continue + } + ident := rec.SyslogIdentifier + if ident == "" { + ident = strings.TrimSuffix(rec.SystemdUnit, ".service") + } + if ident == "" { + continue + } + key := normalizeCriticalLine(rec.Message) + if key == "" { + continue + } + bucket := idx[key] + if bucket == nil { + bucket = make(map[string]int) + idx[key] = bucket + } + bucket[ident]++ + } + return idx, scanner.Err() +} + +func sliceContains(haystack []string, needle string) bool { + for _, s := range haystack { + if s == needle { + return true + } + } + return false +} + +// -- Normalization (tactical duplicate of collector's normalizeCriticalLine) -- +// Source of truth: customers/vm-troubleshooting/internal/triage/critical.go +// Any edit here MUST be paired with an edit to the collector and to +// TestNormalizeParity in journal_component_test.go in the same change. 
+ +var ( + pidBracketRe = regexp.MustCompile(`\[\s*\d+\]`) + kernelTsRe = regexp.MustCompile(`\[\s*\d+\.\d+\]`) + dmesgHumanTsRe = regexp.MustCompile(`\[(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}\]`) + handlerRe = regexp.MustCompile(`\(handler\d+\)`) + ipPortRe = regexp.MustCompile(`from \d+\.\d+\.\d+\.\d+ port \d+`) + pciBDFNormRe = regexp.MustCompile(`\b[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]\b`) + issuedMsecRe = regexp.MustCompile(`\bissued\s+\d+\s+msec\s+ago\b`) + explicitHexRe = regexp.MustCompile(`\b0x[0-9a-fA-F]+\b`) + bareHexTokenRe = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`) +) + +func isPCIHotplugLine(line string) bool { + lower := strings.ToLower(line) + return strings.Contains(lower, "pciehp") || + strings.Contains(lower, "data object exchange") +} + +func normalizeCriticalLine(line string) string { + n := pidBracketRe.ReplaceAllString(line, "[_]") + n = kernelTsRe.ReplaceAllString(n, "") + n = dmesgHumanTsRe.ReplaceAllString(n, "") + n = strings.TrimSpace(n) + n = handlerRe.ReplaceAllString(n, "(handler_)") + n = ipPortRe.ReplaceAllString(n, "_._._._:_") + if isPCIHotplugLine(n) { + n = pciBDFNormRe.ReplaceAllString(n, "BDF") + n = issuedMsecRe.ReplaceAllString(n, "issued _ msec ago") + } + n = explicitHexRe.ReplaceAllString(n, "0xHEX") + n = bareHexTokenRe.ReplaceAllStringFunc(n, func(token string) string { + if !strings.ContainsAny(strings.ToLower(token), "abcdef") { + return token + } + return "HEX" + }) + return n +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/journal_component_test.go b/customers/vm-troubleshooting-dashboard/internal/store/journal_component_test.go new file mode 100644 index 0000000..f93b0d1 --- /dev/null +++ b/customers/vm-troubleshooting-dashboard/internal/store/journal_component_test.go @@ -0,0 +1,282 @@ +package store + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/NexGenCloud/diagnostic-dashboard/internal/model" +) + +// 
TestNormalizeParity is release-blocking. It asserts that every regex in
+// the collector's customers/vm-troubleshooting/internal/triage/critical.go
+// normalizeCriticalLine is reproduced here. Any drift between the two
+// copies is a silent-miss failure mode (the dashboard's lookup will fail
+// for lines that the collector normalized differently). If the collector
+// changes its normalizer, update THIS test AND the function in
+// journal_component.go in the same change.
+func TestNormalizeParity(t *testing.T) {
+ t.Parallel()
+ cases := []struct {
+ name string
+ in string
+ want string
+ }{
+ // pidBracketRe: `[<pid>]` → `[_]`
+ {"pid_bracket", "sshd[1234]: Failed password", "sshd[_]: Failed password"},
+ {"pid_bracket_spaces", "daemon[ 42]: hi", "daemon[_]: hi"},
+
+ // kernelTsRe: `[12345.678]` (kernel timestamp) → stripped (to "")
+ // Trimming happens after; final form has no leading timestamp.
+ {"kernel_ts", "[ 12.345] kernel: hi", "kernel: hi"},
+
+ // dmesgHumanTsRe: `[Mon Feb 6 02:31:46 2026]` → stripped
+ {"dmesg_human_ts", "[Fri Feb 6 02:31:46 2026] kernel: boot", "kernel: boot"},
+
+ // handlerRe: `(handler<N>)` → `(handler_)`
+ {"handler_num", "ovs (handler42): msg", "ovs (handler_): msg"},
+
+ // ipPortRe: `from <ip> port <port>` → `_._._._:_`
+ {"ip_port", "sshd: Failed password from 10.0.0.1 port 22", "sshd: Failed password _._._._:_"},
+
+ // PCI hotplug family: BDF + issued-msec normalisation (gated by isPCIHotplugLine)
+ {
+ "pciehp_bdf",
+ "pciehp 0000:05:00.0: Slot(0): Link Down issued 12 msec ago",
+ "pciehp BDF: Slot(0): Link Down issued _ msec ago",
+ },
+ {
+ "doe_bdf",
+ "data object exchange on 0000:aa:bb.1 issued 5 msec ago",
+ "data object exchange on BDF issued _ msec ago",
+ },
+ // Non-hotplug line: BDF should NOT be normalized.
+ {"non_hotplug_bdf_kept", "pcieport 0000:05:00.0: fallen off the bus", "pcieport 0000:05:00.0: fallen off the bus"}, + + // explicitHexRe: `0x` → `0xHEX` + {"explicit_hex", "code 0xdeadbeef failed", "code 0xHEX failed"}, + + // bareHexTokenRe: 8+ hex chars WITH a-f letters → `HEX`. Digits-only stays. + {"bare_hex_token", "trace abcdef12 in log", "trace HEX in log"}, + {"bare_digits_kept", "counter 12345678 stays", "counter 12345678 stays"}, + + // TrimSpace: leading/trailing whitespace removed. + {"trim_space", " hello ", "hello"}, + + // Composite: multiple transforms in one line. + { + "composite", + "[1234567.890] NetworkManager[567]: from 10.0.0.1 port 22 trace deadbeef12 hex 0x1234", + "NetworkManager[_]: _._._._:_ trace HEX hex 0xHEX", + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := normalizeCriticalLine(tc.in) + if got != tc.want { + t.Errorf("normalizeCriticalLine(%q):\n got: %q\n want: %q", tc.in, got, tc.want) + } + }) + } +} + +func TestBuildJournalIndex_HitMissAndFallback(t *testing.T) { + t.Parallel() + ndjson := strings.Join([]string{ + // Hit: NetworkManager identifier via SYSLOG_IDENTIFIER + `{"MESSAGE":"DHCPv6 lease expired","SYSLOG_IDENTIFIER":"NetworkManager","_SYSTEMD_UNIT":"NetworkManager.service"}`, + // Hit: fallback to _SYSTEMD_UNIT minus ".service" + `{"MESSAGE":"unit entered failed state","SYSLOG_IDENTIFIER":"","_SYSTEMD_UNIT":"kubelet.service"}`, + // Skip: empty MESSAGE + `{"MESSAGE":"","SYSLOG_IDENTIFIER":"noise"}`, + // Skip: both identifiers empty + `{"MESSAGE":"no ident","SYSLOG_IDENTIFIER":"","_SYSTEMD_UNIT":""}`, + // Skip: malformed JSON — must not abort indexing of later lines + `{bad json`, + // Hit after malformed: sshd + `{"MESSAGE":"Failed password","SYSLOG_IDENTIFIER":"sshd"}`, + // Duplicate normalized key w/ different ident — tests count summing + `{"MESSAGE":"Failed password","SYSLOG_IDENTIFIER":"sshd"}`, + `{"MESSAGE":"Failed 
password","SYSLOG_IDENTIFIER":"sshd-session"}`, + }, "\n") + + idx, err := buildJournalIndex(strings.NewReader(ndjson)) + if err != nil { + t.Fatalf("buildJournalIndex: %v", err) + } + + get := func(rawMsg string) map[string]int { + return idx[normalizeCriticalLine(rawMsg)] + } + + if got := get("DHCPv6 lease expired"); got["NetworkManager"] != 1 { + t.Errorf("NetworkManager hit: got %v", got) + } + if got := get("unit entered failed state"); got["kubelet"] != 1 { + t.Errorf("_SYSTEMD_UNIT fallback: got %v", got) + } + if got := get("Failed password"); got["sshd"] != 2 || got["sshd-session"] != 1 { + t.Errorf("Failed password count map: got %v", got) + } + if _, ok := idx[normalizeCriticalLine("no ident")]; ok { + t.Errorf("empty-identifier line should not be indexed") + } +} + +func TestEnrichIssues_JournalHit(t *testing.T) { + t.Parallel() + dir := t.TempDir() + mustWriteFile(t, filepath.Join(dir, "logs", "journal_errors.ndjson"), strings.Join([]string{ + `{"MESSAGE":"DHCPv6 lease expired","SYSLOG_IDENTIFIER":"NetworkManager"}`, + `{"MESSAGE":"Failed password","SYSLOG_IDENTIFIER":"sshd"}`, + }, "\n")) + + triageByFP := map[string][]model.TriageFinding{ + "fp-nm": {{ + Evidence: []string{"DHCPv6 lease expired"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + }}, + } + issues := []model.IssueRecord{{ID: "1", Fingerprint: "fp-nm"}} + EnrichIssues(dir, issues, triageByFP) + if issues[0].Component != "NetworkManager" { + t.Errorf("expected Component=NetworkManager, got %q", issues[0].Component) + } +} + +func TestEnrichIssues_MajorityWins(t *testing.T) { + t.Parallel() + dir := t.TempDir() + // Two lines normalize to the same key but carry different identifiers. + // 2:1 majority for "winner". 
+ mustWriteFile(t, filepath.Join(dir, "logs", "journal_errors.ndjson"), strings.Join([]string{ + `{"MESSAGE":"shared message","SYSLOG_IDENTIFIER":"winner"}`, + `{"MESSAGE":"shared message","SYSLOG_IDENTIFIER":"winner"}`, + `{"MESSAGE":"shared message","SYSLOG_IDENTIFIER":"runner-up"}`, + }, "\n")) + triageByFP := map[string][]model.TriageFinding{ + "fp-shared": {{ + Evidence: []string{"shared message"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + }}, + } + issues := []model.IssueRecord{{ID: "1", Fingerprint: "fp-shared"}} + EnrichIssues(dir, issues, triageByFP) + if issues[0].Component != "winner" { + t.Errorf("expected majority 'winner', got %q", issues[0].Component) + } +} + +func TestEnrichIssues_Tie(t *testing.T) { + t.Parallel() + dir := t.TempDir() + mustWriteFile(t, filepath.Join(dir, "logs", "journal_errors.ndjson"), strings.Join([]string{ + `{"MESSAGE":"tied message","SYSLOG_IDENTIFIER":"alpha"}`, + `{"MESSAGE":"tied message","SYSLOG_IDENTIFIER":"bravo"}`, + }, "\n")) + triageByFP := map[string][]model.TriageFinding{ + "fp-tie": {{ + Evidence: []string{"tied message"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + }}, + } + issues := []model.IssueRecord{{ID: "1", Fingerprint: "fp-tie"}} + EnrichIssues(dir, issues, triageByFP) + if issues[0].Component != "" { + t.Errorf("expected empty on tie, got %q", issues[0].Component) + } +} + +func TestEnrichIssues_MalformedNDJSONToleratedPerLine(t *testing.T) { + t.Parallel() + dir := t.TempDir() + mustWriteFile(t, filepath.Join(dir, "logs", "journal_errors.ndjson"), strings.Join([]string{ + `{bad json`, + `{"MESSAGE":"good line","SYSLOG_IDENTIFIER":"goodly"}`, + ``, // blank line + }, "\n")) + triageByFP := map[string][]model.TriageFinding{ + "fp-good": {{ + Evidence: []string{"good line"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + }}, + } + issues := []model.IssueRecord{{ID: "1", Fingerprint: "fp-good"}} + EnrichIssues(dir, issues, triageByFP) + if 
issues[0].Component != "goodly" { + t.Errorf("malformed lines should not block indexing: got %q", issues[0].Component) + } +} + +func TestEnrichIssues_NoJournalNoComponent(t *testing.T) { + t.Parallel() + dir := t.TempDir() // no logs/journal_errors.ndjson + triageByFP := map[string][]model.TriageFinding{ + "fp-any": {{ + Evidence: []string{"anything"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + }}, + } + issues := []model.IssueRecord{{ID: "1", Fingerprint: "fp-any"}} + EnrichIssues(dir, issues, triageByFP) + if issues[0].Component != "" { + t.Errorf("expected empty Component when journal is missing, got %q", issues[0].Component) + } +} + +func TestEnrichIssues_OnlyJournalSourcedIssuesEnriched(t *testing.T) { + t.Parallel() + dir := t.TempDir() + mustWriteFile(t, filepath.Join(dir, "logs", "journal_errors.ndjson"), + `{"MESSAGE":"journal msg","SYSLOG_IDENTIFIER":"jident"}`, + ) + triageByFP := map[string][]model.TriageFinding{ + "fp-journal": {{ + Evidence: []string{"journal msg"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + }}, + "fp-dmesg": {{ + Evidence: []string{"some dmesg line"}, + SourceArtifacts: []string{"logs/dmesg.txt"}, + }}, + } + issues := []model.IssueRecord{ + {ID: "1", Fingerprint: "fp-journal"}, + {ID: "2", Fingerprint: "fp-dmesg"}, + } + EnrichIssues(dir, issues, triageByFP) + if issues[0].Component != "jident" { + t.Errorf("journal-sourced: got %q", issues[0].Component) + } + if issues[1].Component != "" { + t.Errorf("dmesg-sourced should stay empty: got %q", issues[1].Component) + } +} + +func TestEnrichIssues_NoFingerprintNoLookup(t *testing.T) { + t.Parallel() + dir := t.TempDir() + mustWriteFile(t, filepath.Join(dir, "logs", "journal_errors.ndjson"), + `{"MESSAGE":"anything","SYSLOG_IDENTIFIER":"whatever"}`, + ) + // triageByFP has no matching entries; no fingerprint on the issue either. 
+ issues := []model.IssueRecord{{ID: "1"}} + EnrichIssues(dir, issues, map[string][]model.TriageFinding{}) + if issues[0].Component != "" { + t.Errorf("expected empty Component, got %q", issues[0].Component) + } +} + +func mustWriteFile(t *testing.T, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} diff --git a/customers/vm-troubleshooting-dashboard/internal/store/schema.sql b/customers/vm-troubleshooting-dashboard/internal/store/schema.sql index 7a5a3e9..127647c 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/schema.sql +++ b/customers/vm-troubleshooting-dashboard/internal/store/schema.sql @@ -44,7 +44,8 @@ CREATE TABLE IF NOT EXISTS issues ( message TEXT NOT NULL, issue_fingerprint TEXT, related_artifacts_json TEXT, - unresolved_artifacts_json TEXT + unresolved_artifacts_json TEXT, + component TEXT ); CREATE TABLE IF NOT EXISTS issue_state ( diff --git a/customers/vm-troubleshooting-dashboard/internal/store/store.go b/customers/vm-troubleshooting-dashboard/internal/store/store.go index 2b83a0b..532b382 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/store.go +++ b/customers/vm-troubleshooting-dashboard/internal/store/store.go @@ -6,6 +6,7 @@ import ( "encoding/json" "errors" "fmt" + "log" "os" "path/filepath" "sort" @@ -71,12 +72,20 @@ func New(rootDir string) (*Store, error) { db.Close() return nil, err } + if err := migrateIssuesComponent(db); err != nil { + db.Close() + return nil, err + } st := &Store{db: db, rootDir: rootDir} if err := st.backfillTriageFindingCounts(); err != nil { db.Close() return nil, fmt.Errorf("backfill triage_finding_count: %w", err) } + if err := st.backfillIssueComponents(); err != nil { + db.Close() + return nil, fmt.Errorf("backfill issues.component: %w", err) + } return st, nil } @@ -178,6 +187,25 @@ func 
migrateCollectorsSkipReasons(db *sql.DB) error { return nil } +// migrateIssuesComponent adds the component column to existing issues tables. +// NULL means "not derived" — the frontend falls back to the bounded ruleset +// in frontend/src/lib/component.ts. Populated values come from +// SYSLOG_IDENTIFIER in logs/journal_errors.ndjson (see journal_component.go). +func migrateIssuesComponent(db *sql.DB) error { + var n int + err := db.QueryRow(`SELECT COUNT(*) FROM pragma_table_info('issues') WHERE name='component'`).Scan(&n) + if err != nil { + return fmt.Errorf("pragma issues: %w", err) + } + if n > 0 { + return nil + } + if _, err := db.Exec(`ALTER TABLE issues ADD COLUMN component TEXT`); err != nil { + return fmt.Errorf("add component: %w", err) + } + return nil +} + // backfillTriageFindingCounts populates triage_finding_count for archives // whose value is NULL. Rows are read fully into memory before any UPDATE is // issued, then updates run inside a single transaction. This avoids @@ -229,8 +257,144 @@ func (s *Store) backfillTriageFindingCounts() error { return tx.Commit() } +// backfillIssueComponents populates issues.component for archives whose +// storage directory is still on disk, using the same journal enrichment +// as the ingest path. Pre-read the candidate archive list fully before +// any writes (matching the backfillTriageFindingCounts pattern), then +// process each archive in its own transaction so a failure on one does +// not roll back the others. Per-archive failures are logged and skipped. +// +// The UPDATE uses `WHERE id = ? AND component IS NULL` so the backfill +// is re-runnable without re-processing already-populated rows. +func (s *Store) backfillIssueComponents() error { + type pending struct { + archiveID string + storagePath string + } + // Narrow to archives that still have at least one NULL component row. + // Archives whose issues are all populated (or have no issues) are skipped. 
+ rows, err := s.db.Query(` + SELECT a.archive_id, a.storage_path FROM archives a + WHERE EXISTS ( + SELECT 1 FROM issues i + WHERE i.archive_id = a.archive_id AND i.component IS NULL + )`) + if err != nil { + return err + } + var todo []pending + for rows.Next() { + var p pending + if err := rows.Scan(&p.archiveID, &p.storagePath); err != nil { + rows.Close() + return err + } + todo = append(todo, p) + } + if err := rows.Err(); err != nil { + rows.Close() + return err + } + rows.Close() + if len(todo) == 0 { + return nil + } + + for _, p := range todo { + if _, err := os.Stat(p.storagePath); err != nil { + log.Printf("backfill component: archive %s storage_path %q missing, skipping: %v", + p.archiveID, p.storagePath, err) + continue + } + if err := s.backfillOneArchiveComponents(p.archiveID, p.storagePath); err != nil { + // One archive's failure must not block the rest. + log.Printf("backfill component: archive %s failed: %v", p.archiveID, err) + } + } + return nil +} + +// backfillOneArchiveComponents runs the journal enrichment against a +// single archive and writes the resolved components in one transaction. +func (s *Store) backfillOneArchiveComponents(archiveID, storagePath string) error { + issues, err := s.loadIssuesForBackfill(archiveID) + if err != nil { + return fmt.Errorf("load issues: %w", err) + } + if len(issues) == 0 { + return nil + } + triageByFP := LoadTriageMap(storagePath) + EnrichIssues(storagePath, issues, triageByFP) + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() + stmt, err := tx.Prepare(`UPDATE issues SET component = ? WHERE id = ? 
AND component IS NULL`) + if err != nil { + return err + } + defer stmt.Close() + + updated := 0 + for _, issue := range issues { + if issue.Component == "" || issue.ID == "" { + continue + } + id, err := strconv.ParseInt(issue.ID, 10, 64) + if err != nil { + continue + } + res, err := stmt.Exec(issue.Component, id) + if err != nil { + return err + } + if n, _ := res.RowsAffected(); n > 0 { + updated++ + } + } + if err := tx.Commit(); err != nil { + return err + } + if updated > 0 { + log.Printf("backfill component: archive %s populated %d/%d issues", + archiveID, updated, len(issues)) + } + return nil +} + +// loadIssuesForBackfill returns a minimal IssueRecord slice for a single +// archive. Only the fields EnrichIssues consults (ID, Fingerprint) are +// populated; the returned records are NOT intended for general use. +func (s *Store) loadIssuesForBackfill(archiveID string) ([]model.IssueRecord, error) { + rows, err := s.db.Query( + `SELECT id, issue_fingerprint FROM issues WHERE archive_id = ? 
AND component IS NULL`, + archiveID, + ) + if err != nil { + return nil, err + } + defer rows.Close() + var issues []model.IssueRecord + for rows.Next() { + var id int64 + var fp *string + if err := rows.Scan(&id, &fp); err != nil { + return nil, err + } + rec := model.IssueRecord{ID: strconv.FormatInt(id, 10)} + if fp != nil { + rec.Fingerprint = *fp + } + issues = append(issues, rec) + } + return issues, rows.Err() +} + func countTriageFindingsOnDisk(storagePath string) int { - m := loadTriageMap(storagePath) + m := LoadTriageMap(storagePath) n := 0 for _, findings := range m { n += len(findings) @@ -391,14 +555,18 @@ func (s *Store) SaveBounded(detail *model.ArchiveDetail, maxArchives int) error if issue.Fingerprint != "" { fp = &issue.Fingerprint } + var component *string + if issue.Component != "" { + component = &issue.Component + } _, err := tx.Exec(`INSERT INTO issues ( archive_id, collector_id, code, severity, confidence, category, message, issue_fingerprint, - related_artifacts_json, unresolved_artifacts_json - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + related_artifacts_json, unresolved_artifacts_json, component + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, archiveID, issue.Collector, issue.Code, issue.Severity, issue.Confidence, issue.Category, issue.Message, fp, - relJSON, unresJSON, + relJSON, unresJSON, component, ) if err != nil { return fmt.Errorf("insert issue: %w", err) @@ -700,7 +868,7 @@ func (s *Store) loadIssues(archiveID, storagePath string) []model.IssueRecord { rows, err := s.db.Query(`SELECT id, collector_id, code, severity, confidence, category, message, issue_fingerprint, - related_artifacts_json, unresolved_artifacts_json + related_artifacts_json, unresolved_artifacts_json, component FROM issues WHERE archive_id = ? 
ORDER BY CASE severity WHEN 'critical' THEN 0 @@ -720,11 +888,11 @@ func (s *Store) loadIssues(archiveID, storagePath string) []model.IssueRecord { for rows.Next() { var issue model.IssueRecord var dbID int64 - var fp, relJSON, unresJSON *string + var fp, relJSON, unresJSON, component *string if err := rows.Scan( &dbID, &issue.Collector, &issue.Code, &issue.Severity, &issue.Confidence, &issue.Category, &issue.Message, &fp, - &relJSON, &unresJSON, + &relJSON, &unresJSON, &component, ); err != nil { continue } @@ -738,11 +906,14 @@ func (s *Store) loadIssues(archiveID, storagePath string) []model.IssueRecord { if unresJSON != nil { _ = json.Unmarshal([]byte(*unresJSON), &issue.UnresolvedArtifactPaths) } + if component != nil { + issue.Component = *component + } issue.Source = "manifest" if issue.Fingerprint != "" { if !triageLoaded { - triageByFP = loadTriageMap(storagePath) + triageByFP = LoadTriageMap(storagePath) triageLoaded = true } if matched := triageByFP[issue.Fingerprint]; len(matched) > 0 { @@ -772,10 +943,12 @@ func (s *Store) enrichIssueEvidence(issues []model.IssueRecord, artifacts []mode return issues } -// loadTriageMap reads triage findings from triage/_data/*.json on disk and +// LoadTriageMap reads triage findings from triage/_data/*.json on disk and // indexes them by fingerprint. Files are read in lexical order so multiple // matching findings have a deterministic post-condition. -func loadTriageMap(storagePath string) map[string][]model.TriageFinding { +// +// Exported so ingest and backfill paths can share it with the read path. 
+func LoadTriageMap(storagePath string) map[string][]model.TriageFinding { result := make(map[string][]model.TriageFinding) root := filepath.Join(storagePath, "triage", "_data") entries, err := os.ReadDir(root) diff --git a/customers/vm-troubleshooting-dashboard/internal/store/store_test.go b/customers/vm-troubleshooting-dashboard/internal/store/store_test.go index 30dffe2..10e37d3 100644 --- a/customers/vm-troubleshooting-dashboard/internal/store/store_test.go +++ b/customers/vm-troubleshooting-dashboard/internal/store/store_test.go @@ -31,6 +31,195 @@ func TestNewEnsuresTriageFindingCountColumn(t *testing.T) { } } +// TestBackfillIssueComponents covers 9.5: a pre-existing archive whose +// issues.component rows are NULL must be populated on the next Store.New() +// without any re-upload. Verifies that a missing storage_path does not +// abort the backfill, and that re-running is a no-op (WHERE component IS +// NULL predicate). +func TestBackfillIssueComponents(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + + archiveID := "archive-backfill" + archiveDir := st.ArchiveDir(archiveID) + fp := "dddddddddddddddddddddddddddddddd" + fpMissing := "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" + + // On-disk fixtures for the archive the backfill will re-open. 
+ mustWriteFile(t, filepath.Join(archiveDir, "logs", "journal_errors.ndjson"), + `{"MESSAGE":"DHCPv6 lease expired","SYSLOG_IDENTIFIER":"NetworkManager"}`, + ) + mustWriteFile(t, filepath.Join(archiveDir, "triage", "_data", "critical_events.json"), + string(mustMarshal(t, model.TriageEnvelope{ + Kind: "triage_result", + SchemaVersion: "3.1.0", + ArchiveID: archiveID, + Analyzer: "critical-events", + Findings: []model.TriageFinding{{ + Code: "critical_log", + Evidence: []string{"DHCPv6 lease expired"}, + SourceArtifacts: []string{"logs/journal_errors.ndjson"}, + Fingerprint: fp, + }}, + })), + ) + + detail := &model.ArchiveDetail{ + Summary: model.ArchiveSummary{ + ArchiveID: archiveID, + SchemaVersion: "3.1.0", + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Hostname: "host", + UploadedAt: time.Now().UTC(), + Status: "ready", + }, + Issues: []model.IssueRecord{ + {Collector: "critical-events", Code: "critical_log", Severity: "info", + Confidence: "low", Category: "ERR", Message: "journal hit", + Fingerprint: fp}, + {Collector: "critical-events", Code: "critical_log", Severity: "info", + Confidence: "low", Category: "ERR", Message: "no evidence", + Fingerprint: fpMissing}, + }, + StorageDir: archiveDir, + } + if err := st.Save(detail); err != nil { + t.Fatalf("Save: %v", err) + } + // Simulate pre-migration state: NULL every component that Save may have + // populated via the ingest-side call path. + if _, err := st.db.Exec(`UPDATE issues SET component = NULL WHERE archive_id = ?`, archiveID); err != nil { + t.Fatalf("null components: %v", err) + } + st.Close() + + // Reopen — this triggers backfillIssueComponents(). 
+ st2, err := New(root) + if err != nil { + t.Fatalf("New (reopen): %v", err) + } + defer st2.Close() + + got := componentsByFP(t, st2, archiveID) + if got[fp] != "NetworkManager" { + t.Errorf("after backfill, fp=%s Component=%q, want NetworkManager", fp, got[fp]) + } + if got[fpMissing] != "" { + t.Errorf("finding without lookup evidence should stay empty, got %q", got[fpMissing]) + } + + // Idempotency check: re-running the backfill must not overwrite. + if err := st2.backfillIssueComponents(); err != nil { + t.Fatalf("re-run backfill: %v", err) + } + got2 := componentsByFP(t, st2, archiveID) + if got2[fp] != "NetworkManager" { + t.Errorf("idempotent re-run changed value: %q", got2[fp]) + } +} + +// TestBackfillSkipsMissingStoragePath proves one archive's missing on-disk +// storage does not abort backfill for the others. +func TestBackfillSkipsMissingStoragePath(t *testing.T) { + t.Parallel() + root := t.TempDir() + st, err := New(root) + if err != nil { + t.Fatalf("New: %v", err) + } + + // Archive A: storage directory exists (but has no journal file → no + // enrichment hits, just proves the path is reachable). + archiveA := "archive-a" + dirA := st.ArchiveDir(archiveA) + if err := os.MkdirAll(dirA, 0o755); err != nil { + t.Fatal(err) + } + if err := st.Save(&model.ArchiveDetail{ + Summary: model.ArchiveSummary{ + ArchiveID: archiveA, SchemaVersion: "3.1.0", + UploadedAt: time.Now().UTC(), Status: "ready", + }, + Issues: []model.IssueRecord{{ + Collector: "x", Code: "c", Severity: "info", Confidence: "low", + Category: "ERR", Message: "m", Fingerprint: "fffffffffffffffffffffffffffffff0", + }}, + StorageDir: dirA, + }); err != nil { + t.Fatal(err) + } + + // Archive B: database row exists but storage directory never created. 
+ archiveB := "archive-b" + dirB := st.ArchiveDir(archiveB) + if err := st.Save(&model.ArchiveDetail{ + Summary: model.ArchiveSummary{ + ArchiveID: archiveB, SchemaVersion: "3.1.0", + UploadedAt: time.Now().UTC(), Status: "ready", + }, + Issues: []model.IssueRecord{{ + Collector: "x", Code: "c", Severity: "info", Confidence: "low", + Category: "ERR", Message: "m", Fingerprint: "fffffffffffffffffffffffffffffff1", + }}, + StorageDir: dirB, + }); err != nil { + t.Fatal(err) + } + // NULL all components and remove B's directory (dirB was never created). + if _, err := st.db.Exec(`UPDATE issues SET component = NULL`); err != nil { + t.Fatal(err) + } + st.Close() + + // Should not error even though archive-b's storage_path is absent. + st2, err := New(root) + if err != nil { + t.Fatalf("New (reopen): %v", err) + } + defer st2.Close() +} + +func componentsByFP(t *testing.T, st *Store, archiveID string) map[string]string { + t.Helper() + rows, err := st.db.Query( + `SELECT issue_fingerprint, component FROM issues WHERE archive_id = ?`, + archiveID, + ) + if err != nil { + t.Fatal(err) + } + defer rows.Close() + out := map[string]string{} + for rows.Next() { + var fp, comp *string + if err := rows.Scan(&fp, &comp); err != nil { + t.Fatal(err) + } + var f, c string + if fp != nil { + f = *fp + } + if comp != nil { + c = *comp + } + out[f] = c + } + return out +} + +func mustMarshal(t *testing.T, v any) []byte { + t.Helper() + b, err := json.Marshal(v) + if err != nil { + t.Fatal(err) + } + return b +} + func TestGetAddsFallbackEvidence(t *testing.T) { t.Parallel() From 7c488bd439b496df139a8312164fba988a7321a7 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:01:53 +0200 Subject: [PATCH 15/23] =?UTF-8?q?feat(vm-troubleshooting-dashboard):=20tra?= =?UTF-8?q?ck=20B=20polish=20=E2=80=94=20stat=20hierarchy,=20fingerprint?= =?UTF-8?q?=20copy=20affordance,=20finding/event=20wording?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Final polish pass paired with the Track A log-digestibility landing. B1 (OVS fact registry + dotted-key humaniser fallback) and B2 (TopIssues pill stacking into a fixed-width left column) were already in place before this commit; the items below close out the remaining gaps. B3 — Stat-card visual hierarchy. Bumped the Critical tile's number from text-2xl to text-3xl font-bold so it reads loudest; Warning stays text-2xl/semibold, Info/Total stay text-xl/medium + muted foreground. Tile dimensions are unchanged, so the four-card grid doesn't reflow. B4 — Fingerprint copy affordance (per review hardening). Separated the truncated hash from the copy action: the hash renders as static text, and a dedicated 6×6 icon button sits beside it with an aria-label ("Copy fingerprint" / "Fingerprint copied"), visible focus ring, and a Check/Copy icon swap on success. Users can no longer misread the hash as a navigable control. copyToClipboard (already extracted in lib/clipboard.ts) is reused from CopySummaryButton. B5 — Group counter wording. Renamed "(N patterns · M events)" to "(N findings · M events)" in both IssuesPage and DashboardPage TopIssues, since group.count = members.length (distinct findings, each with its own fingerprint) and group.occurrences sums the (Nx in ...) suffixes. "Findings" matches CX support vocabulary better than Sentry-style "patterns". Verification: 120 Vitest cases green; backend untouched; pnpm build clean.
--- .../components/dashboard/DashboardPage.tsx | 9 ++-- .../issue-detail/IssueDetailPage.tsx | 43 +++++++++++++------ .../src/components/issues/IssuesPage.tsx | 2 +- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx index 2661572..5643bec 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/dashboard/DashboardPage.tsx @@ -184,9 +184,12 @@ function StatCard({ textClass: string emphasis: "primary" | "secondary" | "muted" }) { + // Visual hierarchy: Critical reads loudest, Warning second, Info/Total + // recede. Tile dimensions stay identical across all four so the grid + // does not reflow — only the number typography shifts. const weight = emphasis === "primary" - ? "text-2xl font-bold" + ? "text-3xl font-bold" : emphasis === "secondary" ? "text-2xl font-semibold" : "text-xl font-medium" @@ -322,7 +325,7 @@ function factValueClass(signal: FactSignal): string { * Title + subline for a Top-Issues row. Mirrors the Issues-list fix: when * `group.title` is just the sample's message (no triage title available), * render the message once without decoration and without a duplicate - * subline. Also guarantees a space before the `(N patterns · M events)` + * subline. Also guarantees a space before the `(N findings · M events)` * counter using an explicit space instead of margin. */ function TopIssueRowBody({ @@ -346,7 +349,7 @@ function TopIssueRowBody({ {displayTitle} {!singleton ? ( - {" "}({group.count} patterns · {group.occurrences} events) + {" "}({group.count} findings · {group.occurrences} events) ) : null}

diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index f0611f7..1a86393 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -8,7 +8,7 @@ import { ArchiveHeader } from "@/components/layout/ArchiveHeader" import { useArchive } from "@/api/archives" import { useIssueDetail, useIssueStates, useSetIssueState } from "@/api/issues" import { KV } from "@/components/ui/kv" -import { shortFingerprint, encodeSegment, encodePath, sortFindings, sampleLine } from "@/lib/utils" +import { shortFingerprint, encodeSegment, encodePath, sortFindings, sampleLine, cn } from "@/lib/utils" import { groupIssues, groupKey } from "@/lib/grouping" import { occurrenceCount, rankIssues } from "@/lib/ranking" import { copyToClipboard } from "@/lib/clipboard" @@ -563,9 +563,10 @@ function uniquePaths(paths: string[]) { } /** - * KV row that renders a click-to-copy fingerprint. Shows the short form to - * save sidebar width; copies the full hash on click so a CX agent can paste - * it into a ticket or a grep without opening dev tools. + * KV row with a fingerprint and a dedicated copy button. The truncated + * hash is static text — not a button — so users do not misread it as + * a navigable link. The adjacent icon button is the only affordance + * for copying and carries an aria-label for screen readers. */ function FingerprintCopy({ fingerprint }: { fingerprint: string }) { const [copied, setCopied] = useState(false) @@ -580,15 +581,31 @@ function FingerprintCopy({ fingerprint }: { fingerprint: string }) {

Fingerprint

- +
+ + {shortFingerprint(fingerprint)} + + +
) } diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx index a6d69c0..717a26d 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issues/IssuesPage.tsx @@ -522,7 +522,7 @@ export function IssuesPage() { {displayTitle} {!singleton ? ( - {" "}({group.count} patterns · {group.occurrences} events) + {" "}({group.count} findings · {group.occurrences} events) ) : null} {allBootTime ? ( From 43131cb41cd19e38757053eaca813166b58da291 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:16:11 +0200 Subject: [PATCH 16/23] =?UTF-8?q?fix(vm-troubleshooting-dashboard):=20comp?= =?UTF-8?q?act-view=20readability=20=E2=80=94=20group=20title,=20member=20?= =?UTF-8?q?messages,=20fingerprint=20label?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three concrete issues surfaced while reviewing the Nautilus archive in compact / narrow-viewport screenshots. - Issues list: the group parent row was clipping "Error/Fail" to "Err..." because the inline count string "(N findings · M events)" shared a line-clamp-1 flex container with the title. Moved the count to its own muted second line — title never fights the counter for horizontal space, both are readable, and the overall row height is unchanged since the subline was already present for non-singleton groups. - Issues list: member-row messages under an expanded group were truncating at a common prefix (e.g. four rows all showed "misc dxg: dxgk:..." with no way to see -2 vs -22 vs -75). Bumped the compact subline to line-clamp-2 and the titleless fallback to line-clamp-3; non-compact views keep their original single-line clamp. 
- Issue detail METADATA: FingerprintCopy was rendering its label as uppercase tracking-wide while the adjacent KV rows (Collector, Code) used mixed-case text-xs. Aligned to the KV style so the three labels in the sidebar read as one group. --- .../issue-detail/IssueDetailPage.tsx | 9 ++++-- .../src/components/issues/IssuesPage.tsx | 30 +++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index 1a86393..db855f9 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -578,9 +578,12 @@ function FingerprintCopy({ fingerprint }: { fingerprint: string }) { } return (
-

- Fingerprint -

+ {/* + Match the rest of this METADATA sidebar (KV component style) — + mixed-case `text-xs text-muted-foreground`, NOT uppercase. Keeping + this label visually consistent with Collector / Code above it. + */} +

Fingerprint

{message} @@ -99,7 +102,16 @@ function IssueRowBody({ {decorated} {showSub ? ( - + {sub} ) : null} @@ -520,20 +532,20 @@ export function IssuesPage() { ) ) : null} {displayTitle} - {!singleton ? ( - - {" "}({group.count} findings · {group.occurrences} events) - - ) : null} {allBootTime ? ( boot ) : null} + {!singleton ? ( + + {group.count} findings · {group.occurrences} events + + ) : null} {!expanded && hasRealTitle ? ( {sampleLine(group.title, group.sample.message)} From c2b05a68ed75d147c027939f0ce03c912a0b4edc Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:18:07 +0200 Subject: [PATCH 17/23] fix(vm-troubleshooting-dashboard): Top Issues grid overflow on narrow viewports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .top-issue-list was declared as `display: grid` with no `grid-template-columns`, so each row's implicit track defaulted to min-width:auto — which means the track expands to the child's widest unbreakable content and the `truncate` on the subline becomes a no-op. Long sublines like "System Logs · misc dxg: dxgk: dxgkio_is_feature_enabled: Ioctl failed: -75 (2x in logs/journal_errors.ndjson)" then pushed the whole card past its container, creating visible horizontal overflow on the Overview page at phone widths. Fix is the same one already used by .artifact-browser-grid in the same stylesheet — `grid-template-columns: minmax(0, 1fr)` constrains the track to the available width and lets the inner truncate/line-clamp clipping take over. 
--- .../vm-troubleshooting-dashboard/frontend/src/index.css | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/index.css b/customers/vm-troubleshooting-dashboard/frontend/src/index.css index 004b39e..ec4773e 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/index.css +++ b/customers/vm-troubleshooting-dashboard/frontend/src/index.css @@ -170,6 +170,12 @@ .top-issue-list { display: grid; + /* minmax(0, 1fr) is what makes truncate work inside: without an + explicit columns track, grid items default to min-width:auto and + expand to their widest child, which lets long unwrapped strings + (e.g. the "(Nx in logs/journal_errors.ndjson)" tail) push the + card past its container. */ + grid-template-columns: minmax(0, 1fr); gap: 0.5rem; } From c00687f34c1021871f550ee439b12fe4ecc72226 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:26:38 +0200 Subject: [PATCH 18/23] refactor(vm-troubleshooting-dashboard): consolidate issue-detail main column into a single finding card MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old layout produced three problems across every issue variant: - WHAT HAPPENED card echoed the page title (e.g. "Failed Systemd Services" / "Firewall Posture: inactive" / "Error/Fail") right underneath the page's own h2 — pure duplication. - RECOMMENDED ACTION lived in its own card immediately below. Two separate frames for one continuous thought ("here's what / here's what to do") increased visual weight without adding information. - Low-confidence log findings (no explicit action) left the left column looking empty next to a 3-card sidebar. 
Merged the two cards into one vertically-flowing "finding" card with signposted sub-sections: [ hero stat strip ] occurrence count + source link + group size What happened prose description (with (Nx in …) tail stripped, dropped entirely when it would just echo the Evidence block below) Recommended action inline green-accent row (not a full card) Evidence raw matched lines in a mono block — the actual string a CX agent would grep for How to investigate three generic tips, only shown when no explicit action is available (so low-conf Error/Fail is never just a title) The hero stat strip promotes (Nx occurrences) to a proper tabular-num hero number, makes the source file a real link, and adds "N in this group" when the pattern has siblings. Card renders these conditionally so short-signal issues (single firewall fact, one failed service) don't carry an empty band. 120 Vitest cases still green; issue-detail view has no dedicated component test yet (earlier plan follow-up), but the behaviour is covered by the data the triage collector emits. --- .../issue-detail/IssueDetailPage.tsx | 161 ++++++++++++------ 1 file changed, 109 insertions(+), 52 deletions(-) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index db855f9..0c5e1d8 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -196,77 +196,134 @@ export function IssueDetailPage() {
{/* Main content */}
- {/* What happened */} + {/* Finding card — consolidates WHAT HAPPENED + RECOMMENDED ACTION + + EVIDENCE into one vertically-flowing card so the left column + has visual weight even for info / low-confidence issues. */} - -

- What happened -

- {occCount > 1 || messageSource ? ( -
+ + {/* Stat strip: occurrence count, source, group size. Only + rendered when there's at least one signal to show so + short-signal issues (e.g. "Firewall inactive") don't + carry an empty band at the top. */} + {(occCount > 1 || messageSource || siblings.length > 0) && ( +
{occCount > 1 ? ( - - +
+ {occCount} - {" occurrences"} - + occurrences +
) : null} {messageSource ? ( - - Source:{" "} +
+ {messageSource} - +
+ ) : null} + {siblings.length > 0 ? ( +
+ + {siblings.length + 1} + + + in this group + +
) : null}
- ) : null} - {primaryFinding ? ( - <> -

- {primaryFinding.title} -

-

- {primaryFinding.description} -

- - ) : ( -

- {issue.message} -

)} - {findings.length === 0 ? ( -
- No triage match for this fingerprint. Showing the best - evidence we could find from the archive. + +
+

What happened

+ {(() => { + const cleanedDesc = (primaryFinding?.description ?? "") + .replace(/\s*\(\d+x\s+in\s+[^)]+\)\s*$/, "") + .trim() + const ev = primaryFinding?.evidence ?? [] + // Skip the prose description when it's the same raw line + // the Evidence block will show — otherwise the same text + // appears twice in three lines. + const hideDescAsDup = + ev.length === 1 && cleanedDesc && cleanedDesc === ev[0].trim() + const descToShow = primaryFinding + ? hideDescAsDup + ? null + : cleanedDesc || null + : issue.message + return descToShow ? ( +

+ {descToShow} +

+ ) : null + })()} + {findings.length === 0 ? ( +
+ No triage match for this fingerprint. Showing the best + evidence we could find from the archive. +
+ ) : null} +
+ + {/* Recommended action — inline, not its own card, so the + "finding → action" pair reads as one thought. */} + {primaryFinding?.action ? ( +
+

Recommended action

+
+ +

+ {primaryFinding.action} +

+
) : null} - - - {/* Classification card was removed — severity / confidence are in the - header row; category lives in the Metadata sidebar. */} + {/* Evidence — raw matched line(s) from the triage analyzer. + Mono font so a CX agent can grep for the exact string; + visually distinct from the prose description above. */} + {primaryFinding?.evidence?.length ? ( +
+

Evidence

+
+                    {primaryFinding.evidence.join("\n")}
+                  
+
+ ) : null} - {/* What to do next */} - {primaryFinding?.action ? ( - - -

- Recommended action -

-
- -

- {primaryFinding.action} -

+ {/* Fallback guidance — when the triage analyzer didn't emit + an action (common for low-confidence log-match findings), + give the CX agent a generic scaffold so the left column + is never just a title. */} + {!primaryFinding?.action && (messageSource || evidencePaths.length > 0) ? ( +
+

How to investigate

+
    +
  • + Open the source log to read the lines around each + occurrence — repeated events often share a trigger + visible in the surrounding context. +
  • +
  • + Compare timestamps with other findings in this archive; + correlations across unrelated components usually point + at a common root cause. +
  • +
  • + If this pattern is known benign noise for your + environment, use Dismiss{" "} + to keep it out of future reviews. +
  • +
- - - ) : null} + ) : null} + + {/* Supporting findings */} {findings.length > 1 ? ( From 14dea8807da1df9b44919c05198fd204a12abd09 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:30:39 +0200 Subject: [PATCH 19/23] fix(vm-troubleshooting-dashboard): hide "What happened" section when its body would be empty or duplicated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two holes in the previous consolidation pass were visible in the refreshed Error/Fail and Firewall screenshots: - Error/Fail (new_issue_3.png): when the description was dropped because it equalled the single-line Evidence below, the "WHAT HAPPENED" section label still rendered above a now-empty body. The card read "WHAT HAPPENED / [nothing] / EVIDENCE / misc dxg…". - Firewall Posture (new_issue_2.png): the description still duplicated the page subline ("Firewall inactive (ufw)" written twice within ~90px). The earlier dedup only compared description-to-evidence, not description-to-subline. Moved the entire "What happened" sub-section decision into one IIFE that computes the effective body, hides the section label when the body would be empty, and dedups against both the single-line evidence block AND the page subline. For short-signal findings (firewall, single-service failures) the card now collapses cleanly to just the Recommended action; for log-match findings with only evidence, the stat strip + Evidence + How-to-investigate read without a ghost label. 
--- .../issue-detail/IssueDetailPage.tsx | 68 +++++++++++-------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index 0c5e1d8..05f0448 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -239,36 +239,46 @@ export function IssueDetailPage() {
)} -
-

What happened

- {(() => { - const cleanedDesc = (primaryFinding?.description ?? "") - .replace(/\s*\(\d+x\s+in\s+[^)]+\)\s*$/, "") - .trim() - const ev = primaryFinding?.evidence ?? [] - // Skip the prose description when it's the same raw line - // the Evidence block will show — otherwise the same text - // appears twice in three lines. - const hideDescAsDup = - ev.length === 1 && cleanedDesc && cleanedDesc === ev[0].trim() - const descToShow = primaryFinding - ? hideDescAsDup - ? null - : cleanedDesc || null - : issue.message - return descToShow ? ( -

- {descToShow} -

- ) : null - })()} - {findings.length === 0 ? ( -
- No triage match for this fingerprint. Showing the best - evidence we could find from the archive. + {(() => { + // Decide the "What happened" sub-section once, up front, so + // we never render the section label above an empty body. + const pageSubline = primaryFinding + ? sampleLine(primaryFinding.title, issue.message).trim() + : "" + const cleanedDesc = (primaryFinding?.description ?? "") + .replace(/\s*\(\d+x\s+in\s+[^)]+\)\s*$/, "") + .trim() + const ev = primaryFinding?.evidence ?? [] + const evSingle = ev.length === 1 ? ev[0].trim() : null + // Hide the description when it would just echo either the + // single-line evidence block below or the page subline above. + const descIsDup = + (evSingle !== null && cleanedDesc === evSingle) || + (pageSubline !== "" && cleanedDesc === pageSubline) + const descToShow = primaryFinding + ? descIsDup + ? null + : cleanedDesc || null + : issue.message + const hasNoMatch = findings.length === 0 + if (!descToShow && !hasNoMatch) return null + return ( +
+

What happened

+ {descToShow ? ( +

+ {descToShow} +

+ ) : null} + {hasNoMatch ? ( +
+ No triage match for this fingerprint. Showing the + best evidence we could find from the archive. +
+ ) : null}
- ) : null} -
+ ) + })()} {/* Recommended action — inline, not its own card, so the "finding → action" pair reads as one thought. */} From 32f433b07b6af15b7ed50bd8d4b2cee1d5295ef6 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:35:00 +0200 Subject: [PATCH 20/23] refactor(vm-troubleshooting-dashboard): polish issue-detail header, tips, and column balance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three remaining refinements called out on the previous review pass. - Header separates "what this is" from "what I can do". Severity and confidence pills (display — describe the finding) move inline with the title; action buttons (Ack / Dismiss / Copy summary) stay right-aligned on their own. The previous right cluster mixed interactive and display controls, which blurred the affordance. - "How to investigate" bullets are now context-aware. The source-file bullet reads differently for .ndjson (mentions SYSLOG_IDENTIFIER / _SYSTEMD_UNIT / timestamp fields as grep anchors) vs dmesg.txt (suggests reading surrounding kernel lines) vs other files. The cadence bullet uses the actual occurrence count ("fired N times — tight burst vs steady cadence") when occCount > 1, or the sibling count when the finding is a singleton with a populated group. The bullet list stays capped at 3 items. - Moved "Other entries in this group" from the sidebar to the main column, immediately after the finding card. The sibling list is content (clickable navigation to peer findings), not metadata; placing it in the main column balances the layout when the sidebar was outweighing a thin left column on low-signal issues. Sidebar now carries only METADATA and RELATED ARTIFACTS — the two genuinely meta-context cards. 120 Vitest cases still green. 
--- .../issue-detail/IssueDetailPage.tsx | 173 ++++++++++++------ 1 file changed, 116 insertions(+), 57 deletions(-) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index 05f0448..10cafe6 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -147,27 +147,36 @@ export function IssueDetailPage() { ) : }
- {/* Issue header */} + {/* Issue header. + Pills (severity, confidence) sit inline with the title because + they describe *the thing*. Action buttons (Ack / Dismiss / Copy) + are isolated on the right because they operate *on the thing*. + Mixing them previously read as "five equivalent controls"; this + split makes "what this is" vs "what I can do" instantly scannable. */}
-
-

- {primaryFinding?.title || issue.message} -

+
+
+

+ {primaryFinding?.title || issue.message} +

+ + + {currentState ? ( + + {currentState} + + ) : null} +
{primaryFinding ? ( -

+

{sampleLine(primaryFinding.title, issue.message)}

) : null}

{issue.collector} · {issue.code} · {issue.category} - {currentState ? ( - - {currentState} - - ) : null}

-
+
{issue.issue_fingerprint ? ( - -
@@ -308,25 +315,74 @@ export function IssueDetailPage() { {/* Fallback guidance — when the triage analyzer didn't emit an action (common for low-confidence log-match findings), - give the CX agent a generic scaffold so the left column - is never just a title. */} + assemble a short context-aware tip list. Each bullet is + predicated on an actual fact of this issue (source file + kind, occurrence count, sibling count) so the guidance + stays useful instead of reading as boilerplate. */} {!primaryFinding?.action && (messageSource || evidencePaths.length > 0) ? (

How to investigate

    -
  • - Open the source log to read the lines around each - occurrence — repeated events often share a trigger - visible in the surrounding context. -
  • -
  • - Compare timestamps with other findings in this archive; - correlations across unrelated components usually point - at a common root cause. -
  • + {messageSource ? ( +
  • + Open{" "} + + {messageSource} + {" "} + {messageSource.endsWith(".ndjson") ? ( + <> + and grep for the matched line — each record + also carries{" "} + SYSLOG_IDENTIFIER,{" "} + _SYSTEMD_UNIT, + and a timestamp. + + ) : messageSource.endsWith("dmesg.txt") ? ( + <> + and read the surrounding lines — kernel + events often precede the visible error by a + few lines. + + ) : ( + <>to see the context around each occurrence. + )} +
  • + ) : evidencePaths.length > 0 ? ( +
  • + Open the Related artifacts to see the context + around each occurrence. +
  • + ) : null} + {occCount > 1 ? ( +
  • + This event fired{" "} + + {occCount} + {" "} + times — a tight burst usually points at a single + trigger; a steady cadence at a polling or retry + loop. +
  • + ) : siblings.length > 0 ? ( +
  • + There are{" "} + + {siblings.length} + {" "} + related findings in the same group. If they all + match the same component, they likely share a root + cause. +
  • + ) : null}
  • If this pattern is known benign noise for your - environment, use Dismiss{" "} + environment, use{" "} + + Dismiss + {" "} to keep it out of future reviews.
@@ -335,6 +391,39 @@ export function IssueDetailPage() { + {/* Other entries in this pattern group — moved here from the + sidebar so the left column's weight matches its importance, + and so the sibling list (which is *content*, not *metadata*) + lives alongside the rest of the finding content. */} + {siblings.length > 0 ? ( + + +

+ Other entries in this group ({siblings.length}) +

+
+ {siblings.slice(0, 8).map((s) => ( + + {s.message} + + ))} + {siblings.length > 8 ? ( + + +{siblings.length - 8} more in Issues list + + ) : null} +
+
+
+ ) : null} + {/* Supporting findings */} {findings.length > 1 ? (
@@ -428,36 +517,6 @@ export function IssueDetailPage() { - {/* Related findings in the same pattern group */} - {siblings.length > 0 ? ( - - -

- Other entries in this group ({siblings.length}) -

-
- {siblings.slice(0, 8).map((s) => ( - - {s.message} - - ))} - {siblings.length > 8 ? ( - - +{siblings.length - 8} more in Issues list - - ) : null} -
-
-
- ) : null} - {/* Related artifacts */} From 767b4e161c122a3e69806027e2012cb421f974fa Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:40:02 +0200 Subject: [PATCH 21/23] fix(vm-troubleshooting-dashboard): strip redundant "Title:" prefix from sibling rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every row in "Other entries in this group" was starting with the same "Error/Fail:" prefix because the collector prefixes the raw line with the finding title. Since the card header already names the group, the prefix is redundant — worse, it steals ~70px of horizontal space before the differentiating suffix (e.g. "dxgkio_is_feature_enabled: Ioctl failed: -22") can start. Uses the existing sampleLine() helper (same one that strips the prefix from the page subline) so the sibling rows now lead with their distinguishing detail. --- .../src/components/issue-detail/IssueDetailPage.tsx | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index 10cafe6..4fb5a26 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -408,7 +408,16 @@ export function IssueDetailPage() { to={`/archives/${encodeSegment(archiveId!)}/issues/${encodeSegment(s.id)}`} className="block rounded-md bg-muted px-2.5 py-1.5 text-xs text-foreground no-underline transition-colors hover:bg-secondary" > - {s.message} + {/* Strip the redundant "Title: " prefix shared by + every sibling — the card header already names the + group, so repeating "Error/Fail:" on every row + wastes the first ~70px of width where the + differentiating suffix should be visible. 
*/} + + {primaryFinding + ? sampleLine(primaryFinding.title, s.message) + : s.message} + ))} {siblings.length > 8 ? ( From 25951c2c9be2b19eea0470618547a18fa92e7801 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:44:05 +0200 Subject: [PATCH 22/23] refactor(vm-troubleshooting-dashboard): unify issue-detail main card on a two-slot skeleton MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Across the three issue-detail variants each view had been landing on a different skeleton — ni_1 showed "What happened → Recommended action → Evidence" (3 slots), ni_2 collapsed to "Recommended action" alone (1 slot), ni_3 rendered "Evidence → How to investigate" (2 slots with no "What happened" label at all). Same page, three layouts, no shared mental model for the eye to lock onto. Collapsed the card down to exactly two always-present slots: 1. "What happened" — prose description + raw evidence in a mono block, or the raw issue.message as last-resort fallback. Evidence is now *inside* this slot rather than a sibling card — they answer the same question (here's the finding, here's the proof). The prose is skipped only when it's identical word-for-word to a single evidence line (duplication of text within the same slot, which reads as clutter); the slot itself always renders. 2. Next steps — always rendered with exactly one label depending on what we know: • "Recommended action" — triage supplied finding.action • "How to investigate" — no action but we have a log source or evidence path, so the context-aware tips give useful guidance • "Suggested next steps" — generic fallback when we have neither The context-aware tips and the generic fallback share one ul/li structure, so the visual weight is identical across cases. Content-driven optional sections (stat strip, "Other entries in this group", "Supporting findings") remain conditional on the data that powers them — that variation is meaningful, not accidental. 
The main card is now the fixed anchor every issue detail shares. --- .../issue-detail/IssueDetailPage.tsx | 105 ++++++++++-------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx index 4fb5a26..0c81c45 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx +++ b/customers/vm-troubleshooting-dashboard/frontend/src/components/issue-detail/IssueDetailPage.tsx @@ -246,29 +246,48 @@ export function IssueDetailPage() {
)} + {/* + Consistent two-slot skeleton across every issue detail: + + 1. "What happened" — prose description + raw evidence, + or the raw message when neither is + available. Always rendered. + 2. "Next steps" — label is one of: + • "Recommended action" — triage + supplied an explicit action + • "How to investigate" — no + action but we have a log + source or evidence path + • "Suggested next steps" — + generic fallback when we have + neither + Always rendered. + + The skeleton is intentionally NOT conditional on the + finding's completeness — every view has the same two + anchor points so the eye builds a single mental model. + Content varies; structure does not. + */} + + {/* === Slot 1: What happened === */} {(() => { - // Decide the "What happened" sub-section once, up front, so - // we never render the section label above an empty body. - const pageSubline = primaryFinding - ? sampleLine(primaryFinding.title, issue.message).trim() - : "" const cleanedDesc = (primaryFinding?.description ?? "") .replace(/\s*\(\d+x\s+in\s+[^)]+\)\s*$/, "") .trim() const ev = primaryFinding?.evidence ?? [] const evSingle = ev.length === 1 ? ev[0].trim() : null - // Hide the description when it would just echo either the - // single-line evidence block below or the page subline above. - const descIsDup = - (evSingle !== null && cleanedDesc === evSingle) || - (pageSubline !== "" && cleanedDesc === pageSubline) - const descToShow = primaryFinding - ? descIsDup - ? null - : cleanedDesc || null - : issue.message + // When the prose description is *identical* to the single + // evidence line, the evidence block below already carries + // the same string in a more useful mono-font form — skip + // the prose to avoid word-for-word duplication within the + // same slot. If the description adds anything (prose + // framing, extra context) we keep it above the evidence. + const descIsDupOfEvidence = + evSingle !== null && cleanedDesc === evSingle + const descToShow = descIsDupOfEvidence + ? 
null + : cleanedDesc || (!primaryFinding ? issue.message : null) const hasNoMatch = findings.length === 0 - if (!descToShow && !hasNoMatch) return null return (

What happened

@@ -277,6 +296,16 @@ export function IssueDetailPage() { {descToShow}

) : null} + {ev.length > 0 ? ( +
+                        {ev.join("\n")}
+                      
+ ) : null} + {!descToShow && ev.length === 0 ? ( +

+ {issue.message} +

+ ) : null} {hasNoMatch ? (
No triage match for this fingerprint. Showing the @@ -287,8 +316,7 @@ export function IssueDetailPage() { ) })()} - {/* Recommended action — inline, not its own card, so the - "finding → action" pair reads as one thought. */} + {/* === Slot 2: Next steps === */} {primaryFinding?.action ? (

Recommended action

@@ -299,29 +327,13 @@ export function IssueDetailPage() {

- ) : null} - - {/* Evidence — raw matched line(s) from the triage analyzer. - Mono font so a CX agent can grep for the exact string; - visually distinct from the prose description above. */} - {primaryFinding?.evidence?.length ? ( -
-

Evidence

-
-                    {primaryFinding.evidence.join("\n")}
-                  
-
- ) : null} - - {/* Fallback guidance — when the triage analyzer didn't emit - an action (common for low-confidence log-match findings), - assemble a short context-aware tip list. Each bullet is - predicated on an actual fact of this issue (source file - kind, occurrence count, sibling count) so the guidance - stays useful instead of reading as boilerplate. */} - {!primaryFinding?.action && (messageSource || evidencePaths.length > 0) ? ( + ) : (
-

How to investigate

+

+ {messageSource || evidencePaths.length > 0 + ? "How to investigate" + : "Suggested next steps"} +

    {messageSource ? (
  • @@ -352,10 +364,15 @@ export function IssueDetailPage() {
  • ) : evidencePaths.length > 0 ? (
  • - Open the Related artifacts to see the context - around each occurrence. + Open the related artifacts in the sidebar to see + the context around each occurrence.
  • - ) : null} + ) : ( +
  • + Review the archive's collector output for related + signals that may share a root cause. +
  • + )} {occCount > 1 ? (
  • This event fired{" "} @@ -387,7 +404,7 @@ export function IssueDetailPage() {
- ) : null} + )} From ab0765c68762a3b4c952d96e62a1bad01f513184 Mon Sep 17 00:00:00 2001 From: Eugene de Beste Date: Mon, 20 Apr 2026 16:54:13 +0200 Subject: [PATCH 23/23] fix(vm-troubleshooting-dashboard): kill prev/next flicker on issue-detail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit useIssueDetail was firing a cache-cold load on every first navigation to a new issue — the hook's data briefly became undefined, the component's isLoading branch rendered "Loading issue…", then the finding rendered. Going back was smooth only because the cache was already warm. Added `placeholderData: keepPreviousData` (same pattern useIssues already uses). The previously-viewed issue stays on screen while the next one fetches, so prev/next transitions are uniformly smooth in both directions. --- .../vm-troubleshooting-dashboard/frontend/src/api/issues.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts b/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts index b8bb31c..7d90fe8 100644 --- a/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts +++ b/customers/vm-troubleshooting-dashboard/frontend/src/api/issues.ts @@ -44,6 +44,12 @@ export function useIssueDetail(archiveId: string, issueId: string) { `/archives/${encodeURIComponent(archiveId)}/issues/${encodeURIComponent(issueId)}`, ), ), + // Keep the previously-viewed issue's data on screen while the next + // one fetches — otherwise the prev/next buttons flash through the + // loading branch on first navigation (cache cold), then render + // smoothly on the way back (cache warm). `keepPreviousData` makes + // the transition uniformly smooth. + placeholderData: keepPreviousData, }) }