diff --git a/.claude/commands/monitor.md b/.claude/commands/monitor.md index eac924e0f..1108979a8 100644 --- a/.claude/commands/monitor.md +++ b/.claude/commands/monitor.md @@ -1,41 +1,63 @@ # Monitor Fly Logs -Sample and analyse Fly.io logs at regular intervals. +Capture, search, and analyse Fly.io logs via `scripts/logs.sh`. ## Default usage ```bash -./scripts/monitor_logs.sh +./scripts/logs.sh ``` -Runs for 4 hours with 10-second intervals. +Captures from all five Fly apps (`hover`, `hover-worker`, `hover-analysis`, +`hover-autoscaler-worker`, `hover-autoscaler-analysis`) every 3s, runs an +analyse snapshot every 5 minutes, and writes a final report when the run +finishes (~72 minutes by default). Press Ctrl+C to stop early — the final report +still writes. -## Options +## Subcommands ```bash -./scripts/monitor_logs.sh --run-id "descriptive-name" # Custom name -./scripts/monitor_logs.sh --interval 30 --iterations 120 # 30s for 1 hour +./scripts/logs.sh monitor [...] # explicit form of the default +./scripts/logs.sh search [...] # grep captured raw logs +./scripts/logs.sh analyse [...] # run probes, write analysis.md/json ``` -## Output structure +## Common options + +```bash +./scripts/logs.sh --interval 5 --iterations 720 # 5s × 1h +./scripts/logs.sh --run-id "incident-pr349" # custom slug +./scripts/logs.sh --analyse-every 30s # tighter snapshots +./scripts/logs.sh --analyse-every 0 # disable snapshots +./scripts/logs.sh --app hover,hover-worker # subset of apps + +./scripts/logs.sh search --keyword panic --keyword pgx +./scripts/logs.sh search --regex 'status[":]+5\d\d' --app hover +./scripts/logs.sh analyse --keyword "deadline exceeded" +./scripts/logs.sh analyse --run 20260502/1430_mellow-rose_3s_1h ``` -logs/YYYYMMDD/HHMM__s_h/ -├── raw/ -│ ├── _iter1.log -│ ├── _iter2.log -│ └── ... -├── _iter1.json -├── _iter2.json -├── time_series.csv -└── summary.md + +## Output structure + +```text +logs/YYYYMMDD/HHMM__/ +├── /raw/*.log # cursor-filtered captures (one per iteration) +├── /.cursor # last-seen ISO timestamp per app +├── snapshots/ +│ ├── analysis_Z.md +│ └── analysis_Z.json +├── analysis.md # final probe report +├── analysis.json +└── monitor.log # verbose run history ``` -## What it captures +## Probes (analyse) -- Raw log samples from Fly -- JSON summaries per iteration -- Aggregated time series data -- Summary markdown report +Severity, panics & fatals, HTTP status, latency (p50/p95/p99 + slowest), +heartbeat, process health, autoscaler, database/external errors, Sentry, plus +any ad-hoc `--keyword`/`--regex`. Every finding records `count`, `first_seen`, +`last_seen`, and `peak` (timestamp of the highest-count minute). -Automatic aggregation runs via `scripts/aggregate_logs.py` after each iteration. +The legacy `scripts/monitor_logs.sh` still works — it forwards to +`./scripts/logs.sh monitor`. diff --git a/CHANGELOG.md b/CHANGELOG.md index a56d59460..92774d777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,31 @@ On merge, CI will: ## [Unreleased] -_Add unreleased changes here._ +### Added + +- `scripts/logs.sh` unified Fly log tool with `monitor`, `search`, and `analyse` + subcommands. Bare `logs.sh` runs `monitor` with all defaults (3s capture + across all five Fly apps, 5-minute analyse snapshots, final report at run + end). `search` greps captured raw logs by keyword/regex across one or more + runs (case-insensitive by default, also reads `raw.zip` after cleanup). + `analyse` runs a fixed probe set — severity, panics, HTTP status, latency, + heartbeat, process health, autoscaler, database/external errors, Sentry, plus + ad-hoc keywords — and writes `analysis.{md,json}` with `first_seen`, + `last_seen`, and `peak` timestamps for every finding. +- `scripts/filter_since.py` per-app cursor filter wired into `capture_app` so + each iteration only persists log lines newer than the previous capture, + eliminating the 4× overlap inflation that came from `flyctl logs --no-tail`. + +### Changed + +- `scripts/monitor_logs.sh` reduced to a back-compat shim that forwards all + flags to `logs.sh monitor`. Existing call sites in `opencode.json`, + `.claude/commands/monitor.md`, `.claude/settings.local.json`, and + `docs/development/DEVELOPMENT.md` continue to work unchanged. +- Default monitor app list now includes the two autoscaler sidecars + (`hover-autoscaler-worker`, `hover-autoscaler-analysis`). +- Default `--run-id` is auto-generated as a `-` slug (e.g. + `mellow-rose`) so concurrent runs are easy to distinguish. ## Full changelog history diff --git a/docs/development/DEVELOPMENT.md b/docs/development/DEVELOPMENT.md index 7623cebd4..4d0609ddd 100644 --- a/docs/development/DEVELOPMENT.md +++ b/docs/development/DEVELOPMENT.md @@ -294,29 +294,52 @@ logic ├── crawler/ # Web crawling functionality ├── db/ # Database o ## Monitoring Fly Logs -For production investigations use `scripts/monitor_logs.sh`: +For production investigations use `scripts/logs.sh`: ```bash -# Default: 10-second intervals for 4 hours -./scripts/monitor_logs.sh +# Default: 3-second capture across all five Fly apps, 5-minute analyse +# snapshots, ~72 minutes total. Press Ctrl+C to stop early — the final +# report still writes. +./scripts/logs.sh -# Custom run with descriptive name -./scripts/monitor_logs.sh --run-id "heavy-load-test" +# Custom slug, longer interval and duration +./scripts/logs.sh --interval 30 --iterations 120 --run-id "30min-check" -# Custom intervals and duration -./scripts/monitor_logs.sh --interval 30 --iterations 120 --run-id "30min-check" +# Tighter snapshot cadence, or disable snapshots +./scripts/logs.sh --analyse-every 30s +./scripts/logs.sh --analyse-every 0 ``` +`logs.sh` has three subcommands sharing the same run layout: + +```bash +./scripts/logs.sh search --keyword panic --keyword pgx +./scripts/logs.sh analyse --keyword "deadline exceeded" +./scripts/logs.sh analyse --run 20260502/1430_mellow-rose_3s_1h +``` + +The legacy `scripts/monitor_logs.sh` still works — it forwards to +`./scripts/logs.sh monitor`. + **Output structure:** -- Folder: `logs/YYYYMMDD/HHMM__s_h/` - - Example: `logs/20251105/0833_heavy-load-test_10s_4h/` -- Raw logs: `raw/_iter.log` -- JSON summaries: `_iter.json` -- Aggregated outputs: - - `time_series.csv` - per-minute log level counts - - `summary.md` - human-readable report with critical patterns - - Automatically regenerated after each iteration +- Run dir: `logs/YYYYMMDD/HHMM__s_/` + - Example: `logs/20260502/1430_mellow-rose_3s_1h/` +- Per-app captures: `/raw/_iter.log` (cursor-filtered against + `/.cursor` so each iteration only persists lines newer than the previous) +- Per-iteration JSON: `/_iter.json` +- Aggregated outputs (per app): + - `time_series.csv` — per-minute log level counts + - `summary.md` — human-readable per-app report +- Cross-app analysis (whole run): + - `analysis.md` / `analysis.json` — final probe report (severity, panics, + HTTP, latency, heartbeat, process health, autoscaler, DB/external, Sentry, + ad-hoc keywords) with `first_seen` / `last_seen` / `peak` timestamps for + every finding + - `snapshots/analysis_Z.{md,json}` — point-in-time snapshots written + every `--analyse-every` while the run is in progress +- `monitor.log` — verbose run history (cleanup, per-iteration capture, errors). + The TTY shows only a startup banner and a self-overwriting ticker. **Defaults:** diff --git a/opencode.json b/opencode.json index c75b0a1c9..eee4111f9 100644 --- a/opencode.json +++ b/opencode.json @@ -8,7 +8,7 @@ }, "monitor-fly": { "description": "Collect and summarise Fly logs", - "template": "Run ./scripts/monitor_logs.sh with suitable arguments from $ARGUMENTS if provided. Summarise critical patterns, error spikes, and likely causes." + "template": "Run ./scripts/logs.sh with suitable arguments from $ARGUMENTS if provided (use the `search` or `analyse` subcommand for grep / probe-driven analysis of an existing run). Summarise critical patterns, error spikes, and likely causes." }, "load-test": { "description": "Run scripted load test safely", diff --git a/scripts/aggregate_logs.py b/scripts/aggregate_logs.py index c6c238947..f22f512ae 100644 --- a/scripts/aggregate_logs.py +++ b/scripts/aggregate_logs.py @@ -404,7 +404,10 @@ def watch_mode(log_dir, interval=10): parser.add_argument("--full", action="store_true", help="Full reprocess (ignore state)") args = parser.parse_args() - if args.watch: - watch_mode(args.log_dir, args.interval) - else: - aggregate_logs(args.log_dir, incremental=not args.full) + try: + if args.watch: + watch_mode(args.log_dir, args.interval) + else: + aggregate_logs(args.log_dir, incremental=not args.full) + except KeyboardInterrupt: + sys.exit(130) diff --git a/scripts/analyse_logs.py b/scripts/analyse_logs.py new file mode 100755 index 000000000..c174f99f5 --- /dev/null +++ b/scripts/analyse_logs.py @@ -0,0 +1,635 @@ +#!/usr/bin/env python3 +"""Analyse captured Fly logs and emit a deterministic report. + +Streams every raw line for each app in a run through a fixed set of probes +(severity, panics, HTTP status, latency, process health, autoscaler, DB, Sentry, +heartbeat, plus ad-hoc keywords). Each finding records `count`, `first_seen`, +`last_seen`, and `peak` (timestamp of the highest-count minute). + +Outputs `analysis.json` (machine readable) and `analysis.md` (human readable) +into the run directory. +""" + +from __future__ import annotations + +import argparse +import json +import re +import statistics +import sys +from collections import Counter, defaultdict, deque +from itertools import pairwise +from datetime import datetime, timedelta +from pathlib import Path +from typing import Iterator + +# Reuse run-resolution + line iteration from search_logs. +sys.path.insert(0, str(Path(__file__).parent)) +from search_logs import iter_lines, resolve_runs # noqa: E402 + +ISO_KEYS = ("time", "timestamp", "@timestamp", "ts", "created_at") +ANSI_RE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]") +LEADING_TS_RE = re.compile(r"^\s*(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?)") + + +def _strip_ansi(s: str) -> str: + return ANSI_RE.sub("", s) + + +def _iso_seconds(rec: dict | None, line: str = "") -> str: + """Best-effort ISO timestamp: prefer JSON record fields, fall back to the + leading timestamp token Fly stamps on every raw log line.""" + if rec: + for k in ISO_KEYS: + v = rec.get(k) + if v: + raw = str(v).replace("Z", "+00:00") + try: + return datetime.fromisoformat(raw).isoformat(timespec="seconds") + except ValueError: + # Malformed value for this key — keep looking. Other ISO + # keys or the leading-line fallback below may be valid. + continue + if line: + m = LEADING_TS_RE.match(line) + if m: + raw = m.group(1).replace("Z", "+00:00") + try: + return datetime.fromisoformat(raw).isoformat(timespec="seconds") + except ValueError: + return raw[:19] + return "" + + +def _iso_minute(ts: str) -> str: + return ts[:16] if len(ts) >= 16 else ts + + +def _parse_record(line: str) -> dict | None: + idx = line.find("{") + if idx == -1: + return None + try: + rec = json.loads(line[idx:]) + except json.JSONDecodeError: + return None + return rec if isinstance(rec, dict) else None + + +class Bucket: + """Tracks count plus first-seen, last-seen, and peak-minute for a label.""" + + __slots__ = ("count", "first", "last", "minute_counts", "samples") + + def __init__(self) -> None: + self.count = 0 + self.first = "" + self.last = "" + self.minute_counts: Counter = Counter() + self.samples: list[str] = [] + + def add(self, ts: str, sample: str | None = None) -> None: + self.count += 1 + if ts: + if not self.first or ts < self.first: + self.first = ts + if not self.last or ts > self.last: + self.last = ts + self.minute_counts[_iso_minute(ts)] += 1 + if sample and len(self.samples) < 3: + trimmed = sample.strip() + if len(trimmed) > 240: + trimmed = trimmed[:237] + "..." + self.samples.append(trimmed) + + def to_dict(self, label: str) -> dict: + peak_min, peak_count = ("", 0) + if self.minute_counts: + peak_min, peak_count = self.minute_counts.most_common(1)[0] + return { + "label": label, + "count": self.count, + "first_seen": self.first, + "last_seen": self.last, + "peak": peak_min, + "peak_count": peak_count, + "samples": list(self.samples), + } + + +# --- Probes ---------------------------------------------------------------- + +class Probe: + name = "" + + def feed(self, ts: str, line: str, rec: dict | None, level: str, msg: str) -> None: + raise NotImplementedError + + def report(self) -> dict: + raise NotImplementedError + + +class SeverityProbe(Probe): + name = "Severity" + LEVELS = ("debug", "info", "warn", "error", "fatal") + + def __init__(self) -> None: + self.buckets: dict[str, Bucket] = {lvl: Bucket() for lvl in self.LEVELS} + self.unknown = Bucket() + + def feed(self, ts, line, rec, level, msg): + if level in self.buckets: + self.buckets[level].add(ts, msg) + elif level: + self.unknown.add(ts, line) + + def report(self) -> dict: + findings = [ + self.buckets[lvl].to_dict(lvl) + for lvl in self.LEVELS + if self.buckets[lvl].count + ] + if self.unknown.count: + findings.append(self.unknown.to_dict("unknown-level")) + return {"name": self.name, "findings": findings} + + +class RegexBucketProbe(Probe): + """Generic probe: match a list of (label, regex) and bucket per label.""" + + def __init__(self, name: str, patterns: list[tuple[str, str]], flags: int = re.IGNORECASE): + self.name = name + self.compiled = [(label, re.compile(pat, flags)) for label, pat in patterns] + self.buckets: dict[str, Bucket] = defaultdict(Bucket) + + def feed(self, ts, line, rec, level, msg): + for label, pat in self.compiled: + m = pat.search(line) + if m: + self.buckets[label].add(ts, line) + + def report(self) -> dict: + findings = [b.to_dict(label) for label, b in self.buckets.items() if b.count] + findings.sort(key=lambda f: -f["count"]) + return {"name": self.name, "findings": findings} + + +class PanicProbe(Probe): + name = "Panics & fatals" + + def __init__(self) -> None: + self.buckets: dict[str, Bucket] = defaultdict(Bucket) + self.panic_re = re.compile(r"panic:\s*(.+)", re.IGNORECASE) + self.fatal_re = re.compile(r"\bfatal(?:\s+error)?:\s*(.+)", re.IGNORECASE) + + def feed(self, ts, line, rec, level, msg): + for pat, kind in ((self.panic_re, "panic"), (self.fatal_re, "fatal")): + m = pat.search(line) + if m: + rest = m.group(1).strip() + key = rest.split("\n", 1)[0][:120] + if not key: + key = kind + self.buckets[f"{kind}: {key}"].add(ts, line) + return + + def report(self) -> dict: + findings = sorted( + (b.to_dict(label) for label, b in self.buckets.items()), + key=lambda f: -f["count"], + ) + return {"name": self.name, "findings": findings[:10]} + + +class HTTPStatusProbe(Probe): + name = "HTTP status" + + def __init__(self) -> None: + self.buckets: dict[str, Bucket] = defaultdict(Bucket) + self.status_field = re.compile(r'\bstatus(?:_code)?[\"\']?\s*[=:]\s*\"?(\d{3})\b') + self.access_log = re.compile(r'HTTP/\d\.\d"\s+(\d{3})\s+\d+') + + def feed(self, ts, line, rec, level, msg): + code: str | None = None + if rec: + for k in ("status", "status_code", "statusCode", "http_status"): + v = rec.get(k) + if isinstance(v, (int, str)) and str(v).isdigit() and len(str(v)) == 3: + code = str(v) + break + if code is None: + m = self.access_log.search(line) or self.status_field.search(line) + if m: + code = m.group(1) + if not code: + return + klass = f"{code[0]}xx" + self.buckets[klass].add(ts) + if code in ("429", "499", "500", "502", "503", "504"): + self.buckets[code].add(ts, line) + + def report(self) -> dict: + findings = [b.to_dict(label) for label, b in sorted(self.buckets.items())] + findings = [f for f in findings if f["count"]] + return {"name": self.name, "findings": findings} + + +class LatencyProbe(Probe): + name = "Latency" + DURATION_KEYS = ("dur_ms", "duration_ms", "latency_ms", "elapsed_ms", "took_ms") + + def __init__(self) -> None: + self.values_ms: list[float] = [] + self.slowest: list[tuple[float, str, str]] = [] # (ms, ts, line) + + def _record(self, ms: float, ts: str, line: str) -> None: + if ms < 0 or ms > 3_600_000: + return + self.values_ms.append(ms) + self.slowest.append((ms, ts, line)) + if len(self.slowest) > 200: + self.slowest.sort(key=lambda x: -x[0]) + self.slowest = self.slowest[:50] + + def feed(self, ts, line, rec, level, msg): + if not rec: + return + for k in self.DURATION_KEYS: + v = rec.get(k) + if isinstance(v, (int, float)): + self._record(float(v), ts, line) + return + # Bare `duration` follows Go/zerolog convention: integer nanoseconds. + # Don't try to infer units from magnitude — sub-millisecond values + # would otherwise be recorded as milliseconds and skew percentiles. + # Apps emitting milliseconds should use the explicit `*_ms` keys above. + d = rec.get("duration") + if isinstance(d, (int, float)): + self._record(float(d) / 1_000_000, ts, line) + + def report(self) -> dict: + if not self.values_ms: + return {"name": self.name, "findings": [], "note": "no duration fields seen"} + vs = sorted(self.values_ms) + n = len(vs) + + def pct(p: float) -> float: + # Linear interpolation between adjacent ranks (Type 7 / "inclusive"), + # the same method as `statistics.quantiles(method="inclusive")`. + # `round()` previously hit banker's-rounding edge cases (e.g. p50 + # of [100, 300] returned 100 instead of 200). + if n == 1 or p <= 0: + return float(vs[0]) + if p >= 100: + return float(vs[-1]) + rank = p / 100 * (n - 1) + lo = int(rank) + hi = min(lo + 1, n - 1) + frac = rank - lo + return float(vs[lo]) + frac * (float(vs[hi]) - float(vs[lo])) + + self.slowest.sort(key=lambda x: -x[0]) + slowest = [ + {"duration_ms": ms, "timestamp": ts, "line": (line.strip()[:240])} + for ms, ts, line in self.slowest[:10] + ] + return { + "name": self.name, + "samples": n, + "p50_ms": pct(50), + "p95_ms": pct(95), + "p99_ms": pct(99), + "max_ms": vs[-1], + "mean_ms": statistics.fmean(vs), + "slowest": slowest, + } + + +class HeartbeatProbe(Probe): + """Detect minutes where info-level traffic dropped to zero mid-run.""" + + name = "Heartbeat" + + def __init__(self) -> None: + self.minute_info: Counter = Counter() + self.minute_seen: list[str] = [] + + def feed(self, ts, line, rec, level, msg): + if not ts: + return + m = _iso_minute(ts) + if m and (not self.minute_seen or self.minute_seen[-1] != m): + self.minute_seen.append(m) + if level == "info": + self.minute_info[m] += 1 + + def report(self) -> dict: + if not self.minute_seen: + return {"name": self.name, "findings": [], "note": "no timestamped traffic"} + minutes = sorted(set(self.minute_seen)) + non_zero = [self.minute_info[m] for m in minutes if self.minute_info.get(m, 0) > 0] + median = statistics.median(non_zero) if non_zero else 0 + gap_minutes: set[str] = set() + # Two ways a minute can be a heartbeat gap: + # (1) observed in minute_seen (had warn/error traffic) but zero info, + # (2) entirely missing — fell between two observed minutes with no + # log lines at all, so `minute_seen` doesn't include it. + if median >= 1: + # Skip the first and last observed minutes — they're typically + # partial windows (capture started or stopped mid-minute), so a + # zero info-count there is expected, not a real heartbeat gap. + for m in minutes[1:-1]: + if self.minute_info.get(m, 0) == 0: + gap_minutes.add(m) + if len(minutes) >= 2: + for prev, curr in pairwise(minutes): + try: + p = datetime.strptime(prev, "%Y-%m-%dT%H:%M") + c = datetime.strptime(curr, "%Y-%m-%dT%H:%M") + except ValueError: + continue + step = p + timedelta(minutes=1) + while step < c: + gap_minutes.add(step.strftime("%Y-%m-%dT%H:%M")) + step += timedelta(minutes=1) + gaps: list[dict] = [ + {"minute": m, "expected_min": int(median)} + for m in sorted(gap_minutes)[:200] + ] + return { + "name": self.name, + "median_info_per_minute": median, + "first_minute": minutes[0], + "last_minute": minutes[-1], + "gap_minutes": gaps[:20], + } + + +def _build_probes(extra_keywords: list[str], extra_regexes: list[str]) -> list[Probe]: + probes: list[Probe] = [ + SeverityProbe(), + PanicProbe(), + HTTPStatusProbe(), + LatencyProbe(), + HeartbeatProbe(), + RegexBucketProbe( + "Process health", + [ + ("starting machine", r"starting machine"), + ("stopping machine", r"stopping machine"), + ("exited with code", r"exited with code\s+\d+"), + ("out of memory", r"out of memory|oom[- ]?killed"), + ("killed by signal", r"killed by signal|signal:\s*killed"), + ("health check failed", r"health check.*fail"), + ("restart", r"\brestart(ing)?\b"), + ], + ), + RegexBucketProbe( + "Autoscaler", + [ + ("reconciling", r'"msg":\s*"reconciling"|reconciling\s+app'), + ("scale up", r"scal(e|ing)\s*up|adding machine"), + ("scale down", r"scal(e|ing)\s*down|removing machine"), + ("target=N", r"target\s*[=:]\s*\d+|\"target\":\s*\{"), + ("queue depth", r"queue[_ ]depth|backlog\s*[=:]\s*\d+"), + ("no-op", r"no scale change|already at target"), + ], + ), + RegexBucketProbe( + "Database / external", + [ + ("pgx error", r"\bpgx\b.*error|pgx:.*"), + ("pq error", r"\bpq:\s"), + ("connection refused", r"connection refused"), + ("context deadline exceeded", r"context deadline exceeded"), + ("i/o timeout", r"i/o timeout"), + ("connection reset", r"connection reset"), + ("too many connections", r"too many connections"), + ], + ), + RegexBucketProbe( + "Sentry", + [ + ("event sent", r"sentry.*event\b|event sent to sentry"), + ("send failed", r"sentry.*(?:fail|error)"), + ], + ), + ] + if extra_keywords or extra_regexes: + patterns = [(f"keyword:{k}", re.escape(k)) for k in extra_keywords] + patterns += [(f"regex:{r}", r) for r in extra_regexes] + probes.append(RegexBucketProbe("Ad-hoc keywords", patterns)) + return probes + + +_DEDUPE_WINDOW = 50_000 + + +def _analyse_app(app_dir: Path, extra_keywords: list[str], extra_regexes: list[str]) -> dict: + probes = _build_probes(extra_keywords, extra_regexes) + total_lines = 0 + unique_lines = 0 + parsed_records = 0 + first_ts = "" + last_ts = "" + # Bounded sliding-window dedupe: capture overlap is between adjacent + # iterations, so a recent-window LRU catches all real duplicates without + # unbounded memory growth on long, high-volume runs. Capture-time cursor + # filtering already removes most dupes; this is the secondary safety net. + seen: set[str] = set() + seen_order: deque[str] = deque() + + for _source, raw_line in iter_lines(app_dir): + total_lines += 1 + line = _strip_ansi(raw_line).rstrip() + if not line or line in seen: + continue + seen.add(line) + seen_order.append(line) + if len(seen_order) > _DEDUPE_WINDOW: + seen.discard(seen_order.popleft()) + unique_lines += 1 + rec = _parse_record(line) + if rec is not None: + parsed_records += 1 + ts = _iso_seconds(rec, line) + level = "" + msg = "" + if rec: + level = str(rec.get("level") or "").lower() + msg = str(rec.get("msg") or rec.get("message") or "") + if ts: + if not first_ts or ts < first_ts: + first_ts = ts + if not last_ts or ts > last_ts: + last_ts = ts + for probe in probes: + try: + probe.feed(ts, line, rec, level, msg) + except Exception as exc: # don't let one bad line abort the whole run + print(f"warn: probe {probe.name} on {app_dir.name}: {exc}", file=sys.stderr) + + return { + "app": app_dir.name, + "total_lines": total_lines, + "unique_lines": unique_lines, + "parsed_records": parsed_records, + "first_seen": first_ts, + "last_seen": last_ts, + "probes": [p.report() for p in probes], + } + + +# --- Reporting ------------------------------------------------------------- + +def _fmt_window(d: dict) -> str: + first = d.get("first_seen") or "-" + last = d.get("last_seen") or "-" + peak = d.get("peak") or "-" + pc = d.get("peak_count") or 0 + return f"first={first} last={last} peak={peak}({pc})" + + +def _render_md(report: dict) -> str: + out: list[str] = [] + run = report["run"] + out.append(f"# Log analysis — {run}") + out.append("") + out.append(f"Generated: {report['generated_at']}") + if report.get("extra_keywords") or report.get("extra_regexes"): + out.append( + f"Ad-hoc terms: keywords={report.get('extra_keywords')} regexes={report.get('extra_regexes')}" + ) + out.append("") + + for app in report["apps"]: + out.append(f"## {app['app']}") + unique = app.get("unique_lines", app["total_lines"]) + out.append( + f"- lines: {app['total_lines']:,} captured / {unique:,} unique" + f" (parsed JSON: {app['parsed_records']:,})" + f" | window: {app.get('first_seen') or '-'} → {app.get('last_seen') or '-'}" + ) + for probe in app["probes"]: + out.append("") + out.append(f"### {probe['name']}") + if probe.get("note"): + out.append(f"_{probe['note']}_") + findings = probe.get("findings") + if findings is None and probe["name"] == "Latency": + if probe.get("samples"): + out.append( + f"- samples: {probe['samples']} | " + f"p50={probe['p50_ms']:.0f}ms p95={probe['p95_ms']:.0f}ms " + f"p99={probe['p99_ms']:.0f}ms max={probe['max_ms']:.0f}ms" + ) + if probe.get("slowest"): + out.append("") + out.append("Slowest:") + for s in probe["slowest"][:5]: + out.append( + f" - {s['duration_ms']:.0f}ms @ {s['timestamp']}: {s['line']}" + ) + continue + if probe["name"] == "Heartbeat": + out.append( + f"- median info/min: {probe.get('median_info_per_minute')} | " + f"window: {probe.get('first_minute')} → {probe.get('last_minute')}" + ) + gaps = probe.get("gap_minutes") or [] + if gaps: + out.append(f"- {len(gaps)} zero-info minute(s):") + for g in gaps[:10]: + out.append(f" - {g['minute']}") + continue + if not findings: + out.append("- (no matches)") + continue + for f in findings: + line = f"- {f['label']}: {f['count']} | {_fmt_window(f)}" + out.append(line) + for s in f.get("samples", [])[:1]: + out.append(f" sample: {s}") + out.append("") + return "\n".join(out) + + +def _select_app_dirs(run: Path, app_filter: list[str] | None) -> list[Path]: + out = [] + for child in sorted(run.iterdir()): + if not child.is_dir(): + continue + if app_filter and child.name not in app_filter: + continue + if (child / "raw").exists() or (child / "raw.zip").exists(): + out.append(child) + return out + + +def main() -> int: + p = argparse.ArgumentParser(description="Analyse captured Fly logs.") + p.add_argument("--run", default=None, help="'latest' (default), 'all', a date, or a run dir.") + p.add_argument("--root", default="logs", help="Logs root directory (default: logs).") + p.add_argument("--app", default="", help="Comma-separated app filter (default: all apps).") + p.add_argument("--keyword", action="append", default=[], help="Ad-hoc keyword (repeatable).") + p.add_argument("--regex", action="append", default=[], help="Ad-hoc regex (repeatable).") + p.add_argument("--out", default=None, help="Override output base path (writes .json + .md).") + p.add_argument("--stdout", action="store_true", help="Print markdown to stdout in addition to writing files.") + args = p.parse_args() + + root = Path(args.root) + runs = resolve_runs(args.run, root) + if not runs: + print(f"No runs matched under {root} (run={args.run!r})", file=sys.stderr) + return 1 + if len(runs) > 1 and args.out: + # `--out` shares a single base path across the loop below, so multiple + # runs would silently overwrite each other. Without `--out` each run + # writes to its own `/analysis.{md,json}` and multi-run selection + # is fine. + print( + f"Resolved {len(runs)} runs; --out requires a single run because " + f"combined output isn't implemented.", + file=sys.stderr, + ) + return 2 + + apps = [a.strip() for a in args.app.split(",") if a.strip()] or None + + rc = 0 + for run in runs: + app_dirs = _select_app_dirs(run, apps) + if not app_dirs: + print(f"No app dirs with raw logs under {run}", file=sys.stderr) + rc = rc or 1 + continue + app_reports = [_analyse_app(d, args.keyword, args.regex) for d in app_dirs] + report = { + "run": str(run.relative_to(root)), + "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "extra_keywords": args.keyword, + "extra_regexes": args.regex, + "apps": app_reports, + } + if args.out: + base = Path(args.out) + else: + base = run / "analysis" + base.parent.mkdir(parents=True, exist_ok=True) + json_path = base.with_suffix(".json") + md_path = base.with_suffix(".md") + json_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + md = _render_md(report) + md_path.write_text(md, encoding="utf-8") + print(f"wrote {md_path}") + print(f"wrote {json_path}") + if args.stdout: + print(md) + return rc + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + sys.exit(130) diff --git a/scripts/filter_since.py b/scripts/filter_since.py new file mode 100755 index 000000000..877695b8a --- /dev/null +++ b/scripts/filter_since.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +"""Filter Fly log lines from stdin against a per-app timestamp cursor. + +Keeps only lines whose leading ISO timestamp is strictly greater than the +cursor stored at the given path, then updates the cursor to the max timestamp +seen. Used by `logs.sh monitor` so overlapping `flyctl logs --no-tail` captures +only persist new lines per iteration. + +Lines without a parseable leading timestamp (typically multi-line stack-trace +continuations) are emitted only when the most recent timestamped header was +emitted. That preserves multi-line records when their header survives the +cursor while preventing stale continuation blocks from re-appearing in +overlapping `flyctl logs --no-tail` windows. +""" + +from __future__ import annotations + +import re +import sys +from pathlib import Path + +ANSI_RE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]") +LEADING_TS_RE = re.compile( + r"^\s*(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?)" +) + + +def _normalise(ts: str) -> str: + """Normalise to a lexicographically-comparable form (Z -> +00:00).""" + return ts.replace("Z", "+00:00") + + +def main() -> int: + if len(sys.argv) != 2: + print("usage: filter_since.py ", file=sys.stderr) + return 2 + + cursor_path = Path(sys.argv[1]) + cursor = "" + if cursor_path.exists(): + cursor = cursor_path.read_text(encoding="utf-8").strip() + + new_max = cursor + out = sys.stdout + # Multi-line records (e.g. Go panic stack traces) only stamp a timestamp on + # the first line. Track whether the most recent header was emitted so we + # don't re-persist continuation lines whose header was dropped by the + # cursor — that would reintroduce duplicates this filter exists to remove. + header_emitted = False + for line in sys.stdin: + clean = ANSI_RE.sub("", line) + m = LEADING_TS_RE.match(clean) + if not m: + if header_emitted: + out.write(line) + continue + ts = _normalise(m.group(1)) + if cursor and ts <= cursor: + header_emitted = False + continue + out.write(line) + header_emitted = True + if ts > new_max: + new_max = ts + + if new_max and new_max != cursor: + cursor_path.parent.mkdir(parents=True, exist_ok=True) + cursor_path.write_text(new_max, encoding="utf-8") + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + # Bash forwards SIGINT to every child in the pipe. Exit quietly so + # `logs.sh`'s own trap can write the final report without a traceback + # bleeding into the terminal. + sys.exit(130) diff --git a/scripts/logs.sh b/scripts/logs.sh new file mode 100755 index 000000000..d12d38462 --- /dev/null +++ b/scripts/logs.sh @@ -0,0 +1,744 @@ +#!/usr/bin/env bash + +# logs.sh — unified Fly log tool: monitor / search / analyse. +# +# logs.sh monitor [...] capture logs on a fixed cadence (was monitor_logs.sh) +# logs.sh search [...] grep across captured raw logs (zipped or live) +# logs.sh analyse [...] run probes, write analysis.md/json into a run dir +# +# All subcommands accept --help. + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +usage_top() { + cat <<'USAGE' +Usage: logs.sh [options] + +Commands: + monitor Capture Fly logs on a cadence and aggregate per-minute summaries. + search Grep captured raw logs by keyword/regex across one or more apps. + analyse Run pre-built probes (severity, panics, HTTP, latency, autoscaler, + DB, Sentry, heartbeat, ad-hoc) over a run; writes analysis.md/json. + +Run `logs.sh --help` for command-specific options. +USAGE +} + +# Probe a candidate interpreter and confirm it's Python 3.10+. The Python +# helpers use `itertools.pairwise` (3.10+) so 3.9 would import-fail at runtime. +_is_python3() { + "$@" -c "import sys; sys.exit(0 if sys.version_info >= (3, 10) else 1)" \ + >/dev/null 2>&1 +} + +# Locate a working Python 3.10+ interpreter shared by search/analyse helpers. +resolve_python() { + if command -v python3 >/dev/null 2>&1 && _is_python3 python3; then + PYTHON_CMD="python3" + PYTHON_ARGS=() + elif command -v python >/dev/null 2>&1 && _is_python3 python; then + PYTHON_CMD="python" + PYTHON_ARGS=() + elif command -v py >/dev/null 2>&1 && _is_python3 py -3; then + PYTHON_CMD="py" + PYTHON_ARGS=(-3) + else + echo "Python 3.10+ is required for this command but was not found." >&2 + exit 1 + fi +} + +cmd_search() { + resolve_python + exec env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \ + "$SCRIPT_DIR/search_logs.py" "$@" +} + +cmd_analyse() { + resolve_python + exec env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \ + "$SCRIPT_DIR/analyse_logs.py" "$@" +} + +generate_run_slug() { + # Deterministic-free, no-deps friendly slug — adjective-colour shape so + # concurrent runs are easy to distinguish at a glance in the logs/ tree. + local adjs=(grumpy happy lazy quick brave silent loud sleepy hungry tiny + spicy mellow plucky witty bright stormy frosty sunny rusty merry + gentle clumsy chatty curious eager fancy giddy nimble proud sturdy) + local colours=(orange purple sky river panda cobra falcon meadow ember hazel + crimson teal indigo amber slate olive rose mint coral cobalt + ivory ochre azure plum lilac mango onyx pearl sage saffron) + local seed=$(( ( $(date +%s) ^ $$ ) & 0x7FFFFFFF )) + local a=$(( seed % ${#adjs[@]} )) + local c=$(( (seed / ${#adjs[@]}) % ${#colours[@]} )) + echo "${adjs[$a]}-${colours[$c]}" +} + +# parse_duration "5m" -> 300; accepts plain seconds, Ns, Nm, Nh. +parse_duration() { + local v="$1" + if [[ "$v" =~ ^([0-9]+)$ ]]; then echo "${BASH_REMATCH[1]}"; return; fi + if [[ "$v" =~ ^([0-9]+)s$ ]]; then echo "${BASH_REMATCH[1]}"; return; fi + if [[ "$v" =~ ^([0-9]+)m$ ]]; then echo $(( ${BASH_REMATCH[1]} * 60 )); return; fi + if [[ "$v" =~ ^([0-9]+)h$ ]]; then echo $(( ${BASH_REMATCH[1]} * 3600 )); return; fi + echo "invalid duration: $v (use 30s, 5m, 1h, or plain seconds)" >&2 + exit 1 +} + +cmd_monitor() { + # Defaults; environment variables of the same name take precedence. + APP="${APP:-hover,hover-worker,hover-analysis,hover-autoscaler-worker,hover-autoscaler-analysis}" + INTERVAL="${INTERVAL:-3}" + SAMPLES="${SAMPLES:-400}" + ITERATIONS="${ITERATIONS:-1440}" # ~72 minutes at 3s intervals + RUN_ID="${RUN_ID:-}" + OUTPUT_ROOT="${OUTPUT_ROOT:-logs}" + CLEANUP_OLD="${CLEANUP_OLD:-true}" + CLEANUP_DAYS="${CLEANUP_DAYS:-1}" + CLEANUP_MODE="${CLEANUP_MODE:-zip}" + ANALYSE_EVERY="${ANALYSE_EVERY:-5m}" + PYTHON_CMD="" + PYTHON_ARGS=() + + monitor_usage() { + cat <<'USAGE' +Usage: logs.sh monitor [options] + +Fetch recent Fly logs on a fixed cadence, archive the raw output, and write +per-minute summaries describing how often each log level/message occurred. + +Automatic cleanup (enabled by default): + - Zips raw logs and iteration JSONs from runs older than 1 day + - Keeps summary.md, summary.json, and monitor.log + - Use --no-cleanup to disable or --cleanup-mode delete to remove everything + +Options: + --app NAMES Fly application name(s), comma-separated + (default: hover,hover-worker,hover-analysis, + hover-autoscaler-worker,hover-autoscaler-analysis) + --interval SECONDS Seconds to wait between samples (default: 3) + --samples N Number of log lines to request each run (default: 400) + --iterations N Number of iterations to perform (0 = run forever, + default: 1440 = ~72 minutes at 3s intervals) + --run-id ID Identifier used when naming output directories + (default: auto-generated - slug) + --analyse-every DUR Run analyse to write a snapshot every DUR (default: 5m). + Accepts plain seconds, Ns, Nm, or Nh. Use 0 to disable. + --no-cleanup Disable automatic cleanup (default: enabled) + --cleanup-days N Clean runs older than N days (default: 1) + --cleanup-mode MODE How to clean: 'zip' or 'delete' (default: zip) + zip: archives raw/ and iteration JSONs, keeps summaries + delete: removes entire run directory + -h, --help Show this message and exit + +Environment variables with the same names (APP, INTERVAL, SAMPLES, ITERATIONS, +RUN_ID) override the defaults as well. +USAGE + } + + require_value() { + # Bail on missing values for options that take an argument so a stray + # `logs.sh monitor --app` fails with a readable message instead of + # tripping `set -u` on the unbound `$2` expansion below. + if [[ $# -lt 2 || "$2" == -* ]]; then + echo "Missing value for $1" >&2 + monitor_usage + exit 2 + fi + } + + while [[ $# -gt 0 ]]; do + case "$1" in + --app) require_value "$@"; APP="$2"; shift 2 ;; + --interval) require_value "$@"; INTERVAL="$2"; shift 2 ;; + --samples) require_value "$@"; SAMPLES="$2"; shift 2 ;; + --iterations) require_value "$@"; ITERATIONS="$2"; shift 2 ;; + --run-id) require_value "$@"; RUN_ID="$2"; shift 2 ;; + --analyse-every) require_value "$@"; ANALYSE_EVERY="$2"; shift 2 ;; + --no-cleanup) CLEANUP_OLD=false; shift ;; + --cleanup-days) require_value "$@"; CLEANUP_DAYS="$2"; shift 2 ;; + --cleanup-mode) require_value "$@"; CLEANUP_MODE="$2"; shift 2 ;; + -h|--help) monitor_usage; exit 0 ;; + *) + echo "Unknown option: $1" >&2 + monitor_usage + exit 1 + ;; + esac + done + + if ! [[ "$INTERVAL" =~ ^[0-9]+$ && "$INTERVAL" -gt 0 ]]; then + echo "interval must be a positive integer" >&2 + exit 1 + fi + if ! [[ "$SAMPLES" =~ ^[0-9]+$ && "$SAMPLES" -ge 1 && "$SAMPLES" -le 10000 ]]; then + echo "samples must be an integer between 1 and 10000" >&2 + exit 1 + fi + if ! [[ "$ITERATIONS" =~ ^[0-9]+$ ]]; then + echo "iterations must be an integer >= 0" >&2 + exit 1 + fi + if ! [[ "$CLEANUP_DAYS" =~ ^[0-9]+$ && "$CLEANUP_DAYS" -ge 0 ]]; then + echo "cleanup-days must be a non-negative integer" >&2 + exit 1 + fi + if [[ "$CLEANUP_MODE" != "zip" && "$CLEANUP_MODE" != "delete" ]]; then + echo "cleanup-mode must be 'zip' or 'delete'" >&2 + exit 1 + fi + + IFS=',' read -r -a APPS <<< "$APP" + for i in "${!APPS[@]}"; do + APPS[i]="${APPS[i]// /}" + done + if [[ ${#APPS[@]} -eq 0 ]]; then + echo "at least one app name is required" >&2 + exit 1 + fi + # Reject every empty entry — `--app "hover,,worker"` or a trailing comma + # would otherwise create `$RUN_DIR//raw` and call `flyctl logs --app ""`. + for app_name in "${APPS[@]}"; do + if [[ -z "$app_name" ]]; then + echo "app list contains an empty value; check comma placement in --app/$APP" >&2 + exit 1 + fi + done + + if command -v python3 >/dev/null 2>&1 && _is_python3 python3; then + PYTHON_CMD="python3" + elif command -v python >/dev/null 2>&1 && _is_python3 python; then + PYTHON_CMD="python" + elif command -v py >/dev/null 2>&1 && _is_python3 py -3; then + PYTHON_CMD="py" + PYTHON_ARGS=(-3) + fi + + # Auto-generate settings suffix with appropriate units. + if [[ "$INTERVAL" -ge 60 ]]; then + INTERVAL_MINUTES=$(( INTERVAL / 60 )) + INTERVAL_STR="${INTERVAL_MINUTES}m" + else + INTERVAL_STR="${INTERVAL}s" + fi + + if [[ "$ITERATIONS" -eq 0 ]]; then + SETTINGS_SUFFIX="${INTERVAL_STR}_forever" + else + DURATION_SECONDS=$(( ITERATIONS * INTERVAL )) + if [[ "$DURATION_SECONDS" -ge 86400 ]]; then + DURATION_DAYS=$(( (DURATION_SECONDS + 43200) / 86400 )) + DURATION_STR="${DURATION_DAYS}d" + elif [[ "$DURATION_SECONDS" -ge 3600 ]]; then + DURATION_HOURS=$(( (DURATION_SECONDS + 1800) / 3600 )) + DURATION_STR="${DURATION_HOURS}h" + else + DURATION_MINUTES=$(( (DURATION_SECONDS + 30) / 60 )) + DURATION_STR="${DURATION_MINUTES}m" + fi + SETTINGS_SUFFIX="${INTERVAL_STR}_${DURATION_STR}" + fi + + # `--run-id` is interpolated into a filesystem path; reject anything that + # could escape the advertised `logs/YYYYMMDD/HHMM__/` + # layout (path separators, traversal segments, leading dot/dash). + if [[ -n "$RUN_ID" && ! "$RUN_ID" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; then + echo "run-id may only contain letters, numbers, dot, underscore, and hyphen, and may not start with '.' or '-'" >&2 + exit 1 + fi + if [[ -z "$RUN_ID" ]]; then + RUN_SLUG="$(generate_run_slug)" + else + RUN_SLUG="$RUN_ID" + fi + RUN_ID="${RUN_SLUG}_${SETTINGS_SUFFIX}" + + ANALYSE_EVERY_SECONDS=$(parse_duration "$ANALYSE_EVERY") + + DATE_DIR="$OUTPUT_ROOT/$(date +"%Y%m%d")" + TIME_PREFIX=$(date +"%H%M") + RUN_DIR="$DATE_DIR/${TIME_PREFIX}_${RUN_ID}" + LOG_FILE="$RUN_DIR/monitor.log" + + mkdir -p "$RUN_DIR" + for app in "${APPS[@]}"; do + mkdir -p "$RUN_DIR/$app/raw" + done + + # Output helpers — keep TTY tidy, monitor.log retains every event. + iso_ts() { date -u +"%Y-%m-%dT%H:%M:%SZ"; } + log_to_file() { echo "[$(iso_ts)] $*" >> "$LOG_FILE"; } + log_user() { + # Print a user-facing message and record it to the log too. + echo "$*" + echo "[$(iso_ts)] $*" >> "$LOG_FILE" + } + + USE_TICKER=false + if [[ -t 1 ]]; then USE_TICKER=true; fi + + # ANSI palette — empty when not on a TTY so non-TTY output stays plain and + # `monitor.log` writes (which use the *_plain variables) never carry codes. + if [[ "$USE_TICKER" == "true" ]]; then + C_BOLD=$'\033[1m' + # Bright-black foreground (a.k.a. grey) rather than \033[2m dim — many + # terminals (VS Code's included) render the dim attribute by altering + # the background, which produces a visible black box behind the text. + C_DIM=$'\033[90m' + C_CYAN=$'\033[36m' + C_GREEN=$'\033[32m' + C_YELLOW=$'\033[33m' + C_RESET=$'\033[0m' + else + C_BOLD="" C_DIM="" C_CYAN="" C_GREEN="" C_YELLOW="" C_RESET="" + fi + + emit_styled() { + # Print a styled line to stdout, plain text to the log. + local plain="$1" styled="$2" + if [[ "$USE_TICKER" == "true" ]]; then + printf "%s\n" "$styled" + else + echo "$plain" + fi + echo "[$(iso_ts)] $plain" >> "$LOG_FILE" + } + + fmt_duration() { + local s=$1 + if (( s < 60 )); then printf "%ds" "$s"; return; fi + if (( s < 3600 )); then printf "%dm %ds" $((s/60)) $((s%60)); return; fi + printf "%dh %dm" $((s/3600)) $(( (s%3600)/60 )) + } + ticker_done() { + if [[ "$USE_TICKER" == "true" ]]; then + printf "\n" + fi + } + + # ── Animated ticker ────────────────────────────────────────────────── + # The ticker line is redrawn at 5Hz by a background process while the + # main loop captures logs. State is shared via a small file the main + # loop writes once per iteration; the animator reads it on every redraw + # so the spinner, elapsed time, and snapshot countdown all keep ticking + # even while flyctl calls are in flight. + + TICKER_STATE_FILE="$RUN_DIR/.ticker_state" + TICKER_ANIMATOR_PID="" + + write_ticker_state() { + # Single line: + # iter iter_max start_epoch last_analyse_epoch analyse_every captured_total + printf '%d %d %d %d %d %d\n' \ + "$iteration" "$ITERATIONS" "$start_epoch" "$last_analyse_epoch" \ + "$ANALYSE_EVERY_SECONDS" "$CAPTURED_TOTAL" \ + > "$TICKER_STATE_FILE" + } + + ticker_animator() { + local idx=0 + local frames=(◐ ◓ ◑ ◒) + local n=${#frames[@]} + local iter_num iter_max s_epoch a_epoch a_every captured + local now elapsed elapsed_fmt iter_styled iter_max_part snap_part until_snap snap_fmt spin + while [[ -f "$TICKER_STATE_FILE" ]]; do + if read -r iter_num iter_max s_epoch a_epoch a_every captured \ + < "$TICKER_STATE_FILE" 2>/dev/null \ + && [[ -n "${iter_num:-}" ]]; then + now=$(date +%s) + elapsed=$(( now - s_epoch )) + elapsed_fmt=$(fmt_duration $elapsed) + if [[ "$iter_max" -gt 0 ]]; then + iter_max_part=" / ${iter_max}" + else + iter_max_part="" + fi + if [[ "$a_every" -gt 0 ]]; then + until_snap=$(( a_every - (now - a_epoch) )) + (( until_snap < 0 )) && until_snap=0 + snap_fmt=$(fmt_duration $until_snap) + snap_part=" ${C_DIM}-${C_RESET} ${C_DIM}next snapshot ${snap_fmt}${C_RESET}" + else + snap_part="" + fi + spin="${frames[$idx]}" + printf "\r\033[K ${C_BOLD}${C_CYAN}%s${C_RESET} ${C_BOLD}${C_CYAN}%s${C_RESET} ${C_DIM}-${C_RESET} ${C_BOLD}${C_CYAN}%s${C_RESET}%s ${C_DIM}-${C_RESET} ${C_BOLD}${C_CYAN}%s${C_RESET} logs%s" \ + "$spin" "$elapsed_fmt" "$iter_num" "$iter_max_part" "$captured" "$snap_part" + fi + idx=$(( (idx + 1) % n )) + sleep 0.2 + done + } + + stop_ticker_animator() { + if [[ -n "$TICKER_ANIMATOR_PID" ]]; then + kill "$TICKER_ANIMATOR_PID" 2>/dev/null || true + wait "$TICKER_ANIMATOR_PID" 2>/dev/null || true + TICKER_ANIMATOR_PID="" + fi + rm -f "$TICKER_STATE_FILE" + } + + STOP_REQUESTED=false + on_interrupt() { + STOP_REQUESTED=true + stop_ticker_animator + ticker_done + emit_styled \ + "Stop requested — finishing up..." \ + "${C_BOLD}${C_YELLOW}Stop requested${C_RESET} — final iteration & report..." + } + trap on_interrupt INT TERM + + # Sleep but watch stdin for `q` so the user has a clean alternative to + # Ctrl+C. Any other keystroke is ignored (we keep polling). Falls back to + # a plain sleep when stdin isn't an interactive TTY (CI, redirected input). + poll_quit_or_sleep() { + local duration=$1 + if [[ "$USE_TICKER" != "true" || ! -t 0 ]]; then + sleep "$duration" || true + return + fi + local target=$(( $(date +%s) + duration )) + local key="" + while (( $(date +%s) < target )); do + local rem=$(( target - $(date +%s) )) + (( rem < 1 )) && rem=1 + if read -rsn1 -t "$rem" key 2>/dev/null; then + case "$key" in + q|Q) + STOP_REQUESTED=true + stop_ticker_animator + ticker_done + emit_styled \ + "Stop requested (q) — final iteration & report..." \ + "${C_BOLD}${C_YELLOW}Stop requested${C_RESET} (q) — final iteration & report..." + return + ;; + *) ;; # ignore stray keystrokes, keep polling + esac + fi + done + } + + # Cleanup is now silent on TTY (recorded in monitor.log only) — it ran on + # almost every invocation and dominated the startup banner. + if [[ "$CLEANUP_OLD" == "true" ]]; then + log_to_file "Cleaning up old runs (older than $CLEANUP_DAYS days, mode: $CLEANUP_MODE)" + if [[ "$(uname)" == "Darwin" ]]; then + CUTOFF_DATE=$(date -u -v-${CLEANUP_DAYS}d +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d") + else + CUTOFF_DATE=$(date -u -d "$CLEANUP_DAYS days ago" +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d") + fi + if [[ -d "$OUTPUT_ROOT" ]]; then + find "$OUTPUT_ROOT" -mindepth 2 -maxdepth 2 -type d | while read -r run_dir; do + date_dir=$(basename "$(dirname "$run_dir")") + if ! [[ "$date_dir" =~ ^[0-9]{8}$ ]]; then continue; fi + if [[ "$date_dir" -ge "$CUTOFF_DATE" ]]; then continue; fi + run_name=$(basename "$run_dir") + if [[ "$CLEANUP_MODE" == "zip" ]]; then + while IFS= read -r raw_dir; do + [[ -z "$raw_dir" ]] && continue + zip_parent=$(dirname "$raw_dir") + [[ -f "$zip_parent/raw.zip" ]] && continue + rel=${raw_dir#"$run_dir/"} + log_to_file " Zipping raw logs: $date_dir/$run_name/$rel" + (cd "$zip_parent" && zip -q -r "raw.zip" "raw" && rm -rf "raw") || \ + log_to_file " Failed to zip raw directory $raw_dir" + done < <(find "$run_dir" -type d -name raw 2>/dev/null) + # Null-delimited so a `--run-id` containing spaces (or any + # other whitespace) doesn't split filenames mid-token. + if find "$run_dir" -type f -name '*_iter*.json' -print -quit 2>/dev/null | grep -q .; then + log_to_file " Removing iteration JSONs: $date_dir/$run_name" + find "$run_dir" -type f -name '*_iter*.json' -print0 2>/dev/null | xargs -0 rm -f || \ + log_to_file " Failed to remove iteration JSONs in $run_dir" + fi + else + log_to_file " Deleting: $date_dir/$run_name" + rm -rf "$run_dir" || log_to_file " Failed to delete $run_dir" + fi + done + fi + log_to_file "Cleanup complete" + fi + + # Compact startup banner: run dir, app list, and one settings line. + if [[ "$ITERATIONS" -gt 0 ]]; then + DURATION_HINT=" (~$(fmt_duration $((ITERATIONS * INTERVAL))))" + else + DURATION_HINT=" (forever)" + fi + # Comma-space join. `${APPS[*]}` only honours the first IFS char, so build + # the joined string explicitly to keep the space after each comma. + printf -v APPS_JOINED '%s, ' "${APPS[@]}" + APPS_JOINED="${APPS_JOINED%, }" + if [[ "$ANALYSE_EVERY_SECONDS" -gt 0 ]]; then + SNAP_HINT="every $ANALYSE_EVERY" + else + SNAP_HINT="disabled" + fi + + # Rule + key/value helpers for the bracketed banner / ticker / done layout. + # All rules share one width so they line up; separate BANNER_WIDTH governs + # apps-list wrapping so a narrow rule doesn't force aggressive wraps. + RULE_WIDTH=60 + BANNER_WIDTH=60 + rule_chars() { printf '─%.0s' $(seq 1 "$1"); } + + print_top_rule() { + local label="$1" + local pad=$(( RULE_WIDTH - 4 - ${#label} )) + (( pad < 0 )) && pad=0 + local trail + trail=$(rule_chars "$pad") + if [[ "$USE_TICKER" == "true" ]]; then + printf "${C_DIM}── ${C_RESET}${C_BOLD}${C_CYAN}%s${C_RESET}${C_DIM} %s${C_RESET}\n" "$label" "$trail" + else + printf -- "── %s %s\n" "$label" "$trail" + fi + echo "[$(iso_ts)] ── $label ──" >> "$LOG_FILE" + } + print_rule() { + local line + line=$(rule_chars "$RULE_WIDTH") + if [[ "$USE_TICKER" == "true" ]]; then + printf "${C_DIM}%s${C_RESET}\n" "$line" + else + printf "%s\n" "$line" + fi + echo "[$(iso_ts)] $line" >> "$LOG_FILE" + } + print_kv() { + # Indented key/value line. Key in default weight; value dimmed (grey) + # so the eye lands on the slug + ticker, not the static config block. + # 3-space indent so the key column aligns with the slug label in the + # top rule (which sits at column 3, after `── `). + local key="$1" value="$2" + if [[ "$USE_TICKER" == "true" ]]; then + printf " %-10s ${C_DIM}%s${C_RESET}\n" "$key" "$value" + else + printf " %-10s %s\n" "$key" "$value" + fi + echo "[$(iso_ts)] $key: $value" >> "$LOG_FILE" + } + print_kv_wrapped() { + # Like print_kv but wraps `value` at word boundaries onto continuation + # lines indented to align under the value column. + local key="$1" value="$2" + local cont_indent=" " # 3 + 10 + 2 = 15 spaces + local wrap_width=$(( BANNER_WIDTH - 15 )) + local first=true + while IFS= read -r line; do + if $first; then + print_kv "$key" "$line" + first=false + else + if [[ "$USE_TICKER" == "true" ]]; then + printf "%s${C_DIM}%s${C_RESET}\n" "$cont_indent" "$line" + else + printf "%s%s\n" "$cont_indent" "$line" + fi + echo "[$(iso_ts)] $line" >> "$LOG_FILE" + fi + done < <(printf "%s\n" "$value" | fold -s -w "$wrap_width") + } + + print_top_rule "$RUN_SLUG" + print_kv "Run" "$RUN_DIR" + print_kv_wrapped "Apps" "$APPS_JOINED" + print_kv "Interval" "${INTERVAL}s" + print_kv "Samples" "${SAMPLES} lines/fetch" + print_kv "Iterations" "${ITERATIONS}${DURATION_HINT}" + print_kv "Snapshots" "$SNAP_HINT" + if [[ "$USE_TICKER" == "true" && -t 0 ]]; then + print_kv "Quit" "press q or Ctrl/CMD + C " + fi + print_rule + if [[ -z "$PYTHON_CMD" ]]; then + log_user "Python not found; continuing with raw log capture only" + fi + + capture_app() { + local app="$1" ts="$2" iter="$3" + local raw_file="$RUN_DIR/$app/raw/${ts}_iter${iter}.log" + local summary_file="$RUN_DIR/$app/${ts}_iter${iter}.json" + local cursor_file="$RUN_DIR/$app/.cursor" + + # `flyctl logs --no-tail` returns the same recent window every call, so + # filter against a per-app cursor to keep only lines newer than the + # last iteration. Falls back to the unfiltered capture when Python is + # unavailable. + if [[ -n "$PYTHON_CMD" ]]; then + if ! flyctl logs --app "$app" --no-tail 2>&1 \ + | tail -n "$SAMPLES" \ + | env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \ + "$SCRIPT_DIR/filter_since.py" "$cursor_file" \ + > "$raw_file"; then + log_to_file "[$app] Failed to fetch logs from Fly; raw output stored in $raw_file" + return + fi + else + if ! flyctl logs --app "$app" --no-tail 2>&1 | tail -n "$SAMPLES" > "$raw_file"; then + log_to_file "[$app] Failed to fetch logs from Fly; raw output stored in $raw_file" + return + fi + fi + + if [[ ! -s "$raw_file" ]]; then + return + fi + if [[ -z "$PYTHON_CMD" ]]; then + log_to_file "[$app] Captured raw logs only (Python unavailable)" + return + fi + if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/process_logs.py" "$raw_file" "$summary_file" >> "$LOG_FILE" 2>&1; then + log_to_file "[$app] Failed to process logs (see output above)" + return + fi + if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1; then + log_to_file "[$app] Failed to aggregate logs (see output above)" + fi + } + + run_snapshot_analyse() { + if [[ -z "$PYTHON_CMD" ]]; then return; fi + local snap_ts + snap_ts=$(date -u +"%H%M%SZ") + mkdir -p "$RUN_DIR/snapshots" + local run_ref="$(basename "$DATE_DIR")/$(basename "$RUN_DIR")" + log_to_file "Snapshot analyse → $RUN_DIR/snapshots/analysis_${snap_ts}.md" + env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \ + "$SCRIPT_DIR/analyse_logs.py" \ + --root "$OUTPUT_ROOT" \ + --run "$run_ref" \ + --out "$RUN_DIR/snapshots/analysis_${snap_ts}" >> "$LOG_FILE" 2>&1 || \ + log_to_file "Snapshot analyse failed (see log)" + } + + iteration=0 + start_epoch=$(date +%s) + last_analyse_epoch=$start_epoch + CAPTURED_TOTAL=0 + + # Kick off the background animator before the first capture so the + # ticker line is alive from t=0. + if [[ "$USE_TICKER" == "true" ]]; then + write_ticker_state + ticker_animator & + TICKER_ANIMATOR_PID=$! + fi + + while true; do + iteration=$((iteration + 1)) + iter_start_epoch=$(date +%s) + ts=$(date -u +"%Y%m%dT%H%M%SZ") + log_to_file "Iteration $iteration: capturing logs" + + # Capture all apps in parallel — flyctl calls are independent, and + # running them sequentially made each iteration ~5-10s instead of the + # advertised INTERVAL. Track each PID explicitly so we wait only for + # the capture children — bare `wait` would also block on the ticker + # animator, which is an intentional infinite loop. + local capture_pids=() + for app in "${APPS[@]}"; do + capture_app "$app" "$ts" "$iteration" & + capture_pids+=($!) + done + for pid in "${capture_pids[@]}"; do + wait "$pid" 2>/dev/null || true + done + + # Tally lines persisted this iteration (cursor-filtered, so reflects + # genuinely new log lines, not the flyctl --no-tail window). + for app in "${APPS[@]}"; do + raw="$RUN_DIR/$app/raw/${ts}_iter${iteration}.log" + if [[ -f "$raw" ]]; then + n=$(wc -l < "$raw" 2>/dev/null | tr -d ' ') + CAPTURED_TOTAL=$(( CAPTURED_TOTAL + ${n:-0} )) + fi + done + + if [[ "$ANALYSE_EVERY_SECONDS" -gt 0 ]]; then + now_epoch=$(date +%s) + if (( now_epoch - last_analyse_epoch >= ANALYSE_EVERY_SECONDS )); then + run_snapshot_analyse + last_analyse_epoch=$now_epoch + fi + fi + + # Hand the latest counters to the animator; it picks them up on its + # next 200ms redraw. + write_ticker_state + log_to_file "iter ${iteration}/${ITERATIONS} · elapsed $(fmt_duration $(( $(date +%s) - start_epoch )))" + + if [[ "$STOP_REQUESTED" == "true" ]]; then break; fi + if [[ "$ITERATIONS" -ne 0 && "$iteration" -ge "$ITERATIONS" ]]; then break; fi + + # `--interval` is the wall-clock period between iteration starts; if + # capture+analyse took longer than INTERVAL we run back-to-back (and + # log a warning) instead of compounding the lag. + iter_elapsed=$(( $(date +%s) - iter_start_epoch )) + remaining=$(( INTERVAL - iter_elapsed )) + if (( remaining > 0 )); then + poll_quit_or_sleep "$remaining" + else + log_to_file "Iteration $iteration took ${iter_elapsed}s (>= interval ${INTERVAL}s); no sleep" + fi + if [[ "$STOP_REQUESTED" == "true" ]]; then break; fi + done + + stop_ticker_animator + ticker_done + trap - INT TERM + + if [[ -z "$PYTHON_CMD" ]]; then + log_user "Skipping aggregation (Python unavailable)" + else + log_to_file "Running final aggregation..." + for app in "${APPS[@]}"; do + if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1; then + log_to_file "[$app] Final aggregation failed (see output above)" + fi + done + log_to_file "Aggregation complete" + + log_to_file "Running final analyse..." + run_ref="$(basename "$DATE_DIR")/$(basename "$RUN_DIR")" + env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \ + "$SCRIPT_DIR/analyse_logs.py" \ + --root "$OUTPUT_ROOT" \ + --run "$run_ref" >> "$LOG_FILE" 2>&1 || \ + log_to_file "Final analyse failed (see log)" + print_rule + emit_styled \ + " ✓ Done after $iteration iteration(s)" \ + " ${C_BOLD}${C_GREEN}✓ Done${C_RESET} after ${C_BOLD}${C_CYAN}$iteration${C_RESET} iteration(s)" + emit_styled \ + " Report: $RUN_DIR/analysis.md" \ + " ${C_BOLD}Report:${C_RESET} $RUN_DIR/analysis.md" + fi +} + +# Default subcommand is `monitor`. Bare `logs.sh`, or any invocation whose +# first positional starts with a dash (i.e. a flag, not a subcommand), runs +# monitor with the supplied flags. +if [[ $# -eq 0 ]]; then + cmd_monitor + exit 0 +fi + +case "$1" in + -h|--help|help) usage_top; exit 0 ;; + monitor) shift; cmd_monitor "$@" ;; + search) shift; cmd_search "$@" ;; + analyse|analyze) shift; cmd_analyse "$@" ;; + -*) cmd_monitor "$@" ;; + *) + echo "Unknown command: $1" >&2 + usage_top + exit 1 + ;; +esac diff --git a/scripts/monitor_logs.sh b/scripts/monitor_logs.sh index 6765d57e9..162f39f26 100755 --- a/scripts/monitor_logs.sh +++ b/scripts/monitor_logs.sh @@ -1,337 +1,9 @@ #!/usr/bin/env bash +# Back-compat shim — the canonical entry point is `logs.sh`. +# All flags accepted by this script are forwarded to `logs.sh monitor`. + set -euo pipefail SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) - -APP="hover,hover-worker,hover-analysis" -INTERVAL=10 -SAMPLES=400 -ITERATIONS=1440 # 4 hours at 10s intervals -RUN_ID="" -OUTPUT_ROOT="logs" -CLEANUP_OLD=true -CLEANUP_DAYS=1 -CLEANUP_MODE="zip" -PYTHON_CMD="" -PYTHON_ARGS=() - -usage() { - cat <<'USAGE' -Usage: monitor_logs.sh [options] - -Fetch recent Fly logs on a fixed cadence, archive the raw output, and write -per-minute summaries describing how often each log level/message occurred. - -Automatic cleanup (enabled by default): - - Zips raw logs and iteration JSONs from runs older than 1 day - - Keeps summary.md, summary.json, and monitor.log - - Use --no-cleanup to disable or --cleanup-mode delete to remove everything - -Options: - --app NAMES Fly application name(s), comma-separated - (default: hover,hover-worker,hover-analysis) - --interval SECONDS Seconds to wait between samples (default: 60) - --samples N Number of log lines to request each run (default: 400) - --iterations N Number of iterations to perform (0 = run forever) - --run-id ID Identifier used when naming output directories - --no-cleanup Disable automatic cleanup (default: enabled) - --cleanup-days N Clean runs older than N days (default: 1) - --cleanup-mode MODE How to clean: 'zip' or 'delete' (default: zip) - zip: archives raw/ and iteration JSONs, keeps summaries - delete: removes entire run directory - -h, --help Show this message and exit - -Environment variables with the same names (APP, INTERVAL, SAMPLES, ITERATIONS, -RUN_ID) override the defaults as well. - -Examples: - # Default: auto-zip raw logs from runs older than 1 day - ./monitor_logs.sh - - # Disable cleanup - ./monitor_logs.sh --no-cleanup - - # Delete entire runs older than 3 days - ./monitor_logs.sh --cleanup-days 3 --cleanup-mode delete -USAGE -} - -# Allow environment variables to override defaults -APP=${APP:-$APP} -INTERVAL=${INTERVAL:-$INTERVAL} -SAMPLES=${SAMPLES:-$SAMPLES} -ITERATIONS=${ITERATIONS:-$ITERATIONS} -RUN_ID=${RUN_ID:-$RUN_ID} - -while [[ $# -gt 0 ]]; do - case "$1" in - --app) - APP="$2" - shift 2 - ;; - --interval) - INTERVAL="$2" - shift 2 - ;; - --samples) - SAMPLES="$2" - shift 2 - ;; - --iterations) - ITERATIONS="$2" - shift 2 - ;; - --run-id) - RUN_ID="$2" - shift 2 - ;; - --no-cleanup) - CLEANUP_OLD=false - shift - ;; - --cleanup-days) - CLEANUP_DAYS="$2" - shift 2 - ;; - --cleanup-mode) - CLEANUP_MODE="$2" - shift 2 - ;; - -h|--help) - usage - exit 0 - ;; - *) - echo "Unknown option: $1" >&2 - usage - exit 1 - ;; - esac -done - -if ! [[ "$INTERVAL" =~ ^[0-9]+$ && "$INTERVAL" -gt 0 ]]; then - echo "interval must be a positive integer" >&2 - exit 1 -fi - -if ! [[ "$SAMPLES" =~ ^[0-9]+$ && "$SAMPLES" -ge 1 && "$SAMPLES" -le 10000 ]]; then - echo "samples must be an integer between 1 and 10000" >&2 - exit 1 -fi - -if ! [[ "$ITERATIONS" =~ ^[0-9]+$ ]]; then - echo "iterations must be an integer >= 0" >&2 - exit 1 -fi - -if ! [[ "$CLEANUP_DAYS" =~ ^[0-9]+$ && "$CLEANUP_DAYS" -ge 0 ]]; then - echo "cleanup-days must be a non-negative integer" >&2 - exit 1 -fi - -if [[ "$CLEANUP_MODE" != "zip" && "$CLEANUP_MODE" != "delete" ]]; then - echo "cleanup-mode must be 'zip' or 'delete'" >&2 - exit 1 -fi - -# Split APP into an array on commas (and strip whitespace around entries). -IFS=',' read -r -a APPS <<< "$APP" -for i in "${!APPS[@]}"; do - APPS[i]="${APPS[i]// /}" -done -if [[ ${#APPS[@]} -eq 0 || -z "${APPS[0]}" ]]; then - echo "at least one app name is required" >&2 - exit 1 -fi - -# Resolve a working Python interpreter for log processing helpers. -if command -v python3 >/dev/null 2>&1 && python3 -c "import sys" >/dev/null 2>&1; then - PYTHON_CMD="python3" -elif command -v python >/dev/null 2>&1 && python -c "import sys" >/dev/null 2>&1; then - PYTHON_CMD="python" -elif command -v py >/dev/null 2>&1 && py -3 -c "import sys" >/dev/null 2>&1; then - PYTHON_CMD="py" - PYTHON_ARGS=(-3) -fi - -# Auto-generate settings suffix with appropriate units -# Interval: use minutes if >= 60s, otherwise seconds -if [[ "$INTERVAL" -ge 60 ]]; then - INTERVAL_MINUTES=$(( INTERVAL / 60 )) - INTERVAL_STR="${INTERVAL_MINUTES}m" -else - INTERVAL_STR="${INTERVAL}s" -fi - -if [[ "$ITERATIONS" -eq 0 ]]; then - SETTINGS_SUFFIX="${INTERVAL_STR}_forever" -else - # Calculate total duration in seconds - DURATION_SECONDS=$(( ITERATIONS * INTERVAL )) - - # Duration: use days if >= 24h, hours if >= 60m, otherwise minutes - if [[ "$DURATION_SECONDS" -ge 86400 ]]; then - DURATION_DAYS=$(( (DURATION_SECONDS + 43200) / 86400 )) - DURATION_STR="${DURATION_DAYS}d" - elif [[ "$DURATION_SECONDS" -ge 3600 ]]; then - DURATION_HOURS=$(( (DURATION_SECONDS + 1800) / 3600 )) - DURATION_STR="${DURATION_HOURS}h" - else - DURATION_MINUTES=$(( (DURATION_SECONDS + 30) / 60 )) - DURATION_STR="${DURATION_MINUTES}m" - fi - - SETTINGS_SUFFIX="${INTERVAL_STR}_${DURATION_STR}" -fi - -# Combine custom name (if provided) with settings -if [[ -z "$RUN_ID" ]]; then - RUN_ID="$SETTINGS_SUFFIX" -else - RUN_ID="${RUN_ID}_${SETTINGS_SUFFIX}" -fi - -# Create directory structure: logs/YYYYMMDD/HHMM_run-id/ -DATE_DIR="$OUTPUT_ROOT/$(date +"%Y%m%d")" -TIME_PREFIX=$(date +"%H%M") -RUN_DIR="$DATE_DIR/${TIME_PREFIX}_${RUN_ID}" -LOG_FILE="$RUN_DIR/monitor.log" - -mkdir -p "$RUN_DIR" -for app in "${APPS[@]}"; do - mkdir -p "$RUN_DIR/$app/raw" -done - -# Cleanup old runs if requested -if [[ "$CLEANUP_OLD" == "true" ]]; then - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Cleaning up old runs (older than $CLEANUP_DAYS days, mode: $CLEANUP_MODE)" | tee -a "$LOG_FILE" - - # Calculate cutoff date (days ago) - if [[ "$(uname)" == "Darwin" ]]; then - # macOS date command - CUTOFF_DATE=$(date -u -v-${CLEANUP_DAYS}d +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d") - else - # GNU date command - CUTOFF_DATE=$(date -u -d "$CLEANUP_DAYS days ago" +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d") - fi - - # Find old run directories - if [[ -d "$OUTPUT_ROOT" ]]; then - find "$OUTPUT_ROOT" -mindepth 2 -maxdepth 2 -type d | while read -r run_dir; do - # Extract date from path (logs/YYYYMMDD/HHMM_run-id) - date_dir=$(basename "$(dirname "$run_dir")") - - # Skip if not a date directory - if ! [[ "$date_dir" =~ ^[0-9]{8}$ ]]; then - continue - fi - - # Skip if directory is from today or newer than cutoff - if [[ "$date_dir" -ge "$CUTOFF_DATE" ]]; then - continue - fi - - run_name=$(basename "$run_dir") - - if [[ "$CLEANUP_MODE" == "zip" ]]; then - # Zip mode: archive any raw/ dirs and iteration JSONs, keep summaries. - # Handles both legacy layout ($run_dir/raw) and per-app layout - # ($run_dir//raw). - while IFS= read -r raw_dir; do - [[ -z "$raw_dir" ]] && continue - zip_parent=$(dirname "$raw_dir") - [[ -f "$zip_parent/raw.zip" ]] && continue - rel=${raw_dir#"$run_dir/"} - echo " Zipping raw logs: $date_dir/$run_name/$rel" | tee -a "$LOG_FILE" - (cd "$zip_parent" && zip -q -r "raw.zip" "raw" && rm -rf "raw") || { - echo " Failed to zip raw directory $raw_dir" | tee -a "$LOG_FILE" - } - done < <(find "$run_dir" -type d -name raw 2>/dev/null) - - # Remove iteration JSONs anywhere under the run dir (keep summaries). - iter_files=$(find "$run_dir" -type f -name '*_iter*.json' 2>/dev/null) - if [[ -n "$iter_files" ]]; then - echo " Removing iteration JSONs: $date_dir/$run_name" | tee -a "$LOG_FILE" - printf '%s\n' "$iter_files" | xargs rm -f || { - echo " Failed to remove iteration JSONs in $run_dir" | tee -a "$LOG_FILE" - } - fi - else - # Delete mode: remove entire run directory - echo " Deleting: $date_dir/$run_name" | tee -a "$LOG_FILE" - rm -rf "$run_dir" || { - echo " Failed to delete $run_dir" | tee -a "$LOG_FILE" - } - fi - done - fi - - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Cleanup complete" | tee -a "$LOG_FILE" -fi - -echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Starting log monitor" | tee -a "$LOG_FILE" -echo "Apps: ${APPS[*]} | Interval: ${INTERVAL}s | Samples: $SAMPLES | Iterations: $ITERATIONS" | tee -a "$LOG_FILE" -echo "Run directory: $RUN_DIR" | tee -a "$LOG_FILE" -for app in "${APPS[@]}"; do - echo " [$app] raw: $RUN_DIR/$app/raw summaries: $RUN_DIR/$app" | tee -a "$LOG_FILE" -done -if [[ -z "$PYTHON_CMD" ]]; then - echo "Python not found; continuing with raw log capture only" | tee -a "$LOG_FILE" -fi - -# capture_app APP_NAME TIMESTAMP ITER — fetch + process logs for a single app. -capture_app() { - local app="$1" ts="$2" iter="$3" - local raw_file="$RUN_DIR/$app/raw/${ts}_iter${iter}.log" - local summary_file="$RUN_DIR/$app/${ts}_iter${iter}.json" - - if ! flyctl logs --app "$app" --no-tail 2>&1 | tail -n "$SAMPLES" > "$raw_file"; then - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] [$app] Failed to fetch logs from Fly; raw output stored in $raw_file" | tee -a "$LOG_FILE" - return - fi - - if [[ -z "$PYTHON_CMD" ]]; then - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] [$app] Captured raw logs only (Python unavailable)" | tee -a "$LOG_FILE" - return - fi - - if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/process_logs.py" "$raw_file" "$summary_file" >> "$LOG_FILE" 2>&1; then - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] [$app] Failed to process logs (see output above)" | tee -a "$LOG_FILE" - return - fi - - env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1 -} - -iteration=0 - -while true; do - iteration=$((iteration + 1)) - ts=$(date -u +"%Y%m%dT%H%M%SZ") - - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Iteration $iteration: capturing logs" | tee -a "$LOG_FILE" - - for app in "${APPS[@]}"; do - capture_app "$app" "$ts" "$iteration" - done - - if [[ "$ITERATIONS" -ne 0 && "$iteration" -ge "$ITERATIONS" ]]; then - break - fi - - sleep "$INTERVAL" -done - -echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Monitoring finished after $iteration iteration(s)" | tee -a "$LOG_FILE" - -# Final aggregation per app -if [[ -z "$PYTHON_CMD" ]]; then - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Skipping aggregation (Python unavailable)" | tee -a "$LOG_FILE" -else - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Running final aggregation..." | tee -a "$LOG_FILE" - for app in "${APPS[@]}"; do - env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1 - done - echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Aggregation complete" | tee -a "$LOG_FILE" -fi +exec "$SCRIPT_DIR/logs.sh" monitor "$@" diff --git a/scripts/process_logs.py b/scripts/process_logs.py index 7789a6f26..9180b0759 100755 --- a/scripts/process_logs.py +++ b/scripts/process_logs.py @@ -190,4 +190,7 @@ def main() -> int: if __name__ == "__main__": - sys.exit(main()) + try: + sys.exit(main()) + except KeyboardInterrupt: + sys.exit(130) diff --git a/scripts/search_logs.py b/scripts/search_logs.py new file mode 100755 index 000000000..eecb27079 --- /dev/null +++ b/scripts/search_logs.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +"""Search captured Fly logs by keyword/regex across one or more apps. + +Reads raw captures from a `logs/YYYYMMDD/HHMM_//raw/` directory or +the zipped `raw.zip` produced by monitor cleanup. Prints matching lines with a +source prefix and a per-app/per-run match count summary on stderr. +""" + +from __future__ import annotations + +import argparse +import re +import sys +import zipfile +from pathlib import Path +from typing import Iterator + + +def _candidate_run_dirs(root: Path) -> list[Path]: + """Return run directories under `logs/YYYYMMDD/HHMM_*` sorted oldest -> newest.""" + out: list[Path] = [] + if not root.exists(): + return out + for date_dir in sorted(root.iterdir()): + if not date_dir.is_dir() or len(date_dir.name) != 8 or not date_dir.name.isdigit(): + continue + for run in sorted(date_dir.iterdir()): + if run.is_dir(): + out.append(run) + return out + + +def resolve_runs(run_arg: str | None, root: Path) -> list[Path]: + """Resolve the --run argument to one or more run directories. + + None / 'latest' -> [most recent run] + 'all' -> every run + 'YYYYMMDD' -> runs under that date + other -> prefix match against `/` or run name + """ + runs = _candidate_run_dirs(root) + if not runs: + return [] + if run_arg in (None, "", "latest"): + return [runs[-1]] + if run_arg == "all": + return runs + target = run_arg.strip("/") + matched: list[Path] = [] + for r in runs: + rel = str(r.relative_to(root)) + if rel == target or rel.startswith(target + "/"): + matched.append(r) + continue + if r.parent.name == target or r.name == target or r.name.startswith(target): + matched.append(r) + return matched + + +def _iter_app_dirs(run_dir: Path, app_filter: list[str] | None) -> Iterator[Path]: + for child in sorted(run_dir.iterdir()): + if not child.is_dir(): + continue + if app_filter and child.name not in app_filter: + continue + if (child / "raw").exists() or (child / "raw.zip").exists(): + yield child + + +def iter_lines(app_dir: Path) -> Iterator[tuple[str, str]]: + """Yield (source_label, line) for every captured raw log line in an app dir.""" + raw_dir = app_dir / "raw" + if raw_dir.exists(): + for log in sorted(raw_dir.glob("*.log")): + with log.open("r", encoding="utf-8", errors="ignore") as f: + for line in f: + yield (log.name, line.rstrip("\n")) + raw_zip = app_dir / "raw.zip" + if raw_zip.exists(): + with zipfile.ZipFile(raw_zip) as zf: + for info in sorted(zf.infolist(), key=lambda i: i.filename): + if info.is_dir() or not info.filename.endswith(".log"): + continue + with zf.open(info) as f: + for raw_line in f: + yield ( + Path(info.filename).name, + raw_line.decode("utf-8", errors="ignore").rstrip("\n"), + ) + + +def search( + runs: list[Path], + pattern: re.Pattern, + app_filter: list[str] | None, + max_matches: int, + root: Path, +) -> int: + total = 0 + stop = False + for run in runs: + per_app: dict[str, int] = {} + rel_run = run.relative_to(root) + for app_dir in _iter_app_dirs(run, app_filter): + count = 0 + for source, line in iter_lines(app_dir): + if pattern.search(line): + count += 1 + total += 1 + print(f"[{rel_run}][{app_dir.name}][{source}] {line}") + if max_matches and total >= max_matches: + stop = True + break + per_app[app_dir.name] = count + if stop: + break + for app, n in per_app.items(): + print(f"# {rel_run} {app}: {n} match(es)", file=sys.stderr) + if stop: + print("# match cap reached", file=sys.stderr) + break + print(f"# total matches: {total}", file=sys.stderr) + return total + + +def main() -> int: + p = argparse.ArgumentParser(description="Search captured Fly logs.") + p.add_argument("--keyword", action="append", default=[], help="Literal substring (repeatable).") + p.add_argument("--regex", action="append", default=[], help="Regex pattern (repeatable).") + p.add_argument("--app", default="", help="Comma-separated app filter (default: all apps in run).") + p.add_argument("--run", default=None, help="'latest' (default), 'all', a date, or a run dir.") + p.add_argument("--root", default="logs", help="Logs root directory (default: logs).") + p.add_argument("-i", "--ignore-case", action="store_true", default=True, + help="Match case-insensitively (default: on).") + p.add_argument("--case-sensitive", action="store_true", + help="Force case-sensitive matching (overrides --ignore-case).") + p.add_argument("--max", type=int, default=0, help="Stop after N total matches (0 = unlimited).") + args = p.parse_args() + + if not args.keyword and not args.regex: + print("Provide at least one --keyword or --regex.", file=sys.stderr) + return 2 + if args.max < 0: + print("--max must be >= 0 (0 = unlimited).", file=sys.stderr) + return 2 + + parts = [re.escape(k) for k in args.keyword] + list(args.regex) + flags = 0 if args.case_sensitive else re.IGNORECASE + try: + pattern = re.compile("|".join(f"(?:{p})" for p in parts), flags) + except re.error as exc: + print(f"Invalid regex: {exc}", file=sys.stderr) + return 2 + + root = Path(args.root) + runs = resolve_runs(args.run, root) + if not runs: + print(f"No runs matched under {root} (run={args.run!r})", file=sys.stderr) + return 1 + + apps = [a.strip() for a in args.app.split(",") if a.strip()] or None + search(runs, pattern, apps, args.max, root) + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + sys.exit(130)