diff --git a/.claude/commands/monitor.md b/.claude/commands/monitor.md
index eac924e0f..1108979a8 100644
--- a/.claude/commands/monitor.md
+++ b/.claude/commands/monitor.md
@@ -1,41 +1,63 @@
 # Monitor Fly Logs
 
-Sample and analyse Fly.io logs at regular intervals.
+Capture, search, and analyse Fly.io logs via `scripts/logs.sh`.
 
 ## Default usage
 
 ```bash
-./scripts/monitor_logs.sh
+./scripts/logs.sh
 ```
 
-Runs for 4 hours with 10-second intervals.
+Captures from all five Fly apps (`hover`, `hover-worker`, `hover-analysis`,
+`hover-autoscaler-worker`, `hover-autoscaler-analysis`) every 3s, runs an
+analyse snapshot every 5 minutes, and writes a final report when the run
+finishes (~72 minutes by default). Press Ctrl+C to stop early — the final report
+still writes.
 
-## Options
+## Subcommands
 
 ```bash
-./scripts/monitor_logs.sh --run-id "descriptive-name"    # Custom name
-./scripts/monitor_logs.sh --interval 30 --iterations 120 # 30s for 1 hour
+./scripts/logs.sh monitor [...]   # explicit form of the default
+./scripts/logs.sh search  [...]   # grep captured raw logs
+./scripts/logs.sh analyse [...]   # run probes, write analysis.md/json
 ```
 
-## Output structure
+## Common options
+
+```bash
+./scripts/logs.sh --interval 5 --iterations 720       # 5s × 1h
+./scripts/logs.sh --run-id "incident-pr349"           # custom slug
+./scripts/logs.sh --analyse-every 30s                 # tighter snapshots
+./scripts/logs.sh --analyse-every 0                   # disable snapshots
+./scripts/logs.sh --app hover,hover-worker            # subset of apps
+
+./scripts/logs.sh search --keyword panic --keyword pgx
+./scripts/logs.sh search --regex 'status[":]+5\d\d' --app hover
 
+./scripts/logs.sh analyse --keyword "deadline exceeded"
+./scripts/logs.sh analyse --run 20260502/1430_mellow-rose_3s_1h
 ```
-logs/YYYYMMDD/HHMM_<name>_<interval>s_<duration>h/
-├── raw/
-│   ├── <timestamp>_iter1.log
-│   ├── <timestamp>_iter2.log
-│   └── ...
-├── <timestamp>_iter1.json
-├── <timestamp>_iter2.json
-├── time_series.csv
-└── summary.md
+
+## Output structure
+
+```text
+logs/YYYYMMDD/HHMM_<slug>_<settings>/
+├── <app>/raw/*.log          # cursor-filtered captures (one per iteration)
+├── <app>/.cursor            # last-seen ISO timestamp per app
+├── snapshots/
+│   ├── analysis_<HHMMSS>Z.md
+│   └── analysis_<HHMMSS>Z.json
+├── analysis.md              # final probe report
+├── analysis.json
+└── monitor.log              # verbose run history
 ```
 
-## What it captures
+## Probes (analyse)
 
-- Raw log samples from Fly
-- JSON summaries per iteration
-- Aggregated time series data
-- Summary markdown report
+Severity, panics & fatals, HTTP status, latency (p50/p95/p99 + slowest),
+heartbeat, process health, autoscaler, database/external errors, Sentry, plus
+any ad-hoc `--keyword`/`--regex`. Every finding records `count`, `first_seen`,
+`last_seen`, and `peak` (timestamp of the highest-count minute).
 
-Automatic aggregation runs via `scripts/aggregate_logs.py` after each iteration.
+The legacy `scripts/monitor_logs.sh` still works — it forwards to
+`./scripts/logs.sh monitor`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a56d59460..92774d777 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,7 +28,31 @@ On merge, CI will:
 
 ## [Unreleased]
 
-_Add unreleased changes here._
+### Added
+
+- `scripts/logs.sh` unified Fly log tool with `monitor`, `search`, and `analyse`
+  subcommands. Bare `logs.sh` runs `monitor` with all defaults (3s capture
+  across all five Fly apps, 5-minute analyse snapshots, final report at run
+  end). `search` greps captured raw logs by keyword/regex across one or more
+  runs (case-insensitive by default, also reads `raw.zip` after cleanup).
+  `analyse` runs a fixed probe set — severity, panics, HTTP status, latency,
+  heartbeat, process health, autoscaler, database/external errors, Sentry, plus
+  ad-hoc keywords — and writes `analysis.{md,json}` with `first_seen`,
+  `last_seen`, and `peak` timestamps for every finding.
+- `scripts/filter_since.py` per-app cursor filter wired into `capture_app` so
+  each iteration only persists log lines newer than the previous capture,
+  eliminating the 4× overlap inflation that came from `flyctl logs --no-tail`.
+
+### Changed
+
+- `scripts/monitor_logs.sh` reduced to a back-compat shim that forwards all
+  flags to `logs.sh monitor`. Existing call sites in `opencode.json`,
+  `.claude/commands/monitor.md`, `.claude/settings.local.json`, and
+  `docs/development/DEVELOPMENT.md` continue to work unchanged.
+- Default monitor app list now includes the two autoscaler sidecars
+  (`hover-autoscaler-worker`, `hover-autoscaler-analysis`).
+- Default `--run-id` is auto-generated as a `<adjective>-<colour>` slug (e.g.
+  `mellow-rose`) so concurrent runs are easy to distinguish.
 
 ## Full changelog history
 
diff --git a/docs/development/DEVELOPMENT.md b/docs/development/DEVELOPMENT.md
index 7623cebd4..4d0609ddd 100644
--- a/docs/development/DEVELOPMENT.md
+++ b/docs/development/DEVELOPMENT.md
@@ -294,29 +294,52 @@ logic ├── crawler/ # Web crawling functionality ├── db/ # Database o
 
 ## Monitoring Fly Logs
 
-For production investigations use `scripts/monitor_logs.sh`:
+For production investigations use `scripts/logs.sh`:
 
 ```bash
-# Default: 10-second intervals for 4 hours
-./scripts/monitor_logs.sh
+# Default: 3-second capture across all five Fly apps, 5-minute analyse
+# snapshots, ~72 minutes total. Press Ctrl+C to stop early — the final
+# report still writes.
+./scripts/logs.sh
 
-# Custom run with descriptive name
-./scripts/monitor_logs.sh --run-id "heavy-load-test"
+# Custom slug, longer interval and duration
+./scripts/logs.sh --interval 30 --iterations 120 --run-id "30min-check"
 
-# Custom intervals and duration
-./scripts/monitor_logs.sh --interval 30 --iterations 120 --run-id "30min-check"
+# Tighter snapshot cadence, or disable snapshots
+./scripts/logs.sh --analyse-every 30s
+./scripts/logs.sh --analyse-every 0
 ```
 
+`logs.sh` has three subcommands sharing the same run layout:
+
+```bash
+./scripts/logs.sh search --keyword panic --keyword pgx
+./scripts/logs.sh analyse --keyword "deadline exceeded"
+./scripts/logs.sh analyse --run 20260502/1430_mellow-rose_3s_1h
+```
+
+The legacy `scripts/monitor_logs.sh` still works — it forwards to
+`./scripts/logs.sh monitor`.
+
 **Output structure:**
 
-- Folder: `logs/YYYYMMDD/HHMM_<name>_<interval>s_<duration>h/`
-  - Example: `logs/20251105/0833_heavy-load-test_10s_4h/`
-- Raw logs: `raw/<timestamp>_iter<N>.log`
-- JSON summaries: `<timestamp>_iter<N>.json`
-- Aggregated outputs:
-  - `time_series.csv` - per-minute log level counts
-  - `summary.md` - human-readable report with critical patterns
-  - Automatically regenerated after each iteration
+- Run dir: `logs/YYYYMMDD/HHMM_<slug>_<interval>s_<duration>/`
+  - Example: `logs/20260502/1430_mellow-rose_3s_1h/`
+- Per-app captures: `<app>/raw/<timestamp>_iter<N>.log` (cursor-filtered against
+  `<app>/.cursor` so each iteration only persists lines newer than the previous)
+- Per-iteration JSON: `<app>/<timestamp>_iter<N>.json`
+- Aggregated outputs (per app):
+  - `time_series.csv` — per-minute log level counts
+  - `summary.md` — human-readable per-app report
+- Cross-app analysis (whole run):
+  - `analysis.md` / `analysis.json` — final probe report (severity, panics,
+    HTTP, latency, heartbeat, process health, autoscaler, DB/external, Sentry,
+    ad-hoc keywords) with `first_seen` / `last_seen` / `peak` timestamps for
+    every finding
+  - `snapshots/analysis_<HHMMSS>Z.{md,json}` — point-in-time snapshots written
+    every `--analyse-every` while the run is in progress
+- `monitor.log` — verbose run history (cleanup, per-iteration capture, errors).
+  The TTY shows only a startup banner and a self-overwriting ticker.
 
 **Defaults:**
 
diff --git a/opencode.json b/opencode.json
index c75b0a1c9..eee4111f9 100644
--- a/opencode.json
+++ b/opencode.json
@@ -8,7 +8,7 @@
     },
     "monitor-fly": {
       "description": "Collect and summarise Fly logs",
-      "template": "Run ./scripts/monitor_logs.sh with suitable arguments from $ARGUMENTS if provided. Summarise critical patterns, error spikes, and likely causes."
+      "template": "Run ./scripts/logs.sh with suitable arguments from $ARGUMENTS if provided (use the `search` or `analyse` subcommand for grep / probe-driven analysis of an existing run). Summarise critical patterns, error spikes, and likely causes."
     },
     "load-test": {
       "description": "Run scripted load test safely",
diff --git a/scripts/aggregate_logs.py b/scripts/aggregate_logs.py
index c6c238947..f22f512ae 100644
--- a/scripts/aggregate_logs.py
+++ b/scripts/aggregate_logs.py
@@ -404,7 +404,10 @@ def watch_mode(log_dir, interval=10):
     parser.add_argument("--full", action="store_true", help="Full reprocess (ignore state)")
     args = parser.parse_args()
 
-    if args.watch:
-        watch_mode(args.log_dir, args.interval)
-    else:
-        aggregate_logs(args.log_dir, incremental=not args.full)
+    try:
+        if args.watch:
+            watch_mode(args.log_dir, args.interval)
+        else:
+            aggregate_logs(args.log_dir, incremental=not args.full)
+    except KeyboardInterrupt:
+        sys.exit(130)
diff --git a/scripts/analyse_logs.py b/scripts/analyse_logs.py
new file mode 100755
index 000000000..c174f99f5
--- /dev/null
+++ b/scripts/analyse_logs.py
@@ -0,0 +1,635 @@
+#!/usr/bin/env python3
+"""Analyse captured Fly logs and emit a deterministic report.
+
+Streams every raw line for each app in a run through a fixed set of probes
+(severity, panics, HTTP status, latency, process health, autoscaler, DB, Sentry,
+heartbeat, plus ad-hoc keywords). Each finding records `count`, `first_seen`,
+`last_seen`, and `peak` (timestamp of the highest-count minute).
+
+Outputs `analysis.json` (machine readable) and `analysis.md` (human readable)
+into the run directory.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import statistics
+import sys
+from collections import Counter, defaultdict, deque
+from itertools import pairwise
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Iterator
+
+# Reuse run-resolution + line iteration from search_logs.
+sys.path.insert(0, str(Path(__file__).parent))
+from search_logs import iter_lines, resolve_runs  # noqa: E402
+
+ISO_KEYS = ("time", "timestamp", "@timestamp", "ts", "created_at")
+ANSI_RE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]")
+LEADING_TS_RE = re.compile(r"^\s*(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?)")
+
+
+def _strip_ansi(s: str) -> str:
+    return ANSI_RE.sub("", s)
+
+
+def _iso_seconds(rec: dict | None, line: str = "") -> str:
+    """Best-effort ISO timestamp: prefer JSON record fields, fall back to the
+    leading timestamp token Fly stamps on every raw log line."""
+    if rec:
+        for k in ISO_KEYS:
+            v = rec.get(k)
+            if v:
+                raw = str(v).replace("Z", "+00:00")
+                try:
+                    return datetime.fromisoformat(raw).isoformat(timespec="seconds")
+                except ValueError:
+                    # Malformed value for this key — keep looking. Other ISO
+                    # keys or the leading-line fallback below may be valid.
+                    continue
+    if line:
+        m = LEADING_TS_RE.match(line)
+        if m:
+            raw = m.group(1).replace("Z", "+00:00")
+            try:
+                return datetime.fromisoformat(raw).isoformat(timespec="seconds")
+            except ValueError:
+                return raw[:19]
+    return ""
+
+
+def _iso_minute(ts: str) -> str:
+    return ts[:16] if len(ts) >= 16 else ts
+
+
+def _parse_record(line: str) -> dict | None:
+    idx = line.find("{")
+    if idx == -1:
+        return None
+    try:
+        rec = json.loads(line[idx:])
+    except json.JSONDecodeError:
+        return None
+    return rec if isinstance(rec, dict) else None
+
+
+class Bucket:
+    """Tracks count plus first-seen, last-seen, and peak-minute for a label."""
+
+    __slots__ = ("count", "first", "last", "minute_counts", "samples")
+
+    def __init__(self) -> None:
+        self.count = 0
+        self.first = ""
+        self.last = ""
+        self.minute_counts: Counter = Counter()
+        self.samples: list[str] = []
+
+    def add(self, ts: str, sample: str | None = None) -> None:
+        self.count += 1
+        if ts:
+            if not self.first or ts < self.first:
+                self.first = ts
+            if not self.last or ts > self.last:
+                self.last = ts
+            self.minute_counts[_iso_minute(ts)] += 1
+        if sample and len(self.samples) < 3:
+            trimmed = sample.strip()
+            if len(trimmed) > 240:
+                trimmed = trimmed[:237] + "..."
+            self.samples.append(trimmed)
+
+    def to_dict(self, label: str) -> dict:
+        peak_min, peak_count = ("", 0)
+        if self.minute_counts:
+            peak_min, peak_count = self.minute_counts.most_common(1)[0]
+        return {
+            "label": label,
+            "count": self.count,
+            "first_seen": self.first,
+            "last_seen": self.last,
+            "peak": peak_min,
+            "peak_count": peak_count,
+            "samples": list(self.samples),
+        }
+
+
+# --- Probes ----------------------------------------------------------------
+
+class Probe:
+    name = ""
+
+    def feed(self, ts: str, line: str, rec: dict | None, level: str, msg: str) -> None:
+        raise NotImplementedError
+
+    def report(self) -> dict:
+        raise NotImplementedError
+
+
+class SeverityProbe(Probe):
+    name = "Severity"
+    LEVELS = ("debug", "info", "warn", "error", "fatal")
+
+    def __init__(self) -> None:
+        self.buckets: dict[str, Bucket] = {lvl: Bucket() for lvl in self.LEVELS}
+        self.unknown = Bucket()
+
+    def feed(self, ts, line, rec, level, msg):
+        if level in self.buckets:
+            self.buckets[level].add(ts, msg)
+        elif level:
+            self.unknown.add(ts, line)
+
+    def report(self) -> dict:
+        findings = [
+            self.buckets[lvl].to_dict(lvl)
+            for lvl in self.LEVELS
+            if self.buckets[lvl].count
+        ]
+        if self.unknown.count:
+            findings.append(self.unknown.to_dict("unknown-level"))
+        return {"name": self.name, "findings": findings}
+
+
+class RegexBucketProbe(Probe):
+    """Generic probe: match a list of (label, regex) and bucket per label."""
+
+    def __init__(self, name: str, patterns: list[tuple[str, str]], flags: int = re.IGNORECASE):
+        self.name = name
+        self.compiled = [(label, re.compile(pat, flags)) for label, pat in patterns]
+        self.buckets: dict[str, Bucket] = defaultdict(Bucket)
+
+    def feed(self, ts, line, rec, level, msg):
+        for label, pat in self.compiled:
+            m = pat.search(line)
+            if m:
+                self.buckets[label].add(ts, line)
+
+    def report(self) -> dict:
+        findings = [b.to_dict(label) for label, b in self.buckets.items() if b.count]
+        findings.sort(key=lambda f: -f["count"])
+        return {"name": self.name, "findings": findings}
+
+
+class PanicProbe(Probe):
+    name = "Panics & fatals"
+
+    def __init__(self) -> None:
+        self.buckets: dict[str, Bucket] = defaultdict(Bucket)
+        self.panic_re = re.compile(r"panic:\s*(.+)", re.IGNORECASE)
+        self.fatal_re = re.compile(r"\bfatal(?:\s+error)?:\s*(.+)", re.IGNORECASE)
+
+    def feed(self, ts, line, rec, level, msg):
+        for pat, kind in ((self.panic_re, "panic"), (self.fatal_re, "fatal")):
+            m = pat.search(line)
+            if m:
+                rest = m.group(1).strip()
+                key = rest.split("\n", 1)[0][:120]
+                if not key:
+                    key = kind
+                self.buckets[f"{kind}: {key}"].add(ts, line)
+                return
+
+    def report(self) -> dict:
+        findings = sorted(
+            (b.to_dict(label) for label, b in self.buckets.items()),
+            key=lambda f: -f["count"],
+        )
+        return {"name": self.name, "findings": findings[:10]}
+
+
+class HTTPStatusProbe(Probe):
+    name = "HTTP status"
+
+    def __init__(self) -> None:
+        self.buckets: dict[str, Bucket] = defaultdict(Bucket)
+        self.status_field = re.compile(r'\bstatus(?:_code)?[\"\']?\s*[=:]\s*\"?(\d{3})\b')
+        self.access_log = re.compile(r'HTTP/\d\.\d"\s+(\d{3})\s+\d+')
+
+    def feed(self, ts, line, rec, level, msg):
+        code: str | None = None
+        if rec:
+            for k in ("status", "status_code", "statusCode", "http_status"):
+                v = rec.get(k)
+                if isinstance(v, (int, str)) and str(v).isdigit() and len(str(v)) == 3:
+                    code = str(v)
+                    break
+        if code is None:
+            m = self.access_log.search(line) or self.status_field.search(line)
+            if m:
+                code = m.group(1)
+        if not code:
+            return
+        klass = f"{code[0]}xx"
+        self.buckets[klass].add(ts)
+        if code in ("429", "499", "500", "502", "503", "504"):
+            self.buckets[code].add(ts, line)
+
+    def report(self) -> dict:
+        findings = [b.to_dict(label) for label, b in sorted(self.buckets.items())]
+        findings = [f for f in findings if f["count"]]
+        return {"name": self.name, "findings": findings}
+
+
+class LatencyProbe(Probe):
+    name = "Latency"
+    DURATION_KEYS = ("dur_ms", "duration_ms", "latency_ms", "elapsed_ms", "took_ms")
+
+    def __init__(self) -> None:
+        self.values_ms: list[float] = []
+        self.slowest: list[tuple[float, str, str]] = []  # (ms, ts, line)
+
+    def _record(self, ms: float, ts: str, line: str) -> None:
+        if ms < 0 or ms > 3_600_000:
+            return
+        self.values_ms.append(ms)
+        self.slowest.append((ms, ts, line))
+        if len(self.slowest) > 200:
+            self.slowest.sort(key=lambda x: -x[0])
+            self.slowest = self.slowest[:50]
+
+    def feed(self, ts, line, rec, level, msg):
+        if not rec:
+            return
+        for k in self.DURATION_KEYS:
+            v = rec.get(k)
+            if isinstance(v, (int, float)):
+                self._record(float(v), ts, line)
+                return
+        # Bare `duration` follows Go/zerolog convention: integer nanoseconds.
+        # Don't try to infer units from magnitude — sub-millisecond values
+        # would otherwise be recorded as milliseconds and skew percentiles.
+        # Apps emitting milliseconds should use the explicit `*_ms` keys above.
+        d = rec.get("duration")
+        if isinstance(d, (int, float)):
+            self._record(float(d) / 1_000_000, ts, line)
+
+    def report(self) -> dict:
+        if not self.values_ms:
+            return {"name": self.name, "findings": [], "note": "no duration fields seen"}
+        vs = sorted(self.values_ms)
+        n = len(vs)
+
+        def pct(p: float) -> float:
+            # Linear interpolation between adjacent ranks (Type 7 / "inclusive"),
+            # the same method as `statistics.quantiles(method="inclusive")`.
+            # `round()` previously hit banker's-rounding edge cases (e.g. p50
+            # of [100, 300] returned 100 instead of 200).
+            if n == 1 or p <= 0:
+                return float(vs[0])
+            if p >= 100:
+                return float(vs[-1])
+            rank = p / 100 * (n - 1)
+            lo = int(rank)
+            hi = min(lo + 1, n - 1)
+            frac = rank - lo
+            return float(vs[lo]) + frac * (float(vs[hi]) - float(vs[lo]))
+
+        self.slowest.sort(key=lambda x: -x[0])
+        slowest = [
+            {"duration_ms": ms, "timestamp": ts, "line": (line.strip()[:240])}
+            for ms, ts, line in self.slowest[:10]
+        ]
+        return {
+            "name": self.name,
+            "samples": n,
+            "p50_ms": pct(50),
+            "p95_ms": pct(95),
+            "p99_ms": pct(99),
+            "max_ms": vs[-1],
+            "mean_ms": statistics.fmean(vs),
+            "slowest": slowest,
+        }
+
+
+class HeartbeatProbe(Probe):
+    """Detect minutes where info-level traffic dropped to zero mid-run."""
+
+    name = "Heartbeat"
+
+    def __init__(self) -> None:
+        self.minute_info: Counter = Counter()
+        self.minute_seen: list[str] = []
+
+    def feed(self, ts, line, rec, level, msg):
+        if not ts:
+            return
+        m = _iso_minute(ts)
+        if m and (not self.minute_seen or self.minute_seen[-1] != m):
+            self.minute_seen.append(m)
+        if level == "info":
+            self.minute_info[m] += 1
+
+    def report(self) -> dict:
+        if not self.minute_seen:
+            return {"name": self.name, "findings": [], "note": "no timestamped traffic"}
+        minutes = sorted(set(self.minute_seen))
+        non_zero = [self.minute_info[m] for m in minutes if self.minute_info.get(m, 0) > 0]
+        median = statistics.median(non_zero) if non_zero else 0
+        gap_minutes: set[str] = set()
+        # Two ways a minute can be a heartbeat gap:
+        #   (1) observed in minute_seen (had warn/error traffic) but zero info,
+        #   (2) entirely missing — fell between two observed minutes with no
+        #       log lines at all, so `minute_seen` doesn't include it.
+        if median >= 1:
+            # Skip the first and last observed minutes — they're typically
+            # partial windows (capture started or stopped mid-minute), so a
+            # zero info-count there is expected, not a real heartbeat gap.
+            for m in minutes[1:-1]:
+                if self.minute_info.get(m, 0) == 0:
+                    gap_minutes.add(m)
+            if len(minutes) >= 2:
+                for prev, curr in pairwise(minutes):
+                    try:
+                        p = datetime.strptime(prev, "%Y-%m-%dT%H:%M")
+                        c = datetime.strptime(curr, "%Y-%m-%dT%H:%M")
+                    except ValueError:
+                        continue
+                    step = p + timedelta(minutes=1)
+                    while step < c:
+                        gap_minutes.add(step.strftime("%Y-%m-%dT%H:%M"))
+                        step += timedelta(minutes=1)
+        gaps: list[dict] = [
+            {"minute": m, "expected_min": int(median)}
+            for m in sorted(gap_minutes)[:200]
+        ]
+        return {
+            "name": self.name,
+            "median_info_per_minute": median,
+            "first_minute": minutes[0],
+            "last_minute": minutes[-1],
+            "gap_minutes": gaps[:20],
+        }
+
+
+def _build_probes(extra_keywords: list[str], extra_regexes: list[str]) -> list[Probe]:
+    probes: list[Probe] = [
+        SeverityProbe(),
+        PanicProbe(),
+        HTTPStatusProbe(),
+        LatencyProbe(),
+        HeartbeatProbe(),
+        RegexBucketProbe(
+            "Process health",
+            [
+                ("starting machine", r"starting machine"),
+                ("stopping machine", r"stopping machine"),
+                ("exited with code", r"exited with code\s+\d+"),
+                ("out of memory", r"out of memory|oom[- ]?killed"),
+                ("killed by signal", r"killed by signal|signal:\s*killed"),
+                ("health check failed", r"health check.*fail"),
+                ("restart", r"\brestart(ing)?\b"),
+            ],
+        ),
+        RegexBucketProbe(
+            "Autoscaler",
+            [
+                ("reconciling", r'"msg":\s*"reconciling"|reconciling\s+app'),
+                ("scale up", r"scal(e|ing)\s*up|adding machine"),
+                ("scale down", r"scal(e|ing)\s*down|removing machine"),
+                ("target=N", r"target\s*[=:]\s*\d+|\"target\":\s*\{"),
+                ("queue depth", r"queue[_ ]depth|backlog\s*[=:]\s*\d+"),
+                ("no-op", r"no scale change|already at target"),
+            ],
+        ),
+        RegexBucketProbe(
+            "Database / external",
+            [
+                ("pgx error", r"\bpgx\b.*error|pgx:.*"),
+                ("pq error", r"\bpq:\s"),
+                ("connection refused", r"connection refused"),
+                ("context deadline exceeded", r"context deadline exceeded"),
+                ("i/o timeout", r"i/o timeout"),
+                ("connection reset", r"connection reset"),
+                ("too many connections", r"too many connections"),
+            ],
+        ),
+        RegexBucketProbe(
+            "Sentry",
+            [
+                ("event sent", r"sentry.*event\b|event sent to sentry"),
+                ("send failed", r"sentry.*(?:fail|error)"),
+            ],
+        ),
+    ]
+    if extra_keywords or extra_regexes:
+        patterns = [(f"keyword:{k}", re.escape(k)) for k in extra_keywords]
+        patterns += [(f"regex:{r}", r) for r in extra_regexes]
+        probes.append(RegexBucketProbe("Ad-hoc keywords", patterns))
+    return probes
+
+
+_DEDUPE_WINDOW = 50_000
+
+
+def _analyse_app(app_dir: Path, extra_keywords: list[str], extra_regexes: list[str]) -> dict:
+    probes = _build_probes(extra_keywords, extra_regexes)
+    total_lines = 0
+    unique_lines = 0
+    parsed_records = 0
+    first_ts = ""
+    last_ts = ""
+    # Bounded sliding-window dedupe: capture overlap is between adjacent
+    # iterations, so a recent-window LRU catches all real duplicates without
+    # unbounded memory growth on long, high-volume runs. Capture-time cursor
+    # filtering already removes most dupes; this is the secondary safety net.
+    seen: set[str] = set()
+    seen_order: deque[str] = deque()
+
+    for _source, raw_line in iter_lines(app_dir):
+        total_lines += 1
+        line = _strip_ansi(raw_line).rstrip()
+        if not line or line in seen:
+            continue
+        seen.add(line)
+        seen_order.append(line)
+        if len(seen_order) > _DEDUPE_WINDOW:
+            seen.discard(seen_order.popleft())
+        unique_lines += 1
+        rec = _parse_record(line)
+        if rec is not None:
+            parsed_records += 1
+        ts = _iso_seconds(rec, line)
+        level = ""
+        msg = ""
+        if rec:
+            level = str(rec.get("level") or "").lower()
+            msg = str(rec.get("msg") or rec.get("message") or "")
+        if ts:
+            if not first_ts or ts < first_ts:
+                first_ts = ts
+            if not last_ts or ts > last_ts:
+                last_ts = ts
+        for probe in probes:
+            try:
+                probe.feed(ts, line, rec, level, msg)
+            except Exception as exc:  # don't let one bad line abort the whole run
+                print(f"warn: probe {probe.name} on {app_dir.name}: {exc}", file=sys.stderr)
+
+    return {
+        "app": app_dir.name,
+        "total_lines": total_lines,
+        "unique_lines": unique_lines,
+        "parsed_records": parsed_records,
+        "first_seen": first_ts,
+        "last_seen": last_ts,
+        "probes": [p.report() for p in probes],
+    }
+
+
+# --- Reporting -------------------------------------------------------------
+
+def _fmt_window(d: dict) -> str:
+    first = d.get("first_seen") or "-"
+    last = d.get("last_seen") or "-"
+    peak = d.get("peak") or "-"
+    pc = d.get("peak_count") or 0
+    return f"first={first} last={last} peak={peak}({pc})"
+
+
+def _render_md(report: dict) -> str:
+    out: list[str] = []
+    run = report["run"]
+    out.append(f"# Log analysis — {run}")
+    out.append("")
+    out.append(f"Generated: {report['generated_at']}")
+    if report.get("extra_keywords") or report.get("extra_regexes"):
+        out.append(
+            f"Ad-hoc terms: keywords={report.get('extra_keywords')} regexes={report.get('extra_regexes')}"
+        )
+    out.append("")
+
+    for app in report["apps"]:
+        out.append(f"## {app['app']}")
+        unique = app.get("unique_lines", app["total_lines"])
+        out.append(
+            f"- lines: {app['total_lines']:,} captured / {unique:,} unique"
+            f" (parsed JSON: {app['parsed_records']:,})"
+            f" | window: {app.get('first_seen') or '-'} → {app.get('last_seen') or '-'}"
+        )
+        for probe in app["probes"]:
+            out.append("")
+            out.append(f"### {probe['name']}")
+            if probe.get("note"):
+                out.append(f"_{probe['note']}_")
+            findings = probe.get("findings")
+            if findings is None and probe["name"] == "Latency":
+                if probe.get("samples"):
+                    out.append(
+                        f"- samples: {probe['samples']} | "
+                        f"p50={probe['p50_ms']:.0f}ms p95={probe['p95_ms']:.0f}ms "
+                        f"p99={probe['p99_ms']:.0f}ms max={probe['max_ms']:.0f}ms"
+                    )
+                    if probe.get("slowest"):
+                        out.append("")
+                        out.append("Slowest:")
+                        for s in probe["slowest"][:5]:
+                            out.append(
+                                f"  - {s['duration_ms']:.0f}ms @ {s['timestamp']}: {s['line']}"
+                            )
+                continue
+            if probe["name"] == "Heartbeat":
+                out.append(
+                    f"- median info/min: {probe.get('median_info_per_minute')} | "
+                    f"window: {probe.get('first_minute')} → {probe.get('last_minute')}"
+                )
+                gaps = probe.get("gap_minutes") or []
+                if gaps:
+                    out.append(f"- {len(gaps)} zero-info minute(s):")
+                    for g in gaps[:10]:
+                        out.append(f"  - {g['minute']}")
+                continue
+            if not findings:
+                out.append("- (no matches)")
+                continue
+            for f in findings:
+                line = f"- {f['label']}: {f['count']} | {_fmt_window(f)}"
+                out.append(line)
+                for s in f.get("samples", [])[:1]:
+                    out.append(f"    sample: {s}")
+        out.append("")
+    return "\n".join(out)
+
+
+def _select_app_dirs(run: Path, app_filter: list[str] | None) -> list[Path]:
+    out = []
+    for child in sorted(run.iterdir()):
+        if not child.is_dir():
+            continue
+        if app_filter and child.name not in app_filter:
+            continue
+        if (child / "raw").exists() or (child / "raw.zip").exists():
+            out.append(child)
+    return out
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description="Analyse captured Fly logs.")
+    p.add_argument("--run", default=None, help="'latest' (default), 'all', a date, or a run dir.")
+    p.add_argument("--root", default="logs", help="Logs root directory (default: logs).")
+    p.add_argument("--app", default="", help="Comma-separated app filter (default: all apps).")
+    p.add_argument("--keyword", action="append", default=[], help="Ad-hoc keyword (repeatable).")
+    p.add_argument("--regex", action="append", default=[], help="Ad-hoc regex (repeatable).")
+    p.add_argument("--out", default=None, help="Override output base path (writes .json + .md).")
+    p.add_argument("--stdout", action="store_true", help="Print markdown to stdout in addition to writing files.")
+    args = p.parse_args()
+
+    root = Path(args.root)
+    runs = resolve_runs(args.run, root)
+    if not runs:
+        print(f"No runs matched under {root} (run={args.run!r})", file=sys.stderr)
+        return 1
+    if len(runs) > 1 and args.out:
+        # `--out` shares a single base path across the loop below, so multiple
+        # runs would silently overwrite each other. Without `--out` each run
+        # writes to its own `<run>/analysis.{md,json}` and multi-run selection
+        # is fine.
+        print(
+            f"Resolved {len(runs)} runs; --out requires a single run because "
+            f"combined output isn't implemented.",
+            file=sys.stderr,
+        )
+        return 2
+
+    apps = [a.strip() for a in args.app.split(",") if a.strip()] or None
+
+    rc = 0
+    for run in runs:
+        app_dirs = _select_app_dirs(run, apps)
+        if not app_dirs:
+            print(f"No app dirs with raw logs under {run}", file=sys.stderr)
+            rc = rc or 1
+            continue
+        app_reports = [_analyse_app(d, args.keyword, args.regex) for d in app_dirs]
+        report = {
+            "run": str(run.relative_to(root)),
+            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
+            "extra_keywords": args.keyword,
+            "extra_regexes": args.regex,
+            "apps": app_reports,
+        }
+        if args.out:
+            base = Path(args.out)
+        else:
+            base = run / "analysis"
+        base.parent.mkdir(parents=True, exist_ok=True)
+        json_path = base.with_suffix(".json")
+        md_path = base.with_suffix(".md")
+        json_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8")
+        md = _render_md(report)
+        md_path.write_text(md, encoding="utf-8")
+        print(f"wrote {md_path}")
+        print(f"wrote {json_path}")
+        if args.stdout:
+            print(md)
+    return rc
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except KeyboardInterrupt:
+        sys.exit(130)
diff --git a/scripts/filter_since.py b/scripts/filter_since.py
new file mode 100755
index 000000000..877695b8a
--- /dev/null
+++ b/scripts/filter_since.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+"""Filter Fly log lines from stdin against a per-app timestamp cursor.
+
+Keeps only lines whose leading ISO timestamp is strictly greater than the
+cursor stored at the given path, then updates the cursor to the max timestamp
+seen. Used by `logs.sh monitor` so overlapping `flyctl logs --no-tail` captures
+only persist new lines per iteration.
+
+Lines without a parseable leading timestamp (typically multi-line stack-trace
+continuations) are emitted only when the most recent timestamped header was
+emitted. That preserves multi-line records when their header survives the
+cursor while preventing stale continuation blocks from re-appearing in
+overlapping `flyctl logs --no-tail` windows.
+"""
+
+from __future__ import annotations
+
+import re
+import sys
+from pathlib import Path
+
+ANSI_RE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]")
+LEADING_TS_RE = re.compile(
+    r"^\s*(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?)"
+)
+
+
+def _normalise(ts: str) -> str:
+    """Normalise to a lexicographically-comparable form (Z -> +00:00)."""
+    return ts.replace("Z", "+00:00")
+
+
+def main() -> int:
+    if len(sys.argv) != 2:
+        print("usage: filter_since.py <cursor_file>", file=sys.stderr)
+        return 2
+
+    cursor_path = Path(sys.argv[1])
+    cursor = ""
+    if cursor_path.exists():
+        cursor = cursor_path.read_text(encoding="utf-8").strip()
+
+    new_max = cursor
+    out = sys.stdout
+    # Multi-line records (e.g. Go panic stack traces) only stamp a timestamp on
+    # the first line. Track whether the most recent header was emitted so we
+    # don't re-persist continuation lines whose header was dropped by the
+    # cursor — that would reintroduce duplicates this filter exists to remove.
+    header_emitted = False
+    for line in sys.stdin:
+        clean = ANSI_RE.sub("", line)
+        m = LEADING_TS_RE.match(clean)
+        if not m:
+            if header_emitted:
+                out.write(line)
+            continue
+        ts = _normalise(m.group(1))
+        if cursor and ts <= cursor:
+            header_emitted = False
+            continue
+        out.write(line)
+        header_emitted = True
+        if ts > new_max:
+            new_max = ts
+
+    if new_max and new_max != cursor:
+        cursor_path.parent.mkdir(parents=True, exist_ok=True)
+        cursor_path.write_text(new_max, encoding="utf-8")
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except KeyboardInterrupt:
+        # Bash forwards SIGINT to every child in the pipe. Exit quietly so
+        # `logs.sh`'s own trap can write the final report without a traceback
+        # bleeding into the terminal.
+        sys.exit(130)
diff --git a/scripts/logs.sh b/scripts/logs.sh
new file mode 100755
index 000000000..d12d38462
--- /dev/null
+++ b/scripts/logs.sh
@@ -0,0 +1,744 @@
+#!/usr/bin/env bash
+
+# logs.sh — unified Fly log tool: monitor / search / analyse.
+#
+#   logs.sh monitor [...]   capture logs on a fixed cadence (was monitor_logs.sh)
+#   logs.sh search  [...]   grep across captured raw logs (zipped or live)
+#   logs.sh analyse [...]   run probes, write analysis.md/json into a run dir
+#
+# All subcommands accept --help.
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+usage_top() {
+    cat <<'USAGE'
+Usage: logs.sh <command> [options]
+
+Commands:
+  monitor   Capture Fly logs on a cadence and aggregate per-minute summaries.
+  search    Grep captured raw logs by keyword/regex across one or more apps.
+  analyse   Run pre-built probes (severity, panics, HTTP, latency, autoscaler,
+            DB, Sentry, heartbeat, ad-hoc) over a run; writes analysis.md/json.
+
+Run `logs.sh <command> --help` for command-specific options.
+USAGE
+}
+
+# Probe a candidate interpreter and confirm it's Python 3.10+. The Python
+# helpers use `itertools.pairwise` (3.10+) so 3.9 would import-fail at runtime.
+_is_python3() {
+    "$@" -c "import sys; sys.exit(0 if sys.version_info >= (3, 10) else 1)" \
+        >/dev/null 2>&1
+}
+
+# Locate a working Python 3.10+ interpreter shared by search/analyse helpers.
+resolve_python() {
+    if command -v python3 >/dev/null 2>&1 && _is_python3 python3; then
+        PYTHON_CMD="python3"
+        PYTHON_ARGS=()
+    elif command -v python >/dev/null 2>&1 && _is_python3 python; then
+        PYTHON_CMD="python"
+        PYTHON_ARGS=()
+    elif command -v py >/dev/null 2>&1 && _is_python3 py -3; then
+        PYTHON_CMD="py"
+        PYTHON_ARGS=(-3)
+    else
+        echo "Python 3.10+ is required for this command but was not found." >&2
+        exit 1
+    fi
+}
+
+cmd_search() {
+    resolve_python
+    exec env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \
+        "$SCRIPT_DIR/search_logs.py" "$@"
+}
+
+cmd_analyse() {
+    resolve_python
+    exec env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \
+        "$SCRIPT_DIR/analyse_logs.py" "$@"
+}
+
+generate_run_slug() {
+    # Deterministic-free, no-deps friendly slug — adjective-colour shape so
+    # concurrent runs are easy to distinguish at a glance in the logs/ tree.
+    local adjs=(grumpy happy lazy quick brave silent loud sleepy hungry tiny
+                spicy mellow plucky witty bright stormy frosty sunny rusty merry
+                gentle clumsy chatty curious eager fancy giddy nimble proud sturdy)
+    local colours=(orange purple sky river panda cobra falcon meadow ember hazel
+                   crimson teal indigo amber slate olive rose mint coral cobalt
+                   ivory ochre azure plum lilac mango onyx pearl sage saffron)
+    local seed=$(( ( $(date +%s) ^ $$ ) & 0x7FFFFFFF ))
+    local a=$(( seed % ${#adjs[@]} ))
+    local c=$(( (seed / ${#adjs[@]}) % ${#colours[@]} ))
+    echo "${adjs[$a]}-${colours[$c]}"
+}
+
+# parse_duration "5m" -> 300; accepts plain seconds, Ns, Nm, Nh.
+parse_duration() {
+    local v="$1"
+    if [[ "$v" =~ ^([0-9]+)$ ]]; then echo "${BASH_REMATCH[1]}"; return; fi
+    if [[ "$v" =~ ^([0-9]+)s$ ]]; then echo "${BASH_REMATCH[1]}"; return; fi
+    if [[ "$v" =~ ^([0-9]+)m$ ]]; then echo $(( ${BASH_REMATCH[1]} * 60 )); return; fi
+    if [[ "$v" =~ ^([0-9]+)h$ ]]; then echo $(( ${BASH_REMATCH[1]} * 3600 )); return; fi
+    echo "invalid duration: $v (use 30s, 5m, 1h, or plain seconds)" >&2
+    exit 1
+}
+
+cmd_monitor() {
+    # Defaults; environment variables of the same name take precedence.
+    APP="${APP:-hover,hover-worker,hover-analysis,hover-autoscaler-worker,hover-autoscaler-analysis}"
+    INTERVAL="${INTERVAL:-3}"
+    SAMPLES="${SAMPLES:-400}"
+    ITERATIONS="${ITERATIONS:-1440}"  # ~72 minutes at 3s intervals
+    RUN_ID="${RUN_ID:-}"
+    OUTPUT_ROOT="${OUTPUT_ROOT:-logs}"
+    CLEANUP_OLD="${CLEANUP_OLD:-true}"
+    CLEANUP_DAYS="${CLEANUP_DAYS:-1}"
+    CLEANUP_MODE="${CLEANUP_MODE:-zip}"
+    ANALYSE_EVERY="${ANALYSE_EVERY:-5m}"
+    PYTHON_CMD=""
+    PYTHON_ARGS=()
+
+    monitor_usage() {
+        cat <<'USAGE'
+Usage: logs.sh monitor [options]
+
+Fetch recent Fly logs on a fixed cadence, archive the raw output, and write
+per-minute summaries describing how often each log level/message occurred.
+
+Automatic cleanup (enabled by default):
+  - Zips raw logs and iteration JSONs from runs older than 1 day
+  - Keeps summary.md, summary.json, and monitor.log
+  - Use --no-cleanup to disable or --cleanup-mode delete to remove everything
+
+Options:
+  --app NAMES           Fly application name(s), comma-separated
+                        (default: hover,hover-worker,hover-analysis,
+                        hover-autoscaler-worker,hover-autoscaler-analysis)
+  --interval SECONDS    Seconds to wait between samples (default: 3)
+  --samples N           Number of log lines to request each run (default: 400)
+  --iterations N        Number of iterations to perform (0 = run forever,
+                        default: 1440 = ~72 minutes at 3s intervals)
+  --run-id ID           Identifier used when naming output directories
+                        (default: auto-generated <adjective>-<colour> slug)
+  --analyse-every DUR   Run analyse to write a snapshot every DUR (default: 5m).
+                        Accepts plain seconds, Ns, Nm, or Nh. Use 0 to disable.
+  --no-cleanup          Disable automatic cleanup (default: enabled)
+  --cleanup-days N      Clean runs older than N days (default: 1)
+  --cleanup-mode MODE   How to clean: 'zip' or 'delete' (default: zip)
+                        zip: archives raw/ and iteration JSONs, keeps summaries
+                        delete: removes entire run directory
+  -h, --help            Show this message and exit
+
+Environment variables with the same names (APP, INTERVAL, SAMPLES, ITERATIONS,
+RUN_ID) override the defaults as well.
+USAGE
+    }
+
+    require_value() {
+        # Bail on missing values for options that take an argument so a stray
+        # `logs.sh monitor --app` fails with a readable message instead of
+        # tripping `set -u` on the unbound `$2` expansion below.
+        if [[ $# -lt 2 || "$2" == -* ]]; then
+            echo "Missing value for $1" >&2
+            monitor_usage
+            exit 2
+        fi
+    }
+
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --app)            require_value "$@"; APP="$2"; shift 2 ;;
+            --interval)       require_value "$@"; INTERVAL="$2"; shift 2 ;;
+            --samples)        require_value "$@"; SAMPLES="$2"; shift 2 ;;
+            --iterations)     require_value "$@"; ITERATIONS="$2"; shift 2 ;;
+            --run-id)         require_value "$@"; RUN_ID="$2"; shift 2 ;;
+            --analyse-every)  require_value "$@"; ANALYSE_EVERY="$2"; shift 2 ;;
+            --no-cleanup)     CLEANUP_OLD=false; shift ;;
+            --cleanup-days)   require_value "$@"; CLEANUP_DAYS="$2"; shift 2 ;;
+            --cleanup-mode)   require_value "$@"; CLEANUP_MODE="$2"; shift 2 ;;
+            -h|--help)        monitor_usage; exit 0 ;;
+            *)
+                echo "Unknown option: $1" >&2
+                monitor_usage
+                exit 1
+                ;;
+        esac
+    done
+
+    if ! [[ "$INTERVAL" =~ ^[0-9]+$ && "$INTERVAL" -gt 0 ]]; then
+        echo "interval must be a positive integer" >&2
+        exit 1
+    fi
+    if ! [[ "$SAMPLES" =~ ^[0-9]+$ && "$SAMPLES" -ge 1 && "$SAMPLES" -le 10000 ]]; then
+        echo "samples must be an integer between 1 and 10000" >&2
+        exit 1
+    fi
+    if ! [[ "$ITERATIONS" =~ ^[0-9]+$ ]]; then
+        echo "iterations must be an integer >= 0" >&2
+        exit 1
+    fi
+    if ! [[ "$CLEANUP_DAYS" =~ ^[0-9]+$ && "$CLEANUP_DAYS" -ge 0 ]]; then
+        echo "cleanup-days must be a non-negative integer" >&2
+        exit 1
+    fi
+    if [[ "$CLEANUP_MODE" != "zip" && "$CLEANUP_MODE" != "delete" ]]; then
+        echo "cleanup-mode must be 'zip' or 'delete'" >&2
+        exit 1
+    fi
+
+    IFS=',' read -r -a APPS <<< "$APP"
+    for i in "${!APPS[@]}"; do
+        APPS[i]="${APPS[i]// /}"
+    done
+    if [[ ${#APPS[@]} -eq 0 ]]; then
+        echo "at least one app name is required" >&2
+        exit 1
+    fi
+    # Reject every empty entry — `--app "hover,,worker"` or a trailing comma
+    # would otherwise create `$RUN_DIR//raw` and call `flyctl logs --app ""`.
+    for app_name in "${APPS[@]}"; do
+        if [[ -z "$app_name" ]]; then
+            echo "app list contains an empty value; check comma placement in --app/$APP" >&2
+            exit 1
+        fi
+    done
+
+    if command -v python3 >/dev/null 2>&1 && _is_python3 python3; then
+        PYTHON_CMD="python3"
+    elif command -v python >/dev/null 2>&1 && _is_python3 python; then
+        PYTHON_CMD="python"
+    elif command -v py >/dev/null 2>&1 && _is_python3 py -3; then
+        PYTHON_CMD="py"
+        PYTHON_ARGS=(-3)
+    fi
+
+    # Auto-generate settings suffix with appropriate units.
+    if [[ "$INTERVAL" -ge 60 ]]; then
+        INTERVAL_MINUTES=$(( INTERVAL / 60 ))
+        INTERVAL_STR="${INTERVAL_MINUTES}m"
+    else
+        INTERVAL_STR="${INTERVAL}s"
+    fi
+
+    if [[ "$ITERATIONS" -eq 0 ]]; then
+        SETTINGS_SUFFIX="${INTERVAL_STR}_forever"
+    else
+        DURATION_SECONDS=$(( ITERATIONS * INTERVAL ))
+        if [[ "$DURATION_SECONDS" -ge 86400 ]]; then
+            DURATION_DAYS=$(( (DURATION_SECONDS + 43200) / 86400 ))
+            DURATION_STR="${DURATION_DAYS}d"
+        elif [[ "$DURATION_SECONDS" -ge 3600 ]]; then
+            DURATION_HOURS=$(( (DURATION_SECONDS + 1800) / 3600 ))
+            DURATION_STR="${DURATION_HOURS}h"
+        else
+            DURATION_MINUTES=$(( (DURATION_SECONDS + 30) / 60 ))
+            DURATION_STR="${DURATION_MINUTES}m"
+        fi
+        SETTINGS_SUFFIX="${INTERVAL_STR}_${DURATION_STR}"
+    fi
+
+    # `--run-id` is interpolated into a filesystem path; reject anything that
+    # could escape the advertised `logs/YYYYMMDD/HHMM_<slug>_<settings>/`
+    # layout (path separators, traversal segments, leading dot/dash).
+    if [[ -n "$RUN_ID" && ! "$RUN_ID" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; then
+        echo "run-id may only contain letters, numbers, dot, underscore, and hyphen, and may not start with '.' or '-'" >&2
+        exit 1
+    fi
+    if [[ -z "$RUN_ID" ]]; then
+        RUN_SLUG="$(generate_run_slug)"
+    else
+        RUN_SLUG="$RUN_ID"
+    fi
+    RUN_ID="${RUN_SLUG}_${SETTINGS_SUFFIX}"
+
+    ANALYSE_EVERY_SECONDS=$(parse_duration "$ANALYSE_EVERY")
+
+    DATE_DIR="$OUTPUT_ROOT/$(date +"%Y%m%d")"
+    TIME_PREFIX=$(date +"%H%M")
+    RUN_DIR="$DATE_DIR/${TIME_PREFIX}_${RUN_ID}"
+    LOG_FILE="$RUN_DIR/monitor.log"
+
+    mkdir -p "$RUN_DIR"
+    for app in "${APPS[@]}"; do
+        mkdir -p "$RUN_DIR/$app/raw"
+    done
+
+    # Output helpers — keep TTY tidy, monitor.log retains every event.
+    iso_ts() { date -u +"%Y-%m-%dT%H:%M:%SZ"; }
+    log_to_file() { echo "[$(iso_ts)] $*" >> "$LOG_FILE"; }
+    log_user() {
+        # Print a user-facing message and record it to the log too.
+        echo "$*"
+        echo "[$(iso_ts)] $*" >> "$LOG_FILE"
+    }
+
+    USE_TICKER=false
+    if [[ -t 1 ]]; then USE_TICKER=true; fi
+
+    # ANSI palette — empty when not on a TTY so non-TTY output stays plain and
+    # `monitor.log` writes (which use the *_plain variables) never carry codes.
+    if [[ "$USE_TICKER" == "true" ]]; then
+        C_BOLD=$'\033[1m'
+        # Bright-black foreground (a.k.a. grey) rather than \033[2m dim — many
+        # terminals (VS Code's included) render the dim attribute by altering
+        # the background, which produces a visible black box behind the text.
+        C_DIM=$'\033[90m'
+        C_CYAN=$'\033[36m'
+        C_GREEN=$'\033[32m'
+        C_YELLOW=$'\033[33m'
+        C_RESET=$'\033[0m'
+    else
+        C_BOLD="" C_DIM="" C_CYAN="" C_GREEN="" C_YELLOW="" C_RESET=""
+    fi
+
+    emit_styled() {
+        # Print a styled line to stdout, plain text to the log.
+        local plain="$1" styled="$2"
+        if [[ "$USE_TICKER" == "true" ]]; then
+            printf "%s\n" "$styled"
+        else
+            echo "$plain"
+        fi
+        echo "[$(iso_ts)] $plain" >> "$LOG_FILE"
+    }
+
+    fmt_duration() {
+        local s=$1
+        if (( s < 60 )); then printf "%ds" "$s"; return; fi
+        if (( s < 3600 )); then printf "%dm %ds" $((s/60)) $((s%60)); return; fi
+        printf "%dh %dm" $((s/3600)) $(( (s%3600)/60 ))
+    }
+    ticker_done() {
+        if [[ "$USE_TICKER" == "true" ]]; then
+            printf "\n"
+        fi
+    }
+
+    # ── Animated ticker ──────────────────────────────────────────────────
+    # The ticker line is redrawn at 5Hz by a background process while the
+    # main loop captures logs. State is shared via a small file the main
+    # loop writes once per iteration; the animator reads it on every redraw
+    # so the spinner, elapsed time, and snapshot countdown all keep ticking
+    # even while flyctl calls are in flight.
+
+    TICKER_STATE_FILE="$RUN_DIR/.ticker_state"
+    TICKER_ANIMATOR_PID=""
+
+    write_ticker_state() {
+        # Single line:
+        # iter iter_max start_epoch last_analyse_epoch analyse_every captured_total
+        printf '%d %d %d %d %d %d\n' \
+            "$iteration" "$ITERATIONS" "$start_epoch" "$last_analyse_epoch" \
+            "$ANALYSE_EVERY_SECONDS" "$CAPTURED_TOTAL" \
+            > "$TICKER_STATE_FILE"
+    }
+
+    ticker_animator() {
+        local idx=0
+        local frames=(◐ ◓ ◑ ◒)
+        local n=${#frames[@]}
+        local iter_num iter_max s_epoch a_epoch a_every captured
+        local now elapsed elapsed_fmt iter_styled iter_max_part snap_part until_snap snap_fmt spin
+        while [[ -f "$TICKER_STATE_FILE" ]]; do
+            if read -r iter_num iter_max s_epoch a_epoch a_every captured \
+                < "$TICKER_STATE_FILE" 2>/dev/null \
+                && [[ -n "${iter_num:-}" ]]; then
+                now=$(date +%s)
+                elapsed=$(( now - s_epoch ))
+                elapsed_fmt=$(fmt_duration $elapsed)
+                if [[ "$iter_max" -gt 0 ]]; then
+                    iter_max_part=" / ${iter_max}"
+                else
+                    iter_max_part=""
+                fi
+                if [[ "$a_every" -gt 0 ]]; then
+                    until_snap=$(( a_every - (now - a_epoch) ))
+                    (( until_snap < 0 )) && until_snap=0
+                    snap_fmt=$(fmt_duration $until_snap)
+                    snap_part=" ${C_DIM}-${C_RESET} ${C_DIM}next snapshot ${snap_fmt}${C_RESET}"
+                else
+                    snap_part=""
+                fi
+                spin="${frames[$idx]}"
+                printf "\r\033[K   ${C_BOLD}${C_CYAN}%s${C_RESET} ${C_BOLD}${C_CYAN}%s${C_RESET} ${C_DIM}-${C_RESET} ${C_BOLD}${C_CYAN}%s${C_RESET}%s ${C_DIM}-${C_RESET} ${C_BOLD}${C_CYAN}%s${C_RESET} logs%s" \
+                    "$spin" "$elapsed_fmt" "$iter_num" "$iter_max_part" "$captured" "$snap_part"
+            fi
+            idx=$(( (idx + 1) % n ))
+            sleep 0.2
+        done
+    }
+
+    stop_ticker_animator() {
+        if [[ -n "$TICKER_ANIMATOR_PID" ]]; then
+            kill "$TICKER_ANIMATOR_PID" 2>/dev/null || true
+            wait "$TICKER_ANIMATOR_PID" 2>/dev/null || true
+            TICKER_ANIMATOR_PID=""
+        fi
+        rm -f "$TICKER_STATE_FILE"
+    }
+
+    STOP_REQUESTED=false
+    on_interrupt() {
+        STOP_REQUESTED=true
+        stop_ticker_animator
+        ticker_done
+        emit_styled \
+            "Stop requested — finishing up..." \
+            "${C_BOLD}${C_YELLOW}Stop requested${C_RESET} — final iteration & report..."
+    }
+    trap on_interrupt INT TERM
+
+    # Sleep but watch stdin for `q` so the user has a clean alternative to
+    # Ctrl+C. Any other keystroke is ignored (we keep polling). Falls back to
+    # a plain sleep when stdin isn't an interactive TTY (CI, redirected input).
+    poll_quit_or_sleep() {
+        local duration=$1
+        if [[ "$USE_TICKER" != "true" || ! -t 0 ]]; then
+            sleep "$duration" || true
+            return
+        fi
+        local target=$(( $(date +%s) + duration ))
+        local key=""
+        while (( $(date +%s) < target )); do
+            local rem=$(( target - $(date +%s) ))
+            (( rem < 1 )) && rem=1
+            if read -rsn1 -t "$rem" key 2>/dev/null; then
+                case "$key" in
+                    q|Q)
+                        STOP_REQUESTED=true
+                        stop_ticker_animator
+                        ticker_done
+                        emit_styled \
+                            "Stop requested (q) — final iteration & report..." \
+                            "${C_BOLD}${C_YELLOW}Stop requested${C_RESET} (q) — final iteration & report..."
+                        return
+                        ;;
+                    *) ;;  # ignore stray keystrokes, keep polling
+                esac
+            fi
+        done
+    }
+
+    # Cleanup is now silent on TTY (recorded in monitor.log only) — it ran on
+    # almost every invocation and dominated the startup banner.
+    if [[ "$CLEANUP_OLD" == "true" ]]; then
+        log_to_file "Cleaning up old runs (older than $CLEANUP_DAYS days, mode: $CLEANUP_MODE)"
+        if [[ "$(uname)" == "Darwin" ]]; then
+            CUTOFF_DATE=$(date -u -v-${CLEANUP_DAYS}d +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d")
+        else
+            CUTOFF_DATE=$(date -u -d "$CLEANUP_DAYS days ago" +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d")
+        fi
+        if [[ -d "$OUTPUT_ROOT" ]]; then
+            find "$OUTPUT_ROOT" -mindepth 2 -maxdepth 2 -type d | while read -r run_dir; do
+                date_dir=$(basename "$(dirname "$run_dir")")
+                if ! [[ "$date_dir" =~ ^[0-9]{8}$ ]]; then continue; fi
+                if [[ "$date_dir" -ge "$CUTOFF_DATE" ]]; then continue; fi
+                run_name=$(basename "$run_dir")
+                if [[ "$CLEANUP_MODE" == "zip" ]]; then
+                    while IFS= read -r raw_dir; do
+                        [[ -z "$raw_dir" ]] && continue
+                        zip_parent=$(dirname "$raw_dir")
+                        [[ -f "$zip_parent/raw.zip" ]] && continue
+                        rel=${raw_dir#"$run_dir/"}
+                        log_to_file "  Zipping raw logs: $date_dir/$run_name/$rel"
+                        (cd "$zip_parent" && zip -q -r "raw.zip" "raw" && rm -rf "raw") || \
+                            log_to_file "  Failed to zip raw directory $raw_dir"
+                    done < <(find "$run_dir" -type d -name raw 2>/dev/null)
+                    # Null-delimited so a `--run-id` containing spaces (or any
+                    # other whitespace) doesn't split filenames mid-token.
+                    if find "$run_dir" -type f -name '*_iter*.json' -print -quit 2>/dev/null | grep -q .; then
+                        log_to_file "  Removing iteration JSONs: $date_dir/$run_name"
+                        find "$run_dir" -type f -name '*_iter*.json' -print0 2>/dev/null | xargs -0 rm -f || \
+                            log_to_file "  Failed to remove iteration JSONs in $run_dir"
+                    fi
+                else
+                    log_to_file "  Deleting: $date_dir/$run_name"
+                    rm -rf "$run_dir" || log_to_file "  Failed to delete $run_dir"
+                fi
+            done
+        fi
+        log_to_file "Cleanup complete"
+    fi
+
+    # Compact startup banner: run dir, app list, and one settings line.
+    if [[ "$ITERATIONS" -gt 0 ]]; then
+        DURATION_HINT=" (~$(fmt_duration $((ITERATIONS * INTERVAL))))"
+    else
+        DURATION_HINT=" (forever)"
+    fi
+    # Comma-space join. `${APPS[*]}` only honours the first IFS char, so build
+    # the joined string explicitly to keep the space after each comma.
+    printf -v APPS_JOINED '%s, ' "${APPS[@]}"
+    APPS_JOINED="${APPS_JOINED%, }"
+    if [[ "$ANALYSE_EVERY_SECONDS" -gt 0 ]]; then
+        SNAP_HINT="every $ANALYSE_EVERY"
+    else
+        SNAP_HINT="disabled"
+    fi
+
+    # Rule + key/value helpers for the bracketed banner / ticker / done layout.
+    # All rules share one width so they line up; separate BANNER_WIDTH governs
+    # apps-list wrapping so a narrow rule doesn't force aggressive wraps.
+    RULE_WIDTH=60
+    BANNER_WIDTH=60
+    rule_chars() { printf '─%.0s' $(seq 1 "$1"); }
+
+    print_top_rule() {
+        local label="$1"
+        local pad=$(( RULE_WIDTH - 4 - ${#label} ))
+        (( pad < 0 )) && pad=0
+        local trail
+        trail=$(rule_chars "$pad")
+        if [[ "$USE_TICKER" == "true" ]]; then
+            printf "${C_DIM}── ${C_RESET}${C_BOLD}${C_CYAN}%s${C_RESET}${C_DIM} %s${C_RESET}\n" "$label" "$trail"
+        else
+            printf -- "── %s %s\n" "$label" "$trail"
+        fi
+        echo "[$(iso_ts)] ── $label ──" >> "$LOG_FILE"
+    }
+    print_rule() {
+        local line
+        line=$(rule_chars "$RULE_WIDTH")
+        if [[ "$USE_TICKER" == "true" ]]; then
+            printf "${C_DIM}%s${C_RESET}\n" "$line"
+        else
+            printf "%s\n" "$line"
+        fi
+        echo "[$(iso_ts)] $line" >> "$LOG_FILE"
+    }
+    print_kv() {
+        # Indented key/value line. Key in default weight; value dimmed (grey)
+        # so the eye lands on the slug + ticker, not the static config block.
+        # 3-space indent so the key column aligns with the slug label in the
+        # top rule (which sits at column 3, after `── `).
+        local key="$1" value="$2"
+        if [[ "$USE_TICKER" == "true" ]]; then
+            printf "   %-10s  ${C_DIM}%s${C_RESET}\n" "$key" "$value"
+        else
+            printf "   %-10s  %s\n" "$key" "$value"
+        fi
+        echo "[$(iso_ts)] $key: $value" >> "$LOG_FILE"
+    }
+    print_kv_wrapped() {
+        # Like print_kv but wraps `value` at word boundaries onto continuation
+        # lines indented to align under the value column.
+        local key="$1" value="$2"
+        local cont_indent="               "  # 3 + 10 + 2 = 15 spaces
+        local wrap_width=$(( BANNER_WIDTH - 15 ))
+        local first=true
+        while IFS= read -r line; do
+            if $first; then
+                print_kv "$key" "$line"
+                first=false
+            else
+                if [[ "$USE_TICKER" == "true" ]]; then
+                    printf "%s${C_DIM}%s${C_RESET}\n" "$cont_indent" "$line"
+                else
+                    printf "%s%s\n" "$cont_indent" "$line"
+                fi
+                echo "[$(iso_ts)]   $line" >> "$LOG_FILE"
+            fi
+        done < <(printf "%s\n" "$value" | fold -s -w "$wrap_width")
+    }
+
+    print_top_rule "$RUN_SLUG"
+    print_kv "Run" "$RUN_DIR"
+    print_kv_wrapped "Apps" "$APPS_JOINED"
+    print_kv "Interval" "${INTERVAL}s"
+    print_kv "Samples" "${SAMPLES} lines/fetch"
+    print_kv "Iterations" "${ITERATIONS}${DURATION_HINT}"
+    print_kv "Snapshots" "$SNAP_HINT"
+    if [[ "$USE_TICKER" == "true" && -t 0 ]]; then
+        print_kv "Quit" "press q or Ctrl/CMD + C "
+    fi
+    print_rule
+    if [[ -z "$PYTHON_CMD" ]]; then
+        log_user "Python not found; continuing with raw log capture only"
+    fi
+
+    capture_app() {
+        local app="$1" ts="$2" iter="$3"
+        local raw_file="$RUN_DIR/$app/raw/${ts}_iter${iter}.log"
+        local summary_file="$RUN_DIR/$app/${ts}_iter${iter}.json"
+        local cursor_file="$RUN_DIR/$app/.cursor"
+
+        # `flyctl logs --no-tail` returns the same recent window every call, so
+        # filter against a per-app cursor to keep only lines newer than the
+        # last iteration. Falls back to the unfiltered capture when Python is
+        # unavailable.
+        if [[ -n "$PYTHON_CMD" ]]; then
+            if ! flyctl logs --app "$app" --no-tail 2>&1 \
+                | tail -n "$SAMPLES" \
+                | env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \
+                    "$SCRIPT_DIR/filter_since.py" "$cursor_file" \
+                > "$raw_file"; then
+                log_to_file "[$app] Failed to fetch logs from Fly; raw output stored in $raw_file"
+                return
+            fi
+        else
+            if ! flyctl logs --app "$app" --no-tail 2>&1 | tail -n "$SAMPLES" > "$raw_file"; then
+                log_to_file "[$app] Failed to fetch logs from Fly; raw output stored in $raw_file"
+                return
+            fi
+        fi
+
+        if [[ ! -s "$raw_file" ]]; then
+            return
+        fi
+        if [[ -z "$PYTHON_CMD" ]]; then
+            log_to_file "[$app] Captured raw logs only (Python unavailable)"
+            return
+        fi
+        if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/process_logs.py" "$raw_file" "$summary_file" >> "$LOG_FILE" 2>&1; then
+            log_to_file "[$app] Failed to process logs (see output above)"
+            return
+        fi
+        if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1; then
+            log_to_file "[$app] Failed to aggregate logs (see output above)"
+        fi
+    }
+
+    run_snapshot_analyse() {
+        if [[ -z "$PYTHON_CMD" ]]; then return; fi
+        local snap_ts
+        snap_ts=$(date -u +"%H%M%SZ")
+        mkdir -p "$RUN_DIR/snapshots"
+        local run_ref="$(basename "$DATE_DIR")/$(basename "$RUN_DIR")"
+        log_to_file "Snapshot analyse → $RUN_DIR/snapshots/analysis_${snap_ts}.md"
+        env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \
+            "$SCRIPT_DIR/analyse_logs.py" \
+            --root "$OUTPUT_ROOT" \
+            --run "$run_ref" \
+            --out "$RUN_DIR/snapshots/analysis_${snap_ts}" >> "$LOG_FILE" 2>&1 || \
+            log_to_file "Snapshot analyse failed (see log)"
+    }
+
+    iteration=0
+    start_epoch=$(date +%s)
+    last_analyse_epoch=$start_epoch
+    CAPTURED_TOTAL=0
+
+    # Kick off the background animator before the first capture so the
+    # ticker line is alive from t=0.
+    if [[ "$USE_TICKER" == "true" ]]; then
+        write_ticker_state
+        ticker_animator &
+        TICKER_ANIMATOR_PID=$!
+    fi
+
+    while true; do
+        iteration=$((iteration + 1))
+        iter_start_epoch=$(date +%s)
+        ts=$(date -u +"%Y%m%dT%H%M%SZ")
+        log_to_file "Iteration $iteration: capturing logs"
+
+        # Capture all apps in parallel — flyctl calls are independent, and
+        # running them sequentially made each iteration ~5-10s instead of the
+        # advertised INTERVAL. Track each PID explicitly so we wait only for
+        # the capture children — bare `wait` would also block on the ticker
+        # animator, which is an intentional infinite loop.
+        local capture_pids=()
+        for app in "${APPS[@]}"; do
+            capture_app "$app" "$ts" "$iteration" &
+            capture_pids+=($!)
+        done
+        for pid in "${capture_pids[@]}"; do
+            wait "$pid" 2>/dev/null || true
+        done
+
+        # Tally lines persisted this iteration (cursor-filtered, so reflects
+        # genuinely new log lines, not the flyctl --no-tail window).
+        for app in "${APPS[@]}"; do
+            raw="$RUN_DIR/$app/raw/${ts}_iter${iteration}.log"
+            if [[ -f "$raw" ]]; then
+                n=$(wc -l < "$raw" 2>/dev/null | tr -d ' ')
+                CAPTURED_TOTAL=$(( CAPTURED_TOTAL + ${n:-0} ))
+            fi
+        done
+
+        if [[ "$ANALYSE_EVERY_SECONDS" -gt 0 ]]; then
+            now_epoch=$(date +%s)
+            if (( now_epoch - last_analyse_epoch >= ANALYSE_EVERY_SECONDS )); then
+                run_snapshot_analyse
+                last_analyse_epoch=$now_epoch
+            fi
+        fi
+
+        # Hand the latest counters to the animator; it picks them up on its
+        # next 200ms redraw.
+        write_ticker_state
+        log_to_file "iter ${iteration}/${ITERATIONS} · elapsed $(fmt_duration $(( $(date +%s) - start_epoch )))"
+
+        if [[ "$STOP_REQUESTED" == "true" ]]; then break; fi
+        if [[ "$ITERATIONS" -ne 0 && "$iteration" -ge "$ITERATIONS" ]]; then break; fi
+
+        # `--interval` is the wall-clock period between iteration starts; if
+        # capture+analyse took longer than INTERVAL we run back-to-back (and
+        # log a warning) instead of compounding the lag.
+        iter_elapsed=$(( $(date +%s) - iter_start_epoch ))
+        remaining=$(( INTERVAL - iter_elapsed ))
+        if (( remaining > 0 )); then
+            poll_quit_or_sleep "$remaining"
+        else
+            log_to_file "Iteration $iteration took ${iter_elapsed}s (>= interval ${INTERVAL}s); no sleep"
+        fi
+        if [[ "$STOP_REQUESTED" == "true" ]]; then break; fi
+    done
+
+    stop_ticker_animator
+    ticker_done
+    trap - INT TERM
+
+    if [[ -z "$PYTHON_CMD" ]]; then
+        log_user "Skipping aggregation (Python unavailable)"
+    else
+        log_to_file "Running final aggregation..."
+        for app in "${APPS[@]}"; do
+            if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1; then
+                log_to_file "[$app] Final aggregation failed (see output above)"
+            fi
+        done
+        log_to_file "Aggregation complete"
+
+        log_to_file "Running final analyse..."
+        run_ref="$(basename "$DATE_DIR")/$(basename "$RUN_DIR")"
+        env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" \
+            "$SCRIPT_DIR/analyse_logs.py" \
+            --root "$OUTPUT_ROOT" \
+            --run "$run_ref" >> "$LOG_FILE" 2>&1 || \
+            log_to_file "Final analyse failed (see log)"
+        print_rule
+        emit_styled \
+            "   ✓ Done after $iteration iteration(s)" \
+            "   ${C_BOLD}${C_GREEN}✓ Done${C_RESET} after ${C_BOLD}${C_CYAN}$iteration${C_RESET} iteration(s)"
+        emit_styled \
+            "     Report: $RUN_DIR/analysis.md" \
+            "     ${C_BOLD}Report:${C_RESET} $RUN_DIR/analysis.md"
+    fi
+}
+
+# Default subcommand is `monitor`. Bare `logs.sh`, or any invocation whose
+# first positional starts with a dash (i.e. a flag, not a subcommand), runs
+# monitor with the supplied flags.
+if [[ $# -eq 0 ]]; then
+    cmd_monitor
+    exit 0
+fi
+
+case "$1" in
+    -h|--help|help)     usage_top; exit 0 ;;
+    monitor)            shift; cmd_monitor "$@" ;;
+    search)             shift; cmd_search "$@" ;;
+    analyse|analyze)    shift; cmd_analyse "$@" ;;
+    -*)                 cmd_monitor "$@" ;;
+    *)
+        echo "Unknown command: $1" >&2
+        usage_top
+        exit 1
+        ;;
+esac
diff --git a/scripts/monitor_logs.sh b/scripts/monitor_logs.sh
index 6765d57e9..162f39f26 100755
--- a/scripts/monitor_logs.sh
+++ b/scripts/monitor_logs.sh
@@ -1,337 +1,9 @@
 #!/usr/bin/env bash
 
+# Back-compat shim — the canonical entry point is `logs.sh`.
+# All flags accepted by this script are forwarded to `logs.sh monitor`.
+
 set -euo pipefail
 
 SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-
-APP="hover,hover-worker,hover-analysis"
-INTERVAL=10
-SAMPLES=400
-ITERATIONS=1440  # 4 hours at 10s intervals
-RUN_ID=""
-OUTPUT_ROOT="logs"
-CLEANUP_OLD=true
-CLEANUP_DAYS=1
-CLEANUP_MODE="zip"
-PYTHON_CMD=""
-PYTHON_ARGS=()
-
-usage() {
-    cat <<'USAGE'
-Usage: monitor_logs.sh [options]
-
-Fetch recent Fly logs on a fixed cadence, archive the raw output, and write
-per-minute summaries describing how often each log level/message occurred.
-
-Automatic cleanup (enabled by default):
-  - Zips raw logs and iteration JSONs from runs older than 1 day
-  - Keeps summary.md, summary.json, and monitor.log
-  - Use --no-cleanup to disable or --cleanup-mode delete to remove everything
-
-Options:
-  --app NAMES           Fly application name(s), comma-separated
-                        (default: hover,hover-worker,hover-analysis)
-  --interval SECONDS    Seconds to wait between samples (default: 60)
-  --samples N           Number of log lines to request each run (default: 400)
-  --iterations N        Number of iterations to perform (0 = run forever)
-  --run-id ID           Identifier used when naming output directories
-  --no-cleanup          Disable automatic cleanup (default: enabled)
-  --cleanup-days N      Clean runs older than N days (default: 1)
-  --cleanup-mode MODE   How to clean: 'zip' or 'delete' (default: zip)
-                        zip: archives raw/ and iteration JSONs, keeps summaries
-                        delete: removes entire run directory
-  -h, --help            Show this message and exit
-
-Environment variables with the same names (APP, INTERVAL, SAMPLES, ITERATIONS,
-RUN_ID) override the defaults as well.
-
-Examples:
-  # Default: auto-zip raw logs from runs older than 1 day
-  ./monitor_logs.sh
-
-  # Disable cleanup
-  ./monitor_logs.sh --no-cleanup
-
-  # Delete entire runs older than 3 days
-  ./monitor_logs.sh --cleanup-days 3 --cleanup-mode delete
-USAGE
-}
-
-# Allow environment variables to override defaults
-APP=${APP:-$APP}
-INTERVAL=${INTERVAL:-$INTERVAL}
-SAMPLES=${SAMPLES:-$SAMPLES}
-ITERATIONS=${ITERATIONS:-$ITERATIONS}
-RUN_ID=${RUN_ID:-$RUN_ID}
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --app)
-            APP="$2"
-            shift 2
-            ;;
-        --interval)
-            INTERVAL="$2"
-            shift 2
-            ;;
-        --samples)
-            SAMPLES="$2"
-            shift 2
-            ;;
-        --iterations)
-            ITERATIONS="$2"
-            shift 2
-            ;;
-        --run-id)
-            RUN_ID="$2"
-            shift 2
-            ;;
-        --no-cleanup)
-            CLEANUP_OLD=false
-            shift
-            ;;
-        --cleanup-days)
-            CLEANUP_DAYS="$2"
-            shift 2
-            ;;
-        --cleanup-mode)
-            CLEANUP_MODE="$2"
-            shift 2
-            ;;
-        -h|--help)
-            usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: $1" >&2
-            usage
-            exit 1
-            ;;
-    esac
-done
-
-if ! [[ "$INTERVAL" =~ ^[0-9]+$ && "$INTERVAL" -gt 0 ]]; then
-    echo "interval must be a positive integer" >&2
-    exit 1
-fi
-
-if ! [[ "$SAMPLES" =~ ^[0-9]+$ && "$SAMPLES" -ge 1 && "$SAMPLES" -le 10000 ]]; then
-    echo "samples must be an integer between 1 and 10000" >&2
-    exit 1
-fi
-
-if ! [[ "$ITERATIONS" =~ ^[0-9]+$ ]]; then
-    echo "iterations must be an integer >= 0" >&2
-    exit 1
-fi
-
-if ! [[ "$CLEANUP_DAYS" =~ ^[0-9]+$ && "$CLEANUP_DAYS" -ge 0 ]]; then
-    echo "cleanup-days must be a non-negative integer" >&2
-    exit 1
-fi
-
-if [[ "$CLEANUP_MODE" != "zip" && "$CLEANUP_MODE" != "delete" ]]; then
-    echo "cleanup-mode must be 'zip' or 'delete'" >&2
-    exit 1
-fi
-
-# Split APP into an array on commas (and strip whitespace around entries).
-IFS=',' read -r -a APPS <<< "$APP"
-for i in "${!APPS[@]}"; do
-    APPS[i]="${APPS[i]// /}"
-done
-if [[ ${#APPS[@]} -eq 0 || -z "${APPS[0]}" ]]; then
-    echo "at least one app name is required" >&2
-    exit 1
-fi
-
-# Resolve a working Python interpreter for log processing helpers.
-if command -v python3 >/dev/null 2>&1 && python3 -c "import sys" >/dev/null 2>&1; then
-    PYTHON_CMD="python3"
-elif command -v python >/dev/null 2>&1 && python -c "import sys" >/dev/null 2>&1; then
-    PYTHON_CMD="python"
-elif command -v py >/dev/null 2>&1 && py -3 -c "import sys" >/dev/null 2>&1; then
-    PYTHON_CMD="py"
-    PYTHON_ARGS=(-3)
-fi
-
-# Auto-generate settings suffix with appropriate units
-# Interval: use minutes if >= 60s, otherwise seconds
-if [[ "$INTERVAL" -ge 60 ]]; then
-    INTERVAL_MINUTES=$(( INTERVAL / 60 ))
-    INTERVAL_STR="${INTERVAL_MINUTES}m"
-else
-    INTERVAL_STR="${INTERVAL}s"
-fi
-
-if [[ "$ITERATIONS" -eq 0 ]]; then
-    SETTINGS_SUFFIX="${INTERVAL_STR}_forever"
-else
-    # Calculate total duration in seconds
-    DURATION_SECONDS=$(( ITERATIONS * INTERVAL ))
-
-    # Duration: use days if >= 24h, hours if >= 60m, otherwise minutes
-    if [[ "$DURATION_SECONDS" -ge 86400 ]]; then
-        DURATION_DAYS=$(( (DURATION_SECONDS + 43200) / 86400 ))
-        DURATION_STR="${DURATION_DAYS}d"
-    elif [[ "$DURATION_SECONDS" -ge 3600 ]]; then
-        DURATION_HOURS=$(( (DURATION_SECONDS + 1800) / 3600 ))
-        DURATION_STR="${DURATION_HOURS}h"
-    else
-        DURATION_MINUTES=$(( (DURATION_SECONDS + 30) / 60 ))
-        DURATION_STR="${DURATION_MINUTES}m"
-    fi
-
-    SETTINGS_SUFFIX="${INTERVAL_STR}_${DURATION_STR}"
-fi
-
-# Combine custom name (if provided) with settings
-if [[ -z "$RUN_ID" ]]; then
-    RUN_ID="$SETTINGS_SUFFIX"
-else
-    RUN_ID="${RUN_ID}_${SETTINGS_SUFFIX}"
-fi
-
-# Create directory structure: logs/YYYYMMDD/HHMM_run-id/
-DATE_DIR="$OUTPUT_ROOT/$(date +"%Y%m%d")"
-TIME_PREFIX=$(date +"%H%M")
-RUN_DIR="$DATE_DIR/${TIME_PREFIX}_${RUN_ID}"
-LOG_FILE="$RUN_DIR/monitor.log"
-
-mkdir -p "$RUN_DIR"
-for app in "${APPS[@]}"; do
-    mkdir -p "$RUN_DIR/$app/raw"
-done
-
-# Cleanup old runs if requested
-if [[ "$CLEANUP_OLD" == "true" ]]; then
-    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Cleaning up old runs (older than $CLEANUP_DAYS days, mode: $CLEANUP_MODE)" | tee -a "$LOG_FILE"
-
-    # Calculate cutoff date (days ago)
-    if [[ "$(uname)" == "Darwin" ]]; then
-        # macOS date command
-        CUTOFF_DATE=$(date -u -v-${CLEANUP_DAYS}d +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d")
-    else
-        # GNU date command
-        CUTOFF_DATE=$(date -u -d "$CLEANUP_DAYS days ago" +"%Y%m%d" 2>/dev/null || date -u +"%Y%m%d")
-    fi
-
-    # Find old run directories
-    if [[ -d "$OUTPUT_ROOT" ]]; then
-        find "$OUTPUT_ROOT" -mindepth 2 -maxdepth 2 -type d | while read -r run_dir; do
-            # Extract date from path (logs/YYYYMMDD/HHMM_run-id)
-            date_dir=$(basename "$(dirname "$run_dir")")
-
-            # Skip if not a date directory
-            if ! [[ "$date_dir" =~ ^[0-9]{8}$ ]]; then
-                continue
-            fi
-
-            # Skip if directory is from today or newer than cutoff
-            if [[ "$date_dir" -ge "$CUTOFF_DATE" ]]; then
-                continue
-            fi
-
-            run_name=$(basename "$run_dir")
-
-            if [[ "$CLEANUP_MODE" == "zip" ]]; then
-                # Zip mode: archive any raw/ dirs and iteration JSONs, keep summaries.
-                # Handles both legacy layout ($run_dir/raw) and per-app layout
-                # ($run_dir/<app>/raw).
-                while IFS= read -r raw_dir; do
-                    [[ -z "$raw_dir" ]] && continue
-                    zip_parent=$(dirname "$raw_dir")
-                    [[ -f "$zip_parent/raw.zip" ]] && continue
-                    rel=${raw_dir#"$run_dir/"}
-                    echo "  Zipping raw logs: $date_dir/$run_name/$rel" | tee -a "$LOG_FILE"
-                    (cd "$zip_parent" && zip -q -r "raw.zip" "raw" && rm -rf "raw") || {
-                        echo "  Failed to zip raw directory $raw_dir" | tee -a "$LOG_FILE"
-                    }
-                done < <(find "$run_dir" -type d -name raw 2>/dev/null)
-
-                # Remove iteration JSONs anywhere under the run dir (keep summaries).
-                iter_files=$(find "$run_dir" -type f -name '*_iter*.json' 2>/dev/null)
-                if [[ -n "$iter_files" ]]; then
-                    echo "  Removing iteration JSONs: $date_dir/$run_name" | tee -a "$LOG_FILE"
-                    printf '%s\n' "$iter_files" | xargs rm -f || {
-                        echo "  Failed to remove iteration JSONs in $run_dir" | tee -a "$LOG_FILE"
-                    }
-                fi
-            else
-                # Delete mode: remove entire run directory
-                echo "  Deleting: $date_dir/$run_name" | tee -a "$LOG_FILE"
-                rm -rf "$run_dir" || {
-                    echo "  Failed to delete $run_dir" | tee -a "$LOG_FILE"
-                }
-            fi
-        done
-    fi
-
-    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Cleanup complete" | tee -a "$LOG_FILE"
-fi
-
-echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Starting log monitor" | tee -a "$LOG_FILE"
-echo "Apps: ${APPS[*]} | Interval: ${INTERVAL}s | Samples: $SAMPLES | Iterations: $ITERATIONS" | tee -a "$LOG_FILE"
-echo "Run directory: $RUN_DIR" | tee -a "$LOG_FILE"
-for app in "${APPS[@]}"; do
-    echo "  [$app] raw: $RUN_DIR/$app/raw  summaries: $RUN_DIR/$app" | tee -a "$LOG_FILE"
-done
-if [[ -z "$PYTHON_CMD" ]]; then
-    echo "Python not found; continuing with raw log capture only" | tee -a "$LOG_FILE"
-fi
-
-# capture_app APP_NAME TIMESTAMP ITER — fetch + process logs for a single app.
-capture_app() {
-    local app="$1" ts="$2" iter="$3"
-    local raw_file="$RUN_DIR/$app/raw/${ts}_iter${iter}.log"
-    local summary_file="$RUN_DIR/$app/${ts}_iter${iter}.json"
-
-    if ! flyctl logs --app "$app" --no-tail 2>&1 | tail -n "$SAMPLES" > "$raw_file"; then
-        echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] [$app] Failed to fetch logs from Fly; raw output stored in $raw_file" | tee -a "$LOG_FILE"
-        return
-    fi
-
-    if [[ -z "$PYTHON_CMD" ]]; then
-        echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] [$app] Captured raw logs only (Python unavailable)" | tee -a "$LOG_FILE"
-        return
-    fi
-
-    if ! env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/process_logs.py" "$raw_file" "$summary_file" >> "$LOG_FILE" 2>&1; then
-        echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] [$app] Failed to process logs (see output above)" | tee -a "$LOG_FILE"
-        return
-    fi
-
-    env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1
-}
-
-iteration=0
-
-while true; do
-    iteration=$((iteration + 1))
-    ts=$(date -u +"%Y%m%dT%H%M%SZ")
-
-    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Iteration $iteration: capturing logs" | tee -a "$LOG_FILE"
-
-    for app in "${APPS[@]}"; do
-        capture_app "$app" "$ts" "$iteration"
-    done
-
-    if [[ "$ITERATIONS" -ne 0 && "$iteration" -ge "$ITERATIONS" ]]; then
-        break
-    fi
-
-    sleep "$INTERVAL"
-done
-
-echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Monitoring finished after $iteration iteration(s)" | tee -a "$LOG_FILE"
-
-# Final aggregation per app
-if [[ -z "$PYTHON_CMD" ]]; then
-    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Skipping aggregation (Python unavailable)" | tee -a "$LOG_FILE"
-else
-    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Running final aggregation..." | tee -a "$LOG_FILE"
-    for app in "${APPS[@]}"; do
-        env PYTHONUTF8=1 "$PYTHON_CMD" "${PYTHON_ARGS[@]+"${PYTHON_ARGS[@]}"}" "$SCRIPT_DIR/aggregate_logs.py" "$RUN_DIR/$app" >> "$LOG_FILE" 2>&1
-    done
-    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Aggregation complete" | tee -a "$LOG_FILE"
-fi
+exec "$SCRIPT_DIR/logs.sh" monitor "$@"
diff --git a/scripts/process_logs.py b/scripts/process_logs.py
index 7789a6f26..9180b0759 100755
--- a/scripts/process_logs.py
+++ b/scripts/process_logs.py
@@ -190,4 +190,7 @@ def main() -> int:
 
 
 if __name__ == "__main__":
-    sys.exit(main())
+    try:
+        sys.exit(main())
+    except KeyboardInterrupt:
+        sys.exit(130)
diff --git a/scripts/search_logs.py b/scripts/search_logs.py
new file mode 100755
index 000000000..eecb27079
--- /dev/null
+++ b/scripts/search_logs.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""Search captured Fly logs by keyword/regex across one or more apps.
+
+Reads raw captures from a `logs/YYYYMMDD/HHMM_<run-id>/<app>/raw/` directory or
+the zipped `raw.zip` produced by monitor cleanup. Prints matching lines with a
+source prefix and a per-app/per-run match count summary on stderr.
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import zipfile
+from pathlib import Path
+from typing import Iterator
+
+
+def _candidate_run_dirs(root: Path) -> list[Path]:
+    """Return run directories under `logs/YYYYMMDD/HHMM_*` sorted oldest -> newest."""
+    out: list[Path] = []
+    if not root.exists():
+        return out
+    for date_dir in sorted(root.iterdir()):
+        if not date_dir.is_dir() or len(date_dir.name) != 8 or not date_dir.name.isdigit():
+            continue
+        for run in sorted(date_dir.iterdir()):
+            if run.is_dir():
+                out.append(run)
+    return out
+
+
+def resolve_runs(run_arg: str | None, root: Path) -> list[Path]:
+    """Resolve the --run argument to one or more run directories.
+
+    None / 'latest' -> [most recent run]
+    'all'           -> every run
+    'YYYYMMDD'      -> runs under that date
+    other           -> prefix match against `<date>/<runname>` or run name
+    """
+    runs = _candidate_run_dirs(root)
+    if not runs:
+        return []
+    if run_arg in (None, "", "latest"):
+        return [runs[-1]]
+    if run_arg == "all":
+        return runs
+    target = run_arg.strip("/")
+    matched: list[Path] = []
+    for r in runs:
+        rel = str(r.relative_to(root))
+        if rel == target or rel.startswith(target + "/"):
+            matched.append(r)
+            continue
+        if r.parent.name == target or r.name == target or r.name.startswith(target):
+            matched.append(r)
+    return matched
+
+
+def _iter_app_dirs(run_dir: Path, app_filter: list[str] | None) -> Iterator[Path]:
+    for child in sorted(run_dir.iterdir()):
+        if not child.is_dir():
+            continue
+        if app_filter and child.name not in app_filter:
+            continue
+        if (child / "raw").exists() or (child / "raw.zip").exists():
+            yield child
+
+
+def iter_lines(app_dir: Path) -> Iterator[tuple[str, str]]:
+    """Yield (source_label, line) for every captured raw log line in an app dir."""
+    raw_dir = app_dir / "raw"
+    if raw_dir.exists():
+        for log in sorted(raw_dir.glob("*.log")):
+            with log.open("r", encoding="utf-8", errors="ignore") as f:
+                for line in f:
+                    yield (log.name, line.rstrip("\n"))
+    raw_zip = app_dir / "raw.zip"
+    if raw_zip.exists():
+        with zipfile.ZipFile(raw_zip) as zf:
+            for info in sorted(zf.infolist(), key=lambda i: i.filename):
+                if info.is_dir() or not info.filename.endswith(".log"):
+                    continue
+                with zf.open(info) as f:
+                    for raw_line in f:
+                        yield (
+                            Path(info.filename).name,
+                            raw_line.decode("utf-8", errors="ignore").rstrip("\n"),
+                        )
+
+
+def search(
+    runs: list[Path],
+    pattern: re.Pattern,
+    app_filter: list[str] | None,
+    max_matches: int,
+    root: Path,
+) -> int:
+    total = 0
+    stop = False
+    for run in runs:
+        per_app: dict[str, int] = {}
+        rel_run = run.relative_to(root)
+        for app_dir in _iter_app_dirs(run, app_filter):
+            count = 0
+            for source, line in iter_lines(app_dir):
+                if pattern.search(line):
+                    count += 1
+                    total += 1
+                    print(f"[{rel_run}][{app_dir.name}][{source}] {line}")
+                    if max_matches and total >= max_matches:
+                        stop = True
+                        break
+            per_app[app_dir.name] = count
+            if stop:
+                break
+        for app, n in per_app.items():
+            print(f"# {rel_run} {app}: {n} match(es)", file=sys.stderr)
+        if stop:
+            print("# match cap reached", file=sys.stderr)
+            break
+    print(f"# total matches: {total}", file=sys.stderr)
+    return total
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description="Search captured Fly logs.")
+    p.add_argument("--keyword", action="append", default=[], help="Literal substring (repeatable).")
+    p.add_argument("--regex", action="append", default=[], help="Regex pattern (repeatable).")
+    p.add_argument("--app", default="", help="Comma-separated app filter (default: all apps in run).")
+    p.add_argument("--run", default=None, help="'latest' (default), 'all', a date, or a run dir.")
+    p.add_argument("--root", default="logs", help="Logs root directory (default: logs).")
+    p.add_argument("-i", "--ignore-case", action="store_true", default=True,
+                   help="Match case-insensitively (default: on).")
+    p.add_argument("--case-sensitive", action="store_true",
+                   help="Force case-sensitive matching (overrides --ignore-case).")
+    p.add_argument("--max", type=int, default=0, help="Stop after N total matches (0 = unlimited).")
+    args = p.parse_args()
+
+    if not args.keyword and not args.regex:
+        print("Provide at least one --keyword or --regex.", file=sys.stderr)
+        return 2
+    if args.max < 0:
+        print("--max must be >= 0 (0 = unlimited).", file=sys.stderr)
+        return 2
+
+    parts = [re.escape(k) for k in args.keyword] + list(args.regex)
+    flags = 0 if args.case_sensitive else re.IGNORECASE
+    try:
+        pattern = re.compile("|".join(f"(?:{p})" for p in parts), flags)
+    except re.error as exc:
+        print(f"Invalid regex: {exc}", file=sys.stderr)
+        return 2
+
+    root = Path(args.root)
+    runs = resolve_runs(args.run, root)
+    if not runs:
+        print(f"No runs matched under {root} (run={args.run!r})", file=sys.stderr)
+        return 1
+
+    apps = [a.strip() for a in args.app.split(",") if a.strip()] or None
+    search(runs, pattern, apps, args.max, root)
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except KeyboardInterrupt:
+        sys.exit(130)