diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc45a6d..de8248c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,3 +9,4 @@ All notable changes to this project will be documented here.
 - Added model metrics artifact support.
 - Added DevSecOps repository documentation and GitHub workflow templates.
 - Documented AMD Developer Cloud / DigitalOcean primary compute path with Fireworks fallback.
+- Added vLLM metrics scraping and GPU snapshot evidence collection.
diff --git a/README.md b/README.md
index 6de6a0c..037aac7 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,8 @@ Each run writes:
 - `runs//events.jsonl`
 - `runs//findings.json`
 - `runs//metrics.json`
+- `runs//vllm-metrics-before.prom` when `/metrics` is reachable
+- `runs//vllm-metrics-after.prom` when `/metrics` is reachable
 - `runs//attack.py`
 - `runs//remediation.patch`
 - `runs//report.md`
diff --git a/docs/case-study-outline.md b/docs/case-study-outline.md
index d22499d..f549459 100644
--- a/docs/case-study-outline.md
+++ b/docs/case-study-outline.md
@@ -27,3 +27,4 @@ The architecture is built for long-context security evidence: Terraform plan JSO
 - Add more Azure rules: storage shared keys, public network access, Key Vault public access.
 - Add real LocalStack Azure exploit execution.
 - Add AWS and Kubernetes adapters behind the same run artifact model.
+- Add AMD GPU-hosted model evidence with vLLM `/metrics` and `amd-smi` or `rocm-smi` snapshots.
diff --git a/docs/case-study.md b/docs/case-study.md
index 1ad64c9..e4b5fa2 100644
--- a/docs/case-study.md
+++ b/docs/case-study.md
@@ -77,6 +77,8 @@ branch -> PR -> tests/lint/type/audit -> review -> squash merge -> tag -> releas
 
 See [Runbook](runbook.md).
 
+Operational evidence includes CLI run artifacts, model endpoint type, vLLM Prometheus snapshots when available, and local `amd-smi` or `rocm-smi` output when GPU tools are present.
+
 ## 9. Cost analysis
 
 See [Cost Report](cost-report.md). V1 is designed to run locally, with AMD Developer Cloud used only for model-serving evidence.
@@ -107,6 +109,7 @@ See [Failure Modes](failure-modes.md).
 - Add real LocalStack Azure exploit execution.
 - Add AWS, Kubernetes, and Docker Compose scenario detectors.
 - Add streamed time-to-first-token metrics.
+- Add AMD GPU-hosted model evidence with vLLM `/metrics` and `amd-smi` or `rocm-smi` snapshots.
 - Add SBOM and signed release provenance.
 
 ## 14. Repository and demo links
diff --git a/docs/cost-report.md b/docs/cost-report.md
index 19fcdb7..1b4ba32 100644
--- a/docs/cost-report.md
+++ b/docs/cost-report.md
@@ -24,3 +24,5 @@ V1 is designed to keep cloud spend near zero by default. Offline mode runs local
 ## Future tracking
 
 Record actual hackathon model-serving runtime, GPU hours, and any LocalStack or cloud access costs before publishing the case study.
+
+Also record whether each demo run used offline mode, managed inference, or AMD GPU-hosted inference. This keeps the case study honest if a managed endpoint is used as a fallback before DigitalOcean/AMD access is ready.
diff --git a/docs/runbook.md b/docs/runbook.md
index 900a2b5..c24c080 100644
--- a/docs/runbook.md
+++ b/docs/runbook.md
@@ -49,6 +49,35 @@ Use [AMD Compute Strategy](compute-strategy.md) as the deployment checklist. Bui
 
 If AMD GPU access is delayed, point `NULLSTATE_LLM_BASE_URL` at the managed endpoint and keep the same nullstate run flow. Label the evidence as managed inference, not private GPU-hosted inference.
 
+## Metrics evidence
+
+When `NULLSTATE_LLM_BASE_URL` is set, nullstate tries to scrape:
+
+```text
+/metrics
+```
+
+If the endpoint exposes vLLM Prometheus metrics, the run writes:
+
+- `vllm-metrics-before.prom`
+- `vllm-metrics-after.prom`
+- parsed counters inside `metrics.json`
+
+The CLI also attempts a local GPU snapshot with `amd-smi` first and `rocm-smi` second. If neither tool exists, `metrics.json` records `status: unavailable` instead of failing the run.
+
+## Work you can do before AMD GPU access
+
+While waiting on DigitalOcean/AMD support, prepare the non-GPU pieces:
+
+- DigitalOcean project and firewall policy
+- SSH keys and least-privilege access
+- non-GPU droplet for LocalStack/nullstate smoke tests
+- Docker installation and update policy
+- GitHub repository secrets/environment names
+- local `.env` file based on `.env.example`
+- sanitized screenshots of repo workflow, PR checks, and offline demo
+- LocalStack Azure token/access path if available
+
 ## Artifact review before publishing
 
 Check:
@@ -57,6 +86,8 @@ Check:
 - `runs//findings.json`
 - `runs//events.jsonl`
 - `runs//metrics.json`
+- `runs//vllm-metrics-before.prom`
+- `runs//vllm-metrics-after.prom`
 - `runs//remediation.patch`
 
 Do not publish secrets, real tenant IDs, real subscription IDs, private endpoints, or Terraform state.
diff --git a/src/nullstate/cli.py b/src/nullstate/cli.py
index 32c66c4..aaf9f10 100644
--- a/src/nullstate/cli.py
+++ b/src/nullstate/cli.py
@@ -2,6 +2,7 @@
 
+import os
 import shutil
 from pathlib import Path
 
 import typer
 from rich.console import Console
@@ -13,6 +14,7 @@
 from .attack import simulate_attack, write_attack_script
 from .demo import create_demo
 from .findings import find_public_blob_exposures
+from .metrics import collect_run_metrics
 from .remediation import remediate_terraform_files
 from .report import render_report
 from .sandbox import get_backend, list_backends, render_commands, run_commands
@@ -99,6 +101,12 @@ def run(
     findings = find_public_blob_exposures(plan)
     events.write("analysis", "Terraform plan analyzed", finding_count=len(findings))
     write_json(run_dir / "findings.json", [finding.to_dict() for finding in findings])
+    before_metrics = collect_run_metrics(
+        run_dir=run_dir,
+        base_url=os.getenv("NULLSTATE_LLM_BASE_URL"),
+        offline=offline,
+        stage="before",
+    )
     write_attack_script(run_dir / "attack.py")
 
     red_agent = LlmAgent("red", red_model)
@@ -119,10 +127,20 @@
     patch_result = remediate_terraform_files(workspace_dir)
     (run_dir / "remediation.patch").write_text(patch_result.diff, encoding="utf-8")
 
+    after_metrics = collect_run_metrics(
+        run_dir=run_dir,
+        base_url=os.getenv("NULLSTATE_LLM_BASE_URL"),
+        offline=offline,
+        stage="after",
+    )
     write_json(
         run_dir / "metrics.json",
         {
             "model_calls": [red_result.metrics.to_dict(), blue_result.metrics.to_dict()],
+            "endpoint": {
+                "before": before_metrics,
+                "after": after_metrics,
+            },
             "notes": (
                 "Token metrics come from OpenAI-compatible response usage when available. "
                 "Offline mock mode records zero token counts. User-authored prompts are not required; "
diff --git a/src/nullstate/metrics.py b/src/nullstate/metrics.py
index afb83e2..dd8530e 100644
--- a/src/nullstate/metrics.py
+++ b/src/nullstate/metrics.py
@@ -1,8 +1,14 @@
 from __future__ import annotations
 
 import re
+import shutil
+import subprocess
 from dataclasses import asdict, dataclass
+from pathlib import Path
 from typing import Any
+from urllib.parse import urlparse
+
+import requests
 
 
 @dataclass(frozen=True)
@@ -69,3 +75,72 @@ def parse_vllm_metrics(metrics_text: str) -> dict[str, float]:
         if metric_name in wanted:
             parsed[wanted[metric_name]] = parsed.get(wanted[metric_name], 0.0) + float(raw_value)
     return parsed
+
+
+def classify_endpoint(*, base_url: str | None, offline: bool) -> str:
+    if offline or not base_url:
+        return "offline"
+    host = urlparse(base_url).hostname or ""
+    if any(provider in host for provider in ("fireworks.ai", "together.ai", "openai.com", "anthropic.com")):
+        return "managed"
+    if host in {"localhost", "127.0.0.1", "::1"}:
+        return "self-hosted"
+    return "amd-gpu-hosted"
+
+
+def collect_run_metrics(*, run_dir: Path, base_url: str | None, offline: bool, stage: str) -> dict[str, Any]:
+    endpoint_type = classify_endpoint(base_url=base_url, offline=offline)
+    summary: dict[str, Any] = {
+        "endpoint_type": endpoint_type,
+        "base_url_host": _safe_host(base_url),
+        "vllm_metrics": {},
+        "vllm_metrics_artifact": None,
+        "gpu_snapshot": gpu_snapshot(),
+    }
+    if offline or not base_url:
+        return summary
+
+    metrics_url = base_url.rstrip("/") + "/metrics"
+    try:
+        response = requests.get(metrics_url, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException as error:
+        summary["vllm_metrics_error"] = str(error)
+        return summary
+
+    artifact = run_dir / f"vllm-metrics-{stage}.prom"
+    artifact.write_text(response.text, encoding="utf-8")
+    summary["vllm_metrics"] = parse_vllm_metrics(response.text)
+    summary["vllm_metrics_artifact"] = artifact.name
+    return summary
+
+
+def gpu_snapshot(command_runner=None) -> dict[str, Any]:
+    runner = command_runner or _run_gpu_command
+    attempted = ["amd-smi", "rocm-smi"]
+    for command in attempted:
+        result = runner(command)
+        if result is None:
+            continue
+        return {
+            "status": "available",
+            "command": command,
+            "stdout": result.stdout,
+            "stderr": result.stderr,
+            "returncode": result.returncode,
+        }
+    return {"status": "unavailable", "attempted": attempted}
+
+
+def _run_gpu_command(command: str) -> subprocess.CompletedProcess[str] | None:
+    executable = shutil.which(command)
+    if not executable:
+        return None
+    args = [executable, "static"] if command == "amd-smi" else [executable]
+    return subprocess.run(args, text=True, capture_output=True, check=False, timeout=15)
+
+
+def _safe_host(base_url: str | None) -> str | None:
+    if not base_url:
+        return None
+    return urlparse(base_url).hostname
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index abf59bb..afffa3f 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -1,6 +1,15 @@
 import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from unittest.mock import Mock, patch
 
-from nullstate.metrics import metrics_from_openai_response, parse_vllm_metrics
+from nullstate.metrics import (
+    classify_endpoint,
+    collect_run_metrics,
+    gpu_snapshot,
+    metrics_from_openai_response,
+    parse_vllm_metrics,
+)
 
 
 class MetricsTests(unittest.TestCase):
@@ -39,6 +48,38 @@ def test_parse_vllm_prometheus_metrics_extracts_key_counters(self):
         self.assertEqual(parsed["num_requests_running"], 3.0)
         self.assertEqual(parsed["gpu_cache_usage_perc"], 0.73)
 
+    def test_classifies_offline_managed_and_amd_endpoints(self):
+        self.assertEqual(classify_endpoint(base_url=None, offline=True), "offline")
+        self.assertEqual(classify_endpoint(base_url="https://api.fireworks.ai/inference/v1", offline=False), "managed")
+        self.assertEqual(classify_endpoint(base_url="http://localhost:8000", offline=False), "self-hosted")
+        self.assertEqual(classify_endpoint(base_url="http://10.10.0.5:8000", offline=False), "amd-gpu-hosted")
+
+    def test_collect_run_metrics_writes_vllm_snapshots(self):
+        with TemporaryDirectory() as raw_tmp:
+            run_dir = Path(raw_tmp)
+            response = Mock()
+            response.text = 'vllm:generation_tokens_total{model_name="demo"} 42.0\n'
+            response.raise_for_status.return_value = None
+
+            with patch("nullstate.metrics.requests.get", return_value=response):
+                summary = collect_run_metrics(
+                    run_dir=run_dir,
+                    base_url="http://10.10.0.5:8000",
+                    offline=False,
+                    stage="before",
+                )
+
+            self.assertEqual(summary["endpoint_type"], "amd-gpu-hosted")
+            self.assertEqual(summary["vllm_metrics"]["generation_tokens_total"], 42.0)
+            self.assertTrue((run_dir / "vllm-metrics-before.prom").exists())
+
+    def test_gpu_snapshot_reports_unavailable_without_gpu_tools(self):
+        snapshot = gpu_snapshot(command_runner=lambda command: None)
+
+        self.assertEqual(snapshot["status"], "unavailable")
+        self.assertIn("amd-smi", snapshot["attempted"])
+        self.assertIn("rocm-smi", snapshot["attempted"])
+
 
 if __name__ == "__main__":
     unittest.main()
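The changes above can be exercised outside a full run with a minimal sketch like the following; the run directory and base URL are hypothetical placeholders, and the call simply records a `vllm_metrics_error` in the summary if no vLLM endpoint is reachable.

```python
# Minimal local sanity check (illustrative, not part of the patch above).
# Assumes the nullstate package from this diff is installed; the scratch
# directory and endpoint URL below are hypothetical placeholders.
from pathlib import Path

from nullstate.metrics import collect_run_metrics, gpu_snapshot

run_dir = Path("runs/metrics-smoke")  # hypothetical scratch directory
run_dir.mkdir(parents=True, exist_ok=True)

summary = collect_run_metrics(
    run_dir=run_dir,
    base_url="http://localhost:8000",  # hypothetical vLLM endpoint
    offline=False,
    stage="before",
)

print(summary["endpoint_type"])           # "self-hosted" for localhost
print(summary.get("vllm_metrics_error"))  # set when /metrics is unreachable
print(summary["vllm_metrics"])            # parsed counters when the scrape works
print(gpu_snapshot()["status"])           # "available" or "unavailable"
```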