diff --git a/alphatrion/tracing/prometheus_exporter.py b/alphatrion/tracing/prometheus_exporter.py
index 020ef3e..a3b5cf6 100644
--- a/alphatrion/tracing/prometheus_exporter.py
+++ b/alphatrion/tracing/prometheus_exporter.py
@@ -68,35 +68,35 @@ def _init_metrics(self):
         self.llm_tokens_total = Counter(
             "llm_tokens_total",
             "Total LLM tokens consumed",
-            ["team_id", "user_id", "experiment_id", "model", "token_type"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model", "token_type"],
             registry=self.registry,
         )

         self.llm_input_tokens_total = Counter(
             "llm_input_tokens_total",
             "Total LLM input tokens consumed",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

         self.llm_output_tokens_total = Counter(
             "llm_output_tokens_total",
             "Total LLM output tokens consumed",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

         self.llm_cache_read_input_tokens_total = Counter(
             "llm_cache_read_input_tokens_total",
             "Total LLM cache read input tokens",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

         self.llm_cache_creation_input_tokens_total = Counter(
             "llm_cache_creation_input_tokens_total",
             "Total LLM cache creation input tokens",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

@@ -104,35 +104,35 @@ def _init_metrics(self):
         self.llm_cost_total = Counter(
             "llm_cost_total",
             "Total LLM cost in USD",
-            ["team_id", "user_id", "experiment_id", "model", "cost_type"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model", "cost_type"],
             registry=self.registry,
         )

         self.llm_input_cost_total = Counter(
             "llm_input_cost_total",
             "Total LLM input cost in USD",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

         self.llm_output_cost_total = Counter(
             "llm_output_cost_total",
             "Total LLM output cost in USD",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

         self.llm_cache_read_cost_total = Counter(
             "llm_cache_read_cost_total",
             "Total LLM cache read cost in USD",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

         self.llm_cache_creation_cost_total = Counter(
             "llm_cache_creation_cost_total",
             "Total LLM cache creation cost in USD",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             registry=self.registry,
         )

@@ -140,7 +140,7 @@ def _init_metrics(self):
         self.llm_requests_total = Counter(
             "llm_requests_total",
             "Total number of LLM requests",
-            ["team_id", "user_id", "experiment_id", "model", "status"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model", "status"],
             registry=self.registry,
         )

@@ -148,7 +148,7 @@ def _init_metrics(self):
         self.llm_request_duration_seconds = Histogram(
             "llm_request_duration_seconds",
             "LLM request duration in seconds",
-            ["team_id", "user_id", "experiment_id", "model"],
+            ["org_id", "team_id", "user_id", "experiment_id", "model"],
             buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0],
             registry=self.registry,
         )
@@ -189,6 +189,7 @@ def _process_span(self, span: ReadableSpan):
             return

         attributes = {k: str(v) for k, v in span.attributes.items()}
+        org_id = attributes.get("org_id", "unknown")
         team_id = attributes.get("team_id", "unknown")
         user_id = attributes.get("user_id", "unknown")
         experiment_id = attributes.get("experiment_id", "unknown")
@@ -205,7 +206,7 @@ def _process_span(self, span: ReadableSpan):
                 self.llm_errors_total.labels(error_type=error_type).inc()

             self._process_llm_span(
-                span, attributes, team_id, user_id, experiment_id, duration, status
+                span, attributes, org_id, team_id, user_id, experiment_id, duration, status
             )

         except Exception as e:
@@ -236,6 +237,7 @@ def _process_llm_span(
         self,
         span: ReadableSpan,
         attributes: dict[str, str],
+        org_id: str,
         team_id: str,
         user_id: str,
         experiment_id: str,
@@ -260,6 +262,7 @@ def _process_llm_span(

         if total_tokens > 0:
             self.llm_tokens_total.labels(
+                org_id=org_id,
                 team_id=team_id,
                 user_id=user_id,
                 experiment_id=experiment_id,
@@ -269,9 +272,10 @@ def _process_llm_span(

         if input_tokens > 0:
             self.llm_input_tokens_total.labels(
-                team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
             ).inc(input_tokens)
             self.llm_tokens_total.labels(
+                org_id=org_id,
                 team_id=team_id,
                 user_id=user_id,
                 experiment_id=experiment_id,
@@ -281,9 +285,10 @@ def _process_llm_span(

         if output_tokens > 0:
             self.llm_output_tokens_total.labels(
-                team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
             ).inc(output_tokens)
             self.llm_tokens_total.labels(
+                org_id=org_id,
                 team_id=team_id,
                 user_id=user_id,
                 experiment_id=experiment_id,
@@ -293,9 +298,10 @@ def _process_llm_span(

         if cache_read_input_tokens > 0:
             self.llm_cache_read_input_tokens_total.labels(
-                team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
             ).inc(cache_read_input_tokens)
             self.llm_tokens_total.labels(
+                org_id=org_id,
                 team_id=team_id,
                 user_id=user_id,
                 experiment_id=experiment_id,
@@ -305,9 +311,10 @@ def _process_llm_span(

         if cache_creation_input_tokens > 0:
             self.llm_cache_creation_input_tokens_total.labels(
-                team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
             ).inc(cache_creation_input_tokens)
             self.llm_tokens_total.labels(
+                org_id=org_id,
                 team_id=team_id,
                 user_id=user_id,
                 experiment_id=experiment_id,
@@ -329,6 +336,7 @@ def _process_llm_span(

             if total_cost > 0:
                 self.llm_cost_total.labels(
+                    org_id=org_id,
                     team_id=team_id,
                     user_id=user_id,
                     experiment_id=experiment_id,
@@ -338,22 +346,22 @@ def _process_llm_span(

             if input_cost > 0:
                 self.llm_input_cost_total.labels(
-                    team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                    org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
                 ).inc(input_cost)

             if output_cost > 0:
                 self.llm_output_cost_total.labels(
-                    team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                    org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
                 ).inc(output_cost)

             if cache_read_cost > 0:
                 self.llm_cache_read_cost_total.labels(
-                    team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                    org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
                 ).inc(cache_read_cost)

             if cache_creation_cost > 0:
                 self.llm_cache_creation_cost_total.labels(
-                    team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+                    org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
                 ).inc(cache_creation_cost)

         except (ValueError, TypeError) as e:
@@ -361,12 +369,12 @@ def _process_llm_span(

         # Request count
         self.llm_requests_total.labels(
-            team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model, status=status
+            org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model, status=status
         ).inc()

         # Duration
         self.llm_request_duration_seconds.labels(
-            team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
+            org_id=org_id, team_id=team_id, user_id=user_id, experiment_id=experiment_id, model=model
         ).observe(duration)

     def _push_metrics(self):
diff --git a/docs/prometheus-integration.md b/docs/prometheus-integration.md
index 73e412f..e885dbc 100644
--- a/docs/prometheus-integration.md
+++ b/docs/prometheus-integration.md
@@ -81,44 +81,44 @@ async with experiment.CraftExperiment.start(name="my_experiment") as exp:
 ### LLM Token Metrics

 - **`llm_tokens_total`** - Total LLM tokens consumed
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`, `token_type` (input/output/cache_read_input/cache_creation_input/total)
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`, `token_type` (input/output/cache_read_input/cache_creation_input/total)

 - **`llm_input_tokens_total`** - Total input tokens
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 - **`llm_output_tokens_total`** - Total output tokens
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 - **`llm_cache_read_input_tokens_total`** - Total cache read input tokens
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 - **`llm_cache_creation_input_tokens_total`** - Total cache creation input tokens
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 ### LLM Cost Metrics (USD)

 - **`llm_cost_total`** - Total LLM cost in USD
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`, `cost_type` (total)
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`, `cost_type` (total)

 - **`llm_input_cost_total`** - Total input token cost in USD
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 - **`llm_output_cost_total`** - Total output token cost in USD
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 - **`llm_cache_read_cost_total`** - Total cache read cost in USD
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 - **`llm_cache_creation_cost_total`** - Total cache creation cost in USD
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`

 ### LLM Request Metrics

 - **`llm_requests_total`** - Total number of LLM requests
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`, `status`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`, `status`

 - **`llm_request_duration_seconds`** - LLM request duration histogram
-  - Labels: `team_id`, `user_id`, `experiment_id`, `model`
+  - Labels: `org_id`, `team_id`, `user_id`, `experiment_id`, `model`
   - Buckets: 0.1s, 0.5s, 1s, 2s, 5s, 10s, 30s, 60s

 ### Error Tracking
@@ -228,6 +228,18 @@ The platform dashboard provides a comprehensive view combining operational healt
 Create your own panels with queries like:

 ```promql
+# Cost by organization
+sum by (org_id) (llm_cost_total{cost_type="total"})
+
+# Cost by team within org
+sum by (org_id, team_id) (llm_cost_total{cost_type="total"})
+
+# Top 10 users by cost
+topk(10, sum by (user_id) (llm_cost_total{cost_type="total"}))
+
+# Specific user's cost
+sum(llm_cost_total{user_id="user123", cost_type="total"})
+
 # Token usage by experiment
 sum by (experiment_id) (llm_tokens_total{token_type="total"})

@@ -238,7 +250,7 @@ sum by (experiment_id) (llm_cost_total{cost_type="total"})
 rate(llm_requests_total{team_id="YOUR_TEAM_ID"}[5m])

 # Average latency
-rate(llm_duration_seconds_sum[5m]) / rate(llm_duration_seconds_count[5m])
+rate(llm_request_duration_seconds_sum[5m]) / rate(llm_request_duration_seconds_count[5m])

 # Success rate
 sum(rate(llm_requests_total{status="OK"}[5m])) / sum(rate(llm_requests_total[5m]))
@@ -258,6 +270,9 @@ sum by (team_id) (llm_errors_total)
 # Count unique experiments (derived metric)
 count(sum by (experiment_id) (llm_requests_total))

+# Per-user cost within a specific org
+sum by (user_id) (llm_cost_total{org_id="org123", cost_type="total"})
+
 # Count unique teams (derived metric)
 count(sum by (team_id) (llm_requests_total))

@@ -306,17 +321,22 @@ tracer_provider.add_span_processor(BatchSpanProcessor(prometheus_exporter))
 The implementation balances observability with Prometheus performance. Metrics are aggregated by:

-- `team_id` - Organization/team level (low cardinality)
+- `org_id` - Organization level (very low cardinality)
+- `team_id` - Team level within org (low cardinality)
 - `user_id` - User level for per-user cost tracking (medium cardinality)
 - `experiment_id` - Experiment level (medium-high cardinality)
 - `model` - AI model being used (low cardinality)
 - Other minimal dimensions (`status`, `token_type`)

 **Cardinality Considerations:**
-- `user_id` is included to enable per-user cost tracking and billing
-- In high-user environments (1000+ users), consider aggregating costs by team in Prometheus and using ClickHouse for detailed per-user breakdowns
+- `org_id` enables multi-tenant deployments and per-organization billing
+- `team_id` allows tracking within organizations
+- `user_id` enables per-user cost tracking and billing within teams
+- In high-user environments (1000+ users per org), consider aggregating costs by team/org in Prometheus and using ClickHouse for detailed per-user breakdowns
 - Labels like `run_id`, `span_kind`, and `semantic_kind` are intentionally excluded

+**Label Hierarchy:** `org_id` > `team_id` > `user_id` > `experiment_id`
+
 For detailed trace analysis and span classification, use the ClickHouse trace store which is optimized for high-cardinality data.

 ## Troubleshooting
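
As context for how the new `org_id` label gets populated: `_process_span` reads it from span attributes and falls back to `"unknown"`. Below is a minimal, hypothetical call-site sketch; only the four attribute keys are taken from the patch, while the tracer wiring (per docs/prometheus-integration.md), span name, and attribute values are illustrative assumptions.

```python
# Hypothetical call-site sketch. The attribute keys ("org_id", "team_id",
# "user_id", "experiment_id") are exactly what _process_span reads; the
# span name and values are illustrative, not part of the patch.
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("llm_call") as span:
    span.set_attribute("org_id", "org123")         # new label in this patch
    span.set_attribute("team_id", "team-a")
    span.set_attribute("user_id", "user123")
    span.set_attribute("experiment_id", "exp-42")
    # ... perform the LLM request here; any identity key left unset is
    # exported with the label value "unknown".
```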
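One behavioral note behind the mechanical-looking churn: prometheus_client requires every declared label on every `.labels()` call, which is why the patch threads `org_id` through each call site instead of treating it as optional. A standalone sketch of that contract, with the metric and label names copied from the patch and the values illustrative:

```python
# Standalone demo of the prometheus_client label contract this patch relies
# on; names mirror the patch, values are illustrative.
from prometheus_client import CollectorRegistry, Counter, generate_latest

registry = CollectorRegistry()
tokens = Counter(
    "llm_tokens_total",
    "Total LLM tokens consumed",
    ["org_id", "team_id", "user_id", "experiment_id", "model", "token_type"],
    registry=registry,
)

# Every declared label must be supplied on each call; omitting org_id here
# would raise ValueError, hence the exporter's "unknown" fallback.
tokens.labels(
    org_id="unknown",
    team_id="team-a",
    user_id="user123",
    experiment_id="exp-42",
    model="gpt-4o",
    token_type="total",
).inc(128)

print(generate_latest(registry).decode())
```

Since each distinct label set is a separate time series, deploying this change starts fresh series carrying the `org_id` dimension; existing aggregations such as `sum by (team_id) (...)` keep working because they simply ignore the extra label.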