From fac92199bd4ae7af3c08ae1dfae0584b513f5c49 Mon Sep 17 00:00:00 2001 From: Makisuo Date: Wed, 20 May 2026 12:34:11 +0200 Subject: [PATCH 1/3] init --- apps/api/src/routes/query-engine.http.ts | 21 ++ .../src/services/ServiceMapRollupService.ts | 44 ++- .../api/tinybird/service-external-edges.ts | 111 ++++++ .../components/services/dependency-table.tsx | 337 ++++++++++++++++++ .../services/dependency-type-badge.tsx | 75 ++++ .../services/service-dependencies-tab.tsx | 295 +++++++++++++++ .../services/atoms/tinybird-query-atoms.ts | 5 + apps/web/src/routes/services/$serviceName.tsx | 107 ++++-- .../domain/src/generated/clickhouse-schema.ts | 5 +- packages/domain/src/http/query-engine.ts | 22 ++ packages/domain/src/tinybird/datasources.ts | 102 ++++++ .../domain/src/tinybird/materializations.ts | 78 ++++ packages/query-engine/src/ch/expr.ts | 2 + .../src/ch/functions/date-time.ts | 21 ++ .../query-engine/src/ch/functions/index.ts | 9 +- packages/query-engine/src/ch/index.ts | 5 + .../src/ch/queries/service-map-rollup.ts | 129 ++++++- .../src/ch/queries/service-map.test.ts | 143 ++++++++ .../src/ch/queries/service-map.ts | 221 ++++++++++-- 19 files changed, 1670 insertions(+), 62 deletions(-) create mode 100644 apps/web/src/api/tinybird/service-external-edges.ts create mode 100644 apps/web/src/components/services/dependency-table.tsx create mode 100644 apps/web/src/components/services/dependency-type-badge.tsx create mode 100644 apps/web/src/components/services/service-dependencies-tab.tsx create mode 100644 packages/query-engine/src/ch/queries/service-map.test.ts diff --git a/apps/api/src/routes/query-engine.http.ts b/apps/api/src/routes/query-engine.http.ts index 3a036cfc..4b3104d5 100644 --- a/apps/api/src/routes/query-engine.http.ts +++ b/apps/api/src/routes/query-engine.http.ts @@ -21,6 +21,7 @@ import { ServiceReleasesResponse, ServiceDependenciesResponse, ServiceDbEdgesResponse, + ServiceExternalEdgesResponse, ServicePlatformsResponse, ServiceWorkloadsResponse, ServiceUsageResponse, @@ -436,6 +437,26 @@ export const HttpQueryEngineLive = HttpApiBuilder.group(MapleApi, "queryEngine", return new ServiceDbEdgesResponse({ data: compiled.castRows(rows) as any[] }) }), ) + .handle("serviceExternalEdges", ({ payload }) => + Effect.gen(function* () { + const tenant = yield* CurrentTenant.Context + const compiled = CH.serviceExternalEdgesSQL( + { + deploymentEnv: payload.deploymentEnv, + serviceName: payload.serviceName, + }, + { orgId: tenant.orgId, startTime: payload.startTime, endTime: payload.endTime }, + ) + const rows = yield* mapExecError( + warehouse.sqlQuery(tenant, compiled.sql, { + profile: "aggregation", + context: "serviceExternalEdges", + }), + "serviceExternalEdges query failed", + ) + return new ServiceExternalEdgesResponse({ data: compiled.castRows(rows) as any[] }) + }), + ) .handle("servicePlatforms", ({ payload }) => Effect.gen(function* () { const tenant = yield* CurrentTenant.Context diff --git a/apps/api/src/services/ServiceMapRollupService.ts b/apps/api/src/services/ServiceMapRollupService.ts index 4eaee194..42d6537e 100644 --- a/apps/api/src/services/ServiceMapRollupService.ts +++ b/apps/api/src/services/ServiceMapRollupService.ts @@ -27,6 +27,7 @@ export interface ServiceMapRollupResult { readonly orgsProcessed: number readonly hoursRolledUp: number readonly edgesWritten: number + readonly resolutionsWritten: number readonly orgFailures: number } @@ -92,12 +93,12 @@ export class ServiceMapRollupService extends Context.Service< let hoursRolledUp = 0 let edgesWritten = 0 + let resolutionsWritten = 0 for (const hourMs of missing) { - const rollup = CH.serviceMapEdgesRollupSQL({ - orgId, - hourStart: toTinybirdDateTime(hourMs), - hourEnd: toTinybirdDateTime(hourMs + HOUR_MS), - }) + const hourStart = toTinybirdDateTime(hourMs) + const hourEnd = toTinybirdDateTime(hourMs + HOUR_MS) + + const rollup = CH.serviceMapEdgesRollupSQL({ orgId, hourStart, hourEnd }) const raw = yield* warehouse.sqlQuery(tenant, rollup.sql, { context: "serviceMapRollup", }) @@ -106,9 +107,32 @@ export class ServiceMapRollupService extends Context.Service< yield* warehouse.ingest(tenant, "service_map_edges_hourly", rows) edgesWritten += rows.length } + + // Companion write: address-resolutions, used by the external-edges + // query's anti-join to suppress internal-service overlap. Separate + // SQL pass (~same cost as the edges JOIN) so the existing rollup + // query keeps its tight shape; failure of one ingest doesn't + // invalidate the other (per-org Effect failure already isolated). + const resolutionsRollup = CH.serviceMapResolutionsRollupSQL({ + orgId, + hourStart, + hourEnd, + }) + const resolutionsRaw = yield* warehouse.sqlQuery(tenant, resolutionsRollup.sql, { + context: "serviceMapResolutionsRollup", + }) + const resolutionsRows = resolutionsRollup.castRows(resolutionsRaw) + if (resolutionsRows.length > 0) { + yield* warehouse.ingest( + tenant, + "service_address_resolutions_hourly", + resolutionsRows, + ) + resolutionsWritten += resolutionsRows.length + } hoursRolledUp += 1 } - return { hoursRolledUp, edgesWritten, failed: false } + return { hoursRolledUp, edgesWritten, resolutionsWritten, failed: false } }) const runRollupTick: ServiceMapRollupServiceShape["runRollupTick"] = Effect.fn( @@ -130,7 +154,12 @@ export class ServiceMapRollupService extends Context.Service< error: Cause.pretty(cause), }), ), - { hoursRolledUp: 0, edgesWritten: 0, failed: true }, + { + hoursRolledUp: 0, + edgesWritten: 0, + resolutionsWritten: 0, + failed: true, + }, ), ), ), @@ -141,6 +170,7 @@ export class ServiceMapRollupService extends Context.Service< orgsProcessed: orgRows.length, hoursRolledUp: results.reduce((sum, r) => sum + r.hoursRolledUp, 0), edgesWritten: results.reduce((sum, r) => sum + r.edgesWritten, 0), + resolutionsWritten: results.reduce((sum, r) => sum + r.resolutionsWritten, 0), orgFailures: results.filter((r) => r.failed).length, } }) diff --git a/apps/web/src/api/tinybird/service-external-edges.ts b/apps/web/src/api/tinybird/service-external-edges.ts new file mode 100644 index 00000000..67164316 --- /dev/null +++ b/apps/web/src/api/tinybird/service-external-edges.ts @@ -0,0 +1,111 @@ +import { Effect, Schema } from "effect" +import { ServiceExternalEdgesRequest } from "@maple/domain/http" +import { MapleApiAtomClient } from "@/lib/services/common/atom-client" +import { summarizeSampling } from "@/lib/sampling" +import { TinybirdDateTimeString, decodeInput, runTinybirdQuery } from "@/api/tinybird/effect-utils" + +export type ServiceExternalTargetType = "http" | "messaging" | "rpc" + +export interface ServiceExternalEdge { + sourceService: string + targetType: ServiceExternalTargetType + targetSystem: string + targetName: string + callCount: number + estimatedCallCount: number + errorCount: number + errorRate: number + avgDurationMs: number + p95DurationMs: number + hasSampling: boolean + samplingWeight: number +} + +export interface ServiceExternalEdgesResponse { + edges: ServiceExternalEdge[] +} + +const GetServiceExternalEdgesInputSchema = Schema.Struct({ + serviceName: Schema.String, + startTime: Schema.optional(TinybirdDateTimeString), + endTime: Schema.optional(TinybirdDateTimeString), + deploymentEnv: Schema.optional(Schema.String), +}) + +export type GetServiceExternalEdgesInput = Schema.Schema.Type + +const defaultTimeRange = () => { + const now = new Date() + const dayAgo = new Date(now.getTime() - 24 * 60 * 60 * 1000) + const fmt = (d: Date) => d.toISOString().replace("T", " ").slice(0, 19) + return { startTime: fmt(dayAgo), endTime: fmt(now) } +} + +const knownTargetTypes: ReadonlySet = new Set([ + "http", + "messaging", + "rpc", +]) + +function coerceTargetType(value: unknown): ServiceExternalTargetType { + return knownTargetTypes.has(value as ServiceExternalTargetType) + ? (value as ServiceExternalTargetType) + : "http" +} + +function transformEdge(row: Record, durationSeconds: number): ServiceExternalEdge { + const callCount = Number(row.callCount ?? 0) + const errorCount = Number(row.errorCount ?? 0) + const estimatedSpanCount = Number(row.estimatedSpanCount ?? 0) + const sampling = summarizeSampling(estimatedSpanCount, callCount, durationSeconds) + const estimatedCallCount = sampling.hasSampling ? Math.round(estimatedSpanCount) : callCount + return { + sourceService: String(row.sourceService ?? ""), + targetType: coerceTargetType(row.targetType), + targetSystem: String(row.targetSystem ?? ""), + targetName: String(row.targetName ?? ""), + callCount, + estimatedCallCount, + errorCount, + errorRate: callCount > 0 ? errorCount / callCount : 0, + avgDurationMs: Number(row.avgDurationMs ?? 0), + p95DurationMs: Number(row.p95DurationMs ?? 0), + hasSampling: sampling.hasSampling, + samplingWeight: sampling.weight, + } +} + +export const getServiceExternalEdges = Effect.fn("QueryEngine.getServiceExternalEdges")(function* ({ + data, +}: { + data: GetServiceExternalEdgesInput +}) { + const input = yield* decodeInput( + GetServiceExternalEdgesInputSchema, + data ?? {}, + "getServiceExternalEdges", + ) + const fallback = defaultTimeRange() + + const result = yield* runTinybirdQuery("serviceExternalEdges", () => + Effect.gen(function* () { + const client = yield* MapleApiAtomClient + return yield* client.queryEngine.serviceExternalEdges({ + payload: new ServiceExternalEdgesRequest({ + serviceName: input.serviceName, + startTime: input.startTime ?? fallback.startTime, + endTime: input.endTime ?? fallback.endTime, + deploymentEnv: input.deploymentEnv, + }), + }) + }), + ) + + const startMs = input.startTime ? new Date(input.startTime.replace(" ", "T") + "Z").getTime() : 0 + const endMs = input.endTime ? new Date(input.endTime.replace(" ", "T") + "Z").getTime() : 0 + const durationSeconds = startMs > 0 && endMs > 0 ? Math.max((endMs - startMs) / 1000, 1) : 3600 + + return { + edges: result.data.map((row) => transformEdge(row, durationSeconds)), + } +}) diff --git a/apps/web/src/components/services/dependency-table.tsx b/apps/web/src/components/services/dependency-table.tsx new file mode 100644 index 00000000..1d3c96ab --- /dev/null +++ b/apps/web/src/components/services/dependency-table.tsx @@ -0,0 +1,337 @@ +import { useMemo, useState } from "react" +import { useNavigate } from "@tanstack/react-router" +import { cn } from "@maple/ui/utils" +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, +} from "@maple/ui/components/ui/table" +import { Tooltip, TooltipContent, TooltipTrigger } from "@maple/ui/components/ui/tooltip" +import { ChevronDownIcon, ChevronUpIcon, ChevronExpandYIcon } from "@/components/icons" +import { formatLatency } from "@/lib/format" +import { DependencyTypeBadge, type DependencyKind } from "./dependency-type-badge" + +export interface DependencyRow { + id: string + kind: DependencyKind + name: string + subtitle?: string + callsPerSec: number + tracedCallsPerSec: number + totalCalls: number + estimatedCalls: number + errorRate: number + avgDurationMs: number + p95DurationMs: number + hasSampling: boolean + samplingWeight: number + whereClause: string +} + +interface DependencyTableProps { + serviceName: string + rows: DependencyRow[] + startTime?: string + endTime?: string + timePreset?: string +} + +type SortKey = "calls" | "errorRate" | "p95" +type SortDir = "asc" | "desc" + +function formatRate(value: number): string { + if (value >= 1000) return `${(value / 1000).toFixed(1)}k` + if (value >= 1) return value.toFixed(1) + return value.toFixed(2) +} + +function formatErrorRate(rate: number): string { + if (rate >= 0.01) return `${(rate * 100).toFixed(1)}%` + if (rate > 0) return "<1%" + return "0%" +} + +function errorTone(rate: number): "error" | "warn" | "default" { + if (rate > 0.05) return "error" + if (rate > 0.01) return "warn" + return "default" +} + +export function DependencyTable({ + serviceName, + rows, + startTime, + endTime, + timePreset, +}: DependencyTableProps) { + const navigate = useNavigate() + const [sortKey, setSortKey] = useState("calls") + const [sortDir, setSortDir] = useState("desc") + + // Column-relative maxima drive the inline bars. Calls + p95 read as "more is + // more"; error rate as "any value is a problem", so its bar always tints red + // with intensity scaled to severity (0..5%+). + const maxima = useMemo(() => { + return rows.reduce( + (acc, row) => ({ + calls: Math.max(acc.calls, row.callsPerSec), + p95: Math.max(acc.p95, row.p95DurationMs), + }), + { calls: 0, p95: 0 }, + ) + }, [rows]) + + const sorted = useMemo(() => { + const out = [...rows] + out.sort((a, b) => { + const aV = + sortKey === "calls" + ? a.callsPerSec + : sortKey === "errorRate" + ? a.errorRate + : a.p95DurationMs + const bV = + sortKey === "calls" + ? b.callsPerSec + : sortKey === "errorRate" + ? b.errorRate + : b.p95DurationMs + return sortDir === "desc" ? bV - aV : aV - bV + }) + return out + }, [rows, sortKey, sortDir]) + + const toggleSort = (key: SortKey) => { + if (key === sortKey) { + setSortDir(sortDir === "desc" ? "asc" : "desc") + } else { + setSortKey(key) + setSortDir("desc") + } + } + + const handleRowClick = (row: DependencyRow) => { + navigate({ + to: "/traces", + search: { + services: [serviceName], + whereClause: row.whereClause, + startTime, + endTime, + timePreset, + }, + }) + } + + return ( +
+ + + + + Target + + toggleSort("calls")} + /> + toggleSort("errorRate")} + /> + + Avg + + toggleSort("p95")} + /> + + + + {sorted.length === 0 ? ( + + + No downstream dependencies in this window. + + + ) : ( + sorted.map((row) => { + const tone = errorTone(row.errorRate) + return ( + handleRowClick(row)} + className="cursor-pointer group/row border-b last:border-b-0 hover:bg-muted/40" + > + +
+ +
+ + {row.name} + + {row.subtitle ? ( + + {row.subtitle} + + ) : null} +
+
+
+ + {row.hasSampling ? ( + + } + className="cursor-help tabular-nums font-mono text-[12.5px] text-foreground" + > + ~{formatRate(row.callsPerSec)} + + + Estimated ×{row.samplingWeight.toFixed(0)} from{" "} + {formatRate(row.tracedCallsPerSec)} traced req/s + + + ) : ( + + {formatRate(row.callsPerSec)} + + )} + + 0 ? row.errorRate : 0} + // Errors get a fixed "severity scale" (5% = full bar) rather + // than column-relative, so a 0.2% sliver looks small even + // when it happens to be the worst in the table. + max={0.05} + tone="errors" + align="right" + > + + {formatErrorRate(row.errorRate)} + + + + + {formatLatency(row.avgDurationMs)} + + + + + {formatLatency(row.p95DurationMs)} + + +
+ ) + }) + )} +
+
+
+ ) +} + +interface BarCellProps { + value: number + max: number + tone: "calls" | "errors" | "latency" + align: "left" | "right" + children: React.ReactNode +} + +/** + * A numeric cell with a column-tinted bar overlay behind the value. The bar's + * width is normalized against the column max (or, for errors, a fixed severity + * ceiling) so distribution is legible at a glance — replaces the standalone + * bar-list cards the previous design had stacked beside the table. + * + * Bars sit at very low opacity behind the text; group hover lifts them so they + * become a clear hover affordance without screaming for attention at rest. + */ +function BarCell({ value, max, tone, align, children }: BarCellProps) { + const pct = max > 0 ? Math.min((value / max) * 100, 100) : 0 + const hasBar = pct > 0 + return ( + + {hasBar ? ( +
+ ) : null} + {children} + + ) +} + +interface SortableHeadProps { + label: string + align?: "left" | "right" + active: boolean + dir: SortDir + onClick: () => void +} + +function SortableHead({ label, align = "left", active, dir, onClick }: SortableHeadProps) { + const Icon = active ? (dir === "desc" ? ChevronDownIcon : ChevronUpIcon) : ChevronExpandYIcon + return ( + + + {label} + + + + ) +} diff --git a/apps/web/src/components/services/dependency-type-badge.tsx b/apps/web/src/components/services/dependency-type-badge.tsx new file mode 100644 index 00000000..fccbd062 --- /dev/null +++ b/apps/web/src/components/services/dependency-type-badge.tsx @@ -0,0 +1,75 @@ +import { cn } from "@maple/ui/utils" +import { + DatabaseIcon, + GlobeIcon, + NetworkNodesIcon, + PaperPlaneIcon, + ServerIcon, +} from "@/components/icons" + +/** + * Visual identity for one downstream-dependency category surfaced on the + * service-detail Dependencies tab. + * + * - service → another internal service (from `serviceDependencies`) + * - database → DB target (from `serviceDbEdges`) + * - http → external HTTP host (from `serviceExternalEdges`) + * - messaging → message queue (from `serviceExternalEdges`) + * - rpc → RPC target (from `serviceExternalEdges`) + */ +export type DependencyKind = "service" | "database" | "http" | "messaging" | "rpc" + +interface DependencyTypeBadgeProps { + kind: DependencyKind + className?: string +} + +const labels: Record = { + service: "Service", + database: "Database", + http: "HTTP", + messaging: "Queue", + rpc: "RPC", +} + +// Token-based palette so the badge tracks the user's theme. Each tone pairs a +// soft tinted background with a muted foreground; consistent visual weight so +// no single category dominates the column read. +const tones: Record = { + service: "bg-severity-info/10 text-severity-info border-severity-info/20", + database: "bg-amber-500/10 text-amber-600 border-amber-500/20 dark:text-amber-300", + http: "bg-foreground/5 text-muted-foreground border-border", + messaging: "bg-violet-500/10 text-violet-600 border-violet-500/20 dark:text-violet-300", + rpc: "bg-cyan-500/10 text-cyan-600 border-cyan-500/20 dark:text-cyan-300", +} + +function getIcon(kind: DependencyKind) { + switch (kind) { + case "service": + return ServerIcon + case "database": + return DatabaseIcon + case "http": + return GlobeIcon + case "messaging": + return PaperPlaneIcon + case "rpc": + return NetworkNodesIcon + } +} + +export function DependencyTypeBadge({ kind, className }: DependencyTypeBadgeProps) { + const Icon = getIcon(kind) + return ( + + + {labels[kind]} + + ) +} diff --git a/apps/web/src/components/services/service-dependencies-tab.tsx b/apps/web/src/components/services/service-dependencies-tab.tsx new file mode 100644 index 00000000..30c90e4a --- /dev/null +++ b/apps/web/src/components/services/service-dependencies-tab.tsx @@ -0,0 +1,295 @@ +import { useMemo } from "react" +import { cn } from "@maple/ui/utils" +import { Result } from "@/lib/effect-atom" +import { useRetainedRefreshableResultValue } from "@/hooks/use-retained-refreshable-result-value" +import { + getServiceMapResultAtom, + getServiceMapDbEdgesResultAtom, + getServiceExternalEdgesResultAtom, +} from "@/lib/services/atoms/tinybird-query-atoms" +import { formatLatency } from "@/lib/format" +import { DependencyTable, type DependencyRow } from "./dependency-table" +import type { DependencyKind } from "./dependency-type-badge" + +interface ServiceDependenciesTabProps { + serviceName: string + startTime?: string + endTime?: string + timePreset?: string + effectiveStartTime: string + effectiveEndTime: string +} + +interface RawEdge { + sourceService?: string + targetService?: string + dbSystem?: string + targetType?: "http" | "messaging" | "rpc" + targetSystem?: string + targetName?: string + callCount?: number + estimatedCallCount?: number + errorRate?: number + avgDurationMs?: number + p95DurationMs?: number + hasSampling?: boolean + samplingWeight?: number +} + +function formatRate(value: number): string { + if (value >= 1000) return `${(value / 1000).toFixed(1)}k` + if (value >= 1) return value.toFixed(1) + return value.toFixed(2) +} + +function formatErrorRate(rate: number): string { + if (rate >= 0.01) return `${(rate * 100).toFixed(1)}%` + if (rate > 0) return "<1%" + return "0%" +} + +function escapeForWhereClause(value: string): string { + return value.replace(/'/g, "\\'") +} + +export function ServiceDependenciesTab({ + serviceName, + startTime, + endTime, + timePreset, + effectiveStartTime, + effectiveEndTime, +}: ServiceDependenciesTabProps) { + const servicesResult = useRetainedRefreshableResultValue( + getServiceMapResultAtom({ + data: { startTime: effectiveStartTime, endTime: effectiveEndTime }, + }), + ) + const dbsResult = useRetainedRefreshableResultValue( + getServiceMapDbEdgesResultAtom({ + data: { startTime: effectiveStartTime, endTime: effectiveEndTime }, + }), + ) + const externalResult = useRetainedRefreshableResultValue( + getServiceExternalEdgesResultAtom({ + data: { + serviceName, + startTime: effectiveStartTime, + endTime: effectiveEndTime, + }, + }), + ) + + const durationSeconds = useMemo(() => { + const s = new Date(effectiveStartTime.replace(" ", "T") + "Z").getTime() + const e = new Date(effectiveEndTime.replace(" ", "T") + "Z").getTime() + return s > 0 && e > 0 ? Math.max((e - s) / 1000, 1) : 3600 + }, [effectiveStartTime, effectiveEndTime]) + + const rows = useMemo(() => { + const out: DependencyRow[] = [] + + const serviceEdges = Result.builder(servicesResult) + .onSuccess((r) => r.edges as RawEdge[]) + .orElse(() => [] as RawEdge[]) + const dbEdges = Result.builder(dbsResult) + .onSuccess((r) => r.edges as RawEdge[]) + .orElse(() => [] as RawEdge[]) + const externalEdges = Result.builder(externalResult) + .onSuccess((r) => r.edges as RawEdge[]) + .orElse(() => [] as RawEdge[]) + + for (const edge of serviceEdges) { + if (edge.sourceService !== serviceName || !edge.targetService) continue + const callCount = Number(edge.callCount ?? 0) + const estimated = Number(edge.estimatedCallCount ?? callCount) + const target = String(edge.targetService) + out.push({ + id: `service:${target}`, + kind: "service", + name: target, + callsPerSec: estimated / durationSeconds, + tracedCallsPerSec: callCount / durationSeconds, + totalCalls: callCount, + estimatedCalls: estimated, + errorRate: Number(edge.errorRate ?? 0), + avgDurationMs: Number(edge.avgDurationMs ?? 0), + p95DurationMs: Number(edge.p95DurationMs ?? 0), + hasSampling: Boolean(edge.hasSampling), + samplingWeight: Number(edge.samplingWeight ?? 1), + whereClause: `SpanKind = 'Client' AND server.address ILIKE '%${escapeForWhereClause(target)}%'`, + }) + } + + for (const edge of dbEdges) { + if (edge.sourceService !== serviceName || !edge.dbSystem) continue + const callCount = Number(edge.callCount ?? 0) + const estimated = Number(edge.estimatedCallCount ?? callCount) + const target = String(edge.dbSystem) + out.push({ + id: `database:${target}`, + kind: "database", + name: target, + callsPerSec: estimated / durationSeconds, + tracedCallsPerSec: callCount / durationSeconds, + totalCalls: callCount, + estimatedCalls: estimated, + errorRate: Number(edge.errorRate ?? 0), + avgDurationMs: Number(edge.avgDurationMs ?? 0), + p95DurationMs: Number(edge.p95DurationMs ?? 0), + hasSampling: Boolean(edge.hasSampling), + samplingWeight: Number(edge.samplingWeight ?? 1), + whereClause: `SpanKind = 'Client' AND db.system.name = '${escapeForWhereClause(target)}'`, + }) + } + + for (const edge of externalEdges) { + const target = String(edge.targetName ?? "") + if (!target) continue + const kind: DependencyKind = + edge.targetType === "messaging" + ? "messaging" + : edge.targetType === "rpc" + ? "rpc" + : "http" + const callCount = Number(edge.callCount ?? 0) + const estimated = Number(edge.estimatedCallCount ?? callCount) + const system = edge.targetSystem ? String(edge.targetSystem) : "" + const whereClause = + kind === "messaging" + ? `SpanKind = 'Producer' AND messaging.destination = '${escapeForWhereClause(target)}'` + : kind === "rpc" + ? `SpanKind = 'Client' AND rpc.service = '${escapeForWhereClause(target)}'` + : `SpanKind = 'Client' AND (server.address = '${escapeForWhereClause(target)}' OR http.host = '${escapeForWhereClause(target)}')` + + out.push({ + id: `${kind}:${target}`, + kind, + name: target, + subtitle: system || undefined, + callsPerSec: estimated / durationSeconds, + tracedCallsPerSec: callCount / durationSeconds, + totalCalls: callCount, + estimatedCalls: estimated, + errorRate: Number(edge.errorRate ?? 0), + avgDurationMs: Number(edge.avgDurationMs ?? 0), + p95DurationMs: Number(edge.p95DurationMs ?? 0), + hasSampling: Boolean(edge.hasSampling), + samplingWeight: Number(edge.samplingWeight ?? 1), + whereClause, + }) + } + + return out + }, [servicesResult, dbsResult, externalResult, serviceName, durationSeconds]) + + const isWaiting = + (Result.isSuccess(servicesResult) && servicesResult.waiting) || + (Result.isSuccess(dbsResult) && dbsResult.waiting) || + (Result.isSuccess(externalResult) && externalResult.waiting) + + // Aggregate facts for the summary strip. Each datum gets its own muted + // label + sharp value so the eye lands on the numbers first, the labels + // second — same hierarchy the chart cards on Overview already use. + const summary = useMemo(() => { + if (rows.length === 0) return null + + const byKind = rows.reduce>( + (acc, row) => { + acc[row.kind] = (acc[row.kind] ?? 0) + 1 + return acc + }, + { service: 0, database: 0, http: 0, messaging: 0, rpc: 0 }, + ) + const breakdown = (["service", "database", "http", "messaging", "rpc"] as const) + .filter((k) => byKind[k] > 0) + .map((k) => `${byKind[k]} ${labelFor(k, byKind[k])}`) + .join(" · ") + + const topByCalls = [...rows].sort((a, b) => b.callsPerSec - a.callsPerSec)[0] + const topByErrors = [...rows] + .filter((r) => r.errorRate > 0) + .sort((a, b) => b.errorRate - a.errorRate)[0] + const topByLatency = [...rows].sort((a, b) => b.p95DurationMs - a.p95DurationMs)[0] + + return { breakdown, topByCalls, topByErrors, topByLatency } + }, [rows]) + + return ( +
+ {summary ? ( +
+ + {rows.length}{" "} + downstream + + {summary.breakdown} + + + {summary.topByErrors ? ( + + ) : ( + + )} + +
+ ) : null} + + +
+ ) +} + +interface HeadlineFactProps { + label: string + name: string + value: string + tone?: "error" +} + +function HeadlineFact({ label, name, value, tone }: HeadlineFactProps) { + return ( + + {label} + {name} + + {value} + + + ) +} + +function labelFor(kind: DependencyKind, count: number): string { + const map: Record = { + service: ["service", "services"], + database: ["database", "databases"], + http: ["external HTTP", "external HTTP"], + messaging: ["queue", "queues"], + rpc: ["RPC target", "RPC targets"], + } + return count === 1 ? map[kind][0] : map[kind][1] +} diff --git a/apps/web/src/lib/services/atoms/tinybird-query-atoms.ts b/apps/web/src/lib/services/atoms/tinybird-query-atoms.ts index aa65c29d..69e8db26 100644 --- a/apps/web/src/lib/services/atoms/tinybird-query-atoms.ts +++ b/apps/web/src/lib/services/atoms/tinybird-query-atoms.ts @@ -47,6 +47,7 @@ import { } from "@/api/tinybird/infra" import { getServiceUsage } from "@/api/tinybird/service-usage" import { getServiceMap, getServiceMapDbEdges, getServicePlatforms } from "@/api/tinybird/service-map" +import { getServiceExternalEdges } from "@/api/tinybird/service-external-edges" import { getServiceWorkloads } from "@/api/tinybird/service-infra" import { getServiceApdexTimeSeries, @@ -298,6 +299,10 @@ export const getServiceMapDbEdgesResultAtom = makeQueryAtomFamily(getServiceMapD staleTime: 15_000, }) +export const getServiceExternalEdgesResultAtom = makeQueryAtomFamily(getServiceExternalEdges, { + staleTime: 30_000, +}) + export const getServicePlatformsResultAtom = makeQueryAtomFamily(getServicePlatforms, { staleTime: 60_000, }) diff --git a/apps/web/src/routes/services/$serviceName.tsx b/apps/web/src/routes/services/$serviceName.tsx index cf8ace06..cd5bd953 100644 --- a/apps/web/src/routes/services/$serviceName.tsx +++ b/apps/web/src/routes/services/$serviceName.tsx @@ -13,6 +13,7 @@ import type { ChartReferenceLine, ChartTooltipMode, } from "@maple/ui/components/charts/_shared/chart-types" +import { Tabs, TabsList, TabsTrigger } from "@maple/ui/components/ui/tabs" import { getCustomChartServiceDetailResultAtom, getServiceApdexTimeSeriesResultAtom, @@ -24,11 +25,16 @@ import { PageRefreshProvider } from "@/components/time-range-picker/page-refresh import { TimeRangeHeaderControls } from "@/components/time-range-picker/time-range-header-controls" import { Button } from "@maple/ui/components/ui/button" import { BellIcon } from "@/components/icons" +import { ServiceDependenciesTab } from "@/components/services/service-dependencies-tab" + +const ServiceDetailTab = Schema.Literals(["overview", "dependencies"]) +type ServiceDetailTabValue = Schema.Schema.Type const serviceDetailSearchSchema = Schema.Struct({ startTime: Schema.optional(Schema.String), endTime: Schema.optional(Schema.String), timePreset: Schema.optional(Schema.String), + tab: Schema.optional(ServiceDetailTab), }) export const Route = effectRoute(createFileRoute("/services/$serviceName"))({ @@ -113,6 +119,84 @@ function ServiceDetailContent() { }) } + const activeTab: ServiceDetailTabValue = search.tab ?? "overview" + const handleTabChange = (value: unknown) => { + const next = value === "dependencies" ? "dependencies" : "overview" + navigate({ + replace: true, + search: (prev: Record) => ({ + ...prev, + tab: next === "overview" ? undefined : next, + }), + }) + } + + return ( + + {/* View switch lives inline with other page controls so it reads as a + perspective toggle, not a navigation bar. Sized to match the time + picker buttons (h-7) and tucked left of them so the visual order is: + "what view → what window → what action". */} + + + + Overview + + + Dependencies + + + + + +
+ } + > + {activeTab === "overview" ? ( + + ) : ( + + )} + + ) +} + +interface OverviewTabProps { + serviceName: string + effectiveStartTime: string + effectiveEndTime: string +} + +function OverviewTab({ serviceName, effectiveStartTime, effectiveEndTime }: OverviewTabProps) { const detailResult = useRetainedRefreshableResultValue( getCustomChartServiceDetailResultAtom({ data: { @@ -185,26 +269,5 @@ function ServiceDetailContent() { referenceLines: releaseMarkers, })) - return ( - - - - - } - > - - - ) + return } diff --git a/packages/domain/src/generated/clickhouse-schema.ts b/packages/domain/src/generated/clickhouse-schema.ts index 2adff56d..64220577 100644 --- a/packages/domain/src/generated/clickhouse-schema.ts +++ b/packages/domain/src/generated/clickhouse-schema.ts @@ -1,7 +1,7 @@ // This file is generated by scripts/generate-clickhouse-schema.ts // Do not edit manually. -export const projectRevision = "4edfaab355d12f571b55f57365336f07621a08a4fad3cd1522964c6597aed262" as const +export const projectRevision = "29804960ae922b01c05dd2705e8a70418d8d728811f30b96d3e758ac15012981" as const export const latestSnapshotStatements: ReadonlyArray = [ "CREATE TABLE IF NOT EXISTS alert_checks (\n OrgId LowCardinality(String),\n RuleId String,\n GroupKey String,\n Timestamp DateTime64(3),\n Status LowCardinality(String),\n SignalType LowCardinality(String),\n Comparator LowCardinality(String),\n Threshold Float64,\n ObservedValue Nullable(Float64),\n SampleCount UInt32,\n WindowMinutes UInt16,\n WindowStart DateTime64(3),\n WindowEnd DateTime64(3),\n ConsecutiveBreaches UInt16,\n ConsecutiveHealthy UInt16,\n IncidentId Nullable(String),\n IncidentTransition LowCardinality(String),\n EvaluationDurationMs UInt32\n)\nENGINE = MergeTree\nPARTITION BY toDate(Timestamp)\nORDER BY (OrgId, RuleId, GroupKey, Timestamp)\nTTL toDate(Timestamp) + INTERVAL 90 DAY", @@ -16,6 +16,8 @@ export const latestSnapshotStatements: ReadonlyArray = [ "CREATE TABLE IF NOT EXISTS metrics_gauge (\n OrgId LowCardinality(String),\n ResourceAttributes Map(LowCardinality(String), String),\n ResourceSchemaUrl String,\n ScopeName String,\n ScopeVersion String,\n ScopeAttributes Map(LowCardinality(String), String),\n ScopeSchemaUrl String,\n ServiceName LowCardinality(String),\n MetricName LowCardinality(String),\n MetricDescription LowCardinality(String),\n MetricUnit LowCardinality(String),\n Attributes Map(LowCardinality(String), String),\n StartTimeUnix DateTime64(9),\n TimeUnix DateTime64(9),\n Value Float64,\n Flags UInt32,\n ExemplarsTraceId Array(String),\n ExemplarsSpanId Array(String),\n ExemplarsTimestamp Array(DateTime64(9)),\n ExemplarsValue Array(Float64),\n ExemplarsFilteredAttributes Array(Map(LowCardinality(String), String))\n)\nENGINE = MergeTree\nPARTITION BY toDate(TimeUnix)\nORDER BY (OrgId, ServiceName, MetricName, Attributes, toUnixTimestamp64Nano(TimeUnix))\nTTL toDate(TimeUnix) + INTERVAL 365 DAY", "CREATE TABLE IF NOT EXISTS metrics_histogram (\n OrgId LowCardinality(String),\n ResourceAttributes Map(LowCardinality(String), String),\n ResourceSchemaUrl String,\n ScopeName String,\n ScopeVersion String,\n ScopeAttributes Map(LowCardinality(String), String),\n ScopeSchemaUrl String,\n ServiceName LowCardinality(String),\n MetricName LowCardinality(String),\n MetricDescription LowCardinality(String),\n MetricUnit LowCardinality(String),\n Attributes Map(LowCardinality(String), String),\n StartTimeUnix DateTime64(9),\n TimeUnix DateTime64(9),\n Count UInt64,\n Sum Float64,\n BucketCounts Array(UInt64),\n ExplicitBounds Array(Float64),\n ExemplarsTraceId Array(String),\n ExemplarsSpanId Array(String),\n ExemplarsTimestamp Array(DateTime64(9)),\n ExemplarsValue Array(Float64),\n ExemplarsFilteredAttributes Array(Map(LowCardinality(String), String)),\n Flags UInt32,\n Min Nullable(Float64),\n Max Nullable(Float64),\n AggregationTemporality Int32\n)\nENGINE = MergeTree\nPARTITION BY toDate(TimeUnix)\nORDER BY (OrgId, ServiceName, MetricName, Attributes, toUnixTimestamp64Nano(TimeUnix))\nTTL toDate(TimeUnix) + INTERVAL 365 DAY", "CREATE TABLE IF NOT EXISTS metrics_sum (\n OrgId LowCardinality(String),\n ResourceAttributes Map(LowCardinality(String), String),\n ResourceSchemaUrl String,\n ScopeName String,\n ScopeVersion String,\n ScopeAttributes Map(LowCardinality(String), String),\n ScopeSchemaUrl String,\n ServiceName LowCardinality(String),\n MetricName LowCardinality(String),\n MetricDescription LowCardinality(String),\n MetricUnit LowCardinality(String),\n Attributes Map(LowCardinality(String), String),\n StartTimeUnix DateTime64(9),\n TimeUnix DateTime64(9),\n Value Float64,\n Flags UInt32,\n ExemplarsTraceId Array(String),\n ExemplarsSpanId Array(String),\n ExemplarsTimestamp Array(DateTime64(9)),\n ExemplarsValue Array(Float64),\n ExemplarsFilteredAttributes Array(Map(LowCardinality(String), String)),\n AggregationTemporality Int32,\n IsMonotonic Bool\n)\nENGINE = MergeTree\nPARTITION BY toDate(TimeUnix)\nORDER BY (OrgId, ServiceName, MetricName, Attributes, toUnixTimestamp64Nano(TimeUnix))\nTTL toDate(TimeUnix) + INTERVAL 365 DAY", + "CREATE TABLE IF NOT EXISTS service_address_resolutions_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n SourceService LowCardinality(String),\n ParentServerAddress String,\n ResolvedTargetService LowCardinality(String),\n DeploymentEnv LowCardinality(String)\n)\nENGINE = ReplacingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, SourceService, ParentServerAddress, ResolvedTargetService)\nTTL toDate(Hour) + INTERVAL 90 DAY", + "CREATE TABLE IF NOT EXISTS service_external_edges_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n ServiceName LowCardinality(String),\n TargetType LowCardinality(String),\n TargetSystem LowCardinality(String),\n TargetName String,\n DeploymentEnv LowCardinality(String),\n CallCount SimpleAggregateFunction(sum, UInt64),\n ErrorCount SimpleAggregateFunction(sum, UInt64),\n DurationSumMs SimpleAggregateFunction(sum, Float64),\n MaxDurationMs SimpleAggregateFunction(max, Float64),\n SampleRateSum SimpleAggregateFunction(sum, Float64)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, ServiceName, TargetType, TargetName)\nTTL toDate(Hour) + INTERVAL 90 DAY", "CREATE TABLE IF NOT EXISTS service_map_children (\n OrgId LowCardinality(String),\n Timestamp DateTime,\n TraceId String,\n ParentSpanId String,\n ServiceName LowCardinality(String),\n SpanKind LowCardinality(String),\n Duration UInt64,\n StatusCode LowCardinality(String),\n TraceState String,\n DeploymentEnv LowCardinality(String)\n)\nENGINE = MergeTree\nPARTITION BY toDate(Timestamp)\nORDER BY (OrgId, TraceId, ParentSpanId, Timestamp)\nTTL Timestamp + INTERVAL 90 DAY", "CREATE TABLE IF NOT EXISTS service_map_db_edges_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n ServiceName LowCardinality(String),\n DbSystem LowCardinality(String),\n DeploymentEnv LowCardinality(String),\n CallCount SimpleAggregateFunction(sum, UInt64),\n ErrorCount SimpleAggregateFunction(sum, UInt64),\n DurationSumMs SimpleAggregateFunction(sum, Float64),\n MaxDurationMs SimpleAggregateFunction(max, Float64),\n SampledSpanCount SimpleAggregateFunction(sum, UInt64),\n UnsampledSpanCount SimpleAggregateFunction(sum, UInt64),\n SampleRateSum SimpleAggregateFunction(sum, Float64)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, ServiceName, DbSystem)\nTTL toDate(Hour) + INTERVAL 90 DAY", "CREATE TABLE IF NOT EXISTS service_map_edges_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n SourceService LowCardinality(String),\n TargetService String,\n DeploymentEnv LowCardinality(String),\n CallCount SimpleAggregateFunction(sum, UInt64),\n ErrorCount SimpleAggregateFunction(sum, UInt64),\n DurationSumMs SimpleAggregateFunction(sum, Float64),\n MaxDurationMs SimpleAggregateFunction(max, Float64),\n SampledSpanCount SimpleAggregateFunction(sum, UInt64),\n UnsampledSpanCount SimpleAggregateFunction(sum, UInt64),\n SampleRateSum SimpleAggregateFunction(sum, Float64)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, SourceService, TargetService)\nTTL toDate(Hour) + INTERVAL 90 DAY", @@ -38,6 +40,7 @@ export const latestSnapshotStatements: ReadonlyArray = [ "CREATE MATERIALIZED VIEW IF NOT EXISTS metric_catalog_gauge_mv TO metric_catalog AS\nSELECT\n OrgId,\n toStartOfHour(toDateTime(TimeUnix)) AS Hour,\n 'gauge' AS MetricType,\n ServiceName,\n MetricName,\n anyLast(MetricDescription) AS MetricDescription,\n anyLast(MetricUnit) AS MetricUnit,\n toUInt8(0) AS IsMonotonic,\n count() AS DataPointCount,\n min(toDateTime(TimeUnix)) AS FirstSeen,\n max(toDateTime(TimeUnix)) AS LastSeen\n FROM metrics_gauge\n GROUP BY OrgId, Hour, MetricType, ServiceName, MetricName", "CREATE MATERIALIZED VIEW IF NOT EXISTS metric_catalog_histogram_mv TO metric_catalog AS\nSELECT\n OrgId,\n toStartOfHour(toDateTime(TimeUnix)) AS Hour,\n 'histogram' AS MetricType,\n ServiceName,\n MetricName,\n anyLast(MetricDescription) AS MetricDescription,\n anyLast(MetricUnit) AS MetricUnit,\n toUInt8(0) AS IsMonotonic,\n count() AS DataPointCount,\n min(toDateTime(TimeUnix)) AS FirstSeen,\n max(toDateTime(TimeUnix)) AS LastSeen\n FROM metrics_histogram\n GROUP BY OrgId, Hour, MetricType, ServiceName, MetricName", "CREATE MATERIALIZED VIEW IF NOT EXISTS metric_catalog_sum_mv TO metric_catalog AS\nSELECT\n OrgId,\n toStartOfHour(toDateTime(TimeUnix)) AS Hour,\n 'sum' AS MetricType,\n ServiceName,\n MetricName,\n anyLast(MetricDescription) AS MetricDescription,\n anyLast(MetricUnit) AS MetricUnit,\n anyLast(toUInt8(IsMonotonic)) AS IsMonotonic,\n count() AS DataPointCount,\n min(toDateTime(TimeUnix)) AS FirstSeen,\n max(toDateTime(TimeUnix)) AS LastSeen\n FROM metrics_sum\n GROUP BY OrgId, Hour, MetricType, ServiceName, MetricName", + "CREATE MATERIALIZED VIEW IF NOT EXISTS service_external_edges_hourly_mv TO service_external_edges_hourly AS\nSELECT\n OrgId,\n toStartOfHour(toDateTime(Timestamp)) AS Hour,\n ServiceName,\n multiIf(\n SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', 'messaging',\n SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', 'rpc',\n 'http'\n ) AS TargetType,\n multiIf(\n SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', SpanAttributes['messaging.system'],\n SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', SpanAttributes['rpc.system'],\n ''\n ) AS TargetSystem,\n multiIf(\n SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '',\n if(SpanAttributes['messaging.destination'] != '', SpanAttributes['messaging.destination'], SpanAttributes['messaging.system']),\n SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '',\n if(SpanAttributes['rpc.service'] != '', SpanAttributes['rpc.service'], SpanAttributes['rpc.system']),\n if(SpanAttributes['server.address'] != '',\n SpanAttributes['server.address'],\n if(SpanAttributes['http.host'] != '',\n SpanAttributes['http.host'],\n SpanAttributes['url.authority']))\n ) AS TargetName,\n ResourceAttributes['deployment.environment'] AS DeploymentEnv,\n count() AS CallCount,\n countIf(StatusCode = 'Error') AS ErrorCount,\n sum(Duration / 1000000) AS DurationSumMs,\n max(Duration / 1000000) AS MaxDurationMs,\n sum(SampleRate) AS SampleRateSum\n FROM traces\n WHERE SpanKind IN ('Client', 'Producer')\n AND SpanAttributes['db.system.name'] = ''\n AND ServiceName != ''\n AND (\n SpanAttributes['server.address'] != ''\n OR SpanAttributes['http.host'] != ''\n OR SpanAttributes['url.authority'] != ''\n OR SpanAttributes['messaging.destination'] != ''\n OR SpanAttributes['messaging.system'] != ''\n OR SpanAttributes['rpc.service'] != ''\n OR SpanAttributes['rpc.system'] != ''\n )\n GROUP BY OrgId, Hour, ServiceName, TargetType, TargetSystem, TargetName, DeploymentEnv\n HAVING TargetName != ''", "CREATE MATERIALIZED VIEW IF NOT EXISTS service_map_children_mv TO service_map_children AS\nSELECT\n OrgId,\n toDateTime(Timestamp) AS Timestamp,\n TraceId,\n ParentSpanId,\n ServiceName,\n SpanKind,\n Duration,\n StatusCode,\n TraceState,\n ResourceAttributes['deployment.environment'] AS DeploymentEnv\n FROM traces\n WHERE SpanKind IN ('Server', 'Consumer')\n AND ParentSpanId != ''", "CREATE MATERIALIZED VIEW IF NOT EXISTS service_map_db_edges_hourly_mv TO service_map_db_edges_hourly AS\nSELECT\n OrgId,\n toStartOfHour(toDateTime(Timestamp)) AS Hour,\n ServiceName,\n SpanAttributes['db.system.name'] AS DbSystem,\n ResourceAttributes['deployment.environment'] AS DeploymentEnv,\n count() AS CallCount,\n countIf(StatusCode = 'Error') AS ErrorCount,\n sum(Duration / 1000000) AS DurationSumMs,\n max(Duration / 1000000) AS MaxDurationMs,\n countIf(TraceState LIKE '%th:%') AS SampledSpanCount,\n countIf(TraceState = '' OR TraceState NOT LIKE '%th:%') AS UnsampledSpanCount,\n sum(SampleRate) AS SampleRateSum\n FROM traces\n WHERE SpanKind IN ('Client', 'Producer')\n AND SpanAttributes['db.system.name'] != ''\n AND ServiceName != ''\n GROUP BY OrgId, Hour, ServiceName, DbSystem, DeploymentEnv", "CREATE MATERIALIZED VIEW IF NOT EXISTS service_map_spans_mv TO service_map_spans AS\nSELECT\n OrgId,\n toDateTime(Timestamp) AS Timestamp,\n TraceId,\n SpanId,\n ParentSpanId,\n ServiceName,\n SpanKind,\n Duration,\n StatusCode,\n TraceState,\n ResourceAttributes['deployment.environment'] AS DeploymentEnv\n FROM traces\n WHERE SpanKind IN ('Client', 'Producer', 'Server', 'Consumer')", diff --git a/packages/domain/src/http/query-engine.ts b/packages/domain/src/http/query-engine.ts index 5a0bab34..32c80dfc 100644 --- a/packages/domain/src/http/query-engine.ts +++ b/packages/domain/src/http/query-engine.ts @@ -242,6 +242,21 @@ export class ServiceDbEdgesResponse extends Schema.Class data: Schema.Array(Schema.Record(Schema.String, Schema.Unknown)), }) {} +export class ServiceExternalEdgesRequest extends Schema.Class( + "ServiceExternalEdgesRequest", +)({ + startTime: TinybirdDateTime, + endTime: TinybirdDateTime, + serviceName: Schema.String, + deploymentEnv: Schema.optional(Schema.String), +}) {} + +export class ServiceExternalEdgesResponse extends Schema.Class( + "ServiceExternalEdgesResponse", +)({ + data: Schema.Array(Schema.Record(Schema.String, Schema.Unknown)), +}) {} + const ServicePlatformLiteral = Schema.Literals(["kubernetes", "cloudflare", "lambda", "web", "unknown"]) export class ServicePlatformsRequest extends Schema.Class("ServicePlatformsRequest")({ @@ -1093,6 +1108,13 @@ export class QueryEngineApiGroup extends HttpApiGroup.make("queryEngine") error: queryEngineEndpointErrors, }), ) + .add( + HttpApiEndpoint.post("serviceExternalEdges", "/service-external-edges", { + payload: ServiceExternalEdgesRequest, + success: ServiceExternalEdgesResponse, + error: queryEngineEndpointErrors, + }), + ) .add( HttpApiEndpoint.post("servicePlatforms", "/service-platforms", { payload: ServicePlatformsRequest, diff --git a/packages/domain/src/tinybird/datasources.ts b/packages/domain/src/tinybird/datasources.ts index 46d75fc1..1192b61b 100644 --- a/packages/domain/src/tinybird/datasources.ts +++ b/packages/domain/src/tinybird/datasources.ts @@ -388,6 +388,108 @@ export const serviceMapDbEdgesHourly = defineDatasource("service_map_db_edges_ho export type ServiceMapDbEdgesHourlyRow = InferRow +/** + * Pre-aggregated hourly service-to-external-target edges for the service detail + * page's Dependencies tab (and, eventually, external nodes on the service map). + * + * One row per (OrgId, Hour, ServiceName, TargetType, TargetSystem, TargetName, + * DeploymentEnv) — captures Client/Producer spans WITHOUT `db.system.name` + * (those are in `service_map_db_edges_hourly`), keyed by what they're talking to: + * + * - http — `server.address` / `http.host` / `url.authority` + * - messaging — `messaging.system` + `messaging.destination` + * - rpc — `rpc.system` + `rpc.service` + * + * `TargetType` is LowCardinality(String) — not Enum8 — to match the + * `alert_checks` pattern (forward-compat with potential direct ingestion paths + * that don't support Enum8 JSONPath ingestion). Allowed values: 'http' | + * 'messaging' | 'rpc'. Populated by materialized view, not direct ingestion. + * + * Internal-service overlap (e.g. `auth-api` calling `users-api` shows up here + * as `http://users-api.svc.cluster.local`) is filtered at QUERY time via a + * LEFT ANTI JOIN against `service_address_resolutions_hourly`. + */ +export const serviceExternalEdgesHourly = defineDatasource("service_external_edges_hourly", { + description: + "Pre-aggregated hourly service-to-external-target edges (http / messaging / rpc) for the service-detail Dependencies tab. Captures Client/Producer spans WITHOUT db.system.name. Populated by materialized view.", + jsonPaths: false, + schema: { + OrgId: t.string().lowCardinality(), + Hour: t.dateTime(), + ServiceName: t.string().lowCardinality(), + TargetType: t.string().lowCardinality(), + TargetSystem: t.string().lowCardinality(), + TargetName: t.string(), + DeploymentEnv: t.string().lowCardinality(), + CallCount: t.simpleAggregateFunction("sum", t.uint64()), + ErrorCount: t.simpleAggregateFunction("sum", t.uint64()), + DurationSumMs: t.simpleAggregateFunction("sum", t.float64()), + MaxDurationMs: t.simpleAggregateFunction("max", t.float64()), + SampleRateSum: t.simpleAggregateFunction("sum", t.float64()), + }, + engine: engine.aggregatingMergeTree({ + partitionKey: "toDate(Hour)", + sortingKey: [ + "OrgId", + "Hour", + "DeploymentEnv", + "ServiceName", + "TargetType", + "TargetSystem", + "TargetName", + ], + ttl: "toDate(Hour) + INTERVAL 90 DAY", + }), +}) + +export type ServiceExternalEdgesHourlyRow = InferRow + +/** + * Resolved `(SourceService, parent-Client-span.server.address) → child-Server- + * span.ServiceName` facts emitted by `ServiceMapRollupService` from the same + * cross-span JOIN that fills `service_map_edges_hourly`. One row per resolved + * (sourceService, parentServerAddress, resolvedTargetService) triple per hour. + * + * Used by the Dependencies-tab external-edges query to anti-join out HTTP + * targets that actually resolve to a known internal service in the same window + * (so `auth-api → users-api.svc.cluster.local` doesn't show up under "External + * HTTP" when it's already represented as an internal service edge). + * + * Not populated by a materialized view — the parent→child JOIN is a cross-span + * operation that an incremental MV cannot express. Same caveat as + * `service_map_edges_hourly`. + */ +export const serviceAddressResolutionsHourly = defineDatasource( + "service_address_resolutions_hourly", + { + description: + "Resolved (sourceService, parent.server.address) → resolved targetService facts emitted by the ServiceMapRollupService rollup. Used to anti-join internal-service overlap out of the external-edges query.", + jsonPaths: false, + schema: { + OrgId: t.string().lowCardinality(), + Hour: t.dateTime(), + SourceService: t.string().lowCardinality(), + ParentServerAddress: t.string(), + ResolvedTargetService: t.string().lowCardinality(), + DeploymentEnv: t.string().lowCardinality(), + }, + engine: engine.replacingMergeTree({ + partitionKey: "toDate(Hour)", + sortingKey: [ + "OrgId", + "Hour", + "DeploymentEnv", + "SourceService", + "ParentServerAddress", + "ResolvedTargetService", + ], + ttl: "toDate(Hour) + INTERVAL 90 DAY", + }), + }, +) + +export type ServiceAddressResolutionsHourlyRow = InferRow + /** * Pre-aggregated hourly per-service platform attributes for the service map. * One row per (OrgId, Hour, ServiceName, DeploymentEnv) with the resource diff --git a/packages/domain/src/tinybird/materializations.ts b/packages/domain/src/tinybird/materializations.ts index d68a98f0..61cf4fcb 100644 --- a/packages/domain/src/tinybird/materializations.ts +++ b/packages/domain/src/tinybird/materializations.ts @@ -4,6 +4,7 @@ import { serviceMapSpans, serviceMapChildren, serviceMapDbEdgesHourly, + serviceExternalEdgesHourly, servicePlatformsHourly, serviceOverviewSpans, errorSpans, @@ -373,6 +374,83 @@ export const serviceMapDbEdgesHourlyMv = defineMaterializedView("service_map_db_ ], }) +/** + * Materialized view pre-aggregating service-to-external-target edges per hour. + * Captures Client/Producer spans WITHOUT `db.system.name` set (DB calls go to + * `service_map_db_edges_hourly_mv`) — i.e. plain HTTP outbound, messaging + * producers, and RPC clients. + * + * `TargetType` precedence: messaging > rpc > http. A span carrying both + * `messaging.system` and `server.address` (rare, but happens when a queue + * client also tags the broker address) lands as messaging — its identity is + * the queue, not the host. RPC is preferred over http for the same reason. + * + * For HTTP the fallback chain for `TargetName` is + * `server.address` → `http.host` → `url.authority` — modern OTel SDKs emit + * `server.address`; legacy ones still emit `http.host`; some emit `url.authority`. + */ +export const serviceExternalEdgesHourlyMv = defineMaterializedView( + "service_external_edges_hourly_mv", + { + description: + "Pre-aggregates Client/Producer spans without db.system.name into hourly service-to-external-target edges (http / messaging / rpc) for the service-detail Dependencies tab.", + datasource: serviceExternalEdgesHourly, + nodes: [ + node({ + name: "service_external_edges_hourly_mv_node", + sql: ` + SELECT + OrgId, + toStartOfHour(toDateTime(Timestamp)) AS Hour, + ServiceName, + multiIf( + SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', 'messaging', + SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', 'rpc', + 'http' + ) AS TargetType, + multiIf( + SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', SpanAttributes['messaging.system'], + SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', SpanAttributes['rpc.system'], + '' + ) AS TargetSystem, + multiIf( + SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', + if(SpanAttributes['messaging.destination'] != '', SpanAttributes['messaging.destination'], SpanAttributes['messaging.system']), + SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', + if(SpanAttributes['rpc.service'] != '', SpanAttributes['rpc.service'], SpanAttributes['rpc.system']), + if(SpanAttributes['server.address'] != '', + SpanAttributes['server.address'], + if(SpanAttributes['http.host'] != '', + SpanAttributes['http.host'], + SpanAttributes['url.authority'])) + ) AS TargetName, + ResourceAttributes['deployment.environment'] AS DeploymentEnv, + count() AS CallCount, + countIf(StatusCode = 'Error') AS ErrorCount, + sum(Duration / 1000000) AS DurationSumMs, + max(Duration / 1000000) AS MaxDurationMs, + sum(SampleRate) AS SampleRateSum + FROM traces + WHERE SpanKind IN ('Client', 'Producer') + AND SpanAttributes['db.system.name'] = '' + AND ServiceName != '' + AND ( + SpanAttributes['server.address'] != '' + OR SpanAttributes['http.host'] != '' + OR SpanAttributes['url.authority'] != '' + OR SpanAttributes['messaging.destination'] != '' + OR SpanAttributes['messaging.system'] != '' + OR SpanAttributes['rpc.service'] != '' + OR SpanAttributes['rpc.system'] != '' + ) + GROUP BY OrgId, Hour, ServiceName, TargetType, TargetSystem, TargetName, DeploymentEnv + HAVING TargetName != '' + `, + }), + ], + }, +) + /** * Materialized view pre-aggregating per-service hosting-platform attributes per hour. * Picks `max()` per attribute string so non-empty values dominate empty ones — diff --git a/packages/query-engine/src/ch/expr.ts b/packages/query-engine/src/ch/expr.ts index efc426fe..9836ddee 100644 --- a/packages/query-engine/src/ch/expr.ts +++ b/packages/query-engine/src/ch/expr.ts @@ -271,6 +271,8 @@ export { least_, greatest_, toStartOfInterval, + toStartOfHour, + toUnixTimestamp, intervalSub, formatDateTime, toDateTime, diff --git a/packages/query-engine/src/ch/functions/date-time.ts b/packages/query-engine/src/ch/functions/date-time.ts index 30463058..a1276e00 100644 --- a/packages/query-engine/src/ch/functions/date-time.ts +++ b/packages/query-engine/src/ch/functions/date-time.ts @@ -14,6 +14,27 @@ export function toStartOfInterval(col: Expr, seconds: number | Expr(raw(`toStartOfInterval(${compile(col.toFragment())}, INTERVAL ${secStr} SECOND)`)) } +/** + * `toStartOfHour(expr)` — floor a DateTime to its hour boundary. Equivalent to + * `toStartOfInterval(col, 3600)` but kept as a distinct function so queries + * that bucket on natural hours stay legible (the resolutions rollup, the + * service-map edge rollup, and the dependencies tab all read from + * `*_hourly` tables on this exact boundary). + */ +export function toStartOfHour(col: Expr): Expr { + return makeExpr(raw(`toStartOfHour(${compile(col.toFragment())})`)) +} + +/** + * `toUnixTimestamp(expr)` — convert a DateTime/DateTime64 to a UInt32 of + * seconds since epoch. Useful for stable JSON-numeric keys (e.g. the rollup's + * "have we already sealed this hour" check) without forcing the consumer to + * parse RFC3339. + */ +export function toUnixTimestamp(col: Expr): Expr { + return makeExpr(raw(`toUnixTimestamp(${compile(col.toFragment())})`)) +} + export function intervalSub(col: Expr, seconds: number | Expr): Expr { const secStr = typeof seconds === "number" diff --git a/packages/query-engine/src/ch/functions/index.ts b/packages/query-engine/src/ch/functions/index.ts index 9704047c..3dbf01ae 100644 --- a/packages/query-engine/src/ch/functions/index.ts +++ b/packages/query-engine/src/ch/functions/index.ts @@ -41,7 +41,14 @@ export { greatest_, } from "./numeric" -export { toStartOfInterval, intervalSub, formatDateTime, toDateTime } from "./date-time" +export { + toStartOfInterval, + toStartOfHour, + toUnixTimestamp, + intervalSub, + formatDateTime, + toDateTime, +} from "./date-time" export { if_, multiIf, coalesce, nullIf } from "./conditional" diff --git a/packages/query-engine/src/ch/index.ts b/packages/query-engine/src/ch/index.ts index c83b9ee8..cf08500b 100644 --- a/packages/query-engine/src/ch/index.ts +++ b/packages/query-engine/src/ch/index.ts @@ -269,15 +269,20 @@ export { type ServiceDbEdgesOutput, type ServicePlatformsOpts, type ServicePlatformsOutput, + serviceExternalEdgesSQL, + type ServiceExternalEdgesOpts, + type ServiceExternalEdgesOutput, } from "./queries/service-map" // Queries — Service Map hourly edge rollup export { serviceMapEdgesRollupSQL, serviceMapEdgesExistingHoursSQL, + serviceMapResolutionsRollupSQL, type ServiceMapEdgesRollupParams, type ServiceMapEdgesHourlyOutput, type ServiceMapEdgesExistingHour, + type ServiceAddressResolutionsHourlyOutput, } from "./queries/service-map-rollup" // Queries — Service Infrastructure (service.name ↔ k8s workload join) diff --git a/packages/query-engine/src/ch/queries/service-map-rollup.ts b/packages/query-engine/src/ch/queries/service-map-rollup.ts index c852ec93..f2f719f4 100644 --- a/packages/query-engine/src/ch/queries/service-map-rollup.ts +++ b/packages/query-engine/src/ch/queries/service-map-rollup.ts @@ -14,7 +14,12 @@ // --------------------------------------------------------------------------- import type { CompiledQuery } from "../compile" +import { compileCH } from "../compile" +import * as CH from "../expr" +import { param } from "../param" +import { from, fromQuery } from "../query" import { escapeClickHouseString } from "../../sql/sql-fragment" +import { ServiceMapEdgesHourly, Traces } from "../tables" import { serviceMapEdgeJoinSQL } from "./service-map" /** One pre-aggregated service-to-service edge bucket — mirrors the columns of @@ -58,13 +63,24 @@ export function serviceMapEdgesExistingHoursSQL(params: { startTime: string endTime: string }): CompiledQuery { - const esc = escapeClickHouseString - const sql = `SELECT DISTINCT toUnixTimestamp(Hour) AS hourTs -FROM service_map_edges_hourly -WHERE OrgId = '${esc(params.orgId)}' - AND Hour >= toDateTime('${esc(params.startTime)}') - AND Hour < toDateTime('${esc(params.endTime)}') -FORMAT JSON` + // `GROUP BY hourTs` collapses identical hour values across edge rows — the + // rollup only cares about which hour starts have been sealed, not which + // edges live in them. Same semantics as SELECT DISTINCT, with the DSL. + const query = from(ServiceMapEdgesHourly) + .select(($) => ({ hourTs: CH.toUnixTimestamp($.Hour) })) + .where(($) => [ + $.OrgId.eq(param.string("orgId")), + $.Hour.gte(param.dateTime("startTime")), + $.Hour.lt(param.dateTime("endTime")), + ]) + .groupBy("hourTs") + .format("JSON") + + const { sql } = compileCH(query, { + orgId: params.orgId, + startTime: params.startTime, + endTime: params.endTime, + }) return { sql, @@ -93,3 +109,102 @@ FORMAT JSON` castRows: (rows) => rows as unknown as ReadonlyArray, } } + +// --------------------------------------------------------------------------- +// Resolutions rollup (companion of the edges rollup) +// +// Emits one row per resolved `(SourceService, parent.server.address) → +// child.ServiceName` triple per hour. Used by `serviceExternalEdgesSQL`'s +// LEFT ANTI JOIN to suppress internal-service HTTP overlap from the +// Dependencies tab's "external" view. +// +// Reads raw `traces` (not `service_map_spans`) because the projection MV +// doesn't carry SpanAttributes; we need `server.address` on the parent. Runs +// once per completed hour from `ServiceMapRollupService.processOrg`. +// --------------------------------------------------------------------------- + +/** One resolved address-to-service mapping bucket — mirrors the columns of + * `service_address_resolutions_hourly`. */ +export interface ServiceAddressResolutionsHourlyOutput { + readonly OrgId: string + readonly Hour: string + readonly SourceService: string + readonly ParentServerAddress: string + readonly ResolvedTargetService: string + readonly DeploymentEnv: string +} + +export function serviceMapResolutionsRollupSQL( + params: ServiceMapEdgesRollupParams, +): CompiledQuery { + // Parent side: Client/Producer spans, projecting just what the join + outer + // SELECT needs. The map lookups (`server.address`, `deployment.environment`) + // happen here so the outer query reads them as plain columns instead of + // re-evaluating the map per output row. + const parents = from(Traces) + .select(($) => ({ + OrgId: $.OrgId, + Timestamp: $.Timestamp, + TraceId: $.TraceId, + SpanId: $.SpanId, + ServiceName: $.ServiceName, + ServerAddress: $.SpanAttributes.get("server.address"), + DeploymentEnv: $.ResourceAttributes.get("deployment.environment"), + })) + .where(($) => [ + CH.inList($.SpanKind, ["Client", "Producer"]), + $.Timestamp.gte(param.dateTime("hourStart")), + $.Timestamp.lt(param.dateTime("hourEnd")), + $.OrgId.eq(param.string("orgId")), + $.SpanAttributes.get("server.address").neq(""), + ]) + + // Child side: Server/Consumer spans. Only the columns needed to JOIN on + // (TraceId, ParentSpanId) and to project the resolved target ServiceName. + const children = from(Traces) + .select(($) => ({ + TraceId: $.TraceId, + ParentSpanId: $.ParentSpanId, + ServiceName: $.ServiceName, + })) + .where(($) => [ + CH.inList($.SpanKind, ["Server", "Consumer"]), + $.Timestamp.gte(param.dateTime("hourStart")), + $.Timestamp.lt(param.dateTime("hourEnd")), + $.OrgId.eq(param.string("orgId")), + ]) + + const query = fromQuery(parents, "p") + .innerJoinQuery(children, "c", (p, c) => + p.SpanId.eq(c.ParentSpanId).and(p.TraceId.eq(c.TraceId)), + ) + .select(($) => ({ + OrgId: $.OrgId, + Hour: CH.toStartOfHour($.Timestamp), + SourceService: $.ServiceName, + ParentServerAddress: $.ServerAddress, + ResolvedTargetService: $.c.ServiceName, + DeploymentEnv: $.DeploymentEnv, + })) + .where(($) => [$.ServiceName.neq($.c.ServiceName)]) + .groupBy( + "OrgId", + "Hour", + "SourceService", + "ParentServerAddress", + "ResolvedTargetService", + "DeploymentEnv", + ) + .format("JSON") + + const { sql } = compileCH(query, { + orgId: params.orgId, + hourStart: params.hourStart, + hourEnd: params.hourEnd, + }) + + return { + sql, + castRows: (rows) => rows as unknown as ReadonlyArray, + } +} diff --git a/packages/query-engine/src/ch/queries/service-map.test.ts b/packages/query-engine/src/ch/queries/service-map.test.ts new file mode 100644 index 00000000..fa53c10e --- /dev/null +++ b/packages/query-engine/src/ch/queries/service-map.test.ts @@ -0,0 +1,143 @@ +import { describe, expect, it } from "vitest" +import { serviceExternalEdgesSQL } from "./service-map" +import { serviceMapResolutionsRollupSQL } from "./service-map-rollup" + +const baseParams = { + orgId: "org_1", + startTime: "2024-01-01 00:00:00", + endTime: "2024-01-02 00:00:00", +} + +// --------------------------------------------------------------------------- +// serviceExternalEdgesSQL +// --------------------------------------------------------------------------- + +describe("serviceExternalEdgesSQL", () => { + it("scopes by org, service, and time window", () => { + const { sql } = serviceExternalEdgesSQL({ serviceName: "artifacts-api" }, baseParams) + expect(sql).toContain("OrgId = 'org_1'") + expect(sql).toContain("ServiceName = 'artifacts-api'") + expect(sql).toContain("toStartOfHour(toDateTime('2024-01-01 00:00:00'))") + expect(sql).toContain("toStartOfHour(toDateTime('2024-01-02 00:00:00'))") + }) + + it("unions hourly MV branch with raw-traces fallback for the in-progress hour", () => { + const { sql } = serviceExternalEdgesSQL({ serviceName: "artifacts-api" }, baseParams) + expect(sql).toContain("FROM service_external_edges_hourly") + expect(sql).toContain("FROM traces") + expect(sql).toContain("UNION ALL") + // Recent branch must filter to the in-progress hour [endHour, endTime]. + expect(sql).toContain("Timestamp >= toStartOfHour(toDateTime('2024-01-02 00:00:00'))") + }) + + it("excludes db.system.name from the raw-traces branch (DB edges are a separate MV)", () => { + const { sql } = serviceExternalEdgesSQL({ serviceName: "artifacts-api" }, baseParams) + expect(sql).toContain("SpanAttributes['db.system.name'] = ''") + }) + + it("applies messaging > rpc > http precedence in the multiIf", () => { + const { sql } = serviceExternalEdgesSQL({ serviceName: "artifacts-api" }, baseParams) + // First branch of multiIf must be the messaging predicate. + const multiIfIdx = sql.indexOf("multiIf(") + expect(multiIfIdx).toBeGreaterThan(-1) + const after = sql.slice(multiIfIdx, multiIfIdx + 400) + const msgIdx = after.indexOf("'messaging'") + const rpcIdx = after.indexOf("'rpc'") + const httpIdx = after.indexOf("'http'") + expect(msgIdx).toBeGreaterThan(-1) + expect(rpcIdx).toBeGreaterThan(-1) + expect(httpIdx).toBeGreaterThan(-1) + expect(msgIdx).toBeLessThan(rpcIdx) + expect(rpcIdx).toBeLessThan(httpIdx) + }) + + it("anti-joins internal-service overlap from the resolutions table for HTTP only", () => { + const { sql } = serviceExternalEdgesSQL({ serviceName: "artifacts-api" }, baseParams) + expect(sql).toContain("FROM service_address_resolutions_hourly") + expect(sql).toContain("targetType = 'http'") + expect(sql).toContain("targetName IN (") + }) + + it("threads deploymentEnv into both branches and the resolutions anti-join", () => { + const { sql } = serviceExternalEdgesSQL( + { serviceName: "artifacts-api", deploymentEnv: "production" }, + baseParams, + ) + expect(sql).toContain("DeploymentEnv = 'production'") + expect(sql).toContain("ResourceAttributes['deployment.environment'] = 'production'") + }) + + it("groups by target identity and orders by callCount desc", () => { + const { sql } = serviceExternalEdgesSQL({ serviceName: "artifacts-api" }, baseParams) + expect(sql).toContain("GROUP BY sourceService, targetType, targetSystem, targetName") + expect(sql).toContain("ORDER BY callCount DESC") + expect(sql).toContain("LIMIT 200") + expect(sql).toContain("FORMAT JSON") + }) + + it("escapes single quotes in serviceName / orgId to prevent SQL injection", () => { + const { sql } = serviceExternalEdgesSQL( + { serviceName: "weird'service" }, + { ...baseParams, orgId: "org'attack" }, + ) + expect(sql).toContain("ServiceName = 'weird\\'service'") + expect(sql).toContain("OrgId = 'org\\'attack'") + }) +}) + +// --------------------------------------------------------------------------- +// serviceMapResolutionsRollupSQL — companion of the edges rollup +// --------------------------------------------------------------------------- + +describe("serviceMapResolutionsRollupSQL", () => { + const hourParams = { + orgId: "org_1", + hourStart: "2024-01-01 00:00:00", + hourEnd: "2024-01-01 01:00:00", + } + + it("joins parent Client/Producer spans to child Server/Consumer spans", () => { + const { sql } = serviceMapResolutionsRollupSQL(hourParams) + expect(sql).toContain("SpanKind IN ('Client', 'Producer')") + expect(sql).toContain("SpanKind IN ('Server', 'Consumer')") + expect(sql).toContain("ON (p.SpanId = c.ParentSpanId AND p.TraceId = c.TraceId)") + }) + + it("projects parent server.address as the resolution key", () => { + const { sql } = serviceMapResolutionsRollupSQL(hourParams) + // Map lookup is pushed into the parent subquery as `ServerAddress`, so the + // outer SELECT reads a flat column instead of re-evaluating the map. + expect(sql).toContain("SpanAttributes['server.address'] AS ServerAddress") + expect(sql).toContain("p.ServerAddress AS ParentServerAddress") + expect(sql).toContain("c.ServiceName AS ResolvedTargetService") + }) + + it("hour-buckets via toStartOfHour, scopes by org", () => { + const { sql } = serviceMapResolutionsRollupSQL(hourParams) + expect(sql).toContain("toStartOfHour(p.Timestamp) AS Hour") + expect(sql).toContain("OrgId = 'org_1'") + }) + + it("drops same-service edges and empty server.address", () => { + const { sql } = serviceMapResolutionsRollupSQL(hourParams) + expect(sql).toContain("p.ServiceName != c.ServiceName") + expect(sql).toContain("SpanAttributes['server.address'] != ''") + }) + + it("bounds the join to a single hour on both sides", () => { + const { sql } = serviceMapResolutionsRollupSQL(hourParams) + expect(sql).toContain("Timestamp >= '2024-01-01 00:00:00'") + expect(sql).toContain("Timestamp < '2024-01-01 01:00:00'") + // Both branches must enforce the hour bound — count occurrences. + const matches = sql.match(/Timestamp >= '2024-01-01 00:00:00'/g) + expect(matches?.length).toBe(2) + }) + + it("groups by the resolution key tuple and formats as JSON", () => { + const { sql } = serviceMapResolutionsRollupSQL(hourParams) + expect(sql).toContain( + "GROUP BY OrgId, Hour, SourceService, ParentServerAddress, ResolvedTargetService, DeploymentEnv", + ) + expect(sql).toContain("FORMAT JSON") + }) +}) diff --git a/packages/query-engine/src/ch/queries/service-map.ts b/packages/query-engine/src/ch/queries/service-map.ts index dedbaff9..bcb8b92f 100644 --- a/packages/query-engine/src/ch/queries/service-map.ts +++ b/packages/query-engine/src/ch/queries/service-map.ts @@ -5,7 +5,11 @@ // --------------------------------------------------------------------------- import { escapeClickHouseString } from "../../sql/sql-fragment" -import type { CompiledQuery } from "../compile" +import { compileCH, type CompiledQuery } from "../compile" +import * as CH from "../expr" +import { param } from "../param" +import { from } from "../query" +import { ServicePlatformsHourly } from "../tables" // --------------------------------------------------------------------------- // Service dependencies @@ -294,6 +298,167 @@ FORMAT JSON` } } +// --------------------------------------------------------------------------- +// Service ↔ external target edges (http / messaging / rpc) +// +// Surfaces non-DB Client/Producer outbound calls — HTTP endpoints, message +// queues, RPC targets — as a unified inventory for the service-detail page's +// "Dependencies" tab. Mirrors the DB-edges pattern: hourly MV (sealed buckets) +// UNION ALL with raw-traces fallback (in-progress hour), then de-duplicated +// against `service_address_resolutions_hourly` so HTTP targets whose address +// resolves to a known internal service (in the same window) drop out — those +// already appear under "Services" via `serviceDependenciesSQL`. +// --------------------------------------------------------------------------- + +export interface ServiceExternalEdgesOpts { + deploymentEnv?: string + serviceName: string +} + +export interface ServiceExternalEdgesOutput { + readonly sourceService: string + readonly targetType: "http" | "messaging" | "rpc" + readonly targetSystem: string + readonly targetName: string + readonly callCount: number + readonly errorCount: number + readonly avgDurationMs: number + readonly p95DurationMs: number + readonly estimatedSpanCount: number +} + +export function serviceExternalEdgesSQL( + opts: ServiceExternalEdgesOpts, + params: { orgId: string; startTime: string; endTime: string }, +): CompiledQuery { + const esc = escapeClickHouseString + const envFilterMv = opts.deploymentEnv + ? `AND DeploymentEnv = '${esc(opts.deploymentEnv)}'` + : "" + const envFilterRaw = opts.deploymentEnv + ? `AND ResourceAttributes['deployment.environment'] = '${esc(opts.deploymentEnv)}'` + : "" + const envFilterRes = opts.deploymentEnv + ? `AND DeploymentEnv = '${esc(opts.deploymentEnv)}'` + : "" + + // Hourly branch: sealed buckets from the MV-fed table. Carries + // `bucket*` aliases so the outer aggregate can't collide with inner ones + // (same nested-aggregate optimizer gotcha as `serviceDbEdgesSQL`). + const hourlyEdges = `SELECT + ServiceName AS sourceService, + TargetType AS targetType, + TargetSystem AS targetSystem, + TargetName AS targetName, + sum(CallCount) AS bucketCallCount, + sum(ErrorCount) AS bucketErrorCount, + sum(DurationSumMs) AS bucketDurationSumMs, + max(MaxDurationMs) AS bucketMaxDurationMs, + sum(if(SampleRateSum > 0, SampleRateSum, toFloat64(CallCount))) AS bucketEstimatedSpanCount + FROM service_external_edges_hourly + WHERE OrgId = '${esc(params.orgId)}' + AND ServiceName = '${esc(opts.serviceName)}' + AND Hour >= toStartOfHour(toDateTime('${esc(params.startTime)}')) + AND Hour < toStartOfHour(toDateTime('${esc(params.endTime)}')) + AND TargetName != '' + ${envFilterMv} + GROUP BY sourceService, targetType, targetSystem, targetName` + + // Recent branch: raw `traces` for the in-progress hour only. Mirrors the + // `multiIf` precedence used by the MV (messaging > rpc > http) so the + // two branches produce identical row shapes for the same span. + const recentEdges = `SELECT + ServiceName AS sourceService, + multiIf( + SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', 'messaging', + SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', 'rpc', + 'http' + ) AS targetType, + multiIf( + SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', SpanAttributes['messaging.system'], + SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', SpanAttributes['rpc.system'], + '' + ) AS targetSystem, + multiIf( + SpanAttributes['messaging.destination'] != '' OR SpanAttributes['messaging.system'] != '', + if(SpanAttributes['messaging.destination'] != '', SpanAttributes['messaging.destination'], SpanAttributes['messaging.system']), + SpanAttributes['rpc.service'] != '' OR SpanAttributes['rpc.system'] != '', + if(SpanAttributes['rpc.service'] != '', SpanAttributes['rpc.service'], SpanAttributes['rpc.system']), + if(SpanAttributes['server.address'] != '', + SpanAttributes['server.address'], + if(SpanAttributes['http.host'] != '', + SpanAttributes['http.host'], + SpanAttributes['url.authority'])) + ) AS targetName, + count() AS bucketCallCount, + countIf(StatusCode = 'Error') AS bucketErrorCount, + sum(Duration / 1000000) AS bucketDurationSumMs, + max(Duration / 1000000) AS bucketMaxDurationMs, + sum(SampleRate) AS bucketEstimatedSpanCount + FROM traces + WHERE OrgId = '${esc(params.orgId)}' + AND ServiceName = '${esc(opts.serviceName)}' + AND Timestamp >= toStartOfHour(toDateTime('${esc(params.endTime)}')) + AND Timestamp <= '${esc(params.endTime)}' + AND SpanKind IN ('Client', 'Producer') + AND SpanAttributes['db.system.name'] = '' + AND ( + SpanAttributes['server.address'] != '' + OR SpanAttributes['http.host'] != '' + OR SpanAttributes['url.authority'] != '' + OR SpanAttributes['messaging.destination'] != '' + OR SpanAttributes['messaging.system'] != '' + OR SpanAttributes['rpc.service'] != '' + OR SpanAttributes['rpc.system'] != '' + ) + ${envFilterRaw} + GROUP BY sourceService, targetType, targetSystem, targetName + HAVING targetName != ''` + + // Internal-service overlap suppression: drop HTTP rows whose `targetName` + // resolves to a known internal service in the same window. Messaging and + // RPC pass through unchanged (queues/RPC services are never the same + // identity as an internal service name). Scoped to `[startHour, endHour]` + // so we don't anti-join against ancient resolutions. + const sql = `SELECT + sourceService, + targetType, + targetSystem, + targetName, + sum(bucketCallCount) AS callCount, + sum(bucketErrorCount) AS errorCount, + sum(bucketDurationSumMs) / nullIf(sum(bucketCallCount), 0) AS avgDurationMs, + max(bucketMaxDurationMs) AS p95DurationMs, + sum(bucketEstimatedSpanCount) AS estimatedSpanCount +FROM ( + ${hourlyEdges} + UNION ALL + ${recentEdges} +) AS edges +WHERE NOT ( + targetType = 'http' + AND targetName IN ( + SELECT DISTINCT ParentServerAddress + FROM service_address_resolutions_hourly + WHERE OrgId = '${esc(params.orgId)}' + AND SourceService = '${esc(opts.serviceName)}' + AND Hour >= toStartOfHour(toDateTime('${esc(params.startTime)}')) + AND Hour <= toDateTime('${esc(params.endTime)}') + AND ParentServerAddress != '' + ${envFilterRes} + ) +) +GROUP BY sourceService, targetType, targetSystem, targetName +ORDER BY callCount DESC +LIMIT 200 +FORMAT JSON` + + return { + sql, + castRows: (rows) => rows as unknown as ReadonlyArray, + } +} + // --------------------------------------------------------------------------- // Service hosting platform // @@ -330,30 +495,38 @@ export function servicePlatformsSQL( opts: ServicePlatformsOpts, params: { orgId: string; startTime: string; endTime: string }, ): CompiledQuery { - const esc = escapeClickHouseString - const envFilter = opts.deploymentEnv - ? `AND DeploymentEnv = '${esc(opts.deploymentEnv)}'` - : "" + const query = from(ServicePlatformsHourly) + .select(($) => ({ + serviceName: $.ServiceName, + // `max()` on a SimpleAggregateFunction(max, String) column merges + // non-empty strings to win over empty ones — the "did any span in + // this window carry this attribute" semantics the platform + // classifier needs. + k8sCluster: CH.max_($.K8sCluster), + k8sPodName: CH.max_($.K8sPodName), + k8sDeploymentName: CH.max_($.K8sDeploymentName), + cloudPlatform: CH.max_($.CloudPlatform), + cloudProvider: CH.max_($.CloudProvider), + faasName: CH.max_($.FaasName), + mapleSdkType: CH.max_($.MapleSdkType), + processRuntimeName: CH.max_($.ProcessRuntimeName), + })) + .where(($) => [ + $.OrgId.eq(param.string("orgId")), + $.Hour.gte(CH.toStartOfHour(param.dateTime("startTime"))), + $.Hour.lte(param.dateTime("endTime")), + $.ServiceName.neq(""), + opts.deploymentEnv ? $.DeploymentEnv.eq(opts.deploymentEnv) : undefined, + ]) + .groupBy("serviceName") + .limit(500) + .format("JSON") - const sql = `SELECT - ServiceName AS serviceName, - max(K8sCluster) AS k8sCluster, - max(K8sPodName) AS k8sPodName, - max(K8sDeploymentName) AS k8sDeploymentName, - max(CloudPlatform) AS cloudPlatform, - max(CloudProvider) AS cloudProvider, - max(FaasName) AS faasName, - max(MapleSdkType) AS mapleSdkType, - max(ProcessRuntimeName) AS processRuntimeName -FROM service_platforms_hourly -WHERE OrgId = '${esc(params.orgId)}' - AND Hour >= toStartOfHour(toDateTime('${esc(params.startTime)}')) - AND Hour <= '${esc(params.endTime)}' - AND ServiceName != '' - ${envFilter} -GROUP BY serviceName -LIMIT 500 -FORMAT JSON` + const { sql } = compileCH(query, { + orgId: params.orgId, + startTime: params.startTime, + endTime: params.endTime, + }) return { sql, From 388d48789b893c29c2d32d36fd5d053fb7952ec9 Mon Sep 17 00:00:00 2001 From: Makisuo Date: Wed, 20 May 2026 12:54:11 +0200 Subject: [PATCH 2/3] s --- apps/api/src/services/ServiceMapRollupService.ts | 8 ++++++++ packages/domain/src/generated/clickhouse-schema.ts | 4 ++-- packages/query-engine/src/ch/queries/service-map.ts | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/apps/api/src/services/ServiceMapRollupService.ts b/apps/api/src/services/ServiceMapRollupService.ts index 42d6537e..8fe6a07b 100644 --- a/apps/api/src/services/ServiceMapRollupService.ts +++ b/apps/api/src/services/ServiceMapRollupService.ts @@ -113,6 +113,14 @@ export class ServiceMapRollupService extends Context.Service< // SQL pass (~same cost as the edges JOIN) so the existing rollup // query keeps its tight shape; failure of one ingest doesn't // invalidate the other (per-org Effect failure already isolated). + // + // NOTE: the hour is marked "done" purely by the presence of an edges + // row (see `serviceMapEdgesExistingHoursSQL`). If the edges write + // succeeds but the resolutions write fails (warehouse error, ingest + // throttling), the next tick will skip the hour and the resolutions + // gap is permanent — manifesting as internal-service HTTP calls + // leaking into the Dependencies "External" tab for that window. + // Backfill = re-running this rollup with the edges row deleted. const resolutionsRollup = CH.serviceMapResolutionsRollupSQL({ orgId, hourStart, diff --git a/packages/domain/src/generated/clickhouse-schema.ts b/packages/domain/src/generated/clickhouse-schema.ts index 64220577..73e3af41 100644 --- a/packages/domain/src/generated/clickhouse-schema.ts +++ b/packages/domain/src/generated/clickhouse-schema.ts @@ -1,7 +1,7 @@ // This file is generated by scripts/generate-clickhouse-schema.ts // Do not edit manually. -export const projectRevision = "29804960ae922b01c05dd2705e8a70418d8d728811f30b96d3e758ac15012981" as const +export const projectRevision = "2addb09f5853b15750a3394934ee507efd6b37a7511695477698f141c1617d72" as const export const latestSnapshotStatements: ReadonlyArray = [ "CREATE TABLE IF NOT EXISTS alert_checks (\n OrgId LowCardinality(String),\n RuleId String,\n GroupKey String,\n Timestamp DateTime64(3),\n Status LowCardinality(String),\n SignalType LowCardinality(String),\n Comparator LowCardinality(String),\n Threshold Float64,\n ObservedValue Nullable(Float64),\n SampleCount UInt32,\n WindowMinutes UInt16,\n WindowStart DateTime64(3),\n WindowEnd DateTime64(3),\n ConsecutiveBreaches UInt16,\n ConsecutiveHealthy UInt16,\n IncidentId Nullable(String),\n IncidentTransition LowCardinality(String),\n EvaluationDurationMs UInt32\n)\nENGINE = MergeTree\nPARTITION BY toDate(Timestamp)\nORDER BY (OrgId, RuleId, GroupKey, Timestamp)\nTTL toDate(Timestamp) + INTERVAL 90 DAY", @@ -17,7 +17,7 @@ export const latestSnapshotStatements: ReadonlyArray = [ "CREATE TABLE IF NOT EXISTS metrics_histogram (\n OrgId LowCardinality(String),\n ResourceAttributes Map(LowCardinality(String), String),\n ResourceSchemaUrl String,\n ScopeName String,\n ScopeVersion String,\n ScopeAttributes Map(LowCardinality(String), String),\n ScopeSchemaUrl String,\n ServiceName LowCardinality(String),\n MetricName LowCardinality(String),\n MetricDescription LowCardinality(String),\n MetricUnit LowCardinality(String),\n Attributes Map(LowCardinality(String), String),\n StartTimeUnix DateTime64(9),\n TimeUnix DateTime64(9),\n Count UInt64,\n Sum Float64,\n BucketCounts Array(UInt64),\n ExplicitBounds Array(Float64),\n ExemplarsTraceId Array(String),\n ExemplarsSpanId Array(String),\n ExemplarsTimestamp Array(DateTime64(9)),\n ExemplarsValue Array(Float64),\n ExemplarsFilteredAttributes Array(Map(LowCardinality(String), String)),\n Flags UInt32,\n Min Nullable(Float64),\n Max Nullable(Float64),\n AggregationTemporality Int32\n)\nENGINE = MergeTree\nPARTITION BY toDate(TimeUnix)\nORDER BY (OrgId, ServiceName, MetricName, Attributes, toUnixTimestamp64Nano(TimeUnix))\nTTL toDate(TimeUnix) + INTERVAL 365 DAY", "CREATE TABLE IF NOT EXISTS metrics_sum (\n OrgId LowCardinality(String),\n ResourceAttributes Map(LowCardinality(String), String),\n ResourceSchemaUrl String,\n ScopeName String,\n ScopeVersion String,\n ScopeAttributes Map(LowCardinality(String), String),\n ScopeSchemaUrl String,\n ServiceName LowCardinality(String),\n MetricName LowCardinality(String),\n MetricDescription LowCardinality(String),\n MetricUnit LowCardinality(String),\n Attributes Map(LowCardinality(String), String),\n StartTimeUnix DateTime64(9),\n TimeUnix DateTime64(9),\n Value Float64,\n Flags UInt32,\n ExemplarsTraceId Array(String),\n ExemplarsSpanId Array(String),\n ExemplarsTimestamp Array(DateTime64(9)),\n ExemplarsValue Array(Float64),\n ExemplarsFilteredAttributes Array(Map(LowCardinality(String), String)),\n AggregationTemporality Int32,\n IsMonotonic Bool\n)\nENGINE = MergeTree\nPARTITION BY toDate(TimeUnix)\nORDER BY (OrgId, ServiceName, MetricName, Attributes, toUnixTimestamp64Nano(TimeUnix))\nTTL toDate(TimeUnix) + INTERVAL 365 DAY", "CREATE TABLE IF NOT EXISTS service_address_resolutions_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n SourceService LowCardinality(String),\n ParentServerAddress String,\n ResolvedTargetService LowCardinality(String),\n DeploymentEnv LowCardinality(String)\n)\nENGINE = ReplacingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, SourceService, ParentServerAddress, ResolvedTargetService)\nTTL toDate(Hour) + INTERVAL 90 DAY", - "CREATE TABLE IF NOT EXISTS service_external_edges_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n ServiceName LowCardinality(String),\n TargetType LowCardinality(String),\n TargetSystem LowCardinality(String),\n TargetName String,\n DeploymentEnv LowCardinality(String),\n CallCount SimpleAggregateFunction(sum, UInt64),\n ErrorCount SimpleAggregateFunction(sum, UInt64),\n DurationSumMs SimpleAggregateFunction(sum, Float64),\n MaxDurationMs SimpleAggregateFunction(max, Float64),\n SampleRateSum SimpleAggregateFunction(sum, Float64)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, ServiceName, TargetType, TargetName)\nTTL toDate(Hour) + INTERVAL 90 DAY", + "CREATE TABLE IF NOT EXISTS service_external_edges_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n ServiceName LowCardinality(String),\n TargetType LowCardinality(String),\n TargetSystem LowCardinality(String),\n TargetName String,\n DeploymentEnv LowCardinality(String),\n CallCount SimpleAggregateFunction(sum, UInt64),\n ErrorCount SimpleAggregateFunction(sum, UInt64),\n DurationSumMs SimpleAggregateFunction(sum, Float64),\n MaxDurationMs SimpleAggregateFunction(max, Float64),\n SampleRateSum SimpleAggregateFunction(sum, Float64)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, ServiceName, TargetType, TargetSystem, TargetName)\nTTL toDate(Hour) + INTERVAL 90 DAY", "CREATE TABLE IF NOT EXISTS service_map_children (\n OrgId LowCardinality(String),\n Timestamp DateTime,\n TraceId String,\n ParentSpanId String,\n ServiceName LowCardinality(String),\n SpanKind LowCardinality(String),\n Duration UInt64,\n StatusCode LowCardinality(String),\n TraceState String,\n DeploymentEnv LowCardinality(String)\n)\nENGINE = MergeTree\nPARTITION BY toDate(Timestamp)\nORDER BY (OrgId, TraceId, ParentSpanId, Timestamp)\nTTL Timestamp + INTERVAL 90 DAY", "CREATE TABLE IF NOT EXISTS service_map_db_edges_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n ServiceName LowCardinality(String),\n DbSystem LowCardinality(String),\n DeploymentEnv LowCardinality(String),\n CallCount SimpleAggregateFunction(sum, UInt64),\n ErrorCount SimpleAggregateFunction(sum, UInt64),\n DurationSumMs SimpleAggregateFunction(sum, Float64),\n MaxDurationMs SimpleAggregateFunction(max, Float64),\n SampledSpanCount SimpleAggregateFunction(sum, UInt64),\n UnsampledSpanCount SimpleAggregateFunction(sum, UInt64),\n SampleRateSum SimpleAggregateFunction(sum, Float64)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, ServiceName, DbSystem)\nTTL toDate(Hour) + INTERVAL 90 DAY", "CREATE TABLE IF NOT EXISTS service_map_edges_hourly (\n OrgId LowCardinality(String),\n Hour DateTime,\n SourceService LowCardinality(String),\n TargetService String,\n DeploymentEnv LowCardinality(String),\n CallCount SimpleAggregateFunction(sum, UInt64),\n ErrorCount SimpleAggregateFunction(sum, UInt64),\n DurationSumMs SimpleAggregateFunction(sum, Float64),\n MaxDurationMs SimpleAggregateFunction(max, Float64),\n SampledSpanCount SimpleAggregateFunction(sum, UInt64),\n UnsampledSpanCount SimpleAggregateFunction(sum, UInt64),\n SampleRateSum SimpleAggregateFunction(sum, Float64)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY toDate(Hour)\nORDER BY (OrgId, Hour, DeploymentEnv, SourceService, TargetService)\nTTL toDate(Hour) + INTERVAL 90 DAY", diff --git a/packages/query-engine/src/ch/queries/service-map.ts b/packages/query-engine/src/ch/queries/service-map.ts index bcb8b92f..0767a779 100644 --- a/packages/query-engine/src/ch/queries/service-map.ts +++ b/packages/query-engine/src/ch/queries/service-map.ts @@ -443,7 +443,7 @@ WHERE NOT ( WHERE OrgId = '${esc(params.orgId)}' AND SourceService = '${esc(opts.serviceName)}' AND Hour >= toStartOfHour(toDateTime('${esc(params.startTime)}')) - AND Hour <= toDateTime('${esc(params.endTime)}') + AND Hour < toStartOfHour(toDateTime('${esc(params.endTime)}')) AND ParentServerAddress != '' ${envFilterRes} ) From 73ec71fc9fdd04aa9723e9790a298670f67b8199 Mon Sep 17 00:00:00 2001 From: Makisuo Date: Wed, 20 May 2026 13:18:46 +0200 Subject: [PATCH 3/3] Update service-dependencies-tab.tsx --- .../services/service-dependencies-tab.tsx | 67 ++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/apps/web/src/components/services/service-dependencies-tab.tsx b/apps/web/src/components/services/service-dependencies-tab.tsx index 30c90e4a..b072e77c 100644 --- a/apps/web/src/components/services/service-dependencies-tab.tsx +++ b/apps/web/src/components/services/service-dependencies-tab.tsx @@ -183,6 +183,57 @@ export function ServiceDependenciesTab({ return out }, [servicesResult, dbsResult, externalResult, serviceName, durationSeconds]) + // Fold HTTP rows that look like a known internal service into that service's + // row. The address-resolutions rollup eventually catches this server-side + // via exact `ParentServerAddress` match — but only after the hourly tick, + // and only when the Client span carries `server.address` (not `http.host`). + // In the meantime we'd render two rows for the same logical target + // (`SERVICE artifacts-api` AND `HTTP http://prd-artifacts-api`), which reads + // as a duplicate. + // + // Heuristic: an HTTP target whose hostname *contains* a known internal + // service name (>=5 chars, so generic names like `api` don't false-match) + // is treated as a hostname-variant of that service. The HTTP row drops out + // of the visible list; the SERVICE row gains a `via host1, host2` subtitle. + const dedupedRows = useMemo(() => { + const serviceNames = rows + .filter((r) => r.kind === "service" && r.name.length >= 5) + .map((r) => ({ canonical: r.name, lower: r.name.toLowerCase() })) + + if (serviceNames.length === 0) return rows + + // Map from canonical service name → list of HTTP hostnames that resolve here. + const matchedHosts = new Map() + // IDs of HTTP rows to hide (those that matched at least one service). + const hiddenIds = new Set() + + for (const row of rows) { + if (row.kind !== "http") continue + const hostLower = row.name.toLowerCase() + for (const svc of serviceNames) { + if (hostLower.includes(svc.lower)) { + const list = matchedHosts.get(svc.canonical) ?? [] + list.push(row.name) + matchedHosts.set(svc.canonical, list) + hiddenIds.add(row.id) + break + } + } + } + + if (hiddenIds.size === 0) return rows + + return rows.flatMap((row) => { + if (hiddenIds.has(row.id)) return [] + if (row.kind !== "service") return [row] + const hosts = matchedHosts.get(row.name) + if (!hosts?.length) return [row] + const subtitle = + hosts.length === 1 ? `via ${hosts[0]}` : `via ${hosts[0]} +${hosts.length - 1} more` + return [{ ...row, subtitle }] + }) + }, [rows]) + const isWaiting = (Result.isSuccess(servicesResult) && servicesResult.waiting) || (Result.isSuccess(dbsResult) && dbsResult.waiting) || @@ -192,9 +243,9 @@ export function ServiceDependenciesTab({ // label + sharp value so the eye lands on the numbers first, the labels // second — same hierarchy the chart cards on Overview already use. const summary = useMemo(() => { - if (rows.length === 0) return null + if (dedupedRows.length === 0) return null - const byKind = rows.reduce>( + const byKind = dedupedRows.reduce>( (acc, row) => { acc[row.kind] = (acc[row.kind] ?? 0) + 1 return acc @@ -206,21 +257,21 @@ export function ServiceDependenciesTab({ .map((k) => `${byKind[k]} ${labelFor(k, byKind[k])}`) .join(" · ") - const topByCalls = [...rows].sort((a, b) => b.callsPerSec - a.callsPerSec)[0] - const topByErrors = [...rows] + const topByCalls = [...dedupedRows].sort((a, b) => b.callsPerSec - a.callsPerSec)[0] + const topByErrors = [...dedupedRows] .filter((r) => r.errorRate > 0) .sort((a, b) => b.errorRate - a.errorRate)[0] - const topByLatency = [...rows].sort((a, b) => b.p95DurationMs - a.p95DurationMs)[0] + const topByLatency = [...dedupedRows].sort((a, b) => b.p95DurationMs - a.p95DurationMs)[0] return { breakdown, topByCalls, topByErrors, topByLatency } - }, [rows]) + }, [dedupedRows]) return (
{summary ? (
- {rows.length}{" "} + {dedupedRows.length}{" "} downstream {summary.breakdown} @@ -250,7 +301,7 @@ export function ServiceDependenciesTab({