Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions app/pipeline/contracts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@

from .asr import ASRProvider, ASRRequest, ASRResult
from .artifacts import (
ARTIFACT_MANIFEST_VERSION,
AsyncUploadReader,
AudioArtifactIndex,
ArtifactManifestEntry,
PersistedTranscriptionArtifacts,
SavedUploadArtifact,
TranscriptionArtifactStore,
TranscriptionArtifactWriteRequest,
UploadPersistenceRequest,
build_artifact_manifest,
)
from .context import PipelineContext
from .diarization import (
Expand Down Expand Up @@ -48,13 +51,15 @@
"ASRProvider",
"ASRRequest",
"ASRResult",
"ARTIFACT_MANIFEST_VERSION",
"AsyncUploadReader",
"AudioArtifactIndex",
"AudioEnhancementProvider",
"AudioEnhancementRequest",
"AudioEnhancementResult",
"AudioNormalizationRequest",
"AudioNormalizationResult",
"ArtifactManifestEntry",
"DiarizationProvider",
"DiarizationRequest",
"DiarizationResult",
Expand All @@ -76,4 +81,5 @@
"VoiceprintMatchProvider",
"VoiceprintMatchRequest",
"VoiceprintMatchResult",
"build_artifact_manifest",
]
48 changes: 48 additions & 0 deletions app/pipeline/contracts/artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from pathlib import Path
from typing import Any, Protocol, runtime_checkable

# Schema version tag; build_artifact_manifest() emits it as the manifest's
# "manifest_version" value.
ARTIFACT_MANIFEST_VERSION = "artifact_manifest.v1"


class AsyncUploadReader(Protocol):
"""Minimal async file interface used by UploadFile and test doubles."""
Expand Down Expand Up @@ -64,6 +66,49 @@ class PersistedTranscriptionArtifacts:
embedding_paths: dict[str, Path]


@dataclass(frozen=True, slots=True)
class ArtifactManifestEntry:
"""Public-safe artifact descriptor embedded in completed results.

This intentionally describes artifact names and roles without exposing
host-local paths. Clients may ignore the whole manifest.
"""

name: str
filename: str
role: str
media_type: str
required_for_result: bool = False
speaker_label: str | None = None

def as_dict(self) -> dict[str, Any]:
payload: dict[str, Any] = {
"name": self.name,
"filename": self.filename,
"role": self.role,
"media_type": self.media_type,
"required_for_result": self.required_for_result,
}
if self.speaker_label is not None:
payload["speaker_label"] = self.speaker_label
return payload


def build_artifact_manifest(
    stable: list[ArtifactManifestEntry],
    optional: list[ArtifactManifestEntry] | None = None,
    experimental: list[ArtifactManifestEntry] | None = None,
) -> dict[str, Any]:
    """Assemble the artifact manifest attached to a completed transcription.

    Each category list is serialised through ``as_dict()``. A missing (or
    empty) ``optional`` / ``experimental`` argument produces an empty list
    under the corresponding key.
    """

    manifest: dict[str, Any] = {"manifest_version": ARTIFACT_MANIFEST_VERSION}
    categories = (
        ("stable", stable),
        ("optional", optional or []),
        ("experimental", experimental or []),
    )
    for category, entries in categories:
        manifest[category] = [entry.as_dict() for entry in entries]
    return manifest


@runtime_checkable
class TranscriptionArtifactStore(Protocol):
"""Stable slot for persisting completed transcription artifacts."""
Expand All @@ -74,11 +119,14 @@ def persist_transcription(


# Names re-exported by `from <this module> import *`.
__all__ = [
"ARTIFACT_MANIFEST_VERSION",
"AsyncUploadReader",
"AudioArtifactIndex",
"ArtifactManifestEntry",
"PersistedTranscriptionArtifacts",
"SavedUploadArtifact",
"TranscriptionArtifactStore",
"TranscriptionArtifactWriteRequest",
"UploadPersistenceRequest",
"build_artifact_manifest",
]
33 changes: 32 additions & 1 deletion app/providers/artifacts/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@
from pathlib import Path

from config import DENOISE_MODEL, DENOISE_SNR_THRESHOLD
from infra.audio.paths import safe_speaker_label
from infra.transcription_artifacts import persist_transcription_artifacts
from pipeline.contracts import PipelineContext, PipelineResult
from pipeline.contracts import (
ArtifactManifestEntry,
PipelineContext,
PipelineResult,
build_artifact_manifest,
)


class InMemoryArtifactsProvider:
Expand Down Expand Up @@ -89,6 +95,7 @@ def _build_transcription(self, context: PipelineContext) -> dict | None:
context.aligned_segments,
context.voiceprint_matches,
)
embedding_labels = sorted(context.speaker_embeddings)
warning = None
if not context.voiceprint_matches and not context.speaker_embeddings:
warning = "no_speakers_detected"
Expand All @@ -111,6 +118,7 @@ def _build_transcription(self, context: PipelineContext) -> dict | None:
"max_speakers": context.request.max_speakers,
"no_repeat_ngram_size": context.request.no_repeat_ngram_size or 0,
},
"artifacts": self._build_artifact_manifest(embedding_labels),
}
if context.transcription_result is not None:
guard_report = context.transcription_result.get("hallucination_guard")
Expand All @@ -123,6 +131,29 @@ def _build_transcription(self, context: PipelineContext) -> dict | None:
transcription["warning"] = warning
return transcription

@staticmethod
def _build_artifact_manifest(speaker_labels: list[str]) -> dict:
    """Build the manifest for ``result.json`` plus one embedding per label.

    The primary result entry always comes first and is the only one marked
    ``required_for_result``. Embedding entries follow in the order of
    ``speaker_labels``; their filenames go through ``safe_speaker_label``
    (presumably to make the label filename-safe - confirm in
    ``infra.audio.paths``). Everything is placed in the ``stable`` category.
    """
    primary_result = ArtifactManifestEntry(
        name="result",
        filename="result.json",
        role="primary_result",
        media_type="application/json",
        required_for_result=True,
    )
    embeddings = [
        ArtifactManifestEntry(
            name="speaker_embedding",
            filename=f"emb_{safe_speaker_label(label)}.npy",
            role="speaker_embedding",
            media_type="application/octet-stream",
            speaker_label=label,
        )
        for label in speaker_labels
    ]
    return build_artifact_manifest(stable=[primary_result, *embeddings])

def build(self, context: PipelineContext) -> PipelineResult:
transcription = self._build_transcription(context)
artifact_paths = None
Expand Down
34 changes: 33 additions & 1 deletion doc/api.en.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,28 @@ practice, omit `denoise_model` to inherit `DENOISE_MODEL`, send
"max_speakers": 0,
"no_repeat_ngram_size": 0
},
"artifacts": {
"manifest_version": "artifact_manifest.v1",
"stable": [
{
"name": "result",
"filename": "result.json",
"role": "primary_result",
"media_type": "application/json",
"required_for_result": true
},
{
"name": "speaker_embedding",
"filename": "emb_SPEAKER_00.npy",
"role": "speaker_embedding",
"media_type": "application/octet-stream",
"required_for_result": false,
"speaker_label": "SPEAKER_00"
}
],
"optional": [],
"experimental": []
},
"alignment": {
"status": "succeeded",
"language": "en",
Expand All @@ -198,7 +220,8 @@ enrollment or rename call.
**Result contract anchors**: completed results report `status="completed"` in
the persisted transcription object. `segments[].speaker_label` is always the
raw diarization cluster label. `segments[].words` and top-level `alignment` are
optional metadata; clients must tolerate either field being absent.
optional metadata; top-level `artifacts` is optional as well. Clients must
tolerate these fields being absent.

`speaker_id` / `speaker_name`: matching uses an **adaptive threshold**, not a
fixed `0.75` cutoff. Actual logic:
Expand Down Expand Up @@ -259,6 +282,14 @@ no need to cross-reference the original request. See
[`configuration.en.md`](./configuration.en.md) for each setting's source and
default.

**`artifacts`** is an optional manifest describing the stable, optional, and
experimental artifacts that live alongside this result. Current stable entries
include the primary `result.json` and one `emb_<speaker_label>.npy` speaker
embedding per cluster. Each entry exposes only its `name`, `filename`, `role`,
`media_type`, `required_for_result` flag, and, when set, `speaker_label`; the
category is given by the list the entry appears in. The manifest does not
expose local paths, hosts, tokens, job runtime paths, or debug data. Default
clients do not need this field, and older results without `artifacts` remain
compatible.

Completed `GET /api/jobs/{id}` results and `GET /api/transcriptions/{id}` share the
same payload shape. That means `speaker_map` and `unique_speakers` are available in
the completed job result as well:
Expand Down Expand Up @@ -289,6 +320,7 @@ aggregation fields for UI / downstream consumers:
| --- | --- | --- |
| `speaker_map` | object | `speaker_label → {matched_id, matched_name, similarity, embedding_key}` mapping; reflects the **diarization model's voiceprint match result** and does not change when segments are manually corrected |
| `unique_speakers` | array[string] | Deduplicated list of speaker names, recalculated from the persisted `segments[].speaker_name` values to reflect the latest manual corrections |
| `artifacts` | object | Optional artifact manifest for stable / optional / experimental artifacts; clients must tolerate it being absent |

### `GET /api/export/{tr_id}`

Expand Down
32 changes: 31 additions & 1 deletion doc/api.zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,28 @@ curl -X POST http://localhost:8780/api/transcribe \
"max_speakers": 0,
"no_repeat_ngram_size": 0
},
"artifacts": {
"manifest_version": "artifact_manifest.v1",
"stable": [
{
"name": "result",
"filename": "result.json",
"role": "primary_result",
"media_type": "application/json",
"required_for_result": true
},
{
"name": "speaker_embedding",
"filename": "emb_SPEAKER_00.npy",
"role": "speaker_embedding",
"media_type": "application/octet-stream",
"required_for_result": false,
"speaker_label": "SPEAKER_00"
}
],
"optional": [],
"experimental": []
},
"alignment": {
"status": "succeeded",
"language": "zh",
Expand All @@ -191,7 +213,8 @@ curl -X POST http://localhost:8780/api/transcribe \

**结果契约锚点**:完成态持久化转写对象会带 `status="completed"`。
`segments[].speaker_label` 永远是原始 diarization cluster 标签。
`segments[].words` 和顶层 `alignment` 都是可选元数据,客户端必须能接受字段缺失。
`segments[].words`、顶层 `alignment` 和顶层 `artifacts` 都是可选元数据,
客户端必须能接受字段缺失。

`speaker_id` 和 `speaker_name`:匹配采用**自适应阈值**,不是固定 0.75。实际逻辑:

Expand Down Expand Up @@ -231,6 +254,12 @@ alignment 模型会记录为 `jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-c
都可独立解读,无需再查原始请求。各配置项来源和默认值见
[`configuration.zh.md`](./configuration.zh.md)。

**`artifacts`** 是可选 manifest,用于描述与该结果同目录的稳定、可选和实验性
artifact。当前稳定项包括主结果 `result.json` 和每个说话人 cluster 的
`emb_<speaker_label>.npy`。每个条目只暴露 `name`、`filename`、`role`、
`media_type`、`required_for_result`,以及(如有)`speaker_label`;类别由条目所在的
列表表示。manifest 不暴露本地路径、主机、token、job 运行路径或调试信息。
默认客户端不需要依赖该字段;老结果没有 `artifacts` 时仍应按兼容结果处理。

`GET /api/jobs/{id}` 的完成态结果与 `GET /api/transcriptions/{id}` 使用同一份
持久化结果结构,因此完成态里同样会带上 `speaker_map` 和 `unique_speakers`:

Expand Down Expand Up @@ -260,6 +289,7 @@ alignment 模型会记录为 `jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-c
| --- | --- | --- |
| `speaker_map` | object | `speaker_label → {matched_id, matched_name, similarity, embedding_key}` 的映射,反映 **diarization 模型的声纹匹配结果**,不随人工单段纠错变化;便于前端一次性渲染人名下拉 / 统计 |
| `unique_speakers` | array[string] | 去重后的说话人名列表,从持久化结果里的 `segments[].speaker_name` 重算,反映最新的人工纠错结果 |
| `artifacts` | object | 可选 artifact manifest;用于发现结果相关的稳定 / 可选 / 实验 artifact,缺失时必须兼容 |

与 `GET /api/jobs/{id}` 不同,本端点始终从磁盘读取持久化结果,**进程重启后仍可访问**,
也能反映最新的人工纠错;`/api/jobs/{id}` 优先读内存,内存未命中时才回落到磁盘(见上方注意事项)。
Expand Down
9 changes: 9 additions & 0 deletions doc/changelog.en.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@

## Unreleased

### Features

- Added an optional `artifacts` manifest to completed results. The manifest
lists artifact category, role, filename, media type, and `speaker_label`
where relevant; it does not expose local paths, job runtime paths, hosts,
tokens, or debug data. Default clients can continue to rely only on the
`result.json` primary view, and must treat unknown or missing `artifacts`
fields as compatible.

## 0.7.6 — Health, alignment, and embedding runtime fixes (2026-05-07)

### Security
Expand Down
7 changes: 7 additions & 0 deletions doc/changelog.zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@

## Unreleased

### 功能

- 新增完成态结果的可选 `artifacts` manifest。该字段只列出当前结果相关 artifact
的类别、角色、文件名、媒体类型和 `speaker_label`,不暴露本地路径、job 运行路径、
host、token 或调试信息。默认客户端仍只需读取 `result.json` 主视图;未知或缺失
`artifacts` 字段必须被视为兼容。

## 0.7.6 — 健康检查、alignment 与 embedding 运行时修复 (2026-05-07)

### 安全
Expand Down
7 changes: 6 additions & 1 deletion doc/configuration.en.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,17 @@ Stable anchors in completed transcription results:
- Top-level `alignment`: optional forced-alignment metadata, sanitized.
- Top-level `params`: effective per-job processing settings, including request
overrides and service defaults used for this result.
- Top-level `artifacts`: optional artifact manifest listing stable / optional /
experimental artifact filenames, roles, categories, media types, and
`speaker_label` values; it never exposes local paths, hosts, tokens, or debug
data.
- `speaker_map`: diarization cluster to voiceprint match map; manual segment
corrections do not rewrite it.
- `unique_speakers`: deduplicated current segment display names.

New fields are added under the optional-field principle. Clients should ignore
unknown fields and tolerate missing `words`, `alignment`, and `warning`.
unknown fields and tolerate missing `words`, `alignment`, `artifacts`, and
`warning`.

## v0.7.6 Validation Wording

Expand Down
5 changes: 4 additions & 1 deletion doc/configuration.zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,14 @@ cohort 生命周期:
- `segments[].words`:可选词级 alignment。
- 顶层 `alignment`:可选 forced-alignment 元数据,字段内容会脱敏。
- 顶层 `params`:记录本次任务实际使用的请求级与服务级处理参数,便于离线解释结果。
- 顶层 `artifacts`:可选 artifact manifest,只列出稳定 / 可选 / 实验 artifact 的
文件名、角色、类别、媒体类型和 `speaker_label`;不暴露本地路径、主机、token 或
调试信息。
- `speaker_map`:diarization cluster 到声纹匹配结果的映射;人工改单段说话人不会回写它。
- `unique_speakers`:按当前 segment 展示名去重后的列表。

新增字段按可选字段原则扩展;客户端应忽略不认识的字段,并容忍 `words` /
`alignment` / `warning` 缺失。
`alignment` / `artifacts` / `warning` 缺失。

## v0.7.6 验证口径

Expand Down
Loading
Loading