From fca6676e7a450f01dc8e2925a0195ec851750920 Mon Sep 17 00:00:00 2001 From: chris-colinsky Date: Fri, 15 May 2026 17:42:10 -0700 Subject: [PATCH 1/6] feat(llm): add ProviderUnsupportedContentBlock error category MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the provider_unsupported_content_block canonical category from llm-provider §7 (introduced by proposal 0015). Raised when the bound model does not support a content block type used in the request (e.g., a text-only model received an image block, or the model supports images but not the requested media_type or source variant). The exception carries block_type and reason attributes so callers can route on the specific unsupported case; mirrors the precedent StructuredOutputInvalid set in PR-1 (carry the structured payload the caller needs for diagnostics + recovery). Non-transient by default — NOT added to TRANSIENT_CATEGORIES. The bound model's capability set doesn't change between calls, so retrying without changing the request, the bound model, or the provider won't succeed. Users who want fallback semantics MAY route on the category in a userland middleware (e.g., switch to a multimodal-capable provider). Distinct from ProviderInvalidRequest: ProviderInvalidRequest covers spec-shape violations (the request is malformed); this category covers capability mismatches (the request is well-formed but the bound model can't fulfill it). 
--- src/openarmature/llm/__init__.py | 4 +++ src/openarmature/llm/errors.py | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/openarmature/llm/__init__.py b/src/openarmature/llm/__init__.py index 2b91e9f..90dfdfe 100644 --- a/src/openarmature/llm/__init__.py +++ b/src/openarmature/llm/__init__.py @@ -30,6 +30,7 @@ PROVIDER_MODEL_NOT_LOADED, PROVIDER_RATE_LIMIT, PROVIDER_UNAVAILABLE, + PROVIDER_UNSUPPORTED_CONTENT_BLOCK, STRUCTURED_OUTPUT_INVALID, TRANSIENT_CATEGORIES, LlmProviderError, @@ -40,6 +41,7 @@ ProviderModelNotLoaded, ProviderRateLimit, ProviderUnavailable, + ProviderUnsupportedContentBlock, StructuredOutputInvalid, ) from .messages import ( @@ -69,6 +71,7 @@ "PROVIDER_MODEL_NOT_LOADED", "PROVIDER_RATE_LIMIT", "PROVIDER_UNAVAILABLE", + "PROVIDER_UNSUPPORTED_CONTENT_BLOCK", "STRUCTURED_OUTPUT_INVALID", "TRANSIENT_CATEGORIES", "AssistantMessage", @@ -85,6 +88,7 @@ "ProviderModelNotLoaded", "ProviderRateLimit", "ProviderUnavailable", + "ProviderUnsupportedContentBlock", "Response", "RuntimeConfig", "StructuredOutputInvalid", diff --git a/src/openarmature/llm/errors.py b/src/openarmature/llm/errors.py index 14eedc3..9af765a 100644 --- a/src/openarmature/llm/errors.py +++ b/src/openarmature/llm/errors.py @@ -29,6 +29,7 @@ PROVIDER_RATE_LIMIT = "provider_rate_limit" PROVIDER_INVALID_RESPONSE = "provider_invalid_response" PROVIDER_INVALID_REQUEST = "provider_invalid_request" +PROVIDER_UNSUPPORTED_CONTENT_BLOCK = "provider_unsupported_content_block" STRUCTURED_OUTPUT_INVALID = "structured_output_invalid" @@ -137,6 +138,48 @@ class ProviderInvalidRequest(LlmProviderError): category = PROVIDER_INVALID_REQUEST +# Non-transient by default — the bound model's capability set does +# not change between calls, so retrying without changing the request +# (the message list, the bound model, or the provider) will not +# succeed. +# +# Distinct from ProviderInvalidRequest. 
ProviderInvalidRequest covers +# spec-shape violations (the request is malformed at the wire layer); +# ProviderUnsupportedContentBlock covers capability mismatches (the +# request is well-formed but the bound model can't fulfill it). +# Splitting them lets callers route the unsupported-content case +# differently (e.g., fall back to a multimodal-capable provider) +# without overloading the malformed-request category. +class ProviderUnsupportedContentBlock(LlmProviderError): + """Raised when the bound model does not support a content block + type used in the request. + + Examples: a text-only model received an image block, or the model + supports images but not the requested ``media_type`` or ``source`` + variant. + + Attributes: + block_type: The block type that was rejected (e.g., ``"image"``), + when the provider's response makes this identifiable. + reason: The provider's human-readable description of the + rejection, when available. + """ + + category = PROVIDER_UNSUPPORTED_CONTENT_BLOCK + block_type: str | None + reason: str | None + + def __init__( + self, + *args: Any, + block_type: str | None = None, + reason: str | None = None, + ) -> None: + super().__init__(*args) + self.block_type = block_type + self.reason = reason + + # Non-transient by default — a model that fails schema compliance on a # given prompt usually fails the same way on retry. The default # RetryMiddleware classifier does NOT retry this category. 
Users wanting @@ -184,6 +227,7 @@ def __init__( "PROVIDER_MODEL_NOT_LOADED", "PROVIDER_RATE_LIMIT", "PROVIDER_UNAVAILABLE", + "PROVIDER_UNSUPPORTED_CONTENT_BLOCK", "STRUCTURED_OUTPUT_INVALID", "TRANSIENT_CATEGORIES", "LlmProviderError", @@ -194,5 +238,6 @@ def __init__( "ProviderModelNotLoaded", "ProviderRateLimit", "ProviderUnavailable", + "ProviderUnsupportedContentBlock", "StructuredOutputInvalid", ] From 33d702b5a47e9afbd62181f75030263d8f9fb81b Mon Sep 17 00:00:00 2001 From: chris-colinsky Date: Fri, 15 May 2026 17:43:41 -0700 Subject: [PATCH 2/6] feat(llm): content-block types + UserMessage extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the content-block surface from llm-provider §3.1 (proposal 0015): - TextBlock(type, text) with a non-empty-text validator - ImageSourceURL(type, url) and ImageSourceInline(type, base64_data), joined by an ImageSource discriminated union over the source's ``type`` field - ImageBlock(type, source, media_type, detail) with a validator that rejects inline sources missing a media_type. detail defaults to None so the wire omits the field unless explicitly set (providers apply their own conceptual default of "auto"); the docstring spells out the subtle case of an explicit detail="auto" - ContentBlock discriminated union over TextBlock | ImageBlock UserMessage.content becomes ``str | list[ContentBlock]``. The existing _check_content validator extends to enforce the non-empty rule on both shapes. Other roles (system, assistant, tool) stay text-string only — content blocks are user-only in v1 per the spec. media_type is typed as ``str | None`` (not a Literal of the three guaranteed types) so callers can pass additional image/* types providers document support for. 
--- src/openarmature/llm/__init__.py | 12 +++ src/openarmature/llm/messages.py | 135 +++++++++++++++++++++++++++++-- 2 files changed, 142 insertions(+), 5 deletions(-) diff --git a/src/openarmature/llm/__init__.py b/src/openarmature/llm/__init__.py index 90dfdfe..2939063 100644 --- a/src/openarmature/llm/__init__.py +++ b/src/openarmature/llm/__init__.py @@ -46,8 +46,14 @@ ) from .messages import ( AssistantMessage, + ContentBlock, + ImageBlock, + ImageSource, + ImageSourceInline, + ImageSourceURL, Message, SystemMessage, + TextBlock, Tool, ToolCall, ToolMessage, @@ -75,7 +81,12 @@ "STRUCTURED_OUTPUT_INVALID", "TRANSIENT_CATEGORIES", "AssistantMessage", + "ContentBlock", "FinishReason", + "ImageBlock", + "ImageSource", + "ImageSourceInline", + "ImageSourceURL", "LlmProviderError", "Message", "OpenAIProvider", @@ -93,6 +104,7 @@ "RuntimeConfig", "StructuredOutputInvalid", "SystemMessage", + "TextBlock", "Tool", "ToolCall", "ToolMessage", diff --git a/src/openarmature/llm/messages.py b/src/openarmature/llm/messages.py index fab9de7..44987be 100644 --- a/src/openarmature/llm/messages.py +++ b/src/openarmature/llm/messages.py @@ -94,17 +94,136 @@ def _check_content(self) -> SystemMessage: return self +class TextBlock(BaseModel): + """Text content block. The content-array equivalent of a plain + text-string user message; a user message with exactly one + ``TextBlock(text=T)`` is normatively equivalent to one with + ``content=T``. + + Attributes: + type: The discriminator literal ``"text"``. + text: A non-empty string. + """ + + model_config = ConfigDict(extra="forbid") + + type: Literal["text"] = "text" + text: str + + @model_validator(mode="after") + def _check_text(self) -> TextBlock: + if not self.text: + raise ValueError("text block: text MUST be a non-empty string") + return self + + +class ImageSourceURL(BaseModel): + """URL-referenced image source. The URL is passed to the provider + unchanged; the framework does not fetch, cache, or transform it. 
+ + Attributes: + type: The discriminator literal ``"url"``. + url: The image URL. MAY be ``http(s)://``, ``data:`` (RFC 2397 + inline data URI), or another scheme the provider documents + support for. + """ + + model_config = ConfigDict(extra="forbid") + + type: Literal["url"] = "url" + url: str + + +class ImageSourceInline(BaseModel): + """Inline base64-encoded image source. The framework does not + inspect, transcode, or re-encode the bytes; the parent ``ImageBlock`` + MUST carry a ``media_type`` for inline sources. + + Attributes: + type: The discriminator literal ``"inline"``. + base64_data: The base64-encoded image bytes. + """ + + model_config = ConfigDict(extra="forbid") + + type: Literal["inline"] = "inline" + base64_data: str + + +# Discriminated union over the two image-source shapes. The +# discriminator is the source's ``type`` field, matching the spec's +# "single image block carries exactly one source — url XOR inline. +# The discriminator is the type field on the source itself." +ImageSource = Annotated[ + ImageSourceURL | ImageSourceInline, + Field(discriminator="type"), +] + + +class ImageBlock(BaseModel): + """Image content block. Carries one source (URL or inline base64), + a conditional ``media_type`` (required for inline sources; ignored + for URL sources), and an optional ``detail`` hint. + + The class-level default of ``detail=None`` preserves the + omit-by-default wire behavior: providers apply their own + conceptual default (``"auto"``) when ``detail`` is absent from the + wire payload. To force the wire to carry an explicit ``"auto"``, + set ``detail="auto"`` on the block. + + Attributes: + type: The discriminator literal ``"image"``. + source: One of ``ImageSourceURL`` or ``ImageSourceInline``. + media_type: IANA media type. Required when source is inline; + ignored when source is a URL. 
Providers MUST accept + ``image/png``, ``image/jpeg``, ``image/webp`` at minimum + and MAY accept additional ``image/*`` types they document + support for. + detail: Image-processing fidelity hint. One of ``"auto"``, + ``"low"``, ``"high"``. ``None`` (the default) omits the + field from the wire. + """ + + model_config = ConfigDict(extra="forbid") + + type: Literal["image"] = "image" + source: ImageSource + media_type: str | None = None + detail: Literal["auto", "low", "high"] | None = None + + @model_validator(mode="after") + def _check_media_type_for_inline(self) -> ImageBlock: + if isinstance(self.source, ImageSourceInline) and self.media_type is None: + raise ValueError("image block: media_type is required when source is inline") + return self + + +# Discriminated union over the two content-block shapes. The +# discriminator is the block's ``type`` field, matching the spec's +# "typed record with a discriminator field identifying the block +# type." +ContentBlock = Annotated[ + TextBlock | ImageBlock, + Field(discriminator="type"), +] + + class UserMessage(_MessageBase): - """User messages have non-empty ``content``; no tool_calls; no - tool_call_id.""" + """User messages carry content as either a non-empty text string + or a non-empty ordered sequence of content blocks (text and/or + image). 
No tool_calls; no tool_call_id.""" role: Literal["user"] = "user" - content: str + content: str | list[ContentBlock] @model_validator(mode="after") def _check_content(self) -> UserMessage: - if not self.content: - raise ValueError("user message: content MUST be a non-empty string") + if isinstance(self.content, str): + if not self.content: + raise ValueError("user message: content MUST be a non-empty string") + else: + if not self.content: + raise ValueError("user message: content MUST be a non-empty list of content blocks") return self @@ -151,8 +270,14 @@ class ToolMessage(_MessageBase): __all__ = [ "AssistantMessage", + "ContentBlock", + "ImageBlock", + "ImageSource", + "ImageSourceInline", + "ImageSourceURL", "Message", "SystemMessage", + "TextBlock", "Tool", "ToolCall", "ToolMessage", From 89b15ddbe008f8a086b617b57f95192fab6f0426 Mon Sep 17 00:00:00 2001 From: chris-colinsky Date: Fri, 15 May 2026 17:47:17 -0700 Subject: [PATCH 3/6] feat(llm/openai): content-array wire mapping + content-rejection mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two extensions in OpenAIProvider for proposal 0015: - _message_to_wire's user case now branches on content shape: string maps directly (the v0.4.0 form); a content-block sequence maps to OpenAI's content-array form per §8.1.1 via the new _block_to_wire helper. TextBlock → {type: "text", text}. ImageBlock(URL) → {type: "image_url", image_url: {url, detail?}}. ImageBlock(inline) constructs an RFC 2397 data: URI from media_type + base64_data and routes through the same image_url entry shape. The detail hint goes on the wire only when the spec block has it set (None on the spec block omits it from the wire; providers apply their own default of "auto" per §3.1.2). - classify_http_error's 400 branch now routes content-rejection bodies to ProviderUnsupportedContentBlock rather than the generic ProviderInvalidRequest. 
Detection is a heuristic on error.code (known set: image_content_not_supported, unsupported_image_media_type, audio_content_not_supported, video_content_not_supported, unsupported_content_block; plus an image+not_supported substring fallback), error.type (image_parse_error, image_content_not_supported), and error.message ("does not support" + image/audio/video). The spec is implementation-defined on the detection rule (§8.3); the heuristic lives inline so it's evolvable as OpenAI's error-code surface shifts. _extract_rejected_block_type pulls a best-effort "image" / "audio" / "video" identifier out of the error code or message for surfacing on ProviderUnsupportedContentBlock.block_type. --- src/openarmature/llm/providers/openai.py | 106 ++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/src/openarmature/llm/providers/openai.py b/src/openarmature/llm/providers/openai.py index 682dfa9..1e1574f 100644 --- a/src/openarmature/llm/providers/openai.py +++ b/src/openarmature/llm/providers/openai.py @@ -69,12 +69,16 @@ ProviderModelNotLoaded, ProviderRateLimit, ProviderUnavailable, + ProviderUnsupportedContentBlock, StructuredOutputInvalid, ) from ..messages import ( AssistantMessage, + ContentBlock, + ImageSourceInline, Message, SystemMessage, + TextBlock, Tool, ToolCall, UserMessage, @@ -641,7 +645,15 @@ def _message_to_wire(msg: Message) -> dict[str, Any]: if isinstance(msg, SystemMessage): return {"role": "system", "content": msg.content} if isinstance(msg, UserMessage): - return {"role": "user", "content": msg.content} + # Dual-shape user content (§8.1): string maps directly; a + # content-block sequence maps to OpenAI's content-array form + # per §8.1.1. 
+ if isinstance(msg.content, str): + return {"role": "user", "content": msg.content} + return { + "role": "user", + "content": [_block_to_wire(block) for block in msg.content], + } if isinstance(msg, AssistantMessage): # Tool-call-only assistants emit ``"content": null`` on the # wire — that's the OpenAI convention for "no textual reply, @@ -672,6 +684,27 @@ def _message_to_wire(msg: Message) -> dict[str, Any]: } +# Spec §8.1.1: content-block to OpenAI content-array entry mapping. +# Both URL-referenced and inline-base64 image blocks go through +# OpenAI's `image_url` entry shape; the inline case is expressed as +# an RFC 2397 data: URI carrying media_type + base64_data. The +# `detail` hint goes on the wire only when explicitly set on the spec +# block (None on the spec block omits it from the wire; providers +# apply their own conceptual default of "auto"). +def _block_to_wire(block: ContentBlock) -> dict[str, Any]: + if isinstance(block, TextBlock): + return {"type": "text", "text": block.text} + # ImageBlock + if isinstance(block.source, ImageSourceInline): + url = f"data:{block.media_type};base64,{block.source.base64_data}" + else: + url = block.source.url + image_url: dict[str, Any] = {"url": url} + if block.detail is not None: + image_url["detail"] = block.detail + return {"type": "image_url", "image_url": image_url} + + def _tool_to_wire(tool: Tool) -> dict[str, Any]: return { "type": "function", @@ -762,6 +795,16 @@ def classify_http_error(resp: httpx.Response) -> LlmProviderError: if status in (401, 403): return ProviderAuthentication(message or f"HTTP {status}") if status == 400: + # Spec §8.3: HTTP 400 bodies that indicate the bound model + # rejected a content block map to provider_unsupported_content_block + # rather than the generic provider_invalid_request. The + # detection rule is implementation-defined. 
+ if _looks_like_content_rejection(error_code, error_type, message): + return ProviderUnsupportedContentBlock( + message or "HTTP 400 (content block not supported)", + block_type=_extract_rejected_block_type(error_code, message), + reason=message, + ) return ProviderInvalidRequest(message or "HTTP 400") if status == 404: # 404 with model-not-found body → invalid_model. @@ -782,6 +825,67 @@ def classify_http_error(resp: httpx.Response) -> LlmProviderError: return ProviderUnavailable(message or f"HTTP {status}") +# Known OpenAI error codes for content-block rejections. Used by +# classify_http_error's 400 branch to route to +# ProviderUnsupportedContentBlock instead of ProviderInvalidRequest. +# The list is best-effort and evolves as OpenAI's error-code surface +# shifts; the substring fallback below catches near-misses. +_CONTENT_REJECTION_ERROR_CODES = frozenset( + { + "image_content_not_supported", + "unsupported_image_media_type", + "audio_content_not_supported", + "video_content_not_supported", + "unsupported_content_block", + } +) + + +def _looks_like_content_rejection( + error_code: object, + error_type: object, + message: str | None, +) -> bool: + """Heuristic for HTTP 400 bodies that indicate the bound model + rejected a content block (image / audio / video / unsupported + media_type). 
Used to route to provider_unsupported_content_block + rather than the generic provider_invalid_request.""" + if isinstance(error_code, str): + if error_code in _CONTENT_REJECTION_ERROR_CODES: + return True + lower_code = error_code.lower() + if "image" in lower_code and ("not_supported" in lower_code or "unsupported" in lower_code): + return True + if isinstance(error_type, str) and error_type.lower() in { + "image_parse_error", + "image_content_not_supported", + }: + return True + if isinstance(message, str): + lower_msg = message.lower() + if "does not support" in lower_msg and ( + "image" in lower_msg or "audio" in lower_msg or "video" in lower_msg + ): + return True + return False + + +def _extract_rejected_block_type(error_code: object, message: str | None) -> str | None: + """Pull a best-effort block-type identifier (``"image"`` / ``"audio"`` + / ``"video"``) out of an error code or message, for surfacing on + ProviderUnsupportedContentBlock.block_type.""" + haystacks: list[str] = [] + if isinstance(error_code, str): + haystacks.append(error_code.lower()) + if isinstance(message, str): + haystacks.append(message.lower()) + for haystack in haystacks: + for block_type in ("image", "audio", "video"): + if block_type in haystack: + return block_type + return None + + def parse_retry_after(value: str | None) -> float | None: """Parse a ``Retry-After`` header value to a float seconds count. From 6f0e008e12b26baca977fba89c63528eaece9a38 Mon Sep 17 00:00:00 2001 From: chris-colinsky Date: Fri, 15 May 2026 17:49:57 -0700 Subject: [PATCH 4/6] test(conformance): drive 0015 fixtures 009-020 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the 12 deferred-skip rows for content-block fixtures from both _DEFERRED_FIXTURES dicts (test_llm_provider.py runtime + the test_fixture_parsing.py typed parser). 
_build_message in test_llm_provider.py extends the user case to pass raw["content"] through (str or list) unchanged; Pydantic's discriminated union on the content-block ``type`` field parses each dict in the list to the right TextBlock / ImageBlock variant automatically. LlmCallSpec.messages in harness/directives.py is already typed as list[dict[str, Any]] (permissive), so the typed parser accepts the content-block list-of-dicts shape without model extensions. The parsing tests slip past for the 009-020 fixtures via the same path PR-1's 021-028 used. All 28 llm-provider conformance fixtures now pass (the prior 16 plus the 12 new content-block ones). Full suite: 515 pass, 72 skipped (down from 84 — only the 16 deferred fixtures for proposals 0011 / 0014 / 0017 remain). --- tests/conformance/test_fixture_parsing.py | 5 +++++ tests/conformance/test_llm_provider.py | 22 ++++++---------------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/tests/conformance/test_fixture_parsing.py b/tests/conformance/test_fixture_parsing.py index ed85e19..c9a145a 100644 --- a/tests/conformance/test_fixture_parsing.py +++ b/tests/conformance/test_fixture_parsing.py @@ -51,6 +51,11 @@ def _id(case: tuple[str, Path]) -> str: ), "pipeline-utilities/045-state-migration-no-path-in-registry": "0014 state migration (PR-4)", "pipeline-utilities/046-state-migration-function-raises": "0014 state migration (PR-4)", + # proposal 0015's llm-provider fixtures (009-020) were removed + # from this list as part of PR-2; the typed harness parses the + # content-block message shape via LlmCallSpec's permissive + # ``messages: list[dict[str, Any]]`` typing without needing + # model extensions. 
} diff --git a/tests/conformance/test_llm_provider.py b/tests/conformance/test_llm_provider.py index 6cfc497..79f5bef 100644 --- a/tests/conformance/test_llm_provider.py +++ b/tests/conformance/test_llm_provider.py @@ -62,21 +62,7 @@ # Skip-marked here so a green test run at this commit means "everything we # claim to implement passes." Each subsequent PR drops its own rows as it # lands the underlying support. -_DEFERRED_FIXTURES: dict[str, str] = { - # proposal 0015 — multimodal images (PR-2 of the batch) - "009-content-blocks-text-only-equivalence": "0015 multimodal images (PR-2)", - "010-content-blocks-image-url": "0015 multimodal images (PR-2)", - "011-content-blocks-image-inline-base64": "0015 multimodal images (PR-2)", - "012-content-blocks-image-detail-hint": "0015 multimodal images (PR-2)", - "013-content-blocks-mixed-order-preserved": "0015 multimodal images (PR-2)", - "014-content-blocks-validation-empty-sequence": "0015 multimodal images (PR-2)", - "015-content-blocks-validation-empty-text-block": "0015 multimodal images (PR-2)", - "016-content-blocks-unsupported-by-model": "0015 multimodal images (PR-2)", - "017-content-blocks-system-message-text-only": "0015 multimodal images (PR-2)", - "018-content-blocks-image-source-missing": "0015 multimodal images (PR-2)", - "019-content-blocks-invalid-detail-value": "0015 multimodal images (PR-2)", - "020-content-blocks-inline-image-missing-media-type": "0015 multimodal images (PR-2)", -} +_DEFERRED_FIXTURES: dict[str, str] = {} def _fixture_paths() -> list[Path]: @@ -172,7 +158,11 @@ def _build_message(raw: Mapping[str, Any]) -> Message: if role == "system": return SystemMessage(content=cast("str", raw["content"])) if role == "user": - return UserMessage(content=cast("str", raw["content"])) + # Per spec §3, user content is str OR a list of content blocks. + # Pydantic's discriminated union on the block ``type`` field + # parses each dict in the list to the right TextBlock / + # ImageBlock variant automatically. 
+ return UserMessage(content=raw["content"]) if role == "assistant": tool_calls_raw = raw.get("tool_calls") tool_calls: list[ToolCall] | None = None From 027ae568043b7fff684e616df809f45cb3ce4e42 Mon Sep 17 00:00:00 2001 From: chris-colinsky Date: Fri, 15 May 2026 17:55:35 -0700 Subject: [PATCH 5/6] test+docs: content-block unit tests + docs + CHANGELOG entry Adds tests/unit/test_content_blocks.py (24 tests) covering bits the conformance fixtures don't exercise directly: - TextBlock / ImageBlock construction validation (non-empty text, inline-needs-media_type, detail enum, URL source can skip media_type) - UserMessage construction from dict-form content blocks (the path the conformance test fixture loader uses) - _block_to_wire mapping for text, URL with/without detail, inline base64 (RFC 2397 data URI construction) - classify_http_error 400 routing to ProviderUnsupportedContentBlock via the heuristic; negative cases (unrelated 400 stays ProviderInvalidRequest) - _extract_rejected_block_type picks up "image" / "audio" from error.code or error.message Docs: - docs/concepts/llms.md: new "Content blocks (multimodal user messages)" section between Structured output and Routing, covering the two content shapes, URL vs inline sources, the detail hint, and the new ProviderUnsupportedContentBlock category. - docs/model-providers/index.md: errors table extended to 9 categories with the new row + a Behaviour-guarantees note that OpenAIProvider does post-receive detection only; pre-send is a userland-middleware pattern. - docs/model-providers/authoring.md: "Beyond the skeleton" gains a content-blocks entry pointing custom-provider authors at the multimodal wire mapping + the unsupported-content category. CHANGELOG [Unreleased] gains 3 entries: the user-message content extension, the OpenAI wire mapping, and the new error category. All in the same release as PR-1's 0016 entries per the consolidated- release strategy. 
--- CHANGELOG.md | 3 + docs/concepts/llms.md | 111 +++++++++++++ docs/model-providers/authoring.md | 10 ++ docs/model-providers/index.md | 39 +++-- tests/unit/test_content_blocks.py | 267 ++++++++++++++++++++++++++++++ 5 files changed, 416 insertions(+), 14 deletions(-) create mode 100644 tests/unit/test_content_blocks.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 58bc816..3fabfa1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The ### Added +- **Image content blocks for user messages (proposal 0015, spec v0.13.0).** `UserMessage.content` now accepts `str | list[ContentBlock]`. The block surface introduces `TextBlock`, `ImageBlock`, `ImageSourceURL`, `ImageSourceInline`, and the `ContentBlock` / `ImageSource` discriminated unions over the block / source `type` field. `ImageBlock` carries a `media_type` (required for inline sources; ignored for URL sources; typed as `str | None` so callers MAY pass any `image/*` type the bound model supports) and an optional `detail` hint (`"auto"` / `"low"` / `"high"`; `None` default omits the field from the wire so providers apply their own default). System, assistant, and tool messages stay text-string-only; image inputs are user-only in v1. +- **`OpenAIProvider` content-array wire mapping.** When `UserMessage.content` is a content-block sequence, the wire body uses OpenAI's `content` array per §8.1.1. `TextBlock → {type: "text", text}`. `ImageBlock` with a URL source maps to `{type: "image_url", image_url: {url, detail?}}`. `ImageBlock` with an inline source constructs an RFC 2397 `data:<media_type>;base64,<base64_data>` URI and goes through the same `image_url` entry shape. Inline bytes pass through unchanged — no inspection, transcoding, or re-encoding. +- **New error category `ProviderUnsupportedContentBlock` (non-transient).** Raised when the bound model rejects a content block type / media variant. 
Distinct from `ProviderInvalidRequest` (which covers spec-shape malformation): this category surfaces a *capability* mismatch, letting callers route differently (e.g., fall back to a multimodal-capable provider) without overloading the malformed-request category. Carries `block_type` ("image" / "audio" / "video") and `reason` (provider's human-readable message) when those are recoverable from the rejection. `OpenAIProvider` detects content rejection via HTTP 400 bodies — heuristic on `error.code` (known set: `image_content_not_supported`, `unsupported_image_media_type`, `audio_content_not_supported`, etc.), `error.type` (`image_parse_error`), and `error.message` ("does not support" + image/audio/video). - **Structured output (proposal 0016, spec v0.14.0).** `Provider.complete()` now accepts an optional `response_schema` parameter — either a JSON Schema dict or a Pydantic `BaseModel` subclass. When supplied, the provider constrains the model's output to the schema and populates `Response.parsed` with the validated value (`dict` for dict-schema input, a `BaseModel` instance for class input). New `StructuredOutputInvalid` error category (non-transient by default) raises on JSON parse failure or schema validation failure; carries the requested schema, the raw response content, and a failure description. - **`OpenAIProvider` native response_format wire path.** When `response_schema` is supplied, the chat-completions request body carries `response_format: { type: "json_schema", json_schema: { name, schema, strict } }`. The `strict` flag is determined by a deep recursive walk over the schema (object-property required-coverage rule across `anyOf` / `oneOf` / `allOf` and `$ref` targets, with cycle protection); unresolvable refs fall through to `strict: false`. The `name` field uses `schema.title` when present, otherwise a deterministic sha256-prefix hash. 
- **`OpenAIProvider` prompt-augmentation fallback.** Constructor flag `force_prompt_augmentation_fallback: bool` (default `False`) and read-only inspect property `uses_prompt_augmentation_fallback: bool`. When the flag is on, structured-output calls build a fresh message list with a system directive containing the serialized schema, omit `response_format` from the wire, and validate the response post-receive. The caller's original `messages` list is never mutated. Use for OpenAI-compatible servers (older vLLM, some LM Studio releases, llama.cpp variants) that reject or silently ignore `response_format`. diff --git a/docs/concepts/llms.md b/docs/concepts/llms.md index c44b23d..9c8c154 100644 --- a/docs/concepts/llms.md +++ b/docs/concepts/llms.md @@ -221,6 +221,117 @@ on every object. Pydantic-derived schemas may need `model_config = ConfigDict(extra="forbid")` on the class to get the `additionalProperties: false` in the generated JSON Schema. +## Content blocks (multimodal user messages) + +User messages carry content in one of two shapes: a plain text string, +or an ordered sequence of typed content blocks. The string form is the +common case. Blocks are how you mix non-text modalities into a single +turn. v1 defines two block types: text and image. Audio and video are +deferred to future proposals. + +System, assistant, and tool messages stay text-string only. Image +inputs are user-only in v1; image outputs (assistant-message-borne +images, e.g. DALL-E-style generation) are out of scope. + +### Text and image blocks + +A text block is the array-form equivalent of a text-string message: +`TextBlock(text="describe this")`. A user message holding a single +text block is normatively equivalent to one with `content="describe +this"`. 
+ +An image block carries one source — URL or inline base64 — plus an +optional `detail` hint: + +```python +from openarmature.llm import ( + ImageBlock, + ImageSourceInline, + ImageSourceURL, + OpenAIProvider, + TextBlock, + UserMessage, +) + + +async def describe_image(provider: OpenAIProvider) -> str: + response = await provider.complete( + [ + UserMessage( + content=[ + ImageBlock( + source=ImageSourceURL(url="https://example.com/diagram.png"), + detail="high", # optional; omitted from wire when None + ), + TextBlock(text="What does this diagram show?"), + ] + ) + ] + ) + return response.message.content +``` + +Block order is preserved on the wire. Providers vary in whether they +treat order as semantically meaningful (an image followed by its +describing text is a different signal from text followed by the +image); construct the sequence in the order you want the model to +perceive it. + +### URL vs inline sources + +- **URL source** (`ImageSourceURL`): the provider fetches the URL. Any + scheme the provider documents support for is valid (`http(s)://`, + `data:`, etc.). The framework passes it through unchanged. +- **Inline source** (`ImageSourceInline`): the image is sent as + base64-encoded bytes in the request body. The `media_type` field on + the surrounding `ImageBlock` is **required** for inline sources (and + ignored for URL sources). The framework constructs an RFC 2397 + `data:<media_type>;base64,<base64_data>` URI for the wire; it does not + inspect, transcode, or re-encode the bytes. + +OpenAI, Anthropic, and Google all accept `image/png`, `image/jpeg`, +and `image/webp` as guaranteed media types. `media_type` is typed as +`str | None`, so callers MAY pass additional `image/*` types when +they know the bound model supports them; portable code sticks to the +three. + +### The `detail` hint + +`detail` is a per-image hint to the provider about processing +fidelity: `"auto"`, `"low"`, or `"high"`. 
The class default is `None`, +which **omits the field from the wire** and lets the provider apply +its own default (conceptually `"auto"`). Setting `detail="auto"` +explicitly on the spec block forces the wire to carry an explicit +`"auto"` — usually unnecessary, since the provider's default is the +same value. + +### When the model can't handle the block + +`provider_unsupported_content_block` raises when the bound model +rejects a content block type or media variant. Concrete cases: + +- A text-only model (e.g., `gpt-3.5-turbo`) received an image block. +- The model supports images but not the requested `media_type`. +- The model supports the type but rejected the specific source variant + (a URL the provider can't fetch, for example). + +The error category is **non-transient**: retrying without changing +the request, the bound model, or the provider won't succeed. Userland +fallback patterns (e.g., a middleware that routes to a multimodal +provider on this category) compose cleanly against it. + +`ProviderUnsupportedContentBlock` carries `block_type` ("image", +"audio", "video") and `reason` (the provider's human-readable +message) when those are recoverable from the rejection. + +`OpenAIProvider` detects content rejection via the response body — +HTTP 400 with an error code like `image_content_not_supported` or a +message like "does not support image inputs." Pre-send capability +checks (failing fast before the wire trip when you know the model +doesn't support images) live above the provider as userland +middleware — the provider doesn't ship a static model-capability +catalog. 
+ ## Routing on parsed fields A conditional edge is a function `state -> str` that names the next diff --git a/docs/model-providers/authoring.md b/docs/model-providers/authoring.md index f25ebaa..97c4648 100644 --- a/docs/model-providers/authoring.md +++ b/docs/model-providers/authoring.md @@ -198,6 +198,16 @@ of: - **Tool calls.** Wire-mapping the `tool_calls` array on `AssistantMessage` to the Provider's expected shape, parsing tool results back from `ToolMessage`s. +- **Content blocks (multimodal user input).** Wire-mapping the + `list[ContentBlock]` form of `UserMessage.content` to the provider's + multimodal shape (OpenAI's `image_url` content-array entries, + Anthropic's image blocks, Google's `inlineData` parts, etc.). The + spec types (`TextBlock`, `ImageBlock`, `ImageSourceURL`, + `ImageSourceInline`) are stable across providers; only the wire + shape differs. Provider authors targeting non-multimodal models + MUST surface `ProviderUnsupportedContentBlock` when the request + carries blocks the bound model can't serve — pre-send or + post-receive per §7. 
- **Structured output.** Threading `response_schema` through the request body (native `response_format` if the underlying wire supports it; prompt-augmentation fallback otherwise) and validating diff --git a/docs/model-providers/index.md b/docs/model-providers/index.md index d074d43..1a87b82 100644 --- a/docs/model-providers/index.md +++ b/docs/model-providers/index.md @@ -64,24 +64,35 @@ class Provider(Protocol): ## Errors -Eight canonical error categories cover every failure mode: - -| Error | Trigger | -| --------------------------- | ---------------------------------------------------------------------- | -| `ProviderAuthentication` | 401 / 403 (bad key, expired token) | -| `ProviderUnavailable` | 5xx, network failure, timeout | -| `ProviderInvalidModel` | Bound model doesn't exist on the provider | -| `ProviderModelNotLoaded` | Model known but not currently serving | -| `ProviderRateLimit` | 429 (with `Retry-After` exposed) | -| `ProviderInvalidResponse` | 200 OK that fails to parse | -| `ProviderInvalidRequest` | Malformed request (per-message or list-level) | -| `StructuredOutputInvalid` | Response failed to parse as JSON or failed to validate against schema | +Nine canonical error categories cover every failure mode: + +| Error | Trigger | +| ---------------------------------- | ---------------------------------------------------------------------- | +| `ProviderAuthentication` | 401 / 403 (bad key, expired token) | +| `ProviderUnavailable` | 5xx, network failure, timeout | +| `ProviderInvalidModel` | Bound model doesn't exist on the provider | +| `ProviderModelNotLoaded` | Model known but not currently serving | +| `ProviderRateLimit` | 429 (with `Retry-After` exposed) | +| `ProviderInvalidResponse` | 200 OK that fails to parse | +| `ProviderInvalidRequest` | Malformed request (per-message or list-level) | +| `ProviderUnsupportedContentBlock` | Bound model rejected a content block (image / audio / media-type) | +| `StructuredOutputInvalid` | Response 
failed to parse as JSON or failed to validate against schema | Three of these (`Unavailable`, `RateLimit`, `ModelNotLoaded`) are exported in `TRANSIENT_CATEGORIES`, the canonical "safe to retry" set used by the default retry-middleware classifier. -`StructuredOutputInvalid` is non-transient by default; see -[Structured output](#structured-output) below. +`StructuredOutputInvalid` and `ProviderUnsupportedContentBlock` are +non-transient by default. See [Content blocks](../concepts/llms.md#content-blocks-multimodal-user-messages) +in the LLMs concept page for the multimodal contract; see +[Structured output](#structured-output) below for the +`response_schema` path. + +`OpenAIProvider` detects unsupported-content-block rejections via +the response body (HTTP 400 with an error code or message indicating +content rejection) — a post-receive mapping rather than a static +pre-send capability check. Pre-send protection is a userland +middleware pattern when callers know the bound model's capabilities +up front. ## Structured output diff --git a/tests/unit/test_content_blocks.py b/tests/unit/test_content_blocks.py new file mode 100644 index 0000000..65f7928 --- /dev/null +++ b/tests/unit/test_content_blocks.py @@ -0,0 +1,267 @@ +"""Focused tests for the content-block surface. + +The conformance suite (``tests/conformance/test_llm_provider.py``) +covers the spec's behavioral surface end-to-end against fixtures +009–020. These unit tests fill gaps the conformance fixtures don't +exercise directly: per-class construction validation, the inline- +image-needs-media_type rule, detail-default-None wire-omission +behavior, content-rejection HTTP-error mapping heuristics, and +construction from the dict-form a fixture YAML loader would feed in. 
+""" + +from __future__ import annotations + +import json +from typing import Any + +import httpx +import pytest + +from openarmature.llm import ( + ImageBlock, + ImageSourceInline, + ImageSourceURL, + ProviderInvalidRequest, + ProviderUnsupportedContentBlock, + TextBlock, + UserMessage, +) +from openarmature.llm.providers.openai import ( + _block_to_wire, + _extract_rejected_block_type, + _looks_like_content_rejection, + classify_http_error, +) + +# --------------------------------------------------------------------------- +# TextBlock construction +# --------------------------------------------------------------------------- + + +def test_text_block_accepts_non_empty_text() -> None: + block = TextBlock(text="hello") + assert block.type == "text" + assert block.text == "hello" + + +def test_text_block_rejects_empty_text() -> None: + with pytest.raises(ValueError, match="non-empty"): + TextBlock(text="") + + +# --------------------------------------------------------------------------- +# ImageBlock construction +# --------------------------------------------------------------------------- + + +def test_image_block_url_source_no_media_type() -> None: + # URL sources have media_type inferred from the URL payload, so + # media_type may be omitted on the spec block. 
+ block = ImageBlock(source=ImageSourceURL(url="https://example.com/a.png")) + assert block.type == "image" + assert isinstance(block.source, ImageSourceURL) + assert block.media_type is None + + +def test_image_block_inline_source_requires_media_type() -> None: + with pytest.raises(ValueError, match="media_type is required when source is inline"): + ImageBlock(source=ImageSourceInline(base64_data="AAA=")) + + +def test_image_block_inline_source_with_media_type() -> None: + block = ImageBlock( + source=ImageSourceInline(base64_data="AAA="), + media_type="image/png", + ) + assert isinstance(block.source, ImageSourceInline) + assert block.media_type == "image/png" + + +def test_image_block_detail_defaults_to_none() -> None: + block = ImageBlock(source=ImageSourceURL(url="https://example.com/a.png")) + assert block.detail is None + + +def test_image_block_detail_accepts_known_values() -> None: + for detail in ("auto", "low", "high"): + block = ImageBlock( + source=ImageSourceURL(url="https://example.com/a.png"), + detail=detail, # type: ignore[arg-type] + ) + assert block.detail == detail + + +def test_image_block_detail_rejects_unknown_value() -> None: + with pytest.raises(ValueError): + ImageBlock( + source=ImageSourceURL(url="https://example.com/a.png"), + detail="foo", # type: ignore[arg-type] + ) + + +# --------------------------------------------------------------------------- +# UserMessage with content blocks +# --------------------------------------------------------------------------- + + +def test_user_message_accepts_string_content() -> None: + msg = UserMessage(content="hello") + assert msg.content == "hello" + + +def test_user_message_rejects_empty_string_content() -> None: + with pytest.raises(ValueError, match="non-empty string"): + UserMessage(content="") + + +def test_user_message_accepts_block_sequence() -> None: + msg = UserMessage(content=[TextBlock(text="hello")]) + assert isinstance(msg.content, list) + assert len(msg.content) == 1 + + +def 
test_user_message_rejects_empty_block_sequence() -> None: + with pytest.raises(ValueError, match="non-empty list"): + UserMessage(content=[]) + + +def test_user_message_accepts_dict_form_via_discriminator() -> None: + # The YAML loader feeds raw dicts; Pydantic's discriminated union + # over ContentBlock's `type` field parses each dict to the right + # variant. _build_message in test_llm_provider.py relies on this. + raw_blocks: list[Any] = [ + {"type": "text", "text": "describe"}, + { + "type": "image", + "source": {"type": "url", "url": "https://example.com/a.png"}, + }, + ] + msg = UserMessage(content=raw_blocks) + assert isinstance(msg.content, list) + assert len(msg.content) == 2 + assert isinstance(msg.content[0], TextBlock) + assert isinstance(msg.content[1], ImageBlock) + + +# --------------------------------------------------------------------------- +# _block_to_wire mapping +# --------------------------------------------------------------------------- + + +def test_block_to_wire_text() -> None: + wire = _block_to_wire(TextBlock(text="hello")) + assert wire == {"type": "text", "text": "hello"} + + +def test_block_to_wire_image_url_no_detail() -> None: + wire = _block_to_wire(ImageBlock(source=ImageSourceURL(url="https://example.com/a.png"))) + assert wire == { + "type": "image_url", + "image_url": {"url": "https://example.com/a.png"}, + } + + +def test_block_to_wire_image_url_with_detail() -> None: + wire = _block_to_wire( + ImageBlock( + source=ImageSourceURL(url="https://example.com/a.png"), + detail="high", + ) + ) + assert wire == { + "type": "image_url", + "image_url": {"url": "https://example.com/a.png", "detail": "high"}, + } + + +def test_block_to_wire_image_inline_constructs_data_uri() -> None: + wire = _block_to_wire( + ImageBlock( + source=ImageSourceInline(base64_data="QUJD"), + media_type="image/jpeg", + ) + ) + assert wire == { + "type": "image_url", + "image_url": {"url": "data:image/jpeg;base64,QUJD"}, + } + + +def 
test_block_to_wire_image_inline_with_detail() -> None: + wire = _block_to_wire( + ImageBlock( + source=ImageSourceInline(base64_data="QUJD"), + media_type="image/png", + detail="low", + ) + ) + assert wire == { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,QUJD", "detail": "low"}, + } + + +# --------------------------------------------------------------------------- +# Content-rejection HTTP-error mapping +# --------------------------------------------------------------------------- + + +def _mock_400( + *, + code: str | None = None, + error_type: str | None = None, + message: str = "bad request", +) -> httpx.Response: + body: dict[str, Any] = {"error": {"message": message}} + if code is not None: + body["error"]["code"] = code + if error_type is not None: + body["error"]["type"] = error_type + return httpx.Response(400, content=json.dumps(body).encode("utf-8")) + + +def test_classify_400_with_known_content_code_maps_to_unsupported() -> None: + exc = classify_http_error( + _mock_400( + code="image_content_not_supported", + error_type="invalid_request_error", + message="This model does not support image inputs.", + ) + ) + assert isinstance(exc, ProviderUnsupportedContentBlock) + assert exc.block_type == "image" + assert exc.reason is not None and "image" in exc.reason.lower() + + +def test_classify_400_substring_fallback_via_error_message() -> None: + exc = classify_http_error( + _mock_400( + code="some_other_error", + message="This model does not support image inputs at this size.", + ) + ) + assert isinstance(exc, ProviderUnsupportedContentBlock) + + +def test_classify_400_unrelated_400_stays_invalid_request() -> None: + # A normal HTTP 400 (schema violation, missing field, etc.) must + # still map to ProviderInvalidRequest. The content-rejection + # heuristic is conservative — it only fires on known codes / + # types / message patterns. 
+ exc = classify_http_error(_mock_400(code="invalid_field", message="messages: missing")) + assert isinstance(exc, ProviderInvalidRequest) + assert not isinstance(exc, ProviderUnsupportedContentBlock) + + +def test_extract_rejected_block_type_picks_up_image() -> None: + assert _extract_rejected_block_type("image_content_not_supported", None) == "image" + + +def test_extract_rejected_block_type_picks_up_audio_from_message() -> None: + assert _extract_rejected_block_type(None, "audio is not supported") == "audio" + + +def test_looks_like_content_rejection_negative_cases() -> None: + # Unrelated codes and messages should NOT trigger the heuristic. + assert _looks_like_content_rejection("invalid_field", None, "field missing") is False + assert _looks_like_content_rejection(None, None, None) is False From fb84521c6b5a421bd0b22a7e5b7b16522be377b2 Mon Sep 17 00:00:00 2001 From: chris-colinsky Date: Fri, 15 May 2026 18:07:55 -0700 Subject: [PATCH 6/6] fix: CoPilot review pass on PR #44 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - audio/video symmetry in the substring fallback of _looks_like_content_rejection - explicit isinstance(block, ImageBlock) guard in _block_to_wire to surface added union variants as a TypeError instead of an AttributeError on .source - clarify ImageBlock.media_type docstring: permitted but redundant on URL sources (the URL payload carries content-type), provider implementations MAY consume it as a hint - reword CHANGELOG qualifier '(proposal X, spec vY.Z)' → '(proposal X, introduced in spec vY.Z)' on the 0015 and 0016 entries so it doesn't read like a per-entry submodule pin change --- CHANGELOG.md | 4 ++-- src/openarmature/llm/messages.py | 13 ++++++++----- src/openarmature/llm/providers/openai.py | 9 ++++++--- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fabfa1..2519495 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,10 +8,10 @@ The format 
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The ### Added -- **Image content blocks for user messages (proposal 0015, spec v0.13.0).** `UserMessage.content` now accepts `str | list[ContentBlock]`. The block surface introduces `TextBlock`, `ImageBlock`, `ImageSourceURL`, `ImageSourceInline`, and the `ContentBlock` / `ImageSource` discriminated unions over the block / source `type` field. `ImageBlock` carries a `media_type` (required for inline sources; ignored for URL sources; typed as `str | None` so callers MAY pass any `image/*` type the bound model supports) and an optional `detail` hint (`"auto"` / `"low"` / `"high"`; `None` default omits the field from the wire so providers apply their own default). System, assistant, and tool messages stay text-string-only; image inputs are user-only in v1. +- **Image content blocks for user messages (proposal 0015, introduced in spec v0.13.0).** `UserMessage.content` now accepts `str | list[ContentBlock]`. The block surface introduces `TextBlock`, `ImageBlock`, `ImageSourceURL`, `ImageSourceInline`, and the `ContentBlock` / `ImageSource` discriminated unions over the block / source `type` field. `ImageBlock` carries a `media_type` (required for inline sources; ignored for URL sources; typed as `str | None` so callers MAY pass any `image/*` type the bound model supports) and an optional `detail` hint (`"auto"` / `"low"` / `"high"`; `None` default omits the field from the wire so providers apply their own default). System, assistant, and tool messages stay text-string-only; image inputs are user-only in v1. - **`OpenAIProvider` content-array wire mapping.** When `UserMessage.content` is a content-block sequence, the wire body uses OpenAI's `content` array per §8.1.1. `TextBlock → {type: "text", text}`. `ImageBlock` with a URL source maps to `{type: "image_url", image_url: {url, detail?}}`. 
`ImageBlock` with an inline source constructs an RFC 2397 `data:<media_type>;base64,<base64_data>` URI and goes through the same `image_url` entry shape. Inline bytes pass through unchanged — no inspection, transcoding, or re-encoding. - **New error category `ProviderUnsupportedContentBlock` (non-transient).** Raised when the bound model rejects a content block type / media variant. Distinct from `ProviderInvalidRequest` (which covers spec-shape malformation): this category surfaces a *capability* mismatch, letting callers route differently (e.g., fall back to a multimodal-capable provider) without overloading the malformed-request category. Carries `block_type` ("image" / "audio" / "video") and `reason` (provider's human-readable message) when those are recoverable from the rejection. `OpenAIProvider` detects content rejection via HTTP 400 bodies — heuristic on `error.code` (known set: `image_content_not_supported`, `unsupported_image_media_type`, `audio_content_not_supported`, etc.), `error.type` (`image_parse_error`), and `error.message` ("does not support" + image/audio/video). -- **Structured output (proposal 0016, spec v0.14.0).** `Provider.complete()` now accepts an optional `response_schema` parameter — either a JSON Schema dict or a Pydantic `BaseModel` subclass. When supplied, the provider constrains the model's output to the schema and populates `Response.parsed` with the validated value (`dict` for dict-schema input, a `BaseModel` instance for class input). New `StructuredOutputInvalid` error category (non-transient by default) raises on JSON parse failure or schema validation failure; carries the requested schema, the raw response content, and a failure description. +- **Structured output (proposal 0016, introduced in spec v0.14.0).** `Provider.complete()` now accepts an optional `response_schema` parameter — either a JSON Schema dict or a Pydantic `BaseModel` subclass.
When supplied, the provider constrains the model's output to the schema and populates `Response.parsed` with the validated value (`dict` for dict-schema input, a `BaseModel` instance for class input). New `StructuredOutputInvalid` error category (non-transient by default) raises on JSON parse failure or schema validation failure; carries the requested schema, the raw response content, and a failure description. - **`OpenAIProvider` native response_format wire path.** When `response_schema` is supplied, the chat-completions request body carries `response_format: { type: "json_schema", json_schema: { name, schema, strict } }`. The `strict` flag is determined by a deep recursive walk over the schema (object-property required-coverage rule across `anyOf` / `oneOf` / `allOf` and `$ref` targets, with cycle protection); unresolvable refs fall through to `strict: false`. The `name` field uses `schema.title` when present, otherwise a deterministic sha256-prefix hash. - **`OpenAIProvider` prompt-augmentation fallback.** Constructor flag `force_prompt_augmentation_fallback: bool` (default `False`) and read-only inspect property `uses_prompt_augmentation_fallback: bool`. When the flag is on, structured-output calls build a fresh message list with a system directive containing the serialized schema, omit `response_format` from the wire, and validate the response post-receive. The caller's original `messages` list is never mutated. Use for OpenAI-compatible servers (older vLLM, some LM Studio releases, llama.cpp variants) that reject or silently ignore `response_format`. - **Provider-agnostic schema helpers.** `openarmature.llm.validate_response_schema(schema)` (raises `ProviderInvalidRequest` when the schema is not a dict with a top-level `type: "object"`) and `openarmature.llm.strict_mode_supported(schema)` (the deep-tree strict-mode constraint check) are exported for reuse by future Anthropic/Gemini providers. 
diff --git a/src/openarmature/llm/messages.py b/src/openarmature/llm/messages.py index 44987be..b63477a 100644 --- a/src/openarmature/llm/messages.py +++ b/src/openarmature/llm/messages.py @@ -174,11 +174,14 @@ class ImageBlock(BaseModel): Attributes: type: The discriminator literal ``"image"``. source: One of ``ImageSourceURL`` or ``ImageSourceInline``. - media_type: IANA media type. Required when source is inline; - ignored when source is a URL. Providers MUST accept - ``image/png``, ``image/jpeg``, ``image/webp`` at minimum - and MAY accept additional ``image/*`` types they document - support for. + media_type: IANA media type. Required when source is inline. + Permitted but redundant when source is a URL (the URL + payload carries the content-type); the OpenAI wire path + currently does not surface it for URL sources, but + provider implementations MAY consume it as a hint. + Providers MUST accept ``image/png``, ``image/jpeg``, + ``image/webp`` at minimum and MAY accept additional + ``image/*`` types they document support for. detail: Image-processing fidelity hint. One of ``"auto"``, ``"low"``, ``"high"``. ``None`` (the default) omits the field from the wire. 
diff --git a/src/openarmature/llm/providers/openai.py b/src/openarmature/llm/providers/openai.py index 1e1574f..b76088d 100644 --- a/src/openarmature/llm/providers/openai.py +++ b/src/openarmature/llm/providers/openai.py @@ -75,6 +75,7 @@ from ..messages import ( AssistantMessage, ContentBlock, + ImageBlock, ImageSourceInline, Message, SystemMessage, @@ -694,7 +695,8 @@ def _message_to_wire(msg: Message) -> dict[str, Any]: def _block_to_wire(block: ContentBlock) -> dict[str, Any]: if isinstance(block, TextBlock): return {"type": "text", "text": block.text} - # ImageBlock + if not isinstance(block, ImageBlock): # pyright: ignore[reportUnnecessaryIsInstance] + raise TypeError(f"unhandled content block type: {type(block).__name__}") if isinstance(block.source, ImageSourceInline): url = f"data:{block.media_type};base64,{block.source.base64_data}" else: @@ -854,8 +856,9 @@ def _looks_like_content_rejection( if error_code in _CONTENT_REJECTION_ERROR_CODES: return True lower_code = error_code.lower() - if "image" in lower_code and ("not_supported" in lower_code or "unsupported" in lower_code): - return True + for block_type in ("image", "audio", "video"): + if block_type in lower_code and ("not_supported" in lower_code or "unsupported" in lower_code): + return True if isinstance(error_type, str) and error_type.lower() in { "image_parse_error", "image_content_not_supported",