In [24]:
import json
from pydantic import BaseModel
import os
from litellm import completion
from dotenv import load_dotenv
import re
# Load settings from a local .env file (if present) so the os.getenv()
# lookups for the API key and base URL further down can resolve.
load_dotenv()

True

In [7]:
# Target model for the extraction demo. Only `id` has no default, so it is the
# sole entry under "required" in the generated JSON Schema.
class User(BaseModel):
    id: int  # required integer identifier
    name: str = "John Doe"  # defaults to "John Doe" when omitted
    age: int | None = None  # optional; an integer or null

# Model class that the LLM output is expected to conform to.
response_model = User
# Build the JSON Schema dict for that model and print it for inspection.
schema = response_model.model_json_schema()
print(schema)

{'properties': {'id': {'title': 'Id', 'type': 'integer'}, 'name': {'default': 'John Doe', 'title': 'Name', 'type': 'string'}, 'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': None, 'title': 'Age'}}, 'required': ['id'], 'title': 'User', 'type': 'object'}


In [10]:
# Sample free-text input to parse into a User. English gloss:
# "Xiaohong is 5 years old this year, student number 32".
content = '''
小红今年5岁，学号32
'''

In [15]:
# Render the target schema once (pretty-printed, non-ASCII left unescaped),
# then embed it in the instruction prompt sent to the model.
schema_json = json.dumps(response_model.model_json_schema(), indent=2, ensure_ascii=False)
system_prompt = f'''
As a genius expert, your task is to understand the content and provide
the parsed objects in json that match the following json_schema:\n

{schema_json}

Make sure to return an instance of the JSON, not the schema itself.Return the correct JSON response within a markdown ```json codeblock. not the JSON_SCHEMA.
'''

In [9]:
# Echo the rendered prompt to check the embedded schema.
# NOTE(review): the saved output under this cell lacks the final
# "Return the correct JSON response ..." sentence of the prompt, so it appears
# to predate the latest edit of the prompt cell -- re-run to refresh it.
system_prompt

'\nAs a genius expert, your task is to understand the content and provide\nthe parsed objects in json that match the following json_schema:\n\n\n{\n  "properties": {\n    "id": {\n      "title": "Id",\n      "type": "integer"\n    },\n    "name": {\n      "default": "John Doe",\n      "title": "Name",\n      "type": "string"\n    },\n    "age": {\n      "anyOf": [\n        {\n          "type": "integer"\n        },\n        {\n          "type": "null"\n        }\n      ],\n      "default": null,\n      "title": "Age"\n    }\n  },\n  "required": [\n    "id"\n  ],\n  "title": "User",\n  "type": "object"\n}\n\nMake sure to return an instance of the JSON, not the schema itself\n'

In [13]:
# Endpoint credentials come from the environment; each value is None when its
# variable is not set.
api_key = os.environ.get("QWEN_API_KEY")
base_url = os.environ.get("QWEN_API_BASE_URL")

In [14]:
# First model choice. NOTE(review): the next cell reassigns MODEL_NAME, so this
# value is superseded when the cells run top to bottom.
MODEL_NAME = 'openai/qwen-turbo-latest'

In [17]:
# Model passed to completion() below; overrides the earlier MODEL_NAME assignment.
MODEL_NAME = 'openai/qwen3-8b'

In [None]:
# Stream the extraction request. The schema instructions are sent as the
# system message and the raw text to parse as the user message.
stream = completion(
    model=MODEL_NAME,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ],
    max_retries=3,
    temperature=0.2,
    api_key=api_key,
    base_url=base_url,
    stream=True,
)

reasoning_content = ''
answer_content = ''
is_answering = False  # flips to True once the first answer text arrives

print("\n" + "=" * 20 + "思考过程" + "=" * 20 + "\n")
for chunk in stream:
    # Guard against chunks that carry no choices (e.g. a usage-only chunk);
    # indexing choices[0] on them would raise IndexError.
    if not chunk.choices:
        continue
    delta = chunk.choices[0].delta

    # Collect the model's reasoning text; echo it only while the answer has
    # not started, but keep accumulating it either way.
    reasoning_piece = getattr(delta, "reasoning_content", None)
    if reasoning_piece:
        if not is_answering:
            print(reasoning_piece, end="", flush=True)
        reasoning_content += reasoning_piece

    # The first non-empty content piece marks the start of the answer: print
    # the section header once, then stream and accumulate the answer text.
    answer_piece = getattr(delta, "content", None)
    if answer_piece:
        if not is_answering:
            print("\n" + "=" * 20 + "回复部分" + "=" * 20 + "\n")
            is_answering = True
        print(answer_piece, end="", flush=True)
        answer_content += answer_piece
            



好的，我现在需要处理用户的输入，把信息转换成符合给定JSON Schema的格式。用户提供的信息是“小红今年5岁，学号32”。首先，我要解析这句话，提取出各个字段的值。

首先，用户提到了“小红”，这应该是名字，对应JSON中的name字段。根据schema，name的默认值是John Doe，但这里显然需要替换成“小红”。接下来是年龄，用户说“5岁”，所以age应该是5，类型是整数。然后是学号32，但schema里并没有学号这个字段，所以可能需要忽略掉。不过，我需要再仔细检查一下schema的结构，确保没有遗漏。

看一下schema的properties，里面有id、name、age三个字段。其中id是必填的，类型是整数。用户提到的学号32可能对应id，因为学号通常是一个唯一的标识符，类似于id。所以这里的学号32应该映射到id字段，即id:32。而name是小红，age是5。这样三个字段都符合schema的要求。

再检查一下是否所有必填字段都存在。required里只有id，所以只要id存在就行。其他字段如name和age虽然不是必填，但用户提供了信息，应该填上。年龄的schema允许整数或null，默认是null，但这里用户明确给出了5岁，所以age应该是5。

确认没有问题后，就可以构造JSON对象了：id是32，name是小红，age是5。这样就符合用户提供的schema要求了。学号虽然没有在schema里出现，但根据上下文推测它对应id，所以正确。

```json
{
  "id": 32,
  "name": "小红",
  "age": 5
}
```

In [25]:
# Regex patterns for JSON extraction
# Fenced block: an opening ``` with an optional "json" tag, then a lazy capture
# up to the next ```; DOTALL lets "." span newlines.
_JSON_CODEBLOCK_PATTERN = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL)
# Bare object: greedy capture from the first "{" through the last "}".
_JSON_PATTERN = re.compile(r"({[\s\S]*})")

In [26]:
def extract_json_from_codeblock(content: str) -> str:
    """
    Extract JSON from a string that may contain markdown code blocks or plain JSON.

    Lookup order:
      1. The first fenced code block (``` or ```json); its body is returned
         with surrounding whitespace stripped.
      2. Otherwise, the span from the first "{" through the last "}".
      3. Otherwise, the input unchanged.

    The result is not validated as JSON; parsing is left to the caller.

    Args:
        content: The string that may contain JSON

    Returns:
        The extracted JSON string, or ``content`` itself when neither a fenced
        block nor a "{...}" span is found.
    """
    # Prefer an explicit fenced code block when the model produced one.
    fenced = _JSON_CODEBLOCK_PATTERN.search(content)
    if fenced:
        return fenced.group(1).strip()

    # Otherwise take the widest "{ ... }" span in the text.
    braced = _JSON_PATTERN.search(content)
    if braced:
        return braced.group(1)

    # No JSON-like content. This also covers text whose only braces appear in
    # the order "} ... {": a find/rfind slice would yield an empty string
    # there, so the input is returned as is.
    return content


In [27]:
# Pull the JSON payload out of the answer text accumulated by the streaming cell.
extract_json_from_codeblock(answer_content)

'{\n  "id": 32,\n  "name": "小红",\n  "age": 5\n}'