In [43]:
import json
import os
import re
from collections import Counter
from copy import deepcopy
from typing import List, Optional

import numpy as np
import tqdm as notebook_tqdm
from dotenv import load_dotenv
from pydantic import BaseModel, Field, constr, field_validator
from sentence_transformers import SentenceTransformer
from sklearn.cluster import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import logging as transformers_logging

from dq_swirl.clients.async_llm_client import AsyncLLMClient
from dq_swirl.ingestion.structure_analyzer import StructuralAnalyzer
from dq_swirl.rust_ingestion import smart_parse_batch

transformers_logging.set_verbosity_error()

In [44]:
# Load variables from the local secrets file into the process environment;
# a later cell reads LLM_API_KEY from it via os.getenv.
load_dotenv("../secrets.env")

True

## Messy Data

In [45]:
# Raw sample inputs in several shapes: "Order ..." key=value lines (some with
# typos or missing fields), JSON-encoded strings, and log-style lines.
# The preprocessing cell iterates over this list.
messy_data = [
    "Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable",
    "Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp",
    "Order 1005: Buyer=Raj Patel, Total=1,200.50, Items: monitor, stand, cable",
    "Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard",
    "Order 1007: Buyer=Chris P., Location=Denver, CO, Total=$12.00, Items: stickers -- [DISCOUNT APPLIED]",
    "Order 1008: Buyer=O'Connor, S., Location=Portland, OR, Total=$0.00, Items: ",
    "Order 1011: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable",
    "Order 1012: Buyer=Sarah Liu, Location=Austin, TX, Total=$156.55, Items: headphones",
    "Order 1013: Buyer=Mike Turner, Location=Cleveland, OH, Total=$1299.99, Items: gaming pc, mouse",
    "Order 1014: Buyer=Rachel Kim, Locadtion=Seattle, WA, Total=$89.50, Items: coffee maker",
    "Order 1015: Buyer=Chris Myers, Location=Cincinnati, OH, Total=$512.00, Items: monitor, desk lamp",
    "Order=1016, Buyer=Jake Myers, Total=$1,512.00, Items: monitor,",
    '{"id": "usr_001", "name": "Alex Johnson", "role": "admin", "isActive": true, "createdAt": "2025-11-02T09:14:23Z"}',
    '{"id": "usr_002", "name": "Maria Lopez", "email": "maria.lopez@example.com", "role": "editor", "isActive": null, "createdAt": "2025-12-18T16:47:10Z", "lastLoginIp": "192.168.1.42"}',
    '{"id": "usr_003", "email": "samir.patel@example.com", "role": "viewer", "isActive": false, "createdAt": "08/05/2024"}',
    '{"id": 4, "name": "Chen Wei", "email": "chen.wei@example.com", "isActive": true, "createdAt": null}',
    '{"id": "usr_005", "name": "Broken Record", "email": "broken@example.com"}',
    "Order 1017: Buyer=Griffin Arora, Location=Columbia, SC, Total=$512.00, Items: monitor, desk lamp, Discount: yes",
    "Order=1018, Buyer=Jae Arora, Location=Dreher, FL, Total=$6.00, Items: chair, Discount: true, phone=123-456-789",
    "Order=1019, Buyer=Jae Kao, Location=Atlanta, GA, Total=$12.00, Items: desk, Discount: False, phone=123-456-789",
    "2026-01-30 14:22:01 INFO User login successful user_id=123",
    "2026-01-30 14:22:01 INFO User login successful",
    "level =INFO, user =Sam, id=1",
    "timestamp=2026-01-30T14:22:01Z level=INFO user=alice action=login success=true",
    "level=INFO cpu_usage=1,234.56 memory=512MB",
    '{"level":"INFO","service":"orders","order_id":1001,"status":"created"}',
    '[2026-01-31 17:11:22 +0000] [7] [INFO] 127.0.0.1:56718 - - [31/Jan/2026:17:11:22 +0000] "GET /health 1.1" 200 16 "-" "curl/8.14.1"',
    "2026-01-31 17:11:00 swirl [DEBUG] saq_worker.py:28 Running cron job health check",
]

## Data Preprocessing

In [46]:
#################################################################################
################################# Grammar Parsing ###############################
#################################################################################

def _looks_like_json(text: str) -> bool:
    """Return True when ``text`` is wrapped in ``[...]`` or ``{...}``."""
    return (text.startswith("[") and text.endswith("]")) or (
        text.startswith("{") and text.endswith("}")
    )


# Route bracket-wrapped strings to json.loads and everything else to the
# grammar-based parser.
string_batch = [msg for msg in messy_data if not _looks_like_json(msg)]
string_json_batch = [msg for msg in messy_data if _looks_like_json(msg)]

print(f"\nUNSTRUCTURED STRING SAMPLES: {len(string_batch)}\n")
print(f"JSON STRING SAMPLES: {len(string_json_batch)}\n")


string_samples = smart_parse_batch(string_batch)

for msg, parsed in string_samples:
    print(f"Original: {msg}\nParsed: {parsed}\n")


json_samples = []
leftovers = []

for msg in string_json_batch:
    try:
        json_samples.append((msg, json.loads(msg)))
    except (json.JSONDecodeError, RecursionError):
        # Bracket-wrapped but not valid JSON: keep the raw text aside so it is
        # counted as an error sample instead of aborting the cell. Only the
        # failures json.loads raises for a str are caught; anything else is a
        # real bug and should surface.
        leftovers.append((msg, None))


data_samples = string_samples + json_samples

print(f"\nTOTAL SAMPLES: {len(data_samples)}\nERROR SAMPLES: {len(leftovers)}\n")

#################################################################################
############################### Structure Analyzer ##############################
#################################################################################


# Fingerprint every (raw, parsed) pair, counting how often each signature hash
# occurs and remembering the first fingerprint seen for each hash.
analyzer = StructuralAnalyzer(ignore_unparsed=False)

unique_structures = {}
hash_counts = Counter()

for raw_text, parsed_record in data_samples:
    fingerprint = analyzer.generate_fingerprint(raw_text, parsed_record)
    fp_hash = fingerprint["hash"]
    hash_counts.update([fp_hash])
    if fp_hash not in unique_structures:
        unique_structures[fp_hash] = fingerprint

print(
    f"Detected {len(unique_structures)} unique schemas across {len(data_samples)} records.\n"
)

# Report in hash order so the listing is stable between runs.
for schema_hash in sorted(hash_counts):
    print(f"Schema {schema_hash} ({hash_counts[schema_hash]} occurrences):")
    print(f"  Layout: {unique_structures[schema_hash]['signature']}")
    print("-" * 30)


#################################################################################
############################# Structural Clustering #############################
#################################################################################


def conjoin_signatures(registry_output: dict):
    """Cluster signatures whose key names are textually similar.

    Each signature's keys are joined into one space-separated string, vectorized
    with character n-gram TF-IDF (so near-identical spellings such as
    'locadtion' / 'location' share most n-grams) and clustered with HDBSCAN.

    Args:
        registry_output: Mapping of signature hash -> entry, where each entry
            has a "signature" mapping whose keys are the field names.

    Returns:
        dict: ``{hash: {"cluster_id": int, "keys": list, "is_outlier": bool}}``
        sorted by hash. Signatures HDBSCAN could not place get ``cluster_id``
        -1 and ``is_outlier`` True. With fewer than two signatures nothing can
        be clustered, so every entry is reported as an outlier.
    """
    hashes = list(registry_output.keys())
    key_lists = [list(registry_output[h]["signature"].keys()) for h in hashes]

    if len(hashes) < 2:
        # TF-IDF rejects an empty corpus and a cluster of size >= 2 cannot be
        # formed from a single sample, so skip the fit instead of raising.
        labels = [-1] * len(hashes)
    else:
        # Create a "sentence" of keys for each hash,
        # e.g. "order buyer location total items"
        signatures_as_text = [" ".join(keys) for keys in key_lists]

        # Character n-grams make the vectors resilient to typos in key names.
        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
        matrix = vectorizer.fit_transform(signatures_as_text)

        # min_cluster_size=2 because we want to find groups of variants
        clusterer = HDBSCAN(
            min_cluster_size=2,
            metric="euclidean",
            copy=True,
        )
        labels = clusterer.fit_predict(matrix.toarray())

    conjoined_map = {}
    for h, keys, raw_label in zip(hashes, key_lists, labels):
        label = int(raw_label)
        conjoined_map[h] = {
            "cluster_id": label,
            "keys": keys,
            # Comparing the plain int yields a built-in bool, which (unlike a
            # NumPy bool) is JSON-serializable.
            "is_outlier": label == -1,
        }

    return dict(sorted(conjoined_map.items()))

#################################################################################
############################## Semantic Clustering ##############################
#################################################################################


def conjoin_signatures_semantic(
    registry_output: dict,
    embedding_model: str = "all-MiniLM-L6-v2",
    cache_dir: str = "./.models",
):
    """Cluster signatures by the embedding similarity of their field names.

    Each signature is rendered as a sorted list of ``field:<name>`` tokens
    (ignoring the ``_unparsed`` key), embedded with a SentenceTransformer model
    and clustered with HDBSCAN using cosine distance.

    Args:
        registry_output: Mapping of signature hash -> entry, where each entry
            has a "signature" mapping whose keys are the field names.
        embedding_model: SentenceTransformer model name to load.
        cache_dir: Folder passed to SentenceTransformer as ``cache_folder``.

    Returns:
        dict: ``{hash: {"cluster_id": int, "keys": list, "is_outlier": bool}}``
        in input order; empty when ``registry_output`` is empty. Outliers each
        receive their own id (starting at 400, or above the largest real
        cluster label if that is higher) so they are never grouped together or
        merged into a real cluster.
    """
    hashes = list(registry_output.keys())
    if not hashes:
        return {}

    signatures_as_text = []
    for h in hashes:
        # Drop the 'black hole' field that swallows everything, and sort so the
        # representation does not depend on the order keys appeared in.
        field_names = sorted(
            key for key in registry_output[h]["signature"] if key != "_unparsed"
        )

        if not field_names:
            text_rep = "schema:empty_blob"
        else:
            # 'field:' prefix to define the role of the tokens
            text_rep = " ".join(f"field:{name}" for name in field_names)

        signatures_as_text.append(text_rep)

    model = SentenceTransformer(embedding_model, cache_folder=cache_dir)
    embeddings = model.encode(signatures_as_text)
    X = np.ascontiguousarray(embeddings, dtype=np.float64)

    clusterer = HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        metric="cosine",
        cluster_selection_epsilon=0.08,
        cluster_selection_method="eom",
        allow_single_cluster=True,
    )

    labels = [int(label) for label in clusterer.fit_predict(X)]

    # Keep the historical 400 offset, but lift it if real cluster labels ever
    # reach that range so an outlier id can never equal a real cluster id.
    outlier_base = max(400, max(labels, default=-1) + 1)

    conjoined_map = {}
    for i, label in enumerate(labels):
        is_outlier = label == -1  # built-in bool, JSON-serializable
        conjoined_map[hashes[i]] = {
            # unique IDs to outliers so they don't group into one '-1' bucket
            "cluster_id": (outlier_base + i) if is_outlier else label,
            "keys": list(registry_output[hashes[i]]["signature"].keys()),
            "is_outlier": is_outlier,
        }

    return conjoined_map

def _group_signatures_by_cluster(cluster_map: dict) -> dict:
    """Invert ``{hash: {cluster_id, keys, is_outlier}}`` into ``{cluster_id: [entries]}``.

    Each entry is ``{"signature_hash", "fields", "is_outlier"}``; insertion
    order of ``cluster_map`` is preserved inside every cluster.
    """
    clusters = {}
    for signature_hash, info in cluster_map.items():
        clusters.setdefault(info["cluster_id"], []).append(
            {
                "signature_hash": signature_hash,
                "fields": info["keys"],
                # The flag may arrive as a NumPy bool; cast it so json.dumps
                # can serialize the entry.
                "is_outlier": bool(info["is_outlier"]),
            }
        )
    return clusters


# run structure clustering
structure_cluster_map = conjoin_signatures(analyzer.signature_map)
structure_clusters = _group_signatures_by_cluster(structure_cluster_map)
print(f"Structural Clusters: \n{json.dumps(structure_clusters, indent=4)}\n")


# run semantic clustering
print(json.dumps(analyzer.signature_map, indent=4))
semantic_cluster_map = conjoin_signatures_semantic(analyzer.signature_map)
semantic_clusters = _group_signatures_by_cluster(semantic_cluster_map)
print(f"Semantic Clusters: \n{json.dumps(semantic_clusters, indent=4)}\n")


UNSTRUCTURED STRING SAMPLES: 22

JSON STRING SAMPLES: 6

Original: Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable
Parsed: {'order': '1001', 'buyer': 'John Davis', 'location': 'Columbus, OH', 'total': '$742.10', 'items': 'laptop, hdmi cable'}

Original: Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp
Parsed: {'order': '1004', 'buyer': 'AMANDA SMITH', 'location': 'Seattle, WA', 'total': '$50.00', 'items': 'desk lamp'}

Original: Order 1005: Buyer=Raj Patel, Total=1,200.50, Items: monitor, stand, cable
Parsed: {'order': '1005', 'buyer': 'Raj Patel', 'total': '1,200.50', 'items': 'monitor, stand, cable'}

Original: Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard
Parsed: {'order': '1006', 'total': '$89.99', 'location': 'Miami, FL', 'buyer': 'Elena Rossi', 'items': 'keyboard'}

Original: Order 1007: Buyer=Chris P., Location=Denver, CO, Total=$12.00, Items: stickers -- [DIS

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1872.56it/s, Materializing param=pooler.dense.weight]                             


Semantic Clusters: 
{
    "0": [
        {
            "signature_hash": "fd116cd512d5ecd2e59edf12fc258b32",
            "fields": [
                "order",
                "buyer",
                "location",
                "total",
                "items"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "50eb97a85647221ecc7f65f74d68d156",
            "fields": [
                "order",
                "buyer",
                "total",
                "items"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "28d9f3b14d0e5516a186062212502d0c",
            "fields": [
                "order",
                "buyer",
                "locadtion",
                "total",
                "items"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "461a895ef9c5046dd2cb5026b6a62de0",
            "fields": [
                "order",
              

  warn(


In [47]:
# For every semantic cluster, list each underlying record together with its
# signature hash, its structural cluster id, the raw string and the rough
# parsed dict.
cluster_dict = {
    cluster_id: [
        {
            "signature_hash": member["signature_hash"],
            "structure_cluster_id": structure_cluster_map[
                member["signature_hash"]
            ].get("cluster_id"),
            "raw": record["raw"],
            "parsed": record["parsed"],
        }
        for member in members
        for record in analyzer.signature_map[member["signature_hash"]]["records"]
    ]
    for cluster_id, members in semantic_clusters.items()
}

cluster_dict

{0: [{'signature_hash': 'fd116cd512d5ecd2e59edf12fc258b32',
   'structure_cluster_id': 1,
   'raw': 'Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable',
   'parsed': {'order': '1001',
    'buyer': 'John Davis',
    'location': 'Columbus, OH',
    'total': '$742.10',
    'items': 'laptop, hdmi cable'}},
  {'signature_hash': 'fd116cd512d5ecd2e59edf12fc258b32',
   'structure_cluster_id': 1,
   'raw': 'Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp',
   'parsed': {'order': '1004',
    'buyer': 'AMANDA SMITH',
    'location': 'Seattle, WA',
    'total': '$50.00',
    'items': 'desk lamp'}},
  {'signature_hash': 'fd116cd512d5ecd2e59edf12fc258b32',
   'structure_cluster_id': 1,
   'raw': 'Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard',
   'parsed': {'order': '1006',
    'total': '$89.99',
    'location': 'Miami, FL',
    'buyer': 'Elena Rossi',
    'items': 'keyboard'}},
  {

## LLM Client Setup

In [48]:
# llm connection
# API key is read from the environment; os.getenv returns None when it is unset.
API_KEY = os.getenv("LLM_API_KEY")
# Disabled alternative: take the base URL from the environment.
# NOTE(review): "LsLM_BASE_URL" looks like a typo for "LLM_BASE_URL" - confirm before re-enabling.
# api_base_url = os.getenv("LsLM_BASE_URL")
api_base_url = "https://openrouter.ai/api/v1"
# Disabled alternative model:
# model = "openai/google/gemma-3-27b-it"
MODEL = "openai/gpt-oss-120b:exacto"

In [49]:
# Shared async LLM client, built from the model name and base URL above; the
# later cells call client.chat(...) on it.
client = AsyncLLMClient(
    MODEL,
    api_base_url,
)

## LLM Prompts

In [50]:
# prompts
# System message for the one-shot schema-inference cell: asks the model to
# return Pydantic BaseModel definitions wrapped in a ```python fence.
PYDANTIC_SYSTEM_PROMPT = """You are a Data Architect. Your goal is to perform unsupervised schema inference on a sample of unstructured data.

Generate a Pydantic `BaseModel` class that represents the "Gold Standard" foundation for this data pattern. 

Instructions:
- Normalization: Suggest clean, snake_case keys for the identified fields.
- If you see a string value for a field that follows a consistent structure (e.g., "<city>, <state>") then make sure that structure is accurately typed in the BaseModel.
- Determine what fields should be required vs optional based on overall semantic meaning of the entity you are creating a BaseModel class for.

Constraints:
- Include a detailed description for each field using the `Field` class to explain what the field is and if there are any expected structural patterns (e.g., `state` should be two letters).
- Create supplemental BaseModel classes where necessary to preserve semantic clarity.
- Do NOT include any regex.
- You MUST wrap your code in a python block with the following start marking "```python" and end marking "```".
- If a field appears in some rows but not others, mark it as `Optional`.
- You are only allowed to use the following imports: "from typing import List, Dict, Optional; from pydantic import BaseModel, Field".
- Return ONLY the Pydantic class definitions (you are allowed to generate multiple as long as they are logically linked).
"""

# User message template; ``{samples}`` is filled via str.format with the raw
# sample strings of one cluster.
PYDANTIC_USER_PROMPT = """Please analyze the following representative samples of a new data pattern and generate the Pydantic 'Foundation' model.

### Data Samples:
{samples}
"""

## Generate Pydantic BaseModel Class

In [51]:
# Structured-output schema passed as ``response_format`` to the LLM client; the
# streamed JSON reply is parsed back into this model.
# (Documented with comments rather than a docstring so the generated JSON
# schema is left exactly as it was.)
class ModelResponseStructure(BaseModel):
    # Python source returned by the model (may or may not be fenced).
    code_string: str = Field(..., description="generated python code")
    # Name of the class inside ``code_string`` that callers look up after exec.
    entrypoint_class_name: str = Field(
        ..., description="name of entrypoint base model class in the code generated"
    )


def extract_python_code(text):
    """
    Extracts the first fenced Python code block from a string.

    The opening fence may be bare (```) or tagged ``python``, ``python3`` or
    ``py`` in any letter case; the tag is never included in the result.
    ``python3`` and ``py`` are only treated as tags when followed by
    whitespace, so code that merely starts with those letters is kept intact.

    Args:
        text: Text that may contain a Markdown code fence (e.g. an LLM reply).
            ``None`` or an empty string is treated as "no code".

    Returns:
        str: The extracted source code or an empty string if not found.
    """
    if not text:
        return ""

    block_pattern = r"```(?:python3(?=\s)|py(?=\s)|python)?\s*(.*?)\s*```"
    match = re.search(block_pattern, text, re.DOTALL | re.IGNORECASE)

    return match.group(1).strip() if match else ""


for c_id, records in cluster_dict.items():
    string_li = [r["raw"] for r in records]
    messages = [
        {"role": "system", "content": PYDANTIC_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": PYDANTIC_USER_PROMPT.format(
                samples=string_li,
            ),
        },
    ]

    buffer = []
    response = await client.chat(
        messages=messages,
        stream=True,
        temperature=0.0,
        response_format=ModelResponseStructure,
    )
    async for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            print(content, end="", flush=True)
            buffer.append(content)

    resp = "".join(buffer)
    resp: ModelResponseStructure = ModelResponseStructure(**json.loads(resp))

    if not resp.code_string.startswith("```python"):
        resp.code_string = f"```python\n{resp.code_string}\n```"

    code = extract_python_code(resp.code_string)

    namespace = {}
    exec(code, globals(), namespace)

    # access the function from the namespace dictionary
    cls = namespace.get(resp.entrypoint_class_name)
    cls.model_rebuild(_types_namespace=namespace)
    schema = cls.model_json_schema()

    fname = f"{resp.entrypoint_class_name.lower()}_base_model.py"
    code = code.encode("ascii", errors="ignore").decode("ascii")
    with open(fname, "w", encoding="utf-8") as f:
        f.write(code)
        print(f"\nSuccessfully wrote code to {fname}")

    print()
    break

{
  "code_string": "```python\nfrom typing import List, Optional\nfrom pydantic import BaseModel, Field\n\nclass Location(BaseModel):\n    city: str = Field(..., description=\"City name extracted from the location field.\")\n    state: str = Field(..., description=\"Two‑letter US state abbreviation, e.g., 'OH'.\")\n\nclass Order(BaseModel):\n    order_id: int = Field(..., description=\"Numeric identifier of the order, extracted from the leading 'Order' token.\")\n    buyer_name: str = Field(..., description=\"Full name of the buyer as it appears after the 'Buyer=' token.\")\n    location: Optional[Location] = Field(\n        None, description=\"Geographic location of the buyer; may be missing in some records.\"\n    )\n    total_amount: float = Field(\n        ..., description=\"Monetary total for the order, parsed as a float. Currency symbols and commas are ignored.\"\n    )\n    items: List[str] = Field(\n        default_factory=list,\n        description=\"List of item names purchas

## LangGraph Robustness and StateGraph

In [52]:
# Prompt template for the architect node; ``{samples}`` is filled via
# str.format with a JSON dump of roughly parsed sample dicts.
ARCHITECT_PROMPT = """You are a Lead Data Architect.
Define a simple Pydantic v2 `BaseModel` that represents the "Gold Standard" foundation for the data pattern found in the input samples.

INPUT SAMPLES (Multiple variations):
{samples}

REQUIREMENTS:
1. Normalization: Suggest clean, snake_case keys for the identified fields.
2. Optionality: If a field is missing in ANY of the samples, you MUST wrap it in Optional[...].
3. Determine what fields should be required vs optional based on overall semantic meaning of the entity you are creating a BaseModel class for.
4. Include a detailed description for each field using the `Field` class to explain what the field is and if there are any expected structural patterns (e.g., `state` should be two letters).
5. Do NOT include any regex.
6. You MUST wrap your code in a python block with the following start marking "```python" and end marking "```".
7. Create supplemental BaseModel classes where necessary to preserve semantic clarity.
8. You are ONLY allowed to use the following imports: "from typing import List, Dict, Optional; from pydantic import BaseModel, Field".
9. Keep primary keys as type string.
10. Infer best data type from string value (e.g., money should be a float, "true/false" or "yes/no" fields should be a boolean, and fields that represent multiple entities should use a representative aggregate data structure type)
11. NEVER set potentially boolean fields as optional. Instead, when not explicitly declared, infer as to what the default value ought based on the semantic meaning of the field and how it appears in the samples that do provide it.
12. Perform semantic merging: Identify fields across structural variants that share the same intent and conjoin them under a single, definitive schema key to avoid redundancy (e.g., "location" vs "city", "state", "zip code")
13. Avoid information loss when it comes to key:value pairs in the sample data.

Return ONLY the Python code for the class. Include necessary imports (from pydantic import BaseModel, Field, etc.).
"""

# Prompt template for the coder node; ``{schema}`` receives the generated
# Pydantic source and ``{samples}`` a JSON dump of parsed sample dicts.
CODER_PROMPT = """You are a Senior Data Engineer.
Your task is to write a concise but effective transformation function `transform_to_models(parsed_dict: list[dict]) -> list[dict]` that maps roughly parsed dictionaries into the provided pydantic v2 target schema base model definition.

TARGET SCHEMA (Python Pydantic v2 BaseModel):
{schema}

SOURCE SAMPLES:
{samples}

Logic Requirements:
1. Use a 'coalesce' approach: for each target field, check all possible source keys from the input dictionary samples.
2. Use parsed_dict.get() for optional fields.
3. Infer best data type from string (e.g., "$120.00" should be a float, and "true" should be a boolean). 
4. ALL python code must be encapsulated by the `transform_to_models()` function -- if it's not in that function it will not be run.

Return ONLY the Python code for the function `transform_to_models`. Do not include the Pydantic class in your response.
"""

# Source template that is exec'd by the code tester: fixed header imports,
# then the generated schema (``{schema}``), then the generated parser
# function (``{parser_code}``).
CODE_EXECUTION = """
from pydantic import BaseModel, Field, ValidationError
from typing import *
import json, re

{schema}

{parser_code}
"""

In [53]:
from __future__ import annotations

import json
import operator
import traceback
from typing import Annotated, Any, Dict, List, Literal, Optional, TypedDict

from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import END, START, StateGraph
from pydantic import BaseModel, Field

from dq_swirl.utils.log_utils import get_custom_logger

logger = get_custom_logger()


# Structured-output schema for LLM replies that carry generated code.
# NOTE(review): a class with this same name and the same two fields is already
# defined in an earlier cell; running this cell rebinds the name. Consider
# keeping a single definition.
class ModelResponseStructure(BaseModel):
    # Python source returned by the model.
    code_string: str = Field(..., description="generated python code")
    # Name of the class inside ``code_string`` that the tester nodes look up.
    entrypoint_class_name: str = Field(
        ...,
        description="name of entrypoint base model class in the code generated",
    )


# State dictionary handed to every node of the workflow graph. Fields wrapped
# in ``Annotated`` carry, as their second argument, the function used to
# combine the stored value with a node's returned value.
class MultiAgentState(TypedDict):
    # Identifiers of the semantic cluster and structural cluster being processed.
    semantic_id: str
    structure_cluster_id: str
    # Records of the whole semantic cluster (read by the architect node).
    data_pairs_all: List[Dict[str, Any]]
    # Records of this structural cluster only (read by the coder and code tester).
    data_pairs_structure: List[Dict[str, Any]]
    # Reducers: 'new' replaces 'old' for strings/objects, but we sum 'attempts'
    gold_schema: Annotated[Optional[ModelResponseStructure], lambda old, new: new]
    parser_code: Annotated[Optional[str], lambda old, new: new]
    # "SUCCESS", "DONE", a traceback string, or None (see the node functions).
    feedback: Annotated[Optional[str], lambda old, new: new]
    error_type: Annotated[
        Optional[Literal["SCHEMA_ISSUE", "CODE_ISSUE"]], lambda old, new: new
    ]
    attempts: Annotated[int, operator.add]  # Use addition to accumulate the increments returned by nodes


async def architect_node(state: MultiAgentState):
    """Ask the LLM for the target ("gold") Pydantic schema of a semantic cluster.

    If the state already holds a schema and the last failure was not a schema
    failure, the node is a no-op (it adds 0 to ``attempts``). Otherwise it
    prompts the model with up to 100 parsed samples; on a retry after a schema
    failure the previous error is appended so the request differs from the one
    that just failed (the call uses temperature 0.0).

    Args:
        state: Current graph state; reads ``gold_schema``, ``error_type``,
            ``feedback``, ``attempts`` and ``data_pairs_all``.

    Returns:
        dict: State update with the new ``gold_schema``, ``attempts`` increment
        of 1, and ``feedback`` / ``error_type`` cleared.

    Raises:
        json.JSONDecodeError / pydantic.ValidationError: if the reply is not
        valid JSON for ``ModelResponseStructure``.
    """
    if state.get("gold_schema") and state.get("error_type") != "SCHEMA_ISSUE":
        return {"attempts": 0}

    logger.info(f"[Architect] Defining Semantic Goal: {state['attempts']}")
    # Send a variety of samples so the architect sees all potential fields
    samples = json.dumps([p["parsed"] for p in state["data_pairs_all"][:100]], indent=2)

    logger.debug(samples)

    prompt = ARCHITECT_PROMPT.format(
        samples=samples,
    )

    # Retry path: tell the model why its previous schema was rejected.
    previous_error = state.get("feedback")
    if state.get("error_type") == "SCHEMA_ISSUE" and previous_error:
        prompt += (
            "\nYOUR PREVIOUS ATTEMPT FAILED VALIDATION WITH THE ERROR BELOW. "
            "Return a corrected schema that avoids it:\n"
            f"{previous_error}\n"
        )

    buffer = []
    response = await client.chat(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        api_key_override=API_KEY,
        stream=True,
        temperature=0.0,
        response_format=ModelResponseStructure,
    )
    async for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            buffer.append(content)

    raw_reply = "".join(buffer)
    resp = ModelResponseStructure(**json.loads(raw_reply))
    # The structured reply may carry the code without a Markdown fence; in that
    # case use the string as-is instead of discarding it.
    resp.code_string = (
        extract_python_code(resp.code_string) or resp.code_string.strip()
    )
    print(resp.code_string)

    return {
        "gold_schema": resp,
        "attempts": 1,
        "feedback": None,
        "error_type": None,  # CLEAR the error type
    }


async def schema_tester_node(state: MultiAgentState):
    """Check that the generated schema code defines a working Pydantic model.

    The code is executed, the entrypoint class is looked up, rebuilt against
    the execution namespace and asked for its JSON schema.

    Args:
        state: Current graph state; reads ``gold_schema`` and ``attempts``.

    Returns:
        dict: ``{"feedback": "SUCCESS"}`` when every step succeeds, otherwise
        the formatted traceback as ``feedback`` with ``error_type`` set to
        ``"SCHEMA_ISSUE"``.
    """
    logger.info(f"[Schema Tester] Validating Functional BaseModel: {state['attempts']}")
    python_base_model_str = state["gold_schema"].code_string

    # SECURITY: this executes LLM-generated code in the current process.
    # A single namespace (a copy of the module globals) is used for the exec:
    # with separate globals/locals dicts, class bodies in the generated code
    # cannot see names defined at the top level of that same code.
    env = dict(globals())
    try:
        exec(python_base_model_str, env)
        cls_name = state["gold_schema"].entrypoint_class_name
        model = env.get(cls_name)
        # The class must come from the generated code, not from our globals.
        if model is None or model is globals().get(cls_name):
            raise KeyError(
                f"generated code does not define entrypoint class {cls_name!r}"
            )

        model.model_rebuild(_types_namespace=env)
        # Generating the JSON schema forces full resolution of the model.
        model.model_json_schema()

        return {"feedback": "SUCCESS"}
    except Exception as e:
        err_msg = traceback.format_exc()
        logger.exception(e)
        return {"feedback": err_msg, "error_type": "SCHEMA_ISSUE"}


async def coder_node(state: MultiAgentState):
    """Ask the LLM to write ``transform_to_models`` for the current schema.

    The prompt contains the schema source and the parsed samples of this
    structural cluster. On a retry after a code failure, the previous parser
    and its error are appended so the request differs from the one that just
    failed (the call uses temperature 0.0).

    Args:
        state: Current graph state; reads ``gold_schema``,
            ``data_pairs_structure``, ``attempts``, ``error_type``,
            ``feedback`` and ``parser_code``.

    Returns:
        dict: State update with the new ``parser_code``, ``attempts`` increment
        of 1, and ``feedback`` / ``error_type`` cleared.
    """
    logger.info(f"[Coder] Parser for Gold Schema: {state['attempts']}")
    samples = json.dumps(
        [rec["parsed"] for rec in state["data_pairs_structure"]], indent=2
    )

    prompt = CODER_PROMPT.format(
        schema=state["gold_schema"].code_string, samples=samples
    )

    # Retry path: show the model its failing code and the resulting error.
    previous_error = state.get("feedback")
    if state.get("error_type") == "CODE_ISSUE" and previous_error:
        previous_code = state.get("parser_code") or ""
        prompt += (
            "\nYOUR PREVIOUS ATTEMPT:\n"
            f"{previous_code}\n"
            "\nFAILED WITH THE ERROR BELOW. Return a corrected function:\n"
            f"{previous_error}\n"
        )

    buffer = []
    response = await client.chat(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        api_key_override=API_KEY,
        stream=True,
        temperature=0.0,
    )
    async for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            print(content, end="", flush=True)
            buffer.append(content)

    reply = "".join(buffer)
    # If the model answered without a Markdown fence, treat the whole reply as
    # code rather than handing an empty parser to the tester.
    code = extract_python_code(reply) or reply.strip()

    return {"parser_code": code, "attempts": 1, "feedback": None, "error_type": None}


async def code_tester_node(state: MultiAgentState):
    """Run the generated parser on this cluster's samples and validate the output.

    The schema and parser are combined through ``CODE_EXECUTION`` and executed;
    ``transform_to_models`` is called with the parsed dicts and every returned
    item is validated against the entrypoint model.

    Args:
        state: Current graph state; reads ``gold_schema``, ``parser_code``,
            ``data_pairs_structure`` and ``attempts``.

    Returns:
        dict: ``{"feedback": "SUCCESS"}`` when every mapped record validates,
        otherwise the formatted traceback as ``feedback`` with ``error_type``
        set to ``"CODE_ISSUE"``.
    """
    logger.info(f"[Code Tester] Stress-testing parser: {state['attempts']}")
    full_code = CODE_EXECUTION.format(
        schema=state["gold_schema"].code_string, parser_code=state["parser_code"]
    )

    # SECURITY: this executes LLM-generated code in the current process.
    # A single namespace (a copy of the module globals) is used for the exec so
    # that ``transform_to_models`` can see the imports, model classes and any
    # helpers defined at the top level of the generated source; with separate
    # globals/locals dicts those names are invisible inside function bodies.
    env = dict(globals())
    mapped_dict = None  # last record handed to validation, for the failure print
    try:
        cls_name = state["gold_schema"].entrypoint_class_name
        exec(full_code, env)
        func = env["transform_to_models"]
        model = env[cls_name]
        model.model_rebuild(_types_namespace=env)

        input_data = [pair["parsed"] for pair in state["data_pairs_structure"]]
        mapped_batch = func(input_data)
        # An empty result would make the validation loop pass vacuously.
        if input_data and not mapped_batch:
            raise ValueError(
                "transform_to_models returned no records for a non-empty input batch"
            )
        for mapped_dict in mapped_batch:
            model.model_validate(mapped_dict)
            print(f"Input: {mapped_dict} -- PASSED")
        return {"feedback": "SUCCESS"}
    except Exception:
        err_msg = traceback.format_exc()
        if mapped_dict is not None:
            print(f"Input: {mapped_dict} -- FAILED")
        return {"feedback": err_msg, "error_type": "CODE_ISSUE"}


async def exporter_node(state: MultiAgentState):
    """Write the validated schema and parser to a Python file.

    The file is rendered through ``CODE_EXECUTION`` - the same template the
    code tester executes - so the exported module carries the header imports
    (``json``, ``re``, ``typing``, pydantic names) the parser was tested with.
    It is written as UTF-8 because generated descriptions can contain
    non-ASCII characters.

    Args:
        state: Current graph state; reads ``gold_schema``, ``parser_code`` and
            ``structure_cluster_id``.

    Returns:
        dict: ``{"feedback": "DONE"}``.
    """
    base_model_name = state["gold_schema"].entrypoint_class_name
    filename = f"parser_{base_model_name}_struct_{state['structure_cluster_id']}.py"
    module_source = CODE_EXECUTION.format(
        schema=state["gold_schema"].code_string, parser_code=state["parser_code"]
    )
    with open(filename, "w", encoding="utf-8") as f:
        f.write(module_source)
    print(f"--> Exported: {filename}")
    return {"feedback": "DONE"}


def schema_router(state: MultiAgentState):
    """Choose the edge to follow after the schema tester.

    Returns:
        "coder" when the feedback is "SUCCESS"; otherwise "architect" while
        fewer than 3 attempts have been recorded, and "end" from then on.
    """
    if state.get("feedback") == "SUCCESS":
        return "coder"

    tries = state.get("attempts", 0)
    if tries < 3:
        return "architect"

    # too many failed attempts: stop the process
    logger.error(f"Schema failed after {tries} attempts. Aborting.")
    return "end"


def code_router(state: MultiAgentState):
    """Choose the edge to follow after the code tester.

    Returns:
        "exporter" when the feedback is "SUCCESS"; "end" once 6 or more
        attempts have been recorded; otherwise "architect" for a
        "SCHEMA_ISSUE" and "coder" for anything else (including unknown
        error types).
    """
    if state.get("feedback") == "SUCCESS":
        return "exporter"

    if state.get("attempts", 0) >= 6:
        return "end"

    # Route back to whichever stage produced the failure.
    return "architect" if state.get("error_type") == "SCHEMA_ISSUE" else "coder"


## Define Graph
workflow = StateGraph(MultiAgentState)

# Register each node under the name that the edges and routers refer to.
for node_name, node_fn in (
    ("architect", architect_node),
    ("schema_tester", schema_tester_node),
    ("coder", coder_node),
    ("code_tester", code_tester_node),
    ("exporter", exporter_node),
):
    workflow.add_node(node_name, node_fn)

# Schema stage: architect -> schema_tester, then the router decides.
workflow.add_edge(START, "architect")
workflow.add_edge("architect", "schema_tester")
schema_routes = {"architect": "architect", "coder": "coder", "end": END}
workflow.add_conditional_edges("schema_tester", schema_router, schema_routes)

# Code stage: coder -> code_tester, then the router decides.
workflow.add_edge("coder", "code_tester")
code_routes = {
    "architect": "architect",
    "coder": "coder",
    "exporter": "exporter",
    "end": END,
}
workflow.add_conditional_edges("code_tester", code_router, code_routes)

workflow.add_edge("exporter", END)

app = workflow.compile(checkpointer=MemorySaver())

In [54]:
async def run_data_matrix(all_data: Dict[int, List[Dict]]):
    """Run the workflow once per (semantic cluster, structural cluster) pair.

    Records of each semantic cluster are grouped by their
    ``structure_cluster_id``. The groups are processed in first-seen order, and
    the ``gold_schema`` returned by one run is passed as the starting schema of
    the next run within the same semantic cluster.

    Args:
        all_data: Mapping of semantic cluster id -> list of record dicts, each
            containing at least ``structure_cluster_id``.

    Returns:
        None.
    """
    for semantic_key, semantic_records in all_data.items():
        by_structure = {}
        for record in semantic_records:
            structure_key = record["structure_cluster_id"]
            if structure_key not in by_structure:
                by_structure[structure_key] = []
            by_structure[structure_key].append(record)

        carried_schema = None

        for structure_key, structure_records in by_structure.items():
            # One checkpoint thread per semantic/structural pair.
            thread_id = f"sem_{semantic_key}_str_{structure_key}"

            result = await app.ainvoke(
                {
                    "semantic_id": str(semantic_key),
                    "structure_cluster_id": str(structure_key),
                    "data_pairs_all": semantic_records,
                    "data_pairs_structure": structure_records,
                    "gold_schema": carried_schema,
                    "attempts": 0,
                },
                {"configurable": {"thread_id": thread_id}},
            )
            carried_schema = result.get("gold_schema")

            logger.info(f"--- Finished Cluster {structure_key} ---")


# Run the workflow over every semantic cluster built above. Uses top-level
# ``await``, which works in a notebook cell but not in a plain Python script.
await run_data_matrix(cluster_dict)

[38;5;245m2026-02-02 12:44:46[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:44[0m [Architect] Defining Semantic Goal: 0
[38;5;245m2026-02-02 12:44:46[0m [36;20mswirl[0m [[34;20mDEBUG[0m] [38;5;245m2336556215.py:48[0m [
  {
    "order": "1001",
    "buyer": "John Davis",
    "location": "Columbus, OH",
    "total": "$742.10",
    "items": "laptop, hdmi cable"
  },
  {
    "order": "1004",
    "buyer": "AMANDA SMITH",
    "location": "Seattle, WA",
    "total": "$50.00",
    "items": "desk lamp"
  },
  {
    "order": "1006",
    "total": "$89.99",
    "location": "Miami, FL",
    "buyer": "Elena Rossi",
    "items": "keyboard"
  },
  {
    "order": "1007",
    "buyer": "Chris P.",
    "location": "Denver, CO",
    "total": "$12.00",
    "items": "stickers -- [DISCOUNT APPLIED]"
  },
  {
    "order": "1008",
    "buyer": "O'Connor, S.",
    "location": "Portland, OR",
    "total": "$0.00",
    "items": "None"
  },
  {
    "order": "1011",
    "buyer": "John D

from typing import List, Optional
from pydantic import BaseModel, Field

class Order(BaseModel):
    order: str = Field(..., description="Unique order identifier as a string.")
    buyer: str = Field(..., description="Name of the buyer.")
    location: Optional[str] = Field(
        None,
        description="Full location in 'City, ST' format. Optional because some records lack this field."
    )
    total: float = Field(
        ..., 
        description="Total amount of the order as a float. Currency symbols and commas are ignored during parsing."
    )
    items: List[str] = Field(
        ..., 
        description="List of items purchased. In the source data items are comma‑separated strings."
    )
    discount: bool = Field(
        False,
        description="Flag indicating whether a discount was applied. Defaults to False when the field is absent."
    )
    phone: Optional[str] = Field(
        None,
        description="Contact phone number for the buyer, if provided."
    

[38;5;245m2026-02-02 12:45:27[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 2
[38;5;245m2026-02-02 12:45:28[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m1738028846.py:26[0m --- Finished Cluster 1 ---
[38;5;245m2026-02-02 12:45:28[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:44[0m [Architect] Defining Semantic Goal: 0
[38;5;245m2026-02-02 12:45:28[0m [36;20mswirl[0m [[34;20mDEBUG[0m] [38;5;245m2336556215.py:48[0m [
  {
    "user_id": "123",
    "_unparsed": "2026-01-30 14:22:01 INFO User login successful"
  },
  {
    "level": "INFO",
    "user": "Sam",
    "id": "1"
  },
  {
    "timestamp": "2026-01-30T14:22:01Z",
    "level": "INFO",
    "user": "alice",
    "action": "login",
    "success": "true"
  }
]


Input: {'order': '1001', 'buyer': 'John Davis', 'location': 'Columbus, OH', 'total': 742.1, 'items': ['laptop', 'hdmi cable'], 'discount': False, 'phone': None} -- PASSED
Input: {'order': '1004', 'buyer': 'AMANDA SMITH', 'location': 'Seattle, WA', 'total': 50.0, 'items': ['desk lamp'], 'discount': False, 'phone': None} -- PASSED
Input: {'order': '1006', 'buyer': 'Elena Rossi', 'location': 'Miami, FL', 'total': 89.99, 'items': ['keyboard'], 'discount': False, 'phone': None} -- PASSED
Input: {'order': '1007', 'buyer': 'Chris P.', 'location': 'Denver, CO', 'total': 12.0, 'items': ['stickers'], 'discount': True, 'phone': None} -- PASSED
Input: {'order': '1008', 'buyer': "O'Connor, S.", 'location': 'Portland, OR', 'total': 0.0, 'items': [], 'discount': False, 'phone': None} -- PASSED
Input: {'order': '1011', 'buyer': 'John Davis', 'location': 'Columbus, OH', 'total': 742.1, 'items': ['laptop', 'hdmi cable'], 'discount': False, 'phone': None} -- PASSED
Input: {'order': '1012', 'buyer': 'Sara

[38;5;245m2026-02-02 12:45:37[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:85[0m [Scehma Tester] Validating Functional BaseModel: 1
[38;5;245m2026-02-02 12:45:37[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 1


from typing import Optional
from pydantic import BaseModel, Field

class LogEntry(BaseModel):
    """Gold‑standard representation of a generic log / audit record.

    The model normalises disparate source keys into a consistent, snake_case
    schema while preserving all semantic information present in the example
    payloads.
    """

    user_id: Optional[str] = Field(
        None,
        description=(
            "Identifier of the user associated with the event. "
            "May appear in source data as `user_id` or `id`."
        ),
    )
    user: Optional[str] = Field(
        None,
        description="Human‑readable username or account name.",
    )
    level: Optional[str] = Field(
        None,
        description="Log level such as `INFO`, `WARN`, `ERROR`, etc.",
    )
    timestamp: Optional[str] = Field(
        None,
        description=(
            "ISO‑8601 formatted timestamp of the event, e.g. ``2026-01-30T14:22:01Z``."
        ),
    )
    action: Optional[st

[38;5;245m2026-02-02 12:45:51[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 2
[38;5;245m2026-02-02 12:45:51[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 2


Input: {'user_id': 123, 'success': False, 'unparsed': '2026-01-30 14:22:01 INFO User login successful'} -- FAILED
```python
import re
from typing import Any, List, Dict, Optional

def transform_to_models(parsed_dict: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Convert a list of loosely‑parsed dictionaries into a list of dictionaries
    that match the ``LogEntry`` schema.

    The function:
    * coalesces multiple possible source keys for each target field,
    * infers simple data types from string values (bool, float),
    * supplies defaults (``success`` → ``False``) when a field is missing.
    """
    # Mapping of target field → possible source keys (ordered by priority)
    field_map: Dict[str, List[str]] = {
        "user_id": ["user_id", "id"],
        "user": ["user", "username", "account_name"],
        "level": ["level", "log_level", "severity"],
        "timestamp": ["timestamp", "time", "date", "datetime"],
        "action": ["action", "event", "msg", "mess

[38;5;245m2026-02-02 12:46:11[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 3
[38;5;245m2026-02-02 12:46:11[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 3


Input: {'user_id': 123.0, 'user': None, 'level': None, 'timestamp': None, 'action': None, 'success': False, 'unparsed': '2026-01-30 14:22:01 INFO User login successful'} -- FAILED
```python
def transform_to_models(parsed_dict: list[dict]) -> list[dict]:
    """
    Transform a list of loosely‑parsed dictionaries into a list of dictionaries
    that conform to the ``LogEntry`` schema.

    The function:
    * coalesces multiple possible source keys for each target field,
    * infers simple data types (bool, int, float) from string values,
    * supplies defaults (e.g. ``success`` defaults to ``False``),
    * retains any raw line under the ``unparsed`` field when present.
    """
    import re

    # Mapping of target field -> possible source keys (ordered by priority)
    FIELD_MAP = {
        "user_id": ["user_id", "id"],
        "user": ["user", "username", "account_name"],
        "level": ["level", "log_level", "severity"],
        "timestamp": ["timestamp", "time", "date", "datet

[38;5;245m2026-02-02 12:46:22[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 4
[38;5;245m2026-02-02 12:46:22[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 4


Input: {'user_id': 123, 'user': None, 'level': None, 'timestamp': None, 'action': None, 'success': False, 'unparsed': '2026-01-30 14:22:01 INFO User login successful'} -- FAILED
```python
import re
from typing import Any, List, Dict, Optional

def transform_to_models(parsed_dict: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Convert a list of loosely‑parsed dictionaries into a list of dictionaries
    that match the ``LogEntry`` schema.

    The function:
    * coalesces multiple possible source keys for each target field,
    * infers simple data types from string values (bool, float),
    * supplies defaults (``success`` → ``False``) when a field is missing.
    """
    # Mapping of target field → possible source keys (ordered by priority)
    field_map: Dict[str, List[str]] = {
        "user_id": ["user_id", "id"],
        "user": ["user", "username", "account_name"],
        "level": ["level", "log_level", "severity"],
        "timestamp": ["timestamp", "time", "date",

[38;5;245m2026-02-02 12:46:41[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 5
[38;5;245m2026-02-02 12:46:41[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 5


Input: {'user_id': 123.0, 'user': None, 'level': None, 'timestamp': None, 'action': None, 'success': False, 'unparsed': '2026-01-30 14:22:01 INFO User login successful'} -- FAILED
```python
def transform_to_models(parsed_dict: list[dict]) -> list[dict]:
    """
    Transform a list of loosely‑parsed dictionaries into a list of dictionaries
    that match the ``LogEntry`` schema.

    The function:
    * coalesces multiple possible source keys for each target field,
    * attempts to infer a more appropriate Python type from string values,
    * supplies a default ``False`` for the ``success`` flag when missing,
    * and returns plain ``dict`` objects (ready to be fed to the Pydantic model).
    """
    # Mapping of target field -> possible source keys (ordered by priority)
    field_map = {
        "user_id": ["user_id", "id"],
        "user": ["user", "username", "account_name"],
        "level": ["level", "log_level", "severity"],
        "timestamp": ["timestamp", "time", "date", "

[38;5;245m2026-02-02 12:46:51[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 6
[38;5;245m2026-02-02 12:46:51[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m1738028846.py:26[0m --- Finished Cluster 3 ---
[38;5;245m2026-02-02 12:46:51[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:85[0m [Scehma Tester] Validating Functional BaseModel: 0


Input: {'user_id': 123, 'user': None, 'level': None, 'timestamp': None, 'action': None, 'success': False, 'unparsed': '2026-01-30 14:22:01 INFO User login successful'} -- FAILED


[38;5;245m2026-02-02 12:46:51[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 0


```python
def transform_to_models(parsed_dict: list[dict]) -> list[dict]:
    """Map loosely‑parsed log dictionaries to the LogEntry schema.

    The function coalesces possible source keys for each target field,
    converts the ``success`` flag to a proper boolean, and leaves all other
    values as‑is (strings or ``None``) so they match the ``LogEntry`` model.
    """
    # Helper: return the first non‑null value found for a list of possible keys
    def _coalesce(src: dict, keys: list[str]):
        for k in keys:
            if k in src and src[k] is not None:
                return src[k]
        return None

    # Helper: robust conversion of various truthy representations to bool
    def _to_bool(val):
        if isinstance(val, bool):
            return val
        if isinstance(val, (int, float)):
            return bool(val)
        if isinstance(val, str):
            return val.strip().lower() in {"true", "1", "yes", "y", "t"}
        return False

    transformed: list[di

[38;5;245m2026-02-02 12:47:07[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 1
[38;5;245m2026-02-02 12:47:07[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m1738028846.py:26[0m --- Finished Cluster 2 ---
[38;5;245m2026-02-02 12:47:07[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:44[0m [Architect] Defining Semantic Goal: 0
[38;5;245m2026-02-02 12:47:07[0m [36;20mswirl[0m [[34;20mDEBUG[0m] [38;5;245m2336556215.py:48[0m [
  {
    "_unparsed": "2026-01-30 14:22:01 INFO User login successful"
  },
  {
    "_unparsed": "[2026-01-31 17:11:22 +0000] [7] [INFO] 127.0.0.1:56718 - - [31/Jan/2026:17:11:22 +0000] \"GET /health 1.1\" 200 16 \"-\" \"curl/8.14.1\""
  },
  {
    "_unparsed": "2026-01-31 17:11:00 swirl [DEBUG] saq_worker.py:28 Running cron job health check"
  }
]


Input: {'user_id': '1', 'user': 'Sam', 'level': 'INFO', 'timestamp': None, 'action': None, 'success': False, 'unparsed': None} -- PASSED
Input: {'user_id': None, 'user': 'alice', 'level': 'INFO', 'timestamp': '2026-01-30T14:22:01Z', 'action': 'login', 'success': True, 'unparsed': None} -- PASSED
--> Exported: parser_LogEntry_struct_2.py


[38;5;245m2026-02-02 12:47:24[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:85[0m [Scehma Tester] Validating Functional BaseModel: 1
<string>:46: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
[38;5;245m2026-02-02 12:47:24[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 1


from typing import Optional
from pydantic import BaseModel, Field

class RequestInfo(BaseModel):
    client_ip: Optional[str] = Field(
        None,
        description="IP address of the client making the request."
    )
    client_port: Optional[int] = Field(
        None,
        description="Port number of the client."
    )
    request_time: Optional[str] = Field(
        None,
        description="Timestamp of the request as recorded in the log entry."
    )
    request_method: Optional[str] = Field(
        None,
        description="HTTP method used in the request, e.g., GET, POST."
    )
    request_path: Optional[str] = Field(
        None,
        description="Path component of the HTTP request."
    )
    request_protocol: Optional[str] = Field(
        None,
        description="Protocol version of the HTTP request, e.g., 1.1."
    )
    status_code: Optional[int] = Field(
        None,
        description="HTTP response status code returned to the client."
    )
    respo

[38;5;245m2026-02-02 12:47:59[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 2


Input: {'raw_timestamp': '2026-01-30 14:22:01 INFO User login successful', 'log_level': '', 'message': '2026-01-30 14:22:01 INFO User login successful', 'component': None, 'process_id': None, 'source_file': None, 'source_line': None, 'request': None} -- PASSED
Input: {'raw_timestamp': '[2026-01-31 17:11:22 +0000] [7] [INFO] 127.0.0.1:56718 - - [31/Jan/2026:17:11:22 +0000] "GET /health 1.1" 200 16 "-" "curl/8.14.1"', 'log_level': '', 'message': '[2026-01-31 17:11:22 +0000] [7] [INFO] 127.0.0.1:56718 - - [31/Jan/2026:17:11:22 +0000] "GET /health 1.1" 200 16 "-" "curl/8.14.1"', 'component': None, 'process_id': None, 'source_file': None, 'source_line': None, 'request': None} -- PASSED
Input: {'raw_timestamp': '2026-01-31 17:11:00 swirl [DEBUG] saq_worker.py:28 Running cron job health check', 'log_level': '', 'message': '2026-01-31 17:11:00 swirl [DEBUG] saq_worker.py:28 Running cron job health check', 'component': None, 'process_id': None, 'source_file': None, 'source_line': None, 'request

<string>:51: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
[38;5;245m2026-02-02 12:47:59[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m1738028846.py:26[0m --- Finished Cluster 3 ---
[38;5;245m2026-02-02 12:47:59[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:44[0m [Architect] Defining Semantic Goal: 0
[38;5;245m2026-02-02 12:47:59[0m [36;20mswirl[0m [[34;20mDEBUG[0m] [38;5;245m2336556215.py:48[0m [
  {
    "level": "INFO",
    "cpu_usage": "1,234.56",
    "memory": "512MB"
  }
]


--> Exported: parser_LogEntry_struct_3.py


[38;5;245m2026-02-02 12:48:07[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:85[0m [Scehma Tester] Validating Functional BaseModel: 1
[38;5;245m2026-02-02 12:48:07[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 1


from typing import Optional
from pydantic import BaseModel, Field

class LogMetrics(BaseModel):
    level: str = Field(..., description="Log level indicator, e.g., 'INFO', 'DEBUG', 'WARN', 'ERROR'.")
    cpu_usage: float = Field(..., description="CPU usage as a percentage, represented as a float. The original string may contain commas as thousand separators.")
    memory: str = Field(..., description="Memory usage string including numeric value and unit, e.g., '512MB'.")
```python
def transform_to_models(parsed_dict: list[dict]) -> list[dict]:
    """
    Transform a list of loosely‑parsed dictionaries into a list of dictionaries that match
    the LogMetrics Pydantic model schema.

    The function uses a *coalesce* strategy: for each target field it checks a prioritized
    list of possible source keys and picks the first one that exists in the input dict.
    It also performs minimal type inference/clean‑up (e.g., removing commas from numbers).

    Parameters
    ----------
    par

[38;5;245m2026-02-02 12:48:23[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 2


Input: {'level': 'INFO', 'cpu_usage': 1234.56, 'memory': '512MB'} -- PASSED


[38;5;245m2026-02-02 12:48:23[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m1738028846.py:26[0m --- Finished Cluster 2 ---
[38;5;245m2026-02-02 12:48:23[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:44[0m [Architect] Defining Semantic Goal: 0
[38;5;245m2026-02-02 12:48:23[0m [36;20mswirl[0m [[34;20mDEBUG[0m] [38;5;245m2336556215.py:48[0m [
  {
    "id": "usr_001",
    "name": "Alex Johnson",
    "role": "admin",
    "isActive": true,
    "createdAt": "2025-11-02T09:14:23Z"
  },
  {
    "id": "usr_002",
    "name": "Maria Lopez",
    "email": "maria.lopez@example.com",
    "role": "editor",
    "isActive": null,
    "createdAt": "2025-12-18T16:47:10Z",
    "lastLoginIp": "192.168.1.42"
  },
  {
    "id": "usr_003",
    "email": "samir.patel@example.com",
    "role": "viewer",
    "isActive": false,
    "createdAt": "08/05/2024"
  },
  {
    "id": 4,
    "name": "Chen Wei",
    "email": "chen.wei@example.com",
    "isActive": true,
    "createdAt": n

--> Exported: parser_LogMetrics_struct_2.py


[38;5;245m2026-02-02 12:48:33[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:85[0m [Scehma Tester] Validating Functional BaseModel: 1
[38;5;245m2026-02-02 12:48:33[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 1


from typing import Optional
from pydantic import BaseModel, Field

class UserBase(BaseModel):
    """
    Gold‑standard representation of a user entity.
    """
    id: str = Field(..., description="Primary identifier of the user, kept as a string.")
    name: Optional[str] = Field(None, description="Full name of the user.")
    email: Optional[str] = Field(None, description="User's email address.")
    role: Optional[str] = Field(None, description="Role assigned to the user (e.g., admin, editor, viewer).")
    is_active: bool = Field(
        default=False,
        description="Flag indicating whether the user account is currently active."
    )
    created_at: Optional[str] = Field(
        None,
        description=(
            "Timestamp when the user record was created. "
            "Accepts ISO‑8601 strings or other common date representations."
        )
    )
    last_login_ip: Optional[str] = Field(
        None,
        description="IP address from which the user last logge

[38;5;245m2026-02-02 12:48:55[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 2
[38;5;245m2026-02-02 12:48:55[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m1738028846.py:26[0m --- Finished Cluster 0 ---
[38;5;245m2026-02-02 12:48:55[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:44[0m [Architect] Defining Semantic Goal: 0
[38;5;245m2026-02-02 12:48:55[0m [36;20mswirl[0m [[34;20mDEBUG[0m] [38;5;245m2336556215.py:48[0m [
  {
    "level": "INFO",
    "service": "orders",
    "order_id": 1001,
    "status": "created"
  }
]


Input: {'id': 'usr_001', 'name': 'Alex Johnson', 'email': None, 'role': 'admin', 'is_active': True, 'created_at': '2025-11-02T09:14:23Z', 'last_login_ip': None} -- PASSED
Input: {'id': 'usr_002', 'name': 'Maria Lopez', 'email': 'maria.lopez@example.com', 'role': 'editor', 'is_active': False, 'created_at': '2025-12-18T16:47:10Z', 'last_login_ip': '192.168.1.42'} -- PASSED
Input: {'id': 'usr_003', 'name': None, 'email': 'samir.patel@example.com', 'role': 'viewer', 'is_active': False, 'created_at': '08/05/2024', 'last_login_ip': None} -- PASSED
Input: {'id': '4', 'name': 'Chen Wei', 'email': 'chen.wei@example.com', 'role': None, 'is_active': True, 'created_at': None, 'last_login_ip': None} -- PASSED
Input: {'id': 'usr_005', 'name': 'Broken Record', 'email': 'broken@example.com', 'role': None, 'is_active': False, 'created_at': None, 'last_login_ip': None} -- PASSED
--> Exported: parser_UserBase_struct_0.py


[38;5;245m2026-02-02 12:49:02[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:85[0m [Scehma Tester] Validating Functional BaseModel: 1
[38;5;245m2026-02-02 12:49:02[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 1


from typing import List, Dict, Optional
from pydantic import BaseModel, Field


class OrderLogEntry(BaseModel):
    """Gold‑standard representation of a log entry emitted by the orders service.

    The model captures the minimal, semantically‑rich fields observed across
    sample payloads while preserving clear, snake_case naming.
    """

    level: str = Field(
        ..., description="Severity level of the log entry (e.g., 'INFO', 'WARN', 'ERROR')."
    )
    service: str = Field(
        ..., description="Name of the service that generated the log entry."
    )
    order_id: str = Field(
        ..., description="Unique identifier of the order associated with this event. Stored as a string to preserve primary‑key semantics."
    )
    status: str = Field(
        ..., description="Current status of the order (e.g., 'created', 'processed', 'shipped')."
    )
```python
def transform_to_models(parsed_dict: list[dict]) -> list[dict]:
    """
    Convert a list of loosely‑structured 

[38;5;245m2026-02-02 12:49:09[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 2
[38;5;245m2026-02-02 12:49:09[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:105[0m [Coder] Parser for Gold Schema: 2


Input: {'level': 'INFO', 'service': 'orders', 'order_id': 1001, 'status': 'created'} -- FAILED
```python
def transform_to_models(parsed_dict: list[dict]) -> list[dict]:
    """
    Transform a list of loosely‑parsed dictionaries into a list of dictionaries
    that conform to the ``OrderLogEntry`` schema.

    The function:
    * Coalesces possible source keys for each target field.
    * Casts values to their most appropriate Python type.
    * Ensures ``order_id`` is emitted as a string.
    """
    from typing import Any

    # ------------------------------------------------------------------
    # Helper: best‑effort type inference / coercion
    # ------------------------------------------------------------------
    def _coerce(value: Any) -> Any:
        """Convert strings that look like numbers, booleans or currency to native types."""
        if isinstance(value, str):
            v = value.strip()
            # Boolean strings
            low = v.lower()
            if low i

[38;5;245m2026-02-02 12:49:18[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m2336556215.py:138[0m [Code Tester] Stress-testing parser: 3
[38;5;245m2026-02-02 12:49:18[0m [36;20mswirl[0m [[32;20mINFO[0m] [38;5;245m1738028846.py:26[0m --- Finished Cluster 3 ---


Input: {'level': 'INFO', 'service': 'orders', 'order_id': '1001', 'status': 'created'} -- PASSED
--> Exported: parser_OrderLogEntry_struct_3.py
