In [1]:
import json
import os
import re
from collections import Counter
from copy import deepcopy
from typing import List, Optional

import tqdm as notebook_tqdm
from dotenv import load_dotenv
from pydantic import BaseModel, Field, constr, field_validator
from sentence_transformers import SentenceTransformer
from sklearn.cluster import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import logging as transformers_logging

from dq_swirl.clients.async_llm_client import AsyncLLMClient
from dq_swirl.ingestion.structure_analyzer import StructuralAnalyzer
from dq_swirl.rust_ingestion import smart_parse_batch

transformers_logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables (e.g. the LLM_API_KEY read further down) from the
# secrets file. The path is relative to the notebook's working directory.
load_dotenv("../secrets.env")

True

## Messy Data

In [3]:
# Deliberately heterogeneous raw inputs used to exercise the ingestion pipeline.
# Every entry is a single string; the formats mix key=value text, "key: value"
# text, JSON objects and several log-line layouts.
messy_data = [
    # --- "Order" records: key=value / "key: value" pairs with inconsistent
    # --- spacing, casing, key order, missing fields and trailing noise.
    "Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable",
    "Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp",
    "Order 1005: Buyer=Raj Patel, Total=1,200.50, Items: monitor, stand, cable",
    "Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard",
    "Order 1007: Buyer=Chris P., Location=Denver, CO, Total=$12.00, Items: stickers -- [DISCOUNT APPLIED]",
    "Order 1008: Buyer=O'Connor, S., Location=Portland, OR, Total=$0.00, Items: ",
    "Order 1011: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable",
    "Order 1012: Buyer=Sarah Liu, Location=Austin, TX, Total=$156.55, Items: headphones",
    "Order 1013: Buyer=Mike Turner, Location=Cleveland, OH, Total=$1299.99, Items: gaming pc, mouse",
    # NOTE(review): "Locadtion" looks like an intentional misspelling to test fuzzy key matching — confirm.
    "Order 1014: Buyer=Rachel Kim, Locadtion=Seattle, WA, Total=$89.50, Items: coffee maker",
    "Order 1015: Buyer=Chris Myers, Location=Cincinnati, OH, Total=$512.00, Items: monitor, desk lamp",
    "Order=1016, Buyer=Jake Myers, Total=$1,512.00, Items: monitor,",
    # --- Unrelated key/value noise.
    "Maples=Tree, Name:  Jae",
    # --- JSON user records with missing keys, nulls and mixed id / date formats.
    '{"id": "usr_001", "name": "Alex Johnson", "role": "admin", "isActive": true, "createdAt": "2025-11-02T09:14:23Z"}',
    '{"id": "usr_002", "name": "Maria Lopez", "email": "maria.lopez@example.com", "role": "editor", "isActive": null, "createdAt": "2025-12-18T16:47:10Z", "lastLoginIp": "192.168.1.42"}',
    '{"id": "usr_003", "email": "samir.patel@example.com", "role": "viewer", "isActive": false, "createdAt": "08/05/2024"}',
    '{"id": 4, "name": "Chen Wei", "email": "chen.wei@example.com", "isActive": true, "createdAt": null}',
    '{"id": "usr_005", "name": "Broken Record", "email": "broken@example.com"}',
    # --- The same record written with different whitespace around "=".
    "name =Sam, hobby = computers, id=1",
    "name=Sam, hobby=computers, id=1",
    "name = Sam, hobby = computers, id = 1",
    "version: 3, product: software",
    # Deeply nested JSON sample, currently disabled:
    # '{"user":{"id":123,"name":"Alice","profile":{"age":30,"hobbies":["reading","cycling","coding"],"address":{"street":"123 Main St","city":"Metropolis","zip":"12345"}}},"orders":[{"order_id":1001,"items":["book","pen"],"total":25.5},{"order_id":1002,"items":["laptop"],"total":1200.0}],"active":true}',
    "Order 1017: Buyer=Chris Myers, Location=Columbia, SC, Total=$512.00, Items: monitor, desk lamp, Discount: yes",
    # --- Log lines: free text, key=value, JSON, access-log and app-log layouts.
    "2026-01-30 14:22:01 INFO User login successful user_id=123",
    "2026-01-30 14:22:01 INFO User login successful",
    "level =INFO, user =Sam, id=1",
    "timestamp=2026-01-30T14:22:01Z level=INFO user=alice action=login success=true",
    "level=INFO cpu_usage=1,234.56 memory=512MB",
    '{"level":"INFO","service":"orders","order_id":1001,"status":"created"}',
    '[2026-01-31 17:11:22 +0000] [7] [INFO] 127.0.0.1:56718 - - [31/Jan/2026:17:11:22 +0000] "GET /health 1.1" 200 16 "-" "curl/8.14.1"',
    "2026-01-31 17:11:00 swirl [DEBUG] saq_worker.py:28 Running cron job health check",
]

## Data Preprocessing

In [4]:
#################################################################################
################################# Grammar Parsing ###############################
#################################################################################

# Route every raw message to one of two parsers based on its outer delimiters:
# strings wrapped in [...] or {...} are treated as JSON candidates, everything
# else goes to the grammar-based parser.
string_batch = []
string_json_batch = []
for msg in messy_data:
    looks_like_json = (msg.startswith("[") and msg.endswith("]")) or (
        msg.startswith("{") and msg.endswith("}")
    )
    if looks_like_json:
        string_json_batch.append(msg)
    else:
        string_batch.append(msg)

print(f"\nUNSTRUCTURED STRING SAMPLES: {len(string_batch)}\n")
print(f"JSON STRING SAMPLES: {len(string_json_batch)}\n")


# `smart_parse_batch` yields (raw, parsed) pairs, as the unpacking below relies on.
string_samples = smart_parse_batch(string_batch)

for msg, parsed in string_samples:
    print(f"Original: {msg}\nParsed: {parsed}\n")


# JSON candidates that fail to decode are collected in `leftovers` and are NOT
# part of `data_samples`, i.e. they are excluded from the analysis below.
json_samples = []
leftovers = []

for msg in string_json_batch:
    try:
        data = json.loads(msg)
    except json.JSONDecodeError:
        # Only malformed JSON is expected here; any other error is a real bug
        # and should surface instead of being silently swallowed.
        leftovers.append((msg, None))
    else:
        json_samples.append((msg, data))


data_samples = string_samples + json_samples

print(f"\nTOTAL SAMPLES: {len(data_samples)}\nERROR SAMPLES: {len(leftovers)}\n")

#################################################################################
############################### Structure Analyzer ##############################
#################################################################################


analyzer = StructuralAnalyzer(ignore_unparsed=False)

# hash -> number of records sharing that structural fingerprint
hash_counts = Counter()
# hash -> first fingerprint result seen for that hash
unique_structures = {}

for raw, parsed in data_samples:
    result = analyzer.generate_fingerprint(raw, parsed)
    signature_hash = result["hash"]
    hash_counts[signature_hash] += 1
    # Keep the first result per hash; later duplicates only bump the counter.
    unique_structures.setdefault(signature_hash, result)

print(
    f"Detected {len(unique_structures)} unique schemas across {len(data_samples)} records.\n"
)

for h, count in sorted(hash_counts.items()):
    print(f"Schema {h} ({count} occurrences):")
    print(f"  Layout: {unique_structures[h]['signature']}")
    print("-" * 30)

#################################################################################
############################# Structural Clustering #############################
#################################################################################


def conjoin_signatures(registry_output: dict) -> dict:
    """Cluster structural signatures by the spelling of their field names.

    The field names of every signature are joined into one string, vectorised
    with character-level TF-IDF (3- to 5-grams) and clustered with HDBSCAN
    (euclidean metric, minimum cluster size 2).

    Args:
        registry_output: Mapping of signature hash -> registry entry. Each
            entry must contain a ``"signature"`` mapping whose keys are the
            field names of that structure.

    Returns:
        A dict ordered by signature hash. Each value has the keys
        ``"cluster_id"`` (int, ``-1`` for HDBSCAN noise), ``"keys"`` (list of
        field names) and ``"is_outlier"`` (plain ``bool``, so the result can be
        passed to ``json.dumps`` directly).
    """
    hashes = list(registry_output)
    field_names = {h: list(registry_output[h]["signature"].keys()) for h in hashes}

    if len(hashes) < 2:
        # A cluster needs at least two members. With fewer signatures the
        # vectoriser (empty input) or HDBSCAN (min_samples > n_samples) would
        # raise, so report every signature as an outlier instead.
        labels = [-1] * len(hashes)
    else:
        signatures_as_text = [" ".join(field_names[h]) for h in hashes]

        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
        matrix = vectorizer.fit_transform(signatures_as_text)

        clusterer = HDBSCAN(
            min_cluster_size=2,
            metric="euclidean",
            copy=True,
        )
        # The sparse TF-IDF matrix is densified before clustering.
        labels = clusterer.fit_predict(matrix.toarray())

    conjoined_map = {
        h: {
            "cluster_id": int(label),
            "keys": field_names[h],
            # bool(...) converts numpy.bool_ into a JSON-serialisable bool.
            "is_outlier": bool(label == -1),
        }
        for h, label in zip(hashes, labels)
    }

    return dict(sorted(conjoined_map.items()))


#################################################################################
############################## Semantic Clustering ##############################
#################################################################################


def conjoin_signatures_semantic(
    registry_output: dict,
    embedding_model: str = "all-MiniLM-L6-v2",
    cache_dir: str = "./.models",
) -> dict:
    """Cluster structural signatures by the meaning of their field names.

    The field names of every signature (minus the ``"_unparsed"`` marker) are
    joined into one comma-separated string, embedded with a
    ``SentenceTransformer`` model and clustered with HDBSCAN using cosine
    distance.

    Args:
        registry_output: Mapping of signature hash -> registry entry. Each
            entry must contain a ``"signature"`` mapping whose keys are the
            field names of that structure. The mapping is not modified.
        embedding_model: Name of the sentence-transformers model to load.
        cache_dir: Folder handed to ``SentenceTransformer`` as ``cache_folder``.

    Returns:
        A dict ordered by signature hash. Each value has the keys
        ``"cluster_id"`` (int, ``-1`` for HDBSCAN noise), ``"keys"`` (all field
        names, *including* ``"_unparsed"`` when present) and ``"is_outlier"``
        (plain ``bool``).
    """
    hashes = list(registry_output)

    if len(hashes) < 2:
        # A cluster needs at least two members, so there is nothing to embed
        # or cluster: skip the model load and mark everything as an outlier.
        labels = [-1] * len(hashes)
    else:
        # "_unparsed" is left out of the embedded text only; filtering the keys
        # here avoids deep-copying the whole registry just to drop one key.
        signatures_as_text = [
            ", ".join(
                key for key in registry_output[h]["signature"] if key != "_unparsed"
            )
            for h in hashes
        ]

        model = SentenceTransformer(embedding_model, cache_folder=cache_dir)
        embeddings = model.encode(signatures_as_text)

        clusterer = HDBSCAN(
            min_cluster_size=2,
            min_samples=1,
            metric="cosine",
            # NOTE(review): 0.18 looks hand-tuned for the sample data — confirm.
            cluster_selection_epsilon=0.18,
            cluster_selection_method="eom",
            copy=True,
        )
        labels = clusterer.fit_predict(embeddings.astype("float64"))

    conjoined_map = {
        h: {
            "cluster_id": int(label),
            "keys": list(registry_output[h]["signature"].keys()),
            # bool(...) converts numpy.bool_ into a JSON-serialisable bool.
            "is_outlier": bool(label == -1),
        }
        for h, label in zip(hashes, labels)
    }

    return dict(sorted(conjoined_map.items()))


def _group_signatures_by_cluster(cluster_map: dict) -> dict:
    """Invert a ``hash -> cluster info`` map into ``cluster_id -> [signature, ...]``.

    Args:
        cluster_map: Output of ``conjoin_signatures`` or
            ``conjoin_signatures_semantic``: each value carries
            ``"cluster_id"``, ``"keys"`` and ``"is_outlier"``.

    Returns:
        Dict keyed by cluster id (in order of first appearance). Each value is
        a list of ``{"signature_hash", "fields", "is_outlier"}`` dicts that is
        safe to pass to ``json.dumps``.
    """
    grouped = {}
    for signature_hash, info in cluster_map.items():
        grouped.setdefault(info["cluster_id"], []).append(
            {
                "signature_hash": signature_hash,
                "fields": info["keys"],
                # Coerce to a plain bool: numpy booleans are not JSON-serialisable.
                "is_outlier": bool(info["is_outlier"]),
            }
        )
    return grouped


# run structure clustering
structure_cluster_map = conjoin_signatures(analyzer.signature_map)
structure_clusters = _group_signatures_by_cluster(structure_cluster_map)
print(f"Structural Clusters: \n{json.dumps(structure_clusters, indent=4)}\n")


# run semantic clustering
semantic_cluster_map = conjoin_signatures_semantic(analyzer.signature_map)
semantic_clusters = _group_signatures_by_cluster(semantic_cluster_map)
print(f"Semantic Clusters: \n{json.dumps(semantic_clusters, indent=4)}\n")




UNSTRUCTURED STRING SAMPLES: 25

JSON STRING SAMPLES: 6

Original: Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable
Parsed: {'Order': '1001', 'Buyer': 'John Davis', 'Location': 'Columbus, OH', 'Total': '$742.10', 'Items': 'laptop, hdmi cable'}

Original: Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp
Parsed: {'Order': '1004', 'Buyer': 'AMANDA SMITH', 'Location': 'Seattle, WA', 'Total': '$50.00', 'Items': 'desk lamp'}

Original: Order 1005: Buyer=Raj Patel, Total=1,200.50, Items: monitor, stand, cable
Parsed: {'Order': '1005', 'Buyer': 'Raj Patel', 'Total': '1,200.50', 'Items': 'monitor, stand, cable'}

Original: Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard
Parsed: {'Order': '1006', 'total': '$89.99', 'location': 'Miami, FL', 'buyer': 'Elena Rossi', 'Items': 'keyboard'}

Original: Order 1007: Buyer=Chris P., Location=Denver, CO, Total=$12.00, Items: stickers -- [DIS

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 958.94it/s, Materializing param=pooler.dense.weight]                              


Semantic Clusters: 
{
    "1": [
        {
            "signature_hash": "28d9f3b14d0e5516a186062212502d0c",
            "fields": [
                "order",
                "buyer",
                "locadtion",
                "total",
                "items"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "461a895ef9c5046dd2cb5026b6a62de0",
            "fields": [
                "order",
                "buyer",
                "location",
                "total",
                "items",
                "discount"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "50eb97a85647221ecc7f65f74d68d156",
            "fields": [
                "order",
                "buyer",
                "total",
                "items"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "fd116cd512d5ecd2e59edf12fc258b32",
            "fields": [
           

In [33]:
# Expand each semantic cluster into the raw / parsed records that sit behind
# its signatures: cluster id -> list of {"signature_hash", "raw", "parsed"}.
cluster_dict = {}
for cluster_id, signature_entries in semantic_clusters.items():
    cluster_dict[cluster_id] = [
        {
            "signature_hash": entry["signature_hash"],
            "raw": record["raw"],
            "parsed": record["parsed"],
        }
        for entry in signature_entries
        for record in analyzer.signature_map[entry["signature_hash"]]["records"]
    ]

cluster_dict

{1: [{'signature_hash': '28d9f3b14d0e5516a186062212502d0c',
   'raw': 'Order 1014: Buyer=Rachel Kim, Locadtion=Seattle, WA, Total=$89.50, Items: coffee maker',
   'parsed': {'Order': '1014',
    'Buyer': 'Rachel Kim',
    'Locadtion': 'Seattle, WA',
    'Total': '$89.50',
    'Items': 'coffee maker'}},
  {'signature_hash': '461a895ef9c5046dd2cb5026b6a62de0',
   'raw': 'Order 1017: Buyer=Chris Myers, Location=Columbia, SC, Total=$512.00, Items: monitor, desk lamp, Discount: yes',
   'parsed': {'Order': '1017',
    'Buyer': 'Chris Myers',
    'Location': 'Columbia, SC',
    'Total': '$512.00',
    'Items': 'monitor, desk lamp',
    'Discount': 'yes'}},
  {'signature_hash': '50eb97a85647221ecc7f65f74d68d156',
   'raw': 'Order 1005: Buyer=Raj Patel, Total=1,200.50, Items: monitor, stand, cable',
   'parsed': {'Order': '1005',
    'Buyer': 'Raj Patel',
    'Total': '1,200.50',
    'Items': 'monitor, stand, cable'}},
  {'signature_hash': '50eb97a85647221ecc7f65f74d68d156',
   'raw': 'Order=1

## LLM Client Setup

In [34]:
# LLM connection settings.
# Model identifier handed to the client (alternative tried earlier:
# "openai/google/gemma-3-27b-it").
model = "openai/gpt-oss-120b:exacto"
# Endpoint is pinned to OpenRouter; it could instead be read from the
# LLM_BASE_URL environment variable.
api_base_url = "https://openrouter.ai/api/v1"
# NOTE(review): `api_key` is not passed to the client constructor in the next
# cell — presumably the client reads the key from the environment; confirm.
api_key = os.environ.get("LLM_API_KEY")

In [35]:
# Async client used for every LLM call below (model id first, then base URL).
client = AsyncLLMClient(model, api_base_url)

## LLM Prompts

In [36]:
# Prompts for the schema-inference call.
#
# PYDANTIC_SYSTEM_PROMPT fixes the role, the modelling rules and the allowed
# imports for the generated Pydantic code.
# NOTE(review): the instructions ask the model to decide required vs. optional
# from semantic meaning, while the constraints say any field missing from some
# rows must be `Optional` — confirm which rule should win.
PYDANTIC_SYSTEM_PROMPT = """You are a Data Architect. Your goal is to perform unsupervised schema inference on a sample of unstructured data.

Generate a Pydantic `BaseModel` class that represents the "Gold Standard" foundation for this data pattern. 

Instructions:
- Normalization: Suggest clean, snake_case keys for the identified fields.
- If you see a string value for a field that follows a consistent structure (e.g., "<city>, <state>") then make sure that structure is accurately typed in the BaseModel.
- Determine what fields should be required vs optional based on overall semantic meaning of the entity you are creating a BaseModel class for.

Constraints:
- Include a detailed description for each field using the `Field` class to explain what the field is and if there are any expected structural patterns (e.g., `state` should be two letters).
- Create supplemental BaseModel classes where necessary to preserve semantic clarity.
- Do NOT include any regex.
- If a field appears in some rows but not others, mark it as `Optional`.
- You are only allowed to use the following imports: "from typing import List, Dict, Optional; from pydantic import BaseModel, Field".
- Return ONLY the Pydantic class definitions (you are allowed to generate multiple as long as they are logically linked).
"""

# User-turn template; `{samples}` is filled in with `str.format` using the raw
# strings of one cluster.
PYDANTIC_USER_PROMPT = """Please analyze the following representative samples of a new data pattern and generate the Pydantic 'Foundation' model.

### Data Samples:
{samples}
"""

## Generate Pydantic BaseModel Class

In [None]:
class ModelResponseStructure(BaseModel):
    # Structured-output contract for the schema-inference call: the LLM must
    # answer with a JSON object holding exactly these two required fields.

    # Python source produced by the model.
    code_string: str = Field(description="generated python code")
    # Name of the class inside `code_string` that callers should look up.
    entrypoint_class_name: str = Field(
        description="name of entrypoint base model class in the code generated",
    )


def extract_python_code(text: str) -> str:
    """
    Extracts the first fenced code block from a string.

    The opening fence may carry an optional language tag (``python``,
    ``python3`` or ``py``, in any letter case). The tag is dropped only when it
    is a complete word, so code that merely starts with those letters (for
    example ``pythonic = 1``) is returned intact.

    Args:
        text: Text that may contain a triple-backtick fenced block.

    Returns:
        str: The extracted source code, stripped of surrounding whitespace, or
        an empty string if no fenced block is found.
    """
    # `\b` stops the optional tag from eating the first letters of the code.
    block_pattern = r"```(?:(?:python3?|py)\b)?\s*(.*?)\s*```"
    match = re.search(block_pattern, text, re.DOTALL | re.IGNORECASE)

    return match.group(1).strip() if match else ""


for c_id, records in cluster_dict.items():
    string_li = [r["raw"] for r in records]
    messages = [
        {"role": "system", "content": PYDANTIC_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": PYDANTIC_USER_PROMPT.format(
                samples=string_li,
            ),
        },
    ]

    buffer = []
    response = await client.chat(
        messages=messages,
        stream=True,
        temperature=0.0,
        response_format=ModelResponseStructure,
    )
    async for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            print(content, end="", flush=True)
            buffer.append(content)

    resp = "".join(buffer)
    resp: ModelResponseStructure = ModelResponseStructure(**json.loads(resp))

    if not resp.code_string.startswith("```python"):
        resp.code_string = f"```python\n{resp.code_string}\n```"

    code = extract_python_code(resp.code_string)

    namespace = {}
    exec(code, globals(), namespace)

    # 3. Access the function from the namespace dictionary
    cls = namespace.get(resp.entrypoint_class_name)
    cls.model_rebuild(_types_namespace=namespace)
    schema = cls.model_json_schema()

    fname = f"c{c_id}_{resp.entrypoint_class_name.lower()}_base_model.py"
    code = code.encode("ascii", errors="ignore").decode("ascii")
    with open(fname, "w", encoding="utf-8") as f:
        f.write(code)
        print(f"\nSuccessfully wrote code to {fname}")

    print()

{
  "code_string": "from typing import List, Optional\nfrom pydantic import BaseModel, Field\n\n\nclass Location(BaseModel):\n    \"\"\"Geographic location of the buyer.\n    \n    The location is represented as a city and a two‑letter US state abbreviation.\n    \"\"\"\n    city: str = Field(..., description=\"Name of the city where the buyer is located.\")\n    state: str = Field(..., description=\"Two‑letter US state abbreviation (e.g., \\\"WA\\\", \\\"OH\\\").\")\n\n\nclass Order(BaseModel):\n    \"\"\"Representation of a single purchase order.\n    \n    Each order includes an identifier, buyer information, optional location, the monetary total,\n    a list of purchased items, and an optional flag indicating whether a discount was applied.\n    \"\"\"\n    order_id: int = Field(..., description=\"Unique numeric identifier for the order.\")\n    buyer: str = Field(..., description=\"Full name of the buyer as it appears in the source data.\")\n    location: Optional[Location] = Fiel

In [40]:
# Example from `c1_order_base_model.py`

from typing import List, Optional

from pydantic import BaseModel, Field


class Location(BaseModel):
    """Geographic location of the buyer.

    The location is represented as a city and a two-letter US state abbreviation.
    """

    # Both parts are required. `state` is a plain `str`: the two-letter
    # convention is stated in the description only and is not enforced by a
    # validator in this class.
    city: str = Field(..., description="Name of the city where the buyer is located.")
    state: str = Field(
        ..., description='Two-letter US state abbreviation (e.g., "WA", "OH").'
    )


class Order(BaseModel):
    """Representation of a single purchase order.

    Each order includes an identifier, buyer information, optional location, the monetary total,
    a list of purchased items, and an optional flag indicating whether a discount was applied.
    """

    # Required fields: order_id, buyer, total. Everything else has a default.
    order_id: int = Field(..., description="Unique numeric identifier for the order.")
    buyer: str = Field(
        ..., description="Full name of the buyer as it appears in the source data."
    )
    # Nested model; defaults to None when a record carries no location.
    location: Optional[Location] = Field(
        None,
        description="Optional location object containing city and state. May be missing in some records.",
    )
    # NOTE(review): the description says "$" and commas are stripped during
    # parsing, but this class defines no validator that does so — presumably
    # the value has to be cleaned before it reaches this model; confirm.
    total: float = Field(
        ...,
        description="Total amount of the order in US dollars. Dollar sign and commas are stripped during parsing.",
    )
    # default_factory gives every instance its own empty list.
    items: List[str] = Field(
        default_factory=list,
        description="List of item names purchased. The list may be empty if no items are recorded.",
    )
    # Tri-state flag: True / False / None (not stated in the record).
    discount_applied: Optional[bool] = Field(
        None,
        description="Flag indicating whether a discount was applied to the order. True when a discount marker is present; otherwise False or omitted.",
    )

In [None]:
# Example `c2_logentry_base_model.py`

from typing import Dict, List, Optional

from pydantic import BaseModel, Field


class LogEntry(BaseModel):
    """A normalized representation of a log line.

    The model captures common elements observed across heterogeneous log formats,
    providing a unified schema for downstream processing.
    """

    # Only `level` is required; every other field defaults to None.
    # `timestamp` is kept as a string: no datetime parsing happens in this class.
    timestamp: Optional[str] = Field(
        None,
        description="Timestamp of the log entry in the form 'YYYY-MM-DD HH:MM:SS'. "
        "Present when the source includes an explicit datetime component.",
    )
    level: str = Field(
        ...,
        description="Log severity level, such as INFO, WARN, ERROR, etc. "
        "This field is required because every sample contains a level indicator.",
    )
    message: Optional[str] = Field(
        None,
        description="Human-readable message describing the event, e.g., 'User login successful'. "
        "May be absent when the log format focuses on key/value pairs only.",
    )
    user: Optional[str] = Field(
        None,
        description="Username associated with the event when provided (e.g., 'Sam').",
    )
    # `user_id` and `record_id` are separate fields so that a generic `id=`
    # value is not mistaken for a user identifier.
    user_id: Optional[int] = Field(
        None,
        description="Numeric identifier of the user performing the action, extracted from a key like 'user_id'.",
    )
    record_id: Optional[int] = Field(
        None,
        description="Generic numeric identifier present in some logs (e.g., 'id=1'). "
        "Named generically to avoid conflating with user_id.",
    )

## LangGraph Robustness and StateGraph