In [None]:
from openai import OpenAI
import json

In [1]:
def _parse_chunk_response(raw_text):
    """Parse the model's reply into Python objects, tolerating common wrappers.

    Strips surrounding whitespace and single quotes, then removes an optional
    Markdown code fence (``` or ```json) before handing the text to json.loads.
    """
    cleaned = raw_text.strip().strip("'").strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.lstrip()
        # Valid JSON can never start with the letters "json", so a leading
        # "json" here can only be the fence's language tag.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    return json.loads(cleaned)


def get_chunk(text, prefix, api_client=None):
    """Ask the model to split one patent section into semantic chunks.

    Args:
        text: JSON-serializable section content; it is sent to the model as
            ``json.dumps(text, indent=2)``.
        prefix: Section label; the model is instructed to prepend
            ``"<prefix>_"`` to every chunk id.
        api_client: Optional object exposing ``responses.create(...)``.
            When omitted, the module-level ``client`` is used.
            NOTE(review): no visible cell creates ``client`` -- confirm it is
            defined before this function is called on a fresh kernel.

    Returns:
        The parsed JSON reply. The prompt requests a list of
        ``{"chunk_id": ..., "chunk_text": ...}`` objects, but the model's
        output is not validated against that shape here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after
            stripping quotes and Markdown code fences.
    """
    CHUNK_PROMPT="""
    You are a semantic document chunker working on patent data. The content below is from a section of a patent document (e.g., abstract, description, claims, etc.).

    Your task is to:
    - Split the section into multiple coherent **semantic chunks**, labeled from `chunk_1` to `chunk_N`.
    - Each chunk should be **approximately 300–400 words**, depending on content flow.
    - If the input section is too short for multiple chunks, return just one chunk.
    - Maintain original wording and structure — **do not summarize or paraphrase**.
    - Ensure each chunk captures a meaningful segment (e.g., full idea, concept, implementation detail).
    - Each chunk should be logically self-contained and usable independently.
    - Do NOT put json directly into chunks , convert into a paragraph with all information

    NOTE: Do NOT use your information just use whatever is given 
    Return the output in the following JSON format:


    [
    {
        "chunk_id": "chunk_1",
        "chunk_text": "..."
    },
    {
        "chunk_id": "chunk_2",
        "chunk_text": "..."
    }
    ]


    """
    system_prompt = CHUNK_PROMPT + f"\nADD THIS PREFIX IN EACH CHUNK ID: {prefix}_"

    if api_client is None:
        # Fall back to the notebook-level OpenAI client.
        api_client = client

    response = api_client.responses.create(
        model="gpt-4.1-2025-04-14",
        input=[
            {"role": "system","content": system_prompt},
            {"role": "user","content": json.dumps(text,indent=2)},
        ],
    )
    # Models frequently wrap JSON in ```json fences; parse tolerantly.
    return _parse_chunk_response(response.output_text)

In [None]:
import json

def create_chunks(json_path):
    """Load a patent-application JSON file and chunk each major section.

    Reads the file at ``json_path``, looks up the ``"us-patent-application"``
    root object, and passes the root attributes plus the bibliographic data,
    abstract, description and claims sections to ``get_chunk`` (in that
    order). The drawings section is not processed.

    Args:
        json_path: Path to the patent JSON file (opened as UTF-8).

    Returns:
        A ``(chunks, file_name)`` tuple: ``chunks`` maps each section label to
        whatever ``get_chunk`` returned for it, and ``file_name`` is the
        root's ``"@file"`` attribute (``"N/A"`` when absent).
    """
    with open(json_path, "r", encoding="utf8") as handle:
        document = json.load(handle)

    application = document.get("us-patent-application", {})

    # Output key -> attribute name on the root object; absent ones become "N/A".
    attribute_names = {
        "language": "@lang",
        "dtd_version": "@dtd-version",
        "file": "@file",
        "status": "@status",
        "id": "@id",
        "country": "@country",
        "date_produced": "@date-produced",
        "date_published": "@date-publ",
    }
    other_info = {
        label: application.get(attribute, "N/A")
        for label, attribute in attribute_names.items()
    }

    # Sections are looked up by the same name that is used as the chunk-id
    # prefix; a missing section is sent to the chunker as an empty dict.
    section_names = (
        "us-bibliographic-data-application",
        "abstract",
        "description",
        "claims",
    )

    chunks = {"meta_data": get_chunk(other_info, "meta_data")}
    for section in section_names:
        chunks[section] = get_chunk(application.get(section, {}), section)

    return chunks, other_info["file"]

# Example usage:
# create_chunks("your_patent_file.json")


In [None]:
# Raw string keeps the Windows backslashes literal instead of relying on
# invalid escape sequences (\D, \P, \R, \p), which newer Python versions warn about.
chunks, file_id = create_chunks(r"D:\Desktop\Projects\Research Agent\patent.json")

In [None]:
# Display the chunks produced for the bibliographic-data section.
chunks['us-bibliographic-data-application']

In [None]:
# Raw string: same path as before, but without mixing invalid escapes
# (\D, \P, \R) with a manually doubled backslash before "test.txt".
text_path = r"D:\Desktop\Projects\Research Agent\test.txt"
with open(text_path, "r", encoding="utf8") as f:
    text = f.read()

In [None]:
def output_verifier(json_ouput, api_client=None):
    """Ask the model whether an output has the ``{"chunks": {...}}`` shape.

    Args:
        json_ouput: The output to check. A string is sent as-is; any other
            value (e.g. an already-parsed list or dict) is serialized with
            ``json.dumps(..., indent=2)`` first so the message content is
            always text.
        api_client: Optional object exposing ``responses.create(...)``.
            When omitted, the module-level ``client`` is used.
            NOTE(review): no visible cell creates ``client`` -- confirm it is
            defined before this function is called on a fresh kernel.

    Returns:
        The model's reply text with surrounding whitespace removed. The
        prompt asks for ``"Yes"`` or ``"No"``, but the reply is not
        validated here.
    """

    OUTPUT_VERIFIER = """
    If the model output is  like this:
    {
        "chunks": {
            ...
        }
    }
    Then return "Yes", otherwise return "No".
    """

    if isinstance(json_ouput, str):
        user_content = json_ouput
    else:
        # Parsed objects must be turned into text before being sent as content.
        user_content = json.dumps(json_ouput, indent=2)

    if api_client is None:
        # Fall back to the notebook-level OpenAI client.
        api_client = client

    response = api_client.responses.create(
        model="gpt-4.1-2025-04-14",
        input=[
            {"role": "system","content": OUTPUT_VERIFIER},
            {"role": "user","content": user_content},
        ],
    )

    # Strip stray whitespace/newlines so callers can compare against "Yes"/"No".
    return response.output_text.strip()
