In [27]:
import base64
import json
import math
import re
import warnings

import fitz  # PyMuPDF
from dotenv import load_dotenv
from openai import OpenAI


In [28]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key that OpenAI() reads — confirm.
load_dotenv()

True

In [29]:
# Single OpenAI client shared by every request made through call_chat().
client = OpenAI()

# Model used for the final synthesis pass over the merged steps.
MODEL_SYNTH = "gpt-4o"
# Model used for per-batch extraction. NOTE: this is currently the same model
# as MODEL_SYNTH; point it at a cheaper model if per-batch cost matters.
MODEL_BATCH = "gpt-4o"

# Number of pages sent per request; lower it if a request overflows the token limit.
BATCH_SIZE = 30
# Default rendering resolution used by pdf_to_b64_images().
DPI = 220

# Shape of the JSON the model is asked to return. It is serialized verbatim
# into the prompts with json.dumps, so key order and placeholder values matter.
# The per-batch prompt additionally asks for a 'page_hint' field that is not
# listed here.
SCHEMA = {
    "product": "string",
    "language": "string",
    "steps": [
        {
            "step": 1,
            "title": "string",
            "description": "string",
            "tools": [],
            "parts": [],
            "warnings": [],
            "notes": [],
        }
    ],
    "assumptions": [],
    "uncertainties": [],
}


In [30]:
def pdf_to_b64_images(pdf_path, dpi=DPI):
    """Render each page of a PDF to PNG and return the pages base64-encoded.

    Args:
        pdf_path: Path of the PDF, handed to ``fitz.open``.
        dpi: Resolution passed to ``page.get_pixmap``.

    Returns:
        A list with one base64 string (decoded as UTF-8 text) per page, in
        document order. The images are rendered without an alpha channel.
    """
    # The document is closed when the ``with`` block exits, including on error.
    with fitz.open(pdf_path) as document:
        return [
            base64.b64encode(
                page.get_pixmap(dpi=dpi, alpha=False).tobytes("png")
            ).decode("utf-8")
            for page in document
        ]

def build_parts_for_batch(images_b64, start_idx, end_idx):
    """Build the chat message content parts for one batch of page images.

    The first part is always the text instruction (callers may append to its
    ``"text"``). It is followed, for every page in ``[start_idx, end_idx)``,
    by a short text label carrying the page's 0-based index and then the
    image itself. Without the labels the model has no way to know which index
    belongs to which image, so the requested ``page_hint`` values could not be
    grounded.

    Args:
        images_b64: Base64-encoded PNG strings, one per page of the document.
        start_idx: Index of the first page in the batch (inclusive).
        end_idx: Index one past the last page in the batch (exclusive).

    Returns:
        A list of content-part dicts (``{"type": "text", ...}`` and
        ``{"type": "image_url", ...}``) ready to be used as a message's
        ``content``.
    """
    instr = (
        "Extract ONLY assembly steps (ignore safety/warranty/retighten pages). "
        # 'page_hint' is not part of SCHEMA, so it has to be exempted explicitly;
        # otherwise the two instructions contradict each other.
        "Return JSON with this schema (no extra fields except 'page_hint'). "
        "For each step include a 'page_hint' array of page indices (0-based) "
        f"within this batch [{start_idx}-{end_idx-1}] where the action is shown. "
        "Each image is preceded by a text label stating its page index. "
        + json.dumps(SCHEMA, ensure_ascii=False)
    )
    parts = [{"type": "text", "text": instr}]
    for page_idx, b64 in enumerate(images_b64[start_idx:end_idx], start=start_idx):
        # Label first, image second, so each label directly precedes its page.
        parts.append({"type": "text", "text": f"Page index {page_idx}:"})
        parts.append({"type": "image_url",
                      "image_url": {"url": f"data:image/png;base64,{b64}"}})
    return parts

def call_chat(parts, model):
    """Send a single user message to the chat completions API and return its text.

    Args:
        parts: Content parts (text / image_url dicts) used as the message content.
        model: Name of the model to call.

    Returns:
        The first choice's message content with surrounding whitespace removed.
        An empty string is returned when the API reports no content, so callers
        can treat it like any other unparseable reply.
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": parts}],
        temperature=0
    )
    # ``content`` can be None (the API declares it nullable, e.g. on refusals);
    # calling .strip() on it directly would raise AttributeError.
    content = resp.choices[0].message.content
    return (content or "").strip()

def strip_code_fences(s):
    """Remove Markdown code fences from a model reply.

    If the reply contains a complete fenced block whose fences sit on their
    own lines, the content of the first such block is returned, even when
    prose surrounds it. Otherwise a leading ```` ``` ```` / ```` ```json ````
    and a trailing ```` ``` ```` are stripped from the ends of the text.

    Args:
        s: Raw reply text.

    Returns:
        The text with fences removed and surrounding whitespace stripped.
    """
    text = s.strip()
    # The opening fence must start a line. Valid JSON never has a line that
    # begins with ``` so this cannot match backticks inside a JSON string.
    fenced = re.search(
        r"^```[\w-]*[ \t]*\r?\n(.*?)\r?\n[ \t]*```[ \t\r]*$",
        text,
        flags=re.DOTALL | re.MULTILINE,
    )
    if fenced:
        return fenced.group(1).strip()
    # Fallback: single-line or unterminated fences at the very start/end.
    return re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.IGNORECASE)

def try_json(s):
    """Parse a model reply as JSON on a best-effort basis.

    Code fences are removed with ``strip_code_fences`` before parsing.

    Returns:
        The decoded value, or ``None`` if stripping or decoding raises any
        ``Exception``.
    """
    try:
        cleaned = strip_code_fences(s)
        parsed = json.loads(cleaned)
    except Exception:
        # Deliberately broad: any failure simply means "no usable JSON".
        parsed = None
    return parsed

def merge_partial_steps(partials):
    """Concatenate the steps of all batch results, dropping duplicates.

    Two steps are duplicates when their title and description match after
    trimming and lower-casing. The first occurrence is kept; the page hints of
    later duplicates are merged into that first occurrence (not into whatever
    step happened to be appended last), so the hints stay with the right step.

    Args:
        partials: Iterable of per-batch results. Entries that are not dicts
            (e.g. ``None``) and steps that are not dicts are skipped.

    Returns:
        A list of step dicts in first-seen order. The dicts are shallow copies,
        so the caller's input is not modified.
    """
    def _norm(value):
        # Titles/descriptions may come back as null or non-string values.
        return str(value or "").strip().lower()

    def _hints(value):
        if value is None:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    first_seen = {}  # dedup key -> the kept step dict
    merged_steps = []
    for part in partials or []:
        if not isinstance(part, dict):
            continue
        for st in part.get("steps") or []:
            if not isinstance(st, dict):
                continue
            key = (_norm(st.get("title")), _norm(st.get("description")))
            kept = first_seen.get(key)
            if kept is None:
                kept = dict(st)
                first_seen[key] = kept
                merged_steps.append(kept)
                continue
            # Duplicate: fold its page hints into the matching kept step.
            extra = st.get("page_hint")
            if extra:
                kept["page_hint"] = sorted(
                    set(_hints(kept.get("page_hint"))) | set(_hints(extra))
                )
    return merged_steps

def synthesize_full_json(merged_steps, product_hint="unknown", language_hint="auto"):
    """Ask the synthesis model to turn candidate steps into the final JSON.

    Args:
        merged_steps: Candidate step dicts (possibly partial or out of order).
        product_hint: Product name passed to the model as a hint and used in
            the fallback result.
        language_hint: Language passed to the model as a hint and used in the
            fallback result.

    Returns:
        A dict following ``SCHEMA``. If the model's reply is not a JSON object
        with a list under ``"steps"``, the candidate steps are wrapped into the
        schema instead. Step dicts are renumbered 1..n by position in either
        case.
    """
    prompt = (
        "You are given a list of assembly steps (possibly partial or out of order). "
        "Produce a FINAL, COMPLETE assembly JSON strictly using this schema: "
        + json.dumps(SCHEMA, ensure_ascii=False) +
        "\nRules:\n"
        "- Deduplicate and order the steps into a coherent end-to-end sequence.\n"
        "- Ignore maintenance (e.g., 'retighten after 2 weeks').\n"
        "- If the manual clearly has more steps, include them; infer reasonable actions between images.\n"
        "- Keep it concise and actionable; include tools/parts when visible.\n"
        "- Do not add any fields not present in the schema."
        f"\nHints: product='{product_hint}', language='{language_hint}'."
    )

    parts = [
        {"type": "text", "text": prompt},
        {"type": "text", "text": "Here are the candidate steps (JSON):"},
        {"type": "text", "text": json.dumps({"steps": merged_steps}, ensure_ascii=False)}
    ]

    txt = call_chat(parts, MODEL_SYNTH)
    data = try_json(txt)
    # A truthy non-dict reply (e.g. a JSON list) or a null/missing "steps"
    # would crash below, so treat those like an unparseable reply.
    if not (isinstance(data, dict) and isinstance(data.get("steps"), list)):
        # last resort: wrap into schema
        data = {"product": product_hint, "language": language_hint,
                "steps": list(merged_steps or []),
                "assumptions": [], "uncertainties": []}
    # renumber steps sequentially; non-dict entries are left as they are
    for number, step in enumerate(data["steps"], start=1):
        if isinstance(step, dict):
            step["step"] = number
    return data

def extract_all(pdf_path):
    """Run the whole pipeline: render pages, extract per batch, merge, synthesize.

    All pages are rendered up front, then sent to the batch model in groups of
    ``BATCH_SIZE``. A batch whose reply is not a non-empty JSON object is
    retried once with a stricter instruction; if the retry fails too, the
    batch contributes no steps and a ``RuntimeWarning`` names the affected
    pages so the loss is not silent.

    Args:
        pdf_path: Path of the assembly manual PDF.

    Returns:
        The final dict produced by ``synthesize_full_json``.
    """
    def _usable(candidate):
        # Downstream code calls .get() on each partial, so only a non-empty
        # dict counts as a successful extraction.
        return isinstance(candidate, dict) and bool(candidate)

    images = pdf_to_b64_images(pdf_path)
    n = len(images)
    partials = []
    for i in range(0, n, BATCH_SIZE):
        j = min(i + BATCH_SIZE, n)
        parts = build_parts_for_batch(images, i, j)
        js = try_json(call_chat(parts, MODEL_BATCH))
        if not _usable(js):
            # retry once with stricter wording
            parts[0]["text"] = parts[0]["text"] + " Return ONLY valid JSON."
            js = try_json(call_chat(parts, MODEL_BATCH))
        if not _usable(js):
            warnings.warn(
                f"No usable JSON returned for pages {i}-{j - 1}; "
                "this batch contributes no steps.",
                RuntimeWarning,
                stacklevel=2,
            )
            js = {"steps": []}
        partials.append(js)

    merged = merge_partial_steps(partials)
    final = synthesize_full_json(merged)
    return final

In [31]:
# Run the full extraction pipeline on the first sample manual.
# The path is relative, so the PDF must be in the notebook's working directory.
pdf = r"ikea.pdf"
result = extract_all(pdf)


In [32]:
# Pretty-print the result; ensure_ascii=False leaves non-ASCII characters unescaped.
print(json.dumps(result, ensure_ascii=False, indent=2))

{
  "product": "unknown",
  "language": "auto",
  "steps": [
    {
      "step": 1,
      "title": "Attach Mounting Plate",
      "description": "Secure the mounting plate to the table using the screws provided. Ensure the screws are tightened to 25mm (1 inch).",
      "tools": [
        "Screwdriver"
      ],
      "parts": [
        "Mounting Plate (104217)",
        "Screws (108443)"
      ],
      "notes": []
    },
    {
      "step": 2,
      "title": "Attach Leg",
      "description": "Twist the leg onto the mounting plate until secure. Adjust the height as needed.",
      "tools": [],
      "parts": [
        "Leg"
      ],
      "notes": []
    }
  ],
  "assumptions": [],
  "uncertainties": []
}


In [33]:
# Run the same pipeline on a second sample manual (relative path, as above).
pdf2 = "ikea2.pdf"
out = extract_all(pdf2)

In [34]:
# Pretty-print the second result; ensure_ascii=False leaves non-ASCII characters unescaped.
print(json.dumps(out, ensure_ascii=False, indent=2))

{
  "product": "unknown",
  "language": "auto",
  "steps": [
    {
      "step": 1,
      "title": "Insert Dowels",
      "description": "Insert 16 dowels into the side panels.",
      "tools": [],
      "parts": [
        "16x Dowels"
      ],
      "notes": []
    },
    {
      "step": 2,
      "title": "Attach Brackets",
      "description": "Hammer 6 brackets into the panels and secure with screws.",
      "tools": [
        "Hammer"
      ],
      "parts": [
        "6x Brackets",
        "Screws"
      ],
      "notes": []
    },
    {
      "step": 3,
      "title": "Insert Pins",
      "description": "Insert 4 pins into the side panels.",
      "tools": [],
      "parts": [
        "4x Pins"
      ],
      "notes": []
    },
    {
      "step": 4,
      "title": "Secure with Cams",
      "description": "Use 2 cams to secure the panels.",
      "tools": [],
      "parts": [
        "2x Cams"
      ],
      "notes": []
    },
    {
      "step": 5,
      "title": "Attach Back Pa