In [30]:
import os
import sys
import json
import datetime
from typing import Dict, Any, Optional

try:
    import requests
except Exception:
    requests = None

from PIL import Image, ImageOps
import tempfile

In [31]:
LOCAL_API_URL = "http://localhost:1234/v1/chat/completions"
DEFAULT_IMAGE = r"HighResImage/pexels-hikaique-109919.jpg"
MAX_SIZE = (896, 896)
JPEG_QUALITY = 85
DEFAULT_MODEL = os.environ.get("LM_MODEL", "qwen/qwen3-vl-8b")
# ----------------------------------

OUTPUT_JSON_TEMPLATE = {
    "source_image": None,
    "timestamp": None,
    "clip_caption": None,
    "blip_description": None,
    "prompt_used": None,
    "lm_response_raw": None
}

In [43]:
def build_prompt_text(filename: str, width: int, height: int) -> str:
    """
    Prompt curto que pede ao modelo para detectar todas as pessoas na imagem
    e retornar um JSON com um caption global e uma lista 'people' com
    clip_caption (1-6 words) e blip_description (1-2 sentences) para cada pessoa.
    Também pede um campo bbox normalizado [x1,y1,x2,y2] se o modelo puder fornecer,
    caso contrário colocar null.
    """
    return (
        "Analyze the image and detect every person visible. "
        "Respond ONLY with a single JSON object (no prose) with exactly two top-level fields:\n"
        "1) \"global\": a very short caption for the whole image (1-6 words, label-style, lower-case).\n"
        "2) \"people\": an array of objects, one per detected person, each with the fields:\n"
        "   - \"id\": integer (1..N)\n"
        "   - \"clip_caption\": 1-6 words, label-style, lower-case, no trailing punctuation\n"
        "   - \"blip_description\": 1-2 short sentences describing that person (clothing, color, action, pose, visible objects, relative location); avoid inventing identities or unverifiable facts\n"
        "   - \"bbox\": either an array [x1, y1, x2, y2] with coordinates normalized to the image width/height (values 0..1), OR null if you cannot provide coordinates\n"
        "Rules: do not invent names, ages, or identities. If unsure, use neutral phrasing (e.g. 'appears to be', 'possibly'). "
        "Return only valid JSON. No additional text."
    )

def extract_json_from_response(resp: Dict[str, Any]) -> Optional[Dict[str,str]]:
    """
    Heurística para extrair clip_caption e blip_description de várias formas de resposta.
    """
    if not resp:
        return None

    # direct keys
    if all(k in resp for k in ("clip_caption", "blip_description")):
        return {"clip_caption": resp["clip_caption"], "blip_description": resp["blip_description"]}

    # common wrapper keys
    for key in ("response", "text", "output", "result", "choices"):
        if key in resp:
            val = resp[key]
            # choices array (OpenAI style)
            if key == "choices" and isinstance(val, list) and len(val) > 0:
                # try to extract message content
                first = val[0]
                if isinstance(first, dict):
                    # openai chat-style
                    msg = first.get("message") or first.get("text") or first.get("content")
                    if isinstance(msg, dict):
                        # message -> content: could be string or list
                        content = msg.get("content")
                        if isinstance(content, str):
                            try:
                                parsed = json.loads(content)
                                if all(k in parsed for k in ("clip_caption", "blip_description")):
                                    return {"clip_caption": parsed["clip_caption"], "blip_description": parsed["blip_description"]}
                            except Exception:
                                # fallthrough to regex parsing
                                text = content
                                return _extract_from_text(text)
                    elif isinstance(msg, str):
                        return _extract_from_text(msg)
                # fallback: choice text field
                if isinstance(first.get("text"), str):
                    return _extract_from_text(first.get("text"))
            # if val is a string
            if isinstance(val, str):
                return _extract_from_text(val)
            # if val is dict, try nested
            if isinstance(val, dict):
                # try to find textual fields inside
                for sub in ("text", "output", "response"):
                    if sub in val and isinstance(val[sub], str):
                        return _extract_from_text(val[sub])
    # last attempt: search entire resp JSON string
    try:
        s = json.dumps(resp)
        return _extract_from_text(s)
    except Exception:
        return None

def _extract_from_text(text: str) -> Optional[Dict[str,str]]:
    """Try to parse JSON from text or extract via regex."""
    import re
    if not text:
        return None
    txt = text.strip()
    # try parse JSON directly
    try:
        parsed = json.loads(txt)
        if isinstance(parsed, dict) and all(k in parsed for k in ("clip_caption", "blip_description")):
            return {"clip_caption": parsed["clip_caption"], "blip_description": parsed["blip_description"]}
    except Exception:
        pass
    # regex extraction
    m1 = re.search(r'"clip_caption"\s*:\s*"([^"]+)"', txt)
    m2 = re.search(r'"blip_description"\s*:\s*"([^"]+)"', txt)
    if m1 and m2:
        return {"clip_caption": m1.group(1), "blip_description": m2.group(1)}
    # try loose pattern: 'clip_caption: something' lines
    lines = [ln.strip() for ln in txt.splitlines() if ln.strip()]
    # find lines containing clip_caption / blip_description
    cc = None
    bd = None
    for ln in lines:
        if "clip_caption" in ln and ":" in ln:
            try:
                cc = ln.split(":",1)[1].strip().strip('", ')
            except:
                pass
        if "blip_description" in ln and ":" in ln:
            try:
                bd = ln.split(":",1)[1].strip().strip('", ')
            except:
                pass
    if cc and bd:
        return {"clip_caption": cc, "blip_description": bd}
    return None

def prepare_image_for_model(image_path: str, max_size=(896,896), mode: str = "resize",
                            output_dir: Optional[str] = None, jpeg_quality: int = 85) -> str:
    """
    Prepara imagem para o modelo (resize/crop/pad). Retorna caminho do arquivo salvo.
    Default: resize mantendo aspecto, max dimension = max_size.
    """
    if mode not in ("resize", "crop", "pad"):
        raise ValueError("mode deve ser 'resize'|'crop'|'pad'")

    im = Image.open(image_path).convert("RGB")
    w, h = im.size
    target_w, target_h = max_size

    if mode == "resize":
        im.thumbnail(max_size, Image.LANCZOS)
        out_im = im
    elif mode == "crop":
        short = min(w,h)
        left = (w - short)//2
        top = (h - short)//2
        right = left + short
        bottom = top + short
        im_cropped = im.crop((left, top, right, bottom))
        out_im = im_cropped.resize(max_size, Image.LANCZOS)
    else:  # pad
        im.thumbnail(max_size, Image.LANCZOS)
        out_im = Image.new("RGB", max_size, (0,0,0))
        paste_x = (target_w - im.width)//2
        paste_y = (target_h - im.height)//2
        out_im.paste(im, (paste_x, paste_y))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        base = os.path.basename(image_path)
        name, _ = os.path.splitext(base)
        out_path = os.path.join(output_dir, f"{name}_prepared_{target_w}x{target_h}.jpg")
    else:
        fd, out_path = tempfile.mkstemp(suffix=f"_{target_w}x{target_h}.jpg")
        os.close(fd)

    out_im.save(out_path, format="JPEG", quality=jpeg_quality, optimize=True)
    return os.path.abspath(out_path)

def send_to_local_api(prompt_text: str, api_url: str, image_path: Optional[str] = None, model_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    Envia payload para /v1/chat/completions com:
      messages: [{role:'user', content: [{type:'input_text', 'text':...}, {type:'input_image','image_url':{'url':'data:image/jpeg;base64,...'}}]}]
    Observação: pode gerar payload grande — já resize a imagem antes (896x896).
    """
    import os, json, base64
    if requests is None:
        print("requests não instalado.")
        return None

    model = model_name or DEFAULT_MODEL

    # montar blocos de conteúdo
    content_blocks = [{"type": "input_text", "text": prompt_text}]

    if image_path and os.path.exists(image_path):
        # converter a imagem para base64 (data URL)
        with open(image_path, "rb") as fh:
            b64 = base64.b64encode(fh.read()).decode("ascii")
        data_url = f"data:image/jpeg;base64,{b64}"
        content_blocks.append({"type": "input_image", "image_url": {"url": data_url}})

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": content_blocks
            }
        ]
    }

    # usar chat/completions (se quiser testar /responses, precisa saber esquema exato do servidor)
    try:
        print(">>> POST /v1/chat/completions payload (truncated):")
        print(json.dumps(payload, ensure_ascii=False)[:1000])
        resp = requests.post(api_url, json=payload, timeout=300)  # timeout maior para uploads grandes
        print("status:", resp.status_code)
        print("body (start):", resp.text[:4000])
        resp.raise_for_status()
        try:
            return resp.json()
        except Exception:
            return {"text": resp.text}
    except Exception as e:
        print("Request failed:", e)
        return None


In [39]:
def main(
    image_path: str = DEFAULT_IMAGE,
    api_url: str = LOCAL_API_URL,
    out_path: str = "image_caption_blip_clip.json",
    auto_send: bool = True,
):
    if not os.path.exists(image_path):
        print("Arquivo não encontrado:", image_path)
        return

    # Prepare image (resize) and build short prompt text (no base64)
    prepared_path = prepare_image_for_model(image_path, max_size=MAX_SIZE, mode="resize", jpeg_quality=JPEG_QUALITY)
    im = Image.open(prepared_path)
    w, h = im.size
    prompt_text = build_prompt_text(os.path.basename(prepared_path), w, h)

    output = OUTPUT_JSON_TEMPLATE.copy()
    output["source_image"] = os.path.abspath(prepared_path)
    output["timestamp"] = datetime.datetime.now().isoformat()
    output["prompt_used"] = prompt_text

    lm_resp = None
    parsed = None

    if auto_send:
        print(f"Tentando enviar prompt para {api_url} ...")
        lm_resp = send_to_local_api(prompt_text, api_url, image_path=prepared_path)
        output["lm_response_raw"] = lm_resp
        parsed = extract_json_from_response(lm_resp if isinstance(lm_resp, dict) else {})
        if parsed:
            output["clip_caption"] = parsed["clip_caption"]
            output["blip_description"] = parsed["blip_description"]
            print("Resposta extraída com sucesso do endpoint.")
        else:
            print("Não foi possível extrair JSON estruturado da resposta automática.")
            print("Se falhar, rode em modo manual (auto_send=False) e cole o prompt no UI do LM Studio.")
    else:
        print("Modo manual: cole o prompt curto e a imagem (prepared) no UI do LM Studio:")
        print("Prompt:\n", prompt_text)
        print("Prepared image path (use file:/// in UI if supported):", prepared_path)
        # allow interactive paste of the model JSON response
        try:
            raw = input("\nCole aqui o JSON resposta do modelo (ou pressione Enter para pular):\n").strip()
            if raw:
                j = json.loads(raw)
                if "clip_caption" in j and "blip_description" in j:
                    output["clip_caption"] = j["clip_caption"]
                    output["blip_description"] = j["blip_description"]
                    output["lm_response_raw"] = j
                    print("Resposta manual salva.")
                else:
                    print("JSON colado não possui as chaves esperadas.")
        except Exception as e:
            print("Erro ao ler entrada manual:", e)

    # Save JSON (even if partial)
    os.makedirs(os.path.dirname(os.path.abspath(out_path)) or ".", exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(output, fh, ensure_ascii=False, indent=2)

    print("\nArquivo salvo em:", out_path)
    if output.get("clip_caption"):
        print("clip_caption:", output["clip_caption"])
    if output.get("blip_description"):
        print("blip_description:", output["blip_description"])

In [44]:
if __name__ == "__main__":
    main("HighResImage/pexels-hikaique-109919.jpg")

Tentando enviar prompt para http://localhost:1234/v1/chat/completions ...
>>> POST /v1/chat/completions payload (truncated):
{"model": "qwen/qwen3-vl-8b", "messages": [{"role": "user", "content": [{"type": "input_text", "text": "Analyze the image and detect every person visible. Respond ONLY with a single JSON object (no prose) with exactly two top-level fields:\n1) \"global\": a very short caption for the whole image (1-6 words, label-style, lower-case).\n2) \"people\": an array of objects, one per detected person, each with the fields:\n   - \"id\": integer (1..N)\n   - \"clip_caption\": 1-6 words, label-style, lower-case, no trailing punctuation\n   - \"blip_description\": 1-2 short sentences describing that person (clothing, color, action, pose, visible objects, relative location); avoid inventing identities or unverifiable facts\n   - \"bbox\": either an array [x1, y1, x2, y2] with coordinates normalized to the image width/height (values 0..1), OR null if you cannot provide coordi