In [None]:
from transformers import AutoProcessor, Glm4vForConditionalGeneration
import torch, pathlib

# Checkpoint identifier plus the device / precision used for inference.
MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
device = "cuda"           # or "cuda:0"
dtype  = torch.bfloat16   # takes up half the space

# Processor: handles the chat template and tokenization for the checkpoint.
processor = AutoProcessor.from_pretrained(
    MODEL_PATH,
    use_fast=True,
)

# Model weights; device_map="auto" leaves placement to the loader.
model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=dtype,
    device_map="auto",
)
model.eval()


In [None]:
import json
import re
from typing import Any, TypedDict

from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langgraph.graph import END, MessageGraph, StateGraph

def glm4v_call(messages, max_tokens=256):
    """Run one chat turn through the GLM-4.1V model and return the generated text.

    Args:
        messages: Conversation history. Each item is either a LangChain
            message (``SystemMessage`` / ``HumanMessage`` / ``AIMessage``) or a
            ``{"role": ..., "content": ...}`` dict. Content may be a plain
            string or a list of multimodal parts.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded completion only (prompt tokens are sliced off). Special
        tokens are kept in the returned text.
    """
    # LangChain reports message types as "human"/"ai"; chat templates use
    # "user"/"assistant". Other roles (e.g. "system") pass through unchanged.
    role_by_type = {"human": "user", "ai": "assistant"}

    # 1. Convert every entry to the {role, content} dict the template expects.
    chat = []
    for msg in messages:
        if isinstance(msg, dict):
            role, content = msg["role"], msg["content"]
        else:
            role, content = msg.type, msg.content
        if isinstance(content, str):
            # Wrap plain text as a typed part so text-only and multimodal
            # turns share one shape.
            content = [{"type": "text", "text": content}]
        chat.append({"role": role_by_type.get(role, role), "content": content})

    # 2. Tokenize with the chat template. return_dict=True is required to get
    #    a mapping (input_ids, attention_mask, ...) that can be splatted into
    #    generate(); without it only the input_ids tensor comes back.
    inputs = processor.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's actual placement (device_map="auto")

    # 3. Generate, then decode only the tokens produced after the prompt.
    out = model.generate(**inputs, max_new_tokens=max_tokens)
    prompt_len = inputs["input_ids"].shape[1]
    reply = processor.decode(out[0][prompt_len:], skip_special_tokens=False)
    return reply

# --- LLM node for the graph ---
def llm_node(state):
    """Query the model with ``state["prompt"]`` and store the reply.

    The reply is written to ``state["assistant_text"]``; the same (mutated)
    state object is returned.
    """
    conversation = state["prompt"]
    reply = glm4v_call(conversation)
    state["assistant_text"] = reply
    return state


In [None]:
# Locates an "Action: {...}" marker in the model reply. The group is
# non-greedy, so it is only used to find where the JSON object starts.
ACTION_RE = re.compile(r"Action:\s*(\{.*?\})", re.S)

def parse_route(state):
    """Decide the next step from the model reply in ``state["assistant_text"]``.

    Returns:
        ``("call_tool", state)`` with the parsed JSON object stored in
        ``state["action"]`` when the reply contains a valid ``Action: {...}``;
        otherwise ``(END, {"final_answer": <reply text>})``. A reply whose
        action is not a decodable JSON object is treated as a final answer.
    """
    txt = state["assistant_text"]
    m = ACTION_RE.search(txt)
    if m:
        try:
            # Decode from the opening brace rather than from the regex group:
            # the non-greedy group ends at the first "}", which truncates any
            # action containing a nested object. raw_decode reads exactly one
            # JSON value and ignores whatever follows it.
            action, _ = json.JSONDecoder().raw_decode(txt, m.start(1))
        except json.JSONDecodeError:
            action = None
        if isinstance(action, dict):
            state["action"] = action
            return "call_tool", state
    return END, {"final_answer": txt}

def call_tool(state):
    """Invoke the tool named by ``state["action"]["action_type"]``.

    The tool is looked up in ``TOOLS`` and invoked with a shallow copy of the
    whole action dict. Its result is stored in ``state["observation"]``.
    If the action names no tool, or a tool that is not registered, the
    observation is an error message instead, so the failure is fed back to the
    model rather than aborting the loop.

    Returns:
        ``("bridge", state)``.
    """
    act = state["action"]
    name = act.get("action_type")
    try:
        tool = TOOLS[name]
    except (KeyError, TypeError):
        # Missing/unknown/unhashable tool name coming from model output.
        obs = f"Error: unknown tool {name!r}."
    else:
        obs = tool.invoke(dict(act))  # pass a copy of the action to the tool
    state["observation"] = obs
    return "bridge", state

def bridge(state):
    """Append the last assistant reply and the tool observation to the prompt.

    The observation becomes the content of a new user turn: a dict is wrapped
    in a one-element list (a single multimodal part), a string or list is used
    as is, and anything else is converted to its string form. Both turns are
    appended as LangChain message objects, matching the message types that
    ``glm4v_call`` documents as its input.

    Returns:
        ``("llm", state)`` with ``state["prompt"]`` extended in place.
    """
    obs = state["observation"]
    if isinstance(obs, dict):
        content = [obs]
    elif isinstance(obs, (str, list)):
        content = obs
    else:
        # Message content must be text or a list of parts.
        content = str(obs)

    state["prompt"].extend([
        AIMessage(content=state["assistant_text"]),
        HumanMessage(content=content),
    ])
    return "llm", state


In [None]:
class AgentState(TypedDict, total=False):
    """Keys the graph nodes read and write while the agent loops."""
    prompt: list          # conversation history passed to the model
    assistant_text: str   # latest raw model reply
    action: dict          # tool call parsed from the reply
    observation: Any      # result of the last tool invocation
    final_answer: str     # set by the parse step when the reply has no action


def _state_only(node):
    """Adapt a node that returns ``(route, state)`` so it returns only the state."""
    def run(state):
        result = node(state)
        return result[1] if isinstance(result, tuple) else result
    return run


def _route_after_parse(state):
    """Finish once the parse step produced a final answer; otherwise call the tool."""
    return END if state.get("final_answer") is not None else "call_tool"


# The nodes exchange a keyed dict (prompt, assistant_text, ...), so the graph
# is built on a keyed state schema rather than on a bare message list.
g = StateGraph(AgentState)
g.add_node("llm",       _state_only(llm_node))
g.add_node("parse",     _state_only(parse_route))
g.add_node("call_tool", _state_only(call_tool))
g.add_node("bridge",    _state_only(bridge))

g.set_entry_point("llm")
g.add_edge("llm", "parse")
# parse either ends the run (final answer) or hands off to the tool call.
g.add_conditional_edges(
    "parse",
    _route_after_parse,
    {"call_tool": "call_tool", END: END},
)
g.add_edge("call_tool", "bridge")
g.add_edge("bridge", "llm")

agent = g.compile()
