# # SigLIP Mask Token(Image-level) LLaVA 답변 확인

In [1]:
import torch
from pathlib import Path
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

MASK_DIR = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_mask_token/hazelnut/hole")
MODEL_ID = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
DEV      = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE    = torch.bfloat16                      

In [None]:
proc  = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
llava = LlavaOnevisionForConditionalGeneration.from_pretrained(
            MODEL_ID,
            torch_dtype       = DTYPE,
            device_map        = "auto",
            low_cpu_mem_usage = True,
            trust_remote_code = True
        ).eval()

TOK_IMG = proc.tokenizer.additional_special_tokens[0]      

In [3]:
def ask_llava(mask_tok: torch.Tensor) -> str:
    v_emb = llava.multi_modal_projector(mask_tok.unsqueeze(0).to(DEV, DTYPE))  
    M     = v_emb.size(1)

    prompt = (TOK_IMG + " ") * M + "USER: Is there any anomaly in the image? (Yes / No)\nASSISTANT:"
    tok_in = proc.tokenizer(prompt, return_tensors="pt").to(DEV)
    t_emb  = llava.get_input_embeddings()(tok_in.input_ids).to(DTYPE)           

    inp_emb   = torch.cat([v_emb, t_emb], dim=1)                                
    att_mask  = torch.cat([torch.ones(1, M, dtype=torch.long, device=DEV),
                           tok_in.attention_mask], dim=1)

    out = llava.generate(
            inputs_embeds   = inp_emb,
            attention_mask  = att_mask,
            max_new_tokens  = 30,
            do_sample       = False,     
            eos_token_id    = proc.tokenizer.eos_token_id,
            pad_token_id    = proc.tokenizer.eos_token_id
          )
    return proc.tokenizer.decode(out[0], skip_special_tokens=True).strip()

for i in range(18):
    pt = MASK_DIR / f"{i:03d}_siglip_tokens.pt"
    if not pt.exists():
        print(f"[{i:03d}]  ⨯ file missing"); continue

    obj = torch.load(pt, map_location="cpu", weights_only=False)
    mtk = obj["mask_token"]                                  
    if mtk.numel() == 0:
        print(f"[{i:03d}]  ⨯ empty"); continue

    ans = ask_llava(mtk)
    print(f"[{i:03d}]  {ans}")

[000]  No, there is no anomaly in the image.
[001]  No, there is no anomaly in the image.
[002]  No, there is no anomaly in the image.
[003]  No, there is no anomaly in the image.
[004]  No, there is no anomaly in the image.
[005]  No, there is no anomaly in the image.
[006]  No, there is no anomaly in the image.
[007]  No, there is no anomaly in the image.
[008]  No, there is no anomaly in the image.
[009]  No, there is no anomaly in the image.
[010]  No, there is no anomaly in the image.
[011]  No, there is no anomaly in the image.
[012]  No, there is no anomaly in the image.
[013]  No, there is no anomaly in the image.
[014]  No, there is no anomaly in the image.
[015]  No, there is no anomaly in the image.
[016]  No, there is no anomaly in the image.
[017]  No, there is no anomaly in the image.


# # Global Token 추가 LLaVA 답변 확인

In [1]:
# Global Token : 원본 Image를 SigLIP에 입력해 Token으로 만듦

import torch
from pathlib import Path
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

ROOT = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole")
ROOT2 = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_mask_token/hazelnut/hole")
MASK = ROOT2
GLOB = ROOT / "good"
MODEL = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
DEV   = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16

In [None]:
proc  = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
llava = LlavaOnevisionForConditionalGeneration.from_pretrained(
          MODEL, torch_dtype=DTYPE, device_map="auto",
          low_cpu_mem_usage=True, trust_remote_code=True).eval()

tok_img = proc.tokenizer.additional_special_tokens[0]          

In [4]:
for i in range(18):
    stem = f"{i:03d}"
    f_mask = MASK / f"{stem}_siglip_tokens.pt"
    f_glo  = GLOB / f"{stem}_global_token.pt"
    if not (f_mask.exists() and f_glo.exists()):
        print(f"[{stem}]  ⨯ token missing"); continue

    d   = torch.load(f_mask, map_location="cpu", weights_only=False)
    g   = torch.load(f_glo,  map_location="cpu", weights_only=False)["global_token"]
    tok = torch.cat([g.unsqueeze(0), d["mask_token"]], dim=0).to(DEV, DTYPE)  
    with torch.no_grad():
        v_emb = llava.multi_modal_projector(tok.unsqueeze(0))                 

    N = v_emb.size(1)
    user_prompt = "USER: Is there any anomaly in the image? (Yes / No)\nASSISTANT:"
    prompt = (tok_img + " ") * N + user_prompt        
    tok_in = proc.tokenizer(prompt, return_tensors="pt").to(DEV)
    t_emb  = llava.get_input_embeddings()(tok_in.input_ids).to(DTYPE)
    att_img = torch.ones(1, N, dtype=torch.long, device=DEV)

    inp      = torch.cat([v_emb, t_emb], dim=1)
    att_mask = torch.cat([att_img, tok_in.attention_mask], dim=1)

    with torch.no_grad():
        gen = llava.generate(
                inputs_embeds=inp, attention_mask=att_mask,
                max_new_tokens=30, temperature=0.7, do_sample=False,
                eos_token_id=proc.tokenizer.eos_token_id,
                pad_token_id=proc.tokenizer.eos_token_id)
    ans = proc.tokenizer.decode(gen[0], skip_special_tokens=True).strip()
    print(f"[{stem}]  {ans}")



[000]  No, there is no anomaly in the image.
[001]  No, there is no anomaly in the image.
[002]  No, there is no anomaly in the image.
[003]  No, there is no anomaly in the image.
[004]  No, there is no anomaly in the image.
[005]  No, there is no anomaly in the image.
[006]  No, there is no anomaly in the image.
[007]  No, there is no anomaly in the image.
[008]  No, there is no anomaly in the image.
[009]  No, there is no anomaly in the image.
[010]  No, there is no anomaly in the image.
[011]  No, there is no anomaly in the image.
[012]  No, there is no anomaly in the image.
[013]  No, there is no anomaly in the image.
[014]  No, there is no anomaly in the image.
[015]  No, there is no anomaly in the image.
[016]  No, there is no anomaly in the image.
[017]  No, there is no anomaly in the image.
