In [None]:
proc  = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
llava = LlavaOnevisionForConditionalGeneration.from_pretrained(
          MODEL, device_map="auto", torch_dtype=DTYPE,
          low_cpu_mem_usage=True, trust_remote_code=True).eval()
tok_img = proc.tokenizer.additional_special_tokens[0]

# # 000.png

In [5]:
import torch, torch.nn.functional as F
from pathlib import Path
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

MASK_PT = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole/000_siglip_tokens.pt")
GLOB_PT = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole/good/000_global_token.pt")
MODEL   = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

DEV, DTYPE = ("cuda" if torch.cuda.is_available() else "cpu"), torch.bfloat16

In [6]:
d   = torch.load(MASK_PT, map_location="cpu", weights_only=False)
tok = d["mask_token"]                                          
if GLOB_PT.exists():
    g = torch.load(GLOB_PT, map_location="cpu", weights_only=False)["global_token"]
    tok = torch.cat([g.unsqueeze(0), tok], 0)                 

tok = F.layer_norm(tok.float(), tok.shape[-1:])               
tok = tok.to(DEV, DTYPE).unsqueeze(0)                          

with torch.no_grad():
    vis = llava.multi_modal_projector(tok)                    
vis = F.layer_norm(vis.float(), vis.shape[-1:]).to(DEV, DTYPE)  

T = vis.size(1)
prompt = (tok_img + " ")*T + "USER: Is there any anomaly in the image? Yes or No\nASSISTANT:"
ids  = proc.tokenizer(prompt, return_tensors="pt").to(DEV).input_ids
txt  = llava.get_input_embeddings()(ids)                      
att  = torch.ones(1, T, dtype=torch.long, device=DEV)

inp  = torch.cat([vis, txt], 1)                                
attm = torch.cat([att, ids.new_ones(ids.shape)], 1)

with torch.no_grad():
    out = llava(inputs_embeds=inp, attention_mask=attm, return_dict=True)

logits = out.logits[0, -1].float()                            
top10  = torch.topk(logits, 10)
words  = [proc.tokenizer.decode([i]).strip().replace("\n"," ") for i in top10.indices]
probs  = F.softmax(top10.values, 0).tolist()

print("Top-10 token probabilities")
for w, p in zip(words, probs):
    print(f"{w:<12}  {p:.3f}")

tid_yes = proc.tokenizer(" Yes", add_special_tokens=False).input_ids[0]
tid_no  = proc.tokenizer(" No",  add_special_tokens=False).input_ids[0]
p_yes, p_no = F.softmax(logits[[tid_yes, tid_no]], 0).tolist()
print(f"\nP(Yes) = {p_yes:.3f}   P(No) = {p_no:.3f}")

with torch.no_grad():
    gen = llava.generate(inputs_embeds=inp, attention_mask=attm,
                         max_new_tokens=30, do_sample=False,
                         eos_token_id=proc.tokenizer.eos_token_id,
                         pad_token_id=proc.tokenizer.eos_token_id)
answer = proc.tokenizer.decode(gen[0], skip_special_tokens=True).strip()
print("\n=== LLaVA Answer ===")
print(answer)

Top-10 token probabilities
Yes           0.443
No            0.345
I             0.082
<|im_end|>    0.032
There         0.027
The           0.022
              0.019
It            0.012
Let           0.010
Can           0.009

P(Yes) = 0.562   P(No) = 0.438

=== LLaVA Answer ===
Yes, there is an anomaly in the image.


# # 006.png

In [3]:
import torch, torch.nn.functional as F
from pathlib import Path
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

MASK_PT = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole/006_siglip_tokens.pt")
GLOB_PT = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole/good/006_global_token.pt")
MODEL   = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

DEV, DTYPE = ("cuda" if torch.cuda.is_available() else "cpu"), torch.bfloat16

In [4]:
d   = torch.load(MASK_PT, map_location="cpu", weights_only=False)
tok = d["mask_token"]                                           
if GLOB_PT.exists():
    g = torch.load(GLOB_PT, map_location="cpu", weights_only=False)["global_token"]
    tok = torch.cat([g.unsqueeze(0), tok], 0)                   

tok = F.layer_norm(tok.float(), tok.shape[-1:])                 
tok = tok.to(DEV, DTYPE).unsqueeze(0)                          

with torch.no_grad():
    vis = llava.multi_modal_projector(tok)                      
vis = F.layer_norm(vis.float(), vis.shape[-1:]).to(DEV, DTYPE)  

T = vis.size(1)
prompt = (tok_img + " ")*T + "USER: Is there any anomaly in the image? Yes or No\nASSISTANT:"
ids  = proc.tokenizer(prompt, return_tensors="pt").to(DEV).input_ids
txt  = llava.get_input_embeddings()(ids)                       
att  = torch.ones(1, T, dtype=torch.long, device=DEV)

inp  = torch.cat([vis, txt], 1)                                
attm = torch.cat([att, ids.new_ones(ids.shape)], 1)

with torch.no_grad():
    out = llava(inputs_embeds=inp, attention_mask=attm, return_dict=True)

logits = out.logits[0, -1].float()                            
top10  = torch.topk(logits, 10)
words  = [proc.tokenizer.decode([i]).strip().replace("\n"," ") for i in top10.indices]
probs  = F.softmax(top10.values, 0).tolist()

print("Top-10 token probabilities")
for w, p in zip(words, probs):
    print(f"{w:<12}  {p:.3f}")

tid_yes = proc.tokenizer(" Yes", add_special_tokens=False).input_ids[0]
tid_no  = proc.tokenizer(" No",  add_special_tokens=False).input_ids[0]
p_yes, p_no = F.softmax(logits[[tid_yes, tid_no]], 0).tolist()
print(f"\nP(Yes) = {p_yes:.3f}   P(No) = {p_no:.3f}")

with torch.no_grad():
    gen = llava.generate(inputs_embeds=inp, attention_mask=attm,
                         max_new_tokens=30, do_sample=False,
                         eos_token_id=proc.tokenizer.eos_token_id,
                         pad_token_id=proc.tokenizer.eos_token_id)
answer = proc.tokenizer.decode(gen[0], skip_special_tokens=True).strip()
print("\n=== LLaVA Answer ===")
print(answer)

Top-10 token probabilities
(             nan
'             nan
%             nan
&             nan
"             nan
!             nan
#             nan
$             nan
)             nan
*             nan

P(Yes) = nan   P(No) = nan

=== LLaVA Answer ===
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


# # LLaVA Answer & Softmax

In [13]:
import torch, torch.nn.functional as F
from pathlib import Path
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

MASK = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole/003_siglip_tokens.pt")
GLOB = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole/good/003_global_token.pt")
MODEL= "llava-hf/llava-onevision-qwen2-7b-ov-hf"
DEV, DTYPE = "cuda" if torch.cuda.is_available() else "cpu", torch.bfloat16

In [14]:
d   = torch.load(MASK, map_location="cpu", weights_only=False)
tok = d["mask_token"]
if GLOB.exists():
    g = torch.load(GLOB, map_location="cpu", weights_only=False)["global_token"]
    tok = torch.cat([g.unsqueeze(0), tok], 0)

tok = torch.nan_to_num(tok).float()                
tok = F.layer_norm(tok, tok.shape[-1:])            
tok = tok.to(DEV, DTYPE).unsqueeze(0)              

with torch.no_grad():
    vis = llava.multi_modal_projector(tok)      
vis = F.layer_norm(vis.float(), vis.shape[-1:]).to(DEV, DTYPE)

T = vis.size(1)
prompt = (tok_img + " ")*T + "USER: Is there any anomaly in the image? Yes or No\nASSISTANT:"
ids  = proc.tokenizer(prompt, return_tensors="pt").to(DEV).input_ids
txt  = llava.get_input_embeddings()(ids)            
inp  = torch.cat([vis, txt], 1)
attm = torch.cat([torch.ones(1,T,device=DEV), ids.new_ones(ids.shape)], 1)

with torch.no_grad():
    out = llava(inputs_embeds=inp, attention_mask=attm, return_dict=True)

logits = out.logits[0,-1].float()                  
top10  = torch.topk(logits, 10)
words  = [proc.tokenizer.decode([i]).strip().replace("\n"," ") for i in top10.indices]
probs  = F.softmax(top10.values, 0).tolist()
print("\nTop-10 logits")
for w,p in zip(words, probs): print(f"{w:<12} {p:.3f}")

tid_yes = proc.tokenizer(" Yes", add_special_tokens=False).input_ids[0]
tid_no  = proc.tokenizer(" No",  add_special_tokens=False).input_ids[0]
p_yes, p_no = F.softmax(logits[[tid_yes, tid_no]], 0).tolist()
print(f"\nP(Yes)={p_yes:.3f}   P(No)={p_no:.3f}")

gen = llava.generate(inputs_embeds=inp, attention_mask=attm,
                     max_new_tokens=30, do_sample=False,
                     eos_token_id=proc.tokenizer.eos_token_id,
                     pad_token_id=proc.tokenizer.eos_token_id)
print("\n=== LLaVA Answer ===")
print(proc.tokenizer.decode(gen[0], skip_special_tokens=True).strip())


Top-10 logits
Yes          0.458
No           0.357
I            0.066
There        0.029
<|im_end|>   0.026
The          0.019
             0.016
It           0.011
Can          0.011
Is           0.007

P(Yes)=0.562   P(No)=0.438

=== LLaVA Answer ===
Yes, there is an anomaly in the image. The person in the image is wearing a hat that is not typically worn in a professional setting.


# # LLaVA Answer

In [1]:
import torch, torch.nn.functional as F, numpy as np
from pathlib import Path
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

BASE   = Path("/home/s2behappy4/data/gyuhyeong/code/siglip_token_demo/hazelnut/hole")
MODEL  = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
DEV, DTYPE = "cuda" if torch.cuda.is_available() else "cpu", torch.bfloat16

In [None]:
proc  = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
llava = LlavaOnevisionForConditionalGeneration.from_pretrained(
          MODEL, device_map="auto", torch_dtype=DTYPE,
          low_cpu_mem_usage=True, trust_remote_code=True).eval()
tok_img = proc.tokenizer.additional_special_tokens[0]
tid_yes = proc.tokenizer(" Yes", add_special_tokens=False).input_ids[0]
tid_no  = proc.tokenizer(" No",  add_special_tokens=False).input_ids[0]

In [3]:
def clean_norm(x):
    x = torch.nan_to_num(x).float()            
    return F.layer_norm(x, x.shape[-1:])       

for i in range(18):
    stem = f"{i:03d}"
    f_mask = BASE / f"{stem}_siglip_tokens.pt"
    f_glo  = BASE / "good" / f"{stem}_global_token.pt"
    if not f_mask.exists():
        print(f"[{stem}]  ⨯ mask 파일 없음"); continue

    d   = torch.load(f_mask, map_location="cpu", weights_only=False)
    tok = d["mask_token"]
    if f_glo.exists():
        g = torch.load(f_glo, map_location="cpu", weights_only=False)["global_token"]
        tok = torch.cat([g.unsqueeze(0), tok], 0)

    if tok.size(0) > 64:
        areas = torch.tensor([m.sum() for m in d["mask"]]); keep = torch.topk(areas, 63).indices
        tok = torch.cat([tok[:1], tok[1:][keep]], 0)

    tok = clean_norm(tok).to(DEV, DTYPE).unsqueeze(0)         

    with torch.no_grad():
        vis = llava.multi_modal_projector(tok)               
    vis = clean_norm(vis).to(DEV, DTYPE)                       

    T   = vis.size(1)
    prompt = (tok_img + " ")*T + "USER: Is there any anomaly in the image? Yes or No\nASSISTANT:"
    ids  = proc.tokenizer(prompt, return_tensors="pt").to(DEV).input_ids
    txt  = llava.get_input_embeddings()(ids)                   
    inp  = torch.cat([vis, txt], 1)
    att  = torch.cat([torch.ones(1,T,device=DEV), ids.new_ones(ids.shape)], 1)

    with torch.no_grad():
        out = llava(inputs_embeds=inp, attention_mask=att, return_dict=True)

    logits = out.logits[0, -1].float()
    p_yes, p_no = F.softmax(logits[[tid_yes, tid_no]], 0).tolist()

    with torch.no_grad():
        ans = proc.tokenizer.decode(
                llava.generate(inputs_embeds=inp, attention_mask=att,
                               max_new_tokens=30, do_sample=False,
                               eos_token_id=proc.tokenizer.eos_token_id,
                               pad_token_id=proc.tokenizer.eos_token_id)[0],
                skip_special_tokens=True).strip()

    print(f"[{stem}]  P(Yes)={p_yes:.3f}  P(No)={p_no:.3f}  →  {ans}")

[000]  P(Yes)=0.562  P(No)=0.438  →  Yes, there is an anomaly in the image.
[001]  P(Yes)=0.516  P(No)=0.484  →  Yes, there is an anomaly in the image.
[002]  P(Yes)=0.547  P(No)=0.453  →  Yes, there is an anomaly in the image.
[003]  P(Yes)=0.562  P(No)=0.438  →  Yes, there is an anomaly in the image. The person in the image is wearing a hat that is not typically worn in a professional setting.
[004]  P(Yes)=0.577  P(No)=0.423  →  Yes, there is an anomaly in the image. The anomaly is that the image is rotated 90 degrees clockwise.
[005]  P(Yes)=0.622  P(No)=0.378  →  Yes, there is an anomaly in the image. The anomaly is that the image is rotated 90 degrees clockwise.
[006]  P(Yes)=0.637  P(No)=0.363  →  Yes, there is an anomaly in the image. The image contains a text that reads "I love you" in a bold, red font.
[007]  P(Yes)=0.562  P(No)=0.438  →  Yes, there is an anomaly in the image.
[008]  P(Yes)=0.577  P(No)=0.423  →  Yes, there is an anomaly in the image. The person in the image 