In [None]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import glob

# Extract one pooled Q-Former token per overlay image with BLIP-2.
# Flow: overlay PNGs -> processor -> vision encoder -> Q-Former -> mean over dim 1.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model     = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl"
).to(device)
model.eval()

# NOTE: sorted() orders the paths lexicographically (e.g. "overlay_10" < "overlay_2").
overlay_paths = sorted(glob.glob("/home/s2behappy4/data/gyuhyeong/code/overlays/overlay_*.png"))

# Stop here with a readable message instead of letting an empty batch reach
# the processor / model further down.
if not overlay_paths:
    raise FileNotFoundError(
        "No overlay_*.png files found in /home/s2behappy4/data/gyuhyeong/code/overlays"
    )

# convert() returns a fully loaded, independent RGB image, so the source file
# can be closed right away by the context manager.
images = []
for overlay_path in overlay_paths:
    with Image.open(overlay_path) as overlay_img:
        images.append(overlay_img.convert("RGB"))
num_masks = len(images)
print(f"Loaded {num_masks} overlay images")

# All overlays are preprocessed and encoded as a single batch.
inputs = processor(images=images, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

with torch.no_grad():
    vision_out   = model.vision_model(pixel_values=pixel_values)
    image_embeds = vision_out.last_hidden_state

# Repeat the learned query tokens along dim 0, once per image
# (expand creates a view, no copy).
query_tokens = model.query_tokens.to(device)
batch_queries = query_tokens.expand(image_embeds.size(0), -1, -1)

# All-ones mask over every dim of image_embeds except the last:
# the Q-Former may attend to every image position.
attention_mask = torch.ones(image_embeds.size()[:-1], device=device, dtype=torch.long)
with torch.no_grad():
    qf_out     = model.qformer(
        query_embeds           = batch_queries,
        encoder_hidden_states  = image_embeds,
        encoder_attention_mask = attention_mask,
        return_dict            = True,
    )
    mask_tokens = qf_out.last_hidden_state

# Average over dim 1 (the query-token axis of batch_queries) so that each
# overlay is summarised by a single vector.
mask_token_single = mask_tokens.mean(dim=1)

print("mask_tokens shape       :", mask_tokens.shape)
print("mask_token_single shape :", mask_token_single.shape)
assert mask_tokens.shape[0] == num_masks, (
    f"Q-Former returned {mask_tokens.shape[0]} rows for {num_masks} overlay images"
)
print("✅ 1 mask → 1 pooled token 확인 완료")

In [3]:
# Persist the pooled tokens. They are moved to CPU first so the file does not
# hold tensors on the GPU device.
# NOTE(review): several cells in this notebook assign `mask_token_single`, so
# the content written here depends on which of them ran last in the kernel.
pooled_tokens_cpu = mask_token_single.cpu()
torch.save(
    pooled_tokens_cpu,
    "/home/s2behappy4/data/gyuhyeong/code/mask_token_single.pt",
)

In [None]:
from transformers import Blip2ForConditionalGeneration, Blip2Processor
import torch

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Processor and model are loaded from the same checkpoint name.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = model.to(device)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the module repr.
model.eval()

In [4]:
# Print the query-token shape and the hidden size of each sub-config
# (vision encoder, Q-Former, text model) of the loaded BLIP-2 model.
blip2_cfg = model.config
for dim_label, dim_value in (
    ("query_tokens.shape:", model.query_tokens.shape),
    ("vision hidden size :", blip2_cfg.vision_config.hidden_size),
    ("qformer hidden size:", blip2_cfg.qformer_config.hidden_size),
    ("text hidden size  :", blip2_cfg.text_config.hidden_size),
):
    print(dim_label, dim_value)

query_tokens.shape: torch.Size([1, 32, 768])
vision hidden size : 1408
qformer hidden size: 768
text hidden size  : 2048


# # Screw Token

In [None]:
import os
import glob
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image

# Build (pooled Q-Former token, mask) pairs for the overlays under overlays/screw
# and save them to a single .pt file.
# Flow: load BLIP-2 -> encode every overlay PNG -> run the Q-Former -> average
# over dim 1 -> store the pooled tokens together with the masks read from disk.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model     = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl"
).to(device)
model.eval()

base_dir     = "/home/s2behappy4/data/gyuhyeong/code/overlays/screw"
# Overlays and masks are paired purely by their position in these two sorted lists.
overlay_paths = sorted(glob.glob(os.path.join(base_dir, "overlay_*.png")))
mask_paths    = sorted(glob.glob(os.path.join(base_dir, "mask_*.pt")))

# Stop here with a readable message instead of letting an empty batch reach
# the processor / model further down.
if not overlay_paths:
    raise FileNotFoundError(f"No overlay_*.png files found in {base_dir}")

assert len(overlay_paths) == len(mask_paths), "Overlay와 mask 파일 수 불일치"

# Equal counts do not guarantee that the i-th overlay belongs to the i-th mask.
# Compare the part of each file name after its prefix and warn (non-fatal)
# when the two sequences differ.
overlay_ids = [
    os.path.splitext(os.path.basename(path))[0][len("overlay_"):]
    for path in overlay_paths
]
mask_ids = [
    os.path.splitext(os.path.basename(path))[0][len("mask_"):]
    for path in mask_paths
]
if overlay_ids != mask_ids:
    print("⚠️ overlay/mask file name suffixes differ; token-mask pairs may be misaligned")

# convert() returns a fully loaded, independent RGB image, so the source file
# can be closed right away by the context manager.
images = []
for overlay_path in overlay_paths:
    with Image.open(overlay_path) as overlay_img:
        images.append(overlay_img.convert("RGB"))
num_masks = len(images)
print(f"Loaded {num_masks} overlay images and masks")

# All overlays are preprocessed and encoded as a single batch.
inputs = processor(images=images, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

with torch.no_grad():
    vision_out   = model.vision_model(pixel_values=pixel_values)
    image_embeds = vision_out.last_hidden_state

# Repeat the learned query tokens along dim 0, once per image
# (expand creates a view, no copy).
query_tokens  = model.query_tokens.to(device)
batch_queries = query_tokens.expand(num_masks, -1, -1)
# All-ones mask over every dim of image_embeds except the last:
# the Q-Former may attend to every image position.
attention_mask = torch.ones(
    image_embeds.size()[:-1], device=device, dtype=torch.long
)

with torch.no_grad():
    qf_out     = model.qformer(
        query_embeds           = batch_queries,
        encoder_hidden_states  = image_embeds,
        encoder_attention_mask = attention_mask,
        return_dict            = True,
    )
    mask_tokens = qf_out.last_hidden_state

# Average over dim 1 (the query-token axis of batch_queries): one vector per overlay.
mask_token_single = mask_tokens.mean(dim=1)
print("mask_token_single shape:", mask_token_single.shape)

# map_location="cpu" lets masks that were serialized from CUDA tensors load on
# a CPU-only machine and keeps them on the same device as the saved tokens.
# NOTE: torch.load unpickles the file, so only load mask files you trust.
mask_list = [torch.load(mask_path, map_location="cpu") for mask_path in mask_paths]

save_path = "/home/s2behappy4/data/gyuhyeong/code/bridge_data/screw_token_mask_pairs.pt"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# "mask_tokens" holds the pooled (mean) tokens, not the per-query Q-Former output;
# "masks" is a list in the same order as mask_paths.
data = {
    "mask_tokens": mask_token_single.cpu(),
    "masks":       mask_list
}

torch.save(data, save_path)
print(f"✅ Saved {num_masks} token-mask pairs to\n   {save_path}")

print("mask_tokens shape       :", mask_tokens.shape)
print("mask_token_single shape :", mask_token_single.shape)

# # Hazelnut Token

In [None]:
import os
import glob
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image

# Build (pooled Q-Former token, mask) pairs for the overlays under overlays/hazelnut
# and save them to a single .pt file.
# Flow: load BLIP-2 -> encode every overlay PNG -> run the Q-Former -> average
# over dim 1 -> store the pooled tokens together with the masks read from disk.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model     = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl"
).to(device)
model.eval()

base_dir     = "/home/s2behappy4/data/gyuhyeong/code/overlays/hazelnut"
# Overlays and masks are paired purely by their position in these two sorted lists.
overlay_paths = sorted(glob.glob(os.path.join(base_dir, "overlay_*.png")))
mask_paths    = sorted(glob.glob(os.path.join(base_dir, "mask_*.pt")))

# Stop here with a readable message instead of letting an empty batch reach
# the processor / model further down.
if not overlay_paths:
    raise FileNotFoundError(f"No overlay_*.png files found in {base_dir}")

assert len(overlay_paths) == len(mask_paths), "Overlay와 mask 파일 수 불일치"

# Equal counts do not guarantee that the i-th overlay belongs to the i-th mask.
# Compare the part of each file name after its prefix and warn (non-fatal)
# when the two sequences differ.
overlay_ids = [
    os.path.splitext(os.path.basename(path))[0][len("overlay_"):]
    for path in overlay_paths
]
mask_ids = [
    os.path.splitext(os.path.basename(path))[0][len("mask_"):]
    for path in mask_paths
]
if overlay_ids != mask_ids:
    print("⚠️ overlay/mask file name suffixes differ; token-mask pairs may be misaligned")

# convert() returns a fully loaded, independent RGB image, so the source file
# can be closed right away by the context manager.
images = []
for overlay_path in overlay_paths:
    with Image.open(overlay_path) as overlay_img:
        images.append(overlay_img.convert("RGB"))
num_masks = len(images)
print(f"Loaded {num_masks} overlay images and masks")

# All overlays are preprocessed and encoded as a single batch.
inputs = processor(images=images, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

with torch.no_grad():
    vision_out   = model.vision_model(pixel_values=pixel_values)
    image_embeds = vision_out.last_hidden_state

# Repeat the learned query tokens along dim 0, once per image
# (expand creates a view, no copy).
query_tokens  = model.query_tokens.to(device)
batch_queries = query_tokens.expand(num_masks, -1, -1)
# All-ones mask over every dim of image_embeds except the last:
# the Q-Former may attend to every image position.
attention_mask = torch.ones(
    image_embeds.size()[:-1], device=device, dtype=torch.long
)

with torch.no_grad():
    qf_out     = model.qformer(
        query_embeds           = batch_queries,
        encoder_hidden_states  = image_embeds,
        encoder_attention_mask = attention_mask,
        return_dict            = True,
    )
    mask_tokens = qf_out.last_hidden_state

# Average over dim 1 (the query-token axis of batch_queries): one vector per overlay.
mask_token_single = mask_tokens.mean(dim=1)
print("mask_token_single shape:", mask_token_single.shape)

# map_location="cpu" lets masks that were serialized from CUDA tensors load on
# a CPU-only machine and keeps them on the same device as the saved tokens.
# NOTE: torch.load unpickles the file, so only load mask files you trust.
mask_list = [torch.load(mask_path, map_location="cpu") for mask_path in mask_paths]

save_path = "/home/s2behappy4/data/gyuhyeong/code/bridge_data/hazelnut_token_mask_pairs.pt"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# "mask_tokens" holds the pooled (mean) tokens, not the per-query Q-Former output;
# "masks" is a list in the same order as mask_paths.
data = {
    "mask_tokens": mask_token_single.cpu(),
    "masks":       mask_list
}

torch.save(data, save_path)
print(f"✅ Saved {num_masks} token-mask pairs to\n   {save_path}")

print("mask_tokens shape       :", mask_tokens.shape)