In [None]:
## VL 모델
# PaliGemma2 PT 아니면 Mix로 시도해보기 // multi image 지원 안됨 - 제외
# DeepSeek‑VL2 tiny 3B
# Qwen2.5-VL 3B (AWQ는 아님)

# 추가적인 모델
# SAIL-VL2 3B
# SmolVLM 3B

## OCT/FPI 센서 인코더
# MLP block?

## Action Experts
# 

## Dataset
# BridgeDataV2 - scripted는 트레젝토리를 찍은 데이터셋 = 내꺼와 유사함 -> v2만 사용하며, 그중에서도 datacol2를 사용
# DROID
# ALOHA

## chunking size
# 

In [None]:
# ================================
# 4️⃣ Dataset Test & Verification
# ================================
# Smoke test for BridgeRawSequenceDataset: builds a small dataset, inspects one
# sample, pulls one batch through a DataLoader and displays one frame.
# NOTE: BridgeRawSequenceDataset is defined in the "Dataset Load" cell below;
# that cell must be executed before this one.
if __name__ == "__main__":
    import matplotlib.pyplot as plt
    import torch  # imported locally so this cell does not rely on another cell's import

    # ⚠️ Change this to the root directory of your own dataset.
    DATASET_ROOT = "/home/najo/NAS/VLA/dataset/raw/bridge_data_v2"

    print("🧪 [Step 1] Initializing Dataset...")
    try:
        dataset = BridgeRawSequenceDataset(
            root=DATASET_ROOT,
            horizon=8,
            im_size=224,
            max_traj=10  # limit the number of trajectories for a quick test
        )
        print(f"✅ Dataset initialized successfully. Found {len(dataset)} samples.")
    except Exception as e:
        print(f"❌ Dataset initialization failed: {e}")
        # Re-raise instead of calling exit(): exit() terminates the interpreter
        # (the kernel, in a notebook) and discards the traceback.
        raise

    print("\n🧪 [Step 2 & 3] Fetching and Verifying a single sample...")
    # --- Single-sample test ---
    sample = dataset[0]
    images = sample['images']
    actions = sample['actions']
    instruction = sample['instruction']

    print(f"  - Instruction: '{instruction}'")
    print(f"  - Images shape: {images.shape}")
    print(f"     (H, V, C, H, W) = ({images.shape[0]}, {images.shape[1]}, {images.shape[2]}, {images.shape[3]}, {images.shape[4]})")
    print(f"  - Actions shape: {actions.shape}")
    print(f"     (H, A) = ({actions.shape[0]}, {actions.shape[1]})")
    print(f"  - Images dtype: {images.dtype}")
    print(f"  - Actions dtype: {actions.dtype}")
    print(f"  - Images value range: [{images.min():.2f}, {images.max():.2f}]")
    print("✅ Single sample verification complete.")


    print("\n🧪 [Step 4] Testing with DataLoader...")
    # --- DataLoader test ---
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=4,
        shuffle=True,
        num_workers=2  # exercise multi-process loading
    )

    batch_sample = next(iter(data_loader))
    batch_images = batch_sample['images']
    batch_actions = batch_sample['actions']
    batch_instructions = batch_sample['instruction']

    print(f"  - Batched Images shape: {batch_images.shape}")
    print(f"     (B, H, V, C, H, W)")
    print(f"  - Batched Actions shape: {batch_actions.shape}")
    print(f"     (B, H, A)")
    print(f"  - Batched Instructions: {len(batch_instructions)} instructions")
    print(f"    e.g., '{batch_instructions[0]}'")
    print("✅ DataLoader test successful.")


    print("\n🖼️ [Bonus] Visualizing a sample image...")
    # --- Visualization test ---
    # Take the image of the first time step and the first view.
    img_tensor = images[0, 0, :, :, :]  # (C, H, W)

    # Undo the default Normalize(mean=0.5, std=0.5): map [-1, 1] back to [0, 1].
    img_tensor = (img_tensor * 0.5) + 0.5

    # Reorder axes (C, H, W) -> (H, W, C) for matplotlib.
    img_to_show = img_tensor.permute(1, 2, 0).numpy()

    plt.imshow(img_to_show)
    plt.title("Sample Image (t=0, view=0)")
    plt.axis('off')
    plt.show()

## Dataset Load

In [1]:
import os
import glob
import pickle
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from tqdm import tqdm


class BridgeRawSequenceDataset(Dataset):
    """Fixed-length multi-view image/action windows read from raw trajectory folders.

    Trajectory folders are discovered with the glob ``<root>/datacol2_*/**/traj*``
    and kept only when they contain a ``policy_out.pkl`` file. Every sample is a
    window of ``horizon`` consecutive time steps starting at some frame index.

    Args:
        root: Dataset root directory that contains the ``datacol2_*`` folders.
        horizon: Number of consecutive time steps per sample.
        im_size: If truthy, every image is resized to ``(im_size, im_size)``.
        num_views: Number of camera views to load. When ``None`` it is detected
            from the ``images*`` entries of the first trajectory.
        transform: Callable applied to each PIL image. Defaults to ``ToTensor``
            followed by ``Normalize(mean=0.5, std=0.5)``.
        max_traj: If truthy, keep only the first ``max_traj`` trajectories
            (in sorted path order, so the subset is reproducible).

    Raises:
        FileNotFoundError: If no trajectory with ``policy_out.pkl`` is found.

    Each item is a dict with:
        ``images``: tensor stacked as (horizon, views, *transform output),
        ``actions``: float32 tensor with ``horizon`` rows,
        ``instruction``: contents of ``lang.txt`` (stripped) or ``""``.
    """

    def __init__(self, root, horizon=8, im_size=None, num_views=None, transform=None, max_traj=None):
        self.root = root
        self.horizon = horizon
        self.im_size = im_size
        self.transform = transform or transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        # traj_path -> action array; filled while indexing so __getitem__ does
        # not have to unpickle policy_out.pkl on every access.
        self._action_cache = {}

        # Collect trajectory folders. glob order is filesystem dependent, so the
        # result is sorted to make indexing (and max_traj) deterministic.
        candidates = []
        for dc in sorted(glob.glob(os.path.join(root, "datacol2_*"))):
            candidates += glob.glob(os.path.join(dc, "**", "traj*"), recursive=True)
        self.traj_paths = sorted(
            p for p in candidates if os.path.exists(os.path.join(p, "policy_out.pkl"))
        )
        if not self.traj_paths:
            # Fail early with a clear message instead of an IndexError below.
            raise FileNotFoundError(
                f"No trajectory folders containing policy_out.pkl were found under "
                f"{os.path.join(str(root), 'datacol2_*')}"
            )

        if max_traj:
            self.traj_paths = self.traj_paths[:max_traj]

        # Auto-detect the number of views from the first trajectory.
        if num_views is None:
            sample_traj = self.traj_paths[0]
            view_dirs = sorted([d for d in os.listdir(sample_traj) if d.startswith("images")])
            self.num_views = len(view_dirs)
        else:
            self.num_views = num_views

        # Build the (trajectory, start index) table.
        self.samples = self._index_chunks()

    def _load_actions(self, traj_path):
        """Return the action array of one trajectory, loading it at most once."""
        cache = self.__dict__.setdefault("_action_cache", {})
        actions = cache.get(traj_path)
        if actions is None:
            # NOTE: pickle can execute arbitrary code; only use trusted datasets.
            with open(os.path.join(traj_path, "policy_out.pkl"), "rb") as f:
                actions = pickle.load(f)
            if len(actions) > 0 and isinstance(actions[0], dict):
                actions = [a.get("actions") for a in actions if "actions" in a]
            actions = np.array(actions)
            cache[traj_path] = actions
        return actions

    def _index_chunks(self):
        """List every (traj_path, start_idx) window that has a full horizon.

        The usable length of a trajectory is the smaller of its ``images0`` frame
        count and its action count, so no window ends up with fewer than
        ``horizon`` actions (which would break batch collation).
        """
        samples = []
        for traj_path in tqdm(self.traj_paths, desc="Indexing Chunks"):
            img_dir = os.path.join(traj_path, "images0")
            imgs = glob.glob(os.path.join(img_dir, "im_*.jpg"))
            if not imgs:
                continue
            usable_steps = min(len(imgs), len(self._load_actions(traj_path)))
            chunk_count = max(usable_steps - self.horizon + 1, 0)
            samples.extend((traj_path, i) for i in range(chunk_count))
        print(f"✅ Indexed {len(samples)} chunks from {len(self.traj_paths)} trajectories")
        return samples

    def _load_views(self, traj_path, t):
        """Load and transform all views of time step ``t`` and stack them."""
        views = []
        for v in range(self.num_views):
            img_path = os.path.join(traj_path, f"images{v}", f"im_{t}.jpg")
            # Context manager closes the file handle; convert() returns a
            # fully loaded copy that stays valid afterwards.
            with Image.open(img_path) as raw:
                img = raw.convert("RGB")
            if self.im_size:
                img = img.resize((self.im_size, self.im_size))
            views.append(self.transform(img))
        return torch.stack(views, dim=0)  # (V, C, H, W)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        traj_path, start_idx = self.samples[idx]
        stop_idx = start_idx + self.horizon

        # Multi-view image sequence: (H, V, C, H, W)
        img_seq = torch.stack(
            [self._load_views(traj_path, t) for t in range(start_idx, stop_idx)], dim=0
        )

        # Action sequence; torch.tensor copies, so the cached array is never aliased.
        actions = self._load_actions(traj_path)
        act_seq = torch.tensor(actions[start_idx:stop_idx], dtype=torch.float32)

        # Language instruction (optional file).
        lang_path = os.path.join(traj_path, "lang.txt")
        if os.path.exists(lang_path):
            with open(lang_path, "r") as f:
                lang = f.read().strip()
        else:
            lang = ""

        return {"images": img_seq, "actions": act_seq, "instruction": lang}


In [2]:
# Quick check of BridgeRawSequenceDataset on the first 50 trajectories,
# keeping the original image resolution (im_size=None).
# NOTE(review): root_dir is an absolute, machine-specific path.
root_dir = "/home/najo/NAS/VLA/dataset/raw/bridge_data_v2"
dataset = BridgeRawSequenceDataset(root=root_dir, horizon=8, im_size=None, max_traj=50)
sample = dataset[0]

print(sample["images"].shape)   # (8, V, 3, H, W)
print(sample["actions"].shape)  # (8, A)
print(sample["instruction"])    # text or ""


Indexing Chunks: 100%|██████████| 50/50 [00:00<00:00, 9617.32it/s]

✅ Indexed 1299 chunks from 50 trajectories
torch.Size([8, 3, 3, 480, 640])
torch.Size([8, 7])






In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# ================================
# 1️⃣ Action Expert Head
# ================================
class QwenActionExpert(nn.Module):
    """Transformer-decoder head that predicts a residual for an action chunk.

    The vision-language tokens are mean-pooled into a single conditioning token,
    projected to ``hidden_dim`` and used as the decoder memory. The decoder
    queries are ``horizon`` learned position embeddings. The output is a
    per-step delta that is added to ``z_chunk``.

    Args:
        vl_dim: Feature size of the incoming vision-language tokens.
        action_dim: Size of one action vector.
        horizon: Number of learned query positions (action steps).
        hidden_dim: Decoder model width.
        nhead: Number of attention heads.
        num_layers: Number of decoder layers.
    """

    def __init__(self, vl_dim=3072, action_dim=7, horizon=8, hidden_dim=1024, nhead=8, num_layers=4):
        super().__init__()
        self.horizon = horizon
        self.cond_proj = nn.Linear(vl_dim, hidden_dim)
        self.pos_embed = nn.Parameter(torch.randn(1, horizon, hidden_dim))

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=hidden_dim, nhead=nhead, dim_feedforward=hidden_dim * 4,
            dropout=0.1, batch_first=True
        )
        self.temporal_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.output_head = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, action_dim)
        )

    def forward(self, vl_tokens: torch.Tensor, z_chunk: torch.Tensor, attention_mask: torch.Tensor = None):
        """Predict actions as ``z_chunk + delta``.

        Args:
            vl_tokens: (B, T, vl_dim) vision-language token features.
            z_chunk: (B, H, A) base action chunk the delta is added to.
            attention_mask: Optional (B, T) mask, non-zero for real tokens. When
                given, padded positions are excluded from the pooled mean; when
                ``None`` all tokens are averaged (the previous behaviour).

        Returns:
            Tuple ``(pred_actions, delta)``.
        """
        batch_size, _, _ = z_chunk.shape  # also enforces a rank-3 z_chunk

        if attention_mask is None:
            pooled = vl_tokens.mean(dim=1, keepdim=True)
        else:
            # Pool in float32: low-precision dtypes cannot represent large token
            # counts exactly, which would bias the mean.
            weights = attention_mask.to(device=vl_tokens.device, dtype=torch.float32).unsqueeze(-1)
            summed = (vl_tokens.float() * weights).sum(dim=1, keepdim=True)
            token_count = weights.sum(dim=1, keepdim=True).clamp(min=1.0)
            pooled = (summed / token_count).to(vl_tokens.dtype)

        cond = self.cond_proj(pooled)                       # (B, 1, Hd)
        # expand() broadcasts the learned queries without copying them per sample.
        tgt = self.pos_embed.expand(batch_size, -1, -1)     # (B, H, Hd)

        decoded = self.temporal_decoder(tgt, cond)  # (B,H,Hd)
        delta = self.output_head(decoded)           # (B,H,A)
        pred_actions = z_chunk + delta

        return pred_actions, delta

# ================================
# 2️⃣ Full Qwen-VL + ActionExpert Model
# ================================
class QwenVLAForAction(nn.Module):
    """Frozen Qwen2.5-VL backbone followed by a trainable ``QwenActionExpert``.

    Args:
        vl_model_name: Hugging Face model id of the Qwen2.5-VL checkpoint.
        action_dim: Size of one action vector.
        horizon: Number of action steps predicted per call.
        hidden_dim: Width of the action expert.

    The backbone is loaded in bfloat16 on CUDA with FlashAttention-2, and all of
    its parameters get ``requires_grad = False``.
    """

    def __init__(self, vl_model_name="Qwen/Qwen2.5-VL-3B-Instruct",
                 action_dim=7, horizon=8, hidden_dim=1024):
        super().__init__()
        print(f"🚀 Loading base model and processor: {vl_model_name}")

        self.processor = AutoProcessor.from_pretrained(vl_model_name)
        # NOTE(review): recent transformers versions warn that `torch_dtype` is
        # deprecated in favour of `dtype`; kept for compatibility with older versions.
        self.vl_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            vl_model_name,
            attn_implementation="flash_attention_2",
            torch_dtype=torch.bfloat16,
            device_map="cuda",
        )
        self.action_expert = QwenActionExpert(
            vl_dim=self.vl_model.config.hidden_size,
            action_dim=action_dim,
            horizon=horizon,
            hidden_dim=hidden_dim,
        ).to(dtype=torch.bfloat16, device="cuda")

        print("🧊 Freezing the base VL model...")
        for param in self.vl_model.parameters():
            param.requires_grad = False
        print("✅ VL model has been frozen.")


    def forward(self, text_inputs, image_inputs, z_chunk):
        """Encode (images, text) per sample and predict an action chunk.

        Args:
            text_inputs: Sequence of B instruction strings.
            image_inputs: Sequence of B image collections (one per sample); each
                element is passed as the ``image`` field of a chat message.
            z_chunk: (B, H, A) base action chunk.

        Returns:
            Tuple ``(pred_actions, delta)`` from the action expert.

        Raises:
            ValueError: If ``text_inputs`` and ``image_inputs`` differ in length.
        """
        if len(text_inputs) != len(image_inputs):
            raise ValueError(
                f"text_inputs ({len(text_inputs)}) and image_inputs ({len(image_inputs)}) "
                f"must have the same length"
            )

        # One single-turn conversation PER SAMPLE. A flat list of user messages
        # would be rendered as one multi-turn conversation, collapsing the batch.
        conversations = [
            [{"role": "user", "content": [{"type": "image", "image": img} for img in imgs] + [{"type": "text", "text": txt}]}]
            for imgs, txt in zip(image_inputs, text_inputs)
        ]
        texts = [
            self.processor.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
            for conv in conversations
        ]
        img_inputs, vid_inputs = process_vision_info(conversations)

        inputs = self.processor(
            text=texts,
            images=img_inputs,
            videos=vid_inputs,
            padding=True,
            return_tensors="pt",
        ).to(next(self.parameters()).device)

        outputs = self.vl_model(**inputs, output_hidden_states=True, return_dict=True)
        vl_tokens = outputs.hidden_states[-1]  # (B, T, Dv), dtype=bf16
        z_chunk = z_chunk.to(device=vl_tokens.device, dtype=vl_tokens.dtype)
        # Pass the mask so padding added for batching does not dilute the pooled feature.
        pred_actions, delta = self.action_expert(
            vl_tokens, z_chunk, attention_mask=inputs["attention_mask"]
        )

        return pred_actions, delta

def print_trainable_parameters(model):
    """
    Print the number of trainable parameters of ``model``, the total number of
    parameters, and the trainable share as a percentage.
    """
    counts = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, is_trainable in counts if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

# ================================
# 3️⃣ Example Usage
# ================================
# Example run: build the model, report parameter counts, then run one forward
# pass with a random action chunk.
if __name__ == "__main__":
    import PIL.Image as Image

    # Example images and text.
    # NOTE(review): absolute, machine-specific paths.
    image_paths = [
        "/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view1/right/zed_41182735_right_1759395176.397.jpg",
        "/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view2/left/zed_49429257_left_1759395176.393.jpg",
        "/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view3/left/zed_44377151_left_1759395176.448.jpg",
        "/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view4/left/zed_49045152_left_1759395176.244.jpg",
        "/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view5_oak/oak_1944301011169A4800_1759395176.364.jpg",
        "/home/najo/NAS/VLA/dataset/output.png"
    ]

    # Instantiate the model, then inspect it.
    model = QwenVLAForAction(vl_model_name="Qwen/Qwen2.5-VL-3B-Instruct", action_dim=7, horizon=8).to("cuda")
    # ================================
    # 📊 Detailed parameter analysis
    # ================================

    # 1. Parameter counts of the vision-language model
    vl_model_params = sum(p.numel() for p in model.vl_model.parameters())
    vl_model_trainable = sum(p.numel() for p in model.vl_model.parameters() if p.requires_grad)

    # 2. Parameter counts of the action-expert head
    action_expert_params = sum(p.numel() for p in model.action_expert.parameters())
    action_expert_trainable = sum(p.numel() for p in model.action_expert.parameters() if p.requires_grad)

    # 3. Totals
    total_params = vl_model_params + action_expert_params
    trainable_params = vl_model_trainable + action_expert_trainable

    # Print the results
    print("="*50)
    print("📊 모델 파라미터 분석")
    print("="*50)

    print(f"👁️ Vision-Language 모델 (vl_model):")
    print(f"  - 전체 파라미터:     {vl_model_params:,}")
    print(f"  - 학습 가능 파라미터: {vl_model_trainable:,} (동결됨)")
    print("-" * 50)

    print(f"🤖 Action Expert 헤드 (action_expert):")
    print(f"  - 전체 파라미터:     {action_expert_params:,}")
    print(f"  - 학습 가능 파라미터: {action_expert_trainable:,}")
    print("-" * 50)

    print("📈 최종 합계:")
    print(f"  - 전체 파라미터:     {total_params:,}")
    print(f"  - 학습 가능 파라미터: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
    print("="*50)

    print_trainable_parameters(model)

    # Random base action chunk: batch 1, horizon 8, action dim 7.
    B, H, A = 1, 8, 7
    z_chunk = torch.randn(B, H, A).to("cuda")

    # Run a forward pass without tracking gradients.
    with torch.no_grad():
        pred_actions, delta = model(
            text_inputs=["Describe the meaning..."],
            image_inputs=[image_paths],
            z_chunk=z_chunk
        )

    print("Predicted actions:", pred_actions.shape)
    print("Δactions:", delta.shape)


🚀 Loading base model and processor: Qwen/Qwen2.5-VL-3B-Instruct


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

🧊 Freezing the base VL model...
✅ VL model has been frozen.
📊 모델 파라미터 분석
👁️ Vision-Language 모델 (vl_model):
  - 전체 파라미터:     3,754,622,976
  - 학습 가능 파라미터: 0 (동결됨)
--------------------------------------------------
🤖 Action Expert 헤드 (action_expert):
  - 전체 파라미터:     69,302,279
  - 학습 가능 파라미터: 69,302,279
--------------------------------------------------
📈 최종 합계:
  - 전체 파라미터:     3,823,925,255
  - 학습 가능 파라미터: 69,302,279 (1.81%)
trainable params: 69302279 || all params: 3823925255 || trainable%: 1.81


  return F.conv3d(


Predicted actions: torch.Size([1, 8, 7])
Δactions: torch.Size([1, 8, 7])


## Flow Matching

In [11]:
import torch
import torch.nn as nn

class ChunkedFlowMatchingExpert(nn.Module):
    """Transformer decoder that predicts a per-step delta for a noisy action chunk.

    Each action step is concatenated with the scalar ``tau``, embedded to
    ``hidden_dim`` and used as decoder query; the vision-language tokens are the
    decoder memory (cross-attention).

    Args:
        vl_token_dim: Feature size of the vision-language tokens. When it differs
            from ``hidden_dim`` the tokens are linearly projected to
            ``hidden_dim`` first; when equal they are used unchanged.
        action_dim: Size of one action vector.
        chunk_len: Nominal chunk length (stored, not enforced in ``forward``).
        hidden_dim: Decoder model width.
        num_layers: Number of decoder layers.
        nhead: Number of attention heads.
    """

    def __init__(
        self,
        vl_token_dim: int,
        action_dim: int,
        chunk_len: int = 5,
        hidden_dim: int = 1024,
        num_layers: int = 4,
        nhead: int = 4,
    ):
        super().__init__()

        self.chunk_len = chunk_len
        self.action_dim = action_dim

        self.input_proj = nn.Sequential(
            nn.Linear(action_dim + 1, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim)
        )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=hidden_dim * 4,
            dropout=0.1,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.output_proj = nn.Linear(hidden_dim, action_dim)

        # The decoder memory must have width hidden_dim. Identity (no parameters)
        # is used when the dims already match, so existing state_dicts still load.
        if vl_token_dim == hidden_dim:
            self.memory_proj = nn.Identity()
        else:
            self.memory_proj = nn.Linear(vl_token_dim, hidden_dim)

    def forward(self, vl_tokens: torch.Tensor, z_chunk: torch.Tensor, tau: torch.Tensor):
        """
        vl_tokens: (B, T, D)
        z_chunk: (B, C, A)
        tau: (B, 1) or (B, 1, 1)

        Returns ``(pred_actions, delta)``, both (B, C, A), with
        ``pred_actions = z_chunk + delta``.

        Raises ValueError when ``z_chunk`` is not rank 3 or ``tau`` is neither
        rank 2 nor rank 3.
        """
        # === [1] Validate and normalize shapes ===
        if z_chunk.ndim != 3:
            raise ValueError(f"Expected z_chunk shape (B, C, A), got {z_chunk.shape}")

        if tau.ndim == 2:
            tau = tau.unsqueeze(-1)  # (B, 1, 1)
        elif tau.ndim != 3:
            raise ValueError(f"tau shape must be (B, 1) or (B, 1, 1), got {tau.shape}")

        B, C, A = z_chunk.shape

        # === [2] Broadcast tau to every step of the chunk ===
        tau_expanded = tau.expand(B, C, 1)  # (B, C, 1)

        # === [3] Concatenate and embed ===
        z_input = torch.cat([z_chunk, tau_expanded], dim=-1)  # (B, C, A+1)
        z_embed = self.input_proj(z_input)  # (B, C, H)

        # === [4] Cross-attention over the (projected) vision-language tokens ===
        memory = self.memory_proj(vl_tokens)  # (B, T, H)
        context = self.decoder(tgt=z_embed, memory=memory)  # (B, C, H)

        # === [5] Predict delta and add it back as a residual ===
        delta = self.output_proj(context)  # (B, C, A)
        pred_actions = z_chunk + delta     # residual prediction

        return pred_actions, delta


In [12]:
# Example dimensions: batch, chunk length, action dim / token count, token dim.
B, C, A = 4, 5, 6
T, D = 64, 1024

# Random stand-ins for the VL tokens and the two interpolation endpoints.
# No random seed is set, so the printed loss differs between runs.
vl_tokens = torch.randn(B, T, D)
x = torch.randn(B, C, A)
y = torch.randn(B, C, A)
tau = torch.rand(B, 1)

# [Important] Match dimensions so tau broadcasts against (B, C, A).
tau = tau[:, None]  # (B, 1, 1) or just let model fix it

# Build the noisy flow-matching input by interpolating between x and y.
z = tau * x + (1 - tau) * y  # (B, C, A)

# Instantiate and run the model (hidden_dim defaults to 1024, equal to D here).
model = ChunkedFlowMatchingExpert(
    vl_token_dim=D,
    action_dim=A,
    chunk_len=C,
)
pred, delta = model(vl_tokens, z, tau)

# Loss computation.
# NOTE(review): with z = tau*x + (1-tau)*y the derivative dz/dtau is x - y, while
# the regression target here is y - x; confirm this sign convention is intended.
loss = nn.MSELoss()(delta, y - x)
print("Loss:", loss.item())


Loss: 2.262145519256592


## SAIL-VL2 env test // 쉬워보임

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


# Environment test for a SAIL-VL2 checkpoint: one prompt with an image and one
# text-only prompt. model_path and image_path are placeholders to fill in.
model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
# NOTE(review): the "image" field below is the literal string 'image_path', not
# the image_path variable (which is only defined further down). The actual
# pixels are supplied separately through processor(images=image, ...).
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image"}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
# Keep only the text before the first end-of-turn marker.
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？"}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


## Qwen2.5-VL env test

In [10]:
# !pip install git+https://github.com/huggingface/transformers accelerate
# !pip install transformers==4.57.0
# # It's highly recommended to use `[decord]` feature for faster video loading.
# !pip install qwen-vl-utils[decord]==0.0.8
# !pip install torchvision>=0.19.0

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Environment test for Qwen2.5-VL-3B-Instruct: six images plus one text prompt
# in a single user turn, followed by text generation.
# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-3B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processor
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
min_pixels = 256*28*28
max_pixels = 1280*28*28
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# NOTE(review): absolute, machine-specific paths.
image_path1 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view1/right/zed_41182735_right_1759395176.397.jpg'
image_path2 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view2/left/zed_49429257_left_1759395176.393.jpg'
image_path3 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view3/left/zed_44377151_left_1759395176.448.jpg'
image_path4 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view4/left/zed_49045152_left_1759395176.244.jpg'
image_path5 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view5_oak/oak_1944301011169A4800_1759395176.364.jpg'
oct_path = '/home/najo/NAS/VLA/dataset/output.png'

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path1},
            {"type": "image", "image": image_path2},
            {"type": "image", "image": image_path3},
            {"type": "image", "image": image_path4},
            {"type": "image", "image": image_path5},
            {"type": "image", "image": oct_path},
            # {"type": "text", "text": "Describe what the robot is doing in detail and explain the end tool needle tip in the image."},
            # {"type": "text", "text": "what have to do next? for insertion the needle tip"},
            {"type": "text", "text": "Describe the meaning of the oct image with other images1 and 2"},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# NOTE(review): the model is placed with device_map="auto" but the inputs are
# moved to a hard-coded "cuda"; model.device may be the safer target — confirm.
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Strip the prompt tokens so only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


## Deepseek-VL2 env Test

In [None]:
# !git clone https://github.com/deepseek-ai/DeepSeek-VL2.git
# !pip install -e .
# !pip install "numpy<2"

In [1]:
import torch
import torchvision

# Report the installed PyTorch / torchvision versions of this kernel.
print("PyTorch Version:", torch.__version__)
print("Torchvision Version:", torchvision.__version__)

PyTorch Version: 2.0.1+cu117
Torchvision Version: 0.15.2+cu117


In [None]:
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images


# Load the DeepSeek-VL2 tiny processor, tokenizer and model.
# specify the path to the model
model_path = "deepseek-ai/deepseek-vl2-tiny"
vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

In [2]:
# Multi-image inference with DeepSeek-VL2: five camera views in one user turn.
# Requires vl_chat_processor, tokenizer, vl_gpt and load_pil_images from the
# previous cell.
# NOTE(review): absolute, machine-specific paths.
image_path1 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view1/right/zed_41182735_right_1759395176.397.jpg'
image_path2 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view2/left/zed_49429257_left_1759395176.393.jpg'
image_path3 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view3/left/zed_44377151_left_1759395176.448.jpg'
image_path4 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view4/left/zed_49045152_left_1759395176.244.jpg'
image_path5 = '/home/najo/NAS/VLA/dataset/OCT_insertion/Captures4/view5_oak/oak_1944301011169A4800_1759395176.364.jpg'

# One <image> placeholder per entry in "images", in the same order.
conversation = [
    {
        "role": "<|User|>",
        "content": "This is image_1: <image>\n"
                   "This is image_2: <image>\n"
                   "This is image_3: <image>\n"
                   "This is image_4: <image>\n"
                   "This is image_5: <image>\n Putting all the photos together describe what the robot is doing in detail and explain the end tool needle tip in the image.",
        "images": [
            image_path1,
            image_path2,
            image_path3,
            image_path4,
            image_path5,
        ],
    },
    {"role": "<|Assistant|>", "content": ""}
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,
    system_prompt=""
).to(vl_gpt.device)

# run image encoder to get the image embeddings
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# run the model to get the response (greedy decoding, up to 512 new tokens)
outputs = vl_gpt.language.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True
)

# Special tokens are kept in the decoded answer (skip_special_tokens=False).
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
print(f"{prepare_inputs['sft_format'][0]}", answer)

Python version is above 3.10, patching the collections module.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Add pad token = ['<｜▁pad▁｜>'] to the tokenizer
<｜▁pad▁｜>:2
Add image token = ['<image>'] to the tokenizer
<image>:128815
Add grounding-related tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] to the tokenizer with input_ids
<|ref|>:128816
<|/ref|>:128817
<|det|>:128818
<|/det|>:128819
<|grounding|>:128820
Add chat tokens = ['<|User|>', '<|Assistant|>'] to the tokenizer with input_ids
<|User|>:128821
<|Assistant|>:128822



You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<|User|>: This is image_1: <image>
This is image_2: <image>
This is image_3: <image>
This is image_4: <image>
This is image_5: <image>
 Putting all the photos together describe what the robot is doing in detail and explain the end tool needle tip in the image.

<|Assistant|>: The robot in the images appears to be performing tasks related to material handling or processing within an industrial setting. The first image shows a robotic arm positioned over a workbench with various objects placed around it. This setup suggests that the robot may be involved in tasks such as assembly, packaging, or material sorting.

In the second image, we see another view of the same workspace, but this time focusing more closely on the robot's interaction with one of the objects on the table. The robot seems to be using its end tool, which looks like a needle tip, to manipulate or interact with the object. This could involve tasks such as picking up, placing, or sorting items.

The third image provides a 

## Paligemma 2 env test // 허깅페이스 로그인 안됨 나중에 시도하자 - multi image input 안됨;;

In [3]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import requests
import torch

# Environment test for PaliGemma: caption one image downloaded from a URL.
# The recorded run of this cell failed with a gated-repo (401) error, i.e. the
# checkpoint requires an authenticated Hugging Face account with access.
model_id = "google/paligemma-3b-mix-224"
device = "cuda:0"
dtype = torch.bfloat16

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)

model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map=device,
    revision="bfloat16",
).eval()
processor = AutoProcessor.from_pretrained(model_id)

# Instruct the model to create a caption in Spanish
prompt = "caption es"
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
# Prompt length, used below to cut the prompt off the generated sequence.
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/paligemma-3b-mix-224.
401 Client Error. (Request ID: Root=1-68ea79dd-4d6de4e101f2c6916d6e30ca;1e98ae80-8d1b-4148-9f7d-5ef2570a0cdf)

Cannot access gated repo for url https://huggingface.co/google/paligemma-3b-mix-224/resolve/bfloat16/config.json.
Access to model google/paligemma-3b-mix-224 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [3]:
from transformers import pipeline
from transformers.image_utils import load_image

# Extract image features for one sample image with a DINOv3 ConvNeXt-tiny
# checkpoint through the transformers "image-feature-extraction" pipeline.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = load_image(url)

feature_extractor = pipeline(
    model="facebook/dinov3-convnext-tiny-pretrain-lvd1689m",
    task="image-feature-extraction", 
)
features = feature_extractor(image)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Text-only generation test with Qwen3-1.7B in thinking mode; the output is
# split into the reasoning part and the final answer.
model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

# prepare the model input
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
# Drop the prompt tokens; keep only the newly generated ids.
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

# parsing thinking content
try:
    # rindex finding 151668 (</think>): position just after its last occurrence
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    # No </think> token generated: treat the whole output as the answer.
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

thinking content: <think>
Okay, the user wants a short introduction to large language models. Let me start by defining what they are. Large language models are AI systems trained on vast amounts of text data. I should mention their purpose, like generating text, understanding language, and being used in various applications.

I need to keep it concise. Maybe start with a definition, then talk about their training, the types like GPT, and their applications. Also, mention the challenges they face, like bias and data quality. But since it's a short intro, maybe just touch on the key points without going too deep. Make sure to highlight their ability to generate human-like text and their impact on different fields. Avoid technical jargon to keep it accessible. Check for any errors and ensure the flow is logical.
</think>
content: Large language models (LLMs) are advanced AI systems designed to understand, generate, and interact with human language. Trained on vast amounts of text data, th

In [5]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoImageProcessor, AutoModel
from PIL import Image
import requests

# Prototype: feed DINOv3 image features into Qwen3 by projecting them to the
# LLM embedding width and prepending them to the text embeddings.
# -- 1. Load models and processors --

# Vision Model (DINOv3)
image_processor = AutoImageProcessor.from_pretrained("facebook/dinov3-convnext-tiny-pretrain-lvd1689m")
vision_model = AutoModel.from_pretrained("facebook/dinov3-convnext-tiny-pretrain-lvd1689m")

# Language Model (Qwen3)
model_name = "Qwen/Qwen3-1.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
vision_model.to(llm.device)

# -- 2. Define the projection layer that maps image features into the LLM input space --

# Probe the vision model with a dummy input to read its feature width.
with torch.no_grad():
    dummy_inputs = torch.randn(1, 3, 224, 224).to(llm.device)
    dummy_outputs = vision_model(dummy_inputs)
    vision_hidden_dim = dummy_outputs.last_hidden_state.shape[-1] 

llm_hidden_dim = llm.config.hidden_size
print(f"✅ Vision Model Output Dim: {vision_hidden_dim}")
print(f"✅ Language Model Input Dim: {llm_hidden_dim}")

# NOTE(review): this layer is freshly initialized and never trained in this cell,
# so the projected image embeddings are presumably meaningless to the LLM; that
# would be consistent with the repetitive text in the recorded output.
projection_layer = nn.Linear(vision_hidden_dim, llm_hidden_dim).to(llm.device, dtype=llm.dtype)

# -- 3. Prepare and process the image and the text --

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
prompt = "이 이미지에 대해 설명해줘."

with torch.no_grad():
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(llm.device, dtype=vision_model.dtype)
    image_features = vision_model(pixel_values).last_hidden_state

messages = [{"role": "user", "content": prompt}]
text_template = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
text_inputs = tokenizer(text_template, return_tensors="pt").to(llm.device)

# -- 4. Combine the image and text embeddings --

text_embeddings = llm.get_input_embeddings()(text_inputs.input_ids)

# === Only this part was modified ===
# Cast the image features to the same dtype as the LLM.
image_features_casted = image_features.to(llm.dtype)
# Pass the dtype-matched features through the projection layer.
image_embeddings = projection_layer(image_features_casted)
# ===========================

# Image embeddings come first, followed by the chat-formatted text embeddings;
# the attention mask is extended with ones for the image positions.
combined_embeddings = torch.cat([image_embeddings, text_embeddings], dim=1)
image_attention_mask = torch.ones(image_embeddings.shape[:2], dtype=torch.long, device=llm.device)
combined_attention_mask = torch.cat([image_attention_mask, text_inputs.attention_mask], dim=1)

# -- 5. Generate the answer with the LLM --

generated_ids = llm.generate(
    inputs_embeds=combined_embeddings,
    attention_mask=combined_attention_mask,
    max_new_tokens=100
)

# NOTE(review): this slice assumes generate() returns the prompt positions too;
# with inputs_embeds it may return only the new tokens, in which case the slice
# would drop real output — verify against the installed transformers version.
output_ids = generated_ids[0][combined_embeddings.shape[1]:].tolist() 
response = tokenizer.decode(output_ids, skip_special_tokens=True)

print("\n--- 모델의 답변 ---")
print(response)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Vision Model Output Dim: 768
✅ Language Model Input Dim: 2048

--- 모델의 답변 ---
, more comprehensive, more comprehensive, more comprehensive, more comprehensive, more comprehensive, more comprehensive, more comprehensive, more comprehensive, more comprehensive, more comprehensive, more comprehensive
