In [None]:
# !pip install xformers
# !pip install pillow
# !pip install transformers==4.57.0
# !pip install --no-deps --force-reinstall torch # jetson orin AGX version https://developer.download.nvidia.com/compute/redist/jp/
# !pip install --no-cache $TORCH_INSTALL torchvision # jetson orin AGX version
# !pip install --force-reinstall "numpy<2.0"

import sys
sys.path.append("/home/zed_box/NAS/Qwen2.5-VL-3B-_OCT_FPI_Action_Model")
import numpy as np

import torch
import time

from PIL import Image
from transformers import AutoProcessor
import matplotlib.pyplot as plt

from model import QwenVLAForAction
from Total_Dataset import BridgeRawSequenceDataset, collate_fn

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CKPT_PATH = "../checkpoints/qwen_vla_final_1000.pt"
VL_MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"

# Instantiate the action model with the Qwen2.5-VL checkpoint name and move it
# to DEVICE.
model = QwenVLAForAction(
    vl_model_name=VL_MODEL_NAME,
    action_dim=7,
    horizon=8,
    hidden_dim=1024,
    cache_dir="./cache/qwen_vl_features",
).to(DEVICE)

model.set_cache(False)  # Disable the caching feature itself.

# weights_only=True restricts unpickling to tensors and plain containers.
checkpoint = torch.load(CKPT_PATH, map_location=DEVICE, weights_only=True)

# strict=False tolerates key mismatches between the checkpoint and the model.
# Report them explicitly: a silently skipped key leaves that parameter at its
# freshly initialised value, which is easy to miss at inference time.
load_result = model.load_state_dict(checkpoint["model_state_dict"], strict=False)
# getattr keeps this working even if load_state_dict is overridden and returns
# something other than the standard (missing_keys, unexpected_keys) result.
missing_keys = list(getattr(load_result, "missing_keys", None) or [])
unexpected_keys = list(getattr(load_result, "unexpected_keys", None) or [])
if missing_keys:
    print(f"[checkpoint] {len(missing_keys)} missing key(s): {missing_keys}")
if unexpected_keys:
    print(f"[checkpoint] {len(unexpected_keys)} unexpected key(s): {unexpected_keys}")

model.eval()

ImportError: /home/zed_box/miniconda3/envs/qwen_env/lib/python3.10/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /home/zed_box/miniconda3/envs/qwen_env/lib/python3.10/site-packages/torch/lib/libtorch_python.so)

In [None]:

# Processor (same preprocessing as Qwen2.5-VL)
processor = AutoProcessor.from_pretrained(VL_MODEL_NAME)

# ===========================================
# 2️⃣ Example input
# ===========================================
# Load a sample image (e.g. a test jpg)
image_path = "/home/zed_box/NAS/VLA_make_the_dataset/dataset/part2/ZED_Captures_11th/view1/zed_41182735_left_1759125181.869.jpg"  # any image will do
# convert() returns an independent RGB copy, so the file handle can be closed
# as soon as the copy exists.
with Image.open(image_path) as raw_image:
    image = raw_image.convert("RGB")

instruction = "Move the gripper towards the white block."

# All-zero chunk created on DEVICE (not a hard-coded "cuda") so the CPU
# fallback keeps working.
# NOTE(review): shape presumably is (batch, horizon, action_dim), matching the
# horizon=8 / action_dim=7 used to build the model - confirm against the model.
z_chunk = torch.zeros((1, 8, 7), dtype=torch.bfloat16, device=DEVICE)

# ===========================================
# 3️⃣ Build the model inputs
# ===========================================
# NOTE(review): `inputs` is not passed to the model call below, which receives
# the raw instruction and image instead. It is kept in case later cells use it.
inputs = processor(
    text=instruction,
    images=image,
    return_tensors="pt"
).to(DEVICE)

# ===========================================
# 4️⃣ Timed inference
# ===========================================
# CUDA kernels launch asynchronously, so synchronise before reading the clock
# on both sides of the timed region. Skipped without CUDA, where
# torch.cuda.synchronize() would raise.
if torch.cuda.is_available():
    torch.cuda.synchronize()
# perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()

with torch.no_grad():
    output = model(
        text_inputs=instruction,
        image_inputs=[[image]],
        z_chunk=z_chunk
    )

if torch.cuda.is_available():
    torch.cuda.synchronize()
end = time.perf_counter()
elapsed = end - start

print(f"✅ Inference time: {elapsed*1000:.2f} ms")

# ===========================================
# 5️⃣ Check the result
# ===========================================
# The model may return a tuple/list; in that case the first element is taken
# as the predicted action.
if isinstance(output, (tuple, list)):
    pred_action = output[0]
else:
    pred_action = output

print("Predicted action:", pred_action)
print("Shape:", pred_action.shape)
