In [1]:
import torch
import numpy as np
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from accelerate import Accelerator
from typing import List, Dict, Any, Tuple

# --- 0. Mock Utils (실제 환경에 토크나이저가 없을 경우를 대비한 임시 클래스) ---
class MockFastTokenizer:
    """physical-intelligence/fast 토크나이저가 없을 때 테스트용"""
    def __call__(self, action):
        # 액션 차원만큼 임의의 토큰 ID 리스트 반환 (예: 0~255 사이)
        return [(np.array(action) * 10).astype(int).flatten().tolist()]

# --- 1. Model & Processor Loading ---
def load_model_and_processor(accelerator: Accelerator):
    """
    Load NORA model and processors.
    """
    accelerator.print(f"Loading Processor & Model from 'declare-lab/nora'...")
    
    # 1. Load Processor
    try:
        processor = AutoProcessor.from_pretrained('declare-lab/nora', trust_remote_code=True)
    except Exception as e:
        accelerator.print(f"Warning: Failed to load specific processor, using default Qwen2.5-VL. Error: {e}")
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", trust_remote_code=True)

    processor.tokenizer.padding_side = 'left'

    # 2. Load Model
    # GLIBC 호환성 문제시 attn_implementation="eager" 또는 "sdpa" 사용
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        'declare-lab/nora',
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2", 
        device_map="auto"
    )

    # 3. Freeze Logic
    accelerator.print("Freezing Qwen VLM parameters...")

    # Freeze vision encoder
    if hasattr(model, 'visual'):
        for param in model.visual.parameters():
            param.requires_grad = False
        accelerator.print("  ✓ Vision encoder frozen")

    # Freeze language model backbone
    if hasattr(model, 'model'):
        for param in model.model.parameters():
            param.requires_grad = False
        accelerator.print("  ✓ Language model frozen")

    # Keep lm_head trainable
    if hasattr(model, 'lm_head'):
        for param in model.lm_head.parameters():
            param.requires_grad = True
        accelerator.print("  ✓ LM head (action decoder) trainable")

    # Count parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    accelerator.print(f"  Trainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")

    # 4. Load FAST tokenizer
    try:
        accelerator.print("Loading FAST tokenizer...")
        fast_tokenizer = AutoProcessor.from_pretrained(
            "physical-intelligence/fast", trust_remote_code=True
        )
    except Exception as e:
        accelerator.print(f"Warning: Could not load 'physical-intelligence/fast'. Using MockTokenizer for testing. Error: {e}")
        fast_tokenizer = MockFastTokenizer()

    return model, processor, fast_tokenizer

# --- 2. Helper Functions ---
def map_fast_token_to_vlm_action(tokens: List[int]) -> str:
    """
    Maps FAST action tokens to VLM action format string.
    """
    return ''.join([f"<robot_action_{token}>" for token in tokens])

def process_example_for_nora(example: Dict[str, Any], processor: AutoProcessor, fast_tokenizer: Any) -> List[Dict[str, Any]]:
    """
    Process a single example for NORA training format.
    """
    # 1. Get Action
    action = example['action']
    if isinstance(action, torch.Tensor):
        action = action.numpy()
    
    if len(action.shape) == 1:
        action = action[np.newaxis, :]  # (1, action_dim)

    # 2. Tokenize Action
    # fast_tokenizer returns input_ids or raw tokens depending on implementation.
    # Here we assume it returns a list of token IDs per sample.
    fast_tokens_batch = fast_tokenizer(action)
    
    # Handle Mock vs Real return types
    if isinstance(fast_tokens_batch, dict) and 'input_ids' in fast_tokens_batch:
        fast_tokens = fast_tokens_batch['input_ids'][0]
    elif isinstance(fast_tokens_batch, list):
        fast_tokens = fast_tokens_batch[0]
    else:
        fast_tokens = fast_tokens_batch # Fallback

    vlm_action_str = map_fast_token_to_vlm_action(fast_tokens)

    # 3. Process Image
    # example['observation.images.camera1'] is assumed (C, H, W) Tensor
    pixel_values = example['observation.images.camera1'] 
    
    # (C, H, W) -> (H, W, C) & Scale
    img_np = pixel_values.permute(1, 2, 0).numpy()
    img_np = (img_np * 255).astype(np.uint8)
    pil_image = Image.fromarray(img_np)

    # 4. Construct Message
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": example['task']},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": vlm_action_str},
            ],
        },
    ]

    return messages

# --- 3. Main Execution ---
if __name__ == "__main__":
    # 1. Accelerator Init
    accelerator = Accelerator()
    
    # 2. Load System
    model, processor, fast_tokenizer = load_model_and_processor(accelerator)
    model.eval() # Test mode

    # 3. Create Dummy Data (Mocking HDF5 Dataset output)
    accelerator.print("\nGenerating dummy data...")
    dummy_example = {
        'observation.images.camera1': torch.rand(3, 224, 224), # Random Image (C, H, W)
        'action': torch.tensor([0.1, 0.2, 0.3, 0.0, 0.0, 1.0]), # 6-DOF Action
        'task': "Pick up the red apple on the table."
    }

    # 4. Process Data
    accelerator.print("Processing example...")
    messages = process_example_for_nora(dummy_example, processor, fast_tokenizer)
    
    # Debug: Print structure
    accelerator.print("\n[Processed Messages Structure]:")
    print(f"User: {messages[0]['content'][1]['text']}")
    print(f"Assistant (Action Tokens): {messages[1]['content'][0]['text']}")

    # 5. Prepare Inputs for Model (Creating Tensors)
    text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    image_inputs, video_inputs = [], []
    for msg in messages:
        for content in msg["content"]:
            if content["type"] == "image":
                image_inputs.append(content["image"])
            # 비디오 처리 로직이 있다면 여기에 추가
    
    # [수정됨] 비디오 리스트가 비어있으면 None을 전달해야 에러가 안 납니다.
    final_video_inputs = video_inputs if len(video_inputs) > 0 else None

    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        videos=final_video_inputs,  # <--- 수정된 부분 ([] 대신 None 전달)
        padding=True,
        return_tensors="pt"
    )
    
    inputs = inputs.to(model.device)
    # 6. Inference Test (Forward Pass)
    accelerator.print("\nRunning Forward Pass (Test)...")
    with torch.no_grad():
        # labels logic is needed for training, here we just check generation or forward shape
        outputs = model.generate(**inputs, max_new_tokens=20)
        
    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)
    accelerator.print("\n[Generation Result]:")
    accelerator.print(generated_text)
    
    accelerator.print("\nTest Complete! Code is valid.")

  from .autonotebook import tqdm as notebook_tqdm
2026-02-01 21:03:45.525304: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-01 21:03:45.569668: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-01 21:03:45.569707: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-01 21:03:45.571043: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-01 21:03:45.5

Loading Processor & Model from 'declare-lab/nora'...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]


Freezing Qwen VLM parameters...
  ✓ Vision encoder frozen
  ✓ Language model frozen
  ✓ LM head (action decoder) trainable
  Trainable parameters: 314,804,224 / 3,758,262,272 (8.38%)
Loading FAST tokenizer...


Some kwargs in processor config are unused and will not have any effect: action_dim, vocab_size, time_horizon, scale, min_token. 



Generating dummy data...
Processing example...

[Processed Messages Structure]:
User: Pick up the red apple on the table.
Assistant (Action Tokens): <robot_action_444><robot_action_277><robot_action_257><robot_action_308>

Running Forward Pass (Test)...

[Generation Result]:
['system\nYou are a helpful assistant.\nuser\nPick up the red apple on the table.\nassistant\n\nassistant\n']

Test Complete! Code is valid.


In [3]:
torch.__version__

'2.4.0+cu121'

In [2]:
pip uninstall -y flash-attn

Found existing installation: flash-attn 2.6.1
Uninstalling flash-attn-2.6.1:
  Successfully uninstalled flash-attn-2.6.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
!pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

Collecting flash-attn==2.6.1+cu123torch2.4cxx11abifalse
  Downloading https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl (198.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.5/198.5 MB[0m [31m9.3 MB/s[0m  [33m0:00:21[0m[0m eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: flash-attn
Successfully installed flash-attn-2.6.1
