In [None]:
!pip install flash-attn
!pip uninstall -y timm
!pip install timm==0.9.16

Found existing installation: timm 0.9.16
Uninstalling timm-0.9.16:
  Successfully uninstalled timm-0.9.16
Collecting timm==0.9.16
  Using cached timm-0.9.16-py3-none-any.whl.metadata (38 kB)
Using cached timm-0.9.16-py3-none-any.whl (2.2 MB)
Installing collected packages: timm
Successfully installed timm-0.9.16


In [None]:
!nvidia-smi

Thu Jan  8 14:12:59 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             44W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import torch
import numpy as np
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

In [None]:
# =========================
# 1. Mock Robot Interface
# =========================

class MockRobot:
    """
    Stand-in for a real robot control interface.

    Nothing is sent to hardware; `act` only prints the command it receives.
    """

    def act(self, action: np.ndarray):
        """
        Print a 7-element action, split into its three parts.

        The first three entries are reported as Δx, Δy, Δz, the next three
        as Δroll, Δpitch, Δyaw, and the last entry as the gripper value.

        Args:
            action: Array whose `shape` must be exactly `(7,)`.

        Raises:
            AssertionError: If `action.shape` is not `(7,)`.
        """
        expected_shape = (7,)
        assert action.shape == expected_shape

        translation = action[:3]
        rotation = action[3:6]
        gripper = action[6]

        report = [
            "Executing action:",
            f"  Δx, Δy, Δz = {translation}",
            f"  Δroll, Δpitch, Δyaw = {rotation}",
            f"  gripper = {gripper}",
        ]
        print("\n".join(report))


# =========================
# 2. Load OpenVLA
# =========================

def load_openvla(
    device="cuda:0",
    model_id="openvla/openvla-7b",
    attn_implementation="eager",
):
    """
    Load the OpenVLA processor and model, and switch the model to eval mode.

    Args:
        device: Device the model is moved to after loading.
        model_id: Hugging Face Hub id (or local path) of the checkpoint.
            The processor and the model are both loaded from it.
        attn_implementation: Attention backend forwarded to
            `from_pretrained`. Defaults to "eager"; "flash_attention_2" is
            the alternative this notebook installs `flash-attn` for.

    Returns:
        A `(processor, model)` tuple.
    """
    # trust_remote_code=True runs the modelling code shipped with the
    # checkpoint, so only point `model_id` at a repository you trust.
    processor = AutoProcessor.from_pretrained(
        model_id,
        trust_remote_code=True
    )

    # NOTE(review): recent transformers releases warn that `torch_dtype` is
    # deprecated in favour of `dtype`; it is kept here so the call still
    # works on older releases. Switch once the version is pinned.
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        attn_implementation=attn_implementation,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    ).to(device)

    model.eval()
    return processor, model


# =========================
# 3. Get Image (Camera / File)
# =========================

def get_image_from_file(path: str) -> Image.Image:
    """
    Read an image file as an RGB PIL image, standing in for a camera frame.

    Args:
        path: Location of the image file to open.

    Returns:
        The image converted to "RGB" mode.
    """
    # Image.open is lazy and holds on to the file handle. The context manager
    # releases it as soon as convert() has produced the RGB image, instead of
    # leaving that to garbage collection.
    with Image.open(path) as source:
        return source.convert("RGB")


# =========================
# 4. Inference with OpenVLA
# =========================

@torch.no_grad()
def predict_action(
    processor,
    model,
    image: Image.Image,
    instruction: str,
    device="cuda:0",
    unnorm_key="bridge_orig",
):
    """
    Ask an OpenVLA model for the next robot action and return it as NumPy.

    Args:
        processor: Callable invoked as `processor(prompt, image,
            return_tensors="pt")`; its result must support
            `.to(device, dtype=...)` and unpack as keyword arguments.
        model: Object exposing `predict_action(**inputs, unnorm_key=...,
            do_sample=...)`.
        image: Current camera frame.
        instruction: Task description inserted into the prompt.
        device: Device the processed inputs are moved to.
        unnorm_key: Name of the dataset statistics used to un-normalize the
            predicted action. The default, "bridge_orig", selects
            BridgeData V2.

    Returns:
        The predicted action as a `np.ndarray`.
    """
    prompt = (
        "In: What action should the robot take to "
        f"{instruction}?\n"
        "Out:"
    )

    inputs = processor(
        prompt,
        image,
        return_tensors="pt"
    ).to(device, dtype=torch.bfloat16)

    # OpenVLA-specific API (not `generate`).
    action = model.predict_action(
        **inputs,
        unnorm_key=unnorm_key,
        do_sample=False
    )

    # Always hand back a NumPy array. A tensor result is detached and moved
    # to the CPU first; bfloat16 has no NumPy equivalent, so it is widened to
    # float32 before conversion.
    if isinstance(action, torch.Tensor):
        action = action.detach().cpu()
        if action.dtype == torch.bfloat16:
            action = action.float()
        action = action.numpy()
    return np.asarray(action)




In [None]:
# =========================
# 5. Main Script
# =========================

def main(
    image_path="scene.jpg",
    instruction="pick up the red block and place it on the blue square",
    device="cuda:0",
):
    """
    Run one end-to-end step: load OpenVLA, predict an action, hand it to the robot.

    Args:
        image_path: Image file used as the camera frame.
        instruction: Natural-language task given to the model.
        device: Device used for the model and its inputs.

    With no arguments this uses "scene.jpg", the red-block instruction and
    "cuda:0".
    """
    robot = MockRobot()

    print("Loading OpenVLA...")
    processor, model = load_openvla(device)

    print("Loading image...")
    image = get_image_from_file(image_path)

    print("Running OpenVLA inference...")
    action = predict_action(
        processor,
        model,
        image,
        instruction,
        device
    )

    print("Action predicted:", action)

    print("Sending to robot...")
    robot.act(action)


if __name__ == "__main__":
    main()

Loading OpenVLA...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading image...
Running OpenVLA inference...
Action predicted: [-0.00020879 -0.00042412  0.00703386  0.00049971 -0.00747924 -0.00167851
  0.        ]
Sending to robot...
Executing action:
  Δx, Δy, Δz = [-0.00020879 -0.00042412  0.00703386]
  Δroll, Δpitch, Δyaw = [ 0.00049971 -0.00747924 -0.00167851]
  gripper = 0.0
