In [1]:
# VLM related
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
# OmniGibson related
from envs.base_env import BaseEnvironment
from omnigibson.utils.ui_utils import KeyboardRobotController
from utils.debug import setup_debug_keys

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Qwen2.5-VL model and its processor. Both are created from the same
# checkpoint id so the tokenizer/image processor can never drift away from the
# weights when the checkpoint is changed.
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)
# The notebook only runs inference, so switch the model to evaluation mode.
model.eval()

processor = AutoProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
def inference_with_vlm(rgb_tensor, proprio_tensor, model, processor, process_vision_info, prompt_template=None, max_new_tokens=128):
    """
    Run a vision-language model on one robot observation and return its reply.

    Args:
        rgb_tensor (torch.Tensor): Camera image indexed as (H, W, C). If it has
            four channels, only the first three are kept. The tensor may live
            on any device; it is copied to host memory before conversion.
            NOTE(review): PIL presumably needs a uint8 image here - confirm
            against the environment's camera output.
        proprio_tensor (torch.Tensor): Robot proprioception data. It is
            converted to a NumPy array and interpolated into the prompt text.
        model: VLM exposing ``.device`` and ``.generate(...)``.
        processor: Processor exposing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        process_vision_info: Callable that receives the chat messages and
            returns ``(image_inputs, video_inputs)``.
        prompt_template (str, optional): Template formatted with
            ``proprio=<numpy array>``. When None or empty, the built-in
            English prompt is used.
        max_new_tokens (int, optional): Upper bound on generated tokens.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        str: Decoded text of the first generated sequence, with the prompt
        tokens stripped.
    """
    # Drop the fourth (alpha) channel so PIL receives a plain RGB array.
    if rgb_tensor.shape[2] == 4:
        rgb_tensor = rgb_tensor[:, :, :3]

    # Tensor.numpy() only works on CPU tensors that do not require grad, so
    # detach and move to host memory first. Both calls are no-ops for tensors
    # that already satisfy this.
    rgb_img = Image.fromarray(rgb_tensor.detach().cpu().numpy())
    proprio_vector = proprio_tensor.detach().cpu().numpy()

    # Build the prompt text: the caller's template wins, otherwise the default.
    if prompt_template:
        prompt = prompt_template.format(proprio=proprio_vector)
    else:
        prompt = f"Based on the robot's proprioceptive data:{proprio_vector}, determine the next action."

    # Single-turn chat message holding the image followed by the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": rgb_img},
                {"type": "text", "text": prompt}
            ],
        }
    ]

    # Render the chat template to a string; tokenization happens in processor().
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Extract the image/video payloads from the messages.
    image_inputs, video_inputs = process_vision_info(messages)

    # Tokenize the text and preprocess the vision inputs into model tensors.
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )

    # Move every input tensor to the device the model reports.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate without tracking gradients.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; slice off the prompt tokens so
    # only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Only one conversation was submitted, so return its single reply.
    return output_text[0]

In [4]:
# Create the OmniGibson environment and take its first robot.
env = BaseEnvironment(configs="config/scene_config.yaml")
robot = env.robots[0]
# Keyboard teleoperation produces the actions; the debug hotkeys are
# registered on the same controller.
keyboard_controller = KeyboardRobotController(robot)
setup_debug_keys(keyboard_controller, robot, env)

# Teleoperate forever, querying the VLM on every simulation step. The loop has
# no natural end: stop it with a kernel interrupt, which is caught below so the
# cell finishes cleanly instead of dumping a traceback.
# NOTE(review): `terminated` / `truncated` are not acted upon - confirm whether
# the environment should be reset when an episode ends.
count = 0
try:
    while True:
        count += 1
        action = keyboard_controller.get_teleop_action()
        obs, reward, terminated, truncated, info = env.step(action)
        # Fetch the eye-camera RGB frame and the proprioception vector.
        rgb_tensor = obs[robot.name][f"{robot.name}:eyes:Camera:0"]["rgb"]
        proprio_tensor = obs[robot.name]["proprio"]
        # Log the observation shapes for debugging.
        print(rgb_tensor.shape)
        print(proprio_tensor.shape)
        # inference_with_vlm strips the alpha channel and converts the tensors
        # itself, so the raw observations are passed through unchanged.
        result = inference_with_vlm(
            rgb_tensor=rgb_tensor,
            proprio_tensor=proprio_tensor,
            model=model,
            processor=processor,
            process_vision_info=process_vision_info,
            prompt_template="我看到了机器人环境的图像，机器人的状态数据为:{proprio}，请分析这个场景并建议下一步操作。"
        )
        print(result)
except KeyboardInterrupt:
    print(f"Teleoperation loop interrupted after {count} steps.")


[INFO] [omnigibson.simulator] ----- Starting [2m[1m[37mOmni[0m[1m[91mGibson[0m. This will take 10-30 seconds... -----


正在创建环境，这可能需要一些时间...
Starting kit application with the following args:  ['/home/litiangong/miniconda3/envs/omnigibson/lib/python3.10/site-packages/isaacsim/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py', '/home/litiangong/miniconda3/envs/omnigibson/lib/python3.10/site-packages/isaacsim/apps/omnigibson_4_1_0.kit', '--/app/tokens/exe-path=/home/litiangong/miniconda3/envs/omnigibson/lib/python3.10/site-packages/omni', '--/persistent/app/viewport/displayOptions=3094', '--/rtx/materialDb/syncLoads=True', '--/rtx/hydra/materialSyncLoads=True', '--/omni.kit.plugin/syncUsdLoads=True', '--/app/renderer/resolution/width=1280', '--/app/renderer/resolution/height=720', '--/app/window/width=1440', '--/app/window/height=900', '--/renderer/multiGpu/enabled=False', '--/app/fastShutdown=True', '--ext-folder', '/home/litiangong/miniconda3/envs/omnigibson/lib/python3.10/site-packages/isaacsim/exts', '--ext-folder', '/home/litiangong/miniconda3/envs/omnigibson/lib/python3.10/site-packages/isaacsim/a

[INFO] [omni.kit.telemetry.impl.sentry_extension] sentry is disabled for external build
[INFO] [omni.kit.telemetry.impl.sentry_extension] sentry is disabled for external build


[0.858s] [ext: omni.kit.telemetry-0.5.0] startup
[0.881s] [ext: omni.kit.loop-isaac-1.2.0] startup
[0.883s] [ext: omni.kit.test-0.0.0] startup
[0.905s] [ext: omni.appwindow-1.1.8] startup
[0.908s] [ext: omni.kit.renderer.core-1.0.1] startup
[0.918s] [ext: omni.kit.renderer.capture-0.0.0] startup
[0.919s] [ext: omni.kit.renderer.imgui-1.0.1] startup
[0.991s] [ext: omni.ui-2.23.11] startup
[0.998s] [ext: omni.kit.mainwindow-1.0.3] startup
[0.998s] [ext: carb.audio-0.1.0] startup
[1.001s] [ext: omni.uiaudio-1.0.0] startup
[1.002s] [ext: omni.kit.uiapp-0.0.0] startup
[1.002s] [ext: omni.usd.schema.audio-0.0.0] startup
[1.064s] [ext: omni.usd.schema.physx-106.0.20] startup
[1.093s] [ext: omni.usd.schema.forcefield-106.0.20] startup
[1.098s] [ext: omni.usd.schema.anim-0.0.0] startup
[1.118s] [ext: omni.usd.schema.omniscripting-1.0.0] startup
[1.123s] [ext: omni.usd.schema.omnigraph-1.0.0] startup
[1.130s] [ext: omni.anim.graph.schema-106.0.2] startup
[1.134s] [ext: omni.anim.navigation.schem

[DEBUG] [AutoNode] Defining data type 'any' as 'Any'
[DEBUG] [AutoNode] Defining data type 'bool' as 'Bool' and array 'BoolArray
[DEBUG] [AutoNode] Defining data type 'bundle' as 'Bundle'
[DEBUG] [AutoNode] Defining data type 'colord[3]' as 'Color3d' and array 'Color3dArray
[DEBUG] [AutoNode] Defining data type 'colorf[3]' as 'Color3f' and array 'Color3fArray
[DEBUG] [AutoNode] Defining data type 'colorh[3]' as 'Color3h' and array 'Color3hArray
[DEBUG] [AutoNode] Defining data type 'colord[4]' as 'Color4d' and array 'Color4dArray
[DEBUG] [AutoNode] Defining data type 'colorf[4]' as 'Color4f' and array 'Color4fArray
[DEBUG] [AutoNode] Defining data type 'colorh[4]' as 'Color4h' and array 'Color4hArray
[DEBUG] [AutoNode] Defining data type 'double' as 'Double' and array 'DoubleArray
[DEBUG] [AutoNode] Defining data type 'double[2]' as 'Double2' and array 'Double2Array
[DEBUG] [AutoNode] Defining data type 'double[3]' as 'Double3' and array 'Double3Array
[DEBUG] [AutoNode] Defining data t

[1.503s] [ext: omni.kit.widget.text_editor-1.0.2] startup
[1.504s] [ext: omni.graph.image.core-0.3.2] startup
[1.510s] [ext: omni.kit.window.property-1.11.1] startup
[1.511s] [ext: omni.physx-106.0.20] startup
[1.524s] [ext: omni.kit.widget.toolbar-1.6.2] startup
[1.528s] [ext: omni.kit.property.usd-3.21.28] startup
[1.532s] [ext: omni.physx.stageupdate-106.0.20] startup
[1.533s] [ext: omni.physx.commands-106.0.20] startup
[1.535s] [ext: omni.kit.manipulator.tool.snap-1.4.5] startup
[1.537s] [ext: omni.graph.tools-1.78.0] startup
[1.554s] [ext: omni.physx.ui-106.0.20] startup
[1.576s] [ext: omni.graph-1.135.0] startup
[1.597s] [ext: omni.physx.demos-106.0.20] startup
[1.605s] [ext: omni.graph.image.nodes-1.0.2] startup
[1.606s] [ext: omni.graph.action_core-1.1.4] startup
[1.617s] [ext: omni.isaac.version-1.1.0] startup
[1.618s] [ext: omni.syntheticdata-0.6.7] startup
[1.630s] [ext: omni.physx.graph-106.0.20] startup
[1.645s] [ext: omni.isaac.nucleus-0.3.0] startup
[1.647s] [ext: omni.p



[1.943s] [ext: omni.kit.window.material_graph-1.8.15] startup
[2.000s] [ext: omni.kit.numpy.common-0.1.2] startup
[2.002s] [ext: omni.warp-1.2.1] startup
[2.004s] [ext: omni.sensors.tiled-0.0.4] startup
[2.011s] [ext: omni.physx.bundle-106.0.20] startup
[2.011s] [ext: omni.graph.scriptnode-1.19.1] startup
[2.012s] [ext: omni.isaac.dynamic_control-1.3.8] startup
[2.028s] [ext: omni.replicator.core-1.11.14] startup
Warp 1.2.1 initialized:
   CUDA Toolkit 11.8, Driver 12.6
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "NVIDIA GeForce RTX 4090" (24 GiB, sm_89, mempool enabled)
   Kernel cache:
     /home/litiangong/.cache/warp/1.2.1
[2.091s] [ext: omni.isaac.core-3.18.1] startup
[2.181s] [ext: omni.graph.visualization.nodes-2.1.1] startup
[2.192s] [ext: omni.isaac.core_nodes-1.16.1] startup
[2.208s] [ext: omni.isaac.cloner-0.8.1] startup
[2.213s] [ext: omni.isaac.ui-0.16.0] startup
[2.221s] [ext: omni.kit.graph.widget.variables-2.1.0] startup
[2.225s] [ext: omni.kit.graph.delega



[2.415s] [ext: omni.kit.property.bundle-1.2.11] startup
[2.417s] [ext: omni.isaac.sensor-12.7.1] startup
[2.457s] [ext: omni.kit.property.layer-1.1.6] startup
[2.460s] [ext: omni.kit.stage_column.variant-1.0.13] startup
[2.463s] [ext: omni.kit.stage_column.payload-2.0.0] startup
[2.465s] [ext: omni.isaac.scene_blox-0.1.2] startup
[2.467s] [ext: omni.isaac.quadruped-1.4.5] startup
[2.477s] [ext: omni.kit.viewport.menubar.camera-105.1.8] startup
[2.484s] [ext: omni.isaac.lula-3.0.1] startup
[2.488s] [ext: omni.isaac.surface_gripper-1.0.1] startup
[2.491s] [ext: omni.kit.viewport.menubar.settings-106.0.1] startup
[2.495s] [ext: omni.kit.viewport.menubar.render-106.1.3] startup
[2.497s] [ext: omni.kit.manipulator.camera-105.0.5] startup
[2.503s] [ext: omni.isaac.motion_generation-7.1.0] startup
[2.505s] [ext: omni.isaac.manipulators-2.1.0] startup
[2.507s] [ext: omni.kit.viewport.bundle-104.0.1] startup
[2.509s] [ext: omni.kit.viewport.menubar.lighting-106.0.2] startup
[2.510s] [ext: omni.



[4.924s] [ext: omni.kit.quicklayout-1.0.7] startup
[4.949s] [ext: omni.anim.navigation.core-106.0.2] startup
[4.957s] [ext: omni.anim.navigation.ui-106.0.2] startup
[5.009s] [ext: omni.anim.navigation.bundle-106.0.1] startup
[5.010s] [ext: omni.anim.skelJoint-106.0.1] startup
[5.019s] [ext: omni.anim.retarget.ui-106.0.1] startup
[5.048s] [ext: omni.anim.retarget.bundle-106.0.1] startup
[5.049s] [ext: omni.kit.scripting-106.0.1] startup
[5.052s] [ext: omni.anim.people-0.4.1] startup
[5.067s] [ext: omni.anim.curve.core-1.1.13] startup
[5.084s] [ext: omni.anim.timeline-105.0.23] startup




[5.131s] [ext: omni.kit.streamsdk.plugins-3.2.1] startup
[5.136s] [ext: omni.kit.renderer.cuda_interop-1.0.1] startup
[5.137s] [ext: omni.kit.livestream.core-3.2.0] startup
[5.142s] [ext: omni.kit.livestream.native-4.1.0] startup

Active user not found. Using default user [kiosk]

2025-04-27 10:10:05 [10,597ms] [Error] [carb.livestream.plugin] Stream Server: starting the server failed, 0x800B1002
2025-04-27 10:10:05 [10,597ms] [Error] [carb.livestream.plugin] Could not initialize streaming components
2025-04-27 10:10:05 [10,597ms] [Error] [carb.livestream.plugin] Couldn't initialize the capture device.


Now streaming on 169.235.18.46 via Omniverse Streaming Client
[10.843s] [ext: omni.physx.fabric-106.0.20] startup


[INFO] [omnigibson.simulator] ---------- Welcome to [2m[1m[37mOmni[0m[1m[91mGibson[0m! ----------




[1m[37m                   ___________[0m[2m[1m[37m[0m[1m[37m[0m[2m[1m[37m[0m[1m[91m[0m[1m[37m[0m[1m[91m_[0m
[1m[37m                  /          [0m[2m[1m[37m[0m[1m[37m[0m[2m[1m[37m[0m[1m[91m[0m[1m[37m[0m[1m[91m/ \[0m
[1m[37m                 /          [0m[2m[1m[37m[0m[1m[37m[0m[2m[1m[37m[0m[1m[91m/ /[0m[1m[37m__[0m[1m[91m[0m
[1m[37m                /          [0m[2m[1m[37m[0m[1m[37m[0m[2m[1m[37m[0m[1m[91m[0m[1m[37m[0m[1m[91m/ /  /\[0m
[1m[37m               /[0m[2m[1m[37m__________[0m[1m[37m[0m[2m[1m[37m[0m[1m[91m/ /[0m[1m[37m__[0m[1m[91m/  \[0m
[1m[37m               [0m[2m[1m[37m\   _____  [0m[1m[37m[0m[2m[1m[37m[0m[1m[91m\ \[0m[1m[37m__[0m[1m[91m\  /[0m
[1m[37m                [0m[2m[1m[37m\  \  [0m[1m[37m/ [0m[2m[1m[37m\  [0m[1m[91m[0m[1m[37m[0m[1m[91m\ \_/ /[0m
[1m[37m                 [0m[2m[1m[37m\  \[0m[1m[37m/[0

[INFO] [omnigibson.simulator] Imported scene 0.


期望的物体总数: 7
分配给各类别的数量: [1, 1, 1, 1, 1, 1, 1]
桌子包围盒中心: tensor([ 1.4889, -0.5252,  0.3084])
桌子包围盒尺寸: tensor([1.1380, 0.7617, 0.6504])
桌子高度: 0.6335694789886475
桌子朝向: tensor([0.0000, 0.0000, 0.7071, 0.7071])
total_grid_cells: 28
桌面尺寸: 1.0779612064361572 x 0.7016814351081848
网格大小: 0.15 x 0.15
网格数量: 7 x 4 = 28
占用率: 0.8
可用位置数量: 22
生成了 22 个位置点
根据网格和占用率生成的可用位置数量: 22
可用位置(22)多于期望物体总数(7)，可以补充更多物体
最终的物体分配数量: [4, 3, 3, 3, 3, 3, 3]
最终的物体总数: 22
已生成 22 个物品配置，使用位置数量: 22
成功添加了 22 个动态物体
等待场景稳定...
场景稳定完成
已设置机器人初始关节位置
已将当前关节位置设为默认重置位置
环境创建完成！
[K[Ftorch.Size([480, 640, 4])
torch.Size([66])
从你提供的机器人状态数据来看，机器人似乎处于一个混乱的环境中，桌面上散落着各种物品和工具。以下是对当前场景的分析以及可能的下一步操作：

### 场景分析：
1. **物品分布**：桌面上有多个碗、盘子、杯子、刀具、勺子等餐具，还有一些可能是厨房用具的物品。
2. **工具位置**：有几个工具（如铲子、夹子）放在桌面上，但它们的位置并不明确。
3. **机器人状态**：机器人似乎在尝试处理这些物品，但由于其状态数据中包含大量负
[K[Ftorch.Size([480, 640, 4])
torch.Size([66])
从你提供的状态数据来看，机器人似乎在一个厨房环境中，桌面上有各种物品，包括锅、碗、勺子、刀具等。根据这些信息，我们可以推断出以下几点：

1. **物体位置和状态**：
   - 桌面上有多个锅和碗。
   - 有一些勺子和刀具散落在桌子上。
   - 有一个杯子在桌子的一侧。

2. **动作历史**：
   - 

: 