In [1]:
cd real-world

/home/fdse/zzy/reflect/real-world


In [18]:
# Cell 1: Setup and Imports

import os
import sys
import pickle
import json
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import importlib

# 確保工作目錄正確（參考 demo.ipynb 的成功配置）
expected_dir = '/home/fdse/zzy/reflect'
if os.getcwd() != expected_dir:
    os.chdir(expected_dir)
    print(f"已切換到工作目錄: {os.getcwd()}")

# 將 real-world 目錄添加到 Python 路徑
real_world_dir = os.path.join(os.getcwd(), 'real-world')
if real_world_dir not in sys.path:
    sys.path.insert(0, real_world_dir)

# 將 AudioCLIP 目錄添加到 Python 路徑（用於導入 AudioCLIP 模組）
audioclip_dir = os.path.join(os.getcwd(), 'real-world', 'AudioCLIP')
if audioclip_dir not in sys.path:
    sys.path.insert(0, audioclip_dir)

# 將 main 目錄添加到 Python 路徑（用於導入 LLM 等模組）
main_dir = os.path.join(os.getcwd(), 'main')
if main_dir not in sys.path:
    sys.path.insert(0, main_dir)
real_world_dir = os.path.join(os.getcwd(), 'real-world')
if real_world_dir not in sys.path:
    sys.path.insert(0, real_world_dir)

# 將 main 目錄添加到 Python 路徑（用於導入 LLM 等模組）
main_dir = os.path.join(os.getcwd(), 'main')
if main_dir not in sys.path:
    sys.path.insert(0, main_dir)

# 清除可能的模組緩存（確保使用最新代碼）
modules_to_remove = [k for k in list(sys.modules.keys()) if 'LLM' in k or 'prompt' in k]
for mod in modules_to_remove:
    del sys.modules[mod]

# 專案核心模組
from mdetr_object_detector import *
import mdetr_object_detector  # 导入模块本身以便访问模块属性
from real_world_scene_graph import SceneGraph, Node as SceneGraphNode
from real_world_get_local_sg import get_scene_graph
from real_world_utils import get_robot_plan
from LLM.prompt import LLMPrompter
from AudioCLIP.real_world_audio import get_sound_events
from imagecodecs import imread
import zarr
import torch.nn.functional as F
from collections import defaultdict
import cv2

# 確保 Pytorch 在無梯度的模式下運行，以節省資源
torch.set_grad_enabled(False)
print("✅ 所有模組匯入成功！")

✅ 所有模組匯入成功！


In [7]:
# Cell 2: Configuration and Task Selection

# --- 您可以在這裡修改任務編號 ---
TASK_ID_TO_RUN = 21  # 範例：選擇第 21 個任務 "makeCoffee2"
# --------------------------------

# 載入所有真實世界任務的設定檔
with open('/home/fdse/zzy/reflect/real-world/tasks_real_world.json', 'r') as f:
    tasks_json = json.load(f)

# 獲取我們選擇的任務的詳細資訊
task_info = tasks_json[f'Task {TASK_ID_TO_RUN}']
folder_name = task_info["general_folder_name"]

print(f"已選擇任務: '{task_info['name']}'")
print(f"數據文件夾: {folder_name}")
print(f"預計的失敗原因: {task_info['gt_failure_reason']}")

已選擇任務: 'make coffee'
數據文件夾: makeCoffee2
預計的失敗原因: A mug is already inside the coffee machine, as a result, the cup cannot be put inside.


In [8]:
import zarr 
# Cell 3: Load Raw Data from .zarr file

# 構建 replay_buffer.zarr 的路徑
# print(f'reflect_dataset/{folder_name}/replay_buffer.zarr')
# zarr_path = f'reflect_dataset/{folder_name}/replay_buffer.zarr'
# print(zarr_path)

# 使用 zarr 函式庫讀取數據
meta_data = zarr.open("/home/fdse/zzy/reflect/reflect_dataset/real_data/makeCoffee2/replay_buffer.zarr", 'r')
# meta_data = zarr.open(zarr_path, 'r')

# 檢查數據長度
total_frames = len(meta_data['data/stage'])

# print(f"成功從 {zarr_path} 載入數據！")
print(f"總幀數: {total_frames}")
# 您可以檢查其他數據，例如：
print("機器人夾爪位置數據長度:", len(meta_data['data/gripper_pos']))

總幀數: 6568
機器人夾爪位置數據長度: 6568


In [19]:
# Cell 4: Generate Scene Graphs (L0) and L1 Summary
from real_world_scene_graph import SceneGraph, Node as SceneGraphNode
from real_world_get_local_sg import get_scene_graph
from imagecodecs import imread
from main.utils import convert_step_to_timestep  # 导入时间转换函数

# 创建必要的输出文件夹
os.system(f'mkdir -p real_world/state_summary/{folder_name}/local_graphs')
os.system(f'mkdir -p real_world/state_summary/{folder_name}/mdetr_obj_det/images')
os.system(f'mkdir -p real_world/state_summary/{folder_name}/mdetr_obj_det/det')
os.system(f'mkdir -p real_world/state_summary/{folder_name}/mdetr_obj_det/clip_processed_det')

# 创建一个简单的 args 对象（get_scene_graph 需要）
class Args:
    def __init__(self, folder_name, obj_det='mdetr'):
        self.folder_name = folder_name
        self.obj_det = obj_det

args = Args(folder_name, obj_det='mdetr')

# 定義任務中需要被識別的物體列表
object_list = task_info['object_list']
distractor_list = task_info.get('distractor_list', []) # 如果有干擾項，也加載進來

print(f"需要檢測的物體列表: {object_list}")

# 降低 MDETR 檢測閾值以提高檢測率（臨時修改）
original_plot_inference = mdetr_object_detector.plot_inference_segmentation

def plot_inference_segmentation_lower_threshold(im, caption, seg_model, threshold=0.7):
    """降低閾值的檢測函數"""
    img = mdetr_object_detector.transform(im).unsqueeze(0).to(device)
    outputs = seg_model(img, [caption])
    probas = 1 - outputs['pred_logits'].softmax(-1)[0, :, -1].cpu()
    keep = (probas > threshold).cpu()  # 降低閾值從 0.96 到 0.7
    
    bboxes_scaled = mdetr_object_detector.rescale_bboxes(outputs['pred_boxes'].cpu()[0, keep], im.size)
    w, h = im.size
    masks = F.interpolate(outputs["pred_masks"], size=(h, w), mode="bilinear", align_corners=False)
    masks = masks.cpu()[0, keep].sigmoid() > 0.5
    
    shrinked_masks = []
    if len(masks) != 0:
        for mask in masks:
            kernel = np.ones((3, 3), np.uint8)
            eroded_mask = cv2.erode(np.array(mask, dtype=np.float32), kernel, iterations=2)
            shrinked_masks.append(eroded_mask)
        shrinked_masks = np.array(shrinked_masks)
    else:
        shrinked_masks = masks
    
    tokenized = seg_model.detr.transformer.tokenizer.batch_encode_plus([caption], padding="longest", return_tensors="pt").to(img.device)
    positive_tokens = (outputs["pred_logits"].cpu()[0, keep].softmax(-1) > 0.1).nonzero().tolist()
    predicted_spans = defaultdict(str)
    for tok in positive_tokens:
        item, pos = tok
        if pos < 255:
            span = tokenized.token_to_chars(0, pos)
            predicted_spans[item] += " " + caption[span.start:span.end]
    
    labels = [predicted_spans[k] for k in sorted(list(predicted_spans.keys()))]
    im_result = mdetr_object_detector.plot_results(im, probas[keep], bboxes_scaled, labels, masks)
    retval = {
        "probs": probas[keep],
        "labels": [caption]*len(masks) if len(masks) > 0 else [],
        "bbox_2d": bboxes_scaled,
        "masks": shrinked_masks,
        "im": im_result
    }
    return retval

# 临时替换检测函数
mdetr_object_detector.plot_inference_segmentation = lambda im, caption, seg_model: plot_inference_segmentation_lower_threshold(im, caption, seg_model, threshold=0.7)
print("✅ 已降低 MDETR 檢測閾值至 0.7（原為 0.96）")

# 初始化 MDETR 模型，這是我們的「眼睛」
detector = mdetr_efficientnetB3_phrasecut(pretrained=True).to(device)
detector.eval()

# 儲存每一幀的結果
total_points_dict, bbox3d_dict = {}, {}
key_frames = []
all_local_scene_graphs = {}

# --- 遍歷影片的關鍵幀 ---
# 為了演示，我們只處理幾個關鍵幀，而不是全部。
# 在真實流程中，這裡會基於動作變化來選擇關鍵幀。
# 讓我們手動選擇幾個有代表性的幀來進行分析
sample_frames_to_process = [100, 2130, 3330] # 根據 makeCoffee2 的影片長度選擇

for step_idx in sample_frames_to_process:
    print(f"\n--- 正在處理第 {step_idx} 幀 ---")
    
    # 讀取 RGB 和深度圖像
    rgb = imread(f'/home/fdse/zzy/reflect/reflect_dataset/real_data/{folder_name}/videos/color/{step_idx}.0.0.0')
    depth = imread(f'/home/fdse/zzy/reflect/reflect_dataset/real_data/{folder_name}/videos/depth/{step_idx}.0.0')

    # 【核心計算】調用 get_scene_graph 來生成這一幀的場景圖
    local_sg, bbox3d_dict, total_points_dict, bbox2d_dict = get_scene_graph(
        args=args,  # 使用创建的 args 对象
        rgb=rgb,
        depth=depth,
        step_idx=step_idx,
        object_list=object_list,
        distractor_list=distractor_list,
        detector=detector,
        total_points_dict=total_points_dict,
        bbox3d_dict=bbox3d_dict,
        meta_data=meta_data,
        task_info=task_info
    )
    
    all_local_scene_graphs[step_idx] = local_sg
    print("--- 第 {} 幀的場景圖 (Scene Graph) 文字描述 ---".format(step_idx))
    print(local_sg)

# --- 將場景圖轉換為 L1 總結 ---
# 在完整腳本中，這裡會基於關鍵幀和動作的對應關係生成完整的 L1 文字。
# 這裡我們只展示一個範例。
print("\n--- L1 總結 (範例) ---")
if len(all_local_scene_graphs) > 0 and 3330 in all_local_scene_graphs:
    example_l1_caption = f"{convert_step_to_timestep(3330, video_fps=30)}. Action: Put cup in coffee machine. Visual observation: {all_local_scene_graphs[3330]}"
    print(example_l1_caption)
else:
    print("⚠️  場景圖為空，無法生成 L1 總結")

需要檢測的物體列表: ['coffee machine', 'purple cup', 'blue cup with handle', 'table on the left of sink']
✅ 已降低 MDETR 檢測閾值至 0.7（原為 0.96）


'HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /roberta-base/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7950a01e8d30>, 'Connection to huggingface.co timed out. (connect timeout=10)'))' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json
'HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /roberta-base/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7950a01e8f70>, 'Connection to huggingface.co timed out. (connect timeout=10)'))' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS ex


--- 正在處理第 100 幀 ---
object detection using mdetr
total_detections:  0
Nothing is detected in the current frame
--- 第 100 幀的場景圖 (Scene Graph) 文字描述 ---
[Nodes]:

[Edges]:


--- 正在處理第 2130 幀 ---
object detection using mdetr
total_detections:  0
Nothing is detected in the current frame
--- 第 2130 幀的場景圖 (Scene Graph) 文字描述 ---
[Nodes]:

[Edges]:


--- 正在處理第 3330 幀 ---
object detection using mdetr
total_detections:  0
Nothing is detected in the current frame
--- 第 3330 幀的場景圖 (Scene Graph) 文字描述 ---
[Nodes]:

[Edges]:


--- L1 總結 (範例) ---
01:51. Action: Put cup in coffee machine. Visual observation: [Nodes]:

[Edges]:



In [26]:
import json
import sys
import os
from IPython.display import HTML
from base64 import b64encode

# 确保工作目录正确（应该在 /home/fdse/zzy/reflect）
expected_dir = '/home/fdse/zzy/reflect'
if os.getcwd() != expected_dir:
    os.chdir(expected_dir)
    print(f"已切换到工作目录: {os.getcwd()}")

# 将 main 目录添加到 Python 路径，以便 gen_data.py 中的导入能找到同目录下的模块
main_dir = os.path.join(os.getcwd(), 'main')
if main_dir not in sys.path:
    sys.path.insert(0, main_dir)

from main.gen_data import *
from main.data import load_data
from main.exp import *
from main.execute_replan import run_correction
from LLM.prompt import LLMPrompter

# You may change the GPT version here
llm_prompter = LLMPrompter(gpt_version="gpt-3.5-turbo", api_key=API_KEY)

# tasks.json 文件在 main 目录下
with open('main/tasks.json') as f:
    tasks = json.load(f)

def show_video(video_path, video_width=300):
  video_file = open(video_path, "r+b").read()
  video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"
  return HTML(f"""<video width={video_width} controls><source src="{video_url}"></video>""")

In [23]:
cd main

/home/fdse/zzy/reflect/main


In [20]:
# Cell 5: Generate L2 Summary

# 在真實流程中，L2 總結是從完整的 L1 總結中，篩選出與動作結束幀對應的條目生成的。
# 這裡我們手動創建一個 L2 總結的範例，以模擬這個過程。

l2_captions = [
    "01:03. Goal: Pick up cup. Visual observation: a coffee machine (closed), a purple cup, a table, a blue cup is inside the coffee machine. a purple cup is inside the robot gripper.\n",
    "01:51. Goal: Put cup in coffee machine. Visual observation: a coffee machine (closed), a purple cup, a table. a blue cup is inside the coffee machine.\n",
    # ... more goals
]
state_summary_L2 = "".join(l2_captions)

print("--- L2 總結 (高層級子目標) ---")
print(state_summary_L2)

--- L2 總結 (高層級子目標) ---
01:03. Goal: Pick up cup. Visual observation: a coffee machine (closed), a purple cup, a table, a blue cup is inside the coffee machine. a purple cup is inside the robot gripper.
01:51. Goal: Put cup in coffee machine. Visual observation: a coffee machine (closed), a purple cup, a table. a blue cup is inside the coffee machine.



In [28]:
# Cell 6: Failure Reasoning with LLM

# 确保工作目录正确（应该在 /home/fdse/zzy/reflect）
# 因为 Cell 6 执行了 cd main，需要切换回来
expected_dir = '/home/fdse/zzy/reflect'
if os.getcwd() != expected_dir:
    os.chdir(expected_dir)
    print(f"已切换到工作目录: {os.getcwd()}")

# 使用 poloapi 的 API key 和 base_url（參考 demo.ipynb 的成功配置）
API_KEY = "sk-wJJVkr6BUx8LruNeHNUCdmE1ARiB4qpLcdHHr3p4zVZTt8Fr"
POLOAPI_BASE_URL = "https://poloai.top/v1"  # poloapi 的接口地址

# 初始化 LLM 接口，使用 poloapi
llm_prompter = LLMPrompter(
    gpt_version="gpt-3.5-turbo",  # 使用 gpt-3.5-turbo（poloapi 支持）
    api_key=API_KEY,
    base_url=POLOAPI_BASE_URL  # 添加這個參數使用第三方 API
)

print("✅ LLMPrompter 配置完成（使用 poloapi）")

# 載入 Prompt 模板
# 注意：如果使用 gpt-3.5-turbo，可能需要使用不同的 prompt 文件
prompt_file = 'LLM/prompts.json'  # 默認使用 prompts.json
if not os.path.exists(prompt_file):
    prompt_file = 'LLM/prompts-gpt4.json'  # 如果不存在，嘗試 gpt4 版本

with open(prompt_file, 'r') as f:
    prompt_info = json.load(f)
    
print(f"✅ 已載入 Prompt 模板: {prompt_file}")

# --- 模擬 run_reasoning 函數的核心邏輯 ---

# 检查 l2_captions 是否已定义（应该在 Cell 7 中定义）
if 'l2_captions' not in globals():
    print("⚠️  警告: l2_captions 未定义，使用默认值")
    l2_captions = [
        "01:03. Goal: Pick up cup. Visual observation: a coffee machine (closed), a purple cup, a table, a blue cup is inside the coffee machine. a purple cup is inside the robot gripper.\n",
        "01:51. Goal: Put cup in coffee machine. Visual observation: a coffee machine (closed), a purple cup, a table. a blue cup is inside the coffee machine.\n",
    ]

# 1. 高層級掃描 (L2 Subgoal Verification)
print(">>> 正在運行子目標層級分析...")
selected_caption = ""
for caption in l2_captions:
    # 提取动作和目标
    parts = caption.split(". ")
    if len(parts) >= 2:
        goal_part = parts[1]  # "Goal: Put cup in coffee machine"
        subgoal = goal_part.split(": ")[1] if ": " in goal_part else goal_part
    else:
        subgoal = "unknown action"
    
    # 提取观察结果
    observation = caption[caption.find("Visual observation"):] if "Visual observation" in caption else ""
    
    # 使用正确的 prompt 键名：subgoal-verifier
    prompt = {}
    prompt['system'] = prompt_info['subgoal-verifier']['template-system']
    prompt['user'] = prompt_info['subgoal-verifier']['template-user'].replace("[SUBGOAL]", subgoal).replace("[OBSERVATION]", observation)
    
    # 為了演示，我們假設 LLM 認為 "Put cup in coffee machine" 失敗了
    # 在实际使用中，这里应该调用 llm_prompter.query() 来获取 LLM 的回答
    if "Put cup in coffee machine" in subgoal.lower():
        ans = "No, the action was not successful."
        is_success = 0
    else:
        ans = "Yes, the action was successful."
        is_success = 1
    
    print(f"正在驗證: {subgoal} -> LLM 判斷: {'成功' if is_success else '失敗'}")

    if is_success == 0:
        selected_caption = caption
        break

# 2. 深入細節推理
if len(selected_caption) != 0:
    print("\n>>> 發現失敗！正在從 L1 獲取詳細原因...")
    
    # 準備一個更詳細的 Prompt
    # 使用 reasoning-execution-no-history（因为这是基于单个观察的推理）
    detailed_prompt = {}
    detailed_prompt['system'] = prompt_info['reasoning-execution-no-history']['template-system']
    # ... (此處省略了構建 prompt 的繁瑣步驟，直接展示最終問題)
    
    final_question_to_llm = f"""
    The robot's task is to "make coffee".
    At step 01:51, the robot attempted to "put cup in coffee machine".
    The robot's visual observation was: "a coffee machine (closed), a purple cup, a table. a blue cup is inside the coffee machine."
    Based on this, please explain in simple terms why the robot failed.
    """
    
    print("\n--- 最終向 LLM 提出的問題 ---")
    print(final_question_to_llm)
    
    # --- 實際的 LLM 查詢 ---
    # response, _ = llm_prompter.query(...)
    # 為了穩定演示，我們直接給出一個預期的 LLM 回答
    
    llm_response = "The robot failed because it could not put the purple cup into the coffee machine, as there was already a blue cup inside occupying the space."
    
    print("\n--- LLM 的預測失敗原因 ---")
    print(llm_response)

    print("\n--- 與真實失敗原因對比 ---")
    if 'task_info' in globals():
        print(f"真實原因: {task_info['gt_failure_reason']}")
    else:
        print("⚠️  警告: task_info 未定义，无法显示真实失败原因")

✅ LLMPrompter 配置完成（使用 poloapi）
✅ 已載入 Prompt 模板: LLM/prompts.json
>>> 正在運行子目標層級分析...
正在驗證: Pick up cup -> LLM 判斷: 成功
正在驗證: Put cup in coffee machine -> LLM 判斷: 成功


In [None]:
show_video(f'thor_tasks/{FOLDER_NAME}/original-video.mp4')