In [28]:
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
import litellm

response = litellm.completion(
    model="gpt-4o-mini-2024-07-18",
    messages=[{"role": "user", "content": "hi"}],
    num_retries=10,
)
response.choices[0].message.content

In [34]:
import base64
from openai import OpenAI
import os

import os
base_url = os.environ.get("OPENAI_BASE_URL", "https://openrouter.ai/api/v1")
api_key = os.environ.get("OPENAI_API_KEY")

client = OpenAI(
    api_key=api_key, base_url=base_url
)

# Path to your image
image_path = "../logs/puzzle_assembly_quick_test_20251007_121114/images/step_0.png"

# Function to encode the image
def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')

# Getting the base64 string
base64_image = encode_image(image_path)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": '''3D CUBE STACKING PUZZLE

You have 7 3D puzzle pieces and one container.

TASK:
Assemble all pieces into the container to form a solid 3×3×3 cube (27 unit cubes total).

GOAL:
- Fit every piece completely inside the container.
- No gaps, overlaps, or floating pieces.
- The final structure must be stable and form a perfect cube.

ACTION RULE:
- You can move or rotate one piece at a time.
- Continue placing pieces until the cube is fully assembled.


OBJECT MAPPING (object_id → properties):
============================================================
object_id=8, RGB=(201, 88, 0)
object_id=7, RGB=(23, 115, 206)
object_id=6, RGB=(0, 87, 43)
object_id=5, RGB=(222, 197, 12)
object_id=4, RGB=(0, 42, 113)
object_id=3, RGB=(169, 27, 0)
object_id=2, RGB=(153, 48, 97)
============================================================
Total movable objects: 7

Now, what's your next action?'''
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            ],
        }
    ],
    model="gpt-4o",
    tools=[{'type': 'function', 'function': {'name': 'move_object', 'description': 'Move an object to a specific 3D position in the workspace. The object will be teleported to the target position instantly. CONSTRAINTS: Position coordinates should be within the workspace bounds (typically -2.0 to 2.0 meters for x and y, 0.0 to 2.0 meters for z). Objects moved above ground (z > 0) will fall under gravity.', 'parameters': {'type': 'object', 'properties': {'object_id': {'type': 'integer', 'description': 'Unique object_id (integer) of the object to move. Get this from the observation state.'}, 'position': {'type': 'array', 'items': {'type': 'number'}, 'minItems': 3, 'maxItems': 3, 'description': 'Target position [x, y, z] in meters. Example: [0.5, 0.2, 0.1]. Ensure z >= 0 to keep objects above ground.'}}, 'required': ['object_id', 'position']}}}, {'type': 'function', 'function': {'name': 'rotate_object', 'description': "Rotate an object around a specified axis by a given angle. The rotation is applied relative to the object's current orientation. CONSTRAINTS: Angle should typically be between -180 and 180 degrees. Common angles: 90° for quarter turn, 180° for half turn.", 'parameters': {'type': 'object', 'properties': {'object_id': {'type': 'integer', 'description': 'Unique object_id (integer) of the object to rotate. Get this from the observation state.'}, 'axis': {'type': 'string', 'enum': ['x', 'y', 'z'], 'description': "Rotation axis: 'x' for pitch, 'y' for yaw, 'z' for roll"}, 'angle': {'type': 'number', 'description': 'Rotation angle in degrees (positive = counter-clockwise when viewing along the axis). Typical range: -180 to 180.'}}, 'required': ['object_id', 'axis', 'angle']}}}, {'type': 'function', 'function': {'name': 'observe', 'description': 'Change camera viewpoint to observe the scene from a different angle. The camera rotates around the center of the scene at a fixed distance. Use this to inspect objects from different perspectives. CONSTRAINTS: Angle wraps around at 360 degrees (0° = front view, 90° = right side, 180° = back, 270° = left side).', 'parameters': {'type': 'object', 'properties': {'angle': {'type': 'number', 'default': 0.0, 'description': 'Camera rotation angle in degrees around the scene center. 0° = front, 90° = right, 180° = back, 270° = left. Range: 0-360 (wraps around).'}}, 'required': ['angle']}}}, {'type': 'function', 'function': {'name': 'finish', 'description': 'Signal that you have completed the task successfully. ONLY call this when you believe all task objectives have been fully achieved. The system will evaluate your result after this call.', 'parameters': {'type': 'object', 'properties': {}, 'required': []}}}, {'type': 'function', 'function': {'name': 'place_into_container', 'description': "Place a puzzle piece into the container at a specified offset position relative to the container's center. This is a convenient alternative to move_object when placing pieces into the container. The piece will be positioned relative to the container's coordinate system. CONSTRAINTS: Offsets must be small to keep pieces inside the container. Use offset_z to stack pieces vertically.", 'parameters': {'type': 'object', 'properties': {'object_id': {'type': 'integer', 'description': "Unique object_id (integer) of the puzzle piece to place. Get this from the observation state. Do NOT use the container's object_id."}, 'offset_x': {'type': 'number', 'description': 'X offset from container center in meters. Positive = right, negative = left. RANGE: -0.1 to 0.1 (must keep piece inside container).', 'default': 0.0}, 'offset_y': {'type': 'number', 'description': 'Y offset from container center in meters. Positive = forward, negative = backward. RANGE: -0.1 to 0.1 (must keep piece inside container).', 'default': 0.0}, 'offset_z': {'type': 'number', 'description': 'Z offset from container bottom in meters. Use this for vertical stacking. RANGE: 0.0 to 0.15. Start from 0.0 for bottom layer, increase for higher layers.', 'default': 0.0}}, 'required': ['object_id']}}}],
    tool_choice='auto',
)
print(chat_completion.choices[0].message)

ChatCompletionMessage(content="To start solving the puzzle, I'll first need to move all the pieces closer to the container to prepare for assembling them into a cube.\n\nLet's move each piece to a position adjacent to the container, starting with object_id=8. Once they're close, we'll determine the rotations and arrangements needed for them to fit together perfectly.\n\nI need to observe the container closely, let's rotate the camera to get a side view, which might help with positioning and maneuvering pieces effectively.", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='call_nzXrIJVT64qpB5OpNlnQjgMg', function=Function(arguments='{"angle":90}', name='observe'), type='function', index=0)], reasoning=None)


In [5]:
import pybullet as p
import pybullet_data
import numpy as np
from PIL import Image
import os
import glob

# Connect to pybullet in DIRECT mode (no GUI)
physicsClient = p.connect(p.DIRECT)
p.setAdditionalSearchPath(pybullet_data.getDataPath())

# Load plane
planeId = p.loadURDF("plane.urdf")

# Find all urdf files under the specified directory
urdf_root = r"C:\Users\smj81\Desktop\VisualReasonBench\src\phyvpuzzle\environment\phobos_models\3x3-stacking-puzzle"
urdf_files = []
for root, dirs, files in os.walk(urdf_root):
    for file in files:
        if file.endswith(".urdf"):
            urdf_files.append(os.path.join(root, file))

# Optionally sort for consistent order
urdf_files.sort()

# Load all urdf files, spread them out in a grid for visibility
loaded_ids = []
num_per_row = int(np.ceil(np.sqrt(len(urdf_files))))
spacing = 0.3
for idx, urdf_path in enumerate(urdf_files):
    row = idx // num_per_row
    col = idx % num_per_row
    x = (col - num_per_row / 2) * spacing
    y = (row - num_per_row / 2) * spacing
    z = 0.5
    try:
        obj_id = p.loadURDF(urdf_path, [x, y, z])
        loaded_ids.append(obj_id)
    except Exception as e:
        print(f"Failed to load {urdf_path}: {e}")

# 获取每个object的visual shape颜色
obj_color_dict = {}
for obj_id in loaded_ids:
    visual_shapes = p.getVisualShapeData(obj_id)
    # visual_shapes: list of tuples, each tuple: (objectUniqueId, linkIndex, visualGeometryType, dimensions, filename, meshScale, rgbaColor, ...)
    # 可能有多个link, 这里只取第一个link的颜色
    if visual_shapes:
        # 取所有link的颜色
        link_colors = {}
        for vs in visual_shapes:
            link_index = vs[1]
            color = vs[7]  # rgbaColor
            link_colors[link_index] = color
        obj_color_dict[obj_id] = link_colors
    else:
        obj_color_dict[obj_id] = {}

print(obj_color_dict)

# Set camera parameters
width, height = 512, 512
view_matrix = p.computeViewMatrix(
    cameraEyePosition=[1, 1, 1],
    cameraTargetPosition=[0, 0, 0.5],
    cameraUpVector=[0, 0, 1]
)
fov = 60
aspect = width / height
near = 0.1
far = 3.1
projection_matrix = p.computeProjectionMatrixFOV(
    fov=fov,
    aspect=aspect,
    nearVal=near,
    farVal=far
)

# Step simulation a few times to settle
for _ in range(10):
    p.stepSimulation()

# Render the image
img_arr = p.getCameraImage(
    width=width,
    height=height,
    viewMatrix=view_matrix,
    projectionMatrix=projection_matrix,
    renderer=p.ER_BULLET_HARDWARE_OPENGL
)

rgb_array = np.reshape(img_arr[2], (height, width, 4))[:, :, :3]
img = Image.fromarray(rgb_array.astype(np.uint8))

# Show the image inline if in notebook, or save to file
img.show()  # If running in a notebook, this will display the image

# Disconnect
p.disconnect()


{1: {-1: (0.7882400155067444, 0.3450999855995178, 0.0, 1.0)}, 2: {-1: (0.09019999951124191, 0.45489999651908875, 0.8117600083351135, 1.0)}, 3: {-1: (0.0, 0.3411799967288971, 0.16863000392913818, 1.0)}, 4: {-1: (0.8705899715423584, 0.7725499868392944, 0.05098000168800354, 1.0)}, 5: {-1: (0.0, 0.16471000015735626, 0.4431400001049042, 1.0)}, 6: {-1: (0.6627500057220459, 0.10980000346899033, 0.0, 1.0)}, 7: {-1: (0.6000000238418579, 0.18824000656604767, 0.3843100070953369, 1.0)}, 8: {-1: (0.2117599993944168, 0.2117599993944168, 0.2117599993944168, 1.0)}}


In [24]:
img.show()

In [20]:
import numpy as np
seg = img_arr[4]
crop = seg[290:340, 230:290]
id_counts = np.bincount(crop[crop >= 0].flatten())
if len(id_counts) > 0:
    target_id = np.argmax(id_counts)
    print("AI 看到的方块对应物理ID:", target_id)


AI 看到的方块对应物理ID: 0


In [23]:
img2 = img.crop((190, 270, 270, 310))
img2.show()

In [25]:
crop

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
print(np.unique(img_arr[4]))

[-1  0  1  2  3  4  5  6  8]


In [6]:
visual_shapes

((8,
  -1,
  5,
  (0.005, 0.005, 0.005),
  b'C:\\Users\\smj81\\Desktop\\VisualReasonBench\\src\\phyvpuzzle\\environment\\phobos_models\\3x3-stacking-puzzle\\obj_8\\urdf/../meshes/obj/bt_cube3x3_box.obj',
  (0.29408, 0.66932, 0.07707),
  (0.0, 0.0, 0.0, 1.0),
  (0.2117599993944168, 0.2117599993944168, 0.2117599993944168, 1.0)),)