In [132]:
import gymnasium as gym
from minigrid.wrappers import SymbolicObsWrapper

# The environment name implies a size of 11x11
env = gym.make("MiniGrid-BlockedUnlockPickup-v0")
# Reset the environment to get the initial observation
obs, _ = env.reset()
print(env)

print(obs['mission'])
print(obs['direction'])
original_image = obs['image']
print(f"The origin image shape is (Number of Objects, 3): {obs['image'].shape}")
print("\nHere are the Object Type [-, -, 0]:")
print(obs['image'][:,:,0])

print("\nHere are the color  [-, -, 1]:")
print(obs['image'][:,:,1])
print("\nHere are the State  [-, -, 2]:")
print(obs['image'][:,:,2])

# Apply the SymbolicObsWrapper
env_symbolic = SymbolicObsWrapper(env)
symobs, _ = env_symbolic.reset()

# The 'image' key now holds the symbolic list
symbolic_list = symobs['image']

print("\nHere are the objects [-, -, ObjectID]:")
print(symbolic_list[:,:,2])

<OrderEnforcing<PassiveEnvChecker<BlockedUnlockPickupEnv<MiniGrid-BlockedUnlockPickup-v0>>>>
pick up the blue box
2
The origin image shape is (Number of Objects, 3): (7, 7, 3)

Here are the Object Type [-, -, 0]:
[[0 0 2 2 2 2 2]
 [0 0 2 1 1 5 1]
 [0 0 2 1 1 1 6]
 [0 0 2 1 1 1 1]
 [0 0 2 1 1 1 1]
 [0 0 2 2 2 2 2]
 [0 0 0 0 0 0 0]]

Here are the color  [-, -, 1]:
[[0 0 5 5 5 5 5]
 [0 0 5 0 0 4 0]
 [0 0 5 0 0 0 2]
 [0 0 5 0 0 0 0]
 [0 0 5 0 0 0 0]
 [0 0 5 5 5 5 5]
 [0 0 0 0 0 0 0]]

Here are the State  [-, -, 2]:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]

Here are the objects [-, -, ObjectID]:
[[ 2  2  2  2  2  2]
 [ 2 -1 -1 -1 -1  2]
 [ 2 -1 -1 -1  5  2]
 [ 2 -1 10 -1 -1  2]
 [ 2 -1 -1 -1  6  2]
 [ 2  2  2  2  4  2]
 [ 2 -1 -1 -1 -1  2]
 [ 2  7 -1 -1 -1  2]
 [ 2 -1 -1 -1 -1  2]
 [ 2 -1 -1 -1 -1  2]
 [ 2  2  2  2  2  2]]


In [133]:
## matrix->text

import gymnasium as gym
import minigrid
import numpy as np
from collections import defaultdict

# --- 1. 定义ID到英文单词的映射 (保持不变) ---
IDX_TO_OBJECT = {
    0: 'unseen',
    1: 'empty',
    2: 'wall',
    3: 'floor',
    4: 'door',
    5: 'key',
    6: 'ball',
    7: 'box',
    8: 'goal',
    9: 'lava',
    10: 'agent',
}

IDX_TO_COLOR = {
    0: 'red',
    1: 'green',
    2: 'blue',
    3: 'purple',
    4: 'yellow',
    5: 'grey',
}

IDX_TO_STATE = {
    0: 'open',
    1: 'closed',
    2: 'locked',
}


def describe_observation_in_english_grouped(obs) -> str:
    """
    (新版本) 接收Minigrid的观察字典，
    返回一个将相同物体组合在一起的、更自然的英文文字描述。
    """
    direct = obs['direction']
    direction_map = {
        0: 'right (East)',
        1: 'down (South)',
        2: 'left (West)',
        3: 'up (North)'
    }
    direction = direction_map.get(direct, 'unknown direction')
    direction_text = f"You're facing {direction}."

    image= obs['image']
    height, width, _ = image.shape
    
    # --- 新增逻辑：使用字典来给物体分组 ---
    # 键是一个元组 (颜色, 类型, 状态)，值是一个坐标列表
    grouped_objects = defaultdict(list)
    
    object_grid = image[:, :, 0]
    color_grid = image[:, :, 1]
    state_grid = image[:, :, 2]
    
    # 第一步：遍历视野，收集并给所有物体分组
    for y in range(height):
        for x in range(width):
            obj_id = object_grid[y, x]
            
            # 忽略 "unseen" (0) 和 "empty" (1) 的格子
            if obj_id not in [0, 1]:
                color_id = color_grid[y, x]
                state_id = state_grid[y, x]
                
                obj_name = IDX_TO_OBJECT.get(obj_id, f'object(ID:{obj_id})')
                color_name = IDX_TO_COLOR.get(color_id, f'color(ID:{color_id})')
                
                # 为物体创建一个唯一的键。对于门，状态也是其身份的一部分
                object_key = (color_name, obj_name)
                if obj_name == 'door':
                    state_name = IDX_TO_STATE.get(state_id, f'state(ID:{state_id})')
                    object_key = (color_name, obj_name, state_name)
                
                # 将当前坐标添加到对应的分组中
                grouped_objects[object_key].append(f"({x}, {y})")

    # 第二步：根据分组后的物体生成描述
    object_descriptions = []
    for object_key, coords in grouped_objects.items():
        color_name = object_key[0]
        obj_name = object_key[1]
        
        coords_str = ", ".join(coords)
        
        # 根据物体数量选择单数或复数形式
        if len(coords) == 1:
            description = f"- At coordinate {coords_str}, there is a {color_name} {obj_name}."
        else:
            # 简单的复数形式，对大多数简单名词有效
            plural_obj_name = obj_name + 's' 
            description = f"- There are {color_name} {plural_obj_name} at coordinates: {coords_str}."
        
        # 如果是门，附加上其状态
        if obj_name == 'door':
            state_name = object_key[2]
            description += f" It is {state_name}."
            
        object_descriptions.append(description)

    # 组装最终的完整描述
    
    if not object_descriptions:
        full_text = direction_text+"\nYou see nothing of interest."
    else:
        full_text = direction_text+"\n".join(object_descriptions)
        
    return full_text


    # 调用我们新的、经过优化的函数
text_description = describe_observation_in_english_grouped(obs)
    
    # 打印结果
print(text_description)
print("\nHere are the Object Type [-, -, 0]:")
print(obs['image'][:,:,0])


You're facing left (West).- There are grey walls at coordinates: (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (3, 5), (4, 5), (5, 5), (6, 5).
- At coordinate (5, 1), there is a yellow key.
- At coordinate (6, 2), there is a blue ball.

Here are the Object Type [-, -, 0]:
[[0 0 2 2 2 2 2]
 [0 0 2 1 1 5 1]
 [0 0 2 1 1 1 6]
 [0 0 2 1 1 1 1]
 [0 0 2 1 1 1 1]
 [0 0 2 2 2 2 2]
 [0 0 0 0 0 0 0]]


In [134]:
import gymnasium as gym
import minigrid
import numpy as np
from collections import defaultdict
import google.generativeai as genai
import os
import time
import re
from minigrid.core.actions import Actions

# --- 0. 配置 Gemini API ---
# 请确保您已经设置了 GOOGLE_API_KEY 环境变量
# 或者直接在这里配置：genai.configure(api_key="YOUR_API_KEY")
import os
import google.generativeai as genai
import textworld.gym
api_key = os.environ.get('GOOGLE_API_KEY')
print(f"Google API Key: {api_key}")
genai.configure(api_key= "AIzaSyApmTC5BaW21s9xrhznwYyuYxWU6U2GTno")

#try call gemini-2.5-flash
model = genai.GenerativeModel('gemini-2.5-flash')
response = model.generate_content(
    contents="How are you?",
)
print(response.text)

def extract_commands(response: str) -> list[str]:
    """(新版) 从 Gemini 的响应中提取一个或多个命令的列表"""
    # 1. 优先寻找被方括号包裹的动作列表
    match = re.search(r'\[(.*?)\]', response)
    if match:
        # 提取括号内的内容，按逗号分割，并清理每个命令
        commands_str = match.group(1)
        return [cmd.strip().lower() for cmd in commands_str.split(',')]

    # 2. 如果没找到，作为备用方案，寻找独立的动作名称
    found_commands = []
    for line in response.splitlines():
        cleaned_line = line.strip().lower()
        if cleaned_line in ['left', 'right', 'forward', 'pickup', 'drop', 'toggle', 'done']:
            found_commands.append(cleaned_line)
    
    if found_commands:
        return found_commands

    return [] # 如果完全没找到，返回空列表
    
# --- 2. 新的 Gemini 交互逻辑 ---
def get_gemini_action(prompt: str) -> str:
    """调用 Gemini API 获取决策"""
    model = genai.GenerativeModel('gemini-2.5-flash')
    try:
        response = model.generate_content(prompt)
        print(f"Gemini response: {response.text}")
        action = extract_commands(response.text)
        return action
    except Exception as e:
        print(f"Error calling Gemini API: {e}")
        return "forward" # 默认动作


Google API Key: None
As an AI, I don't have feelings or a physical state like humans do, so I don't experience "how I am" in that way.

However, I am fully operational and ready to assist you!

How can I help you today?


In [135]:

# --- 1. 我们之前编写的“观察转文字”函数和映射字典 ---
IDX_TO_OBJECT = {0:'unseen', 1:'empty', 2:'wall', 3:'floor', 4:'door', 5:'key', 6:'ball', 7:'box', 8:'goal', 9:'lava', 10:'agent'}
IDX_TO_COLOR = {0:'red', 1:'green', 2:'blue', 3:'purple', 4:'yellow', 5:'grey'}
IDX_TO_STATE = {0:'open', 1:'closed', 2:'locked'}


def build_minigrid_prompt(obs, history: str) -> str:
    mission = obs['mission']
    world_description = describe_observation_in_english_grouped(obs)
    fixed_actions = """- 'left': Turn left by 90 degrees on the spot.
- 'right': Turn right by 90 degrees on the spot.
- 'forward': Move one step forward in the direction you are facing.
- 'pickup': Pick up an object directly in front of you.
- 'drop': Drop the object you are carrying.
- 'toggle': Interact with an object in front of you (e.g., open a door).
- 'done': End the mission if you think you are finished."""

    prompt_template = f"""You are an intelligent agent in a grid-based video game. Your goal is to navigate and interact with objects to solve a mission.
Your Mission is to {mission}.
### RULES OF THE WORLD ###
1.  The world is a single, continuous map which may contain multiple rooms connected by doors.
2.  Your 7x7 view is a small window onto this larger map. You are always at the bottom-center of this 7x7 grid, the coordinate (3, 6) .
3.  A change in your view means you have moved or turned, NOT that you have entered a new, separate room.
4.  Actions have simple, physical effects as described below.
5.  You can only interact with objects that are directly in front of you. You cannot toggle the wall.
### ACTION HISTORY LOG (Last 10 Steps) ###
{history if history else 'This is the first step. Lets go!'}
Here is what you see now: {world_description}
Based on your mission and current situation, you MUST choose your next action from the following fixed list:
{fixed_actions}
Take a deep thought of what to do and respond in a new line with only the action name in [], for example: 
[forward]

But you don't have to rush. Remember turning or moving will give you a new view and new information!
"""
    return prompt_template

#If you are certain about what to do, you can select a sequence of actions, for example,
#[forward, right, forward, forward, pickup, right]
#But you don't have to rush. Remember turning or moving will give you a new view and new information!


In [136]:
import imageio # 确保在文件顶部导入

def visualize_playthrough(env_name: str, seed: int, actions: list, filename: str = "agent_playthrough.gif"):
    """
    接收一个动作列表，并将其在一个新环境中重播，最终保存为GIF动图。
    """
    print(f"\n--- Creating visualization. Saving to {filename} ---")
    
    # 1. 使用相同的种子和设置重新创建环境
    #    关键：将渲染模式设置为 'rgb_array'
    env = gym.make(env_name, render_mode="rgb_array")
    obs, info = env.reset(seed=seed)
    
    frames = []
    
    # 2. 获取第一帧的画面
    frame = env.render()
    frames.append(frame)
    
    # 3. 循环播放记录下来的动作
    for action_id in actions:
        # 执行动作
        obs, reward, terminated, truncated, info = env.step(action_id)
        
        # “截图”并保存当前帧
        frame = env.render()
        frames.append(frame)
        
        if terminated or truncated:
            break
            
    env.close()
    
    # 4. 使用 imageio 将所有帧合成为一个GIF
    imageio.mimsave(filename, frames, fps=3) # fps是每秒帧数，可以调整
    print("--- Visualization saved! ---")


In [137]:
from collections import deque
recorded_actions = []

def main():

    env_name = "MiniGrid-Empty-5x5-v0"
    seed = 42
    
    env = gym.make(env_name)
    ACTION_MAP = {action.name.lower(): action.value for action in Actions}
    
    obs, info = env.reset(seed=seed)
    print(obs['mission'])
    
    history_deque = deque(maxlen=10)

    
    moves = 0
    done = False
    
    print("--- Game Start ---")
    
    while not done:       
        if moves > 25:
            print(f"\n--- Moves exceeded 25. Ending episode early. ---")
            break
        history_for_prompt = "\n".join(list(history_deque))
        prompt = build_minigrid_prompt(obs, history_for_prompt)
        print(prompt)
        # 从 Gemini 获取一个动作序列（列表）
        action_sequence = get_gemini_action(prompt)
        # 如果 Gemini 没有给出有效动作，则默认为'前进'
        if not action_sequence:
            print(f"--- Gemini chose an INVALID or EMPTY command sequence. Defaulting to ['forward']. ---")
            action_sequence = ['forward']

        # 内部循环，按顺序执行动作序列中的每一个动作
        for i, action_name in enumerate(action_sequence):
            print(f"\n... Executing step {i+1}/{len(action_sequence)}: '{action_name}' ...")
            
            if action_name in ACTION_MAP:
                action_id = ACTION_MAP[action_name]
            else:
                print(f"--- Invalid action '{action_name}' in sequence. Skipping. ---")
                break
            
            recorded_actions.append(action_id)
            
            new_obs, reward, terminated, truncated, info = env.step(action_id)
            done = terminated or truncated
            
            obs = new_obs

            if terminated: print("    MISSION SUCCESSFUL! Halting sequence.")
            
            # 如果中途任务完成，必须立刻跳出内循环和外循环
            if done:
                break
        observation=describe_observation_in_english_grouped(obs)
        history_entry = f"- Step {moves+1}: Chose action {', '.join(action_sequence) }, Current view {observation}"
        history_deque.append(history_entry)
        print(history_entry)
            
        moves += 1
        time.sleep(1) 

    env.close()
    print(f"\n--- Game Over! ---")
    print(f"Episode finished in {moves} moves.")
    
    # 新增：游戏结束后，调用可视化函数
    if recorded_actions:
        visualize_playthrough(env_name, seed, recorded_actions)

if __name__ == "__main__":
    main()

get to the green goal square
--- Game Start ---
You are an intelligent agent in a grid-based video game. Your goal is to navigate and interact with objects to solve a mission.
Your Mission is to get to the green goal square.
### RULES OF THE WORLD ###
1.  The world is a single, continuous map which may contain multiple rooms connected by doors.
2.  Your 7x7 view is a small window onto this larger map. You are always at the bottom-center of this 7x7 grid, the coordinate (3, 6) .
3.  A change in your view means you have moved or turned, NOT that you have entered a new, separate room.
4.  Actions have simple, physical effects as described below.
5.  You can only interact with objects that are directly in front of you. You cannot toggle the wall.
### ACTION HISTORY LOG (Last 10 Steps) ###
This is the first step. Lets go!
Here is what you see now: You're facing right (East).- There are grey walls at coordinates: (0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (0, 1), (1, 1), (2, 1),

In [138]:
env_name = "MiniGrid-Empty-5x5-v0"
seed = 42
if recorded_actions:
    visualize_playthrough(env_name, seed, recorded_actions)


--- Creating visualization. Saving to agent_playthrough.gif ---
--- Visualization saved! ---


####Visualize