# Environment

In [2]:
import numpy as np
import os
from pyboy import PyBoy
from gymnasium import Env, spaces
from stable_baselines3 import PPO
from stable_baselines3.ppo import MultiInputPolicy
from stable_baselines3.common.env_checker import check_env
from gymnasium.wrappers import TransformObservation
from gymnasium.spaces import Box, Dict

# Button names the agent can press. The position of a name in this list is the
# discrete action id: the environment sizes its action space with len(ACTIONS)
# and looks the button up with ACTIONS[action].
ACTIONS = ['a', 'b', 'left', 'right', 'up', 'down', 'start']

class GenericPyBoyEnv(Env):
    """Gymnasium environment that drives a Game Boy game through a PyBoy emulator.

    Observation (a ``Dict`` space):
        * ``"image"``: ``(144, 160, 1)`` ``uint8`` frame, the per-pixel mean of
          the emulator screen over its last (channel) axis.
        * ``"info"``: three ``float32`` values in ``[0, 1]``: the x, y and map-id
          bytes read from RAM, each divided by 255.

    Action: an index into the module-level ``ACTIONS`` list.

    Reward per step: +10 the first time a map id is seen in the episode, +1 the
    first time an ``(x, y, map)`` triple is seen in the episode, and -0.05 when
    the triple is unchanged since the previous step.

    The episode ends once ``max_gameplay_time`` emulator frames have elapsed.
    """

    # Emulator frames advanced for every agent action.
    FRAMES_PER_STEP = 60

    # RAM addresses of the bytes used as x position, y position and map id.
    # NOTE(review): game-specific addresses -- confirm against a RAM map of the ROM.
    ADDR_POS_X = 0xC0D4
    ADDR_POS_Y = 0xC0D5
    ADDR_MAP_ID = 0xC92C

    def __init__(self, pyboy, debug=False, render_mode=False, max_gameplay_time=216000,
                 state_path="state_file.state"):
        """Wrap an already-constructed emulator.

        Args:
            pyboy: PyBoy instance with the ROM loaded.
            debug: When truthy, the emulation speed is left untouched.
            render_mode: When truthy, a lower emulation speed is requested and
                every frame is rendered so the game can be watched.
            max_gameplay_time: Episode length, in emulator frames.
            state_path: Path of the PyBoy save state loaded on construction and
                on every ``reset()``.
        """
        super().__init__()
        self.pyboy = pyboy
        self.debug = debug
        self.render_mode = render_mode
        self.max_gameplay_time = max_gameplay_time
        self.state_path = state_path
        self.current_gameplay_time = 0

        self.action_space = spaces.Discrete(len(ACTIONS))
        self.observation_space = Dict({
            "image": Box(low=0, high=255, shape=(144, 160, 1), dtype=np.uint8),
            "info": Box(low=0.0, high=1.0, shape=(3,), dtype=np.float32)
        })

        # Per-episode exploration bookkeeping used by the reward.
        self.visited_maps = set()
        self.visited_positions = set()
        self.last_pos = None

        if not self.debug:
            self.pyboy.set_emulation_speed(50 if not self.render_mode else 3)

        self.load_state()

    def load_state(self):
        """Restore the emulator from the save state at ``self.state_path``."""
        with open(self.state_path, "rb") as f:
            self.pyboy.load_state(f)

    def _read_position(self):
        """Return the raw ``(x, y, map_id)`` bytes currently held in emulator RAM."""
        memory = self.pyboy.memory
        return (memory[self.ADDR_POS_X], memory[self.ADDR_POS_Y], memory[self.ADDR_MAP_ID])

    def get_observation(self):
        """Build the observation dict from the current screen and RAM contents."""
        # NOTE(review): the mean runs over every channel of screen.ndarray; if that
        # array carries an alpha channel it is averaged in too -- confirm intended.
        frame = np.mean(self.pyboy.screen.ndarray, axis=-1).astype(np.uint8)
        info = np.array([value / 255.0 for value in self._read_position()], dtype=np.float32)
        return {"image": frame[..., None], "info": info}

    def step(self, action):
        """Press one button, advance ``FRAMES_PER_STEP`` frames and score the result.

        Args:
            action: Index into ``ACTIONS`` (a Python int or NumPy integer scalar).

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``truncated``
            is always ``False`` and ``info`` is always empty.
        """
        self.pyboy.button(ACTIONS[int(action)])
        if self.render_mode or self.debug:
            # Someone may be watching: tick frame by frame so every frame is drawn.
            for _ in range(self.FRAMES_PER_STEP):
                self.pyboy.tick()
        else:
            # Headless: one batched call, which renders only the final frame.
            self.pyboy.tick(self.FRAMES_PER_STEP)
        self.current_gameplay_time += self.FRAMES_PER_STEP

        # NOTE(review): the time limit is reported as `terminated`; Gymnasium's
        # convention is `truncated`. Kept as-is because callers check `done`.
        done = self.current_gameplay_time >= self.max_gameplay_time

        position = self._read_position()
        map_value = position[2]

        reward = 0.0
        if map_value not in self.visited_maps:
            reward += 10.0
        if position not in self.visited_positions:
            reward += 1.0
        if position == self.last_pos:
            reward -= 0.05

        self.visited_maps.add(map_value)
        self.visited_positions.add(position)
        self.last_pos = position

        return self.get_observation(), reward, done, False, {}

    def reset(self, seed=None, **kwargs):
        """Start a new episode from the save state.

        Seeds the base-class RNG, reloads the save state and clears all
        per-episode bookkeeping (frame counter, visited maps, visited positions
        and last position) so every episode is scored the same way.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.
        """
        super().reset(seed=seed)
        self.load_state()
        self.current_gameplay_time = 0
        self.visited_maps.clear()
        self.visited_positions.clear()
        self.last_pos = None
        return self.get_observation(), {}

    def close(self):
        """Stop the underlying emulator."""
        self.pyboy.stop()




# Agent

In [2]:
# Set up the training environment
pyboy = PyBoy(r"C:\Users\USUARIO\Desktop\RLMEDA\MedarotKabuto.gb")
train_env = GenericPyBoyEnv(pyboy, debug=False, render_mode=False)
# The env already emits uint8 frames, so this wrapper only re-asserts the dtype
# and passes the info vector through unchanged.
train_env = TransformObservation(
    train_env,
    lambda obs: {"image": obs["image"].astype(np.uint8), "info": obs["info"]},
    observation_space=train_env.observation_space
)

# Validate the environment, then train and save the model. The emulator is
# stopped in `finally` so an interrupted or failed run does not leave a live
# PyBoy instance behind when the cell is executed again.
try:
    check_env(train_env, warn=True)
    model = PPO(
        MultiInputPolicy,
        train_env,
        verbose=1,
        learning_rate=3e-4,
        n_steps=3600,      # one full default-length episode per rollout
        gamma=0.95,
        ent_coef=0.01,
        batch_size=225,    # divides n_steps evenly (16 minibatches)
        clip_range=0.3,
    )
    model.learn(total_timesteps=200000)
    model.save("ppo_medarot")
finally:
    # save=False: do not write the cartridge RAM file next to the ROM.
    pyboy.stop(save=False)

🔄 Reiniciando entorno!
🔄 Reiniciando entorno!
🔄 Reiniciando entorno!
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
🔄 Reiniciando entorno!
🔄 Reiniciando entorno!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.6e+03  |
|    ep_rew_mean     | 130      |
| time/              |          |
|    fps             | 29       |
|    iterations      | 1        |
|    time_elapsed    | 121      |
|    total_timesteps | 3600     |
---------------------------------
🔄 Reiniciando entorno!
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.6e+03     |
|    ep_rew_mean          | 113         |
| time/                   |             |
|    fps                  | 25          |
|    iterations           | 2           |
|    time_elapsed         | 280         |
|    total_timesteps      | 7200        |
| train/ 

In [None]:
# Model evaluation
model = PPO.load("ppo_medarot")
pyboy = PyBoy(r"C:\Users\USUARIO\Desktop\RLMEDA\MedarotKabuto.gb")
eval_env = GenericPyBoyEnv(pyboy, debug=False, render_mode=False)
eval_env = TransformObservation(
    eval_env,
    lambda obs: {"image": obs["image"].astype(np.uint8), "info": obs["info"]},
    observation_space=eval_env.observation_space
)


# Run the trained model for a few episodes and collect the episode returns
num_episodes = 5
total_rewards = []

try:
    for episode in range(num_episodes):
        obs, _ = eval_env.reset()
        total_reward = 0

        # 3600 actions x 60 frames per action = 216000 frames, which is the
        # environment's default max_gameplay_time.
        for _ in range(3600):
            # The policy is a MultiInputPolicy, so the observation must stay a dict
            if not isinstance(obs, dict):
                raise ValueError(f"La observación esperada es un diccionario, pero se recibió: {type(obs)}")

            # Ask the model for an action; the dict observation is passed as-is
            action, _ = model.predict(obs)

            # Advance the environment by one action
            obs, reward, done, truncated, _ = eval_env.step(action)
            total_reward += reward

            # Stop on either end-of-episode signal
            if done or truncated:
                break

        total_rewards.append(total_reward)
        print(f"Episodio {episode + 1}: Recompensa total: {total_reward}")
finally:
    # Always release the emulator; save=False avoids writing the cartridge RAM
    # file next to the ROM.
    pyboy.stop(save=False)

# Print the mean episode return
print(f"Recompensa promedio en {num_episodes} episodios: {np.mean(total_rewards)}")


Episodio 1: Recompensa total: 121.7499999999815


In [3]:
# Smoke test: build a wrapped environment and inspect the first observation
pyboy = PyBoy(r"C:\Users\USUARIO\Desktop\RLAgent\data\MedarotKabuto.gb")
train_env = TransformObservation(
    GenericPyBoyEnv(pyboy, debug=False, render_mode=False),
    # Force the frame to uint8 and pass the info vector through untouched
    lambda raw: {"image": raw["image"].astype(np.uint8), "info": raw["info"]},
    observation_space=spaces.Dict({
        "image": spaces.Box(low=0, high=255, shape=(144, 160, 1), dtype=np.uint8),
        "info": spaces.Box(low=0.0, high=1.0, shape=(3,), dtype=np.float32),
    }),
)

# reset() returns (observation, info); only the observation is inspected here
obs, _ = train_env.reset()
print("Image shape:", obs["image"].shape)  # expected: (144, 160, 1)
print("Info shape:", obs["info"].shape)  # expected: (3,)
print("Info values:", obs["info"])  # expected: values between 0 and 1



🔄 Reiniciando entorno!
Image shape: (144, 160, 1)
Info shape: (3,)
Info values: [0.01960784 0.01960784 0.36862746]
