In [None]:
# Entrenamiento de Agente RL
# Este notebook prepara y entrena un agente de aprendizaje por refuerzo (PPO/DQN) usando la simulación del BSM1.

In [1]:
# Entrenamiento de Agente RL
# Este notebook prepara y entrena un agente de aprendizaje por refuerzo (PPO) usando la simulación del BSM1 (modo demostración).

# Instalar librerías necesarias
!pip install stable-baselines3[extra] optuna -q

# Imports
import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from gym import spaces

# Simulación simplificada del BSM1 (mock)
class BSM1Env(gym.Env):
    def __init__(self):
        super(BSM1Env, self).__init__()
        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)  # Control de oxígeno, por ejemplo
        self.observation_space = spaces.Box(low=0, high=100, shape=(4,), dtype=np.float32)  # Ejemplo: DQO, NH4, NO3, SRT
        self.state = np.array([50.0, 10.0, 5.0, 8.0])
        self.t = 0

    def reset(self):
        self.state = np.array([50.0, 10.0, 5.0, 8.0])
        self.t = 0
        return self.state

    def step(self, action):
        oxigeno = action[0]
        self.state += np.random.normal(0, 0.5, size=self.state.shape) - oxigeno
        reward = -np.sum(np.square(self.state - np.array([30, 5, 2, 5])))  # Penaliza desviación del estado deseado
        self.t += 1
        done = self.t >= 96  # 96 pasos = 4 días si cada paso es 1h
        return self.state, reward, done, {}

    def render(self, mode="human"):
        print(f"Step {self.t}, State: {self.state}")

# Crear y verificar el entorno
env = BSM1Env()
check_env(env)

# Entrenar el agente PPO
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=5000)

# Guardar el modelo entrenado
model.save("ppo_bsm1_agent")

# Probar el agente entrenado
obs = env.reset()
for i in range(24):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done:
        break


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/383.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m378.9/383.6 kB[0m [31m15.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

AssertionError: Your environment must inherit from the gymnasium.Env class cf. https://gymnasium.farama.org/api/env/

In [2]:
# Instalar librerías necesarias
!pip install gymnasium stable-baselines3[extra] optuna -q

# Imports
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from gymnasium import spaces

# Simulación simplificada del BSM1 (mock)
class BSM1Env(gym.Env):
    def __init__(self):
        super(BSM1Env, self).__init__()
        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)  # Control de oxígeno
        self.observation_space = spaces.Box(low=0, high=100, shape=(4,), dtype=np.float32)  # DQO, NH4, NO3, SRT
        self.state = np.array([50.0, 10.0, 5.0, 8.0], dtype=np.float32)
        self.t = 0

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = np.array([50.0, 10.0, 5.0, 8.0], dtype=np.float32)
        self.t = 0
        return self.state, {}

    def step(self, action):
        oxigeno = action[0]
        self.state += np.random.normal(0, 0.5, size=self.state.shape) - oxigeno
        reward = -np.sum(np.square(self.state - np.array([30, 5, 2, 5])))  # Penaliza desviación del estado deseado
        self.t += 1
        done = self.t >= 96
        return self.state, reward, done, False, {}

    def render(self):
        print(f"Step {self.t}, State: {self.state}")

# Crear y verificar el entorno
env = BSM1Env()
check_env(env)

# Entrenar el agente PPO
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=5000)

# Guardar el modelo entrenado
model.save("ppo_bsm1_agent")

# Probar el agente entrenado
obs, _ = env.reset()
for i in range(24):
    action, _states = model.predict(obs)
    obs, reward, done, truncated, info = env.step(action)
    env.render()
    if done:
        break



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 96       |
|    ep_rew_mean     | -6.5e+04 |
| time/              |          |
|    fps             | 1296     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 96           |
|    ep_rew_mean          | -7e+04       |
| time/                   |              |
|    fps                  | 900          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0019337073 |
|    clip_fraction        | 0.00107      |
|    clip_range           | 0.2          |
|    en