This notebook aims to create a minimal framework for a custom RL environment.

[gymnasium custom environment](https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/#sphx-glr-tutorials-gymnasium-basics-environment-creation-py)

In [1]:
import gymnasium as gym
import numpy as np

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_checker import check_env

# 1. Create a Custom Env

In [2]:
class minimumTradeEnv(gym.Env):
    """Minimal custom Gymnasium environment: match the binary signal.

    Every observation is a random signal in {0, 1}. On each step the agent
    receives a reward of 1 if its action equals the signal it was shown
    (the observation returned by the previous ``reset``/``step`` call) and 0
    otherwise. ``step`` always returns ``terminated=False`` and
    ``truncated=False``, so an episode never ends on its own.
    """

    def __init__(self):
        self.observation_space = gym.spaces.Discrete(2) # {0, 1}
        self.action_space = gym.spaces.Discrete(2) # {0, 1}

        # Defined up front so render() does not raise AttributeError when it
        # is called before the first step().
        self.observation = None
        self.signal = None
        self.action = None
        self.reward = None

    def _get_obs(self):
        """Draw the next signal from {0, 1}.

        Uses the environment's own generator (``self.np_random``) instead of
        the global ``np.random`` state so that ``reset(seed=...)`` makes the
        signal sequence reproducible. The value is cast to a plain ``int`` to
        keep the same return type as before.
        """
        return int(self.np_random.integers(2)) # {0, 1}

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``.
            options: Accepted for compatibility with the Gymnasium ``reset``
                signature; not used by this environment.
        """
        # Seeds self.np_random when a seed is given (Gymnasium API contract).
        super().reset(seed=seed)
        self.observation = self._get_obs()
        return self.observation, {}

    def step(self, action):
        """Score ``action`` against the current signal and draw the next one.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``reward`` is 1 when ``action`` equals the signal the agent was
            shown and 0 otherwise; ``terminated`` and ``truncated`` are
            always ``False`` and ``info`` is an empty dict.
        """
        # The signal being answered is the observation handed out last time.
        self.signal = self.observation
        self.action = action
        self.observation = self._get_obs()
        terminated = False
        truncated = False
        self.reward = 1 if self.action==self.signal else 0

        return self.observation, self.reward, terminated, truncated, {}

    def render(self):
        """Print the signal, action and reward recorded by the latest step."""
        print(f"signal: {self.signal}, action: {self.action}, reward:{self.reward}")

# 2. Verify the Custom Env

In [3]:
# Build the environment and run stable-baselines3's API conformance check on it.
env = minimumTradeEnv()
check_env(env, warn=True)

# 3. Train the Model

In [4]:
# Train a PPO agent on the custom environment created above.
model_ppo = PPO(
    policy="MlpPolicy",
    env=env,
    gamma=1,
    batch_size=256,
    verbose=1,
)
model_ppo.learn(total_timesteps=50_000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 472  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 527         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.021635562 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.684      |
|    explained_variance   | -0.000255   |
|    learning_rate        | 0.0003      |
|    loss                 | 29.3        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0309     |
|    value_loss        

<stable_baselines3.ppo.ppo.PPO at 0x2526fb88280>

# 4. Instantiate a Test Env

In [5]:
# Separate environment instance used for evaluation (distinct from the `env` passed to PPO).
test_env = minimumTradeEnv()

# 5. Test Env with Trained Model

In [7]:
# Roll the trained policy forward for 20 steps and print each transition.
obs, _ = test_env.reset()

for i in range(20):
    # deterministic=True takes the policy's preferred action instead of
    # sampling from its action distribution, which is what we want when
    # evaluating a trained model.
    action, _states = model_ppo.predict(obs, deterministic=True)
    obs, rewards, terminated, truncated, info = test_env.step(action)
    test_env.render()

    # Start a fresh episode if the environment ever signals an episode end.
    if terminated or truncated:
        obs, _ = test_env.reset()

signal: 0, action: 0, reward:1
signal: 0, action: 0, reward:1
signal: 0, action: 0, reward:1
signal: 0, action: 0, reward:1
signal: 0, action: 0, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 0, reward:1
signal: 0, action: 0, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 0, reward:1
signal: 0, action: 0, reward:1
signal: 0, action: 0, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 0, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 0, reward:1
signal: 1, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 1, action: 1, reward:1
