In [1]:
import gymnasium
from gymnasium import Env # Env class to build custom env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete # different type of spaces

import numpy as np
import random
import os

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

Let's look at types of spaces

In [4]:
Discrete(3).sample() # Discrete(3) holds exactly three integer values (0, 1, 2); sample() picks one of them at random

2

In [5]:
Box(0, 1, shape=(3,3)) # continuous box space: lower bound, upper bound and the shape of the array

Box(0.0, 1.0, (3, 3), float32)

In [6]:
# Draw one random 3x3 array from the box; every entry lies between the bounds 0 and 1
Box(0, 1, shape=(3,3)).sample()

array([[0.80953276, 0.18092021, 0.8583819 ],
       [0.49820903, 0.67182857, 0.61986655],
       [0.75848645, 0.4256289 , 0.15841283]], dtype=float32)

In [7]:
Tuple((Discrete(3), Box(0, 1, shape=(3,3))))  # combines several spaces into one composite space, kept in order

Tuple(Discrete(3), Box(0.0, 1.0, (3, 3), float32))

In [9]:
# A sample from a Tuple space is a tuple with one element per sub-space, in the same order
Tuple((Discrete(3), Box(0, 1, shape=(3,3)))).sample()

(0,
 array([[0.49876577, 0.97141844, 0.15645531],
        [0.2760475 , 0.28082424, 0.9451644 ],
        [0.06442003, 0.97389865, 0.31251356]], dtype=float32))

In [10]:
Dict({'height': Discrete(3), 'speed': Box(0,100, shape=(1,))}) # like Tuple, but each sub-space is addressed by a key instead of a position

Dict('height': Discrete(3), 'speed': Box(0.0, 100.0, (1,), float32))

In [11]:
# A sample from a Dict space maps every key to a sample of the corresponding sub-space
Dict({'height': Discrete(3), 'speed': Box(0,100, shape=(1,))}).sample()

OrderedDict([('height', 2), ('speed', array([84.66269], dtype=float32))])

In [14]:
MultiBinary(4).sample() # 4 independent positions, each of which is either 0 or 1

array([1, 1, 0, 1], dtype=int8)

In [16]:
MultiDiscrete([5, 2, 2]).sample() # like MultiBinary, but position i has its own number of values: here 0-4, 0-1 and 0-1

array([4, 1, 1], dtype=int64)

Let's build a custom Env in which the agent needs to regulate the shower temperature so that it stays within a comfortable range of 37-39 degrees.

In [52]:
class ShowerEnv(Env):
    """Toy environment: keep a shower's temperature inside a comfort band.

    Observation: a float32 array of shape (1,) holding the current temperature.
    Actions:     0 = decrease by one degree, 1 = leave unchanged, 2 = increase
                 by one degree.
    Reward:      +1 for every step whose resulting temperature lies in
                 [37, 39], -1 otherwise.
    Episode:     ends after ``SHOWER_LENGTH`` steps.
    """

    # Centre of the start-temperature distribution and its maximum deviation.
    START_TEMPERATURE = 38
    START_SPREAD = 3
    # Inclusive bounds of the comfortable temperature band.
    COMFORT_LOW = 37
    COMFORT_HIGH = 39
    # Number of steps in one episode.
    SHOWER_LENGTH = 60

    def __init__(self):
        # Actions, increase (2), decrease (0) and not change (1)
        self.action_space = Discrete(3)
        # Temperatures
        self.observation_space = Box(low=0, high=100, shape=(1,))
        # Starting state (initial temperature)
        self.state = self._random_start_state()
        # How long to shower
        self.shower_length = self.SHOWER_LENGTH

    def _random_start_state(self):
        """Return a fresh float32 observation with a random start temperature.

        The offset is drawn from the environment's own ``np_random`` generator
        so that ``reset(seed=...)`` makes episodes reproducible.
        """
        offset = int(self.np_random.integers(-self.START_SPREAD, self.START_SPREAD, endpoint=True))
        return np.array([self.START_TEMPERATURE + offset], dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: optional seed for the environment's random generator.
            options: accepted for Gymnasium API compatibility; unused.

        Returns:
            A ``(observation, info)`` tuple; ``info`` is an empty dict.
        """
        # Seeds self.np_random when a seed is given (Gymnasium convention).
        super().reset(seed=seed)
        self.state = self._random_start_state()
        self.shower_length = self.SHOWER_LENGTH

        # The second element must be an info dict, not an arbitrary name.
        return self.state, {}

    def step(self, action):
        """Apply one action and advance the shower timer by one step.

        Args:
            action: 0, 1 or 2 (decrease, keep, increase the temperature).

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        # Adjust temperature (-1, 0, +1 degrees). A new array is created
        # instead of mutating in place, so observations that were already
        # handed to the caller do not change behind its back.
        delta = int(np.asarray(action).item()) - 1
        self.state = self.state + np.float32(delta)
        # Shower timer
        self.shower_length -= 1

        # Reward: compare the scalar temperature rather than relying on the
        # truth value of a one-element array.
        temperature = float(self.state[0])
        reward = 1 if self.COMFORT_LOW <= temperature <= self.COMFORT_HIGH else -1

        # Stop if shower ended
        terminated = self.shower_length <= 0
        truncated = False

        # Info
        info = {}

        return self.state, reward, terminated, truncated, info

    def render(self, mode='human'):
        """Placeholder for a visualisation; currently does nothing."""
        # here we can implement visuals using pygame for example
        pass

In [53]:
# Create one instance of the custom shower environment defined above
env = ShowerEnv()

Let's see what we got

In [33]:
# Random action from the env's Discrete(3) action space (0 = decrease, 1 = keep, 2 = increase)
env.action_space.sample()

1

In [34]:
# Random observation from the env's Box(0, 100) temperature space
env.observation_space.sample()

array([63.00088], dtype=float32)

In [35]:
# Start a new episode: draws a fresh start temperature and restores the 60-step shower timer
env.reset()

array([39.], dtype=float32)

In [55]:
# Baseline: play a few episodes with purely random actions and print the
# total reward of each, to see how an untrained agent performs.
episodes = 5
for episode in range(1, episodes+1):
    # Gymnasium's reset() returns (observation, info), so unpack both.
    state, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        state, reward, terminated, truncated, info = env.step(action)
        # An episode is over when it either terminates or is truncated.
        done = terminated or truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-8
Episode:2 Score:-60
Episode:3 Score:-36
Episode:4 Score:-32
Episode:5 Score:-46


Training!

In [56]:
# Build a PPO agent with the 'MlpPolicy' policy on our env; verbose=1 prints setup and training info.
# As the output below shows, the env is wrapped in a Monitor and a DummyVecEnv automatically.
model = PPO('MlpPolicy', env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Train in a single call so that "Restart Kernel -> Run All" reproduces the
# result. The reported score was originally obtained by running a
# 40,000-timestep cell twice on the same model, i.e. 80,000 timesteps in total.
model.learn(total_timesteps=80_000)

In [61]:
# Run the trained policy for 100 episodes; the result is a (mean episode reward, standard deviation) pair.
# The best possible episode reward in this env is 60 (+1 on each of the 60 steps).
evaluate_policy(model, env, n_eval_episodes=100)

(59.3, 0.9539392014169458)

Yeeeeeey!