# Custom Environment
## 1. Import

In [4]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

import os
import numpy as np
import random

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Types of Spaces

In [7]:
Discrete(3).sample() # Integer in {0, 1, 2}: Discrete(n) covers 0..n-1, so 3 itself is never sampled

0

In [10]:
Box(0,1,shape=(3,3)).sample() # Continuous values bounded by low=0 and high=1; shape=(3,3) makes each sample a 3x3 float32 matrix

array([[0.7263107 , 0.75180125, 0.64513546],
       [0.5929274 , 0.22702058, 0.07827154],
       [0.870654  , 0.50559676, 0.2721864 ]], dtype=float32)

In [12]:
Tuple((Discrete(3),Box(0,1,shape=(3,)))).sample() # Tuple bundles several sub-spaces; a sample is a tuple holding one value per sub-space, in order

(1, array([0.22773798, 0.5434745 , 0.48585442], dtype=float32))

In [15]:
Dict({'height':Discrete(2), 'speed':Box(0,100,shape=(1,))}).sample() # Like Tuple, but sub-spaces are addressed by key; a sample maps each key to a value from its sub-space

OrderedDict([('height', 1), ('speed', array([69.00639], dtype=float32))])

In [16]:
MultiBinary(4).sample() # Four independent 0/1 flags (not one-hot: several flags can be 1 at the same time)

array([1, 0, 0, 1], dtype=int8)

In [17]:
MultiDiscrete([5,2,2]).sample() # One integer per entry, each with its own range: here 0-4, 0-1 and 0-1

array([4, 0, 1])

## 3. Building an Environment

In [104]:
class ShowerEnv(Env):
    """Toy shower-temperature environment.

    Once per step the agent lowers the temperature by one degree, leaves it
    unchanged, or raises it by one degree. It earns +10 while the temperature
    is within 37-40 (inclusive) and -1 otherwise. An episode lasts a fixed
    number of steps.

    Actions (``Discrete(3)``): 0 -> -1, 1 -> no change, 2 -> +1.
    Observation (``Box(0, 100, (1,), float32)``): the current temperature.
    NOTE: the temperature is not clipped, so an extreme run of actions can
    leave the declared observation bounds.
    """

    def __init__(self, shower_length=60):
        """Create the spaces and an initial state.

        Args:
            shower_length: Number of steps in one episode (default 60).
        """
        self.action_space = Discrete(3)
        self.observation_space = Box(low=0, high=100, shape=(1,), dtype=np.float32)
        self.max_shower_length = shower_length
        self.info = {}
        # Same representation as after reset(): a float32 array of shape (1,),
        # so step() behaves identically whether or not reset() was called first.
        self.state = self._initial_state()
        self.shower_length = self.max_shower_length

    def _initial_state(self):
        """Return a start temperature of 38 +/- 3 as a float32 array of shape (1,)."""
        offset = self.np_random.integers(-3, 3, endpoint=True)
        return np.array([38 + offset], dtype=np.float32)

    def step(self, action):
        """Advance the shower by one step.

        Args:
            action: 0, 1 or 2, mapped to a temperature change of -1, 0 or +1.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        # Apply action (change in temperature). A new array is built instead of
        # mutating in place so observations handed out earlier stay unchanged,
        # and the cast keeps the dtype aligned with observation_space.
        self.state = (self.state + (action - 1)).astype(np.float32)

        # Decrease episode length remaining (shower time)
        self.shower_length -= 1

        # Define reward function
        temperature = float(self.state[0])
        reward = 10 if 37 <= temperature <= 40 else -1

        # Both end-of-episode flags are raised together when the time runs out,
        # so loops that check only `terminated` still stop.
        done = self.shower_length <= 0
        terminated = done
        truncated = done

        # `self.info`, not a bare `info`: the latter is undefined in this scope.
        return self.state, reward, terminated, truncated, self.info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed for the environment's random generator.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` with an empty info dict.
        """
        # Lets the base class (re)seed self.np_random when a seed is given.
        super().reset(seed=seed)
        self.state = self._initial_state()
        self.shower_length = self.max_shower_length
        return self.state, {}

In [105]:
# Build a fresh environment and display its observation space.
# No `del env` beforehand: on a fresh kernel the name does not exist yet and
# `del` would raise NameError; rebinding the name is sufficient.
env = ShowerEnv()
env.observation_space

Box(0.0, 100.0, (1,), float32)

In [106]:
# Display the action space that ShowerEnv.__init__ declared.
env.action_space

Discrete(3)

## 4. Test Environment

In [107]:
# Smoke-test the environment: play a few episodes with random actions.
episodes = 5
for episode in range(1, episodes + 1):
    # reset() returns an (observation, info) pair, so unpack both.
    obs, info = env.reset()
    terminated = truncated = False
    score = 0

    # The episode is over as soon as either end-of-episode flag is raised.
    while not (terminated or truncated):
        env.render()
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
# ShowerEnv defines no close() of its own, so this uses the inherited one;
# the same `env` object is used again below for training.
env.close()

Episode:1 Score:182
Episode:2 Score:193
Episode:3 Score:226
Episode:4 Score:39
Episode:5 Score:39


## 5. Train Model

In [108]:
# Create the PPO agent with TensorBoard logs written under Training/Logs.
# No `del model` beforehand: on a fresh kernel the name does not exist yet and
# `del` would raise NameError; rebinding the name is sufficient.
log_path = os.path.join('Training','Logs')
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [109]:
# Train for 40,000 environment steps; the model was created with verbose=1,
# so progress tables are printed while training runs.
model.learn(total_timesteps=40000)

Logging to Training/Logs/PPO_15
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | 160      |
| time/              |          |
|    fps             | 722      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | 294         |
| time/                   |             |
|    fps                  | 582         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009373603 |
|    clip_fraction        | 0.0486      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 8.64e-06    

<stable_baselines3.ppo.ppo.PPO at 0x7fc297783a90>

## 6. Save Model

In [110]:
# Build the save location Training/Saved_Models/Shower_Model_PPO and write the
# trained model there. `shower_path` is reused by the load cell below.
shower_path = os.path.join('Training','Saved_Models','Shower_Model_PPO')
model.save(shower_path)

In [111]:
# Drop the in-memory model so the following load really restores it from disk.
del model

In [112]:
# Restore the saved model from `shower_path` and attach `env` to it.
model = PPO.load(shower_path,env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [113]:
# Evaluate the loaded model over 10 episodes without rendering.
# The displayed tuple is (mean episode reward, standard deviation of the reward).
evaluate_policy(model,env,n_eval_episodes=10,render=False)

(468.0, 264.0)