In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

In [2]:
import numpy as np
import random
import os

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

`Discrete(n).sample()` returns a random integer in the range [0, n-1].

In [4]:
Discrete(3).sample()

1

`Box(low, high, shape=(r, c)).sample()` returns an r×c matrix of random numbers in [low, high].


In [6]:
Box(0,1,shape=(3,3)).sample()

array([[0.8348152 , 0.999604  , 0.27463803],
       [0.782614  , 0.68793845, 0.9758288 ],
       [0.854323  , 0.4515435 , 0.94891465]], dtype=float32)

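`Dict` groups several spaces together like `Tuple` does, but each sub-space is addressed by a name; `sample()` returns an ordered dictionary with one sample per key.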

In [10]:
Dict({"a": Discrete(3), "b": Box(0,100,shape=(3,3),dtype=int)}).sample()

OrderedDict([('a', 0),
             ('b', array([[74, 41, 51],
                     [35, 76, 54],
                     [55, 28, 72]]))])

`MultiBinary(n).sample()` returns an array of n random binary (0/1) values.

In [11]:
MultiBinary(4).sample()

array([0, 1, 0, 1], dtype=int8)

`MultiDiscrete([a, b, c]).sample()` returns three random integers x, y, z drawn from [0, a-1], [0, b-1] and [0, c-1] respectively.

In [12]:
MultiDiscrete([2,3,4]).sample()

array([1, 1, 3], dtype=int64)

# Building an Environment


In [4]:
class ShowerEnv(Env):
    """Toy shower-temperature environment using the classic ``gym`` API.

    Each episode lasts 60 steps. The temperature starts at 38 plus a random
    integer offset in [-3, 3], and every step the agent shifts it by
    ``action - 1``:

    * action 0 -> temperature decreases by 1
    * action 1 -> temperature unchanged
    * action 2 -> temperature increases by 1

    The reward is +1 while the temperature is within [37, 39] and -1 otherwise.

    Observations are float32 arrays of shape ``(1,)`` holding the current
    temperature, so they match the declared ``observation_space`` (the raw
    integer is still available as ``self.state``).

    NOTE: the temperature is not clipped, so a long run of identical actions
    can move it outside the declared [0, 100] bounds (e.g. start at 35 and
    lower it on all 60 steps).
    """

    def __init__(self):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=0, high=100, shape=(1,), dtype=np.float32)
        # Initialise state / shower_length exactly as a fresh episode would,
        # instead of duplicating the reset logic here.
        self.reset()

    def _get_obs(self):
        """Return the current temperature as a float32 array of shape (1,)."""
        return np.array([self.state], dtype=np.float32)

    def step(self, action):
        """Apply ``action`` and advance the shower by one time step.

        Args:
            action: integer in {0, 1, 2}; the temperature changes by
                ``action - 1``.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``observation``
            is the float32 array from ``_get_obs``, ``reward`` is +1 or -1,
            ``done`` is True once the 60 steps are used up, and ``info`` is
            an empty dict.
        """
        self.state += action - 1
        self.shower_length -= 1

        # REWARD: +1 inside the comfortable 37..39 band, -1 outside it.
        reward = 1 if 37 <= self.state <= 39 else -1

        # TERMINAL: the episode ends when the shower time runs out.
        done = self.shower_length <= 0
        info = {}

        return self._get_obs(), reward, done, info

    def reset(self):
        """Start a new episode and return its initial observation.

        The temperature is re-drawn as 38 plus a random integer in [-3, 3]
        and the remaining shower length is restored to 60 steps.
        """
        self.state = 38 + random.randint(-3, 3)
        self.shower_length = 60
        return self._get_obs()


In [5]:
env = ShowerEnv()

In [7]:
env.observation_space.sample()

array([3.8683991], dtype=float32)

In [14]:
env.action_space.sample()

1

# Testing

In [16]:
# Sanity-check the environment with a random agent: every step samples an
# action from the action space, and each episode's total reward is printed.
episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while not done:
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward

    print(f"Episode {episode} score {score}")
env.close()

Episode 1 score -60
Episode 2 score -30
Episode 3 score -56
Episode 4 score -28
Episode 5 score -38


In [17]:
# TensorBoard logs are written under Training/Logs (path joined portably).
log_path = os.path.join('Training', 'Logs')

# PPO agent with an MLP policy acting on the shower environment.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [19]:
# Train the agent for 50,000 environment steps; progress tables are printed because verbose=1.
model.learn(total_timesteps=50000)

Logging to Training\Logs\PPO_7
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -28.5    |
| time/              |          |
|    fps             | 1120     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | -26.3       |
| time/                   |             |
|    fps                  | 834         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013244765 |
|    clip_fraction        | 0.0494      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.01       |
|    explained_variance   | -2.65e-05   |

<stable_baselines3.ppo.ppo.PPO at 0x25db66d52b0>

In [None]:
# Location under which the trained agent is saved and later reloaded.
shower_path = os.path.join('Training','Saved_Models','Shower_Model_PPO')

In [25]:
# Run the trained policy for 10 evaluation episodes; the result is a (mean reward, reward std) pair.
evaluate_policy(model, env, n_eval_episodes=10)

(-24.0, 54.99090833947008)

In [None]:
# Persist the trained agent to disk at shower_path.
model.save(shower_path)

In [None]:
# Drop the in-memory agent so that any later use of `model` must come from a reload.
del model

In [None]:
# Restore the saved agent from shower_path and attach it to the environment.
model = PPO.load(shower_path,env)

In [None]:
# Evaluate the current `model` over 10 episodes.
evaluate_policy(model, env, n_eval_episodes=10)
