In [89]:
import os
import numpy as np
import random
# import gymnasium (aliased as gym) as the deep reinforcement learning environment framework
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Tuple, Dict, MultiDiscrete, MultiBinary

# import stable_baselines3 for DRL model
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


In [90]:
# this is a custom environment for testing purposes, gym is from gymnasium 
class ShowerEnv(gym.Env):
    """Toy shower-temperature environment used to exercise an RL pipeline.

    The single observation is the current temperature. Each step the agent
    nudges it down, holds it, or nudges it up by one, and earns +1 while the
    temperature is in the 37..39 band and -1 otherwise. An episode lasts
    exactly 60 steps.

    Attributes:
        action_space: ``Discrete(3)``; action ``a`` changes the temperature
            by ``a - 1`` (0 -> -1, 1 -> 0, 2 -> +1).
        observation_space: ``Box(0, 100, shape=(1,), dtype=float32)``.
        state: Current temperature.
        shower_length: Steps remaining in the current episode.
    """

    def __init__(self):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=0, high=100, shape=(1,), dtype=np.float32)
        self.state = self._sample_start_temp()
        self.shower_length = 60

    def _sample_start_temp(self):
        """Draw a start temperature uniformly from 35..41 with the env's own RNG."""
        # self.np_random is the generator that reset(seed=...) re-seeds, so
        # start temperatures are reproducible (the global `random` module is not).
        return 38 + int(self.np_random.integers(-3, 3, endpoint=True))

    def step(self, action):
        """Advance the shower by one step.

        Args:
            action: Element of ``action_space`` (0, 1 or 2).

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``observation`` is a float32 array of shape ``(1,)``.
        """
        # Apply temp adjustment, clamped so the observation never leaves the
        # bounds declared in observation_space (41 + 60 up-steps would reach 101).
        self.state = min(max(self.state + action - 1, 0), 100)
        # Decrease shower time
        self.shower_length -= 1

        # Calculate reward
        if 37 <= self.state <= 39:
            reward = 1
        else:
            reward = -1

        # The fixed 60-step duration is this environment's own ending
        # condition, so it is reported as termination. `truncated` stays
        # False: it is reserved for limits imposed from outside the
        # environment, and must not simply mirror `terminated`.
        terminated = self.shower_length <= 0
        truncated = False
        info = {}

        return np.array([self.state], dtype=np.float32), reward, terminated, truncated, info

    def render(self):
        """No-op: this environment has no visual rendering."""
        pass

    def reset(self, *, seed=None, options=None):
        """Start a new 60-step episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to re-seed
                ``self.np_random`` for reproducible start temperatures.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info)``: a float32 array of shape ``(1,)`` and an
            empty info dict.
        """
        # Let the base class (re-)seed self.np_random before sampling from it.
        super().reset(seed=seed)
        self.state = self._sample_start_temp()
        self.shower_length = 60
        return np.array([self.state], dtype=np.float32), {}

In [91]:
# Instantiate the custom environment; the cells below sample from and train on this instance.
env = ShowerEnv()

In [86]:
# Baseline: play episodes with uniformly random actions and print each episode's return.
episodes = 500
for episode in range(1, episodes+1):
    # reset() returns (observation, info), so unpack both.
    state, info = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action = env.action_space.sample()
        # step() returns five values; the episode is over when either flag is set.
        n_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print(f'Episode: {episode}, Score: {score}')

Episode: 1, Score: 4
Episode: 2, Score: 2
Episode: 3, Score: -28
Episode: 4, Score: -18
Episode: 5, Score: -22
Episode: 6, Score: -8
Episode: 7, Score: -54
Episode: 8, Score: -56
Episode: 9, Score: -12
Episode: 10, Score: -44
Episode: 11, Score: -32
Episode: 12, Score: -52
Episode: 13, Score: -58
Episode: 14, Score: -58
Episode: 15, Score: -36
Episode: 16, Score: -50
Episode: 17, Score: -60
Episode: 18, Score: -40
Episode: 19, Score: -40
Episode: 20, Score: -60
Episode: 21, Score: -30
Episode: 22, Score: -28
Episode: 23, Score: -20
Episode: 24, Score: 12
Episode: 25, Score: -46
Episode: 26, Score: -30
Episode: 27, Score: 20
Episode: 28, Score: -60
Episode: 29, Score: -28
Episode: 30, Score: -28
Episode: 31, Score: -14
Episode: 32, Score: -60
Episode: 33, Score: -16
Episode: 34, Score: -36
Episode: 35, Score: -24
Episode: 36, Score: -40
Episode: 37, Score: -10
Episode: 38, Score: 12
Episode: 39, Score: -12
Episode: 40, Score: -24
Episode: 41, Score: 8
Episode: 42, Score: -48
Episode: 43

In [94]:
# TensorBoard event files are written beneath Training/Logs.
log_path = os.path.join('Training', 'Logs')
# Build a PPO agent with the 'MlpPolicy' policy around the shower environment.
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)
# Train for 20,000 timesteps; the bare call is the cell's last expression, so its return value is echoed.
model.learn(total_timesteps=20_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_4


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -28.8    |
| time/              |          |
|    fps             | 1883     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | -28.3       |
| time/                   |             |
|    fps                  | 1228        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010911289 |
|    clip_fraction        | 0.0916      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 8.21e-05    |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x19d2b593a90>

In [95]:
# Destination path for the trained model (Training/Saved Models/Shower_Model_PPO); passed to model.save below.
shower_path = os.path.join('Training', 'Saved Models', 'Shower_Model_PPO')

In [96]:
# Persist the trained PPO model to shower_path.
model.save(shower_path)



In [97]:
# Drop the in-memory model reference.
# NOTE(review): presumably the model is reloaded from shower_path in a later cell not visible here — confirm.
del model