In [1]:
import gymnasium as gym
from gymnasium.spaces import Box
import numpy as np
import random
from stable_baselines3 import PPO

from TestEnv import HydroElectric_Test
import matplotlib.pyplot as plt

from DQQN import DQNWrapper
from reward_shaping import reward_shaping


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
class PPOWrapper(gym.Env):
    def __init__(self, env, reward_shape = True):
        super().__init__()
        self.env = env
        self.max_episode_length = len(self.env.price_values.flatten())
        # self.actions = np.linspace(-1.0, 1.0, num_actions)
        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.reward_shape = reward_shape

        # 5 discrete features
        self.observation_space = Box(low=0.0, high=1.0, shape=(5,), dtype=np.float32)
        self.action_history = []

    def normalize(self, state): 
        state = state.astype(np.float32) 
        state[0] /= self.env.max_volume 
        state[1] /= np.max(self.env.price_values) 
        state[2] = (state[2]-1) /23.0 
        state[3] /= 6.0 
        state[5] = (state[5]-1)/ 12.0 

        return np.array([state[0], state[1], state[2], state[3], state[5]],dtype=np.float32)


    def reset(self, seed=None, **kwargs):
        self.env.counter = 0
        self.env.hour = 1
        self.env.day = 1
        self.env.volume = self.env.max_volume / 2
        self.action_history = []
        # self.env.reset()
        obs = np.array(self.env.observation(), dtype=np.float32)
        return self.normalize(obs), {}
    

    def step(self, action):
        
        real_action = float(action[0])
        self.action_history.append(real_action)

        next_obs, reward, terminated, truncated, info = self.env.step(real_action)
        shaped_reward = reward

        if self.reward_shape: 
            shaped_reward = reward_shaping(self.env, reward, self.action_history)

        next_obs = self.normalize(np.array(next_obs, dtype=np.float32))

        return next_obs, shaped_reward, terminated, truncated, info

    def __getattr__(self, name):
        return getattr(self.env, name)


In [None]:
seed = 5
np.random.seed(seed)
random.seed(seed)

env = HydroElectric_Test("train.xlsx")
env = PPOWrapper(env, reward_shape=True)

model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    seed=seed
)

model.learn(total_timesteps=300_000,log_interval=100)
model.save("ppo_model")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.63e+04     |
|    ep_rew_mean          | -1.07e+04    |
| time/                   |              |
|    fps                  | 335          |
|    iterations           | 100          |
|    time_elapsed         | 610          |
|    total_timesteps      | 204800       |
| train/                  |              |
|    approx_kl            | 0.0005406586 |
|    clip_fraction        | 0.00645      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.3         |
|    explained_variance   | -5.25e-06    |
|    learning_rate        | 0.0003       |
|    loss                 | 0.463        |
|    n_updates            | 990          |
|    policy_gradient_loss | 0.000904     |
|    std                  | 0.897        |
|    value_loss           | 0.477        |
---

In [None]:
env_test = HydroElectric_Test("validate.xlsx")
env_test = PPOWrapper(env_test, reward_shape=False)

model = PPO.load("ppo_model")
obs, _ = env_test.reset()

done = False
water_levels = []
rewards = []
actions = []

while not done:
    action, _ = model.predict(obs, deterministic=True)
    
    obs, reward, terminated, truncated, _ = env_test.step(action)
    done = terminated or truncated
    
    water_levels.append(env.env.volume) 
    rewards.append(reward)
    actions.append(action[0])

print("Total reward:", round(sum(rewards),2))
print("Mean reward:", round(np.mean(rewards),3))


Total reward: 47.07
Mean reward: 0.003


In [6]:
print(len(set(water_levels)))
# print(set(actions))
print(len(set(rewards)))

1
4


{np.float64(0.0),
 np.float64(12.1855012483932),
 np.float64(17.401972921567204),
 np.float64(17.481433985136)}