In [1]:
# !pip install stable-baselines3[extra]

In [2]:
# !pip install gymnasium


In [3]:
# !pip install tqdm

In [4]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from tqdm import tqdm

In [5]:
# Notebook-wide configuration.
# The first six values are forwarded by name as keyword arguments to
# gym.make('solar_plant_gym_env/SolarPlant', ...), so the identifiers
# (including their spelling) must match what the environment expects.
# NOTE(review): units are not visible from this notebook — presumably
# time_step is in minutes and epi_days in days; confirm against the env.
time_step = 15
epi_days = 1
battry_cap = 10_000
battrry_charge_rate = 40_000
out_power_const = 11_000
data_file = 'Plant_2_Generation_Data_cleaned.csv'

# Render mode used only when the test environment is created.
render_mode = "human"

In [6]:
import solar_plant_gym_env
from stable_baselines3.ppo.policies import MlpPolicy
# from solar_plant_gym_env.envs.solar_plant import SolarPlant

# Hyperparameters.
# NOTE(review): only `n_episodes` is consumed below (as the statistics buffer
# length). `learning_rate` and the epsilon-* values are never passed to PPO,
# which is constructed with its own defaults — confirm whether they should be
# wired in or removed.
learning_rate = 0.001
n_episodes = 1000
start_epsilon = 2.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.01

# Build the single training environment (no render_mode, so no rendering).
train_env = gym.make(
    'solar_plant_gym_env/SolarPlant',
    epi_days=epi_days,
    time_step=time_step,
    battry_cap=battry_cap,
    battrry_charge_rate=battrry_charge_rate,
    out_power_const=out_power_const,
    data_file=data_file,
)
train_env = gym.wrappers.RecordEpisodeStatistics(train_env, buffer_length=n_episodes)

# Wrap it in a one-element VecEnv. The factory closes over `train_env`, a name
# that is never rebound; the previous `lambda: env` captured the very name that
# was being reassigned to the VecEnv, so it only worked because the factory is
# invoked eagerly inside the DummyVecEnv constructor.
env = DummyVecEnv([lambda: train_env])

# PPO with the default MLP policy; TensorBoard event files go under Training/Logs/.
model = PPO(MlpPolicy, env, verbose=1, tensorboard_log="Training/Logs/")

print("action space :",env.action_space,"obs space :",env.observation_space)

  logger.warn(


Using cpu device
action space : Discrete(3) obs space : Box(0.0, 1.0, (3,), float32)


In [7]:
from stable_baselines3.common.base_class import BaseAlgorithm


def evaluate(
    model: "BaseAlgorithm",
    num_episodes: int = 100,
    deterministic: bool = True,
) -> float:
    """
    Evaluate an RL agent for `num_episodes` on the environment attached to it.

    The environment is obtained from ``model.get_env()`` and must be a
    vectorized environment that wraps exactly one sub-environment.

    :param model: the RL Agent (must have an environment attached)
    :param num_episodes: number of episodes to evaluate it (must be >= 1)
    :param deterministic: Whether to use deterministic or stochastic actions
    :return: Mean episode reward over the `num_episodes` evaluated episodes
    :raises ValueError: if `num_episodes` is smaller than 1, if the model has
        no environment attached, or if the environment wraps more than one
        sub-environment
    """
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be >= 1, got {num_episodes}")

    vec_env = model.get_env()
    if vec_env is None:
        raise ValueError("model has no environment attached; cannot evaluate")
    # The loop below tracks a single episode at a time, so reject multi-env
    # VecEnvs explicitly instead of failing on an ambiguous array truth value.
    if getattr(vec_env, "num_envs", 1) != 1:
        raise ValueError("evaluate() only supports a single environment")

    obs = vec_env.reset()
    episode_returns = []
    for _ in range(num_episodes):
        episode_return = 0.0
        done = False
        # Note: SB3 VecEnv resets automatically:
        # https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api
        # so `obs` already belongs to the next episode once `done` is set.
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs, deterministic=deterministic)
            # rewards and dones are length-1 arrays because the env is
            # vectorized; unwrap them to plain Python scalars.
            obs, rewards, dones, _infos = vec_env.step(action)
            episode_return += float(np.asarray(rewards).reshape(-1)[0])
            done = bool(np.asarray(dones).reshape(-1)[0])

        episode_returns.append(episode_return)

    mean_episode_reward = float(np.mean(episode_returns))
    print(f"Mean reward: {mean_episode_reward:.2f} - Num episodes: {num_episodes}")

    return mean_episode_reward

In [8]:
# Baseline: score the PPO model before any call to model.learn(), so the
# post-training result has something to be compared against.
# NOTE(review): deterministic=True is forwarded to model.predict, so this
# measures the untrained policy's deterministic actions rather than uniformly
# random ones, despite the "random agent" label.
mean_reward_before_train = evaluate(model, num_episodes=100, deterministic=True)

  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Mean reward: -9461.08 - Num episodes: 100


In [9]:
from stable_baselines3.common.evaluation import evaluate_policy

# Cross-check the untrained baseline with SB3's built-in evaluator on the same
# vectorized env: 100 episodes, returning the mean and standard deviation of
# the episode reward.
# warn=False is passed to silence evaluate_policy's own warning (per the SB3
# docs this is the missing-Monitor-wrapper warning — confirm).
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, warn=False)

print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward: -8257.97 +/- 5915.70


# Train and Evaluate


In [27]:

from stable_baselines3.common.callbacks import BaseCallback

class TensorboardCallback(BaseCallback):
    """
    Training callback that pushes an extra environment attribute to the logger.

    On every callback step it reads the ``info_df`` attribute from the model's
    vectorized environment and records it under the key ``"random_value"``.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        # NOTE(review): get_attr is called on the VecEnv, so the result is
        # presumably a list with one `info_df` per sub-environment rather than
        # a scalar; verify that the configured logger outputs can display it,
        # and consider a more descriptive key than "random_value".
        training_env = self.model.get_env()
        info_frames = training_env.get_attr("info_df")
        self.logger.record("random_value", info_frames)
        # Always report success so the callback never interrupts training.
        return True

In [28]:
# Train the agent for 10000 steps.
# A fresh TensorboardCallback instance is passed in so that its _on_step hook
# runs during learning; logs are written under the tensorboard_log directory
# given when the model was constructed (Training/Logs/).
model.learn(total_timesteps=10_000 , callback=TensorboardCallback())

Logging to Training/Logs/PPO_3


In [11]:
# Evaluate the trained agent with SB3's evaluator on the training VecEnv:
# 100 episodes, returning the mean and standard deviation of the episode
# reward. Compare against the pre-training numbers printed earlier.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:-1471.77 +/- 5044.17


# Save and Reload Model


In [12]:
import os
# Location (without extension) where the trained PPO model is saved and later
# reloaded from; built with os.path.join so the separator matches the host OS.
PPO_path = os.path.join("Training", "Saved Models", "PPO_model")

In [13]:
# Persist the trained model to PPO_path so it can be reloaded later.
model.save(PPO_path)

In [14]:
# Drop the in-memory model binding; the next cell rebinds `model` from the
# saved file, presumably to demonstrate that the save/load round trip works.
del model

In [15]:
# Restore the saved PPO model from PPO_path and attach the training VecEnv.
model = PPO.load(PPO_path, env=env)

# Test Model


In [16]:
# Number of rendered roll-outs to run with the trained policy.
n_epi_test = 10

# Build the test environment with the same settings as the training
# environment, plus a render mode. `time_step` is passed explicitly here so
# that the test episodes use the same step size the policy was trained with,
# instead of silently falling back to the environment's default.
single_test_env = gym.make(
    'solar_plant_gym_env/SolarPlant',
    render_mode=render_mode,
    epi_days=epi_days,
    time_step=time_step,
    battry_cap=battry_cap,
    battrry_charge_rate=battrry_charge_rate,
    out_power_const=out_power_const,
    data_file=data_file,
)
single_test_env = gym.wrappers.RecordEpisodeStatistics(single_test_env, buffer_length=n_episodes)

# The factory closes over `single_test_env`, which is never rebound; the
# previous `lambda: test_env` captured the name that was being reassigned to
# the VecEnv on the same line.
test_env = DummyVecEnv([lambda: single_test_env])

  logger.warn(


In [17]:
# Reload the saved PPO model once more, this time attaching the rendering
# test VecEnv so that subsequent predictions are paired with `test_env`.
model = PPO.load(PPO_path, env=test_env)

In [18]:
# Roll out `n_epi_test` episodes with the reloaded policy on the test
# environment and print the accumulated reward of each episode.
for episode in tqdm(range(n_epi_test)):
    obs = test_env.reset()
    done = False
    # `reward` is a length-1 array (vectorized env), so `score` becomes one too.
    score = 0

    while not done:
        # Render the environment that is actually being stepped. The previous
        # code rendered `env`, the training environment, which was created
        # without a render_mode.
        test_env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        score += reward
    print(f'Episode: {episode}, Score: {score}')

  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
 10%|█         | 1/10 [00:03<00:27,  3.10s/it]

Episode: 0, Score: [-8135.2266]


 20%|██        | 2/10 [00:05<00:20,  2.61s/it]

Episode: 1, Score: [-9218.526]


 30%|███       | 3/10 [00:07<00:17,  2.52s/it]

Episode: 2, Score: [-10356.958]


 40%|████      | 4/10 [00:09<00:14,  2.36s/it]

Episode: 3, Score: [-7840.201]


 50%|█████     | 5/10 [00:24<00:34,  6.86s/it]

Episode: 4, Score: [-3750.2202]


 60%|██████    | 6/10 [00:27<00:21,  5.31s/it]

Episode: 5, Score: [-8463.136]


 70%|███████   | 7/10 [00:29<00:12,  4.28s/it]

Episode: 6, Score: [-4869.1343]


 80%|████████  | 8/10 [00:31<00:07,  3.67s/it]

Episode: 7, Score: [-8173.788]


 90%|█████████ | 9/10 [00:33<00:03,  3.23s/it]

Episode: 8, Score: [-9072.059]


100%|██████████| 10/10 [00:36<00:00,  3.62s/it]

Episode: 9, Score: [-4698.9326]





In [19]:
# Directory of the TensorBoard run to inspect.
# NOTE(review): the run name is hard-coded to PPO_2, while the training cell
# reported logging to Training/Logs/PPO_3 and the stored output of this cell
# shows PPO_1 — confirm which run is intended before launching TensorBoard.
training_log_path = os.path.join("Training", "Logs", "PPO_2")
training_log_path

'Training\\Logs\\PPO_1'

In [21]:
# tensorboard --logdir={training_log_path}

^C
