In [1]:
import gym
import numpy as np

from stable_baselines3 import TD3
from stable_baselines3.td3.policies import MlpPolicy
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.callbacks import BaseCallback
from enviroment import ADPSimEnv
import torch

In [2]:
class CustomCallback(BaseCallback):
    """
    Callback that prints training progress to stdout.

    At the start of every rollout it prints the simulation temperature, and
    whenever ``num_timesteps`` is a multiple of ``print_interval`` it prints
    the environment's ``info_dict``. Both values are read from the first
    sub-environment of the model's vectorized env (``self.model.env.envs[0]``).

    :param print_interval: (int) Number of timesteps between two ``info_dict``
        printouts. A falsy value (``0`` or ``None``) disables the periodic printout.
    :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
    """
    def __init__(self, print_interval=100, verbose=0):
        super().__init__(verbose)
        # Attributes defined by the base class and available in every hook:
        #   self.model         - the RL model
        #   self.training_env  - alias for self.model.get_env()
        #   self.n_calls       - number of times the callback was called
        #   self.num_timesteps - number of timesteps seen so far
        #   self.locals / self.globals - local and global variables
        #   self.logger        - logger used to report things in the terminal
        #   self.parent        - parent object (useful for event callbacks)
        self.print_interval = print_interval

    def _first_env(self):
        """
        Return the first sub-environment of the model's vectorized env.

        NOTE(review): relies on the vec env exposing ``.envs`` (the captured
        output shows a DummyVecEnv wrapper) - confirm before switching to
        another VecEnv implementation.
        """
        return self.model.env.envs[0]

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        pass

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interaction
        using the current policy.
        This event is triggered before collecting new samples.

        Prints the simulation temperature at that moment.
        """
        temp = self._first_env().sim.temperature
        print(f'Step #{self.num_timesteps} | Initial temp:{temp}')

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        For child callback (of an `EventCallback`), this will be called
        when the event is triggered.

        :return: (bool) If the callback returns False, training is aborted early.
        """
        # A falsy interval disables printing and keeps the modulo below from
        # raising ZeroDivisionError.
        if self.print_interval and self.num_timesteps % self.print_interval == 0:
            info_dict = self._first_env().info_dict
            val_str = ''.join(f'{k}:{v} | ' for k, v in info_dict.items())
            print(f'Step #{self.num_timesteps} | {val_str}')
        return True

    def _on_rollout_end(self) -> None:
        """
        This event is triggered before updating the policy.
        """
        pass

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the `learn()` method.
        """
        pass


In [3]:
# Instantiate the simulation environment (ADPSimEnv from the `enviroment` module)
# with its default constructor arguments.
env = ADPSimEnv()

In [4]:
# Progress-printing callback; no arguments are passed, so the constructor
# defaults (print_interval=100, verbose=0) apply.
callback = CustomCallback()

In [5]:
# Exploration noise added to the policy's actions: zero-mean normal noise with
# standard deviation 0.1 on each action dimension.
n_actions = env.action_space.shape[-1]
noise_mean = np.zeros(n_actions)
noise_sigma = np.full(n_actions, 0.1)
action_noise = NormalActionNoise(mean=noise_mean, sigma=noise_sigma)
# Alternative kept for reference (not active):
#   OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

In [6]:
# Fixed seed so that a "Restart Kernel -> Run All" repeats the same training
# run. It is handed to SB3 as the seed for its pseudo-random generators.
# NOTE(review): runs on a CUDA device may still not be bit-for-bit identical.
SEED = 0

# TD3 agent with an MLP policy acting on `env`; `action_noise` supplies the
# exploration noise and verbose=2 selects the most detailed logging level.
model = TD3(MlpPolicy,
            env,
            action_noise=action_noise,
            seed=SEED,
            verbose=2)

Using cuda device
Wrapping the env in a DummyVecEnv.


In [7]:
# Training budget, expressed as (timesteps per episode, number of episodes).
# NOTE(review): episode_duration presumably mirrors the environment's own
# episode length - confirm against ADPSimEnv.
episode_duration, n_episodes = 1000, 100

In [8]:
# Total number of environment steps to train for: steps per episode times
# number of episodes. Progress is reported through `callback`.
total_timesteps = episode_duration * n_episodes
model.learn(total_timesteps=total_timesteps, callback=callback, log_interval=1)

Step #0 | Initial temp:237.34752774686578 K
Step #100 | rg:0.2780815972386482 | temperature:338.0284138809252 K | reward:-0.2780815972386482 | 
Step #200 | rg:0.29688847840036203 | temperature:95.74708338238456 K | reward:-0.29688847840036203 | 
Step #300 | rg:0.2984239232453418 | temperature:1 K | reward:-0.2984239232453418 | 
Step #400 | rg:0.297824586401384 | temperature:1 K | reward:-0.297824586401384 | 
Step #500 | rg:0.29846568123843586 | temperature:1 K | reward:-0.29846568123843586 | 
Step #600 | rg:0.29799650910759934 | temperature:1 K | reward:-0.29799650910759934 | 
Step #700 | rg:0.29787144694065004 | temperature:1 K | reward:-0.29787144694065004 | 
Step #800 | rg:0.2980123558454276 | temperature:1 K | reward:-0.2980123558454276 | 
Step #900 | rg:0.29820605702753944 | temperature:1 K | reward:-0.29820605702753944 | 
Step #1000 | rg:0.2980556637950882 | temperature:1 K | reward:-0.2980556637950882 | 
-----------------------------
| time/              |      |
|    episodes  

KeyboardInterrupt: 

In [None]:
# Persist the trained agent under the path "ADP".
model_path = "ADP"
model.save(model_path)