# Simulations with reinforcement learning

In [2]:
import sinergym
from sinergym.utils.callbacks import LoggerEvalCallback
from sinergym.utils.rewards import *
from datetime import datetime
import gym

import numpy as np


from sinergym.utils.wrappers import LoggerWrapper, NormalizeObservation
from sinergym.utils.constants import RANGES_5ZONE

from stable_baselines3 import DQN, DDPG, PPO, A2C, SAC, TD3


from stable_baselines3.common.callbacks import CallbackList
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.logger import configure

from math import exp
from typing import Any, Dict, List, Tuple, Union
from gym import Env


In [3]:
"""Implementation of reward functions."""


class BaseReward(object):

    def __init__(self, env):
        """
        Base reward class.

        All reward functions should inherit from this class.

        Args:
            env (Env): Gym environment.
        """
        self.env = env

    def __call__(self):
        """Method for calculating the reward function."""
        raise NotImplementedError(
            "Reward class must have a `__call__` method.")


class MyLinearReward(BaseReward):

    def __init__(
        self,
        env: Env,
        temperature_variable: Union[str, list],
        energy_variable: str,
        range_comfort_winter: Tuple[int, int],
        range_comfort_summer: Tuple[int, int],
        summer_start: Tuple[int, int] = (6, 1),
        summer_final: Tuple[int, int] = (9, 30),
        energy_weight: float = 0.6,
        #energy_weight: float = 1,
        lambda_energy: float = 0.003,
        #lambda_energy: float = 1,
        lambda_temperature: float = 50
    ):
        """
        Linear reward function.

        It considers the energy consumption and the absolute difference to temperature comfort.

        .. math::
            R = - W * lambda_E * power - (1 - W) * lambda_T * (max(T - T_{low}, 0) + max(T_{up} - T, 0))

        Args:
            env (Env): Gym environment.
            temperature_variable (Union[str, list]): Name(s) of the temperature variable(s).
            energy_variable (str): Name of the energy/power variable.
            range_comfort_winter (Tuple[int,int]): Temperature comfort range for cold season. Depends on environment you are using.
            range_comfort_summer (Tuple[int,int]): Temperature comfort range for hot season. Depends on environment you are using.
            summer_start (Tuple[int,int]): Summer session tuple with month and day start. Defaults to (6,1).
            summer_final (Tuple[int,int]): Summer session tuple with month and day end. defaults to (9,30).
            energy_weight (float, optional): Weight given to the energy term. Defaults to 0.5.
            lambda_energy (float, optional): Constant for removing dimensions from power(1/W). Defaults to 1e-4.
            lambda_temperature (float, optional): Constant for removing dimensions from temperature(1/C). Defaults to 1.0.
        """

        super(MyLinearReward, self).__init__(env)

        # Name of the variables
        self.temp_name = temperature_variable
        self.energy_name = energy_variable

        # Reward parameters
        self.range_comfort_winter = range_comfort_winter
        self.range_comfort_summer = range_comfort_summer
        self.W_energy = energy_weight
        self.lambda_energy = lambda_energy
        self.lambda_temp = lambda_temperature

        # Summer period
        self.summer_start = summer_start  # (month,day)
        self.summer_final = summer_final  # (month,day)

    def __call__(self) -> Tuple[float, Dict[str, Any]]:
        """
        Calculate the reward function.

        Returns:
            Tuple[float, Dict[str, Any]]: Reward value and dictionary with their individual components.
        """
        # Current observation
        obs_dict = self.env.obs_dict.copy()

        # Energy term
        #reward_energy = - self.lambda_energy * obs_dict[self.energy_name]
        reward_energy = - (self.lambda_energy * obs_dict[self.energy_name])

        # Comfort
        comfort, temps = self._get_comfort(obs_dict)

        if comfort == 0:
            reward_comfort = 5
        else:
            reward_comfort = - self.lambda_temp * comfort
        # Weighted sum of both terms
        reward = self.W_energy * reward_energy + \
            (1.0 - self.W_energy) * reward_comfort

        reward_terms = {
            'reward_energy': reward_energy,
            'total_energy': obs_dict[self.energy_name],
            'reward_comfort': reward_comfort,
            'abs_comfort': comfort,
            'temperatures': temps
        }

        return reward, reward_terms

    def _get_comfort(self,
                     obs_dict: Dict[str,
                                    Any]) -> Tuple[float,
                                                   List[float]]:
        """Calculate the comfort term of the reward.

        Returns:
            Tuple[float, List[float]]: comfort penalty and List with temperatures used.
        """

        hour = obs_dict["hour"]
        month = obs_dict['month']
        day = obs_dict['day']
        year = obs_dict['year']
        current_dt = datetime(year, month, day)

        # Periods
        summer_start_date = datetime(
            year,
            self.summer_start[0],
            self.summer_start[1])
        summer_final_date = datetime(
            year,
            self.summer_final[0],
            self.summer_final[1])

        if current_dt >= summer_start_date and current_dt <= summer_final_date:
            if hour not in range(8,19):
                temp_range = (15,30)
            else:
                temp_range = self.range_comfort_summer 
        else:
            if hour not in range(8,19):
                temp_range = (15,30)
            else:
                temp_range = self.range_comfort_winter


        temps = [v for k, v in obs_dict.items() if k in self.temp_name]
        comfort = 0.0
        for T in temps:
            if T < temp_range[0] or T > temp_range[1]:
                comfort += min(abs(temp_range[0] - T), abs(T - temp_range[1]))
  
 

        return comfort, temps


class MyExpReward(MyLinearReward):

    def __init__(
        self,
        env: Env,
        temperature_variable: Union[str, list],
        energy_variable: str,
        range_comfort_winter: Tuple[int, int],
        range_comfort_summer: Tuple[int, int],
        summer_start: Tuple[int, int] = (6, 1),
        summer_final: Tuple[int, int] = (9, 30),
        #changes from 0.5 to 0.7
        energy_weight: float = 1,
        lambda_energy: float = 1e-4,
        #lambda_energy: float = 1,
        lambda_temperature: float = 1
    ):
        """
        Reward considering exponential absolute difference to temperature comfort.

        .. math::
            R = - W * lambda_E * power - (1 - W) * lambda_T * exp( (max(T - T_{low}, 0) + max(T_{up} - T, 0)) )

        Args:
            env (Env): Gym environment.
            temperature_variable (Union[str, list]): Name(s) of the temperature variable(s).
            energy_variable (str): Name of the energy/power variable.
            range_comfort_winter (Tuple[int,int]): Temperature comfort range for cold season. Depends on environment you are using.
            range_comfort_summer (Tuple[int,int]): Temperature comfort range for hot season. Depends on environment you are using.
            summer_start (Tuple[int,int]): Summer session tuple with month and day start. Defaults to (6,1).
            summer_final (Tuple[int,int]): Summer session tuple with month and day end. defaults to (9,30).
            energy_weight (float, optional): Weight given to the energy term. Defaults to 0.5.
            lambda_energy (float, optional): Constant for removing dimensions from power(1/W). Defaults to 1e-4.
            lambda_temperature (float, optional): Constant for removing dimensions from temperature(1/C). Defaults to 1.0.
        """

        super(MyExpReward, self).__init__(
            env,
            temperature_variable,
            energy_variable,
            range_comfort_winter,
            range_comfort_summer,
            summer_start,
            summer_final,
            energy_weight,
            lambda_energy,
            lambda_temperature
        )

    def _get_comfort(self,
                     obs_dict: Dict[str,
                                    Any]) -> Tuple[float,
                                                   List[float]]:
        """Calculate the comfort term of the reward.

        Returns:
            Tuple[float, List[float]]: comfort penalty and List with temperatures used.
        """

        hour = obs_dict["hour"]
        month = obs_dict['month']
        day = obs_dict['day']
        year = obs_dict['year']
        current_dt = datetime(year, month, day)

        # Periods
        summer_start_date = datetime(
            year,
            self.summer_start[0],
            self.summer_start[1])
        summer_final_date = datetime(
            year,
            self.summer_final[0],
            self.summer_final[1])

        # if current_dt >= summer_start_date and current_dt <= summer_final_date:
        #     temp_range = self.range_comfort_summer 
        # else:
        #     temp_range = self.range_comfort_winter
        if current_dt >= summer_start_date and current_dt <= summer_final_date:
            if current_dt.weekday() >= 5 or hour not in range(8,19):
                temp_range = (15,30)
            else:
                temp_range = self.range_comfort_summer 
        else:
            if current_dt.weekday() >= 5 or hour not in range(8,19):
                temp_range = (15,30)
            else:
                temp_range = self.range_comfort_winter


        temps = [v for k, v in obs_dict.items() if k in self.temp_name]
        comfort = 0.0
        for T in temps:
            if T < temp_range[0] or T > temp_range[1]:
                comfort += exp(min(abs(temp_range[0] - T),
                                   abs(T - temp_range[1])))

            # else:
            #     comfort -= 5

        return comfort, temps


class MyHourlyExpReward(MyExpReward):

    def __init__(
        self,
        env: Env,
        temperature_variable: Union[str, list],
        energy_variable: str,
        # range_comfort_winter: Tuple[int, int],
        # range_comfort_summer: Tuple[int, int],
        range_comfort_winter = (20,23),
        range_comfort_summer = (23,26),
        summer_start: Tuple[int, int] = (6, 1),
        summer_final: Tuple[int, int] = (9, 30),
        min_energy_weight: float = 0.6,
        #default energy lambda = 1
        lambda_energy: float = 0.001,
        lambda_temperature: float = 1,
        range_comfort_hours: tuple = (8, 19)
    ):
        """
        Linear reward function with a time-dependent weight for consumption and energy terms.

        Args:
            env (Env): Gym environment.
            temperature_variable (Union[str, list]): Name(s) of the temperature variable(s).
            energy_variable (str): Name of the energy/power variable.
            range_comfort_winter (Tuple[int,int]): Temperature comfort range for cold season. Depends on environment you are using.
            range_comfort_summer (Tuple[int,int]): Temperature comfort range for hot season. Depends on environment you are using.
            summer_start (Tuple[int,int]): Summer session tuple with month and day start. Defaults to (6,1).
            summer_final (Tuple[int,int]): Summer session tuple with month and day end. defaults to (9,30).
            min_energy_weight (float, optional): Minimum weight given to the energy term. Defaults to 0.5.
            lambda_energy (float, optional): Constant for removing dimensions from power(1/W). Defaults to 1e-4.
            lambda_temperature (float, optional): Constant for removing dimensions from temperature(1/C). Defaults to 1.0.
            range_comfort_hours (tuple, optional): Hours where thermal comfort is considered. Defaults to (9, 19).
        """

        super(MyHourlyExpReward, self).__init__(
            env,
            temperature_variable,
            energy_variable,
            range_comfort_winter,
            range_comfort_summer,
            summer_start,
            summer_final,
            min_energy_weight,
            lambda_energy,
            lambda_temperature
        )



        # Reward parameters
        self.range_comfort_hours = range_comfort_hours

    def __call__(self) -> Tuple[float, Dict[str, Any]]:
        """Calculate the reward function.

        Returns:
            Tuple[float, Dict[str, Any]]: Reward and dict with reward terms.
            """
        # Current observation
        obs_dict = self.env.obs_dict.copy()

        # Energy term
        #reward_energy = - self.lambda_energy * obs_dict[self.energy_name]
        reward_energy = - self.lambda_energy * obs_dict['Facility Total HVAC Electricity Demand Rate(Whole Building)']
        # Comfort
        comfort, temps = self._get_comfort(obs_dict)

        if comfort == 0:
            reward_comfort = 5
        else:   
            reward_comfort = - self.lambda_temp * comfort

        # Determine energy weight depending on the hour
        hour = obs_dict['hour']
        if hour >= self.range_comfort_hours[0] and hour <= self.range_comfort_hours[1]:
            weight = self.W_energy
        else:
            weight = 1


        # Weighted sum of both terms
        reward = weight * reward_energy + (1.0 - weight) * reward_comfort

        reward_terms = {
            'reward_energy': reward_energy,
            'total_energy': obs_dict[self.energy_name],
            'reward_comfort': reward_comfort,
            'temperatures': temps
        }

        return reward, reward_terms

class MyHourlyLinearReward(MyLinearReward):

    def __init__(
        self,
        env: Env,
        temperature_variable: Union[str, list],
        energy_variable: str,
        range_comfort_winter: Tuple[int, int],
        range_comfort_summer: Tuple[int, int],
        summer_start: Tuple[int, int] = (6, 1),
        summer_final: Tuple[int, int] = (9, 30),
        min_energy_weight: float = 0.5,
        lambda_energy: float = 0.005,
        lambda_temperature: float = 100,
        range_comfort_hours: tuple = (8, 19),
    ):
        """
        Linear reward function with a time-dependent weight for consumption and energy terms.

        Args:
            env (Env): Gym environment.
            temperature_variable (Union[str, list]): Name(s) of the temperature variable(s).
            energy_variable (str): Name of the energy/power variable.
            range_comfort_winter (Tuple[int,int]): Temperature comfort range for cold season. Depends on environment you are using.
            range_comfort_summer (Tuple[int,int]): Temperature comfort range for hot season. Depends on environment you are using.
            summer_start (Tuple[int,int]): Summer session tuple with month and day start. Defaults to (6,1).
            summer_final (Tuple[int,int]): Summer session tuple with month and day end. defaults to (9,30).
            min_energy_weight (float, optional): Minimum weight given to the energy term. Defaults to 0.5.
            lambda_energy (float, optional): Constant for removing dimensions from power(1/W). Defaults to 1e-4.
            lambda_temperature (float, optional): Constant for removing dimensions from temperature(1/C). Defaults to 1.0.
            range_comfort_hours (tuple, optional): Hours where thermal comfort is considered. Defaults to (9, 19).
        """

        super(MyHourlyLinearReward, self).__init__(
            env,
            temperature_variable,
            energy_variable,
            range_comfort_winter,
            range_comfort_summer,
            summer_start,
            summer_final,
            min_energy_weight,
            lambda_energy,
            lambda_temperature
        )

        # Reward parameters
        self.range_comfort_hours = range_comfort_hours

    def __call__(self) -> Tuple[float, Dict[str, Any]]:
        """Calculate the reward function.

        Returns:
            Tuple[float, Dict[str, Any]]: Reward and dict with reward terms.
            """
        # Current observation
        obs_dict = self.env.obs_dict.copy()

        # Energy term
        reward_energy = - self.lambda_energy * obs_dict[self.energy_name]

        # Comfort
        comfort, temps = self._get_comfort(obs_dict)
        reward_comfort = - self.lambda_temp * comfort

        # Determine energy weight depending on the hour
        hour = obs_dict['hour']
        if hour >= self.range_comfort_hours[0] and hour <= self.range_comfort_hours[1]:
            weight = self.W_energy
        else:
            weight = 1.0

        # Weighted sum of both terms
        reward = weight * reward_energy + (1.0 - weight) * reward_comfort

        reward_terms = {
            'reward_energy': reward_energy,
            'total_energy': obs_dict[self.energy_name],
            'reward_comfort': reward_comfort,
            'temperatures': temps
        }

        return reward, reward_terms



In [16]:
environment = "Eplus-5Zone-hot-discrete-v1"
weather = "USA_NY_New.York-J.F.Kennedy.Intl.AP.744860_TMY3.epw"

episodes = 1

bins = np.linspace(15.125, 30.125, 61)

#choose the simulation period
begin_day = 1
begin_month = 1
begin_year = 2022
end_day = 1
end_month = 2
end_year = 2022

# register run name
# name = F"{environment}-episodes_{episodes}({experiment_date})"


# Set to one month only to reduce running time
extra_params={'timesteps_per_hour' : 4,
              'runperiod' : (begin_day,begin_month,begin_year,end_day,end_month,end_year)}

new_observation_variables=[
    'Site Outdoor Air Drybulb Temperature(Environment)',
    'Zone Air Temperature(SPACE1-1)',
    'Facility Total HVAC Electricity Demand Rate(Whole Building)',
  #  'bins' == np.digitize(obs["ZoneZone Air Temperature(SPACE1-1) "], bins)
 ]

new_observation_space = gym.spaces.Box(
    low=-5e6,
    high=5e6,
    shape=(len(new_observation_variables) + 4,),
    dtype=np.float32)

new_action_variables = [
    'Heating_Setpoint_RL',
    'Cooling_Setpoint_RL',
]

new_action_mapping = {
    0: (15, 30),
    1: (16, 29),
    2: (17, 28),
    3: (18, 27),
    4: (19, 26),
    5: (20, 25),
    6: (21, 24),
    7: (22, 23),
    8: (22, 22),
    9: (21, 21)
}

new_action_space = gym.spaces.Discrete(10)

env = gym.make(environment, 
                weather_file = weather,
                reward = MyHourlyExpReward, 
                config_params = extra_params,
                observation_variables = new_observation_variables,
                observation_space = new_observation_space,
                action_variables=new_action_variables,
                action_mapping=new_action_mapping,
                action_space=new_action_space
                )

#env = LoggerWrapper(NormalizeObservation(env, ranges = RANGES_5ZONE))
env = CSVLogger(env)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [11]:
for i in range(1):
    obs = env.reset()
    rewards = []
    done = False
    current_month = 0
    while not done:
        a = env.action_space.sample()
        obs, reward, done, info = env.step(a)
        rewards.append(reward)
        if info['month'] != current_month:  # display results every month
            current_month = info['month']
            print('Reward: ', sum(rewards), info)
        if current_month == 2:
            done = True

[2023-02-08 08:52:48,441] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:Creating new EnergyPlus simulation episode...
[2023-02-08 08:52:48,441] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:Creating new EnergyPlus simulation episode...
[2023-02-08 08:52:48,441] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:Creating new EnergyPlus simulation episode...
[2023-02-08 08:52:48,441] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:Creating new EnergyPlus simulation episode...
[2023-02-08 08:52:48,779] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:EnergyPlus working directory is in /workspaces/sinergym/examples/Eplus-env-5Zone-hot-discrete-v1-res10/Eplus-env-sub_run1
[2023-02-08 08:52:48,779] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:EnergyPlus working directory is in /workspaces/sinergym/examples/Eplus-env-5Zone-hot-discrete-v1-res10/Eplus-env-sub_run1
[2023-02-08 08:52:48,779] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:EnergyPlus working 

In [12]:
env.close()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


[2023-02-08 08:54:51,296] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:EnergyPlus simulation closed successfully. 
[2023-02-08 08:54:51,296] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:EnergyPlus simulation closed successfully. 
[2023-02-08 08:54:51,296] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:EnergyPlus simulation closed successfully. 
[2023-02-08 08:54:51,296] EPLUS_ENV_5Zone-hot-discrete-v1_MainThread_ROOT INFO:EnergyPlus simulation closed successfully. 


In [None]:
#Getting the state space
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

In [40]:
env.observation_space("hour")

TypeError: 'Box' object is not callable

In [None]:
#discrete observation space size (make bins)
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

In [None]:
print(DISCRETE_OS_SIZE)

In [None]:
q_table = np.zeros(shape=DISCRETE_OS_SIZE+[env.action_space.n])

In [None]:
print(len(q_table[0]))

In [None]:
# env = gym.make("MountainCar-v0")
# env.reset()

# done = False

# while not done:
#     action = 2
#     new_state, reward, done, _ = env.step(action)
#     env.render()
    
# env.close()






In [None]:
n_actions

In [None]:
hours = np.arange(0, 24, 1)
temp = np.arange(15, 30, 0.1)
temp

In [None]:
#Initialize the Q-table to 0
# Q_table = np.zeros(len(hours),len(temp))
Q_table = np.zeros(n_obsservations,len(temp))
print(Q_table)

In [None]:

#number of episode we will run
n_episodes = 10000

#maximum of iteration per episode
max_iter_episode = 100

#initialize the exploration probability to 1
exploration_proba = 1

#exploartion decreasing decay for exponential decreasing
exploration_decreasing_decay = 0.001

# minimum of exploration proba
min_exploration_proba = 0.01

#discounted factor
gamma = 0.99

#learning rate
lr = 0.1

In [None]:
total_rewards_episode = list()

In [None]:
#we iterate over episodes
for e in range(n_episodes):
    #we initialize the first state of the episode
    current_state = env.reset()
    done = False
    
    #sum the rewards that the agent gets from the environment
    total_episode_reward = 0
    
    for i in range(max_iter_episode): 
        # we sample a float from a uniform distribution over 0 and 1
        # if the sampled float is less than the exploration probability
        #     the agent selects a random action
        # else
        #     he exploits his knowledge using the bellman equation 
        
        if np.random.uniform(0,1) < exploration_proba:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[current_state,:])
        
        # The environment runs the chosen action and returns
        # the next state, a reward and true if the episode is ended.
        next_state, reward, done, _ = env.step(action)
        
        # We update our Q-table using the Q-learning iteration
        Q_table[current_state, action] = (1-lr) * Q_table[current_state, action] +lr*(reward + gamma*max(Q_table[next_state,:]))
        total_episode_reward = total_episode_reward + reward
        # If the episode is finished, we leave the for loop
        if done:
            break
        current_state = next_state
    #We update the exploration proba using exponential decay formula 
    exploration_proba = max(min_exploration_proba, np.exp(-exploration_decreasing_decay*e))
    total_rewards_episode.append(total_episode_reward)

In [None]:
print("Mean reward per thousand episodes")
for i in range(10):
    print((i+1)*1000,": mean espiode reward: ",\
           np.mean(total_rewards_episode[1000*i:1000*(i+1)]))

make 2 observation variables discrete.
For example:
Outside temp


In [None]:
import sinergym
from sinergym.utils.callbacks import LoggerEvalCallback
from sinergym.utils.rewards import *
from sinergym.utils.wrappers import LoggerWrapper
from datetime import datetime
import gym
from stable_baselines3 import DQN, DDPG, PPO, A2C, SAC, TD3 

from stable_baselines3.common.callbacks import CallbackList
from stable_baselines3.common.vec_env import DummyVecEnv
import numpy as np


environment  = "Eplus-5Zone-hot-continuous-v1"
weather = "USA_NY_New.York-J.F.Kennedy.Intl.AP.744860_TMY3.epw"

episodes = 3
experiment_date = datetime.today().strftime('%Y-%m-%d %H:%M')

#choose the simulation period
begin_day = 1
begin_month = 1
begin_year = 2022
end_day = 1
end_month = 2
end_year = 2022

# register run name
name = F"{environment}-episodes_{episodes}({experiment_date})"


# Set to one month only to reduce running time
extra_params={'timesteps_per_hour' : 4,
              'runperiod' : (begin_day,begin_month,begin_year,end_day,end_month,end_year)}

new_observation_variables=[
    'Site Outdoor Air Drybulb Temperature(Environment)',
    'Site Diffuse Solar Radiation Rate per Area(Environment)',
    'Site Direct Solar Radiation Rate per Area(Environment)',
    'Zone Thermostat Heating Setpoint Temperature(SPACE1-1)',
    'Zone Thermostat Cooling Setpoint Temperature(SPACE1-1)',
    'Zone Air Temperature(SPACE1-1)',
    'Zone People Occupant Count(SPACE1-1)',
    'Facility Total HVAC Electricity Demand Rate(Whole Building)']

new_observation_space = gym.spaces.Box(
    low=-5e6,
    high=5e6,
    shape=(len(new_observation_variables) + 4,),
    dtype=np.float32)


env = gym.make(environment, 
                weather_file = weather,
                reward = ExpReward, 
                config_params = extra_params,
                observation_variables = new_observation_variables,
                observation_space = new_observation_space,
                reward_kwargs={
                  'temperature_variable': 'Zone Air Temperature (SPACE1-1)',
                    'energy_variable': 'Facility Total HVAC Electricity Demand Rate(Whole Building)',
                    'range_comfort_winter': (20.0, 23.5),
                    'range_comfort_summer': (23.0, 26.0),
                    'energy_weight': 0.5 })


env = LoggerWrapper(env)


model = PPO('MlpPolicy', env, verbose=1,
        learning_rate = 0.001)

n_timesteps_episode = env.simulator._eplus_one_epi_len / \
                      env.simulator._eplus_run_stepsize

env_vec = DummyVecEnv([lambda: env])

callbacks = []

# Set up Evaluation and saving best model
eval_callback = LoggerEvalCallback(
    env_vec,
    best_model_save_path='best_model/' + name + '/',
    log_path='best_model/' + name + '/',
    eval_freq=n_timesteps_episode * 2,
    deterministic=True,
    render=False,
    n_eval_episodes=2)
callbacks.append(eval_callback)

callback = CallbackList(callbacks)

timesteps = episodes * n_timesteps_episode

model.learn(
    total_timesteps=timesteps,
    callback=callback,
    log_interval=1)

model.save(env.simulator._env_working_dir_parent + '/' + name)

env.close()


