In [1]:
from citylearn import  CityLearn
from pathlib import Path
import numpy as np
from tensorflow.keras.layers import Input, Dense, Lambda
import tensorflow as tf


# Choose the climate zone; all per-zone input files are resolved against its folder.
climate_zone = 1
data_path = Path("data/Climate_Zone_"+str(climate_zone))

# Options that do not depend on the zone folder.
building_state_actions = 'buildings_state_action_space.json'
objective_function = [
    'ramping',
    '1-load_factor',
    'average_daily_peak',
    'peak_demand',
    'net_electricity_consumption',
]
building_ids = [
    "Building_1", "Building_2", "Building_3",
    "Building_4", "Building_5", "Building_6",
    "Building_7", "Building_8", "Building_9",
]

# Input files located inside the selected zone folder.
solar_profile = data_path / 'solar_generation_1kW.csv'
weather_file = data_path / 'weather_data.csv'
building_attributes = data_path / 'building_attributes.json'

# Build the environment and read back its per-building state/action spaces.
env = CityLearn(
    data_path,
    building_attributes,
    weather_file,
    solar_profile,
    building_ids,
    buildings_states_actions = building_state_actions,
    cost_function = objective_function,
)
observations_spaces, actions_spaces = env.get_state_action_spaces()

# Building metadata: building type, climate zone, annual DHW demand, annual cooling demand,
# annual electricity demand, solar capacity, and correlations among buildings.
building_info = env.get_building_information()




In [3]:

# Module-level training constants shared by the cells below.
# gamma / update_interval: presumably the discount factor and the number of
# steps between network updates (inferred from the names -- TODO confirm).
gamma, update_interval = 0.99, 5
# Adam learning rates handed to the Actor and Critic optimizers respectively.
actor_lr, critic_lr = 5e-4, 1e-3

In [4]:
class Buffer:
    """Unbounded in-memory transition store with uniform random sampling.

    Items are stored exactly as given. ``sample`` reads positions 0-4 of each
    stored item as (state, action, reward, next_state, done).
    """

    def __init__(self):
        self.buffer = []

    def append_sample(self, sample):
        """Append one transition to the end of the store."""
        self.buffer.append(sample)

    def sample(self, sample_size):
        """Draw up to ``sample_size`` stored transitions without replacement.

        Args:
            sample_size: Requested number of transitions; capped at the number
                of transitions currently stored.

        Returns:
            Tuple ``(s, a, r, s_next, done)``. The first four are float32 torch
            tensors stacked over the drawn transitions, placed on the GPU when
            CUDA is available and on the CPU otherwise. ``done`` is a plain
            list holding the fifth entry of every drawn transition.
        """
        # Imported locally: the notebook's import cell provides neither module,
        # so the original method failed with NameError as soon as it was called.
        import random
        import torch

        batch = random.sample(self.buffer, min(sample_size, len(self.buffer)))

        # Fall back to the CPU instead of crashing on machines without CUDA.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        def column(index):
            # Stack through numpy first; building a tensor from a list of
            # arrays element by element is much slower.
            values = np.asarray([item[index] for item in batch], dtype=np.float32)
            return torch.as_tensor(values, device=device)

        # Return the stored termination flag (the original appended the literal
        # list [4] for every sample instead of values[4]).
        done = [item[4] for item in batch]
        return column(0), column(1), column(2), column(3), done

    def __len__(self):
        return len(self.buffer)

In [5]:
class Actor:
    """Gaussian policy network producing a mean and a std per action dimension.

    The optimizer uses the module-level ``actor_lr`` learning rate.
    """

    def __init__(self, state_dim, action_dim, action_bound, std_bound):
        """Store the dimensions/bounds and build the Keras model and optimizer.

        Args:
            state_dim: Length of the state vector fed to the network.
            action_dim: Number of action components.
            action_bound: Scale applied to the tanh mean output.
            std_bound: ``[min_std, max_std]`` used to clip the predicted std.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.std_bound = std_bound
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(actor_lr)

    def create_model(self):
        """Build a two-hidden-layer network with a mean head and a std head."""
        state_input = Input((self.state_dim,))
        dense_1 = Dense(32, activation='relu')(state_input)
        dense_2 = Dense(32, activation='relu')(dense_1)
        # tanh output scaled to [-action_bound, action_bound].
        out_mu = Dense(self.action_dim, activation='tanh')(dense_2)
        mu_output = Lambda(lambda x: x * self.action_bound)(out_mu)
        # softplus keeps the predicted std positive.
        std_output = Dense(self.action_dim, activation='softplus')(dense_2)
        return tf.keras.models.Model(state_input, [mu_output, std_output])

    def get_action(self, state):
        """Sample one action from the policy for a single state.

        Args:
            state: Sequence/array with ``state_dim`` entries.

        Returns:
            numpy array of length ``action_dim`` drawn from N(mu, std).
        """
        state = np.reshape(state, [1, self.state_dim]).astype(np.float32)
        # Call the model directly: Model.predict is meant for large batches and
        # adds heavy per-call overhead (plus progress output) when invoked once
        # per building per time-step.
        mu, std = self.model(state, training=False)
        mu = mu.numpy()[0]
        # Sample with the same clipped std that log_pdf uses, so the
        # log-probability in the loss matches the sampling distribution.
        std = np.clip(std.numpy()[0], self.std_bound[0], self.std_bound[1])
        return np.random.normal(mu, std, size=self.action_dim)

    def log_pdf(self, mu, std, action):
        """Log-density of ``action`` under N(mu, clip(std)), summed over axis 1.

        Returns:
            Tensor with one column (``keepdims=True``) per row of ``action``.
        """
        std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / \
            var - 0.5 * tf.math.log(var * 2 * np.pi)
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)

    def compute_loss(self, mu, std, actions, advantages):
        """Policy-gradient loss: negative sum of log-probability times advantage."""
        log_policy_pdf = self.log_pdf(mu, std, actions)
        loss_policy = log_policy_pdf * advantages
        return tf.reduce_sum(-loss_policy)

    def train(self, states, actions, advantages):
        """Apply one gradient step on the policy loss and return that loss."""
        with tf.GradientTape() as tape:
            mu, std = self.model(states, training=True)
            loss = self.compute_loss(mu, std, actions, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


In [6]:
class Critic:
    """Feed-forward network with one scalar output per state, fitted by MSE.

    The optimizer uses the module-level ``critic_lr`` learning rate.
    """

    def __init__(self, state_dim):
        """Remember the input width, then build the network and its optimizer."""
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(critic_lr)

    def create_model(self):
        """Stack ReLU layers of width 32, 32 and 16 followed by a linear unit."""
        stack = [Input((self.state_dim,))]
        stack.extend(Dense(width, activation='relu') for width in (32, 32, 16))
        stack.append(Dense(1, activation='linear'))
        return tf.keras.Sequential(stack)

    def compute_loss(self, v_pred, td_targets):
        """Mean squared error between the targets and the predictions."""
        return tf.keras.losses.MeanSquaredError()(td_targets, v_pred)

    def train(self, states, td_targets):
        """Run one optimizer step towards ``td_targets`` and return the loss.

        The prediction and target shapes must match exactly (asserted).
        """
        with tf.GradientTape() as tape:
            predicted = self.model(states, training=True)
            assert predicted.shape == td_targets.shape
            # Targets are treated as constants for differentiation.
            fixed_targets = tf.stop_gradient(td_targets)
            loss = self.compute_loss(predicted, fixed_targets)
        weights = self.model.trainable_variables
        gradients = tape.gradient(loss, weights)
        self.opt.apply_gradients(zip(gradients, weights))
        return loss

In [7]:
class RL_Agents_A2C:
    """Multi-building controller holding one A2C actor/critic pair per building.

    Each building gets its own ``Actor``, ``Critic`` and rollout ``Buffer``.
    Observations are min-max normalized with the building's observation space
    both when acting and when storing transitions. Every ``update_interval``
    stored transitions (or at the end of an episode) the building's networks
    are updated once from its rollout, which is then discarded (on-policy).
    """

    def __init__(self, building_info, observation_spaces = None, action_spaces = None):
        """Create the per-building networks and buffers.

        Args:
            building_info: Stored on the instance; not otherwise used here.
            observation_spaces: One space per building exposing ``shape``,
                ``low`` and ``high``. Required despite the ``None`` default.
            action_spaces: One space per building exposing ``shape`` and
                ``high``. Required despite the ``None`` default.
        """

        #Hyper-parameters
        self.discount = 0.992 #Discount factor
        # Number of stored transitions per building that triggers an update;
        # taken from the module-level hyper-parameter cell.
        self.update_interval = update_interval

        # Settings inherited from the earlier TD3 agent. They are kept as
        # attributes for backward compatibility but are not read by the A2C
        # update implemented in this class.
        self.batch_size = 100 #Size of each MINI-BATCH
        self.iterations = 1 # Number of updates of the actor-critic networks every time-step
        self.policy_freq = 2 # Number of iterations after which the actor and target networks are updated
        self.tau = 5e-3 #Rate at which the target networks are updated
        self.lr_init = 1e-3 #5e-2
        self.lr_final = 1e-3 #3e-3
        self.lr_decay_rate = 1/(78*8760)
        self.expl_noise_init = 0.75 # Exploration noise at time-step 0
        self.expl_noise_final = 0.01 # Magnitude of the minimum exploration noise
        self.expl_noise_decay_rate = 1/(290*8760)  # Decay rate of the exploration noise in 1/h
        self.policy_noise = 0.025*0
        self.noise_clip = 0.04*0
        self.max_action = 0.25
        self.min_samples_training = 400 #Min number of tuples that are stored in the batch before the training process begins

        # Parameters
        self.device = "cuda:0"
        self.time_step = 0
        self.building_info = building_info # Can be used to create different RL agents based on basic building attributes or climate zones
        self.observation_spaces = observation_spaces
        self.action_spaces = action_spaces
        self.n_buildings = len(observation_spaces)
        self.buffer = {i: Buffer() for i in range(self.n_buildings)}
        self.networks_initialized = False

        # Monitoring variables (one per agent). Only actor_loss_list and
        # critic1_loss_list are filled by the A2C update; the others are kept
        # so existing code that reads them keeps working.
        self.actor_loss_list = {i: [] for i in range(self.n_buildings)}
        self.critic1_loss_list = {i: [] for i in range(self.n_buildings)}
        self.critic2_loss_list = {i: [] for i in range(self.n_buildings)}
        self.q_val_list = {i: [] for i in range(self.n_buildings)}
        self.q1_list = {i: [] for i in range(self.n_buildings)}
        self.q2_list = {i: [] for i in range(self.n_buildings)}
        self.a_track1 = []
        self.a_track2 = []

        #Networks and optimizers (one per agent)
        self.actor, self.critic = {}, {}
        for i, (o, a) in enumerate(zip(observation_spaces, action_spaces)):
            # NOTE: these instance attributes are overwritten on every pass and
            # end up describing the LAST building only. Per-building values
            # live on self.actor[i] / self.critic[i].
            self.state_dim = o.shape[0]
            self.action_dim = a.shape[0]
            self.action_bound = a.high[0]
            self.std_bound = [1e-2, 1.0]

            self.actor[i] = Actor(self.state_dim, self.action_dim, self.action_bound, self.std_bound)
            self.critic[i] = Critic(self.state_dim)

    def _normalize(self, i, state):
        """Min-max scale ``state`` with building ``i``'s observation bounds."""
        space = self.observation_spaces[i]
        return (state - space.low)/(space.high - space.low + 0.00001)

    def select_action(self, states):
        """Sample one action per building from its own policy.

        Args:
            states: Sequence with one raw observation per building.

        Returns:
            List with one action array per building, clipped to that
            building's ``[-action_bound, action_bound]``.
        """
        actions = []
        for i, state in enumerate(states):
            actor = self.actor[i]
            # Feed the same normalized observation that is stored for training.
            action = actor.get_action(self._normalize(i, state))
            # Clip with this building's bound, not the last building's.
            actions.append(np.clip(action, -actor.action_bound, actor.action_bound))
        return actions

    def add_to_buffer(self, states, actions, rewards, next_states, dones):
        """Store one transition per building and update networks when due.

        Args:
            states: Raw observations the actions were chosen from.
            actions: Actions that were applied, one per building.
            rewards: One reward per building.
            next_states: Raw observations after the step, one per building.
            dones: Single episode-termination flag shared by all buildings.
        """
        # Information contained in the building_info variable can be used to choose the number of buffers and what information goes to each buffer

        dones = [dones for _ in range(self.n_buildings)]

        for i, (s, a, r, s_next, done) in enumerate(zip(states, actions, rewards, next_states, dones)):
            self.buffer[i].append_sample((self._normalize(i, s), a, r, self._normalize(i, s_next), done))

        # One A2C agent per building: update once its rollout is long enough,
        # or flush a shorter rollout when the episode has just ended.
        for i in range(self.n_buildings):
            transitions = self.buffer[i].buffer
            if not transitions:
                continue
            if len(transitions) >= self.update_interval or transitions[-1][4]:
                self._update_agent(i)

        self.time_step += 1

    def _update_agent(self, i):
        """Run one actor and one critic step for building ``i``, then clear its rollout."""
        transitions = self.buffer[i].buffer

        states = np.asarray([t[0] for t in transitions], dtype=np.float32)
        actions = np.asarray([t[1] for t in transitions], dtype=np.float32)
        rewards = np.asarray([t[2] for t in transitions], dtype=np.float32).reshape(-1, 1)
        next_states = np.asarray([t[3] for t in transitions], dtype=np.float32)
        # 0.0 for terminal transitions so that no value is bootstrapped past the episode end.
        not_done = 1.0 - np.asarray([float(t[4]) for t in transitions], dtype=np.float32).reshape(-1, 1)

        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)

        # Value estimates are computed outside any gradient tape, so both the
        # TD targets and the advantages act as constants in the two losses.
        value_model = self.critic[i].model
        values = value_model(states, training=False).numpy()
        next_values = value_model(next_states, training=False).numpy()

        td_targets = rewards + self.discount * not_done * next_values
        advantages = td_targets - values

        td_targets = tf.convert_to_tensor(td_targets, dtype=tf.float32)
        advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)

        actor_loss = self.actor[i].train(states, actions, advantages)
        critic_loss = self.critic[i].train(states, td_targets)

        # Store plain floats so the monitoring lists do not keep tensors alive.
        self.actor_loss_list[i].append(float(actor_loss))
        self.critic1_loss_list[i].append(float(critic_loss))

        # On-policy: the rollout has been consumed and must not be reused.
        transitions.clear()
   

In [8]:
# RL CONTROLLER
# Build the control agent(s) from the building metadata and the environment's
# state/action spaces.
agents = RL_Agents_A2C(
    building_info,
    observation_spaces=observations_spaces,
    action_spaces=actions_spaces,
)

In [None]:
# Number of training episodes. Use many while experimenting; the final run uses
# a single episode (the buildings run for one year).
episodes = 1

# k counts simulated hours over all episodes, c counts finished episodes.
k = 0
c = 0
cost = {}
cum_reward = {}

# The fixed episode count could be replaced by a stopping criterion
# (e.g. convergence of the average reward).
for e in range(episodes):
    state = env.reset()
    done = False
    rewards = []
    cum_reward[e] = 0

    while not done:
        # Progress message every 1000 simulated hours.
        if not k % 1000:
            print('hour: '+str(k)+' of '+str(8760*episodes))

        action = agents.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agents.add_to_buffer(state, action, reward, next_state, done)

        # Only the first entry of the reward is accumulated for the episode total.
        cum_reward[e] = cum_reward[e] + reward[0]
        rewards.append(reward)

        state = next_state
        k += 1

    cost[e] = env.cost()
    # Report the cost of every 20th episode.
    if not c % 20:
        print(cost[e])
    c += 1


In [None]:
env

In [None]:
env.action_space

In [None]:
env.action_spaces

In [10]:
state = env.reset()

In [11]:
state

array([array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
      dtype=float32),
       array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  2.36,  0.  ,
        0.  ], dtype=float32),
       array([ 1.  , 17.81, 25.29,  1.65,  0.  ], dtype=float32),
       array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.46,  0.  ],
      dtype=float32),
       array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
      dtype=float32),
       array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
      dtype=float32),
       array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
      dtype=float32),
       array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
      dtype=float32),
       array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
      dtype=float32)], dtype=object)

In [12]:
state[1]

array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  2.36,  0.  ,
        0.  ], dtype=float32)

In [13]:
states

NameError: name 'states' is not defined

In [15]:
actions

NameError: name 'actions' is not defined

In [18]:
action=agents.select_action(state)

In [19]:
env.step(action)

(array([array([ 2.        , 16.14      , 25.96      ,  0.        , 41.67      ,
         0.        , 65.46      ,  0.23965307,  0.33333333]),
        array([ 2.  , 16.14, 25.96,  0.  , 41.67,  0.  , 65.46,  1.91,  0.  ,
         0.  ]),
        array([ 2.  , 16.14, 25.96,  1.59,  0.  ]),
        array([ 2.        , 16.14      , 25.96      ,  0.        , 41.67      ,
         0.        , 65.46      ,  0.42      ,  0.33333334]),
        array([ 2.        , 16.14      , 25.96      ,  0.        , 41.67      ,
         0.        , 65.46      ,  0.26345017,  0.        ]),
        array([ 2.        , 16.14      , 25.96      ,  0.        , 41.67      ,
         0.        , 65.46      ,  0.        ,  0.32270916]),
        array([ 2.        , 16.14      , 25.96      ,  0.        , 41.67      ,
         0.        , 65.46      ,  0.        ,  0.10690593]),
        array([ 2.        , 16.14      , 25.96      ,  0.        , 41.67      ,
         0.        , 65.46      ,  0.28464492,  0.31884058]),
 

In [20]:
# Scratch check of the cost metrics. Judging by the execution counts this ran
# right after the single env.step() probe above rather than after a finished
# episode (TODO confirm); the recorded run ended in the AttributeError below.
env.cost()



AttributeError: 'list' object has no attribute 'max'

In [21]:
state[0]

array([ 1.  , 17.81, 25.29,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
      dtype=float32)

In [31]:
state_dim = observations_spaces[2].shape[0]

In [32]:
state_dim

5

In [39]:
# NOTE: rebinds `state` (until now the per-building observation array) to one
# building's observation reshaped to (1, state_dim). Re-running this cell, or
# indexing `state[2]` afterwards, fails because axis 0 now has size 1.
state=np.reshape(state[2], [1,state_dim])

In [40]:
state[2]

IndexError: index 2 is out of bounds for axis 0 with size 1

In [41]:
state_batch=[]

In [42]:
state_batch.append(state)

In [43]:
state_batch

[array([[ 1.  , 17.81, 25.29,  1.65,  0.  ]], dtype=float32)]

In [44]:
# NOTE: RL_Agents_A2C as defined in this notebook has no `list_to_batch`
# method, so this call raises the AttributeError recorded below.
states=agents.list_to_batch(state)

AttributeError: 'RL_Agents_A2C' object has no attribute 'list_to_batch'

In [2]:
state_batch=[]

In [3]:
test = [1,2]

In [8]:
# NOTE: needs `state_batch` to already hold a list at index 0. That list is
# only added by the `state_batch.append([])` cell placed below, which has the
# lower execution count (6 vs 8), so the cells are out of run order.
state_batch[0].append(test)

In [6]:
state_batch.append([])

In [9]:
state_batch

[[[1, 2], [1, 2]]]

In [12]:
# NOTE: the first argument is the scalar 9 (a single element), which cannot
# fill a (1, 9) shape -- hence the ValueError recorded below. Presumably a
# 9-element sequence was meant as the data argument (TODO confirm).
np.reshape(9,[1,9])

ValueError: cannot reshape array of size 1 into shape (1,9)

In [11]:
import numpy as np

In [13]:
state

NameError: name 'state' is not defined

In [15]:
actor_loss_list = {i: [] for i in range(9)}

In [18]:
actor_loss_list

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: []}

In [19]:
state = [1,2,3,4,5,6,7,8,9]

In [24]:
np.reshape(state,[9,1])

array([[1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [23]:
state

[1, 2, 3, 4, 5, 6, 7, 8, 9]