In [1]:
import os
os.chdir('/content/drive/MyDrive/research/power system/power_system_centralized_704')

In [2]:
import numpy as np
from torch.distributions import Categorical, Normal

from memory import Memory
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from util import *
from copy import deepcopy

In [3]:
def fanin_init(size, fanin=None):
    fanin = fanin or size[0]
    v = 1. / np.sqrt(fanin)
    return torch.Tensor(size).uniform_(-v, v)
class Actor(nn.Module):
    def __init__(self, nb_states, nb_actions, hidden1=128, hidden2=128, init_w=3e-3):
        super(Actor, self).__init__()
        #self.constraints = constraints
        self.nb_actions = nb_actions
        self.fc1 = nn.Linear(nb_states, hidden1)
        #self.fc2 = nn.Linear(hidden1, hidden2)
        self.mu_layer = nn.Linear(hidden1,nb_actions) #mean
        self.sigma_layer = nn.Linear(hidden1,nb_actions)# variance
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.softplus = nn.Softplus()
        self.sigmoid = nn.Sigmoid()
        self.init_weights(init_w)
        #self.fc = nn.Linear(nb_states,nb_actions)
    def init_weights(self, init_w):
        self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())
        #self.fc2.weight.data = fanin_init(self.fc2.weight.data.size())
        self.mu_layer.weight.data.uniform_(-init_w, init_w)
        self.sigma_layer.weight.data.uniform_(-init_w, init_w)
    def forward(self,x):
        out = self.fc1(x)
        out = self.relu(out)
        #out = self.fc2(out)
        #out = self.relu(out)
        mu = self.mu_layer(out)
        sigma = self.sigma_layer(out)
        #out = self.fc3(out)
        #out = self.fc(x)
        mu = self.tanh(mu)#[-1,1]
        sigma = self.softplus(sigma)+ 0.001
        return mu,sigma
    def select_action(self,x):
        mu,sigma_sq = self.forward(x)
        mu = mu.squeeze(0)
        sigma_sq = sigma_sq.squeeze(0)
        #mu = to_numpy(mu)
        #sigma_sq = to_numpy(sigma_sq)
        sigma = torch.sqrt(sigma_sq)
        action = to_tensor(np.random.normal(size = (self.nb_actions,)))
        action = mu + sigma*action
        action = torch.maximum(torch.minimum(torch.tensor(1.0),action),torch.tensor(-1.0))
        prob = 1/torch.sqrt(2*torch.pi*sigma_sq)*torch.exp(-torch.square(action-mu)/(2.0*sigma_sq))
        log_prob = torch.log(prob).sum()
        action = to_numpy(action)
        return action,log_prob
class Critic(nn.Module):
    def __init__(self,nb_states,nb_actions,hidden1 = 128,hidden2 = 128,init_w = 3e-3):
        super(Critic, self).__init__()
       # self.constraints = constraints
        self.fc1 = nn.Linear(nb_states, hidden1)
        #self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden1, 1) 
        self.relu = nn.ReLU()
        self.init_weights(init_w)
        #self.fc = nn.Linear(nb_states+nb_actions,1)
    def init_weights(self, init_w):
        self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())
        #self.fc2.weight.data = fanin_init(self.fc2.weight.data.size())
        self.fc3.weight.data.uniform_(-init_w, init_w)
    def forward(self,x):
        out = self.fc1(x)
        out = self.relu(out)
        #out = self.fc2(out)
        #out = self.relu(out)
        out = self.fc3(out)
        #out = self.fc(x)
        return out

In [4]:
criterion = nn.MSELoss()
class AC(object):
    def __init__(self, nb_interval,nb_states,nb_actions, constraints, args):
        if args.seed > 0:
            self.seed(args.seed)
        self.constraints = constraints
        self.nb_interval = nb_interval
        self.nb_states = nb_states
        self.nb_actions = nb_actions
        assert self.nb_states.shape[0] == self.nb_interval
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_w':args.init_w
        }

        self.actor_dict = dict()

        self.actor_optim_dict = dict()

        self.critic_dict = dict()

        self.critic_optim_dict = dict()

        #self.memory_dict = dict()

        for i in range(self.nb_interval):

            self.actor_dict[i] = Actor(self.nb_states[i],self.nb_actions, **net_cfg)

            self.actor_optim_dict[i] = Adam(self.actor_dict[i].parameters(), lr=args.prate)
            

            self.critic_dict[i] = Critic(self.nb_states[i], self.nb_actions, **net_cfg)
            self.critic_optim_dict[i] = Adam(self.critic_dict[i].parameters(), lr=args.rate)



            
            #Create replay buffer
            #self.memory_dict[i] = Memory(limit=args.rmsize)


        #self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount


        # 

        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.is_training = True

        # 
        if USE_CUDA: self.cuda()

    def update_policy(self,state,action,log_prob,reward,next_state,time_interval):
        #============Update Critic======================
        value = self.critic_dict[time_interval](to_tensor(np.array([state])))
        with torch.no_grad():
          if time_interval == self.nb_interval -1:
            target = torch.tensor(np.array([reward.astype(np.float32)])).unsqueeze(0).cuda()
          else:
            target = self.critic_dict[time_interval+1](to_tensor(np.array([next_state]))) + reward
          advantage = target - value
        self.critic_dict[time_interval].zero_grad()
        try:
          loss_critic = criterion(target,value)
        except TypeError:
          print(type(value))
          print(type(target))
        loss_critic.backward()
        self.critic_optim_dict[time_interval].step()

        #==============Update Actor==================
        self.actor_dict[time_interval].zero_grad()
        loss_actor = -advantage*log_prob
        loss_actor.backward()
        self.actor_optim_dict[time_interval].step()

        

    def cuda(self):
        for i in range(self.nb_interval):
            self.actor_dict[i].cuda()
            self.critic_dict[i].cuda()



    def select_action(self, s_t,time_interval):
        action,log_prob = self.actor_dict[time_interval].select_action(to_tensor(np.array([s_t])))     
        return action,log_prob

    def seed(self,s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
    def initial_memory(self):
      for i in range(self.nb_interval):
        self.memory_dict[i] = Memory(limit=args.rmsize)

In [5]:
def rescale_action(action,lb,ub):
    return action * (ub- lb) / 2.0 +\
            (lb + ub) / 2.0
class EnvironmentNode1(object):
    def __init__(self, constraints,cost_func,load,nb_interval,gaussian_noise):
        self.constraints = constraints
        self.cost_func = cost_func
        self.load = load
        self.nb_interval = nb_interval
        self.gaussian_noise = gaussian_noise
        self.normal_scale = constraints['normal_scale']
        self.reset()
    def reset(self):
        self.battery = 0.0
        self.observation =  np.array([self.battery*self.constraints['battery_scale'],\
            self.load[0]*self.constraints['load_scale']])
        self.state = np.array([self.battery,\
            self.load[0]])
        self.time_interval = 0
    def get_obs(self):
        return self.observation,self.time_interval
    def reward(self,state,true_action,time_interval):
        if type(time_interval) != int:
            print(time_interval) 
        a_p,a_c= true_action
        
        prod_cost = self.cost_func(a_p)
        """
        chargingMismatchCost = -self.constraints['penaltyCoef']*np.abs(effective_c-a_c)
        productionMismatchCost = -self.constraints['penaltyCoef']*np.abs(effective_p-a_p)
        penalty = chargingMismatchCost + productionMismatchCost
        """
        return prod_cost
    def make_action(self,action,t = None):
        if t is None:
            t = self.time_interval
        a_c = action.item() #production, node 1 charging
        load_t = self.load[t]
        if self.gaussian_noise:
            a_c += np.random.normal(scale = self.normal_scale) 
            a_c = np.clip(a_c,-1,1)


        a_c = rescale_action(a_c,-self.constraints['max_charge'],self.constraints['max_charge'])
              
        
        lb_c = np.maximum(-self.battery,-load_t)
        ub_c = np.minimum(self.constraints['max_energy']-self.battery, self.constraints['max_prod']-load_t)
        
        effective_c = np.minimum(np.maximum(a_c,lb_c),ub_c)

        a_p = effective_c + load_t

        #effective_p = np.minimum(np.maximum(a_p,0.0),self.constraints['max_prod'])

        return np.array([a_p,effective_c])
    def scale_decay(self):
        self.normal_scale *= self.constraints['decay_rate']
    def step(self,action):
        action_copy = deepcopy(action)
        true_action = self.make_action(action)
        costProduction = self.reward(self.state,true_action,self.time_interval)
        costProduction*= self.constraints['reward_scale']
        #violationPenalty*= self.constraints['reward_scale']
        if self.time_interval == self.nb_interval -1:
            self.reset()
            return self.observation,costProduction,action_copy,self.time_interval
        else:
            a_p,a_c = true_action
            self.battery+= a_c
            self.time_interval += 1

            self.state = np.array([self.battery,self.load[self.time_interval]])

            self.observation =  np.array([self.battery*self.constraints['battery_scale'],\
                self.load[self.time_interval]*self.constraints['load_scale']])
            
            return self.observation,costProduction,action_copy,self.time_interval

In [6]:
class EnvironmentNode2(object):
    def __init__(self, constraints,elFunc,nodalPrice,nb_interval,gaussian_noise):
        self.constraints = constraints
        self.elFunc = elFunc
        self.nodalPrice = nodalPrice
        self.nb_interval = nb_interval
        self.gaussian_noise = gaussian_noise
        self.normal_scale = constraints['normal_scale']
        self.impedance = 0.001
        self.reset()
    def reset(self):
        self.battery = 0.0
        self.observation =  np.array([self.battery*self.constraints['battery_scale'],\
            self.nodalPrice[0]*self.constraints['mc_scale']])
        self.state = np.array([self.battery,\
            self.nodalPrice[0]])
        self.time_interval = 0
    def get_obs(self):
        return self.observation,self.time_interval
    def reward(self,state,true_action,time_interval):
        if type(time_interval) != int:
            print(time_interval) 
        a_t,a_c,a_el = true_action
        
        elUtility = self.elFunc(a_el)
        
        transmission_fee = self.nodalPrice[time_interval]*a_t

        
        return elUtility, transmission_fee
    def make_action(self,action,t = None):
        if t is None:
            t = self.time_interval
        a_t,a_c = action #transmission,charging,elastic load
        
        if self.gaussian_noise:
            a_t += np.random.normal(scale = self.normal_scale) 
            a_t = np.clip(a_t,-1,1)
            a_c += np.random.normal(scale = self.normal_scale) 
            a_c = np.clip(a_c,-1,1)

        
        max_trans = self.constraints['max_angle']/self.impedance
        
        a_t = rescale_action(a_t,lb = 0.0, ub= max_trans)
        a_c = rescale_action(a_c,lb = -self.constraints['max_charge'],ub = self.constraints['max_charge'])
        
     
        lb_c = -self.battery
        ub_c = np.minimum(self.constraints['max_energy']-self.battery, a_t)
        
        effective_c = np.minimum(np.maximum(a_c,lb_c),ub_c)

        a_el = a_t - effective_c

        return np.array([a_t,effective_c,a_el])
    def scale_decay(self):
        self.normal_scale *= self.constraints['decay_rate']
    def step(self,action):
        action_copy = deepcopy(action)
        true_action = self.make_action(action)
        elUtility,transmission_fee = self.reward(self.state,true_action,self.time_interval)
        elUtility*= self.constraints['reward_scale']
        transmission_fee*= self.constraints['reward_scale']
        if self.time_interval == self.nb_interval -1:
            self.reset()
            return self.observation,elUtility,transmission_fee,action_copy,self.time_interval
        else:
            a_t,a_c,a_el = true_action
            self.battery+= a_c
            self.time_interval += 1

            self.state = np.array([self.battery,self.nodalPrice[self.time_interval]])

            self.observation =  np.array([self.battery*self.constraints['battery_scale'],\
                self.nodalPrice[self.time_interval]*self.constraints['mc_scale']])
            
            return self.observation,elUtility,transmission_fee,action_copy,self.time_interval

In [7]:
def trainingNode1():
  step = 0
  observation,time_interval = envNode1.get_obs()
  while step < args.train_iter:
    action,log_prob = agent1.select_action(observation,time_interval)
    new_observation,cP,action,new_time_interval = envNode1.step(action)
    agent1.update_policy(observation,action,log_prob,cP,new_observation,time_interval)
    observation = new_observation
    time_interval = new_time_interval
    step += 1

In [8]:
def trainingNode2():
  step = 0
  observation,time_interval = envNode2.get_obs()
  while step < args.train_iter:
    action,log_prob = agent2.select_action(observation,time_interval)
    new_observation,eLU,tF,action,new_time_interval = envNode2.step(action)
    agent2.update_policy(observation,action,log_prob,eLU+tF,new_observation,time_interval)
    observation = new_observation
    time_interval = new_time_interval
    step += 1

In [9]:
import logging
import sys
import argparse
import datetime

In [10]:
constraints = dict()
constraints['max_prod'] = 600
constraints['max_charge'] = 200
constraints['max_energy'] = 280
constraints['max_angle'] = 0.08
constraints['load_scale'] = 0.005
constraints['mc_scale'] = 0.05
constraints['battery_scale'] = 0.01
constraints['reward_scale'] = 0.001
constraints['normal_scale'] = 0.05
constraints['decay_rate'] = 0.99
constraints['penaltyCoef'] = 100.0

In [11]:
def cost_func(x):
  return -(20+2*x+0.02*x**2)
def elastic_load_func(x):
  return -0.02*x**2 + 12*x +5
def mc_func(x):
  return -0.04*x-2

 
impedance = 0.001
startup = 30
nb_interval = 8
nb_states_1 = np.array([1+1 for x in range(nb_interval)])
nb_states_2 = np.array([1+1 for x in range(nb_interval)])
nb_actions1 = 1
nb_actions2 = 2
load_1 = np.array([100,80,70,75,150,180,220,200]).astype(np.float32)

In [18]:
parser = argparse.ArgumentParser(description='Power system DDPG')

parser.add_argument('--hidden1', default=128, type=int, help='hidden num of first fully connect layer')
parser.add_argument('--hidden2', default=256, type=int, help='hidden num of second fully connect layer')
parser.add_argument('--rate', default=0.001, type=float, help='learning rate')
parser.add_argument('--actorDecay', default=0.999, type=float, help='gamma for actor optimization schedular')
parser.add_argument('--criticDecay', default=0.9999, type=float, help='gamma for critic optimization schedular')
parser.add_argument('--prate', default=0.0001, type=float, help='policy net learning rate (only for DDPG)')
parser.add_argument('--warmup', default=1500, type=int, help='time without training but only filling the replay memory')
parser.add_argument('--discount', default=1., type=float, help='')
parser.add_argument('--bsize', default=128, type=int, help='minibatch size')
parser.add_argument('--rmsize', default=50000, type=int, help='memory size')
parser.add_argument('--tau', default=0.3, type=float, help='moving average for target network')
parser.add_argument('--init_w', default=0.03, type=float, help='') 
parser.add_argument('--train_iter', default=2000, type=int, help='train iters each timestep')
parser.add_argument('--seed', default=123, type=int, help='')
args = parser.parse_args("")



In [19]:
np.random.seed(13)

In [20]:
randomness = False #no gaussian noise if False
agent1 = AC(nb_interval,nb_states_1,nb_actions1, constraints, args)
agent2 = AC(nb_interval,nb_states_2,nb_actions2, constraints, args)

In [21]:
envNode1 =  EnvironmentNode1(constraints,cost_func,load_1,nb_interval,gaussian_noise = randomness)
envNode2 = EnvironmentNode2(constraints,elastic_load_func,np.zeros(nb_interval),nb_interval,gaussian_noise = randomness)

In [22]:
#transmissionVolume = np.array([69.27,75.937,79.27,77.604,52.604,42.604,29.27,35.937])
transmissionVolume = np.zeros(nb_interval)
nodalPrice = np.zeros(nb_interval)
#nodalPrice = np.ones(nb_interval)*(-9.68748)

In [23]:
epoch =0
while epoch < 300:
  #agent1.initial_memory
  #agent2.initial_memory
  envNode1.load = load_1 + transmissionVolume
  envNode2.nodalPrice = nodalPrice
  trainingNode1()
  trainingNode2()
  with torch.no_grad():
    totalUtility = 0.0
    totalReturn = 0.0
    actionNode1_list = []
    actionNode2_list = []
    envNode1.reset()
    envNode2.reset()
    observationNode1,time_interval = envNode1.get_obs()
    observationNode2,_ = envNode2.get_obs()
    for timeStep in range(nb_interval):
      actionNode1,logProbNode1 = agent1.select_action(observationNode1,time_interval)
      actionNode2,logProbNode2 = agent2.select_action(observationNode2,time_interval)
      observationNode1,cP,actionNode1,time_interval = envNode1.step(actionNode1)
      observationNode2,eLU,tF,actionNode2,_ = envNode2.step(actionNode2)
      actionNode1_list.append(actionNode1)
      actionNode2_list.append(actionNode2)
      totalUtility +=  eLU
      totalReturn += cP
    actionNode1_list = np.vstack(actionNode1_list)
    actionNode2_list = np.vstack(actionNode2_list)
    productionArray = rescale_action(actionNode1_list[:,0], 0.0, constraints['max_prod'])
    mcArray = mc_func(productionArray)
    transmissionArray = rescale_action(actionNode2_list[:,0],0.0,constraints['max_angle']/impedance)
    
    nodalPrice = 0.95*nodalPrice + 0.05*mcArray
    transmissionVolume = 0.95*transmissionVolume + 0.05*transmissionArray
    print("=============EPOCH {}===============".format(epoch))
    print("Utility is ",totalUtility)
    print("Cost is ",totalReturn)
    print("nodalPrice discrepancy is ",np.linalg.norm(mcArray-nodalPrice))
    print("transmission discrepancy is ",np.linalg.norm(transmissionArray-transmissionVolume))
  epoch += 1





Utility is  3.8507830253821522
Cost is  -8.918429164307526
nodalPrice discrepancy is  51.440452828225446
transmission discrepancy is  134.2662103720378
Utility is  4.099086187845349
Cost is  -7.0092988107409635
nodalPrice discrepancy is  32.15071614544823
transmission discrepancy is  145.44356556997462
Utility is  3.7485421845118556
Cost is  -5.943706607995619
nodalPrice discrepancy is  19.836577310730743
transmission discrepancy is  116.82039810596177
Utility is  3.59036062966727
Cost is  -8.21795896375049
nodalPrice discrepancy is  27.804915773629954
transmission discrepancy is  116.02477157408244
Utility is  5.656069921035893
Cost is  -7.157187969531993
nodalPrice discrepancy is  31.80613287803263
transmission discrepancy is  161.10803129000777
Utility is  4.2651022156030685
Cost is  -9.712568207081786
nodalPrice discrepancy is  31.62874408599149
transmission discrepancy is  121.63399874229144
Utility is  5.289658954213559
Cost is  -6.529961105578013
nodalPrice discrepancy is  9.399

KeyboardInterrupt: ignored