In [3]:
import gym
import itertools
import numpy as np
import os
import random
import sys
# import psutil
import tensorflow as tf
import multiprocessing
GAME = 'Pendulum-v0'
env = gym.make(GAME)

In [7]:
N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]
A_BOUND = [env.action_space.low, env.action_space.high]
ENTROPY_BETA= 0.01

In [9]:
class ACNet():
    def __init__(self,scope,globalAC = None):
        if scope == 'globalNet':
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S],name = 's')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32,[None,N_S],name = 's')
                self.a_his = tf.placeholder(tf.float32,[None,N_A],name = 'a_his')
                self.v_target = tf.placeholder(tf.float32,[None,1],name = 'v_target')
                mu, sigma,self.v, self.a_params, self.c_params = self._build_net(scope)
                td = tf.subtract(self.v_target,self.v)
                
                with tf.name_scope('c_loss'):
                    self.c_loss = tf.reduce_mean(tf.square(td))
                    
                with tf.name_scope('wrap_a_out'):
                    mu ,sigma = mu*ACTION_BOUND[1] ,sigma + 1e-8
                
                norm_dist = tf.distributions.Normal(mu,sigma)
                with tf.name_scope('a_loss'):
                    a_prob = norm_dist.log_prob(self.a_his)
                    self.exp_v = a_prob * tf.stop_gradient(td)
                    entropy = norm_dist.entropy()
                    self.exp_v = self.exp_v + entropy*ENTROPY_BETA
                    self.a_loss = tf.reduce_mean(-self.exp_v)
                
                with tf.name_scope('choose_a'):
                    self.a_choose = tf.clip_by_value(tf.squeeze(norm_dist.sample(1)),ACTION_BOUND[0],ACTION_BOUND[1])
                
                with tf.name_scope('local_grad'):
                    self.a_grad = tf.gradients(self.a_loss, self.a_params)
                    self.c_grad = tf.gradients(self.c_loss, self.c_params)
                
                with tf.name_scope('sync'):
                    with tf.name_scope('push'):
                        self.update_a_op = OPT_A.apply_gradients(zip(self.a_grad,globalAC.a_params))
                        self.update_c_op = OPT_C.apply_gradients(zip(self.c_grad,globalAC.c_params))
                    with tf.name_scope('pull'):
                        self.pull_to_a_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params,globalAC.a_params)]
                        self.pull_to_c_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params,globalAC.c_params)]
    def _build_net(self,scope):
        w_init = tf.random_normal_initializer(0.,0.1)
        with tf.variable_scope('actor_net'):
            l_a = tf.layers.dense(self.s,10,activation = tf.nn.relu,kernel_initializer = w_init,name = 'la')
            mu = tf.layers.dense(l_a,N_A,activation = tf.nn.tanh, kernel_initializer = w_init,name = 'mu')
            sigma = tf.layers.dense(l1,N_A,activation = tf.nn.softmax, kernel_initializer= w_init,name = 'sigma')
        with tf.variable_scope('critic_net'):
            l_c = tf.layers.dense(self.s,10,activation = tf.nn.relu,kernel_initializer = w_init,name = 'lc')
            v = tf.layers.dense(l_c,1,activation = tf.nn.relu, kernel_initializer= w_init, name = 'v')
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope = scope + '/actor_net')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope = scope + '/critic_net')
        return mu, sigma, v, a_params, c_params
    
    def update_global(self,feed_dict):
        SESS.run([self.update_a_op,self.update_c_op],feed_dict)
    
    def pull_global(self):
        SESS.run([self.pull_to_a_op,self.pull_to_c_op])
    
    def choose_action(self,s):
        s = s[np.newaxis,:]
        SESS.run(self.a_choose,{self.s:s})

In [10]:
MAX_GLOBAL_EP = 2000
MAX_EP_STEP = 200
UPDATE_GLOBAL_ITER = 10
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
GAMMA = 0.9

In [12]:
class Worker():
    def __init__(self,name,globalAC):
        self.name = name
        self.env = env
        self.AC = ACNet(self.name,globalAC)
    def work():
        global GLOBAL_EP
        buffer_s, buffer_a, buffer_r = [],[],[]
        total_step = 0
        while total_step < MAX_GLOBAL_EP and not coord.should_stop():
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                a = self.AC.choose_action(s)
                s_,r,done,info = self.env.step(a)
                done = True if ep_t == MAX_EP_STEP-1 else False
                ep_r += r
                
                buffer_s.append(s_)
                buffer_a.append(a)
                buffer.r.append(r)
                
                if total_step % UPDATE_GLOBAL_ITER ==0 or done:
                    if done:
                        v_s_ = 0
                    else:
                        v_s_ = SESS.run(self.AC.v,{self.AC.s:s_[np.newaxis,:]})[0,0]
                    buffer_v_target = []
                    for item in r[::-1]:
                        v_s_ += item + GAMMA*v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()
                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a),np.stack(buffer_v_target)
                    feed_dict = {
                        self.ACNet.s:buffer_s,
                        self.ACNet.a_his:buffer_a,
                        self.ACNet.v_target:buffer_v_target
                    }
                    self.ACNet.update_global(feed_dict)
                    buffer_s ,buffer_a,buffer_r = [],[],[]
                    self.ACNet.pull_global()
                    
                s = s_
                total_step +=1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                          )
                    GLOBAL_EP += 1
                    break

In [20]:
SESS = tf.Session()
NUM_WORKERS = multiprocessing.cpu_count()
with tf.device('/cpu:0'):
    OPT_A = tf.train.RMSPropOptimizer(LR_A,name = 'RMS_A')
    OPT_C = tf.train.RMSPropOptimizer(LR_C,name = 'RMS_C')
    GlobalNet = ACNet('global_net')
    workers = []
    for i in range(NUM_WORKERS):
        name = 'worker_{0}'.format(i)
        workers.append(Worker(name,GlobalNet))
COORD = tf.coordinator()
threads = []
for worker in workers:
    job = lambda: worker.work()
    t = threading.Thread(target = job)
    threads.append(t)
COORD.join(threads)

plt.plot(np.arange(len(GLOBAL_RECORDING_R)),GLOBAL_RECORDING_R)
plt.xlabel('step')
plt.ylabel('consum reward')
plt.show()

NameError: name 'LR_A' is not defined

In [11]:
a = tf.random.normal(shape=(3, 3))
a = tf.Variable(a)
with tf.GradientTape() as tape:
    c = tf.square(a)
    dc_da = tape.gradient(c, a)
    print(dc_da)
dc_da.numpy()

tf.Tensor(
[[-0.40345725  0.72177875  1.5020591 ]
 [-0.4344124  -2.2236702  -0.25438276]
 [-0.60460037 -4.4565606   2.029144  ]], shape=(3, 3), dtype=float32)


array([[-0.40345725,  0.72177875,  1.5020591 ],
       [-0.4344124 , -2.2236702 , -0.25438276],
       [-0.60460037, -4.4565606 ,  2.029144  ]], dtype=float32)

In [14]:
with tf.GradientTape() as outer_tape:
  with tf.GradientTape() as tape:
    c = tf.sqrt(tf.square(a))
    dc_da = tape.gradient(c, a)
  d2c_da2 = outer_tape.gradient(dc_da, a)
  print(d2c_da2)

tf.Tensor(
[[ 0.0000000e+00  0.0000000e+00 -2.3841858e-07]
 [ 4.7683716e-07  0.0000000e+00  4.7683716e-07]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00]], shape=(3, 3), dtype=float32)


In [20]:
from tensorflow.keras.layers import Layer
class Linear(Layer):
    """y = Wx + b"""
    def __init__(self, units=32):
        super().__init__()
        self.units = units
    
    def build(self, input_shape):
        self.w = self.add_weight(shape=(input_shape[-1], self.units),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer='random_normal',
                                 trainable=True)
        
    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b


linear_layer = Linear(4)
linear_layer(tf.ones((2, 2)))
assert len(linear_layer.weights) == 2

In [21]:
env = gym.make('CartPole-v1')
env.seed(42)
obs = env.reset()
obs

array([-0.01258566, -0.00156614,  0.04207708, -0.00180545])

In [27]:
env.render()

NameError: name 'base' is not defined