# SAC on Ant Bullet <font color='grey'> (*Self-Contained*) </font>

In [19]:
import datetime,gym,os,pybullet_envs,time,os
import numpy as np
import tensorflow as tf
# Print numpy arrays with two digits of precision to keep logs compact
np.set_printoptions(precision=2)
# Ask the TensorFlow native runtime to keep its log output quiet.
# NOTE(review): this is set after `import tensorflow` above — confirm it
# still takes effect for the messages you want to hide.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Only show ERROR-level messages from the TF Python logger
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Report the TensorFlow version in use
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.14.0].


### Replay Buffer

In [2]:
class ReplayBuffer:
    """
    Fixed-capacity ring buffer of (obs, act, rew, next_obs, done) transitions
    for SAC agents. Once `size` transitions have been stored, every new
    transition overwrites the oldest one.
    """
    def __init__(self, odim, adim, size):
        # Pre-allocated float32 storage; row i holds the i-th stored transition
        self.obs1_buf = np.zeros((size, odim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, odim), dtype=np.float32)
        self.acts_buf = np.zeros((size, adim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        # ptr: next row to write, size: rows currently valid, max_size: capacity
        self.ptr = 0
        self.size = 0
        self.max_size = size

    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the write pointer and advance it."""
        slot = self.ptr
        self.obs1_buf[slot] = obs
        self.obs2_buf[slot] = next_obs
        self.acts_buf[slot] = act
        self.rews_buf[slot] = rew
        self.done_buf[slot] = done
        # Wrap around so the oldest entry is the next one to be replaced
        self.ptr = (slot + 1) % self.max_size
        if self.size < self.max_size:
            self.size += 1

    def sample_batch(self, batch_size=32):
        """Return `batch_size` transitions drawn uniformly with replacement."""
        picked = np.random.randint(0, self.size, size=batch_size)
        return {
            'obs1': self.obs1_buf[picked],
            'obs2': self.obs2_buf[picked],
            'acts': self.acts_buf[picked],
            'rews': self.rews_buf[picked],
            'done': self.done_buf[picked],
        }

### Soft Actor Critic Algorithm

In [3]:
def create_sac_model(odim=10,adim=2,hdims=None):
    """
    Soft Actor Critic Model (compatible with Ray)

    Builds the squashed-Gaussian policy, the twin Q-networks and their
    'target' copies in the current default graph and opens a dedicated session.

    Args:
        odim: observation dimension (width of the observation placeholders)
        adim: action dimension (width of the action placeholder)
        hdims: hidden layer sizes shared by the policy and Q networks;
            None (default) means [256,256]
    Returns:
        model: dict holding the placeholders, output tensors and variable lists
        sess: tf.Session created with GPU memory growth enabled
    """
    import tensorflow as tf # make it compatible with Ray actors
    # Copy into a fresh list so the caller's sequence is never shared or mutated
    hdims = [256,256] if hdims is None else list(hdims)

    def mlp(x,hdims=(256,256),actv=tf.nn.relu,out_actv=tf.nn.relu):
        """Dense stack: `actv` on all but the last layer, `out_actv` on the last."""
        ki = tf.truncated_normal_initializer(stddev=0.1)
        for hdim in hdims[:-1]:
            x = tf.layers.dense(x,units=hdim,activation=actv,kernel_initializer=ki)
        return tf.layers.dense(x,units=hdims[-1],activation=out_actv,kernel_initializer=ki)
    def gaussian_loglik(x,mu,log_std):
        """Log-likelihood of x under a diagonal Gaussian, summed over axis 1."""
        EPS = 1e-8 # keeps the division finite when std underflows
        pre_sum = -0.5*(
            ( (x-mu)/(tf.exp(log_std)+EPS) )**2 +
            2*log_std + np.log(2*np.pi)
        )
        return tf.reduce_sum(pre_sum, axis=1)
    def mlp_gaussian_policy(o,adim=2,hdims=(256,256),actv=tf.nn.relu):
        """Return (mean, reparameterized sample, log-prob of the sample)."""
        net = mlp(x=o,hdims=hdims,actv=actv,out_actv=actv) # feature
        mu = tf.layers.dense(net,adim,activation=None) # mu
        log_std = tf.layers.dense(net,adim,activation=None) # log_std
        # Clip log_std so that std = exp(log_std) stays in a bounded range
        LOG_STD_MIN,LOG_STD_MAX = -10.0,+2.0
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)
        std = tf.exp(log_std) # std
        pi = mu + tf.random_normal(tf.shape(mu)) * std  # sampled
        logp_pi = gaussian_loglik(x=pi,mu=mu,log_std=log_std) # log lik
        return mu,pi,logp_pi
    def squash_action(mu,pi,logp_pi):
        """Apply tanh to the actions and correct the log-prob accordingly."""
        # Change of variables: log(1-tanh(u)^2) = 2*(log(2) - u - softplus(-2u))
        logp_pi -= tf.reduce_sum(2*(np.log(2) - pi -
                                    tf.nn.softplus(-2*pi)), axis=1)
        mu,pi = tf.tanh(mu),tf.tanh(pi)
        return mu, pi, logp_pi
    def mlp_actor_critic(o,a,hdims=(256,256),actv=tf.nn.relu,out_actv=None,
                         policy=mlp_gaussian_policy):
        """Build the policy ('pi') and two Q-networks ('q1','q2') for (o,a)."""
        adim = a.shape.as_list()[-1]
        with tf.variable_scope('pi'): # policy
            mu,pi,logp_pi = policy(o=o,adim=adim,hdims=hdims,actv=actv)
            mu,pi,logp_pi = squash_action(mu=mu,pi=pi,logp_pi=logp_pi)
        # Q head: hidden layers plus one linear unit, squeezed to shape (batch,)
        def vf_mlp(x): return tf.squeeze(
            mlp(x=x,hdims=list(hdims)+[1],actv=actv,out_actv=None),axis=1)
        with tf.variable_scope('q1'): q1 = vf_mlp( tf.concat([o,a],axis=-1))
        with tf.variable_scope('q2'): q2 = vf_mlp( tf.concat([o,a],axis=-1))
        return mu,pi,logp_pi,q1,q2

    def placeholder(dim=None):
        # (None,dim) for vector inputs, (None,) for per-sample scalars
        return tf.placeholder(dtype=tf.float32,shape=(None,dim) if dim else (None,))
    def placeholders(*args):
        """
        Usage: a_ph,b_ph,c_ph = placeholders(adim,bdim,None)
        """
        return [placeholder(dim) for dim in args]
    def get_vars(scope):
        # Substring match on the variable name, e.g. 'main/q' covers q1 and q2
        return [x for x in tf.compat.v1.global_variables() if scope in x.name]

    # Have own session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Placeholders
    o_ph,a_ph,o2_ph,r_ph,d_ph = placeholders(odim,adim,odim,None,None)
    # Actor critic
    ac_kwargs = {'hdims':hdims,'actv':tf.nn.relu,'out_actv':None,'policy':mlp_gaussian_policy}
    with tf.variable_scope('main'):
        mu,pi,logp_pi,q1,q2 = mlp_actor_critic(o=o_ph,a=a_ph,**ac_kwargs)
    # Reuse the 'main' weights: Q at the sampled action, and the policy on o2
    with tf.variable_scope('main',reuse=True):
        _,_,_,q1_pi,q2_pi = mlp_actor_critic(o=o_ph,a=pi,**ac_kwargs)
        _,pi_next,logp_pi_next,_,_ = mlp_actor_critic(o=o2_ph,a=a_ph,**ac_kwargs)
    # Target value
    with tf.variable_scope('target'):
        _,_,_,q1_targ,q2_targ = mlp_actor_critic(o=o2_ph,a=pi_next,**ac_kwargs)

    # Get variables
    main_vars,q_vars,pi_vars,target_vars = \
        get_vars('main'),get_vars('main/q'),get_vars('main/pi'),get_vars('target')

    model = {'o_ph':o_ph,'a_ph':a_ph,'o2_ph':o2_ph,'r_ph':r_ph,'d_ph':d_ph,
             'mu':mu,'pi':pi,'logp_pi':logp_pi,'q1':q1,'q2':q2,
             'q1_pi':q1_pi,'q2_pi':q2_pi,
             'pi_next':pi_next,'logp_pi_next':logp_pi_next,
             'q1_targ':q1_targ,'q2_targ':q2_targ,
             'main_vars':main_vars,'q_vars':q_vars,'pi_vars':pi_vars,'target_vars':target_vars}

    return model,sess

def create_sac_graph(model,lr=1e-3,gamma=0.98,alpha=0.1,polyak=0.995):
    """
    Build the SAC losses, train ops and target-network update ops.

    Args:
        model: dict of tensors and variable lists from create_sac_model
        lr: learning rate for both Adam optimizers
        gamma: discount factor
        alpha: entropy temperature (weight on the log-prob terms)
        polyak: averaging coefficient for the target variables
    Returns:
        step_ops: list of tensors/ops to run for one training step
        target_init: op that copies the main variables into the target ones
    """
    main_vars,targ_vars = model['main_vars'],model['target_vars']

    # Take the smaller of the two Q estimates (double Q-learning)
    min_q_pi = tf.minimum(model['q1_pi'],model['q2_pi'])
    min_q_targ = tf.minimum(model['q1_targ'],model['q2_targ'])

    # Entropy-regularized Bellman target; no gradient flows through it
    q_backup = tf.stop_gradient(
        model['r_ph'] +
        gamma*(1-model['d_ph'])*(min_q_targ - alpha*model['logp_pi_next'])
    )

    # Half mean-squared Bellman error of one Q head
    def half_mse(q):
        return 0.5 * tf.reduce_mean((q_backup - q)**2)

    # Losses for the policy and for the two Q-networks
    pi_loss = tf.reduce_mean(alpha*model['logp_pi'] - min_q_pi)
    q1_loss = half_mse(model['q1'])
    q2_loss = half_mse(model['q2'])
    value_loss = q1_loss + q2_loss

    # Policy step first
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss,var_list=model['pi_vars'])

    # Value step is ordered after the policy step
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,var_list=model['q_vars'])

    # Target variables track the main ones by polyak averaging, after the value step
    with tf.control_dependencies([train_value_op]):
        polyak_assigns = [
            tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
            for v_main, v_targ in zip(main_vars, targ_vars)
        ]
        target_update = tf.group(polyak_assigns)

    # Everything fetched in a single training step
    step_ops = [pi_loss, q1_loss, q2_loss, model['q1'], model['q2'], model['logp_pi'],
                train_pi_op, train_value_op, target_update]

    # Hard copy main -> target, to be run once after variable initialization
    copy_assigns = [
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(main_vars, targ_vars)
    ]
    target_init = tf.group(copy_assigns)

    return step_ops,target_init
    
def get_action(model,sess,o,deterministic=False):
    """
    Run the policy on a single observation and return one action.

    Fetches model['mu'] when `deterministic` is set, otherwise model['pi'].
    The observation is reshaped to a batch of one and the first row of the
    result is returned.
    """
    if deterministic:
        act_op = model['mu']
    else:
        act_op = model['pi']
    batched_obs = o.reshape(1,-1)
    actions = sess.run(act_op, feed_dict={model['o_ph']:batched_obs})
    return actions[0]

# Progress marker printed once the SAC definitions above have been executed
print ("SAC model ready.")

SAC model ready.


### Initialize Environment

In [10]:
# Create two instances of the same environment: `env` for collecting training
# data and `test_env` (rendered) for evaluation.
gym.logger.set_level(40) # presumably gym's ERROR level — TODO confirm
env_name = 'AntBulletEnv-v0'
env,test_env = gym.make(env_name),gym.make(env_name)
_ = test_env.render(mode='human') # enable rendering on test_env
_ = test_env.reset()
for _ in range(3): # dummy run for proper rendering
    a = test_env.action_space.sample()
    o,r,d,_ = test_env.step(a)
    time.sleep(0.01)
print ("[%s] ready."%(env_name))
# Read the observation/action sizes used to build the networks and buffers
observation_space = env.observation_space
action_space = env.action_space # -1.0 ~ +1.0
odim,adim = observation_space.shape[0],action_space.shape[0]
print ("odim:[%d] adim:[%d]."%(odim,adim))

[AntBulletEnv-v0] ready.
odim:[28] adim:[8].


### Initialize SAC

In [5]:
# Start from an empty default graph so re-running this cell does not stack models
tf.reset_default_graph()
# The graph-level seed is picked up by random ops when they are created, so it
# has to be set before the model is built for the weight initializers and the
# policy noise to be reproducible.
seed = 0
tf.set_random_seed(seed)
# Build the networks (with their own session) and the training ops
model,sess = create_sac_model(odim=odim,adim=adim)
step_ops,target_init = create_sac_graph(model,lr=1e-3,gamma=0.98,alpha=0.1,polyak=0.995)
# Replay buffers: a long-horizon one and a shorter one holding only recent data
replay_buffer = ReplayBuffer(odim=odim,adim=adim,size=int(1e6))
replay_buffer_short = ReplayBuffer(odim=odim,adim=adim,size=int(1e5))

### Train

In [6]:
# Training configuration
# total_steps: number of environment steps in the training loop
# start_steps: steps of uniformly random actions before the policy is used
#              (also the step from which updates start)
total_steps,start_steps = 1e6,1e4
# update_every: update period in steps; update_count: training steps per update
# batch_size: total batch size (half drawn from each replay buffer)
# max_ep_len_train: training episode length cap
update_every,update_count,batch_size,max_ep_len_train = 1,2,128,1e3
# evaluate_every: evaluation period in steps; num_eval: episodes per evaluation
# max_ep_len_test: evaluation episode length cap
evaluate_every,num_eval,max_ep_len_test = 1e4,3,1e3

In [7]:
# Fix random seed and initialize the model
# NOTE(review): tf.set_random_seed sets the graph-level seed, which presumably
# only affects random ops created after this call; the model graph is built in
# an earlier cell — confirm the seeding order gives the reproducibility you expect.
seed = 0
tf.set_random_seed(seed)
np.random.seed(seed) # replay buffer sampling uses np.random
# Initialize all variables, then copy the main weights into the target networks
sess.run(tf.global_variables_initializer())
sess.run(target_init)

### Loop

In [8]:
# Main loop: collect one environment step, store it, update the networks and
# periodically evaluate the deterministic policy on the rendered test_env.
start_time = time.time()
o,ep_ret,ep_len = env.reset(),0,0
for t in range(int(total_steps)):
    zero_to_one = (t/total_steps)
    one_to_zero = 1.0-zero_to_one
    esec = time.time()-start_time

    # Get action: uniformly random at first, then sampled from the policy
    if t > start_steps: a = get_action(model,sess,o,deterministic=False)
    else: a = env.action_space.sample()

    # Step the env
    o2,r,d,_ = env.step(a)
    ep_ret += r
    ep_len += 1
    d = False if ep_len==max_ep_len_train else d # ignore done if it maxed out

    # Store experience to replay buffers
    replay_buffer.store(o, a, r, o2, d) # save obs, action, reward, next obs
    replay_buffer_short.store(o, a, r, o2, d) # save obs, action, reward, next obs
    o = o2 # easy to overlook

    # End of trajectory handling - reset env
    if d or (ep_len == max_ep_len_train):
        o, ep_ret, ep_len = env.reset(), 0, 0

    # Update: each batch is half from the long buffer, half from the short one
    if (t>=start_steps) and (t%update_every == 0):
        for _ in range(update_count):
            batch = replay_buffer.sample_batch(batch_size//2)
            batch_short = replay_buffer_short.sample_batch(batch_size//2)
            feed_dict = {model['o_ph']: np.concatenate((batch['obs1'],batch_short['obs1'])),
                         model['o2_ph']: np.concatenate((batch['obs2'],batch_short['obs2'])),
                         model['a_ph']: np.concatenate((batch['acts'],batch_short['acts'])),
                         model['r_ph']: np.concatenate((batch['rews'],batch_short['rews'])),
                         model['d_ph']: np.concatenate((batch['done'],batch_short['done']))
                        }
            outs = sess.run(step_ops,feed_dict=feed_dict) # train
            q1_val,q2_val = outs[3],outs[4]

    # Evaluate
    if (((t+1)%evaluate_every) == 0):
        print ("[Evaluate] step:[%d/%d][%.1f%%] time:%s."%
               (t+1,total_steps,zero_to_one*100,
                time.strftime("%H:%M:%S", time.gmtime(time.time()-start_time)))
              )
        for eval_idx in range(num_eval):
            # Use dedicated eval_* variables: reusing o/d/ep_ret/ep_len here
            # would overwrite the training episode state, so the next stored
            # transition would start from a test_env observation and the
            # training time-limit counter would be wrong.
            eval_o,eval_d,eval_ret,eval_len = test_env.reset(),False,0,0
            _ = test_env.render(mode='human')
            while not(eval_d or (eval_len == max_ep_len_test)):
                eval_a = get_action(model,sess,eval_o,deterministic=True)
                eval_o,eval_r,eval_d,_ = test_env.step(eval_a)
                _ = test_env.render(mode='human')
                eval_ret += eval_r # compute return
                eval_len += 1
            print ("[Evaluate] [%d/%d] ep_ret:[%.4f] ep_len:[%d]"
                %(eval_idx,num_eval,eval_ret,eval_len))

print ("Done.")

[Evaluate] step:[10000/1000000][1.0%] time:00:00:07.
[Evaluate] [0/3] ep_ret:[221.1132] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[237.8531] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[69.1929] ep_len:[180]
[Evaluate] step:[20000/1000000][2.0%] time:00:03:15.
[Evaluate] [0/3] ep_ret:[547.6536] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[561.0090] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[575.7181] ep_len:[1000]
[Evaluate] step:[30000/1000000][3.0%] time:00:06:23.
[Evaluate] [0/3] ep_ret:[488.5191] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[506.4437] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[419.2753] ep_len:[1000]
[Evaluate] step:[40000/1000000][4.0%] time:00:09:32.
[Evaluate] [0/3] ep_ret:[539.0713] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[777.1833] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[517.9417] ep_len:[1000]
[Evaluate] step:[50000/1000000][5.0%] time:00:12:40.
[Evaluate] [0/3] ep_ret:[733.7249] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[806.7745] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[698.7821] ep_len:[1000]
[E

[Evaluate] [1/3] ep_ret:[2767.9099] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[2814.7225] ep_len:[1000]
[Evaluate] step:[420000/1000000][42.0%] time:02:08:35.
[Evaluate] [0/3] ep_ret:[2731.0187] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[2634.4614] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[2698.0933] ep_len:[1000]
[Evaluate] step:[430000/1000000][43.0%] time:02:11:43.
[Evaluate] [0/3] ep_ret:[2795.0351] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[2772.4156] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[2769.1423] ep_len:[1000]
[Evaluate] step:[440000/1000000][44.0%] time:02:14:50.
[Evaluate] [0/3] ep_ret:[2884.1316] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[2894.4042] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[2893.8562] ep_len:[1000]
[Evaluate] step:[450000/1000000][45.0%] time:02:17:56.
[Evaluate] [0/3] ep_ret:[2687.1298] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[2703.9625] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[2740.3868] ep_len:[1000]
[Evaluate] step:[460000/1000000][46.0%] time:02:21:03.
[Evaluate] [0/3] ep_ret:[

[Evaluate] [1/3] ep_ret:[3104.1164] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[3118.8945] ep_len:[1000]
[Evaluate] step:[820000/1000000][82.0%] time:04:13:09.
[Evaluate] [0/3] ep_ret:[3097.5389] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[3120.9309] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[3134.3907] ep_len:[1000]
[Evaluate] step:[830000/1000000][83.0%] time:04:16:15.
[Evaluate] [0/3] ep_ret:[3130.4420] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[3138.9736] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[3121.8303] ep_len:[1000]
[Evaluate] step:[840000/1000000][84.0%] time:04:19:21.
[Evaluate] [0/3] ep_ret:[3102.0770] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[3122.3888] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[3083.5287] ep_len:[1000]
[Evaluate] step:[850000/1000000][85.0%] time:04:22:28.
[Evaluate] [0/3] ep_ret:[3015.2237] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[3004.7884] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[3018.9179] ep_len:[1000]
[Evaluate] step:[860000/1000000][86.0%] time:04:25:35.
[Evaluate] [0/3] ep_ret:[

### Close 

In [16]:
# Release both environments (training and rendered test instance)
env.close()
test_env.close()

### Test Evaluate

In [29]:
# Standalone evaluation: re-create a rendered test environment and roll out one
# episode with the deterministic policy. Relies on `model`, `sess`,
# `get_action` and `max_ep_len_test` defined in earlier cells.
gym.logger.set_level(40)
env_name = 'AntBulletEnv-v0'
test_env = gym.make(env_name)
_ = test_env.render(mode='human') # enable rendering on test_env
_ = test_env.reset()
for _ in range(3): # dummy run for proper rendering
    a = test_env.action_space.sample()
    o,r,d,_ = test_env.step(a)
    time.sleep(0.01)
print ("[%s] ready."%(env_name))
# Roll out until the env reports done or the episode length cap is reached
o,d,ep_ret,ep_len = test_env.reset(),False,0,0
_ = test_env.render(mode='human')
while not(d or (ep_len == max_ep_len_test)):
    a = get_action(model,sess,o,deterministic=True)
    o,r,d,_ = test_env.step(a)
    _ = test_env.render(mode='human')
    ep_ret += r # compute return
    ep_len += 1
print ("[Evaluate] ep_ret:[%.4f] ep_len:[%d]"
    %(ep_ret,ep_len))
test_env.close() # close env

[AntBulletEnv-v0] ready.
[Evaluate] ep_ret:[2.0000] ep_len:[1000]


### Video

In [1]:
# Display the video file at the given relative path in the notebook output
from IPython.display import Video
Video('../vid/SAC_PyBullet_Ant.mp4')