In [49]:
import numpy as np
import scipy.signal
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
import tensorflow_probability as tfp

import os
import io
import base64
import time
import glob
from IPython.display import HTML

import pybullet_envs

%matplotlib inline

In [50]:
def make_env(env_name, seed=1888):
    """Build the Gym environment ``env_name`` and return its unwrapped core.

    ``.unwrapped`` hands back the innermost environment object, which is how
    this notebook drops the time-limit wrapper that ``gym.make`` adds.

    Args:
        env_name: Registered Gym environment id.
        seed: Seed passed to ``env.seed``; ``None`` leaves the environment unseeded.

    Returns:
        The unwrapped environment instance.
    """
    env = gym.make(env_name).unwrapped
    if seed is None:
        return env
    env.seed(seed)
    return env

In [51]:

# Environment under study; 'HopperBulletEnv-v0' and 'AntBulletEnv-v0' are drop-in alternatives.
env_name = 'HalfCheetahBulletEnv-v0'
seed = 1888
env = make_env(env_name, seed)

# Space descriptions that the later cells read as globals.
state_shape = env.observation_space.shape
action_shape = env.action_space.shape
state_dim = state_shape[0]
act_dim = action_shape[0]
# The first component's upper bound is used as the limit for every action dimension.
act_limit = env.action_space.high[0]

# Summary of the environment (same lines, same order as before).
print(f'env : {env}')
print('State shape: {}'.format(state_shape))
print('Action shape: {}'.format(action_shape))
print(f'action space {env.action_space} observation space : {env.observation_space}')
print(f'action space bound :{env.action_space.low}, {env.action_space.high}')
print(f'action limit = {act_limit} dimension {act_dim}')
print(state_dim)

env : <HalfCheetahBulletEnv<HalfCheetahBulletEnv-v0>>
State shape: (26,)
Action shape: (6,)
action space Box(-1.0, 1.0, (6,), float32) observation space : Box(-inf, inf, (26,), float32)
action space bound :[-1. -1. -1. -1. -1. -1.], [1. 1. 1. 1. 1. 1.]
action limit = 1.0 dimension 6
26


  deprecation(
  deprecation(


In [52]:
class Policy_p(tf.keras.Model):
    """Gaussian policy network: state -> (mean, log-std) of a per-dimension Normal.

    Two ReLU hidden layers (400 and 300 units) feed two heads: a tanh-squashed
    mean, scaled to ``[-act_limit, act_limit]``, and an unbounded log standard
    deviation.

    Args:
        act_dim: Number of action dimensions (width of both output heads).
        act_limit: Symmetric action bound used to scale the mean and to clip
            sampled actions.
    """

    def __init__(self, act_dim, act_limit):
        super(Policy_p, self).__init__()
        self.act_limit = act_limit
        self.fc1 = tf.keras.layers.Dense(400, activation="relu")
        self.fc2 = tf.keras.layers.Dense(300, activation="relu")
        self.mean = tf.keras.layers.Dense(act_dim, activation='tanh')
        self.log_std_dev = tf.keras.layers.Dense(act_dim, activation='linear')

    def call(self, s):
        """Return ``(mu, log_std)`` for a batch of states ``s``."""
        x = self.fc1(s)
        x = self.fc2(x)

        # tanh keeps the head in [-1, 1]; scale it so the mean can reach the
        # real action bounds. For act_limit == 1 this is a no-op.
        squashed_mean = self.mean(x)
        mu = squashed_mean * tf.cast(self.act_limit, squashed_mean.dtype)
        # NOTE(review): log_std is unbounded, so tf.exp(log_std) in act() can
        # overflow for extreme outputs — consider clipping if NaNs appear.
        log_std = self.log_std_dev(x)
        return mu, log_std

    def act(self, state, evaluate=False):
        """Choose actions for a batch of states.

        Args:
            state: Batch of states accepted by ``call``.
            evaluate: Python bool. When true the distribution mean is used
                instead of a random sample (deterministic action); the default
                keeps the stochastic behaviour.

        Returns:
            Tuple ``(actions, probs, mu, log_std)`` where ``actions`` is clipped
            to ``[-act_limit, act_limit]`` and ``probs`` is the Normal density
            evaluated at the clipped actions.
        """
        # Go through Keras' __call__ rather than invoking call() directly.
        mu, log_std = self(state)
        std_dev = tf.exp(log_std)

        normal = tfp.distributions.Normal(mu, std_dev)
        z = mu if evaluate else normal.sample()
        actions = tf.clip_by_value(z, -self.act_limit, self.act_limit)
        probs = normal.prob(actions)

        return actions, probs, mu, log_std

In [53]:
class Critic(tf.keras.Model):
    """Q-network: scores a (state, action) pair with a single scalar.

    The state and action are concatenated along the last axis and passed
    through two 512-unit ReLU layers and a linear one-unit head.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = tf.keras.layers.Dense(512, activation="relu")
        self.fc2 = tf.keras.layers.Dense(512, activation="relu")
        self.Q = tf.keras.layers.Dense(1)

    def call(self, s, a):
        """Return Q(s, a) with the trailing size-1 axis of the head removed."""
        state_action = tf.concat([s, a], axis=-1)
        hidden = self.fc2(self.fc1(state_action))
        return tf.squeeze(self.Q(hidden), -1)

In [54]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions.

    Each stored item is the tuple
    ``(state, action, mu, log_std, reward, next_state, done)``. Once ``size``
    items are stored, new items overwrite the oldest ones.
    """

    def __init__(self, size=1e6):
        # The capacity is used for list indexing and modulo arithmetic, so it
        # must be an int (the default is written as the float 1e6).
        self.size = int(size)
        self.buffer = []  # stored transitions
        self.next_id = 0  # slot that the next overwrite will use

    def __len__(self):
        return len(self.buffer)

    def add(self, state, action, mu, log_std, reward, next_state, done):
        """Store one transition, overwriting the oldest slot when full."""
        item = (state, action, mu, log_std, reward, next_state, done)
        if len(self.buffer) < self.size:
            self.buffer.append(item)
        else:
            self.buffer[self.next_id] = item
        self.next_id = (self.next_id + 1) % self.size

    def sample(self, batch_size=32):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Seven NumPy arrays ``(states, actions, mus, log_stds, rewards,
            next_states, done_flags)``; all are float32 except ``done_flags``,
            which keeps the dtype NumPy infers from the stored flags.
        """
        idxs = np.random.choice(len(self.buffer), batch_size)
        samples = [self.buffer[i] for i in idxs]
        states, actions, mus, log_stds, rewards, next_states, done_flags = zip(*samples)
        return (
            np.array(states, np.float32),
            np.array(actions, np.float32),
            np.array(mus, np.float32),
            np.array(log_stds, np.float32),
            np.array(rewards, np.float32),
            np.array(next_states, np.float32),
            np.array(done_flags),
        )

    def to_tensors(self, state_dim, act_dim, batch_size=32):
        """Sample a batch and convert it to float32 tensors.

        Args:
            state_dim: Width of a flattened state.
            act_dim: Width of a flattened action (also used for mu / log_std).
            batch_size: Number of transitions to sample (default 32, the value
                that used to be hard-coded).

        Returns:
            ``(states, actions, mus, log_stds, rewards, next_states, done_flags)``.
            The first six are ``tf.float32`` tensors; states/actions/mus/log_stds/
            next_states are 2-D, rewards is 1-D. ``done_flags`` is returned as a
            1-D NumPy array.
        """
        states, actions, mus, log_stds, rewards, next_states, done_flags = self.sample(batch_size)

        states = np.reshape(states, (-1, state_dim))
        next_states = np.reshape(next_states, (-1, state_dim))
        actions = np.reshape(actions, (-1, act_dim))
        mus = np.reshape(mus, (-1, act_dim))
        log_stds = np.reshape(log_stds, (-1, act_dim))
        # Flatten explicitly: squeeze() would turn a batch of one into a 0-d array.
        rewards = np.reshape(rewards, (-1,))
        done_flags = np.reshape(done_flags, (-1,))

        state_ts = tf.convert_to_tensor(states, dtype=tf.float32)
        action_ts = tf.convert_to_tensor(actions, dtype=tf.float32)
        mus_ts = tf.convert_to_tensor(mus, dtype=tf.float32)
        log_stds_ts = tf.convert_to_tensor(log_stds, dtype=tf.float32)
        reward_ts = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_state_ts = tf.convert_to_tensor(next_states, dtype=tf.float32)

        return state_ts, action_ts, mus_ts, log_stds_ts, reward_ts, next_state_ts, done_flags

    def initialize_replay_buffer(self, env, n_steps=100):
        """Pre-fill this buffer with ``n_steps`` random-action transitions.

        No policy output exists for random actions, so the sampled action is
        stored in the ``mu`` and ``log_std`` slots as a placeholder.
        """
        state = env.reset()
        for _ in range(n_steps):
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            # Write to this instance, not to whatever global happens to be named `buffer`.
            self.add(state, action, action, action, reward, next_state, done)
            # After a terminal step continue from the freshly reset state.
            state = env.reset() if done else next_state
# Module-level replay buffer; AgentTD3E.train_step and the training cell read it by this name.
buffer = ReplayBuffer(size=100000)

In [55]:
#TD3 with KL regularizer
class AgentTD3E:
    """Twin-critic actor-critic agent whose policy loss carries a KL penalty.

    The agent owns two critics, a Gaussian policy, and a Polyak-averaged target
    copy of each. Critics regress onto ``r + gamma * (1 - done) * min(Q1', Q2')``;
    the policy maximises ``critic1`` minus ``kl_coef`` times the KL divergence
    between the policy that produced the stored transition and the current one.

    Args:
        env: Environment used once to draw a sample state/action so the
            networks can be built (``env.reset()`` is called).
        act_dim: Number of action dimensions.
        act_limit: Symmetric action bound.
        state_dim: Width of a flattened state.
        learning_rate: Learning rate of the policy optimizer (critics use 0.002).
        gamma: Discount factor used for the TD target in ``train_step``.
        polyak: Weight kept from the target network on each soft update.
        kl_coef: Weight of the KL term in the policy loss.
    """

    def __init__(self, env, act_dim, act_limit, state_dim, learning_rate=1e-3, gamma=0.99,
                 polyak=0.95, kl_coef=0.01):
        # Not used by any method of this class; kept so the attribute remains available.
        self.dist = tfp.distributions.Normal(0, 0.3)
        self.learning_rate_critic = 0.002
        # Honour the constructor argument (its default matches the old hard-coded 1e-3).
        self.learning_rate_policy = learning_rate
        self.polyak = polyak
        self.gamma = gamma
        self.kl_coef = kl_coef
        self.act_dim = act_dim
        self.state_dim = state_dim
        self.act_limit = act_limit

        self.critic1 = Critic()
        self.critic2 = Critic()

        self.target_critic1 = Critic()
        self.target_critic2 = Critic()

        self.policy = Policy_p(act_dim, act_limit)
        self.target_policy = Policy_p(act_dim, act_limit)

        # Run one dummy forward pass through every network so Keras creates the
        # variables; get_weights()/set_weights() need them to exist.
        s = env.reset()[np.newaxis]
        a = env.action_space.sample()[np.newaxis]
        for q_net in (self.critic1, self.critic2, self.target_critic1, self.target_critic2):
            q_net(s, a)
        self.policy(s)
        self.target_policy(s)

        # Targets start as exact copies and are only ever changed by polyak_update.
        self.target_critic1.set_weights(self.critic1.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())
        self.target_policy.set_weights(self.policy.get_weights())

        self.target_critic1.trainable = False
        self.target_critic2.trainable = False
        self.target_policy.trainable = False

        self.policy_optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_policy)
        self.critic_optimizer1 = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_critic)
        self.critic_optimizer2 = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_critic)

    def polyak_update(self, target_network, network):
        """Soft-update ``target_network`` towards ``network``.

        Every target weight becomes ``polyak * target + (1 - polyak) * online``.
        """
        blended = [
            self.polyak * target_w + (1 - self.polyak) * online_w
            for online_w, target_w in zip(network.get_weights(), target_network.get_weights())
        ]
        target_network.set_weights(blended)

    @tf.function
    def compute_q_loss(self, states, actions, rewards, next_states, dones, gamma=0.99, eps=1e-6):
        """Take one gradient step on both critics and return their MSE losses.

        Args:
            states, actions, rewards, next_states: Batched float32 tensors.
            dones: Batched float flags (1.0 where the episode ended).
            gamma: Discount factor for the bootstrap term.
            eps: Unused; kept for signature compatibility.

        Returns:
            ``(critic_loss1, critic_loss2)`` scalar tensors.
        """
        # The TD target involves only the frozen target networks, so it is
        # computed outside the tapes and explicitly treated as a constant.
        # act() already clips the sampled actions to [-act_limit, act_limit].
        target_actions, _, _, _ = self.target_policy.act(next_states)
        target_qval1 = self.target_critic1(next_states, target_actions)
        target_qval2 = self.target_critic2(next_states, target_actions)
        # Clipped double-Q: bootstrap from the smaller of the two target estimates.
        target_next_qval = tf.math.minimum(target_qval1, target_qval2)
        target_qval = tf.stop_gradient(rewards + gamma * (1 - dones) * target_next_qval)

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            qval1 = self.critic1(states, actions, training=True)
            qval2 = self.critic2(states, actions, training=True)

            critic_loss1 = tf.reduce_mean((target_qval - qval1) ** 2)
            critic_loss2 = tf.reduce_mean((target_qval - qval2) ** 2)

        grads1 = tape1.gradient(critic_loss1, self.critic1.trainable_variables)
        grads2 = tape2.gradient(critic_loss2, self.critic2.trainable_variables)

        self.critic_optimizer1.apply_gradients(zip(grads1, self.critic1.trainable_variables))
        self.critic_optimizer2.apply_gradients(zip(grads2, self.critic2.trainable_variables))

        return critic_loss1, critic_loss2

    def compute_p_loss(self, states, mean_prev, log_std_prev, eps=1e-6):
        """Take one gradient step on the policy.

        Args:
            states: Batched float32 states.
            mean_prev, log_std_prev: Mean and log-std recorded when the
                transition was stored (the "previous" policy).
            eps: Unused; kept for signature compatibility.

        Returns:
            ``(p_loss, kl)`` scalar tensors: the minimised loss and the batch-mean
            KL divergence.
        """
        with tf.GradientTape() as tape:
            actions, _, mu_s, log_stds = self.policy.act(states)

            # Closed-form KL( N(mean_prev, std_prev) || N(mu_s, std) ) per action
            # dimension, summed over dimensions and averaged over the batch.
            kl = (
                (log_stds - log_std_prev)
                + (tf.square(tf.exp(log_std_prev)) + (mu_s - mean_prev) ** 2)
                / (2 * tf.square(tf.exp(log_stds)))
                - 0.5
            )
            kl = tf.reduce_mean(tf.reduce_sum(kl, axis=-1))
            policy_loss = -(self.critic1(states, actions) - (self.kl_coef * kl))
            p_loss = tf.math.reduce_mean(policy_loss)

        grads = tape.gradient(p_loss, self.policy.trainable_variables)
        self.policy_optimizer.apply_gradients(zip(grads, self.policy.trainable_variables))

        return p_loss, kl

    def train_step(self, step):
        """Run one critic update, one policy update and one soft target update.

        The batch is drawn from the module-level ``buffer``. ``step`` is accepted
        for the caller's convenience but is not used.

        Returns:
            ``(p_loss, c_loss1, c_loss2, kl)``.
        """
        states, actions, mus, log_stds, rewards, next_states, dones = buffer.to_tensors(
            self.state_dim, self.act_dim)
        done_flags = np.array(dones, np.float32)
        # Pass the configured discount; otherwise compute_q_loss silently falls
        # back to its own default and self.gamma has no effect.
        c_loss1, c_loss2 = self.compute_q_loss(
            states, actions, rewards, next_states, done_flags, gamma=self.gamma)

        p_loss, kl = self.compute_p_loss(states, mus, log_stds)

        self.polyak_update(self.target_critic1, self.critic1)
        self.polyak_update(self.target_critic2, self.critic2)
        self.polyak_update(self.target_policy, self.policy)

        return p_loss, c_loss1, c_loss2, kl

In [56]:
# Primary program: fill the replay buffer with random transitions, build the
# agent, then alternate one environment step with one training step.
gamma = 0.99

# NOTE(review): pins the whole run to 'GPU:0' — confirm a GPU is present before running.
with tf.device('GPU:0'):
    # Training stops once a single episode's return exceeds this value.
    # The threshold is environment-specific (the original note suggests -150 for pendulum).
    required_reward = 3500
    # Not referenced again in this cell: test_env, max_ep_len, loss_qval,
    # loss_pval, closs1, closs2, num_steps.
    test_env = gym.make(env_name)
    max_ep_len = []
    loss_qval, loss_pval = [], []
    ep_reward = []
    total_avg_reward = []  # one undiscounted return per finished episode
    ploss, kloss, closs1, closs2 = [], [], [], []
    ep_ploss, ep_kloss =0,0
    kl = 0
    pl = 0

    num_episodes = 5000
    num_steps = 0
    target = False  # never set to True in this cell, so the early break below does not fire
    steps = 0  # counts finished episodes (not environment steps)
    buffer.initialize_replay_buffer(env)
    agent = AgentTD3E(env, act_dim, act_limit, state_dim)
    # Optional: load weights saved earlier.
    #load_saved_weights(agent)
    count_s = 0  # total environment steps across episodes; incremented but not read here

    for eps in range(num_episodes):
        if target == True:
            break
        state = env.reset()
        ret = 0
        ep_reward = []
        done = False
        count = 0
        # Per-episode sums of the per-step policy loss and KL value.
        ep_ploss = 0
        ep_kloss = 0

        # Every episode runs exactly 1000 steps: the `done` flag is stored in the
        # buffer but the early exit below is commented out.
        while count < 1000:
            # The policy works on batches, so add a leading batch axis and take
            # element 0 of each returned tensor.
            action, prob, mu, log_std =  agent.policy.act(state[np.newaxis])
            next_state, reward, done, _ = env.step(action[0])
            # mu / log_std are stored so the policy loss can later compute the KL
            # divergence against the policy that generated this transition.
            buffer.add(state, action[0], mu[0], log_std[0], reward, next_state, done)

            count_s += 1
            count += 1
            # One gradient update per environment step.
            pl, cl1, cl2, kl = agent.train_step(count)
            ep_ploss += pl
            ep_kloss += kl

            state = next_state
            ret += reward
            #if done:
            #    break
        ploss.append(ep_ploss)
        kloss.append(ep_kloss)
        total_avg_reward.append(ret)
        steps += 1
        if steps % 10 == 0:
            # Reported "policy loss" and "KL divergence" are means over the last
            # 10 episodes of the per-episode SUMS, hence their large magnitude.
            avg_rew = np.mean(total_avg_reward[-10:])
            print(f'after {eps} avg reward : {avg_rew} policy loss : {np.mean(ploss[-10:])} KL divergence : {np.mean(kloss[-10:])}')
        if ret > required_reward:  # success threshold; adjust per environment
            print(f'Episode {eps} return : {ret}')
            break

after 9 avg reward : -3.1832132772416513 policy loss : -42069.0703125 KL divergence : 19493.40234375
after 19 avg reward : 553.555886991647 policy loss : -57811.8828125 KL divergence : 13878.279296875
after 29 avg reward : 753.2247967071582 policy loss : -64638.17578125 KL divergence : 11660.0087890625
after 39 avg reward : 843.0152480559124 policy loss : -76145.0234375 KL divergence : 11641.5625
after 49 avg reward : 944.8065812958317 policy loss : -83781.6171875 KL divergence : 11083.189453125
after 59 avg reward : 1058.5326984158378 policy loss : -88661.34375 KL divergence : 11067.376953125
after 69 avg reward : 1112.3973650512958 policy loss : -95417.203125 KL divergence : 11691.1357421875
after 79 avg reward : 1244.4843048919915 policy loss : -102215.5234375 KL divergence : 12289.845703125
after 89 avg reward : 1282.4324106728566 policy loss : -109179.2890625 KL divergence : 12310.15234375
after 99 avg reward : 1250.0450945594293 policy loss : -112580.8984375 KL divergence : 12508