In [3]:
import numpy as np
import scipy.signal
import matplotlib.pyplot as plt
import tensorflow as tf
import gym


import os
import io
import base64
import time
import glob
from IPython.display import HTML

%matplotlib inline

In [4]:
def make_env(env_name, seed=None):
    """Create a Gym environment with its outer wrappers stripped.

    Args:
        env_name: Environment id handed to ``gym.make``.
        seed: Optional seed; when it is not ``None`` it is forwarded to
            ``env.seed``.

    Returns:
        The ``.unwrapped`` environment, i.e. the raw environment without the
        time-limit wrapper that ``gym.make`` adds.
    """
    raw_env = gym.make(env_name).unwrapped
    if seed is None:
        return raw_env
    raw_env.seed(seed)
    return raw_env

In [11]:
# Environment id used throughout the notebook
# (alternative that was tried: 'LunarLanderContinuous-v2').
env_name = 'Pendulum-v0'

env = make_env(env_name)

# Dimensions and bounds that the networks and the replay buffer need.
state_shape = env.observation_space.shape
action_shape = env.action_space.shape
# NOTE(review): only the first component of `high` is used, so this assumes
# every action dimension shares the same symmetric bound -- confirm per env.
act_limit = env.action_space.high[0]
act_dim = action_shape[0]
state_dim = state_shape[0]

# Report what was discovered about the environment.
print(f'env : {env}')
print('State shape: {}'.format(state_shape))
print('Action shape: {}'.format(action_shape))
print(f'action space {env.action_space} observation space : {env.observation_space}')
print(f'action space bound :{env.action_space.low}, {env.action_space.high}')
print(f'action limit = {act_limit} dimension {act_dim}')
print(state_dim)

env : <PendulumEnv<Pendulum-v0>>
State shape: (3,)
Action shape: (1,)
action space Box([-2.], [2.], (1,), float32) observation space : Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
action space bound :[-2.], [2.]
action limit = 2.0 dimension 1
3


In [12]:
class Policy(tf.keras.Model):
    """Deterministic actor: maps a batch of states to bounded actions.

    The network is two 512-unit ReLU layers followed by a linear layer of
    width ``act_dim``; the output is squashed with ``tanh`` and scaled by
    ``act_limit`` so every action component lies in
    ``[-act_limit, act_limit]``.

    Args:
        state_dim: Size of the state vector. Not used when building the
            layers (Keras infers the input width on first call); kept for
            interface symmetry with ``Critic``.
        act_dim: Number of action components (width of the output layer).
        act_limit: Scale applied to the tanh output and bound used when
            clipping noisy actions.
    """

    def __init__(self, state_dim, act_dim, act_limit):
        super(Policy, self).__init__()
        self.act_limit = act_limit
        self.fc1 = tf.keras.layers.Dense(512, activation="relu")
        self.fc2 = tf.keras.layers.Dense(512, activation="relu")
        self.actor = tf.keras.layers.Dense(act_dim)

    def call(self, s):
        """Return the deterministic action for each state in the batch ``s``."""
        x = self.fc1(s)
        x = self.fc2(x)
        x = self.actor(x)
        x = tf.keras.activations.tanh(x)  # squash to (-1, 1)
        return self.act_limit * x

    def act(self, state, evaluate=False):
        """Return actions for ``state``, with exploration noise unless evaluating.

        Args:
            state: Batch of states (leading batch axis expected by the layers).
            evaluate: When ``True`` the deterministic action is returned;
                otherwise Gaussian noise (stddev 0.1, clipped to +/-0.2) is
                added before clipping to the action bounds.

        Returns:
            Tensor of actions clipped to ``[-act_limit, act_limit]``.
        """
        actions = self(state)
        if not evaluate:
            # Shape the noise from the actions themselves instead of the
            # notebook-level `act_dim` global: the model no longer depends on
            # hidden notebook state, and each batch row gets its own sample.
            noise = tf.random.normal(shape=tf.shape(actions), mean=0.0, stddev=0.1)
            actions += tf.clip_by_value(noise, -0.2, 0.2)

        return tf.clip_by_value(actions, -self.act_limit, self.act_limit)

In [13]:
class Critic(tf.keras.Model):
    """Q-network scoring (state, action) pairs.

    The state and action are concatenated along the last axis and passed
    through two 512-unit ReLU layers and a single linear output unit.
    ``state_dim`` and ``act_dim`` are accepted but not used when creating the
    layers.
    """

    def __init__(self, state_dim, act_dim):
        super().__init__()
        self.fc1 = tf.keras.layers.Dense(512, activation="relu")
        self.fc2 = tf.keras.layers.Dense(512, activation="relu")
        self.Q = tf.keras.layers.Dense(1)

    def call(self, s, a):
        """Return one Q-value per row; the trailing unit axis is squeezed away."""
        state_action = tf.concat([s, a], axis=-1)
        hidden = self.fc2(self.fc1(state_action))
        return tf.squeeze(self.Q(hidden), -1)

In [14]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)``.

    Once ``size`` transitions are stored, new transitions overwrite the
    oldest ones in ring-buffer order.
    """

    def __init__(self, size=1e6):
        """Create an empty buffer.

        Args:
            size: Maximum number of transitions. Float values such as the
                default ``1e6`` are accepted and converted to ``int`` so that
                the write index stays a valid list index.

        Raises:
            ValueError: If ``size`` is not positive.
        """
        self.size = int(size)  # max number of items in buffer
        if self.size <= 0:
            raise ValueError("ReplayBuffer size must be positive")
        self.buffer = []  # list holding the stored transitions
        self.next_id = 0  # slot that the next overwrite will use

    def __len__(self):
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when full."""
        item = (state, action, reward, next_state, done)
        if len(self.buffer) < self.size:
            self.buffer.append(item)
        else:
            self.buffer[self.next_id] = item
        self.next_id = (self.next_id + 1) % self.size

    def sample(self, batch_size=32):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, done_flags)`` of
            NumPy arrays; all but ``done_flags`` are ``float32``.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty ReplayBuffer")
        idxs = np.random.choice(len(self.buffer), batch_size)
        samples = [self.buffer[i] for i in idxs]
        states, actions, rewards, next_states, done_flags = zip(*samples)
        return (
            np.array(states, np.float32),
            np.array(actions, np.float32),
            np.array(rewards, np.float32),
            np.array(next_states, np.float32),
            np.array(done_flags),
        )

    def to_tensors(self, state_dim, act_dim, batch_size=32):
        """Sample a batch and convert it to TensorFlow tensors.

        Args:
            state_dim: Length of one state vector.
            act_dim: Length of one action vector.
            batch_size: Number of transitions to sample (default 32, the
                value that used to be hard-coded).

        Returns:
            ``(states, actions, rewards, next_states, done_flags)`` where the
            first four are ``float32`` tensors of shape ``(B, state_dim)``,
            ``(B, act_dim)``, ``(B,)`` and ``(B, state_dim)``, and
            ``done_flags`` is a NumPy array of shape ``(B,)``.
        """
        states, actions, rewards, next_states, done_flags = self.sample(batch_size)

        states = np.reshape(states, (-1, state_dim))
        actions = np.reshape(actions, (-1, act_dim))
        # Flatten explicitly rather than squeeze so a batch of one keeps its
        # batch axis.
        rewards = np.reshape(rewards, (-1,))
        next_states = np.reshape(next_states, (-1, state_dim))
        done_flags = np.reshape(done_flags, (-1,))

        state_ts = tf.convert_to_tensor(states, dtype=tf.float32)
        action_ts = tf.convert_to_tensor(actions, dtype=tf.float32)
        reward_ts = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_state_ts = tf.convert_to_tensor(next_states, dtype=tf.float32)

        return state_ts, action_ts, reward_ts, next_state_ts, done_flags

    def initialize_replay_buffer(self, env, n_steps=1000):
        """Pre-fill this buffer with ``n_steps`` random-action transitions.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` returning
                a 4-tuple, and ``action_space.sample()``.
            n_steps: Number of environment steps to record.
        """
        state = env.reset()
        for _ in range(n_steps):
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            # Write into this instance, not the notebook-level `buffer`.
            self.add(state, action, reward, next_state, done)
            # After a terminal step continue from the freshly reset state;
            # otherwise continue from where the environment now is.
            state = env.reset() if done else next_state


buffer = ReplayBuffer(100000)

In [15]:
class AgentTD3:
    """Twin-critic actor-critic agent with target networks (TD3-style).

    Holds a policy, two critics, and a Polyak-averaged target copy of each.
    Critic targets use the minimum of the two target critics evaluated at a
    noise-smoothed target action.

    Note: the policy is updated on every ``train_step`` call; the delayed
    policy update is not enabled, and the ``step`` argument of ``train_step``
    is currently unused.
    """

    def __init__(self, env, act_dim, act_limit, state_dim, learning_rate=1e-3, gamma=0.99, polyak=0.95,
                 critic_learning_rate=0.0002, replay_buffer=None):
        """Build the networks, synchronise the targets and create optimizers.

        Args:
            env: Environment used once (``reset`` and ``action_space.sample``)
                to obtain example inputs that build the Keras models.
            act_dim: Number of action components.
            act_limit: Bound on each action component.
            state_dim: Length of the state vector.
            learning_rate: Adam learning rate for the policy.
            gamma: Discount factor used in the critic target.
            polyak: Weight kept from the target network in each soft update.
            critic_learning_rate: Adam learning rate for both critics.
            replay_buffer: Buffer sampled by ``train_step``. When ``None`` the
                notebook-level ``buffer`` is looked up at call time.
        """
        self.learning_rate_critic = critic_learning_rate
        self.learning_rate_policy = learning_rate
        self.polyak = polyak
        self.gamma = gamma
        self.act_dim = act_dim
        self.state_dim = state_dim
        self.act_limit = act_limit
        self.replay_buffer = replay_buffer

        self.critic1 = Critic(state_dim, act_dim)
        self.critic2 = Critic(state_dim, act_dim)

        self.target_critic1 = Critic(state_dim, act_dim)
        self.target_critic2 = Critic(state_dim, act_dim)

        self.policy = Policy(state_dim, act_dim, act_limit)
        self.target_policy = Policy(state_dim, act_dim, act_limit)

        # Run one dummy forward pass through every model so Keras creates the
        # weights; both inputs are cast to float32 so they share a dtype.
        s = np.asarray(env.reset(), dtype=np.float32)[np.newaxis]
        a = np.asarray(env.action_space.sample(), dtype=np.float32)[np.newaxis]

        for critic in (self.critic1, self.critic2, self.target_critic1, self.target_critic2):
            _ = critic(s, a)
        for policy in (self.policy, self.target_policy):
            _ = policy(s)

        # Targets start as exact copies of the online networks.
        self.target_critic1.set_weights(self.critic1.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())
        self.target_policy.set_weights(self.policy.get_weights())

        self.target_critic1.trainable = False
        self.target_critic2.trainable = False
        self.target_policy.trainable = False

        self.policy_optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_policy)
        self.critic_optimizer1 = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_critic)
        self.critic_optimizer2 = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_critic)

    def polyak_update(self, target_network, network):
        """Soft-update ``target_network`` towards ``network``.

        Each target weight becomes
        ``polyak * target + (1 - polyak) * online``.
        """
        blended = [
            self.polyak * target_weights + (1 - self.polyak) * weights
            for weights, target_weights in zip(network.get_weights(), target_network.get_weights())
        ]
        target_network.set_weights(blended)

    @tf.function
    def compute_q_loss(self, states, actions, rewards, next_states, dones, gamma=0.99):
        """Take one gradient step on both critics and return their losses.

        Args:
            states, actions, rewards, next_states: Batch tensors.
            dones: Float batch of terminal flags (1.0 where the episode ended).
            gamma: Discount factor applied to the bootstrapped value.

        Returns:
            ``(critic_loss1, critic_loss2)``: mean squared TD errors.
        """
        # The target depends only on the frozen target networks, so it is
        # computed outside the gradient tapes.
        target_actions = self.target_policy(next_states)
        noise = tf.clip_by_value(
            tf.random.normal(shape=tf.shape(target_actions), mean=0.0, stddev=0.5), -0.5, 0.5)
        target_actions = tf.clip_by_value(target_actions + noise, -self.act_limit, self.act_limit)

        target_qval1 = self.target_critic1(next_states, target_actions)
        target_qval2 = self.target_critic2(next_states, target_actions)
        # Use the smaller of the two target estimates.
        target_next_qval = tf.math.minimum(target_qval1, target_qval2)
        target_qval = rewards + gamma * (1 - dones) * target_next_qval

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            qval1 = self.critic1(states, actions, training=True)
            qval2 = self.critic2(states, actions, training=True)

            critic_loss1 = tf.reduce_mean((target_qval - qval1) ** 2)
            critic_loss2 = tf.reduce_mean((target_qval - qval2) ** 2)

        grads1 = tape1.gradient(critic_loss1, self.critic1.trainable_variables)
        grads2 = tape2.gradient(critic_loss2, self.critic2.trainable_variables)

        self.critic_optimizer1.apply_gradients(zip(grads1, self.critic1.trainable_variables))
        self.critic_optimizer2.apply_gradients(zip(grads2, self.critic2.trainable_variables))

        return critic_loss1, critic_loss2

    @tf.function
    def compute_p_loss(self, states):
        """Take one gradient step on the policy and return its loss.

        The loss is the negated mean of ``critic1`` evaluated at the policy's
        own actions; only the policy's variables are updated.
        """
        with tf.GradientTape() as tape:
            actions = self.policy(states, training=True)
            p_loss = tf.math.reduce_mean(-self.critic1(states, actions))

        grads = tape.gradient(p_loss, self.policy.trainable_variables)
        self.policy_optimizer.apply_gradients(zip(grads, self.policy.trainable_variables))

        return p_loss

    def train_step(self, step):
        """Sample a batch, update critics and policy, then soft-update targets.

        Args:
            step: Currently unused; kept so existing callers keep working.

        Returns:
            ``(p_loss, c_loss1, c_loss2)``.
        """
        # Prefer the injected buffer; fall back to the notebook-level one.
        source = self.replay_buffer if self.replay_buffer is not None else buffer
        states, actions, rewards, next_states, dones = source.to_tensors(self.state_dim, self.act_dim)
        done_flags = np.array(dones, np.float32)

        # Pass the agent's own discount so the constructor argument is honoured.
        c_loss1, c_loss2 = self.compute_q_loss(
            states, actions, rewards, next_states, done_flags, gamma=self.gamma)

        p_loss = self.compute_p_loss(states)

        self.polyak_update(self.target_critic1, self.critic1)
        self.polyak_update(self.target_critic2, self.critic2)
        self.polyak_update(self.target_policy, self.policy)

        return p_loss, c_loss1, c_loss2


In [10]:
# Fixed-length training routine: pre-fill the replay buffer with random
# transitions, then alternate one rollout of at most `num_steps` steps with
# 20 TD3 updates, printing the mean return of the last 10 episodes every
# 10 episodes.
gamma = 0.99
with tf.device('GPU:0'):
    test_env = gym.make(env_name)
    max_ep_len = []
    loss_qval, loss_pval = [], []
    ep_reward = []
    total_avg_reward = []

    num_of_time_steps = 10000
    num_eps = 10000
    num_steps = 300
    target = False
    buffer.initialize_replay_buffer(env)
    agent = AgentTD3(env,act_dim,act_limit,state_dim)

    for eps in range(num_eps):
        state = env.reset()
        done = False
        ret = 0
        if target == True:
            break

        # Collect one episode with the exploring policy.
        for steps in range(num_steps):
            action = agent.policy.act(state[np.newaxis])
            next_state, reward, done, _ = env.step(action[0])
            buffer.add(state,action[0],reward,next_state,done)

            # The reward counts towards the return whether or not the episode ended.
            ret += reward
            if done:
                break
            state = next_state

        total_avg_reward.append(ret)

        # Learn from replayed transitions after every episode.
        for i in range(20):
            agent.train_step(1) # the argument is ignored by train_step

        if eps % 10 == 0:
            avg_rew = np.mean(total_avg_reward[-10:])
            print(f'after {eps} avg reward : {avg_rew}')
   





To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float

KeyboardInterrupt: 

In [20]:
# Primary program: train the TD3 agent until one episode's return exceeds
# `solved_return`, or until `num_episodes` episodes have been played.
gamma = 0.99
with tf.device('GPU:0'):

    test_env = gym.make(env_name)
    max_ep_len = []
    loss_qval, loss_pval = [], []
    ep_reward = []
    total_avg_reward = []  # one entry per finished episode

    num_episodes = 5000
    num_steps = 0
    max_steps_per_episode = 900  # cap on the length of one rollout
    train_every = 5              # run one train_step every this many env steps
    # Specific to Pendulum; change for other environments (e.g. for
    # LunarLander the required return should be greater than 150).
    solved_return = -150
    target = False
    steps = 0  # number of episodes completed so far
    buffer.initialize_replay_buffer(env)
    agent = AgentTD3(env, act_dim, act_limit, state_dim)

    for eps in range(num_episodes):
        if target == True:
            break
        state = env.reset()
        ret = 0
        ep_reward = []
        done = False
        count = 0

        while count < max_steps_per_episode:
            # Hand a plain NumPy action to the environment and the buffer.
            action = np.asarray(agent.policy.act(state[np.newaxis])[0])
            next_state, reward, done, _ = env.step(action)
            buffer.add(state, action, reward, next_state, done)

            count += 1
            if count % train_every == 0:
                agent.train_step(count)

            state = next_state
            ret += reward
            if done:
                break

        # Record the return once per episode, so the average printed below is
        # over the last 10 episodes rather than the last 10 partial sums of
        # the current episode.
        total_avg_reward.append(ret)

        steps += 1
        if steps % 10 == 0:
            avg_rew = np.mean(total_avg_reward[-10:])
            print(f'after {steps} avg reward : {avg_rew}')
        if ret > solved_return:
            print(f'Episode {eps} return : {ret}')
            break
        




To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float

In [23]:
def test_agent(env, num_test_episodes, max_ep_len):
    ep_rets, ep_lens = [], []
    for j in range(num_test_episodes):
        state, done, ep_ret, ep_len = env.reset(), False, 0, 0
        while not(done or (ep_len == max_ep_len)):
            # Take deterministic actions at test time (noise_scale=0)
            env.render()
            #print(state)
            act1 = agent.policy(state[np.newaxis])
            #print(act)
            state, reward, done, _ = env.step(act1[0])
            ep_ret += reward
            ep_len += 1
        ep_rets.append(ep_ret)
        ep_lens.append(ep_len)
    return np.mean(ep_rets), np.mean(ep_lens)