In [1]:
##Deep deterministic policy gradient

import numpy as np
import scipy.signal
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
import tensorflow_probability as tfp
import random
from IPython.display import HTML
import pybullet_envs

%matplotlib inline

In [2]:
tf.__version__

'2.1.0'

In [3]:
def make_env(env_name, seed=None):
    # remove time limit wrapper from environment
    env = gym.make(env_name).unwrapped
    if seed is not None:
        env.seed(seed)
    return env

In [4]:
env_name = 'Pendulum-v0' # 'AntBulletEnv-v0' #'LunarLanderContinuous-v2' #'Pendulum-v0' #   #'MountainCarContinuous-v0' #

env = make_env(env_name)

#plt.imshow(env.render("rgb_array"))

#env = env.reset()

print(f'env : {env}')
state_shape, action_shape = env.observation_space.shape, env.action_space.shape
print('State shape: {}'.format(state_shape))
print('Action shape: {}'.format(action_shape))
print(f'action space {env.action_space} observation space : {env.observation_space}')
print(f'action space bound :{env.action_space.low}, {env.action_space.high}')
act_limit = env.action_space.high[0]
act_dim = env.action_space.shape[0]
print(f'action limit = {act_limit} dimension {act_dim}')
state_dim = env.observation_space.shape[0]
print(state_dim)

env : <PendulumEnv<Pendulum-v0>>
State shape: (3,)
Action shape: (1,)
action space Box([-2.], [2.], (1,), float32) observation space : Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
action space bound :[-2.], [2.]
action limit = 2.0 dimension 1
3


In [5]:
class Critic(tf.keras.Model):
    def __init__(self, state_dim, act_dim):
        super(Critic,self).__init__()
        #self.initializer = tf.keras.initializers.RandomUniform(minval=-0.003, maxval=0.003)
        self.fc1 = tf.keras.layers.Dense(500, activation='relu')
        self.fc2 = tf.keras.layers.Dense(500, activation='relu')
        self.out = tf.keras.layers.Dense(1, activation='linear')#, kernel_initializer=self.initializer)
        
    def call(self, state, action):
        x = tf.concat([state, action], axis = -1)
        #print(x)
        x = self.fc1(x)
        x = self.fc2(x)
        q = self.out(x)
        return tf.squeeze(q, -1)

In [6]:
class Policy(tf.keras.Model):
    def __init__(self, state_dim, act_dim, act_limit):
        super(Policy, self).__init__()
        self.act_limit = act_limit
        self.fc1 = tf.keras.layers.Dense(512, activation="relu")
        #self.fcb1 = tf.keras.layers.BatchNormalization()
        self.fc2 = tf.keras.layers.Dense(512, activation="relu")
        #self.fcb2 = tf.keras.layers.BatchNormalization()
        self.actor = tf.keras.layers.Dense(act_dim)
    
    def call(self, s):
        x = self.fc1(s)
        #x = self.fcb1(x)
        x = self.fc2(x)
        #x = self.fcb2(x)
        x = self.actor(x)
        x = tf.keras.activations.tanh(x)  # to output in range(-1,1)
        x = self.act_limit * x
        return x
    
    def get_action1(self, s):
        action = self.call(s)
        #print(f'action :{action}')
        normal_dist = tfp.distributions.Normal(0,0.1)
        sml = normal_dist.sample()
        sml = tf.clip_by_value(sml, -0.02,0.02)
        #print(f'sample : {sml}')
        action += sml
        action = tf.clip_by_value(action, -self.act_limit,self.act_limit)
        return action
    
    def get_action(self,s):
        #state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.call(s)
        ##print(actions)
        
        actions += tf.clip_by_value(tf.random.normal(shape=[act_dim], mean=0.0, stddev=0.1),-0.2,0.2)

        actions = (tf.clip_by_value(actions, -self.act_limit , self.act_limit))
        ##print(f'actions in ac : {actions}')
        return actions


In [8]:
normal_dist = tfp.distributions.Normal(0,0.1)
for i in range(100):
    sml = normal_dist.sample()
    #sml = tf.clip_by_value(sml, -0.02,0.02)
    ac = tf.random.normal(shape=[act_dim], mean=0.0, stddev=0.1)
    print(sml, ac)

tf.Tensor(-0.04865397, shape=(), dtype=float32) tf.Tensor([-0.0328065], shape=(1,), dtype=float32)
tf.Tensor(-0.095408134, shape=(), dtype=float32) tf.Tensor([-0.07518333], shape=(1,), dtype=float32)
tf.Tensor(0.092920095, shape=(), dtype=float32) tf.Tensor([-0.05934841], shape=(1,), dtype=float32)
tf.Tensor(-0.045488346, shape=(), dtype=float32) tf.Tensor([0.15496205], shape=(1,), dtype=float32)
tf.Tensor(0.1608499, shape=(), dtype=float32) tf.Tensor([-0.10757116], shape=(1,), dtype=float32)
tf.Tensor(0.1354924, shape=(), dtype=float32) tf.Tensor([-0.04784148], shape=(1,), dtype=float32)
tf.Tensor(0.052274447, shape=(), dtype=float32) tf.Tensor([0.00919703], shape=(1,), dtype=float32)
tf.Tensor(-0.04541289, shape=(), dtype=float32) tf.Tensor([0.06768324], shape=(1,), dtype=float32)
tf.Tensor(0.21500933, shape=(), dtype=float32) tf.Tensor([0.03649686], shape=(1,), dtype=float32)
tf.Tensor(-0.015856735, shape=(), dtype=float32) tf.Tensor([-0.10105121], shape=(1,), dtype=float32)
tf.Tens

In [9]:
class ReplayBuffer:
    def __init__(self, size=1e6):
        self.size = size #max number of items in buffer
        self.buffer =[] #array to holde buffer
        self.next_id = 0
    
    def __len__(self):
        return len(self.buffer)
    
    def add(self, state, action, reward, next_state, done):
        item = (state, action, reward, next_state, done)
        if len(self.buffer) < self.size:
            self.buffer.append(item)
        else:
            self.buffer[self.next_id] = item
        self.next_id = (self.next_id + 1) % self.size
        
    def sample(self, batch_size=32):
        idxs = np.random.choice(len(self.buffer), batch_size)
        samples = [self.buffer[i] for i in idxs]
        states, actions, rewards, next_states, done_flags = list(zip(*samples))
        return np.array(states,np.float32), np.array(actions,np.float32), np.array(rewards,np.float32), np.array(next_states,np.float32), np.array(done_flags)
    
    def to_tensors(self, state_dim, act_dim):
        states, actions, rewards, next_states, done_flags = self.sample(32)
        #print(type(states))
        states = np.array(states,np.float32)
        states = np.reshape(states, (-1, state_dim))
    
        actions = np.reshape(actions, (-1,act_dim))
        rewards = np.reshape(rewards,(-1,1))
        rewards = rewards.squeeze()

        next_states = np.array(next_states,np.float32)
        next_states = np.reshape(next_states, (-1, state_dim))
    
        done_flags = np.reshape(done_flags, (-1,1))
        done_flags = np.squeeze(done_flags)

        ##print(f' states {states} actions : {actions} rewards : {rewards}:  next_states {next_states} dones flags : {done_flags}')

        state_ts = tf.convert_to_tensor(states, dtype= tf.float32)
        action_ts = tf.convert_to_tensor(actions, dtype=tf.float32)
        reward_ts = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_state_ts = tf.convert_to_tensor(next_states,dtype=tf.float32)
    
        ##print(f'Tensor states {state_ts} actions : {action_ts} rewards : {reward_ts}:  next_states {next_state_ts} dones flags : {done_flags}')

        return state_ts, action_ts, reward_ts, next_state_ts, done_flags
    def initialize_replay_buffer(self,env, n_steps = 1000):
        state = env.reset()
        for _ in range(n_steps):
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            #print(f' s: {state} action {action} reward {reward} next state : {next_state} done : {done}')
            buffer.add(state, action, reward, next_state, done)
            if done:
                state = env.reset()
            state = next_state
buffer = ReplayBuffer(10000)

In [10]:
class AgentDDPG:
    def __init__(self,env, act_dim,act_limit, state_dim, learning_rate = 1e-3, gamma = 0.99, polyak = 0.95):
        self.learning_rate_critic = learning_rate
        self.learning_rate_policy = 1e-3
        self.polyak = polyak
        self.gamma = gamma
        self.act_dim = act_dim
        self.state_dim = state_dim
        self.act_limit = act_limit

        self.critic = Critic(state_dim,act_dim)
        self.target_critic = Critic(state_dim,act_dim)

        self.policy = Policy(state_dim,act_dim,act_limit)
        self.target_policy = Policy(state_dim,act_dim,act_limit)

        s = env.reset()
        a = env.action_space.sample()
        s = s[np.newaxis]
        _ = self.critic(s,a[np.newaxis])
        _ = self.target_critic(s,a[np.newaxis])
        _ = self.policy(s)
        _ = self.target_policy(s)

        self.target_critic.set_weights(self.critic.get_weights())
        self.target_policy.set_weights(self.policy.get_weights())

        self.target_critic.trainable = False
        self.target_policy.trainable = False

        self.policy_optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_policy)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_critic)

        self.target_critic.trainable = False
        self.target_policy.trainable = False

    def polyak_update(self, target_network, network):
        updated_model_weights = []
        for weights, target_weights in zip(network.get_weights(), target_network.get_weights()):
            new_weights = self.polyak * target_weights + ((1-self.polyak) * weights)
            updated_model_weights.append(new_weights)
        target_network.set_weights(updated_model_weights)

    #@tf.function
    def compute_q_loss(self, states, actions, rewards, next_states, dones, gamma=0.99):
        with tf.GradientTape() as tape:
            pred_qvals = self.critic(states,actions)
            
            next_acts = self.target_policy(next_states)
            next_acts += tf.clip_by_value(tf.random.normal(shape = next_acts.shape, mean=0, stddev=0.5),-0.5,0.5)

            next_qval = self.critic(next_states, next_acts)
            target = rewards + gamma*(1-dones)*next_qval

            loss = tf.reduce_mean((pred_qvals - target)**2)

            #print(f' predicted q vals : {pred_qvals} __ next qvals : {next_qval}  target : {target}  loss : {loss}')
        grads = tape.gradient(loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(grads, self.critic.trainable_variables))
        return loss
    
    #@tf.function
    def compute_p_loss(self, states):

        with tf.GradientTape() as tape:
            pred_val = self.critic(states,self.policy(states))
            loss = - tf.reduce_mean(pred_val)            
            #print(f'pred val : {pred_val} loss : {loss}')
        grads = tape.gradient(loss, self.policy.trainable_variables)
        self.policy_optimizer.apply_gradients(zip(grads, self.policy.trainable_variables))

        return loss
    
    def train_step(self):
        
        states, actions, rewards, next_states, dones = buffer.to_tensors(self.state_dim,self.act_dim)

        c_loss = self.compute_q_loss(states,actions, rewards, next_states, dones)
        self.critic.trainable = False

        p_loss = self.compute_p_loss(states)
        
        self.critic.trainable = True
        
        self.polyak_update(self.target_critic, self.critic)
        self.polyak_update(self.target_policy, self.policy)

        return p_loss, c_loss
    
    def test_agent(self, env, num_test_episodes=5, max_ep_len=500):
        ep_rets, ep_lens = [], []
        #print(num_test_episodes, env)
        for j in range(num_test_episodes):
            state, done, ep_ret, ep_len = env.reset(), False, 0, 0
            #print(f' state : {state}')
            while not(done or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                #print(f'state :: {state}')
                action = self.policy(state[np.newaxis])
                #action = tf.squeeze(action)
                #print(action)
                state, reward, done, _ = env.step(action[0]) #env.action_space.sample()) #action[np.newaxis]) #)agent.get_action(state, 0))
                ep_ret += reward
                ep_len += 1
            ep_rets.append(ep_ret)
            ep_lens.append(ep_len)
        return np.mean(ep_rets), np.mean(ep_lens)

In [12]:
with tf.device('GPU:0'):   
    buffer.initialize_replay_buffer(env)
    agent = AgentDDPG(env, act_dim,act_limit, state_dim)
    test_env = make_env(env_name)
    state = env.reset()
    ret = 0 

    c_loss, c_loss2, p_loss,v_loss = [],[],[],[]
    rets = []
    for episode in range(1000):
    #state = state[np.newaxis]
    #for step in range(200000):
        for step in range(500):   
            action = agent.policy.get_action(state[np.newaxis]) 
            next_state, reward, done, _ = env.step(action[0])

            ret +=  reward
            if (done or step % 400 == 0) and step != 0:
                state = env.reset()
                #print( f' return : {ret} after : {step}')
                rets.append(ret)
                ret = 0
                done = True
            else:           
                buffer.add(state, action[0], reward, next_state, done)

                state = next_state
                if step % 100 == 0:
                    for i in range(50):
                        loss_p, loss_c = agent.train_step()
                        c_loss.append(loss_c)
                        p_loss.append(loss_p)                    
        if episode % 10 == 0:
            ep_ret, ep_len = agent.test_agent(test_env, 5, 400)
            print(f'returns =  {np.mean(np.array(rets))} episode {episode} returns : {ep_ret} episode len : {ep_len} avg c_loss {np.mean(np.array(c_loss, np.float32))} avg p_loss {np.mean(np.array(p_loss, np.float32))} ')
            c_loss, p_loss = [], []
            rets = []
            print(f'return in episode : {episode} : {ret}')



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float

KeyboardInterrupt: 

In [15]:
agent = AgentDDPG(env, act_dim,act_limit, state_dim)
#buffer.initialize_replay_buffer(env)
agent.train_step()
ep_ret, ep_len = agent.test_agent(env)
print(ep_ret,ep_len)
s = env.reset()
acc = agent.policy.get_action(s[np.newaxis])
acc2 = agent.policy.get_action1(s[np.newaxis])
print(acc.numpy(),acc2.numpy())



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float