In [1]:
import tensorflow as tf
import gym
import numpy as np
#import tensorflow_probability as tfp


In [2]:
def make_env(env_name, seed=None):
    # remove time limit wrapper from environment
    env = gym.make(env_name).unwrapped
    if seed is not None:
        env.seed(seed)
    return env

In [3]:
env_name = 'CartPole-v0' #'LunarLanderContinuous-v2' #'Pendulum-v0' #   #'MountainCarContinuous-v0' #

env = gym.make(env_name)
#env.render()
#plt.imshow(env.render("rgb_array"))

#env = env.reset()

print(f'env : {env}')
state_shape, action_shape = env.observation_space.shape, env.action_space.shape
print('State shape: {}'.format(state_shape))
print('Action shape: {}'.format(action_shape))
print(f'action space {env.action_space} observation space : {env.observation_space}')
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n
print(state_dim)
print(n_actions)
tf.random.set_seed(336699)

env : <TimeLimit<CartPoleEnv<CartPole-v0>>>
State shape: (4,)
Action shape: ()
action space Discrete(2) observation space : Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
4
2


In [4]:
class Critic(tf.keras.Model):
    def __init__(self, state_dim):
        super(Critic,self).__init__()
        self.fc1 = tf.keras.layers.Dense(256, activation='relu')
        self.fc2 = tf.keras.layers.Dense(128,activation='relu')
        self.v = tf.keras.layers.Dense(1,activation='linear')
        
    def call(self,state):
        x = self.fc1(state)
        x = self.fc2(x)
        val = self.v(x)
        return tf.squeeze(val,axis=-1)


In [5]:
class Policy(tf.keras.Model):
    def __init__(self,state_dim, n_actions):
        super(Policy, self).__init__()
        self.fc1 = tf.keras.layers.Dense(256,activation='relu')
        self.fc2 = tf.keras.layers.Dense(128,activation='relu')
        self.prob = tf.keras.layers.Dense(n_actions, activation = 'softmax')

    def call(self, state):
        x = self.fc1(state)
        x = self.fc2(x)
        probs = self.prob(x)
        return tf.squeeze(probs)
        

In [6]:
## in case of ppo the buffer has to be cleared after every episode, lest the previous samples affect the working of the algorithm

class ReplayBufferPPO:
    def __init__(self, size=1e6):
        self.size = size #max number of items in buffer
        self.buffer =[] #array to holde buffer
        self.next_id = 0
    
    def __len__(self):
        return len(self.buffer)
    
    def add(self, state, action, prob, reward, next_state, done, val):
        item = (state, action, prob, reward, next_state, done, val)
        if len(self.buffer) < self.size:
            self.buffer.append(item)
        else:
            self.buffer[self.next_id] = item
        self.next_id = (self.next_id + 1) % self.size
        
    def sample(self, batch_size=32):
        idxs = len(self.buffer) #np.random.choice(len(self.buffer), batch_size)
        samples = [self.buffer[i] for i in range(idxs)]
        states, actions, probs, rewards, next_states, done_flags, vals = list(zip(*samples))
        return np.array(states,np.float32), np.array(actions,np.float32),np.array(probs, np.float32), np.array(rewards,np.float32), np.array(next_states,np.float32), np.array(done_flags), np.array(vals, np.float32)
    
    def to_tensors(self, state_dim, act_dim):
        states, actions, probs, rewards, next_states, done_flags, vals = self.sample(32)
        #print(type(states))
        #print(f' states {states} actions : {actions} probs : {probs} rewards : {rewards}:  next_states {next_states} dones flags : {done_flags} vals : {vals}')
        states = np.array(states,np.float32)
        states = np.reshape(states, (-1, state_dim))
    
        actions = np.reshape(actions, (-1)) # ,act_dim))

        probs = np.reshape(probs, (-1,act_dim))

        rewards = np.reshape(rewards,(-1,1))
        rewards = rewards.squeeze()

        next_states = np.array(next_states,np.float32)
        next_states = np.reshape(next_states, (-1, state_dim))
    
        done_flags = np.reshape(done_flags, (-1,1))
        done_flags = np.squeeze(done_flags)

        vals = np.reshape(vals, (-1))

        #print(f' states {states} actions : {actions} probs : {probs} rewards : {rewards}:  next_states {next_states} dones flags : {done_flags} vals : {vals}')

        state_ts = tf.convert_to_tensor(states, dtype= tf.float32)
        action_ts = tf.convert_to_tensor(actions, dtype=tf.int32) #used for indexing
        prob_ts = tf.convert_to_tensor(probs,dtype=tf.float32)
        reward_ts = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_state_ts = tf.convert_to_tensor(next_states,dtype=tf.float32)
        val_ts = tf.convert_to_tensor(vals, dtype=tf.float32)
    
        #print(f'Tensor states {state_ts} actions : {action_ts} probs: {prob_ts} rewards : {reward_ts}:  next_states {next_state_ts} dones flags : {done_flags} vals : {val_ts}')

        return state_ts, action_ts, prob_ts, reward_ts, next_state_ts, done_flags, val_ts
    def generate_trajectory(self,env, policy, critic, n_steps = 1000):
        state = env.reset()
        self.buffer = [] #has to be cleared or the working of the algorithms will be affected
        done = False
        while not done:
        #for _ in range(n_steps):
            prob = policy(state[np.newaxis])
            val = critic(state[np.newaxis])
            prob = np.array(prob,np.float32)
            action = np.random.choice(n_actions, p=prob)
            #action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            #print(f' s: {state} action {action} prob: {prob} reward {reward} next state : {next_state} done : {done} val : {val}')
            buffer.add(state, action, prob, reward, next_state, done, val)
            state = next_state
buffer = ReplayBufferPPO(10000)

In [7]:
class AgentPPO:
    def __init__(self, state_dim, n_actions, clip_val=0.2, learning_rate = 1e-3, gamma = 0.99):
        self.learning_rate_critic = learning_rate
        self.learning_rate_policy = learning_rate      
        self.gamma = gamma        
        self.act_dim = n_actions
        self.state_dim = state_dim
        self.clip_val = clip_val

        self.policy = Policy(state_dim,n_actions)
        self.critic = Critic(state_dim)

        self.critic_optimizer = tf.keras.optimizers.Adam(self.learning_rate_critic)
        self.policy_optimizer = tf.keras.optimizers.Adam(self.learning_rate_policy)
        

    def process(self,buffer):
        states, actions, probs, rewards, next_states, dones, vals = buffer.to_tensors(self.state_dim, self.act_dim)       
        rtg = np.zeros(len(rewards))
        rtg[len(rewards)-1] = rewards[len(rewards)-1]
        for i in range(len(rewards)-2,-1,-1):           
            rtg[i] = rewards[i] + self.gamma*rtg[i+1]

        #print('********')
        #rtg = rtg.numpy()
        vals = vals.numpy()
        advantages = rtg - vals        
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
        
        rtg = (rtg - rtg.mean())/(rtg.std()+1e-6)        
        
        rtg_ts = tf.convert_to_tensor(rtg, dtype= tf.float32)
        advantages_ts = tf.convert_to_tensor(advantages, dtype=tf.float32)
        #print(f' rtg tensor : {rtg_ts} advantages tensor : {advantages_ts}')
        
        return rtg_ts, advantages_ts, states, actions, probs 
    
    def critic_loss(self, states, rtg):
        with tf.GradientTape() as tape:
            vals = self.critic(states, training = True)
            
            c_loss = tf.reduce_mean((vals - rtg)**2) # tf.keras.losses.MSE(vals,rtg) # tf.reduce_mean((vals - rtg)**2)
            #print(f'vals : {vals} rtg : {rtg} closs : {c_loss}')
        grads = tape.gradient(c_loss,self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(grads,self.critic.trainable_variables))

        return c_loss

    def policy_loss(self, states, advantages, actions, old_probs, c_loss):
        surrogate1 = []
        surrogate2 = []
        
        #print(f'states: {states} old probs : {old_probs} actions : {actions}')
        with tf.GradientTape() as tape:
            new_probs = self.policy(states, training=True)
            #entropy = tf.reduce_mean(- tf.math.multiply(new_probs, tf.math.log(new_probs)))
            #print(f'new probs : {new_probs}')
            for pb, op, adv, act in zip(new_probs, old_probs, advantages, actions):               
                ratio = tf.divide(pb[act], op[act])                
                s1 = tf.multiply(ratio, adv)
                s2 = tf.multiply(tf.clip_by_value(ratio, 1.0 - self.clip_val, 1.0 + self.clip_val),adv)

                #print(f' pb : {pb} op : {op} adv :{adv} act :{act} ratio : {ratio} s1 :{s1} s2:{s2}')

                surrogate1.append(s1)
                surrogate2.append(s2)
            surrogate1 = tf.stack(surrogate1)
            surrogate2 = tf.stack(surrogate2)

            #print(f'surrogate 1 : {surrogate1} surrogate 2 :{surrogate2}')

            p_loss = tf.negative(tf.reduce_mean((tf.minimum(surrogate1,surrogate2)))) # - (0.3 * c_loss)) #,0.001*entropy)))
            #print(f' P loss : {p_loss}')
        grads = tape.gradient(p_loss, self.policy.trainable_variables)
        self.policy_optimizer.apply_gradients(zip(grads,self.policy.trainable_variables))

        return p_loss

    def train_episode(self,env, buffer, repeat_train_steps=10):
        loss_c, loss_p = [], []
        buffer.generate_trajectory(env, self.policy, self.critic)
        rtg, advantages, states, actions, probs = self.process(buffer)        
        for steps in range(repeat_train_steps):
            c_loss = self.critic_loss(states,rtg)
            p_loss = self.policy_loss(states, advantages, actions, probs, c_loss)
            loss_c.append(c_loss)
            loss_p.append(p_loss)
        return loss_c, loss_p

In [8]:
def test_reward(env,agent):
    total_reward = 0
    state = env.reset()
    done = False
    while not done:
        action = np.argmax(agent.policy(np.array([state])).numpy())
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
    return total_reward

In [9]:
with tf.device('GPU:0'):
    avg_rewards = []
    agent = AgentPPO(state_dim,n_actions)
    for epoch in range(200):
        policy_loss, critic_loss =  agent.train_episode(env,buffer,10)
        avg_reward = np.mean([test_reward(env, agent) for _ in range(5)])

        print(f'avg reward after iteration {epoch} = {avg_reward} critic loss =  {np.mean(np.array(critic_loss[-10:]))} policy loss = {np.mean(np.array(policy_loss[-10:]))} ')
        avg_rewards.append(avg_reward)
        if avg_reward > 190:
            break
    print(f' total rewards = {avg_rewards}')
    



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

avg reward after iteration 0 = 35.4 critic loss =  -0.017462752759456635 policy loss = 0.6538090705871582 
avg reward after iteration 1 = 41.4 critic loss =  -0.03546004742383957 policy loss = 1.3751798868179321 
avg reward after iteration 2 = 9.6 critic loss =  -0.04154008626937866 policy loss = 0.8756036758422852 
avg reward after iteration 3 = 9.6 critic loss =  0.0054579321295022964 policy l