In [1]:
##soft actor critic using value and Q funtion

import numpy as np
import scipy.signal
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
import tensorflow_probability as tfp
import random
from IPython.display import HTML
import pybullet_envs

%matplotlib inline

In [2]:
def make_env(env_name, seed=None):
    # remove time limit wrapper from environment
    env = gym.make(env_name).unwrapped
    if seed is not None:
        env.seed(seed)
    return env

In [3]:

env_name = 'LunarLanderContinuous-v2' #'Pendulum-v0' #   #'MountainCarContinuous-v0' #'AntBulletEnv-v0' #

env = make_env(env_name)

#plt.imshow(env.render("rgb_array"))

#env = env.reset()

print(f'env : {env}')
state_shape, action_shape = env.observation_space.shape, env.action_space.shape
print('State shape: {}'.format(state_shape))
print('Action shape: {}'.format(action_shape))
print(f'action space {env.action_space} observation space : {env.observation_space}')
print(f'action space bound :{env.action_space.low}, {env.action_space.high}')
act_limit = env.action_space.high[0]
act_dim = env.action_space.shape[0]
print(f'action limit = {act_limit} dimension {act_dim}')
state_dim = env.observation_space.shape[0]
print(state_dim)

env : <LunarLanderContinuous<LunarLanderContinuous-v2>>
State shape: (8,)
Action shape: (2,)
action space Box([-1. -1.], [1. 1.], (2,), float32) observation space : Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)
action space bound :[-1. -1.], [1. 1.]
action limit = 1.0 dimension 2
8


In [4]:
class Critic(tf.keras.Model):
    def __init__(self, state_dim, act_dim):
        super(Critic,self).__init__()
        self.initializer = tf.keras.initializers.RandomUniform(minval=-0.003, maxval=0.003)
        self.fc1 = tf.keras.layers.Dense(256, activation='relu')
        self.fc2 = tf.keras.layers.Dense(256, activation='relu')
        self.out = tf.keras.layers.Dense(1, activation='linear', kernel_initializer=self.initializer)
        
    def call(self, state, action):
        x = tf.concat([state, action], axis = -1)
        #print(x)
        x = self.fc1(x)
        x = self.fc2(x)
        q = self.out(x)
        return tf.squeeze(q, -1)

In [5]:
class Policy(tf.keras.Model):
    def __init__(self, state_dim, act_dim, act_limit, clip_val_min = -20, clip_val_max = 2):
        super(Policy, self).__init__()
        self.clip_val_min = clip_val_min
        self.clip_val_max = clip_val_max

        self.initializer = tf.keras.initializers.RandomUniform(minval=-0.003, maxval=0.003)
        self.fc1 = tf.keras.layers.Dense(256, activation='relu')
        self.fc2 = tf.keras.layers.Dense(256, activation='relu')
        self.mean = tf.keras.layers.Dense(act_dim, activation = 'linear', kernel_initializer=self.initializer)
        self.log_std_dev = tf.keras.layers.Dense(act_dim, activation='linear', kernel_initializer=self.initializer)
        
    def call(self, state):
        x = self.fc1(state)
        x = self.fc2(x)
        mu = self.mean(x)
        log_sigma = self.log_std_dev(x)
        
        log_sigma = tf.clip_by_value(log_sigma,clip_value_min=self.clip_val_min, clip_value_max=self.clip_val_max)
        return mu, log_sigma

    def eval(self, state, eps = 1e-6):
        mu, log_sigma = self.call(state)
        sigma = tf.exp(log_sigma)

        dist = tfp.distributions.Normal(mu, sigma)
        action_ = dist.sample()

        # Apply the tanh squashing to keep the gaussian bounded in (-1,1)
        action = tf.tanh(action_)

        # Calculate the log probability
        log_pi_ = dist.log_prob(action_)
        # Change log probability to account for tanh squashing as mentioned in paper
       
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(1 - action**2 + eps), axis=1, keepdims=True)

        return action, log_pi
    
    def eval_n(self, state, eps = 1e-6):
        mu, log_sigma = self.call(state)
        std_dev = tf.exp(log_sigma)

        normal = tfp.distributions.Normal(0,1)
        z = normal.sample() 

        action = tf.tanh( mu + std_dev * z)
        dist = tfp.distributions.Normal(mu, std_dev)
        log_prob = dist.log_prob(mu + std_dev*z) - tf.math.log(1-action**2 + eps)

        return action, log_prob

In [6]:
class ReplayBuffer:
    def __init__(self, size=1e6):
        self.size = size #max number of items in buffer
        self.buffer =[] #array to holde buffer
        self.next_id = 0
    
    def __len__(self):
        return len(self.buffer)
    
    def add(self, state, action, reward, next_state, done):
        item = (state, action, reward, next_state, done)
        if len(self.buffer) < self.size:
            self.buffer.append(item)
        else:
            self.buffer[self.next_id] = item
        self.next_id = (self.next_id + 1) % self.size
        
    def sample(self, batch_size=32):
        idxs = np.random.choice(len(self.buffer), batch_size)
        samples = [self.buffer[i] for i in idxs]
        states, actions, rewards, next_states, done_flags = list(zip(*samples))
        return np.array(states,np.float32), np.array(actions,np.float32), np.array(rewards,np.float32), np.array(next_states,np.float32), np.array(done_flags)
    
    def to_tensors(self, state_dim, act_dim):
        states, actions, rewards, next_states, done_flags = self.sample(32)
        #print(type(states))
        states = np.array(states,np.float32)
        states = np.reshape(states, (-1, state_dim))
    
        actions = np.reshape(actions, (-1,act_dim))
        rewards = np.reshape(rewards,(-1,1))
        rewards = rewards.squeeze()

        next_states = np.array(next_states,np.float32)
        next_states = np.reshape(next_states, (-1, state_dim))
    
        done_flags = np.reshape(done_flags, (-1,1))
        done_flags = np.squeeze(done_flags)

        ##print(f' states {states} actions : {actions} rewards : {rewards}:  next_states {next_states} dones flags : {done_flags}')

        state_ts = tf.convert_to_tensor(states, dtype= tf.float32)
        action_ts = tf.convert_to_tensor(actions, dtype=tf.float32)
        reward_ts = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_state_ts = tf.convert_to_tensor(next_states,dtype=tf.float32)
    
        ##print(f'Tensor states {state_ts} actions : {action_ts} rewards : {reward_ts}:  next_states {next_state_ts} dones flags : {done_flags}')

        return state_ts, action_ts, reward_ts, next_state_ts, done_flags
    def initialize_replay_buffer(self,env, n_steps = 1000):
        state = env.reset()
        for _ in range(n_steps):
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            #print(f' s: {state} action {action} reward {reward} next state : {next_state} done : {done}')
            buffer.add(state, action, reward, next_state, done)
            if done:
                state = env.reset()
            state = next_state
buffer = ReplayBuffer(100000)

In [7]:
class AgentSAC:
    def __init__(self, act_dim,act_limit, state_dim, learning_rate = 1e-3, alpha = 0.2, gamma = 0.99, polyak = 0.95):
        #super(self).__init__()
        self.learning_rate_critic = learning_rate
        self.learning_rate_policy = 1e-3
        self.polyak = polyak
        self.gamma = gamma
        self.alpha = alpha
        self.act_dim = act_dim
        self.state_dim = state_dim

        self.critic1 = Critic(state_dim, act_dim)
        self.critic2 = Critic(state_dim, act_dim)

        self.policy = Policy(state_dim,act_dim,act_limit)

        self.target_critic1 = Critic(state_dim, act_dim)
        self.target_critic2 = Critic(state_dim, act_dim)

        #### instantiate networks
        state = env.reset() 
        act = env.action_space.sample()

        _ = self.critic1(state[np.newaxis],act[np.newaxis])
        _ = self.critic2(state[np.newaxis],act[np.newaxis])
        _ = self.target_critic1(state[np.newaxis],act[np.newaxis])
        _ = self.target_critic2(state[np.newaxis],act[np.newaxis])

        _, _ = self.policy(state[np.newaxis])
    

        self.target_critic1.set_weights(self.critic1.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())
        self.target_critic1.trainable = False
        self.target_critic2.trainable = False

        self.critic_optimizer1 = tf.keras.optimizers.Adam(self.learning_rate_critic)
        self.critic_optimizer2 = tf.keras.optimizers.Adam(self.learning_rate_critic)
        self.policy_optimizer = tf.keras.optimizers.Adam(self.learning_rate_policy)

    def polyak_update(self, target_network, network):
        updated_model_weights = []
        for weights, weights_target in zip(network.get_weights(), target_network.get_weights()):
            new_weights = self.polyak*weights_target+(1-self.polyak)*weights
            updated_model_weights.append(new_weights)
        target_network.set_weights(updated_model_weights)

    def compute_q_loss(self, states, actions, rewards, next_states, dones):
        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            q_val1 = self.critic1(states,actions)
            q_val2 = self.critic2(states,actions)

            acts, log_pis = self.policy.eval_n(next_states) #should be next_states

            #print(f'log pis before reduction : {log_pis}')

            q_tar1 = self.target_critic1(next_states,acts)            
            q_tar2 = self.target_critic2(next_states,acts)

            min_tar = tf.minimum(q_tar1,q_tar2)

            log_pis = tf.squeeze(log_pis)
            #print(f'log pis after squeeze: {log_pis}')
            log_pis = tf.reduce_mean(log_pis,axis=-1)
            #print(f'log pis after reduction : {log_pis}')

            softq = min_tar - self.alpha * log_pis
            target = rewards + self.gamma*(1-dones)*softq

            critic_loss1 = tf.reduce_mean((q_val1 - target)**2)
            critic_loss2 = tf.reduce_mean((q_val2 - target)**2)

            #print(f'q_val1 : {q_val1} qval2 : {q_val2} q_tar1 : {q_tar1} qtar2 : {q_tar2} acts = {acts} logpis = {log_pis} min_tar = : {min_tar} softq : {softq} target: {target} critic loss :{critic_loss1} :: {critic_loss2}' )
        grads1 = tape1.gradient(critic_loss1, self.critic1.trainable_variables)
        self.critic_optimizer1.apply_gradients(zip(grads1, self.critic1.trainable_variables))

        grads2 = tape2.gradient(critic_loss2,self.critic2.trainable_variables)
        self.critic_optimizer2.apply_gradients(zip(grads2,self.critic2.trainable_variables))

        return critic_loss1, critic_loss2
    
    def compute_p_loss(self, states):
        with tf.GradientTape() as tape:
            actions, log_pis = self.policy.eval_n(states)
            q_val1 = self.critic1(states,actions)
            q_val2 = self.critic2(states, actions)

            min_q = tf.minimum(q_val1,q_val2)

            #print(f'log pis before reduction : {log_pis}')
            log_pis = tf.squeeze(log_pis)
            log_pis = tf.reduce_mean(log_pis,axis=-1)

            softq = min_q - self.alpha * log_pis
            actor_loss = - tf.reduce_mean(softq)
            #print(f'q_val1 : {q_val1} qval2 : {q_val2} log_pis : {log_pis} softq : {softq} actor loss : {actor_loss}')
        grads = tape.gradient(actor_loss, self.policy.trainable_variables)
        self.policy_optimizer.apply_gradients(zip(grads,self.policy.trainable_variables))

        return actor_loss
    
    def train(self):
        states, actions, rewards, next_states, dones = buffer.to_tensors(self.state_dim,self.act_dim)
        c_loss1, c_loss2 = self.compute_q_loss(states,actions,rewards,next_states, dones)
        self.critic1.trainable = False
        self.critic2.trainable = False
        p_loss = self.compute_p_loss(states)

        self.critic1.trainable = True
        self.critic2.trainable = True
        
        self.polyak_update(self.target_critic1, self.critic1)
        self.polyak_update(self.target_critic2, self.critic2)

In [53]:
gamma = 0.99
with tf.device('GPU:0'):

    test_env = gym.make(env_name)
    max_ep_len = []
    loss_qval, loss_pval = [], []
    ep_reward = []
    total_avg_reward = []

    num_episodes = 5000
    num_steps = 0
    target = False
    steps = 0
    buffer.initialize_replay_buffer(env)
    agent = AgentSAC(act_dim, act_limit, state_dim)

    for eps in range(num_episodes):
        if target == True:
            break
        state = env.reset()
        ret = 0
        ep_reward = []
        done = False
        count = 0

    #for steps in range(num_steps):
        while count < 300:
            action,_ =  agent.policy.eval_n(state[np.newaxis])
            #print(action)
            next_state, reward, done, _ = env.step(action[0])
            #print(f' s: {state} actins {action} reward {reward} next states : {next_state} done : {done}')
            buffer.add(state, action[0], reward, next_state, done)

            count += 1
            if count % 5 == 0:
              agent.train()
              
            state = next_state
            ret += reward
            total_avg_reward.append(ret)               
            if done:
                break
        steps += 1
        if steps % 10 == 0:
            avg_rew = np.mean(total_avg_reward[-10:])
            print(f'after {steps} avg reward : {avg_rew}')

after 10 avg reward : -20.86916624335529
after 20 avg reward : -28.52709590437225
after 30 avg reward : -41.28292956990454
after 40 avg reward : 78.6040414986492
after 50 avg reward : 54.827848405855875
after 60 avg reward : -76.78229645959075
after 70 avg reward : -35.146489019754185
after 80 avg reward : 62.86156640464973
after 90 avg reward : -245.68267751463532
after 100 avg reward : -45.47875407530154
after 110 avg reward : 11.129116879513196
after 120 avg reward : -50.92521181631621
after 130 avg reward : 17.114246862745507
after 140 avg reward : 34.64452619020713
after 150 avg reward : -25.388663695123075
after 160 avg reward : 65.98124797652007
after 170 avg reward : 65.52826287499302
after 180 avg reward : 23.328789487261584
after 190 avg reward : 54.101622321708376
after 200 avg reward : 42.84557603224668
after 210 avg reward : 15.546715747631643
after 220 avg reward : 6.275468224453639
after 230 avg reward : 13.937769320497974
after 240 avg reward : 46.59302864438244
after 2

In [1]:
def test_agent(env, num_test_episodes, max_ep_len):
    ep_rets, ep_lens = [], []
    for j in range(num_test_episodes):
        state, done, ep_ret, ep_len = env.reset(), False, 0, 0
        while not(done or (ep_len == max_ep_len)):
            # Take deterministic actions at test time (noise_scale=0)
            ##env.render()
            #print(state)
            act1,_ = agent.policy.eval_n(state[np.newaxis])
            #print(act)
            state, reward, done, _ = env.step(act1[0])
            ep_ret += reward
            ep_len += 1
        ep_rets.append(ep_ret)
        ep_lens.append(ep_len)
    return np.mean(ep_rets), np.mean(ep_lens)