# Soft Actor-Critic (SAC) Implementation
Implementation of newest style of SAC with two Q Networks and temperature parameter alpha optimization/tuning.

In [1]:
import numpy as np
import os
import tensorflow as tf
import tensorflow_probability as tfp  # Distributions with reparam trick to gradients flow through
from tensorflow.keras.layers import Dense
from tensorflow.keras import initializers  # for layer weight initializing
import tensorflow.keras.losses as losses
import gym
import pybullet_envs
import matplotlib.pyplot as plt

In [2]:
class ReplayBuffer:
    def __init__(self, n_actions, input_shape, max_size=1000000):
        self.mem_size = max_size

        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)

        self.n_count = 0

    def store_transition(self, state, action, reward, state_, done):
        index = self.n_count % self.mem_size

        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done

        self.n_count += 1

    def sample_buffer(self, batch_size=256):
        max_mem = min(self.n_count, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones

In [3]:
class CriticNetwork(tf.keras.Model):
    def __init__(self, n_states, n_actions, fc1_dims, fc2_dims, network_name, chkpt_dir='tmp/SAC', init_w=3e-3):
        super(CriticNetwork, self).__init__()

        self.network_name = network_name
        self.checkpoint_dir = chkpt_dir
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        self.checkpoint_file = os.path.join(self.checkpoint_dir, network_name + '_SAC')

        self.fc1 = Dense(units=fc1_dims, activation='relu', input_shape=(n_states + n_actions, ))
        self.fc2 = Dense(units=fc2_dims, activation='relu')
        self.q = Dense(units=1,
                       kernel_initializer=initializers.RandomUniform(minval=-init_w, maxval=init_w), # Änderung
                       bias_initializer=initializers.RandomUniform(minval=-init_w, maxval=init_w))

    def call(self, state, action):
        inputs = tf.concat([state, action], axis=1)
        x = self.fc1(inputs)
        x = self.fc2(x)
        return self.q(x) 


class ActorNetwork(tf.keras.Model):
    def __init__(self,
                 n_states,
                 n_actions,
                 fc1_dims,
                 fc2_dims,
                 network_name,
                 chkpt_dir='tmp/SAC',
                 init_w=3e-3,
                 log_std_min=-20,
                 log_std_max=2):
        super(ActorNetwork, self).__init__()
        
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        self.network_name = network_name
        self.checkpoint_dir = chkpt_dir
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        self.checkpoint_file = os.path.join(self.checkpoint_dir, network_name + '_SAC')
        
        self.fc1 = Dense(units=fc1_dims, activation='relu', input_shape=(n_states, ))
        self.fc2 = Dense(units=fc2_dims, activation='relu')
        
        self.mu = Dense(units=n_actions,
                      kernel_initializer=initializers.RandomUniform(minval=-init_w, maxval=init_w),
                      bias_initializer=initializers.RandomUniform(minval=-init_w, maxval=init_w))
        
        self.log_std = Dense(units=n_actions,
                      kernel_initializer=initializers.RandomUniform(minval=-init_w, maxval=init_w),
                      bias_initializer=initializers.RandomUniform(minval=-init_w, maxval=init_w))
        

    def call(self, state):
        x = self.fc1(state)
        x = self.fc2(x)
        mu = self.mu(x)
        
        log_std = self.log_std(x)
        log_std = tf.clip_by_value(log_std, clip_value_min=self.log_std_min, clip_value_max=self.log_std_max)
        std = tf.exp(log_std)
        
        normal = tfp.distributions.Normal(mu, std)  # make Gaussian distribution of mu, and sigma for actions
        
        z = normal.sample()  # sample from distribution with reparam trick
        action = tf.tanh(z)  # bound actions to [-1, +1]
        
        # correct log_probs because of bounding the actions
        log_prob = normal.log_prob(z) - tf.math.log(1 - tf.math.square(action) + 1e-6)  # Ist dasselbe!
        log_prob = tf.reduce_sum(log_prob, axis=1, keepdims=True) # tf.reduce_sum() or tf.reduce_mean()

        return action, log_prob
    

In [4]:
class Agent:
    """ 2019 State-of-the-Art Implementation of SAC with optimized temperature

    """
    def __init__(self,
                 env,
                 lr_Q = 3e-4,
                 lr_actor = 3e-4,
                 lr_a = 3e-4,
                 gamma=0.99,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 max_size=1000000,
                 warmup=1000,
                 policy_delay=1,
                 minimum_entropy=None):

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]

        self.n_states = env.observation_space.shape[0]
        self.n_actions = env.action_space.shape[0]

        self.min_action = env.action_space.low
        self.max_action = env.action_space.high

        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.warmup = warmup
        self.time_step = 0
        self.update_step = 0
        self.policy_delay = policy_delay

        self.policy_net = ActorNetwork(n_states=self.n_states, n_actions=self.n_actions,
                                       fc1_dims=layer1_size, fc2_dims=layer2_size, network_name='Actor')

        self.q_net1 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions,
                                    fc1_dims=layer1_size, fc2_dims=layer2_size, network_name='Critic_1')

        self.q_net2 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions,
                                    fc1_dims=layer1_size, fc2_dims=layer2_size, network_name='Critic_2')

        self.target_q_net1 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions,
                                           fc1_dims=layer1_size, fc2_dims=layer2_size, network_name='Target_Critic_1')

        self.target_q_net2 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions,
                                           fc1_dims=layer1_size, fc2_dims=layer2_size, network_name='Target_Critic_2')

        self.replay_buffer = ReplayBuffer(n_actions=self.n_actions,
                                          input_shape=self.n_states,
                                          max_size=max_size)
        
        self.policy_net.compile(optimizer=tf.keras.optimizers.Adam(lr=lr_actor))
        self.q_net1.compile(optimizer=tf.keras.optimizers.Adam(lr=lr_Q))
        self.q_net2.compile(optimizer=tf.keras.optimizers.Adam(lr=lr_Q))

        self.update_target_network_parameters(tau=1)  # copy parameters to target networks

        # entropy temperature parameter alpha
        #self.log_alpha = tf.Variable(0.0, dtype=tf.float32)
        print(-tf.constant(env.action_space.shape[0], dtype=tf.float32))
        
        self.log_alpha = tf.Variable(tf.zeros(1), trainable=True)
        self.minimum_entropy = -tf.reduce_prod(tf.convert_to_tensor(env.action_space.shape, dtype=tf.float32))
        self.minimum_entropy = -tf.reduce_prod(tf.convert_to_tensor(env.action_space.shape, dtype=tf.float32)) if minimum_entropy is None else minimum_entropy
        print('Minimum Entropy set to: ', self.minimum_entropy)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_a)
        self.alpha = tf.exp(self.log_alpha).numpy()
        print('alpha: ', self.alpha)
        

    def choose_action(self, state):
        if self.time_step < self.warmup:
            actions = np.random.uniform(low=-1.0, high=1.0, size=self.n_actions)  # "random uniform distribution over all valid actions"
            actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        else:
            state = tf.convert_to_tensor(state, dtype=tf.float32)
            state = tf.expand_dims(state, axis=0)
            actions, _ = self.policy_net(state)

        self.time_step += 1
        if self.time_step == self.warmup:
            print('No warmup anymore!')
        a = self.rescale_action(actions[0].numpy())
        return a

    def scale_action(self, action):
        """ Scale all actions to [-1., +1.]

        :param action: unscaled actions
        :return: scaled actions all in range -1. .. +1.
        """
        #old = 2 * (action - self.min_action) / (self.max_action - self.min_action) - 1.0
        scale = (2 * action - (self.action_range[1] + self.action_range[0])) / (self.action_range[1] - self.action_range[0])
        return scale

    def rescale_action(self, action):
        """ Rescale all scaled actions to environment actionspace values

        :param action: scaled actions
        :return: rescaled actions all in range min_action .. max_action
        """
        #old = (action + 1.0) * (self.max_action - self.min_action) / 2.0 + self.min_action
        rescale = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
               (self.action_range[1] + self.action_range[0]) / 2.0
        return rescale

    def remember(self, state, action, reward, new_state, done):
        action = self.scale_action(action)  # ÄNDERUNG! Funktioniert das mit?
        self.replay_buffer.store_transition(state, action, reward, new_state, done)

    def update_target_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        
        weights = []
        for theta_target, theta in zip(self.target_q_net1.get_weights(), 
                                       self.q_net1.get_weights()):
            theta_target = tau*theta + (1-tau)*theta_target
            weights.append(theta_target)
        self.target_q_net1.set_weights(weights)
        
        weights = []
        for theta_target, theta in zip(self.target_q_net2.get_weights(),
                                       self.q_net2.get_weights()):
            theta_target = tau*theta + (1-tau)*theta_target
            weights.append(theta_target)
        self.target_q_net2.set_weights(weights)
        
        #weights = []
        #theta_target = self.target_q_net1.weights
        #for i, theta in enumerate(self.q_net1.weights):
        #    weights.append(tau*theta + (1-tau)*theta_target[i])
        #self.target_q_net1.set_weights(weights)
        # 
        #weights = []
        #theta_target = self.target_q_net2.weights
        #for i, theta in enumerate(self.q_net2.weights):
        #    weights.append(tau*theta + (1-tau)*theta_target[i])
        #self.target_q_net2.set_weights(weights)
        

    def save_models(self):
        print('...save models...') # To Do!

    def load_models(self):
        print('...load models...') # To Do!

    def learn(self):
        if self.replay_buffer.n_count < self.batch_size:
            return
        elif self.replay_buffer.n_count == self.batch_size:
            print('Buffer Size equals batch Size! - Learning begins :)')
            return
        
        # sample batch from replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample_buffer(batch_size=self.batch_size)
        
        # convert batchs from 2D numpy arrays to tensorflow tensors
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
        
        # expand rewards and dones from 1D numpy arrays to 2D tensors and reshape them
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        rewards = tf.expand_dims(rewards, axis=0)
        rewards = tf.reshape(rewards, [self.batch_size, 1])
        dones = tf.convert_to_tensor(dones, dtype=tf.float32)
        dones = tf.expand_dims(dones, axis=0)
        dones = tf.reshape(dones, [self.batch_size, 1])
        
        
        ## Update critic networks Q1 & Q2
        with tf.GradientTape(persistent=True) as tape_Q:
            next_actions, next_log_pi = self.policy_net(next_states)
            Q1_next = self.target_q_net1(next_states, next_actions)
            Q2_next = self.target_q_net2(next_states, next_actions)
            next_q_target = tf.minimum(Q1_next, Q2_next) - self.alpha * next_log_pi
            expected_q = tf.stop_gradient(rewards + (1 - dones) * self.gamma * next_q_target)
            
            curr_q1 = self.q_net1(states, actions)
            curr_q2 = self.q_net2(states, actions)
            
            q1_loss = tf.reduce_mean((curr_q1 - expected_q)**2)
            q2_loss = tf.reduce_mean((curr_q2 - expected_q)**2)  # tf.square()
            q_loss = q1_loss + q2_loss
        
        grad_Q1 = tape_Q.gradient(q_loss, self.q_net1.trainable_variables)
        grad_Q2 = tape_Q.gradient(q_loss, self.q_net2.trainable_variables)
        
        self.q_net1.optimizer.apply_gradients(zip(grad_Q1, self.q_net1.trainable_variables))
        self.q_net2.optimizer.apply_gradients(zip(grad_Q2, self.q_net2.trainable_variables))
        

        ## Update policy network and polyak update target Q networks less frequently (like in TD3 --> Delayed SAC)
        if self.update_step % self.policy_delay == 0:
            with tf.GradientTape() as tape_policy:
                new_actions, log_pi = self.policy_net(states)
                Q1 = self.q_net1(states, new_actions)
                Q2 = self.q_net2(states, new_actions)
                Q_min = tf.minimum(Q1, Q2)
                loss_policy = tf.reduce_mean(self.alpha * log_pi - Q_min)
            
            grad_policy = tape_policy.gradient(loss_policy, self.policy_net.trainable_variables)
            self.policy_net.optimizer.apply_gradients(zip(grad_policy, self.policy_net.trainable_variables))

            
            self.update_target_network_parameters()  # update target networks

        ## Update temperature
        with tf.GradientTape() as tape:
            _, log_pi_a = self.policy_net(states)
            alpha_loss = tf.reduce_mean(self.log_alpha*(-log_pi_a - self.minimum_entropy))
            
        grads = tape.gradient(alpha_loss, [self.log_alpha])
        self.alpha_optimizer.apply_gradients(zip(grads, [self.log_alpha]))
        self.alpha = tf.exp(self.log_alpha).numpy()

        self.update_step += 1  # Keep track of the number of network updates


In [5]:
def plot_learning_curve(scores, figure_file, Ylabel, color, avg_color=None , plot_folder='./plots', Xlabel='Episodes'):
    if not os.path.exists(plot_folder):
        os.makedirs(plot_folder)
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i - 100):i + 1])

    plt.figure()
    plt.plot(scores, color=color)
    if avg_color is not None:
        plt.plot(running_avg, color=avg_color)
    plt.xlabel(Xlabel)
    plt.ylabel(Ylabel)
    plt.savefig(figure_file)

In [6]:
learnID = '1'
n_games = 300 #1500

#RANDOM_SEED = 90

#env_id = 'InvertedPendulumBulletEnv-v0'
#env_id = 'Pendulum-v0'
env_id = 'LunarLanderContinuous-v2'
#env_id = 'Pendulum-v0'
#env_id = 'BipedalWalker-v3'

env = gym.make(env_id)

#tf.random.set_seed(RANDOM_SEED)
#env.seed(RANDOM_SEED)
#env.action_space.seed(RANDOM_SEED)
#np.random.seed(RANDOM_SEED)

agent = Agent(env=env)

filename_return = env_id + '_SAC_' + 'return_' + learnID
filename_alpha = env_id + '_SAC_' + 'alpha_' + learnID
figure_file_return = 'plots/' + filename_return
figure_file_alpha = 'plots/' + filename_alpha

best_score = env.reward_range[0]
score_history = []
alpha_history = []

steps = 0
for i in range(n_games):
    score = 0.0
    alpha = []
    done = False
    observation = env.reset()
    while not done:
        steps += 1
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
        score += reward
        alpha.append(agent.alpha)

    score_history.append(score)
    alpha_history.append(np.mean(alpha))
    avg_score = np.mean(score_history[-100:])
    if avg_score > best_score:
        best_score = avg_score
        agent.save_models()
    print('episode ', i, 'score %.1f' % score, 'avg score %1.f' % avg_score, 'steps ', steps, 'alpha ', np.mean(alpha))
    
plot_learning_curve(score_history, figure_file_return+'.png', color='lightgreen', avg_color='green', Ylabel='Return')
plot_learning_curve(alpha_history, figure_file_alpha+'.png', color='blue', Ylabel='Temperature alpha')

np.save(figure_file_return, score_history)


tf.Tensor(-2.0, shape=(), dtype=float32)
Minimum Entropy set to:  tf.Tensor(-2.0, shape=(), dtype=float32)
alpha:  [1.]
...save models...
episode  0 score -67.3 avg score -67 steps  127 alpha  1.0
Buffer Size equals batch Size! - Learning begins :)
episode  1 score -97.8 avg score -83 steps  266 alpha  0.99988145
episode  2 score -183.7 avg score -116 steps  379 alpha  0.9803062
episode  3 score -249.5 avg score -150 steps  524 alpha  0.9456728
episode  4 score -191.9 avg score -158 steps  655 alpha  0.9102256
episode  5 score -97.9 avg score -148 steps  797 alpha  0.8766789
episode  6 score -378.3 avg score -181 steps  899 alpha  0.8474386
episode  7 score -455.6 avg score -215 steps  989 alpha  0.8251228
No warmup anymore!
episode  8 score -153.8 avg score -208 steps  1096 alpha  0.80275446
episode  9 score -526.8 avg score -240 steps  1192 alpha  0.7806522
episode  10 score -211.3 avg score -238 steps  1268 alpha  0.76147735
episode  11 score -199.6 avg score -234 steps  1389 alpha 

...save models...
episode  114 score 131.6 avg score -45 steps  83916 alpha  0.030874776
...save models...
episode  115 score -26.1 avg score -44 steps  84916 alpha  0.02953326
...save models...
episode  116 score -47.0 avg score -44 steps  85916 alpha  0.029107803
...save models...
episode  117 score -12.2 avg score -42 steps  86916 alpha  0.02913047
...save models...
episode  118 score 40.1 avg score -40 steps  87916 alpha  0.029176759
...save models...
episode  119 score 16.6 avg score -40 steps  88916 alpha  0.031058097
...save models...
episode  120 score -43.4 avg score -40 steps  89916 alpha  0.031413935
...save models...
episode  121 score -41.7 avg score -39 steps  90916 alpha  0.032012265
...save models...
episode  122 score -3.5 avg score -39 steps  91916 alpha  0.032061722
...save models...
episode  123 score -3.6 avg score -37 steps  92916 alpha  0.030255893
...save models...
episode  124 score -4.2 avg score -36 steps  93916 alpha  0.029300047
...save models...
episode  1

episode  216 score 2.3 avg score -9 steps  184156 alpha  0.019265462
episode  217 score -12.7 avg score -9 steps  185156 alpha  0.019992711
episode  218 score -20.7 avg score -9 steps  186156 alpha  0.020088326
episode  219 score 3.0 avg score -9 steps  187156 alpha  0.019450013
episode  220 score -21.5 avg score -9 steps  188156 alpha  0.019926125
episode  221 score -21.0 avg score -9 steps  189156 alpha  0.01987983
episode  222 score -35.7 avg score -9 steps  190156 alpha  0.019561473
episode  223 score -35.0 avg score -10 steps  191156 alpha  0.019610684
episode  224 score -4.3 avg score -10 steps  192156 alpha  0.020049548
episode  225 score -19.0 avg score -10 steps  193156 alpha  0.01947803
episode  226 score 3.6 avg score -9 steps  194156 alpha  0.018820459
episode  227 score -7.4 avg score -9 steps  195156 alpha  0.018066177
episode  228 score 8.7 avg score -9 steps  196156 alpha  0.018324722


KeyboardInterrupt: 