#### Code Reference: https://github.com/pasus/Reinforcement-Learning-Book-Revision <br/>Code Reference: https://github.com/zhihanyang2022/pytorch-sac <br/> The referenced code was modified so that it can be applied to a Unity environment.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, Lambda, concatenate
from tensorflow.python.keras.optimizers import adam_v2
import tensorflow as tf
import tensorflow_probability as tfp
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

from replaybuffer import ReplayBuffers

In [None]:
class Actor(Model):
    """Gaussian policy network mapping a state batch to per-action mean and std.

    The mean head uses ``tanh`` and the std head uses ``softplus``; the std is
    additionally clipped to ``std_bound`` in ``call``.
    """

    def __init__(self, action_dim):
        super().__init__()

        self.action_dim = action_dim
        # [lower, upper] limits applied to the predicted standard deviation.
        self.std_bound = [1e-2, 1.0]

        # Hidden trunk followed by the two output heads.
        self.h1 = Dense(128, activation='relu')
        self.h2 = Dense(64, activation='relu')
        self.h3 = Dense(32, activation='relu')
        self.h4 = Dense(16, activation='relu')
        self.mu = Dense(action_dim, activation='tanh')
        self.std = Dense(action_dim, activation='softplus')

    def call(self, state):
        """Return ``(mu, std)`` for ``state``; ``std`` is clipped to ``std_bound``."""
        features = state
        for hidden in (self.h1, self.h2, self.h3, self.h4):
            features = hidden(features)

        mean = self.mu(features)
        raw_std = self.std(features)

        lower, upper = self.std_bound[0], self.std_bound[1]
        return mean, tf.clip_by_value(raw_std, lower, upper)

    def sample_normal(self, mu, std):
        """Sample an action from ``Normal(mu, std)`` and return it with its log-density.

        The sample is returned as drawn (no ``tanh`` squashing is applied).
        The per-dimension log-probabilities are summed over axis 1 with
        ``keepdims=True``.
        """
        policy = tfp.distributions.Normal(mu, std)
        sampled = policy.sample()
        # The sample could be squashed with tf.tanh here; this project leaves it
        # unsquashed because the agent's velocity is part of the observation.

        summed_log_prob = tf.reduce_sum(policy.log_prob(sampled), axis=1, keepdims=True)
        return sampled, summed_log_prob

In [None]:
class Critic(Model):
    """Q-network: embeds state and action separately, then scores the pair."""

    def __init__(self):
        super().__init__()

        # Separate first-layer embeddings for the state and the action.
        self.x1 = Dense(64, activation='relu')
        self.a1 = Dense(64, activation='relu')
        # Shared trunk applied to the concatenated embeddings.
        self.h2 = Dense(64, activation='relu')
        self.h3 = Dense(32, activation='relu')
        self.h4 = Dense(16, activation='relu')
        # Single linear output: the Q-value.
        self.q = Dense(1, activation='linear')

    def call(self, state_action):
        """Return Q for ``state_action``, a pair indexed as ``[state, action]``."""
        state_embedding = self.x1(state_action[0])
        action_embedding = self.a1(state_action[1])

        hidden = concatenate([state_embedding, action_embedding], axis=-1)
        for layer in (self.h2, self.h3, self.h4):
            hidden = layer(hidden)
        return self.q(hidden)

In [None]:
class SACagent(object):
    """Soft Actor-Critic agent with twin critics and soft-updated target critics.

    The agent owns one ``Actor``, two ``Critic`` networks with matching target
    copies, their optimizers and a ``ReplayBuffers`` instance. Every access to
    the Unity step data uses index 0, i.e. the code assumes a single agent
    per behavior.
    """

    # Directory and file names used by both ``train`` (save) and ``load_weights``.
    _SAVE_DIR = "./save_weights/"
    _ACTOR_FILE = 'ParkingEnv_actor_2q.h5'
    _CRITIC_1_FILE = 'ParkingEnv_critic_12q.h5'
    _CRITIC_2_FILE = 'ParkingEnv_critic_22q.h5'

    def __init__(self, N_STATES, N_ACTIONS):
        """Build the networks, optimizers and replay buffer.

        Args:
            N_STATES: length of the flat state vector fed to the networks.
            N_ACTIONS: number of action components produced by the actor.
        """
        # Hyperparameters
        self.GAMMA = 0.99
        self.BATCH_SIZE = 1000
        self.BUFFER_SIZE = 10000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001
        self.ALPHA = 0.5

        # Observation space and action space sizes
        self.state_dim = N_STATES
        self.action_dim = N_ACTIONS

        # Build Actor, Q1, Q2 and the target copies of Q1/Q2
        self.actor = Actor(self.action_dim)
        self.actor.build(input_shape=(None, self.state_dim))

        self.critic_1 = Critic()
        self.target_critic_1 = Critic()

        self.critic_2 = Critic()
        self.target_critic_2 = Critic()

        # Call each critic once on symbolic inputs so its weights are created.
        state_in = Input((self.state_dim,))
        action_in = Input((self.action_dim,))
        self.critic_1([state_in, action_in])
        self.target_critic_1([state_in, action_in])
        self.critic_2([state_in, action_in])
        self.target_critic_2([state_in, action_in])

        self.actor.summary()
        self.critic_1.summary()
        self.critic_2.summary()

        # One optimizer per network. Previously a single Adam instance (with
        # the actor learning rate) was shared by all three networks, which
        # left CRITIC_LEARNING_RATE unused and shared the step counter.
        self.actor_opt = adam_v2.Adam(learning_rate=self.ACTOR_LEARNING_RATE)
        self.critic_1_opt = adam_v2.Adam(learning_rate=self.CRITIC_LEARNING_RATE)
        self.critic_2_opt = adam_v2.Adam(learning_rate=self.CRITIC_LEARNING_RATE)

        # Fresh replay buffer
        self.buffer = ReplayBuffers(self.BUFFER_SIZE)

        # Data stored for plotting.
        self.policy_lost = []
        self.reward_list = []

    @staticmethod
    def _build_state(steps):
        """Concatenate agent 0's two observation vectors into one flat state.

        ``steps.obs[0][0]`` is the ray-perception observation and
        ``steps.obs[1][0]`` is the agent velocity (x, z).
        """
        return np.concatenate((steps.obs[0][0], steps.obs[1][0]), 0)

    def get_action(self, state):
        """Sample an action for ``state`` from the current policy as a NumPy array."""
        mu, std = self.actor(state)
        action, _ = self.actor.sample_normal(mu, std)
        return action.numpy()

    def update_target_network(self, TAU):
        """Blend critic weights into the target critics.

        Each target weight becomes ``TAU * online + (1 - TAU) * target``;
        ``TAU=1.0`` copies the online weights outright.
        """
        pairs = (
            (self.critic_1, self.target_critic_1),
            (self.critic_2, self.target_critic_2),
        )
        for online, target in pairs:
            blended = [
                TAU * w + (1 - TAU) * target_w
                for w, target_w in zip(online.get_weights(), target.get_weights())
            ]
            target.set_weights(blended)

    def critic_learn(self, states, actions, q_targets):
        """Take one mean-squared-error gradient step on each critic."""
        with tf.GradientTape() as tape:
            q_1 = self.critic_1([states, actions], training=True)
            loss_1 = tf.reduce_mean(tf.square(q_1 - q_targets))

        grads_1 = tape.gradient(loss_1, self.critic_1.trainable_variables)
        self.critic_1_opt.apply_gradients(zip(grads_1, self.critic_1.trainable_variables))

        with tf.GradientTape() as tape:
            q_2 = self.critic_2([states, actions], training=True)
            loss_2 = tf.reduce_mean(tf.square(q_2 - q_targets))

        grads_2 = tape.gradient(loss_2, self.critic_2.trainable_variables)
        self.critic_2_opt.apply_gradients(zip(grads_2, self.critic_2.trainable_variables))

    def actor_learn(self, states):
        """Take one policy-gradient step and return the loss as a Python float.

        The loss is ``mean(ALPHA * log_pi(a|s) - min(Q1, Q2)(s, a))`` with
        ``a`` sampled from the current policy.
        """
        with tf.GradientTape() as tape:
            mu, std = self.actor(states, training=True)
            actions, log_pdfs = self.actor.sample_normal(mu, std)
            soft_q_1 = self.critic_1([states, actions])
            soft_q_2 = self.critic_2([states, actions])
            soft_q = tf.math.minimum(soft_q_1, soft_q_2)

            # log_pdfs keeps its (batch, 1) shape so it lines up element-wise
            # with soft_q. Squeezing it to (batch,) made the subtraction
            # broadcast to (batch, batch): same mean, but batch^2 elements.
            loss = tf.reduce_mean(self.ALPHA * log_pdfs - soft_q)

        grads = tape.gradient(loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(zip(grads, self.actor.trainable_variables))
        return float(loss)

    def q_target(self, rewards, q_values, dones):
        """Compute the TD targets for a batch.

        Args:
            rewards: per-sample rewards, shape ``(batch,)`` or ``(batch, 1)``.
            q_values: (entropy-adjusted) next-state values.
            dones: per-sample terminal flags, same layouts as ``rewards``.

        Returns:
            A new array shaped like ``q_values`` holding ``reward`` for
            terminal samples and ``reward + GAMMA * q_value`` otherwise.
            ``q_values`` itself is left untouched.
        """
        q_values = np.asarray(q_values)
        rewards = np.reshape(np.asarray(rewards, dtype=q_values.dtype), q_values.shape)
        terminal = np.reshape(np.asarray(dones), q_values.shape).astype(bool)
        # np.where (rather than multiplying by a mask) keeps terminal targets
        # equal to the reward even if the bootstrapped value is inf/NaN.
        return np.where(terminal, rewards, rewards + self.GAMMA * q_values)

    def load_weights(self, path):
        """Load actor and critic weights from the directory ``path``.

        ``path`` may be given with or without a trailing separator.
        """
        import os

        self.actor.load_weights(os.path.join(path, self._ACTOR_FILE))
        self.critic_1.load_weights(os.path.join(path, self._CRITIC_1_FILE))
        self.critic_2.load_weights(os.path.join(path, self._CRITIC_2_FILE))

    def _learn_from_batch(self):
        """Sample one batch, update both critics and the actor; return the policy loss."""
        states, actions, rewards, next_states, dones = self.buffer.sample_batch(self.BATCH_SIZE)
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)

        # Q target: min of the target critics at a freshly sampled next action,
        # minus the entropy term.
        next_mu, next_std = self.actor(next_states)
        next_actions, next_log_pdf = self.actor.sample_normal(next_mu, next_std)

        target_qs_1 = self.target_critic_1([next_states, next_actions])
        target_qs_2 = self.target_critic_2([next_states, next_actions])
        target_qs = tf.math.minimum(target_qs_1, target_qs_2)

        target_qi = target_qs - self.ALPHA * next_log_pdf
        y_i = self.q_target(rewards, target_qi.numpy(), dones)

        self.critic_learn(states,
                          tf.convert_to_tensor(actions, dtype=tf.float32),
                          tf.convert_to_tensor(y_i, dtype=tf.float32))

        return self.actor_learn(states)

    def train(self, max_episode_num, env, behavior_name):
        """Run the training loop and save the weights after every episode.

        Args:
            max_episode_num: number of episodes to run (cast to ``int``).
            env: Unity environment; ``reset``, ``get_steps``, ``set_actions``
                and ``step`` are called on it.
            behavior_name: name of the behavior whose agent is controlled.
        """
        import os

        # Saving fails if the target directory is missing, so create it up front.
        os.makedirs(self._SAVE_DIR, exist_ok=True)

        cnt = 0
        # Start with the target critics equal to the online critics.
        self.update_target_network(1.0)

        for ep in range(int(max_episode_num)):
            frame, episode_reward = 0, 0
            # Reset the environment
            env.reset()
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            episode_done = False
            # Initial state as a flat array
            state = self._build_state(decision_steps)
            # Fallbacks in case a step reports no agent at all.
            next_state, train_reward = state, 0.0

            while not episode_done:

                action = self.get_action(tf.convert_to_tensor([state], dtype=tf.float32))
                # Wrap the action in an ActionTuple before sending it to Unity.
                action_tuple = ActionTuple(np.array(action, dtype=np.float32))
                env.set_actions(behavior_name, action_tuple)
                # Advance the simulation with that action.
                env.step()
                action = action_tuple.continuous  # back to a plain array
                next_decision_steps, next_terminal_steps = env.get_steps(behavior_name)

                # A terminal step takes precedence. When the agent shows up in
                # both collections on the same step, only the terminal entry
                # is stored; storing the decision entry as well added a
                # spurious non-terminal transition and counted reward twice.
                if next_terminal_steps:
                    steps, episode_done = next_terminal_steps, True
                else:
                    steps, episode_done = next_decision_steps, False

                if steps:
                    train_reward = steps.reward[0]
                    next_state = self._build_state(steps)
                    episode_reward += train_reward
                    # Store the transition in the replay buffer
                    self.buffer.add_data(state, action, train_reward, next_state, episode_done)

                # Start learning once the buffer holds more than one batch.
                if self.buffer.buffer_count() > self.BATCH_SIZE:
                    policy_loss = self._learn_from_batch()

                    # Record the performance of the algorithm.
                    if cnt % 500 == 0:
                        self.reward_list.append(train_reward)
                        self.policy_lost.append(policy_loss)
                    self.update_target_network(self.TAU)

                state = next_state
                frame += 1
                cnt += 1

            # Episode output
            print('Episode: ', ep+1, 'Frame: ', frame, 'u Reward: ', episode_reward/frame)

            # Save weights after each episode
            self.actor.save_weights(os.path.join(self._SAVE_DIR, self._ACTOR_FILE))
            self.critic_1.save_weights(os.path.join(self._SAVE_DIR, self._CRITIC_1_FILE))
            self.critic_2.save_weights(os.path.join(self._SAVE_DIR, self._CRITIC_2_FILE))

    def plot_result(self):
        """Plot the logged rewards and policy losses side by side."""
        fig = plt.figure(figsize=(18, 6))

        # 1 row, 2 columns (the old 1x3 grid left its middle panel empty).
        reward_ax = fig.add_subplot(1, 2, 1)
        reward_ax.plot(self.reward_list)
        reward_ax.set_title("Logged reward")

        loss_ax = fig.add_subplot(1, 2, 2)
        loss_ax.plot(self.policy_lost)
        loss_ax.set_title("Logged policy loss")

        plt.show()

#### Run Unity Environment

In [None]:
# Number of action components; passed to SACagent as the action dimension.
N_ACTIONS = 2

In [None]:
# Open the Unity environment from the "../Parking lot" build on base port 5004.
env = UnityEnvironment(file_name= "../Parking lot", base_port=5004)

In [None]:
# Reset once so the behavior specs and the first step data are available.
env.reset()

# Use the first behavior registered by the environment.
behavior_names = list(env.behavior_specs.keys())
behavior_name = behavior_names[0]

(decision_steps, terminal_steps) = env.get_steps(behavior_name)

### Get the size of the observation (state) space

In [None]:
# State size = combined length of agent 0's first two observation vectors.
N_STATES = sum(len(decision_steps.obs[idx][0]) for idx in (0, 1))

In [None]:
agent = SACagent(N_STATES, N_ACTIONS)
# Usually 30K episodes are enough.
agent.train(max_episode_num=30000, env=env, behavior_name=behavior_name)

In [None]:
# Plot the rewards and policy losses logged during training.
agent.plot_result()

In [None]:
# Close the environment used for training.
env.close()


***When the training is finished, the environment is re-opened and the saved weights are loaded. <br/> Note: because performance oscillates during training, poorly performing parameters may be the ones that get saved and loaded. Be careful with your training. Normally, it is better to stop once the agent starts to behave well in the environment.***

### Load saved weights

In [None]:
# Re-open the same Unity build, this time for running the trained policy.
env = UnityEnvironment(file_name="../Parking lot", base_port=5004)
training = False

In [None]:
# Reset once so the behavior specs and the first step data are available.
env.reset()

# Use the first behavior registered by the environment and show its name.
behavior_names = list(env.behavior_specs.keys())
behavior_name = behavior_names[0]
print(behavior_name)

(decision_steps, terminal_steps) = env.get_steps(behavior_name)

In [None]:
# Action size, and state size as the combined length of agent 0's first two
# observation vectors.
N_ACTIONS = 2
N_STATES = sum(len(decision_steps.obs[idx][0]) for idx in (0, 1))

In [None]:
agent = SACagent(N_STATES, N_ACTIONS)
print("loading weights...")
agent.load_weights('./save_weights/')
print("loaded weights!")

# Drive the agent with the actor's mean output for 10000 environment steps.
cnt = 0
while cnt < 10000:
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    # x: Ray Perception 3D, y: agent's velocity (x, z)
    x, y = decision_steps.obs[0][0], decision_steps.obs[1][0]
    state = np.concatenate((x, y), 0)

    # Element 0 of the actor output is the mean (mu); no sampling is done here.
    action = agent.actor(tf.convert_to_tensor([state], dtype=tf.float32))[0]
    print(action)
    action = ActionTuple(np.array(action, dtype=np.float32))
    env.set_actions(behavior_name, action)
    env.step()

    cnt += 1

In [None]:
# Close the environment used for running the trained policy.
env.close()