# **Introduction**

This notebook implements a vanilla Actor-Critic method based on the advantage function (A2C) to learn an optimal policy for the `CartPole` environment. The Actor-Critic method uses two networks: the actor, which maps states to a probability distribution over the actions, and the critic, which estimates the value of a state to guide the actor. The general idea is that the actor updates its policy in the direction suggested by the critic.

# **Import Packages**

This section imports the necessary packages.

In [None]:
# import these packages:
import gymnasium as gym
import numpy as np
import random
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf
tf.get_logger().setLevel('INFO')
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam

from collections import deque

# **Environment Setup**

This section sets up the environment and defines the relevant functions needed for this implementation.

##### Function for making Keras models:

In [None]:
# function for making a keras model:
def make_model(layers, neurons, rate, norm, drop, input_shape, output_shape, loss_function, output_activation, drop_rate = 0.2):
    """
    build and compile a fully connected Keras Sequential model.

    layers:             an int, the number of hidden Dense layers
    neurons:            an int, the number of units in every hidden layer
    rate:               a float, the learning rate handed to the Adam optimizer
    norm:               truthy to add a BatchNormalization layer after every hidden layer
    drop:               truthy to add a Dropout layer after every hidden layer
    input_shape:        an int, the length of the 1-D input vector
    output_shape:       an int, the number of units in the output layer
    loss_function:      the loss passed to model.compile
    output_activation:  the activation used by the output layer
    drop_rate:          a float, the rate used by every Dropout layer (defaults to 0.2)

    returns the compiled model.
    """
    # instantiate model:
    model = keras.Sequential()

    # declare the input once, before the loop, so the model still has a
    # defined input when layers == 0:
    model.add(Input(shape = (input_shape, )))

    # add hidden layers:
    for i in range(layers):
        model.add(Dense(neurons, activation = 'relu', name = f'hidden_layer_{i+1}'))

        if norm:
            model.add(BatchNormalization(name = f'batch_norm_layer_{i+1}'))

        if drop:
            model.add(Dropout(drop_rate, name = f'dropout_layer_{i+1}'))

    # add output layer:
    model.add(Dense(output_shape, activation = output_activation, name = 'output_layer'))

    # compile the model:
    model.compile(optimizer = Adam(learning_rate = rate),
                  loss = loss_function)

    return model

##### A2C class:

In [None]:
# A2C class:
class A2C_Agent:
    ####################### INITIALIZATION #######################
    # constructor:
    def __init__(self,
                env: gym.Env,
                gamma: float,
                lr_a: float,
                lr_c: float,
                layers: int,
                neurons: int,
                seed: "int | None" = None):
        """
        this is the constructor for the agent. this agent uses the advantage actor-critic (A2C) algorithm to learn an optimal policy,
        through the use of two approximator networks. the first network, called the actor, is responsible for providing the probability
        distribution over all actions given a state. the second network, called the critic, estimates the value of a state; the
        advantage derived from that estimate guides the learning of the actor.

        env:                a gymnasium environment
        gamma:              a float value indicating the discount factor, γ
        lr_a:               a float value indicating the learning rate of the actor, α_a
        lr_c:               a float value indicating the learning rate of the critic, α_c
        layers:             an int value indicating the number of hidden layers in each network
        neurons:            an int value indicating the number of neurons per hidden layer
        seed:               an int value indicating the desired seed (or None); it is only stored on the agent here,
                            the caller decides where to apply it

        nS:                 an int, the length of the observation vector (env.observation_space.shape[0])
        nA:                 an int, the number of discrete actions (env.action_space.n)

        actor_network:      a Keras sequential neural network representing the actor
        critic_network:     a Keras sequential neural network representing the critic

        """
        # object parameters:
        self.env = env
        self.gamma = gamma
        self.lr_a = lr_a
        self.lr_c = lr_c

        # get the environment dimensions:
        self.nS = env.observation_space.shape[0]
        self.nA = env.action_space.n

        # initialize the networks:
        self.actor_network = make_model(layers = layers,
                                        neurons = neurons,
                                        rate = lr_a,
                                        norm = True,
                                        drop = True,
                                        input_shape = self.nS,
                                        output_shape = self.nA,
                                        loss_function = "categorical_crossentropy",
                                        output_activation = "softmax")

        self.critic_network = make_model(layers = layers,
                                        neurons = neurons,
                                        rate = lr_c,
                                        norm = True,
                                        drop = True,
                                        input_shape = self.nS,
                                        output_shape = 1,
                                        loss_function = "mse",
                                        output_activation = "linear")

        # store the seed:
        self.seed = seed

    ####################### TRAINING #######################
    # function for updating actor:
    @tf.function
    def update_actor(self, state, action, advantage):
        """
        apply one policy-gradient step to the actor for a single transition.

        state:      the observation, reshaped here to [1, nS]
        action:     the index of the action that was taken
        advantage:  the advantage of that action; treated as a constant (no gradient flows through it)
        """
        # enforce dimensionality:
        state = tf.convert_to_tensor(state, dtype = tf.float32)
        state = tf.reshape(state, [1, self.nS])

        action = tf.reshape(tf.cast(action, tf.int32), [-1])

        advantage = tf.stop_gradient(tf.cast(tf.reshape(advantage, [1]), tf.float32))

        # track auto differentiation:
        with tf.GradientTape() as tape_a:
            # 1) pick out the probability the actor assigns to the taken action:
            action_probs = self.actor_network(state)
            index = tf.stack([tf.range(tf.shape(action)[0]), action], axis=1)
            chosen_prob = tf.gather_nd(action_probs, index)

            # 2) compute policy loss (the small constant keeps log away from log(0)):
            logp = tf.math.log(chosen_prob + 1e-8)
            policy_loss = -tf.reduce_mean(logp * advantage)

        # 3) backpropagate and update weights:
        grads = tape_a.gradient(policy_loss, self.actor_network.trainable_variables)
        self.actor_network.optimizer.apply_gradients(zip(grads, self.actor_network.trainable_variables))

    # function for updating critic:
    @tf.function
    def update_critic(self, state, target):
        """
        apply one squared-error regression step to the critic for a single transition.

        state:      the observation, reshaped here to [1, nS]
        target:     the TD target the critic's value estimate is regressed towards
        """
        # enforce dimensionality:
        state = tf.convert_to_tensor(state, dtype = tf.float32)
        state = tf.reshape(state, [1, self.nS])

        # freeze gradient on targets (cast to float32 first, then stop the gradient):
        target = tf.stop_gradient(tf.cast(tf.reshape(target, [1, 1]), tf.float32))

        # track auto differentiation:
        with tf.GradientTape() as tape_c:
            # 1) compute values so tape knows which weights to differentiate:
            value = self.critic_network(state)

            # 2) compute loss:
            critic_loss = tf.reduce_mean(tf.square(target - value))

        # 3) backpropagate and update the weights:
        grads = tape_c.gradient(critic_loss, self.critic_network.trainable_variables)
        self.critic_network.optimizer.apply_gradients(zip(grads, self.critic_network.trainable_variables))

    # training function:
    def training(self, training_length):
        """
        train the agent online, updating both networks after every environment step.

        training_length:    an int, the number of episodes to run

        returns a numpy array of length training_length holding the total reward of each episode.
        """
        # initialize reward history:
        reward_history = np.zeros(training_length)

        # for every episode:
        for episode in tqdm(range(training_length), colour = "#33FF00", ncols = 100, desc = "training progress"):
            # get initial state:
            obs, _ = self.env.reset()
            obs = tf.convert_to_tensor(obs[None, :], dtype = tf.float32)

            # counter for reward earned this episode
            episode_reward = 0

            # flag for completion:
            done = False

            # until the episode ends:
            while not done:
                # 1) sample an action from the actor network output:
                action_probs = self.actor_network(obs, training = False)
                action = tf.random.categorical(tf.math.log(action_probs), 1)

                # 2) critic predicts value of state:
                value = self.critic_network(obs, training = False)

                # 3) get next state, reward (the environment expects a plain int
                #    action, not a [1, 1] tensor):
                next_obs, reward, term, trunc, _ = self.env.step(int(action.numpy()[0, 0]))
                next_obs = tf.convert_to_tensor(next_obs[None, :], dtype = tf.float32)

                done = term or trunc
                episode_reward += reward

                # 4) compute TD target, advantage. only a true terminal state has
                #    zero future value; a time-limit truncation still bootstraps
                #    from the critic's estimate of the next state:
                next_value = self.critic_network(next_obs, training = False)
                bootstrap = 0.0 if term else 1.0
                target = float(reward) + bootstrap * self.gamma * next_value
                advantage = target - value

                # 5) update critic:
                self.update_critic(obs, target)

                # 6) update actor (the action stays a tensor so the traced
                #    function is not retraced per python value):
                self.update_actor(obs, action, advantage)

                # 7) advance values:
                obs = next_obs

            # advance reward history:
            reward_history[episode] = episode_reward

        # return to user:
        return reward_history


# **Using the Environment**

This section utilizes the above A2C implementation to create an environment and train an agent.

##### Specify hyperparameters:

In [None]:
# agent hyperparameters:
lr_a = 1e-3             # learning rate for the actor, α_a
lr_c = 1e-3             # learning rate for the critic, α_c
gamma = 0.99            # discount factor γ
train_length = 50     # number of episodes to train for

layers = 2      # how many hidden layers to have in each network
neurons = 32    # how many neurons to have in each hidden layer of each network

seed = 18       # seed for reproducibility

##### Initialize the environment and agent:

In [None]:
# reset the Keras backend state before any network is built:
tf.keras.backend.clear_session()

# build the CartPole environment:
env = gym.make("CartPole-v1")

# construct the agent from the hyperparameters defined earlier:
ac_agent = A2C_Agent(
    env = env,
    gamma = gamma,
    lr_a = lr_a,
    lr_c = lr_c,
    layers = layers,
    neurons = neurons,
    seed = seed,
)

# seed the environment with the agent's seed; the returned observation and info are discarded:
_, _ = ac_agent.env.reset(seed = ac_agent.seed)

##### Train the agent:

In [None]:
# train for train_length episodes and keep the per-episode reward totals returned by training():
reward_history = ac_agent.training(training_length = train_length)