# A2C

In [0]:
import logging

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Concatenate, Conv2D, Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

class ActionValueModel(Model):
    """Actor-critic network scoring actions for a (target, current) image pair.

    Both images go through the same four convolutional layers (shared
    weights) followed by global average pooling.  The two pooled vectors are
    concatenated and fed to a dense trunk with two heads:

    * the actor head, which outputs one unnormalised logit per action;
    * the critic head, which outputs a single state-value estimate.
    """

    def __init__(self, num_actions):
        """Create every layer used by `call`.

        Args:
            num_actions: Number of discrete actions, i.e. the width of the
                actor head.
        """
        # The name is passed by keyword: positional arguments of
        # Model.__init__ are reserved for the functional (inputs, outputs)
        # form and are ignored or rejected otherwise, depending on the
        # Keras version.
        super().__init__(name='mlp_policy')

        self.num_actions = num_actions

        # Actor - shared convolutional encoder applied to both input images
        self.feat_extr_0 = Conv2D(filters=32, kernel_size=(3,3), strides=(2,2))
        self.feat_extr_1 = Conv2D(filters=32, kernel_size=(3,3), strides=(2,2))
        self.feat_extr_2 = Conv2D(filters=32, kernel_size=(3,3), strides=(2,2))
        self.feat_extr_3 = Conv2D(filters=32, kernel_size=(3,3), strides=(2,2))
        self.ap = GlobalAveragePooling2D()

        # Built once here instead of being re-instantiated on every forward pass
        self.concat = Concatenate()

        # Actor - dense trunk and policy head
        self.fcl_0 = Dense(128, activation='relu')
        self.fcl_1 = Dense(64, activation='relu')
        self.fcl_2 = Dense(128, activation='relu')
        # No activation: this output is consumed as raw logits (the agent's
        # policy loss uses from_logits=True), so it must be able to go negative.
        self.action_picker = Dense(self.num_actions, name='policy_logits')

        # Critic - Defining operators for value network
        self.hidden2 = Dense(128, activation='relu')
        self.hidden3 = Dense(128, activation='relu')
        self.hidden4 = Dense(128, activation='relu')
        self.value = Dense(1, name='value')

    def _encode(self, image):
        """Return the pooled convolutional features of one image batch."""
        features = self.feat_extr_0(image)
        features = self.feat_extr_1(features)
        features = self.feat_extr_2(features)
        features = self.feat_extr_3(features)
        return self.ap(features)

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Pair ``[target, current]`` of image batches, in that
                order.  Both are fed to `Conv2D`, so they are presumably
                4-D (batch, height, width, channels) tensors - TODO confirm
                against the environment.

        Returns:
            Tuple ``(logits, value)``: the actor head output with
            `num_actions` entries per sample and the critic head output
            with one entry per sample.
        """
        input_target, input_current = inputs

        # Same encoder (shared weights) for both images
        latent_target = self._encode(input_target)
        latent_current = self._encode(input_current)

        # Dense trunk on the concatenated latent vectors
        aggr = self.concat([latent_target, latent_current])
        aggr = self.fcl_0(aggr)
        aggr = self.fcl_1(aggr)
        aggr = self.fcl_2(aggr)
        actions = self.action_picker(aggr)

        # Critic branch, fed from the end of the dense trunk
        hidden_vals = self.hidden2(aggr)
        hidden_vals = self.hidden3(hidden_vals)
        hidden_vals = self.hidden4(hidden_vals)
        value = self.value(hidden_vals)

        return actions, value

    def action_value(self, obs, target):
        """Sample an action and estimate the state value for a batch.

        Args:
            obs: Batch of current observations.
            target: Batch of target images, same batch size as `obs`.

        Returns:
            Tuple ``(action, value)`` of NumPy arrays with one entry per
            batch row: the sampled action index and the critic's estimate.
        """
        # `call` unpacks its inputs as [target, current], so the target goes first.
        logits, value = self.predict([target, obs])

        # Draw one action index per row from the categorical distribution
        # defined by the logits; squeezing the raw (batch, num_actions)
        # logits on the last axis would fail whenever num_actions != 1.
        action = tf.random.categorical(logits, 1).numpy()

        return np.squeeze(action, axis=-1), np.squeeze(value, axis=-1)

In [0]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import categorical_crossentropy, SparseCategoricalCrossentropy, mean_squared_error

class A2CAgent:
    """Advantage actor-critic (A2C) trainer wrapped around an actor-critic model.

    The model is compiled here with two losses, in this order: a policy loss
    for its first output (treated as action logits) and a value loss for its
    second output.  The model must also expose an ``action_value`` method
    returning ``(action, value)``.
    """

    def __init__(self, model):
        """Store the hyperparameters and compile `model`.

        Args:
            model: Keras model with two outputs (policy logits, value).
        """
        # Hyperparameters settings
        self.params = {"gamma"               :   0.99, # discount factor
                       "value_loss_factor"   :    0.5,
                       "entropy_loss_factor" : 0.0001, # exploration factor
                       }

        # Defining and compiling model
        self.model = model
        # `learning_rate` replaces the deprecated `lr` argument name.
        self.model.compile(optimizer=RMSprop(learning_rate=0.0007),
                           loss=[self._logits_loss,
                                 self._value_loss,
                                 ]
                           )

    def train(self, environment, batch_size=32, updates=1000):
        """Train the model on batches of experience collected from `environment`.

        Args:
            environment: Object providing ``reset()``, ``step(action)``
                returning ``(observation, reward, done, info)``, and
                ``observation_space.shape``.
            batch_size: Number of environment steps collected per update.
            updates: Number of gradient updates to perform.

        Returns:
            List of cumulative rewards, one entry per episode; the last
            entry belongs to the episode still running when training stops.
        """
        # Storage helpers for a single batch of data
        actions = np.empty((batch_size,), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_size))
        observations = np.empty((batch_size,) + environment.observation_space.shape)

        # episode_rewards[i] contains the sum of rewards of episode i
        episode_rewards = [0.0]
        next_obs = environment.reset()
        for update in range(updates):
            logging.info("\n\n    UPDATE: %03d \n\n", update)

            # Collecting batch of training data
            for step in range(batch_size):

                # Perceiving the environment
                observations[step] = next_obs.copy()

                # Getting action and value predicted for this perception.
                # NOTE(review): ActionValueModel.action_value in this file
                # takes (obs, target); this single-argument call only works
                # with a model whose method accepts one argument - confirm
                # how the target image is meant to be supplied.
                actions[step], values[step] = self.model.action_value(next_obs[None, :])

                # Updating the environment with action
                next_obs, rewards[step], dones[step], _ = environment.step(actions[step])

                # Updating current episode total reward
                episode_rewards[-1] += rewards[step]

                # Episode finished: restart the environment and open a new
                # cumulative reward slot for the next episode
                if dones[step]:
                    episode_rewards.append(0.0)
                    next_obs = environment.reset()

                    # Monitoring
                    logging.info("\n Episode: %03d, Reward: %03d", len(episode_rewards)-1, episode_rewards[-2])
                    logging.info("   actions: %s; rewards: %s; dones: %s; values: %s; observations: %s",
                                 actions.shape, rewards.shape, dones.shape, values.shape, observations.shape)
                    logging.info("   episode_rewards: %s", len(episode_rewards))

            # Bootstrapping from the value of the state reached after the batch
            _, next_value = self.model.action_value(next_obs[None, :])
            returns, advantages = self._returns_advantages(rewards, dones, values, next_value)

            # Actions and advantages are packed into one target array because
            # a Keras loss only receives (y_true, y_pred).
            acts_and_advs = np.concatenate([actions[:, None], advantages[:, None]], axis=-1)
            # NOTE(review): only the observations are fed here, whereas
            # ActionValueModel.call expects a [target, current] pair - confirm.
            losses = self.model.train_on_batch(observations, [acts_and_advs, returns])

            # Monitoring
            logging.debug("[%d/%d] Losses: %s", update+1, updates, losses)

        return episode_rewards

    def decision(self, perception) :
        """Return the action chosen by the model for a single observation."""
        action, _ = self.model.action_value(perception[None, :])
        return action

    def test(self, environment, render=False):
        """Play one episode with the current policy.

        Args:
            environment: Same interface as in `train`; ``render()`` is also
                called after every step when `render` is true.
            render: Whether to render the environment after each step.

        Returns:
            Sum of the rewards collected during the episode.
        """
        obs, done, episode_reward = environment.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = environment.step(action)
            episode_reward += reward
            if render:
                environment.render()
        return episode_reward

    def _returns_advantages(self, rewards, dones, values, next_value):
        """Compute discounted returns and advantages for one batch.

        Args:
            rewards: 1-D array of rewards, one per step.
            dones: 1-D array of episode-end flags (non-zero when the step
                ended an episode), same length as `rewards`.
            values: 1-D array of critic estimates for the visited states.
            next_value: Critic estimate for the state following the last
                step, as an array that can be appended to `rewards`
                (e.g. shape ``(1,)``).

        Returns:
            Tuple ``(returns, advantages)`` of arrays shaped like `rewards`.
        """
        # next_value is the bootstrap value estimate of a future state (the critic)
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)

        # Discounted sum of future rewards, computed backwards; (1 - done)
        # stops the accumulation at episode boundaries.
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.params['gamma'] * returns[t+1] * (1-dones[t])
        returns = returns[:-1]

        # Advantage = observed return minus the critic's estimate
        advantages = returns - values

        return returns, advantages

    def _value_loss(self, returns, value):
        """Return the scaled MSE between the returns and the value estimates."""
        return self.params['value_loss_factor']*mean_squared_error(returns, value)

    def _logits_loss(self, acts_and_advs, logits):
        """Return the policy-gradient loss with an entropy bonus.

        Args:
            acts_and_advs: Tensor whose last axis holds ``[action, advantage]``
                for each sample (as packed in `train`).
            logits: Unnormalised action scores predicted by the model.
        """
        # Separating actions and advantages
        actions, advantages = tf.split(acts_and_advs, 2, axis=-1)

        # Policy loss: cross-entropy on the actions actually taken, weighted
        # by their advantages; from_logits=True applies the softmax internally.
        actions = tf.cast(actions, tf.int32)
        weighted_sparse_ce = SparseCategoricalCrossentropy(from_logits=True)
        policy_loss = weighted_sparse_ce(actions, logits, sample_weight=advantages)

        # Entropy is the cross-entropy of a distribution with itself, so it
        # has to be computed on normalised probabilities; using the raw
        # logits as the target does not yield the policy entropy.
        probs = tf.nn.softmax(logits)
        entropy_loss = categorical_crossentropy(probs, probs)

        # Entropy is subtracted because the optimizer minimizes while
        # exploration (high entropy) is to be encouraged.
        return policy_loss - self.params['entropy_loss_factor']*entropy_loss

# Environment

In [0]:
from painter.environmentInterfaces.LibMyPaintInterface import *

# Environment shared by the baseline, training and animation cells below.
# `episode_length=20` is forwarded as-is to LibMyPaintInterface; presumably the
# number of steps per episode - confirm against the interface.
environment = LibMyPaintInterface(episode_length=20)

# Agent

In [0]:
# Actor-critic network with 11 policy outputs, wrapped in the A2C agent
# (the agent compiles the model in its constructor).
# NOTE(review): 11 presumably matches the environment's action count - confirm.
model = ActionValueModel(num_actions=11)
agent = A2CAgent(model)

# Baseline

In [0]:
# Total reward of one episode played by the untrained agent, kept as a
# reference point.  The stored result is printed instead of running (and
# reporting) a second, different episode.
rewards_sum = agent.test(environment)
print(rewards_sum)

# Training

In [0]:
import logging

# Verbosity of the root logger used by A2CAgent.train:
# set to logging.WARNING to disable logs or logging.DEBUG to see losses as well
logging.getLogger().setLevel(logging.INFO)

# Runs 1000 updates (with the default batch size) and keeps the per-episode
# cumulative rewards for the validation cell.
rewards_history = agent.train(environment, updates=1000)

print("Finished training.")

# Validation

In [0]:
def plotCumulativeRewards(rewards_history) :
    """Plot the total reward obtained in each episode.

    Args:
        rewards_history: Sequence of per-episode cumulative rewards, in
            episode order.
    """
    # Imported here so the function works even when the notebook has not
    # imported pyplot as `plt` beforehand.
    import matplotlib.pyplot as plt

    # The bundled seaborn style was renamed to 'seaborn-v0_8' in newer
    # Matplotlib releases; use whichever name this installation provides
    # and keep the current style when neither exists.
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    plt.plot(rewards_history)

    plt.xlabel('Episode')
    plt.ylabel('Total Reward')

    plt.show()

# Show the raw per-episode rewards returned by training, then plot them.
print(rewards_history)
plotCumulativeRewards(rewards_history)

In [0]:
from painter.animators.Animator import *
from PIL import Image

# Target image as a NumPy array.  "TODO" is a placeholder path that still has
# to be replaced by a real image file.  PIL images have no `asarray` method,
# so the conversion goes through NumPy; the `with` block closes the file.
with Image.open("TODO") as target_file:
    input_img = np.asarray(target_file)

animator = Animator(agent=agent,
                    environment_interface=environment,
                    objectif=input_img)
animator.anime(target=input_img, fps=10)