In [1]:
import gym
import sys
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from collections import deque
import matplotlib.pyplot as plt

In [2]:
class A2C(tf.keras.Model):
    """Actor-critic network with one shared hidden layer and two output heads.

    The shared 64-unit ReLU layer feeds:
      * ``actor``  - a softmax layer with ``action_size`` outputs (the policy),
      * ``critic`` - a single linear unit (the state-value estimate).

    ``state_size`` is accepted by the constructor but is not used to build
    any layer; Keras infers the input width on the first call.
    """

    def __init__(self, state_size, action_size):
        super().__init__()

        def small_uniform():
            # A separate initializer object per head: kernels start in
            # [-1e-3, 1e-3] so both heads begin with near-zero outputs.
            return tf.keras.initializers.RandomUniform(-1e-3, 1e-3)

        # Attribute names and creation order are kept as-is on purpose:
        # checkpoint restoration depends on them.
        self.fc1 = Dense(64, activation='relu')
        self.actor = Dense(action_size,
                           activation='softmax',
                           kernel_initializer=small_uniform())
        self.critic = Dense(1, kernel_initializer=small_uniform())

    def call(self, x):
        """Return ``(policy, value)`` for the input batch ``x``."""
        hidden = self.fc1(x)
        return self.actor(hidden), self.critic(hidden)

In [3]:
class A2CAgent:
    """Agent that acts with an ``A2C`` network and reports its actor-critic loss.

    The network weights are restored from a checkpoint when the agent is
    constructed. This class creates no optimizer and applies no gradients,
    so the restored weights are never modified by any of its methods.
    """

    def __init__(self, state_size, action_size,
                 weights_path="./save_model/LunarLanderv2_a2c_TF",
                 discount_factor=0.99):
        """Build the network and optionally restore its weights.

        Args:
            state_size: Length of the flat observation vector; observations
                are reshaped to ``[1, state_size]`` before being fed to the
                network.
            action_size: Number of discrete actions (width of the policy head).
            weights_path: Path handed to ``model.load_weights``. Pass ``None``
                to skip loading and keep the freshly initialised network.
            discount_factor: Discount applied to the next state's value when
                forming the one-step TD target.
        """
        self.state_size = state_size
        self.action_size = action_size

        # Hyper params for learning
        self.discount_factor = discount_factor

        self.model = A2C(self.state_size, self.action_size)
        if weights_path is not None:
            self.model.load_weights(weights_path)

    def get_action(self, state):
        """Sample one action index from the policy predicted for ``state``.

        Args:
            state: Observation that can be reshaped to ``[1, state_size]``.

        Returns:
            A single integer action drawn with ``np.random.choice`` using the
            policy head's output as probabilities.
        """
        state = np.reshape(state, [1, self.state_size])
        policy, _ = self.model(state)
        # Drop the batch axis and convert to NumPy so it can be passed as `p`.
        policy = np.array(policy[0])
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def train_model(self, state, action, reward, next_state, done):
        """Compute the combined actor-critic loss for one transition.

        Despite its name, this method performs no parameter update: there is
        no gradient tape or optimizer here, it only evaluates the loss.

        Args:
            state: Observation before the step.
            action: Index of the action that was taken.
            reward: Reward received for the step.
            next_state: Observation after the step.
            done: Truthy when the episode ended on this step; it zeroes the
                bootstrapped value of ``next_state``.

        Returns:
            ``0.2 * actor_loss + critic_loss`` as a 0-d NumPy array.
        """
        state = np.reshape(state, [1, self.state_size])
        next_state = np.reshape(next_state, [1, self.state_size])
        policy, value = self.model(state)
        _, next_value = self.model(next_state)

        # One-step TD target; (1 - done) removes the bootstrap term at
        # episode end.
        target = reward + (1 - done) * self.discount_factor * next_value[0]

        # For policy network
        one_hot_action = tf.one_hot([action], self.action_size)
        # Probability the policy assigned to the action actually taken.
        action_prob = tf.reduce_sum(one_hot_action * policy, axis=1)
        # Small constant keeps log() finite when the probability is 0.
        cross_entropy = - tf.math.log(action_prob + 1e-5)
        advantage = tf.stop_gradient(target - value[0])
        actor_loss = tf.reduce_mean(cross_entropy * advantage)

        # For value network
        critic_loss = 0.5 * tf.square(tf.stop_gradient(target) - value[0])
        critic_loss = tf.reduce_mean(critic_loss)

        # integrate losses (actor term is down-weighted by 0.2)
        loss = 0.2 * actor_loss + critic_loss

        return np.array(loss)

In [4]:
%matplotlib tk

ENV_NAME = 'LunarLander-v2'
EPISODES = 10


def run_episode(env, agent, render=True):
    """Play one episode with ``agent`` and return ``(score, mean_loss)``.

    Uses the classic Gym calling convention: ``env.reset()`` returns the
    first observation and ``env.step(action)`` returns a 4-tuple
    ``(next_state, reward, done, info)``.

    Args:
        env: Environment providing ``reset``, ``step`` and ``render``.
        agent: Object providing ``get_action(state)`` and
            ``train_model(state, action, reward, next_state, done)``.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        The summed reward of the episode and the mean of the per-step values
        returned by ``agent.train_model`` (as a Python float).
    """
    state = env.reset()
    done = False
    score = 0
    loss_list = []

    while not done:
        if render:
            env.render()

        # Interact with env.
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        loss_list.append(
            agent.train_model(state, action, reward, next_state, done))
        state = next_state

        score += reward

    # The loop body always runs at least once, so loss_list is never empty.
    return score, float(np.mean(loss_list))


if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    # try/finally so the environment (and its render window) is released
    # even when an episode raises.
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        agent = A2CAgent(state_size, action_size)
        print('Env Name : ',ENV_NAME)
        print('States {}, Actions {}'
                .format(state_size, action_size))

        scores, losses = [], []

        for e in range(EPISODES):
            score, mean_loss = run_episode(env, agent)
            print('epi: {:3d} | score {:6.2f} | loss {:7.4f}'
                  .format(e+1, score, mean_loss))
            scores.append(score)
            losses.append(mean_loss)

        print('Avg. score {:6.2f}, Avg. loss {:7.4f}'.
              format(np.mean(scores), np.mean(losses)))
    finally:
        env.close()

Env Name :  LunarLander-v2
States 8, Actions 4
epi:   1 | score 21.17 | loss 14.9523
epi:   2 | score 258.54 | loss 5.8380
epi:   3 | score 236.02 | loss 6.6211
epi:   4 | score 221.04 | loss 4.9660
epi:   5 | score 190.85 | loss 4.5637
epi:   6 | score 198.03 | loss 5.4835
epi:   7 | score 218.07 | loss 7.2222
epi:   8 | score 225.81 | loss 6.3970
epi:   9 | score 223.19 | loss 6.1927
epi:  10 | score 183.98 | loss 10.0756
Avg. score 197.67, Avg. loss 7.2312
