In [1]:
import os
import random
import sys
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense

In [2]:
class A2C(tf.keras.Model):
    """Actor-critic network: two shared hidden layers feeding two output heads.

    The actor head emits a softmax distribution over `action_size` actions and
    the critic head emits a single unbounded scalar. Both heads read the same
    64-unit ReLU trunk.
    """

    def __init__(self, state_size, action_size):
        """Create the layers.

        Args:
            state_size: accepted for interface symmetry with the agent; it is
                not used here.
            action_size: number of units in the softmax actor head.
        """
        super().__init__()

        def near_zero_uniform():
            # A fresh initializer object per head: each head must draw its own
            # kernel values from U(-1e-3, 1e-3).
            return tf.keras.initializers.RandomUniform(-1e-3, 1e-3)

        # Shared trunk.
        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(64, activation='relu')

        # Output heads, both starting with kernels very close to zero.
        self.actor = Dense(
            action_size,
            activation='softmax',
            kernel_initializer=near_zero_uniform(),
        )
        self.critic = Dense(1, kernel_initializer=near_zero_uniform())

    def call(self, x):
        """Run the trunk once and return `(policy, value)` from the two heads."""
        hidden = self.fc2(self.fc1(x))
        return self.actor(hidden), self.critic(hidden)

In [3]:
class A2CAgent:
    """Advantage actor-critic agent that updates its `A2C` model after every transition.

    The agent samples actions from the actor head's softmax output and trains
    actor and critic jointly on a single combined loss using a one-step
    bootstrapped target.
    """

    def __init__(self, state_size, action_size):
        """Build the model and its optimizer.

        Args:
            state_size: forwarded to `A2C`.
            action_size: number of discrete actions; also the size of the
                sampling range in `get_action`.
        """
        self.state_size = state_size
        self.action_size = action_size

        # Hyper params for learning
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.model = A2C(self.state_size, self.action_size)
        # `lr` is a deprecated alias that newer Keras optimizers warn about or
        # reject; `learning_rate` is the supported keyword.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    def get_action(self, state):
        """Sample an action index from the policy the model produces for `state`.

        Only the first row of the policy output is used, so `state` is assumed
        to be a batch holding a single observation.
        """
        policy, _ = self.model(state)
        policy = np.array(policy[0])
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def train_model(self, state, action, reward, next_state, done):
        """Apply one gradient step for a single transition.

        Args:
            state: model input for the current observation (first row is used).
            action: index of the action that was taken.
            reward: scalar reward received for the transition.
            next_state: model input for the following observation.
            done: truthy when the episode ended on this transition; the
                bootstrap term is then dropped from the target.

        Returns:
            The combined loss (`0.2 * actor_loss + critic_loss`) as a NumPy array.
        """
        # The target is only ever used as a constant, so the forward pass on
        # `next_state` does not need to be recorded by the tape.
        _, next_value = self.model(next_state)
        target = tf.stop_gradient(
            reward + (1 - done) * self.discount_factor * next_value[0]
        )

        with tf.GradientTape() as tape:
            policy, value = self.model(state)

            # For policy network: -log pi(a|s), weighted by the advantage,
            # which is treated as a constant.
            one_hot_action = tf.one_hot([action], self.action_size)
            action_prob = tf.reduce_sum(one_hot_action * policy, axis=1)
            cross_entropy = - tf.math.log(action_prob + 1e-5)  # epsilon avoids log(0)
            advantage = tf.stop_gradient(target - value[0])
            actor_loss = tf.reduce_mean(cross_entropy * advantage)

            # For value network: half squared error against the fixed target.
            critic_loss = tf.reduce_mean(0.5 * tf.square(target - value[0]))

            # integrate losses
            loss = 0.2 * actor_loss + critic_loss

        # Read the variables only after a forward pass: a subclassed model
        # creates its weights on first call, so reading them earlier can yield
        # an empty list and silently skip the update.
        model_params = self.model.trainable_variables
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        return np.array(loss)

In [4]:
%matplotlib tk

ENV_NAME = 'LunarLander-v2'
EPISODES = 4000
END_SCORE = 200

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = A2CAgent(state_size, action_size)
    print('Env Name : ',ENV_NAME)
    print('States {}, Actions {}'
            .format(state_size, action_size))

    scores, episodes, losses = [], [], []
    score_avg = 0
    
    end = False
    
    for e in range(EPISODES):
        # Episode initialization
        done = False
        score = 0
        loss_list = []
        
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        
        while not done:
            # env.render()

            # Interact with env.
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            loss = agent.train_model(state, action, reward, next_state, done)
            state = next_state

            # 
            score += reward
            loss_list.append(loss)
            if done:
                # agent.update_target_model()

                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                print('epi: {:3d} | score avg {:3.2f} | loss: {:.4f}'.format(e, score_avg, np.mean(loss_list)))

                # Save data for plot
                scores.append(score_avg)
                episodes.append(e)
                losses.append(np.mean(loss_list))
                
                # View data
                plt.subplot(211)
                plt.plot(episodes, scores, 'b')
                plt.xlabel('episode')
                plt.ylabel('average score')
                plt.title('cartpole A2C')
                plt.grid()
                
                plt.subplot(212)
                plt.plot(episodes, losses, 'b')
                plt.xlabel('episode')
                plt.ylabel('loss')
                plt.grid()
                
                plt.savefig('./save_model/LunarLanderv2_a2c_TF.png')

                if score_avg > END_SCORE:
                    agent.model.save_weights('./save_model/LunarLanderv2_a2c_TF', save_format='tf')
                    end = True
                    break
        if end == True:
            np.save('./save_model/LunarLanderv2_a2c_epi',episodes)
            np.save('./save_model/LunarLanderv2_a2c_score',scores)
            np.save('./save_model/LunarLanderv2_a2c_loss',losses)
            env.close()
            print("End")
            break

Env Name :  LunarLander-v2
States 8, Actions 4
epi:   0 | score avg -350.88 | loss: 25.1242
epi:   1 | score avg -348.28 | loss: 188.8681
epi:   2 | score avg -333.36 | loss: 44.7586
epi:   3 | score avg -350.35 | loss: 19.0472
epi:   4 | score avg -370.28 | loss: 777.5601
epi:   5 | score avg -421.80 | loss: 2756.9612
epi:   6 | score avg -413.02 | loss: 675.2480
epi:   7 | score avg -415.90 | loss: 1031.9916
epi:   8 | score avg -427.39 | loss: 921.1139
epi:   9 | score avg -458.58 | loss: 1535.3828
epi:  10 | score avg -510.93 | loss: 1693.2179
epi:  11 | score avg -519.83 | loss: 643.3398
epi:  12 | score avg -568.33 | loss: 1594.1311
epi:  13 | score avg -583.42 | loss: 976.9350
epi:  14 | score avg -634.11 | loss: 948.9446
epi:  15 | score avg -624.16 | loss: 630.1415
epi:  16 | score avg -644.56 | loss: 603.0780
epi:  17 | score avg -632.12 | loss: 420.9306
epi:  18 | score avg -611.56 | loss: 210.6061
epi:  19 | score avg -626.89 | loss: 195.8028
epi:  20 | score avg -618.81 | 