In [1]:
import gym
import sys
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from collections import deque
import matplotlib.pyplot as plt

2021-07-23 21:28:53.441770: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
class A2C(tf.keras.Model):
    """Actor-critic network: one shared hidden layer feeding two output heads.

    The actor head produces a softmax distribution over ``action_size``
    actions; the critic head produces a single scalar value estimate.

    Args:
        state_size: Accepted for interface symmetry with the agent; it is not
            used when building the layers.
        action_size: Number of units in the softmax policy head.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        # Shared 64-unit ReLU layer consumed by both heads.
        self.fc1 = Dense(64, activation='relu')
        # Policy head; kernel starts in a narrow uniform range around zero.
        self.actor = Dense(
            action_size,
            activation='softmax',
            kernel_initializer=tf.keras.initializers.RandomUniform(-1e-3, 1e-3),
        )
        # Value head; linear output, same narrow uniform kernel initialisation.
        self.critic = Dense(
            1,
            kernel_initializer=tf.keras.initializers.RandomUniform(-1e-3, 1e-3),
        )

    def call(self, x):
        """Run the forward pass and return ``(policy, value)`` for input ``x``."""
        hidden = self.fc1(x)
        return self.actor(hidden), self.critic(hidden)

In [3]:
class A2CAgent:
    """Evaluation-time wrapper around a pre-trained ``A2C`` model.

    The agent restores saved weights on construction, samples actions from the
    model's policy output, and can report the A2C loss for a single transition.
    It never applies gradients.

    Args:
        state_size: Passed through to ``A2C`` and stored on the agent.
        action_size: Number of discrete actions; also the sampling range used
            by ``get_action``.
        weights_path: Checkpoint path handed to ``model.load_weights``.
            Defaults to the path this notebook was originally hard-wired to.
        discount_factor: Discount applied to the bootstrapped next-state value
            in ``get_loss``.
    """

    def __init__(self, state_size, action_size,
                 weights_path="./save_model/LunarLanderv2_a2c_TF",
                 discount_factor=0.99):
        self.state_size = state_size
        self.action_size = action_size

        # Hyper params for learning
        self.discount_factor = discount_factor

        self.model = A2C(self.state_size, self.action_size)
        self.model.load_weights(weights_path)

    def get_action(self, state):
        """Sample one action index from the policy the model outputs for ``state``.

        Args:
            state: Model input holding a single state; only row 0 of the
                resulting policy is used.

        Returns:
            An integer in ``[0, action_size)`` drawn with the policy's
            probabilities.
        """
        policy, _ = self.model(state)
        policy = np.array(policy[0])
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def get_loss(self, state, action, reward, next_state, done):
        """Compute the combined actor/critic loss for one transition.

        This only evaluates the loss; no gradient step is taken.

        Args:
            state: Model input for the current state.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Model input for the successor state.
            done: Truthy when the episode ended on this transition, which
                removes the bootstrapped next-state value from the target.

        Returns:
            The scalar loss ``0.2 * actor_loss + critic_loss`` converted with
            ``np.array``.
        """
        policy, value = self.model(state)
        _, next_value = self.model(next_state)
        # One-step TD target; (1 - done) zeroes the bootstrap at episode end.
        target = reward + (1 - done) * self.discount_factor * next_value[0]

        # For policy network
        one_hot_action = tf.one_hot([action], self.action_size)
        action_prob = tf.reduce_sum(one_hot_action * policy, axis=1)
        # The small constant keeps log() finite when the probability is zero.
        cross_entropy = - tf.math.log(action_prob + 1e-5)
        advantage = tf.stop_gradient(target - value[0])
        actor_loss = tf.reduce_mean(cross_entropy * advantage)

        # For value network
        critic_loss = 0.5 * tf.square(tf.stop_gradient(target) - value[0])
        critic_loss = tf.reduce_mean(critic_loss)

        # integrate losses
        loss = 0.2 * actor_loss + critic_loss

        return np.array(loss)

In [4]:
%matplotlib tk

ENV_NAME = 'LunarLander-v2'
EPISODES = 10

if __name__ == "__main__":
    # Roll out the pre-trained agent for EPISODES rendered episodes and report
    # the per-episode score and mean loss, then the averages over all episodes.
    env = gym.make(ENV_NAME)
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        agent = A2CAgent(state_size, action_size)
        print('Env Name : ',ENV_NAME)
        print('States {}, Actions {}'
                .format(state_size, action_size))

        scores, losses = [], []

        for e in range(EPISODES):
            # Episode initialization
            done = False
            score = 0
            loss_list = []

            state = env.reset()
            # The model is fed one state at a time as a [1, state_size] batch.
            state = np.reshape(state, [1, state_size])

            while not done:
                env.render()

                # Interact with env.
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])
                loss = agent.get_loss(state, action, reward, next_state, done)
                state = next_state

                # Accumulate episode statistics.
                score += reward
                loss_list.append(loss)

            # The loop only exits once done is set, so the episode is complete.
            episode_loss = np.mean(loss_list)
            print('epi: {:3d} | score {:3.2f} | loss {:.4f}'
                  .format(e+1, score, episode_loss))
            scores.append(score)
            losses.append(episode_loss)

        print('Avg. score {:4.2f}, Avg. loss {:.4f}'.
              format(np.mean(scores), np.mean(losses)))
    finally:
        # Always release the environment (and its render window), even when
        # weight loading or a rollout step raises.
        env.close()

2021-07-23 21:28:54.674963: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-07-23 21:28:54.675560: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-07-23 21:28:54.719790: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:65:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.545GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2021-07-23 21:28:54.719823: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-07-23 21:28:54.721927: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-07-23 21:28:54.721988: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2

Env Name :  LunarLander-v2
States 8, Actions 4


2021-07-23 21:28:56.051322: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-07-23 21:28:56.405202: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11


epi:   1 | score 220.96 | loss 4.4395
epi:   2 | score 228.09 | loss 6.5300
epi:   3 | score 149.43 | loss 9.5975
epi:   4 | score 179.09 | loss 3.8855
epi:   5 | score 200.56 | loss 3.5614
epi:   6 | score 248.95 | loss 4.6770
epi:   7 | score 206.98 | loss 7.4999
epi:   8 | score 183.40 | loss 4.4573
epi:   9 | score 232.92 | loss 6.2462
epi:  10 | score 243.34 | loss 4.7110
epi:  11 | score 200.45 | loss 4.9288
epi:  12 | score 241.11 | loss 4.6664
epi:  13 | score 194.93 | loss 5.8562
epi:  14 | score 194.48 | loss 6.0831
epi:  15 | score 240.97 | loss 4.8028
epi:  16 | score 193.26 | loss 9.7190
epi:  17 | score 200.64 | loss 5.0567
epi:  18 | score 5.63 | loss 8.3881
epi:  19 | score 183.01 | loss 4.6562
epi:  20 | score 182.28 | loss 6.5038
