아직 공부중 (Work in progress - still studying)

In [None]:
import gym
import sys
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from collections import deque
import matplotlib.pyplot as plt

In [None]:
class Actor(tf.keras.Model):
    """Policy network: two 16-unit ReLU layers followed by a scaled tanh head.

    Args:
        state_size: Size of the state vector. Not used by the network itself;
            Keras infers the input width on the first call.
        action_size: Number of output units.
        action_min: Lower bound of the action range. Stored but not used in
            the forward pass.
        action_max: Factor the tanh output is multiplied by, so every output
            lies in [-action_max, action_max].
    """

    def __init__(self, state_size, action_size, action_min, action_max):
        super(Actor, self).__init__()
        self.action_min = action_min
        self.action_max = action_max

        self.fc1 = Dense(16, activation='relu')
        self.fc2 = Dense(16, activation='relu')
        # Small uniform initial weights keep the initial tanh outputs near zero.
        self.out = Dense(
            action_size,
            activation='tanh',
            kernel_initializer=tf.random_uniform_initializer(minval=-0.003, maxval=0.003),
        )

    def call(self, x):
        """Map a batch of states to outputs in [-action_max, action_max]."""
        x = self.fc1(x)
        x = self.fc2(x)
        # Scale the tanh output directly; no extra layer has to be created
        # on every forward pass.
        return self.out(x) * self.action_max

class Critic(tf.keras.Model):
    """Q-network that scores a (state, action) pair with a single value.

    State and action each pass through their own 16-unit ReLU layer; the two
    feature vectors are concatenated, fed through a 32-unit ReLU layer and
    reduced to one linear output.

    Args:
        state_size: Size of the state vector. Not used by the network itself;
            Keras infers the input width on the first call.
        action_size: Size of the action vector. Not used by the network itself.
    """

    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.s1 = Dense(16, activation='relu')
        self.a1 = Dense(16, activation='relu')
        self.fc1 = Dense(32, activation='relu')
        self.out = Dense(
            1,
            activation='linear',
            kernel_initializer=tf.keras.initializers.RandomUniform(-1e-3, 1e-3),
        )

    def call(self, state_action):
        """Return Q(state, action).

        Args:
            state_action: A pair ``[state, action]`` of batched tensors with
                matching batch dimension.

        Returns:
            Tensor with one Q-value per batch row (last dimension 1).
        """
        state = state_action[0]
        action = state_action[1]
        s = self.s1(state)
        a = self.a1(action)
        # Fully qualified name: the file only imports Dense from keras.layers.
        c = tf.keras.layers.concatenate([s, a], axis=-1)
        x = self.fc1(c)
        return self.out(x)

In [None]:
class ACERAgent:
    """Actor-critic agent trained from an experience-replay buffer.

    The agent acts on discrete action indices. The actor's bounded outputs
    are treated as logits and converted to a probability vector with softmax;
    ``get_action`` samples an index from that vector. The critic scores a
    state together with an action vector: a one-hot vector for replayed
    actions, and the actor's probability vector when the policy itself is
    being evaluated. Target copies of both networks provide the bootstrap
    value and slowly track the online networks.

    Args:
        state_size: Length of the state vector.
        action_size: Number of discrete actions.
        action_min: Forwarded to ``Actor``.
        action_max: Forwarded to ``Actor``; it bounds the magnitude of the
            logits and therefore how peaked the policy can become.
    """

    def __init__(self, state_size, action_size, action_min=-1.0, action_max=1.0):
        self.state_size = state_size
        self.action_size = action_size
        self.action_min = action_min
        self.action_max = action_max

        # Hyper params for learning
        self.discount_factor = 0.99
        self.actor_learning_rate = 0.001
        self.critic_learning_rate = 0.002
        self.tau = 0.005  # soft-update rate of the target networks

        # Experience Replay
        self.batch_size = 16
        self.train_start = 500
        self.memory = deque(maxlen=2000)

        # Flips to True after the batch shapes have been printed once.
        self.show_media_info = False

        self.critic = Critic(self.state_size, self.action_size)
        self.actor = Actor(self.state_size, self.action_size, self.action_min, self.action_max)
        self.target_critic = Critic(self.state_size, self.action_size)
        self.target_actor = Actor(self.state_size, self.action_size, self.action_min, self.action_max)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=self.critic_learning_rate)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.actor_learning_rate)

        # One dummy forward pass creates the variables, so that the weights
        # can be copied to the targets and summary() has something to show.
        dummy_state = tf.zeros((1, self.state_size), dtype=tf.float32)
        dummy_action = tf.zeros((1, self.action_size), dtype=tf.float32)
        for actor in (self.actor, self.target_actor):
            actor(dummy_state)
        for critic in (self.critic, self.target_critic):
            critic([dummy_state, dummy_action])
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        self.actor.summary()
        self.critic.summary()

    def _policy(self, actor, states):
        """Return the action-probability vectors ``actor`` assigns to ``states``."""
        return tf.nn.softmax(actor(states), axis=-1)

    def _update_targets(self):
        """Move the target networks a fraction ``tau`` towards the online ones."""
        pairs = ((self.target_actor, self.actor), (self.target_critic, self.critic))
        for target, online in pairs:
            for target_var, online_var in zip(target.variables, online.variables):
                target_var.assign(self.tau * online_var + (1.0 - self.tau) * target_var)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        state = np.reshape(state, [1, self.state_size])
        next_state = np.reshape(next_state, [1, self.state_size])
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def get_action(self, state):
        """Sample a discrete action index from the current policy."""
        state = np.reshape(state, [1, self.state_size]).astype(np.float32)
        policy = np.array(self._policy(self.actor, state)[0], dtype=np.float64)
        # Renormalise in float64 so the probabilities sum to one within the
        # tolerance np.random.choice enforces.
        policy /= policy.sum()
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def train_model(self, state=None, action=None, reward=None, next_state=None, done=None):
        """Run one replay-based update of critic and actor.

        When a transition is passed it is stored in the replay memory first,
        so callers may either call ``remember`` themselves and then
        ``train_model()``, or pass the transition here - not both.

        Returns:
            ``None`` while the memory holds fewer than ``train_start``
            transitions; otherwise ``0.2 * actor_loss + critic_loss`` as a
            NumPy value. The combined figure is for logging only - the two
            losses are optimised separately.
        """
        if state is not None:
            self.remember(state, action, reward, next_state, done)

        # Training Condition - Memory Size
        if len(self.memory) < self.train_start:
            return None

        # Sampling from the memory
        mini_batch = random.sample(self.memory, self.batch_size)

        # States are stored as (1, state_size) rows; vstack gives (batch, state_size).
        states = tf.convert_to_tensor(
            np.vstack([sample[0] for sample in mini_batch]).astype(np.float32))
        actions = tf.one_hot(
            [int(sample[1]) for sample in mini_batch], self.action_size, dtype=tf.float32)
        rewards = tf.convert_to_tensor(
            np.array([[sample[2]] for sample in mini_batch], dtype=np.float32))
        next_states = tf.convert_to_tensor(
            np.vstack([sample[3] for sample in mini_batch]).astype(np.float32))
        dones = tf.convert_to_tensor(
            np.array([[sample[4]] for sample in mini_batch], dtype=np.float32))

        if not self.show_media_info:
            self.show_media_info = True
            print('Start to train, check batch shapes')
            print('shape of states', np.shape(states),type(states))
            print('shape of actions', np.shape(actions),type(actions))
            print('shape of rewards', np.shape(rewards),type(rewards))
            print('shape of next_states', np.shape(next_states),type(next_states))
            print('shape of dones', np.shape(dones),type(dones))

        # The TD target comes from the target networks and is computed
        # outside the tape, so no gradient flows through it.
        target_actions = self._policy(self.target_actor, next_states)
        target_q = self.target_critic([next_states, target_actions])
        target_value = rewards + (1.0 - dones) * self.discount_factor * target_q

        with tf.GradientTape() as tape:
            q = self.critic([states, actions])
            critic_loss = tf.math.reduce_mean(tf.math.square(target_value - q))
        critic_params = self.critic.trainable_variables
        critic_grads = tape.gradient(critic_loss, critic_params)
        self.critic_optimizer.apply_gradients(zip(critic_grads, critic_params))

        # The actor is pushed towards action vectors the critic rates highly.
        with tf.GradientTape() as tape:
            new_actions = self._policy(self.actor, states)
            new_q = self.critic([states, new_actions])
            actor_loss = -tf.reduce_mean(new_q)
        actor_params = self.actor.trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_params)
        self.actor_optimizer.apply_gradients(zip(actor_grads, actor_params))

        self._update_targets()
        return np.array(0.2 * actor_loss + critic_loss)

    def save_model(self):
        """Save actor and critic weights under ``./save_model/``."""
        import os

        os.makedirs('./save_model', exist_ok=True)
        self.actor.save_weights('./save_model/cartpole_acer_TF_actor', save_format='tf')
        self.critic.save_weights('./save_model/cartpole_acer_TF_critic', save_format='tf')

In [None]:
%matplotlib tk

ENV_NAME = 'CartPole-v1'
EPISODES = 3000
END_SCORE = 300

if __name__ == "__main__":
    import os

    # savefig / np.save below write here; create the folders up front.
    os.makedirs('./save_model/data', exist_ok=True)

    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = ACERAgent(state_size, action_size)
    print('Env Name : ',ENV_NAME)
    print('States {}, Actions {}'
            .format(state_size, action_size))

    scores_avg, scores_raw, episodes, losses = [], [], [], []
    score_avg = 0

    end = False

    for e in range(EPISODES):
        # Episode initialization
        done = False
        score = 0
        loss_list = []

        state = env.reset()
        # Newer gym versions return (observation, info) from reset().
        if isinstance(state, tuple):
            state = state[0]

        while not done:
            # env.render()

            # Interact with env.
            action = agent.get_action(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer gym versions split `done` into terminated / truncated.
                next_state, reward, terminated, truncated, info = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, info = step_result

            # Hand the transition to the agent so it has data to learn from.
            loss = agent.train_model(state, action, reward, next_state, done)
            state = next_state

            score += reward
            # The agent may report no loss for a step; skip those entries.
            if loss is not None:
                loss_list.append(loss)

            if done:
                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                loss_avg = float(np.mean(loss_list)) if loss_list else float('nan')
                print('epi: {:3d} | score avg {:3.2f} | loss: {:.4f}'.format(e, score_avg, loss_avg))

                # Save data for plot
                episodes.append(e)
                scores_avg.append(score_avg)
                scores_raw.append(score)
                losses.append(loss_avg)

                # View data. Clear first: otherwise every episode stacks
                # another curve on the same axes and the argument-less
                # plt.grid() toggles the grid on and off.
                plt.clf()
                plt.subplot(211)
                plt.plot(episodes, scores_avg, 'b')
                plt.xlabel('episode'); plt.ylabel('average score'); plt.grid()
                plt.title('cartpole A2C TF')
                plt.subplot(212)
                plt.plot(episodes, losses, 'b')
                plt.xlabel('episode'); plt.ylabel('loss'); plt.grid()
                plt.savefig('./save_model/cartpole_acer_TF.png')

                if score_avg > END_SCORE:
                    agent.save_model()
                    end = True
                    break
        if end:
            np.save('./save_model/data/cartpole_acer_TF_epi',episodes)
            np.save('./save_model/data/cartpole_acer_TF_score',scores_raw)
            np.save('./save_model/data/cartpole_acer_TF_loss',losses)
            env.close()
            print("End")
            break