In [1]:
import gym
import sys
import pylab
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow_probability import distributions as tfd
from tensorflow.keras.initializers import RandomUniform
from collections import deque
import matplotlib.pyplot as plt
dist = tfd.Normal(loc=0., scale=3.)

In [2]:
class CAC(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(CAC, self).__init__()
        self.actor_fc1 = Dense(24,activation='tanh')
        self.actor_mu  = Dense(action_size,kernel_initializer=RandomUniform(-1e-3,1e-3))
        self.actor_sig = Dense(action_size, activation='sigmoid',kernel_initializer=RandomUniform(-1e-3,1e-3))

        self.critic_fc1 = Dense(24,activation='tanh')
        self.critic_fc2 = Dense(24,activation='tanh')
        self.critic_out = Dense(1,kernel_initializer=tf.keras.initializers.RandomUniform(-1e-3,1e-3))

    def call(self,x):
        x1  = self.actor_fc1(x)
        mu  = self.actor_mu(x1)
        sig = self.actor_sig(x1)
        sig+= 1e-5

        x1  = self.critic_fc1(x)
        x2  = self.critic_fc2(x1)
        val = self.critic_out(x2)

        return mu, sig, val

In [3]:
class CACAgent:
    def __init__(self, state_size, action_size, act_min, act_max):
        self.state_size = state_size
        self.action_size= action_size
        self.act_min = act_min
        self.act_max = act_max

        # Hyper params for learning
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.model     = CAC(self.state_size, self.action_size)
        self.optimizer = tf.keras.optimizers.Adam(lr=self.learning_rate, clipnorm=1.0)

    def get_action(self, state):
        mu, sigma, _ = self.model(state)
        distrib = tfd.Normal(loc=mu[0], scale=sigma[0])
        action = distrib.sample([1])[0]
        return np.clip(action, self.act_min, self.act_max)

    def train_model(self, state, action, reward, next_state, done):
        model_params = self.model.trainable_variables
        with tf.GradientTape() as tape:
            mu, sigma, value = self.model(state)
            _, _, next_value = self.model(next_state)
            target = reward + (1 - done) * self.discount_factor * next_value[0]

            # Policy Network
            advantage = tf.stop_gradient(target - value[0])
            distrib = tfd.Normal(loc=mu[0], scale=sigma[0])
            action_prob = distrib.prob([action])[0]
            cross_entropy = - tf.math.log(action_prob + 1e-5)
            actor_loss = tf.reduce_mean(cross_entropy * advantage)

            # Value Network
            critic_loss = 0.5 * tf.square(tf.stop_gradient(target) - value[0])
            critic_loss = tf.reduce_mean(critic_loss)

            loss = 0.1 * actor_loss + critic_loss
        
        # Update weights
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        return loss, sigma

In [4]:
%matplotlib tk

ENV_NAME = 'Pendulum-v0'
EPISODES = 1000

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    agent = CACAgent(state_size, action_size,
                        env.action_space.low[0],
                        env.action_space.high[0])
    scores, episodes, losses, sigmas = [], [], [], []
    score_avg = 0

    end = False
    
    for e in range(EPISODES):
        done = False
        score = 0
        loss_list, sigma_list = [], []

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            env.render()

            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            score += reward
            reward = 0.1 if not done or score == 500 else -1

            loss, sigma = agent.train_model(state, action, reward, next_state, done)
            loss_list.append(loss)
            sigma_list.append(sigma)
            
            state = next_state
            if done:
                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                print("episode: {:3d} | score avg: {:3.2f} | loss: {:.3f} | sigma: {:.3f}".format(
                      e, score_avg, np.mean(loss_list), np.mean(sigma)))

                episodes.append(e)
                scores.append(score_avg)
                losses.append(np.mean(loss_list))
                sigmas.append(np.mean(sigma))

                plt.subplot(311)
                plt.plot(episodes, scores, 'b')
                plt.xlabel('episode')
                plt.ylabel('average score')
                plt.title('pendulum SAC')
                plt.grid()
                
                plt.subplot(312)
                plt.plot(episodes, sigmas, 'b')
                plt.xlabel('episode')
                plt.ylabel('sigma')
                plt.grid()
                
                plt.subplot(313)
                plt.plot(episodes, losses, 'b')
                plt.xlabel('episode')
                plt.ylabel('losses')
                plt.grid()
                plt.savefig("./save_model/pendulum_sac.png")

                # 이동 평균이 400 이상일 때 종료
                if score_avg > 400:
                    agent.model.save_weights("./save_model/pendulum_sac", save_format="tf")
                    end = True
                    break
        if end == True:
            env.close()
            np.save('./save_model/pendulum_sac_epi',  episodes)
            np.save('./save_model/pendulum_sac_score',scores)
            np.save('./save_model/pendulum_sac_loss', losses)
            np.save('./save_model/pendulum_sac_sigmas', sigmas)
            print("End")
            break

95 | loss: 0.240 | sigma: 0.278
episode: 323 | score avg: -1452.30 | loss: 0.238 | sigma: 0.280
episode: 324 | score avg: -1447.60 | loss: 0.237 | sigma: 0.165
episode: 325 | score avg: -1446.77 | loss: 0.236 | sigma: 0.266
episode: 326 | score avg: -1452.96 | loss: 0.237 | sigma: 0.257
episode: 327 | score avg: -1457.61 | loss: 0.235 | sigma: 0.277
episode: 328 | score avg: -1461.85 | loss: 0.239 | sigma: 0.348
episode: 329 | score avg: -1454.87 | loss: 0.459 | sigma: 0.335
episode: 330 | score avg: -1459.80 | loss: 0.240 | sigma: 0.329
episode: 331 | score avg: -1463.70 | loss: 0.236 | sigma: 0.313
episode: 332 | score avg: -1460.31 | loss: 0.237 | sigma: 0.183
episode: 333 | score avg: -1458.43 | loss: 0.284 | sigma: 0.307
episode: 334 | score avg: -1455.77 | loss: 0.243 | sigma: 0.308
episode: 335 | score avg: -1459.51 | loss: 0.237 | sigma: 0.267
episode: 336 | score avg: -1465.21 | loss: 0.239 | sigma: 0.188
episode: 337 | score avg: -1464.35 | loss: 0.236 | sigma: 0.291
episode:

KeyboardInterrupt: 

In [4]:
np.save('./save_model/pendulum_sac_epi',  episodes)
np.save('./save_model/pendulum_sac_score',scores)
np.save('./save_model/pendulum_sac_loss', losses)
np.save('./save_model/pendulum_sac_sigmas', sigmas)
agent.model.save_weights("./save_model/pendulum_sac", save_format="tf")