In [1]:
import os
import random
import sys
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import pylab
import tensorflow as tf
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.layers import Dense
from tensorflow_probability import distributions as tfd
# Module-level Normal distribution (mean 0, scale 3); nothing in the visible
# cells reads this name.
dist = tfd.Normal(loc=0.0, scale=3.0)

In [2]:
class SAC(tf.keras.Model):
    """Actor-critic network with independent actor and critic branches.

    Both branches read the same input. The actor branch produces ``mu`` and
    ``sig`` (used by the agent as the mean and scale of a Normal policy);
    the critic branch produces a single value estimate.
    """

    def __init__(self, state_size, action_size):
        """Create the layers of both branches.

        Args:
            state_size: Accepted for interface compatibility; not used here.
            action_size: Number of units in the ``mu`` and ``sig`` heads.
        """
        super().__init__()

        def small_uniform():
            # A fresh initializer per layer, drawing from U(-1e-3, 1e-3).
            return RandomUniform(-1e-3, 1e-3)

        # Actor branch: one tanh hidden layer feeding two heads.
        self.actor_fc1 = Dense(24, activation='tanh')
        self.actor_mu = Dense(action_size, kernel_initializer=small_uniform())
        self.actor_sig = Dense(action_size,
                               activation='sigmoid',
                               kernel_initializer=small_uniform())

        # Critic branch: two tanh hidden layers feeding a scalar head.
        self.critic_fc1 = Dense(24, activation='tanh')
        self.critic_fc2 = Dense(24, activation='tanh')
        self.critic_out = Dense(1, kernel_initializer=small_uniform())

    def call(self, x):
        """Run both branches on ``x`` and return ``(mu, sig, val)``."""
        actor_hidden = self.actor_fc1(x)
        mu = self.actor_mu(actor_hidden)
        # The sigmoid output is shifted by a small constant so it is never zero.
        sig = self.actor_sig(actor_hidden) + 1e-5

        critic_hidden = self.critic_fc2(self.critic_fc1(x))
        val = self.critic_out(critic_hidden)

        return mu, sig, val

In [3]:
class SACAgent:
    """One-step actor-critic agent with a Gaussian policy for continuous actions.

    Despite the name, the update implemented in this class is an advantage
    actor-critic step on a single transition: no replay buffer, Q-function
    or entropy term appears here.

    Args:
        state_size: Size of the state vector; forwarded to the model.
        action_size: Number of action components.
        act_min: Lower clipping bound applied to sampled actions.
        act_max: Upper clipping bound applied to sampled actions.
    """

    def __init__(self, state_size, action_size, act_min, act_max):
        self.state_size = state_size
        self.action_size = action_size
        self.act_min = act_min
        self.act_max = act_max

        # Hyper params for learning
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.model = SAC(self.state_size, self.action_size)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases reject.
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.learning_rate, clipnorm=1.0)

    def get_action(self, state):
        """Sample an action for ``state`` and clip it to the action bounds.

        Args:
            state: Batch containing a single state (first axis of size 1).

        Returns:
            NumPy array with one entry per action component, clipped to
            ``[act_min, act_max]``.
        """
        mu, sigma, _ = self.model(state)
        distrib = tfd.Normal(loc=mu[0], scale=sigma[0])
        action = distrib.sample().numpy()
        return np.clip(action, self.act_min, self.act_max)

    def train_model(self, state, action, reward, next_state, done):
        """Apply one gradient step from a single transition.

        Args:
            state: Batch containing the single current state.
            action: Action that was executed in ``state``.
            reward: Scalar reward used for the TD target.
            next_state: Batch containing the single successor state.
            done: Truthy when no value should be bootstrapped from
                ``next_state``.

        Returns:
            Tuple ``(loss, sigma)``: the combined scalar loss and the policy
            scale predicted for ``state``.
        """
        # Explicit float32 conversion keeps NumPy float64 / bool inputs from
        # clashing with the float32 network outputs.
        action = np.asarray(action, dtype=np.float32)
        reward = tf.cast(reward, tf.float32)
        not_done = 1.0 - tf.cast(done, tf.float32)

        # The TD target is treated as a constant, so it is computed outside
        # the tape.
        _, _, next_value = self.model(next_state)
        target = tf.stop_gradient(
            reward + not_done * self.discount_factor * next_value[0])

        with tf.GradientTape() as tape:
            mu, sigma, value = self.model(state)

            # Policy Network
            advantage = tf.stop_gradient(target - value[0])
            distrib = tfd.Normal(loc=mu[0], scale=sigma[0])
            # log_prob stays finite and keeps a gradient even where the
            # density itself would underflow to zero.
            neg_log_prob = -distrib.log_prob(action)
            actor_loss = tf.reduce_mean(neg_log_prob * advantage)

            # Value Network
            critic_loss = tf.reduce_mean(0.5 * tf.square(target - value[0]))

            loss = 0.1 * actor_loss + critic_loss

        # Read the variables only after a forward pass: before the model is
        # built this list is empty and the update would silently do nothing.
        model_params = self.model.trainable_variables

        # Update weights
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        return loss, sigma

In [4]:
%matplotlib tk

ENV_NAME = 'Pendulum-v0'
EPISODES = 1000

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    agent = SACAgent(state_size, action_size,
                        env.action_space.low[0],
                        env.action_space.high[0])
    scores, episodes, losses, sigmas = [], [], [], []
    score_avg = 0

    end = False
    
    for e in range(EPISODES):
        done = False
        score = 0
        loss_list, sigma_list = [], []

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            env.render()

            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            score += reward
            reward = 0.1 if not done or score == 500 else -1

            loss, sigma = agent.train_model(state, action, reward, next_state, done)
            loss_list.append(loss)
            sigma_list.append(sigma)
            
            state = next_state
            if done:
                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                print("episode: {:3d} | score avg: {:3.2f} | loss: {:.3f} | sigma: {:.3f}".format(
                      e, score_avg, np.mean(loss_list), np.mean(sigma)))

                episodes.append(e)
                scores.append(score_avg)
                losses.append(np.mean(loss_list))
                sigmas.append(np.mean(sigma))

                plt.subplot(311)
                plt.plot(episodes, scores, 'b')
                plt.xlabel('episode')
                plt.ylabel('average score')
                plt.title('pendulum SAC')
                plt.grid()
                
                plt.subplot(312)
                plt.plot(episodes, sigmas, 'b')
                plt.xlabel('episode')
                plt.ylabel('sigma')
                plt.grid()
                
                plt.subplot(313)
                plt.plot(episodes, losses, 'b')
                plt.xlabel('episode')
                plt.ylabel('losses')
                plt.grid()
                plt.savefig("./save_model/pendulum_sac.png")

                # 이동 평균이 400 이상일 때 종료
                if score_avg > 400:
                    agent.model.save_weights("./save_model/pendulum_sac", save_format="tf")
                    end = True
                    break
        if end == True:
            env.close()
            np.save('./save_model/pendulum_sac_epi',  episodes)
            np.save('./save_model/pendulum_sac_score',scores)
            np.save('./save_model/pendulum_sac_loss', losses)
            np.save('./save_model/pendulum_sac_sigmas', sigmas)
            print("End")
            break

episode:   0 | score avg: -1454.82 | loss: 0.088 | sigma: 0.495
episode:   1 | score avg: -1395.70 | loss: 0.143 | sigma: 0.378
episode:   2 | score avg: -1430.40 | loss: 0.157 | sigma: 0.513
episode:   3 | score avg: -1438.18 | loss: 0.200 | sigma: 0.410
episode:   4 | score avg: -1389.85 | loss: 0.156 | sigma: 0.371
episode:   5 | score avg: -1340.41 | loss: 0.076 | sigma: 0.457
episode:   6 | score avg: -1385.89 | loss: 0.151 | sigma: 0.452
episode:   7 | score avg: -1417.44 | loss: 0.212 | sigma: 0.447
episode:   8 | score avg: -1388.36 | loss: 0.265 | sigma: 0.478
episode:   9 | score avg: -1437.53 | loss: 0.277 | sigma: 0.480
episode:  10 | score avg: -1469.69 | loss: 0.284 | sigma: 0.488
episode:  11 | score avg: -1503.70 | loss: 0.276 | sigma: 0.495
episode:  12 | score avg: -1533.00 | loss: 0.290 | sigma: 0.477
episode:  13 | score avg: -1553.38 | loss: 0.301 | sigma: 0.503
episode:  14 | score avg: -1527.89 | loss: 0.339 | sigma: 0.498
episode:  15 | score avg: -1524.05 | los

KeyboardInterrupt: 

In [4]:
# Release the environment once training has stopped; relies on the `env`
# object created by the training cell.
env.close()