In [None]:
# References:
#  https://pasus.tistory.com/138
#  https://horomary.hatenablog.com/entry/2020/06/26/003806
#  https://keras.io/examples/rl/ddpg_pendulum/
#  https://github.com/dongminlee94/Samsung-DRL-Code/blob/master/5_SAC/sac/model.py
# ! pip 
import gym
import sys
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, concatenate, Lambda
import tensorflow_probability as tfp
tfd = tfp.distributions
from collections import deque
import matplotlib.pyplot as plt

In [None]:
class Actor(tf.keras.Model):
    """Gaussian policy network: maps a batch of states to ``(mu, std)``.

    Two 64-unit ReLU layers feed two linear heads of width ``action_size``,
    one producing the mean and one producing the log standard deviation.
    The log-std is clipped to ``[log_std_min, log_std_max]`` and then
    exponentiated, so the returned ``std`` is always positive.

    Args:
        state_size: Accepted for signature symmetry; not used here.
        action_size: Width of both output heads.
        log_std_min: Lower clip bound applied to the log-std head.
        log_std_max: Upper clip bound applied to the log-std head.
    """

    def __init__(self, state_size, action_size, log_std_min, log_std_max):
        super().__init__()
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        # NOTE(review): attribute names and creation order are deliberately
        # left unchanged; previously saved weights may be matched against them.
        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(64, activation='relu')
        self.mu = Dense(action_size)
        self.log_std = Dense(action_size)

    def call(self, x):
        """Return ``(mu, std)`` for the input batch ``x``."""
        features = self.fc2(self.fc1(x))
        mean = self.mu(features)
        bounded_log_std = tf.clip_by_value(
            self.log_std(features), self.log_std_min, self.log_std_max
        )
        return mean, tf.math.exp(bounded_log_std)

class Critic(tf.keras.Model):
    """Q-network scoring a ``(state, action)`` pair with a single value.

    The state passes through a 16 -> 32 ReLU branch and the action through a
    32 -> 32 ReLU branch. Both branches are joined on the last axis and fed
    through two 64-unit ReLU layers and a 1-unit linear output.

    Args:
        state_size: Not used by the constructor.
        action_size: Not used by the constructor.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        # State branch.
        self.s1 = Dense(16, activation='relu')
        self.s2 = Dense(32, activation='relu')
        # Action branch.
        self.a1 = Dense(32, activation='relu')
        self.a2 = Dense(32, activation='relu')
        # Joint trunk and scalar head.
        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(64, activation='relu')
        self.out = Dense(1, activation='linear')

    def call(self, state, action):
        """Return the Q-value tensor for ``state`` and ``action``."""
        state_features = self.s2(self.s1(state))
        action_features = self.a2(self.a1(action))
        joint = concatenate([state_features, action_features], axis=-1)
        return self.out(self.fc2(self.fc1(joint)))

In [None]:
# https://github.com/dongminlee94/Samsung-DRL-Code/blob/master/5_SAC/sac/utils.py
# https://github.com/RickyMexx/SAC-tf2/blob/master/SAC/SAC_rla.py
# https://github.com/p-christ/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/blob/b338c87bebb672e39304e47e0eed55aeb462b243/agents/Base_Agent.py#L278
# 
class SACAgent:
    """Inference-only SAC agent: builds the actor and restores saved weights.

    Args:
        state_size: Length of the state vector fed to the actor.
        action_size: Number of action dimensions produced by the actor.
        action_min: Despite the name, this value is forwarded to ``Actor`` as
            ``log_std_min`` (lower clip bound of the log-std head).
        action_max: Forwarded to ``Actor`` as ``log_std_max``.
        weights_path: Optional checkpoint path/prefix for the actor weights.
            Defaults to ``DEFAULT_WEIGHTS_PATH``.
    """

    DEFAULT_WEIGHTS_PATH = "./save_model/pendulum_sac_TF_actor"

    def __init__(self, state_size, action_size, action_min, action_max,
                 weights_path=None):
        self.state_size = state_size
        self.action_size = action_size
        # Kept under their original names for compatibility; they hold the
        # log-std clip range handed to the actor below.
        self.action_min = action_min
        self.action_max = action_max
        self.weights_path = (
            self.DEFAULT_WEIGHTS_PATH if weights_path is None else weights_path
        )

        self.actor = Actor(self.state_size, self.action_size,
                           self.action_min, self.action_max)
        # Create the variables before restoring weights: build the model and
        # run it once on a symbolic input, as the original setup did.
        self.actor.build(input_shape=(None, self.state_size))
        state_in = Input(shape=(self.state_size,), dtype=tf.float32)
        self.actor(state_in)
        self.actor.summary()
        self.load_model()

    def get_action(self, state, deterministic=True):
        """Return the action for a single (un-batched) ``state``.

        Args:
            state: One observation; it is wrapped into a batch of size 1.
            deterministic: When True (default) return ``tanh(mu)``, the
                squashed mean of the policy. When False, draw a sample through
                ``eval_action``.

        Returns:
            The first (only) row of the action batch, bounded to (-1, 1) by
            the tanh squashing.
        """
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        mu, std = self.actor(state)
        if deterministic:
            # eval_action defines the policy's action as tanh(z), z ~ N(mu, std).
            # The matching deterministic action is tanh(mu), not the raw mean.
            action = tf.math.tanh(mu)
        else:
            action, _log_prob = self.eval_action(mu, std)
        return action[0]

    def eval_action(self, mu, std, epsilon=1e-6):
        """Sample a tanh-squashed action and its log-probability.

        Args:
            mu: Mean of the Gaussian over pre-squash actions.
            std: Standard deviation of that Gaussian.
            epsilon: Small constant added inside the log to keep it finite
                when ``tanh(z)**2`` approaches 1.

        Returns:
            ``(action, log_prob)`` where ``action = tanh(z)`` and ``log_prob``
            is the Gaussian log-density of ``z`` minus
            ``log(1 - action**2 + epsilon)``, summed over the last axis with
            ``keepdims=True``.
        """
        action_prob = tfd.Normal(loc=mu, scale=std)
        z = action_prob.sample()
        action = tf.math.tanh(z)
        # Change-of-variables correction for the tanh squashing.
        log_prob = action_prob.log_prob(z) - tf.math.log(1.0 - tf.pow(action, 2) + epsilon)
        log_prob = tf.reduce_sum(log_prob, axis=-1, keepdims=True)
        return action, log_prob

    def load_model(self, path=None):
        """Load actor weights from ``path`` (defaults to ``self.weights_path``)."""
        self.actor.load_weights(self.weights_path if path is None else path)


In [None]:
%matplotlib tk

ENV_NAME = 'Pendulum-v0'
EPISODES = 5  # number of rendered evaluation episodes

if __name__ == "__main__":
    # Evaluation only: roll out the restored actor and report episode returns.
    env = gym.make(ENV_NAME)
    state_size      = env.observation_space.shape[0]
    action_size     = env.action_space.shape[0]
    # Clip range for the actor's log-std head (passed through SACAgent).
    log_std_min     = -20.0
    log_std_max     = 5.0

    agent = SACAgent(state_size, action_size, log_std_min, log_std_max)
    print('Env Name : ',ENV_NAME)
    print('States {0}, Actions {1}'.format(state_size, action_size))
    print('Action scale exp({0:.2f} ~ {1:.2f})'.format(log_std_min, log_std_max))

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        while not done:
            env.render()
            action = agent.get_action(state)
            # Hand the environment a plain NumPy array rather than a TF tensor.
            # `_info` (not `_`) avoids shadowing IPython's last-output variable.
            next_state, reward, done, _info = env.step(np.asarray(action))
            state = next_state
            score += reward
        # The loop exits only once `done` is True, so this runs once per episode.
        print("episode: {0:3d} | score: {1:3.2f} |".format(e, score))

In [None]:
# Close the environment created in the evaluation cell above.
# Kept in its own cell so it can be run after the rollout loop finishes or is interrupted.
env.close()