In [1]:
import gym
import sys
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, concatenate, Lambda
from collections import deque
import matplotlib.pyplot as plt

In [2]:
class Actor(tf.keras.Model):
    def __init__(self, state_size, action_size, action_min, action_max):
        super(Actor, self).__init__()
        self.action_min = action_min
        self.action_max = action_max

        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(64, activation='relu')
        # self.fc3 = Dense(16, activation='relu')
        self.out= Dense(action_size, activation='tanh',kernel_initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)) # -1 ~ +1

    def call(self, x):
        x       = self.fc1(x)
        x       = self.fc2(x)
        # x       = self.fc3(x)
        action  = self.out(x)
        # return self.projected_to_action_space(action)
        a = Lambda(lambda x: x*self.action_max)(action)
        return a

class Critic(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.s1 = Dense(16, activation='relu')
        self.s2 = Dense(32, activation='relu')
        self.a1 = Dense(32, activation='relu')
        self.a2 = Dense(32, activation='relu')
        self.fc1= Dense(64, activation='relu')
        self.fc2= Dense(64, activation='relu')
        self.out= Dense(1,  activation='linear')

    def call(self,state,action):
        # state  = state_action[0]
        # action = state_action[1]
        s = self.s1(state)
        s = self.s2(s)
        a = self.a1(action)
        a = self.a2(a)
        c = concatenate([s,a],axis=-1)
        x = self.fc1(c)
        x = self.fc2(x)
        q = self.out(x)
        return q

In [3]:
class DDPGAgent:
    def __init__(self, state_size, action_size, action_min, action_max):
        self.state_size = state_size
        self.action_size= action_size
        self.action_min = action_min
        self.action_max = action_max
        
        self.actor          = Actor(self.state_size, self.action_size, self.action_min, self.action_max)
        state_in = Input((self.state_size,))
        self.actor(state_in)
        self.actor.summary()
        
    def get_action(self,state):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        action = self.actor(state)
        action_from_net = action.numpy()[0]
        # Exploration and Exploitation
        return np.clip(action_from_net,self.action_min,self.action_max)

    def load_model(self):
        self.actor.load_weights( "./save_model/LunarLanderContinuous_ddpg_TF_actor" )
        return


In [4]:
%matplotlib tk

ENV_NAME = 'LunarLanderContinuous-v2'
EPISODES = 10
# END_SCORE = -150

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size  = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    action_min  = env.action_space.low[0]
    action_max  = env.action_space.high[0]

    agent = DDPGAgent(state_size, action_size, action_min, action_max)
    print('Env Name : ',ENV_NAME)
    print('States {0}, Actions {1}'.format(state_size, action_size))
    print('Action space {0:.2f} ~ {1:.2f}'.format(action_min, action_max))
    scores, episodes = [], []
    score_avg = 0

    end = False
    show_media_info = True
    
    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        while not done:
            env.render()

            # Interact with env.
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            state = next_state
            # 
            score += reward
            if show_media_info:
                print("State Shape : ", np.shape(state))
                print("Action Shape : ", np.shape(action))
                print("Reward Shape : ", np.shape(reward))
                print("done Shape : ", np.shape(done))
                show_media_info = False
            if done:
                print("episode : {0:3d} | score : {1:3.2f} |".format(e, score))

n :  [-0.00133296  0.00268901]
action from net :  [-0.00137478  0.00272951]
action :  [-0.00137478  0.00272951]
action from net :  [-0.00142426  0.00277518]
action :  [-0.00142426  0.00277518]
action from net :  [-0.00148198  0.00281386]
action :  [-0.00148198  0.00281386]
action from net :  [-0.00153974  0.00284194]
action :  [-0.00153974  0.00284194]
action from net :  [-0.00159714  0.00286501]
action :  [-0.00159714  0.00286501]
action from net :  [-0.00165503  0.00288777]
action :  [-0.00165503  0.00288777]
action from net :  [-0.00171413  0.00292065]
action :  [-0.00171413  0.00292065]
action from net :  [-0.00177866  0.00297474]
action :  [-0.00177866  0.00297474]
action from net :  [-0.00184375  0.003029  ]
action :  [-0.00184375  0.003029  ]
action from net :  [-0.00399051  0.00062954]
action :  [-0.00399051  0.00062954]
episode :   5 | score : -119.88 |
action from net :  [-0.00027443  0.00057894]
action :  [-0.00027443  0.00057894]
action from net :  [-0.00029023  0.00061824]

KeyboardInterrupt: 

In [None]:
env.close()