In [1]:
import gym
import sys
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Lambda, concatenate
from collections import deque
import matplotlib.pyplot as plt

In [2]:
class Actor(tf.keras.Model):
    def __init__(self, state_size, action_size, action_max):
        super(Actor, self).__init__()

        self.action_max = action_max

        self.a1 = Dense(64, activation='relu')
        self.a2 = Dense(32, activation='relu')
        self.a3 = Dense(12, activation='relu')
        self.out= Dense(action_size, activation='tanh')

    def call(self, x):
        x = self.a1(x)
        x = self.a2(x)
        x = self.a3(x)
        a = self.out(x)
        a = Lambda(lambda x: x*self.action_max)(a)
        return a

class Critic(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.s1= Dense(32, activation='relu')
        self.a1= Dense(32, activation='relu')
        self.c1= Dense(32, activation='relu')
        self.c2= Dense(16, activation='relu')
        self.out= Dense(1, activation='linear')

    def call(self, state, action):
        # state = state_action[0]
        # action = state_action[1]

        state = self.s1(state)
        action = self.a1(action)
        c = concatenate([state,action],axis=-1)
        c = self.c1(c)
        c = self.c2(c)
        q = self.out(c)
        return q
        
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [3]:
class DDPGAgent(tf.keras.Model):
    def __init__(self, state_size, action_size, action_max):
        super(DDPGAgent, self).__init__()
        self.state_size = state_size
        self.action_size= action_size
        self.action_max = action_max

        # Hyper params for learning
        self.discount_factor = 0.99
        self.learning_rate_actor = 0.001
        self.learning_rate_critic = 0.01
        self.tau = 0.001
        
        # Experience Replay
        self.batch_size = 256
        self.train_start = 400
        self.memory = deque(maxlen=20000)
        
        # Neural Network Architecture
        self.actor        = Actor(self.state_size, self.action_size, self.action_max)
        self.target_actor = Actor(self.state_size, self.action_size, self.action_max)
        self.critic       = Critic(self.state_size, self.action_size)
        self.target_critic= Critic(self.state_size, self.action_size)
        self.optimizer_actor    = tf.keras.optimizers.Adam(lr=self.learning_rate_actor)
        self.optimizer_critic   = tf.keras.optimizers.Adam(lr=self.learning_rate_critic)
        self.ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=0.2 * np.ones(1))
        
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        self.update_target_model()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def update_target_model(self):
        weight          = self.actor.get_weights()
        target_weight   = self.target_actor.get_weights()
        print('Len of weight : ',len(weight))
        print(weight)
        for i in range(1,len(weight)):
            target_weight[i] = self.tau * weight[i] + (1. - self.tau) * target_weight[i]
        self.target_actor.set_weights(target_weight)

        weight          = self.critic.get_weights()
        target_weight   = self.target_critic.get_weights()
        for i in range(1,len(weight)):
            target_weight[i] = self.tau * weight[i] + (1. - self.tau) * target_weight[i]
        self.target_critic.set_weights(target_weight)
        return 
        
    def get_action(self,state):
        # Exploration and Exploitation
        action = self.actor(state)
        action = action.numpy()[0]
        noise = self.ou_noise()
        out = np.clip(action + noise, -self.action_max, self.action_max)
        return out
        # if (np.random.rand() <= self.epsilon):
        #     return random.randrange(self.action_size)
        # else:
        #     return np.argmax(self.model.predict(state))
        
    def train_model(self):
        # Train from Experience Replay
        # Training Condition - Memory Size
        if len(self.memory) < self.train_start:
            return 0.0
        # Sampling from the memory
        mini_batch = random.sample(self.memory, self.batch_size)
        
        states      = np.array([sample[0][0] for sample in mini_batch])
        actions     = np.array([sample[1] for sample in mini_batch])
        rewards     = np.array([sample[2] for sample in mini_batch])
        next_states = np.array([sample[3][0] for sample in mini_batch])
        dones       = np.array([sample[4] for sample in mini_batch])

        critic_params = self.critic.trainable_variables
        with tf.GradientTape() as tape:
            q               = self.critic(states,actions)
            target_action   = self.target_actor(next_states)
            target_q        = self.target_critic(next_states, target_action)
            y = rewards + (1 - dones) * self.discount_factor * target_q
            td_error = y - q
            critic_loss = tf.reduce_mean(tf.square(y - q))
            # critic_loss = tf.keras.losses.MSE(y,q)
            
        critic_grads = tape.gradient(critic_loss, critic_params)
        self.optimizer_critic.apply_gradients(zip(critic_grads, critic_params))
         
        actor_params = self.actor.trainable_variables
        with tf.GradientTape() as tape:
            now_action  = self.actor(states)
            now_q       = self.critic(states,now_action)
            actor_loss  = -tf.reduce_mean(now_q)
            
        actor_grads = tape.gradient(actor_loss, actor_params)
        self.optimizer_actor.apply_gradients(zip(actor_grads, actor_params))

        # agent.update_target_model()
        # return critic_loss, actor_loss
        return

In [4]:
%matplotlib tk

ENV_NAME = 'Pendulum-v0'
EPISODES = 1000
END_SCORE = 200

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    agent = DDPGAgent(state_size, action_size,env.action_space.high[0])
    # agent.get_act_lim(  env.action_space.low[0],
    #                     env.action_space.high[0])
    print('Env Name : ',ENV_NAME)
    print('States {}, Actions {} in {} ~ {}'
            .format(state_size, action_size,env.
                    action_space.low[0],env.action_space.high[0]))
    
    scores, episodes = [], []
    score_avg = 0
    
    end = False

    for e in range(EPISODES):
        # Episode initialization
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            # env.render()

            # Interact with env.
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            agent.train_model()
            state = next_state

            score += reward
            if done:

                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                print('epi: {:3d} | score avg {:3.2f} | mem length: {:4d}'
                      .format(e, score_avg, len(agent.memory)))

                # Save data for plot
                episodes.append(e)
                scores.append(score_avg)

                # View data
                plt.plot(episodes, scores, 'b')
                plt.xlabel('episode')
                plt.ylabel('average score')
                plt.title('Pendulum DDPG')
                plt.grid()

                plt.savefig('./save_model/pendulum_ddpg_TF.png')

                if score_avg > END_SCORE:
                    agent.model.save_weights('./save_model/pendulum_ddpg_TF', save_format='tf')
                    end = True
                    break
        if end == True:
            env.close()
            np.save('./save_model/pendulum_ddpg_TF_epi',  episodes)
            np.save('./save_model/pendulum_ddpg_TF_score',scores)
            print("End")
            break

Len of weight :  0
[]
Env Name :  Pendulum-v0
States 3, Actions 1 in -2.0 ~ 2.0


ValueError: Sample larger than population or is negative

  self.func()


In [None]:
env.close()