In [1]:
import os
import sys
import argparse
import gym
import tensorflow as tf
import numpy as np
import time

In [2]:
# replay class that stores n size tuple of experiences
class ExpReplay(object):
    def __init__(self, size):
        self._storage = []
        self._maxsize = size       

    def __len__(self):
        return len(self._storage)

    def add(self, s, a, r, next_s, done):
        data = (s, a, r, next_s, done)
        
        self._storage.append(data)
        storage_size = len(self._storage)
        if (storage_size >= self._maxsize):
            self._storage = self._storage[storage_size-self._maxsize:]
        
    def sample(self, batch_size):
        idx = np.random.randint(0, len(self._storage), size=batch_size)
        batch = np.array(self._storage)[idx]
        states, actions, rewards, next_states, isdone = [], [], [], [], []
        
        for s, a, r, ns, done in batch:
            states.append(s)
            actions.append(a)
            rewards.append(r)
            next_states.append(ns)
            isdone.append(done)
        
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(isdone)

In [3]:
class Critic:
    def __init__(self, state, action, state_dims, action_dims, scope='critic'):
        # state - State input to pass through the network
        # action - Action input for which the Q value should be predicted
         
        self.state = state
        self.action = action
        self.state_dims = np.prod(state_dims)       #Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
        self.action_dims = np.prod(action_dims)
        self.scope = scope    
        
        # Networks params              
        dense1_size = 400
        dense2_size = 300
        final_layer_init = 0.003
         
        with tf.variable_scope(self.scope):           
            self.dense1_mul = tf.layers.dense(self.state, dense1_size, kernel_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
                                bias_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))))  
                         
            self.dense1 = tf.nn.relu(self.dense1_mul)
             
            #Merge first dense layer with action input to get second dense layer            
            self.dense2a = tf.layers.dense(self.dense1, dense2_size, kernel_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                                bias_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))))        
             
            self.dense2b = tf.layers.dense(self.action, dense2_size, kernel_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                                bias_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims)))) 
                           
            self.dense2 = tf.nn.relu(self.dense2a + self.dense2b)
                          
            self.output = tf.layers.dense(self.dense2, 1, kernel_initializer=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
                                bias_initializer=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init))  
             
                          
            self.network_params = tf.trainable_variables(self.scope)
          
            self.action_grads = tf.gradients(self.output, self.action) # gradient of value output wrt action input - used to train actor network
            

    def train_step(self, target_Q):
        # target_Q - Target Q value (immediate reward plus expected Q from next state)
         
        with tf.variable_scope(self.scope):
            with tf.variable_scope('train'):
                learning_rate = 0.001
                l2_lambda = 0
                 
                self.optimizer = tf.train.AdamOptimizer(learning_rate)
                self.loss = tf.losses.mean_squared_error(target_Q, self.output)
                self.l2_reg_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.network_params if 'kernel' in v.name]) * l2_lambda
                self.total_loss = self.loss + self.l2_reg_loss
                 
                train_step = self.optimizer.minimize(self.total_loss, var_list=self.network_params)
                  
                return train_step
        

class Actor:
    def __init__(self, state, state_dims, action_dims, action_bound_low, action_bound_high, scope='actor'):
        # state - State input to pass through the network
        # action_bounds - Network will output in range [-1,1]. Multiply this by action_bound to get output within desired boundaries of action space
         
        self.state = state
        self.state_dims = np.prod(state_dims)       #Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
        self.action_dims = np.prod(action_dims)
        self.action_bound_low = action_bound_low
        self.action_bound_high = action_bound_high
        self.scope = scope
        
        # Networks params 
        dense1_size = 400
        dense2_size = 300
        final_layer_init = 0.003
         
        with tf.variable_scope(self.scope):
                    
            self.dense1_mul = tf.layers.dense(self.state, dense1_size, kernel_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
                                bias_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))))  
                         
            self.dense1 = tf.nn.relu(self.dense1_mul)
             
            self.dense2_mul = tf.layers.dense(self.dense1, dense2_size, kernel_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size))), 1/tf.sqrt(tf.to_float(dense1_size))),
                                bias_initializer=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size))), 1/tf.sqrt(tf.to_float(dense1_size))))        
                         
            self.dense2 = tf.nn.relu(self.dense2_mul)
             
            self.output_mul = tf.layers.dense(self.dense2, self.action_dims, kernel_initializer=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
                                bias_initializer=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init)) 
             
            self.output_tanh = tf.nn.tanh(self.output_mul)
             
            # Scale tanh output to lower and upper action bounds
            self.output = tf.multiply(0.5, tf.multiply(self.output_tanh, (self.action_bound_high-self.action_bound_low)) + (self.action_bound_high+self.action_bound_low))
             
            self.network_params = tf.trainable_variables(self.scope)
        
        
    def train_step(self, action_grads):
        # action_grads - gradient of value output wrt action from critic network
         
        with tf.variable_scope(self.scope):
            with tf.variable_scope('train'):
                learning_rate = 0.0001
                batch_size = 64
                 
                self.optimizer = tf.train.AdamOptimizer(learning_rate)
                self.grads = tf.gradients(self.output, self.network_params, -action_grads)  
                 
                train_step = self.optimizer.apply_gradients(zip(self.grads, self.network_params))
                 
                return train_step


In [4]:
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [5]:
# Create session
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)    

class DDPG:
    
    def __init__ (
        self,
        env,
        state_dim,
        action_dim,
        action_low,
        action_high,
        replaybuffer,
        warm_steps = 50000,
        tau = 0.001
    ):
        
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_low = action_low
        self.action_high = action_high
        self.warm_steps = warm_steps
        self.replay = replaybuffer
        self.tau = tau
        
        # Define input placeholders    
        self.state_ph = tf.placeholder(tf.float32, ((None,) + state_dim))
        self.action_ph = tf.placeholder(tf.float32, ((None,) + action_dim))
        self.target_ph = tf.placeholder(tf.float32, (None, 1))  # Target Q-value - for critic training
        self.action_grads_ph = tf.placeholder(tf.float32, ((None,) + action_dim)) # Gradient of critic's value output wrt action input - for actor training
        self.is_training_ph = tf.placeholder_with_default(True, shape=None)
            
        self.critic = Critic(self.state_ph, self.action_ph, state_dim, action_dim, scope='critic_main')
        self.critic_target = Critic(self.state_ph, self.action_ph, state_dim, action_dim, scope='critic_target')    
            
        self.actor = Actor(self.state_ph, state_dim, action_dim, low, high, scope='actor_main')
        self.actor_target = Actor(self.state_ph, state_dim, action_dim, low, high, scope='actor_target')
        
        # Create training step ops
        self.critic_train_step = self.critic.train_step(self.target_ph)
        self.actor_train_step = self.actor.train_step(self.action_grads_ph)

        # Create ops to update target networks
        self.update_critic_target = self.update_target_network(self.critic.network_params, self.critic_target.network_params, self.tau)
        self.update_actor_target = self.update_target_network(self.actor.network_params, self.actor_target.network_params, self.tau)
        
    def update_target_network(self, network_params, target_network_params, tau):     
        
        # When tau=1.0, we perform a hard copy of parameters, otherwise a soft copy
        # Create ops which update target network parameters with (fraction of) main network parameters
        op_holder = []
        for from_var,to_var in zip(network_params, target_network_params):
            op_holder.append(to_var.assign((tf.multiply(from_var, tau) + tf.multiply(to_var, 1. - tau))))        

        return op_holder
        
    def train(self, train_eps = 50000):
        
        start_ep = 0
        sess.run(tf.global_variables_initializer())   
        # Perform hard copy (tau=1.0) of initial params to target networks
        sess.run(self.update_target_network(self.critic.network_params, self.critic_target.network_params, self.tau))
        sess.run(self.update_target_network(self.actor.network_params, self.actor_target.network_params, self.tau))

        # Create summary writer to write summaries to disk
        if not os.path.exists('./logs/train'):
            os.makedirs('./logs/train')
        summary_writer = tf.summary.FileWriter('./logs/train', sess.graph)
    
        # Create summary op to save episode reward to Tensorboard log
        ep_reward_var = tf.Variable(0.0, trainable = False)
        tf.summary.scalar("Episode Reward", ep_reward_var)
        summary_op = tf.summary.merge_all()

        ## Training 
        # Initially populate replay memory by taking random actions 
        sys.stdout.write('\nPopulating replay memory with random actions...\n')   
        sys.stdout.flush()          
        
        state = self.env.reset()
     
        for random_step in range(1, self.warm_steps + 1):
            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)
            self.replay.add(state, action, reward, next_state, done)
            
            state = next_state

            if done:
                state = self.env.reset()

        sys.stdout.write('\n\nTraining...\n')   
        sys.stdout.flush()

        for train_ep in range(start_ep + 1, train_eps + 1):      
            # Reset environment and noise process
            state = self.env.reset()
            exploration_noise.reset()

            train_step = 0
            episode_reward = 0
            duration_values = []
            ep_done = False

            sys.stdout.write('\n')
            sys.stdout.flush()

            while not ep_done:
                train_step += 1
                start_time = time.time()            
                ## Take action and store experience

                action = sess.run(self.actor.output, {self.state_ph: state[None]})[0]     # Add batch dimension to single state input, and remove batch dimension from single action output
                
                action += exploration_noise() * noise_scaling
                next_state, reward, done, _ = self.env.step(action)
                self.replay.add(state, action, reward, next_state, done)

                episode_reward += reward

                ## Train networks
                # Get minibatch
                states_batch, actions_batch, rewards_batch, next_states_batch, done_batch = self.replay.sample(64) 

                # Critic training step    
                future_action = sess.run(self.actor_target.output, {self.state_ph: next_states_batch})  
                future_Q = sess.run(self.critic_target.output, {self.state_ph: next_states_batch, self.action_ph: future_action})[:,0]   # future_Q is of shape [batch_size, 1], need to remove second dimension for ops with terminals_batch and rewards_batch which are of shape [batch_size]
                future_Q[done_batch] = 0
                targets = rewards_batch + (future_Q * 0.99)
                sess.run(self.critic_train_step, {self.state_ph:states_batch, self.action_ph:actions_batch, self.target_ph:np.expand_dims(targets, 1)})   

                # Actor training step
                actor_actions = sess.run(self.actor.output, {self.state_ph:states_batch})
                action_grads = sess.run(self.critic.action_grads, {self.state_ph:states_batch, self.action_ph:actor_actions})
                sess.run(self.actor_train_step, {self.state_ph:states_batch, self.action_grads_ph:action_grads[0]})

                # Update target networks
                sess.run(self.update_critic_target)
                sess.run(self.update_actor_target)

                # Display progress            
                duration = time.time() - start_time
                duration_values.append(duration)
                ave_duration = sum(duration_values)/float(len(duration_values))

                sys.stdout.write('\x1b[2K\rEpisode {:d}/{:d} \t Steps = {:d} \t Reward = {:.3f} \t ({:.3f} s/step)'.format(train_ep, 50000, train_step, episode_reward, ave_duration))
                sys.stdout.flush()
                
                state = next_state

                if done or train_step == 1000:
                    # Log total episode reward and begin next episode
                    summary_str = sess.run(summary_op, {ep_reward_var: episode_reward})
                    summary_writer.add_summary(summary_str, train_ep)
                    ep_done = True

        env.close()     

In [6]:
import gym

env = gym.make('Pendulum-v0')

state_dim = env.observation_space.shape
action_dim = env.action_space.shape
high = env.action_space.high
low = env.action_space.low

# Initialise Ornstein-Uhlenbeck Noise generator
exploration_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
noise_scaling = 0.3 * (high - low)
replaybuffer = ExpReplay(1e6)

random_seed = 99999999
env.seed(random_seed)
np.random.seed(random_seed)
tf.set_random_seed(random_seed)        

ddpg = DDPG(env, state_dim, action_dim, high, low, replaybuffer)

  result = entry_point.load(False)


In [7]:
ddpg.train()

INFO:tensorflow:Summary name Episode Reward is illegal; using Episode_Reward instead.

Populating replay memory with random actions...


Training...

Episode 1/50000 	 Steps = 200 	 Reward = -1574.019 	 (0.021 s/step)
Episode 2/50000 	 Steps = 200 	 Reward = -1279.673 	 (0.021 s/step)
Episode 3/50000 	 Steps = 200 	 Reward = -1703.272 	 (0.021 s/step)
Episode 4/50000 	 Steps = 200 	 Reward = -1417.250 	 (0.021 s/step)
Episode 5/50000 	 Steps = 200 	 Reward = -1402.465 	 (0.021 s/step)
Episode 6/50000 	 Steps = 200 	 Reward = -1860.237 	 (0.021 s/step)
Episode 7/50000 	 Steps = 200 	 Reward = -1890.255 	 (0.021 s/step)
Episode 8/50000 	 Steps = 200 	 Reward = -1678.196 	 (0.022 s/step)
Episode 9/50000 	 Steps = 200 	 Reward = -1552.741 	 (0.021 s/step)
Episode 10/50000 	 Steps = 200 	 Reward = -1503.556 	 (0.021 s/step)
Episode 11/50000 	 Steps = 200 	 Reward = -1458.674 	 (0.021 s/step)
Episode 12/50000 	 Steps = 200 	 Reward = -1540.081 	 (0.022 s/step)
Episode 13/50000 	 Steps = 200 

KeyboardInterrupt: 