In [None]:
from collections import deque
import random
import gym
import numpy as np

import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
import tensorflow.keras.backend as kb
import time

In [None]:
class DDPG():
    """Actor-critic agent following the DDPG scheme, built on TF1 graph ops and Keras models.

    The agent owns four Keras models (primary/target actor, primary/target
    critic), a bounded replay buffer, and a discretised Ornstein-Uhlenbeck
    style noise state used for exploration.  The critic is trained through
    Keras ``fit``; the actor is trained through a hand-built TF1 op that
    applies the critic's action-gradient to the actor weights.

    The Keras calls (``predict``/``fit``/``set_weights``) and the raw
    ``session.run`` calls must operate on the same variables, so the caller
    is expected to have registered ``session`` with the Keras backend
    beforehand (the notebook does this via ``kb.set_session``).
    """

    def __init__(self, session, num_states, num_actions, actor_optim_lr, critic_optim_lr, update_freq,
                 replay_size, batch_size, tau, gamma, theta, mu, sigma, action_high, action_low):
        """Build the networks, the actor training op, and initialise all variables.

        Args:
            session: TF1 session used to run the gradient / optimiser ops.
            num_states: Observation shape tuple; passed to ``layers.Input`` and
                indexed with ``[0]`` when reshaping a single observation.
            num_actions: Action shape tuple; ``num_actions[0]`` is the action size.
            actor_optim_lr: Learning rate for the actor's Adam optimiser.
            critic_optim_lr: Learning rate for the critic's Adam optimiser.
            update_freq: Soft-update the target networks every this many train steps.
            replay_size: Maximum number of transitions kept in the replay buffer.
            batch_size: Number of transitions sampled per training step.
            tau: Interpolation factor of the soft target update.
            gamma: Discount factor applied to the bootstrapped next-state value.
            theta: Mean-reversion rate of the exploration noise.
            mu: Mean the exploration noise reverts to (also its reset value).
            sigma: Scale of the Gaussian term of the exploration noise.
            action_high: Upper clipping bound for noisy actions.
            action_low: Lower clipping bound for noisy actions.
        """
        self.sess = session
        self.num_states = num_states
        self.num_actions = num_actions

        self.actor_optim_lr = actor_optim_lr
        self.critic_optim_lr = critic_optim_lr

        self.update_freq = update_freq
        self.replay_size = replay_size
        self.batch_size = batch_size
        self.tau = tau

        self.gamma = gamma
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.action_high = action_high
        self.action_low = action_low

        self.train_steps = 0
        # maxlen makes the deque drop its oldest entry automatically once full.
        self.replay_buffer = deque(maxlen = self.replay_size)
        self.action_noise = np.ones(self.num_actions[0]) * self.mu

        self.actor_state_input, self.primary_actor_network = self.create_actor_network()
        _, self.target_actor_network = self.create_actor_network()
        self.critic_state_input, self.critic_action_input, self.primary_critic_network = self.create_critic_network()
        _, _, self.target_critic_network = self.create_critic_network()

        # dQ/da: gradient of the primary critic's output w.r.t. its action input.
        self.critic_grads = tf.gradients(self.primary_critic_network.output, self.critic_action_input)

        actor_network_weights = self.primary_actor_network.trainable_weights
        # Fed at train time with the dQ/da values evaluated by the critic.
        self.actor_critic_grads = tf.placeholder(tf.float32,[None, self.num_actions[0]])

        # Chain rule: d(actor output)/d(weights) weighted by -dQ/da.  The sign
        # is flipped because the optimiser minimises, while the actor should
        # move towards actions with a higher critic value.
        self.actor_grads = tf.gradients(self.primary_actor_network.output,
                                        actor_network_weights, - self.actor_critic_grads)

        self.actor_optim = tf.train.AdamOptimizer(self.actor_optim_lr).apply_gradients(
            zip(self.actor_grads, actor_network_weights))

        self.sess.run(tf.global_variables_initializer())

        # The target networks were created with their own random weights.
        # Start them as exact copies of the primary networks; otherwise, with a
        # small tau, the TD targets come from unrelated networks for a very
        # long time.
        self.target_actor_network.set_weights(self.primary_actor_network.get_weights())
        self.target_critic_network.set_weights(self.primary_critic_network.get_weights())

    def create_actor_network(self):
        """Build one actor model: state -> Dense(400, relu) -> Dense(300, relu) -> tanh action.

        Returns:
            Tuple ``(state_input, model)`` where ``state_input`` is the Keras
            input tensor and ``model`` is the compiled Keras model.
        """
        state_input = layers.Input(shape = self.num_states)

        hidden_state_1 = layers.Dense(400,
                         activation = 'relu',
                        )(state_input)
        hidden_state_2 = layers.Dense(300,
                         activation = 'relu',
                        )(hidden_state_1)
        # tanh keeps every action component in [-1, 1].
        output_layer = layers.Dense(self.num_actions[0],
                         activation = 'tanh',
                        )(hidden_state_2)

        model = models.Model(inputs=state_input, outputs=output_layer)
        model.compile(loss='mean_squared_error', optimizer=optimizers.Adam(self.actor_optim_lr))

        return state_input, model

    def create_critic_network(self):
        """Build one critic model mapping a (state, action) pair to a single scalar output.

        The state goes through Dense(400, relu) and a linear Dense(300); the
        action goes through a linear Dense(300).  Both branches are
        concatenated, followed by Dense(300, relu) and a linear Dense(1).

        Returns:
            Tuple ``(state_input, action_input, model)`` with the two Keras
            input tensors and the compiled Keras model.
        """
        state_input = layers.Input(shape = self.num_states)
        action_input = layers.Input(shape = self.num_actions)

        hidden_state_1 = layers.Dense(400,
                         activation = 'relu',
                        )(state_input)
        hidden_state_2 = layers.Dense(300,
                         activation = None,
                        )(hidden_state_1)
        hidden_action_1 = layers.Dense(300,
                         activation = None,
                        )(action_input)

        merged_layer = layers.Concatenate()([hidden_state_2, hidden_action_1])

        hidden_merged_3 = layers.Dense(300,
                         activation = 'relu',
                        )(merged_layer)

        output_layer = layers.Dense(1,
                         activation = None,
                        )(hidden_merged_3)

        model = models.Model(inputs=[state_input,action_input], outputs=output_layer)
        model.compile(loss='mean_squared_error', optimizer=optimizers.Adam(self.critic_optim_lr))

        return state_input, action_input, model

    def update_target_nets(self):
        """Soft-update both target networks: target <- tau * primary + (1 - tau) * target."""
        tau = self.tau

        actor_blend = [
            primary * tau + target * (1 - tau)
            for primary, target in zip(self.primary_actor_network.get_weights(),
                                       self.target_actor_network.get_weights())
        ]
        critic_blend = [
            primary * tau + target * (1 - tau)
            for primary, target in zip(self.primary_critic_network.get_weights(),
                                       self.target_critic_network.get_weights())
        ]

        self.target_actor_network.set_weights(actor_blend)
        self.target_critic_network.set_weights(critic_blend)

    def ou_noise(self):
        """Advance the exploration-noise state by one step and return its new value.

        The step is ``theta * (mu - noise) + sigma * N(0, 1)`` per action
        component.  ``sigma`` is floored at 0.01 before use.

        Returns:
            A copy of the updated noise vector (length ``num_actions[0]``), so
            callers cannot mutate the internal noise state by accident.
        """
        if self.sigma < 0.01:
            self.sigma = 0.01
        deriv = self.theta * (self.mu - self.action_noise) + self.sigma * np.random.randn(len(self.action_noise))
        self.action_noise += deriv
        return self.action_noise.copy()

    def train_network(self):
        """Run one training step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        Every ``update_freq`` steps the target networks are soft-updated first.
        The actor is then updated with the critic's action-gradient, and the
        critic is fitted to the bootstrapped targets.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        self.train_steps += 1

        if self.train_steps % self.update_freq == 0:
            self.update_target_nets()

        replay_batch = random.sample(self.replay_buffer,self.batch_size)

        obs_batch = np.array([replay[0] for replay in replay_batch])
        # Always (batch, action_dim), whether an entry holds a vector or a scalar.
        action_batch = np.array([replay[1] for replay in replay_batch]).reshape(len(replay_batch), -1)
        # Explicit float dtype so integer rewards cannot break the target arithmetic.
        reward_batch = np.array([replay[2] for replay in replay_batch], dtype=np.float64)
        obs_next_batch = np.array([replay[3] for replay in replay_batch])
        terminate_batch = np.array([replay[4] for replay in replay_batch])

        # --- actor update: follow dQ/da evaluated at the actor's own actions ---
        primary_actions = self.primary_actor_network.predict(obs_batch)

        grads = self.sess.run(self.critic_grads, feed_dict={
            self.critic_state_input:  obs_batch,
            self.critic_action_input: primary_actions
        })[0]

        self.sess.run(self.actor_optim, feed_dict={
            self.actor_state_input: obs_batch,
            self.actor_critic_grads: grads
        })

        # --- critic update: regress onto r + gamma * Q_target(s', actor_target(s')) ---
        target_actions = self.target_actor_network.predict(obs_next_batch)
        reward_next = self.target_critic_network.predict([obs_next_batch, target_actions])

        # Terminal transitions get no bootstrapped value.
        not_done = 1.0 - terminate_batch.astype(np.float64)
        td_targets = reward_batch + self.gamma * reward_next.reshape(-1) * not_done

        # Keras' fit defaults to batch_size=32; pass the minibatch size
        # explicitly so the whole sample is used in a single gradient step.
        self.primary_critic_network.fit([obs_batch, action_batch], td_targets,
                                        batch_size=self.batch_size, verbose=0)

    def noised_action(self, obs):
        """Return the primary actor's action for ``obs`` with exploration noise added.

        Args:
            obs: A single observation; reshaped to ``(1, num_states[0])``.

        Returns:
            Array of shape ``(1, num_actions[0])`` clipped to
            ``[action_low, action_high]``.
        """
        obs = obs.reshape(1, self.num_states[0])
        action = self.primary_actor_network.predict(obs)
        action += self.ou_noise()
        action = np.clip(action, self.action_low, self.action_high)

        return action

    def train_and_remember(self, obs, action, reward, obs_next, terminate):
        """Store one transition (with a shaped reward) and trigger a training step.

        Args:
            obs: Observation before the step; ``obs[0]`` and ``obs[1]`` are read
                by the reward shaping below.
            action: Action that was taken; any array-like, stored flattened.
            reward: Environment reward for the step.
            obs_next: Observation after the step.
            terminate: Whether the episode ended on this step.
        """
        # Hand-crafted shaping bonus added on top of the environment reward.
        # NOTE(review): presumably obs[0] is the car position and obs[1] its
        # velocity (MountainCarContinuous) -- this term is environment-specific.
        reward += abs(obs[0] + 0.5) + 5 * max(obs[0],0) + 1 * obs[1]

        # Keep the complete action vector.  Indexing with [0] would keep only
        # the first component when the caller passes a flat 1-D action.
        stored_action = np.array(action).reshape(-1)
        self.replay_buffer.append((obs, stored_action, reward, obs_next, terminate))

        if len(self.replay_buffer) > self.batch_size:
            self.train_network()
        if terminate:
            # Restart the exploration noise from its mean for the next episode.
            self.action_noise = np.ones(self.num_actions[0]) * self.mu

In [None]:
# Seed every source of randomness up front, before any graph ops, sessions or
# environments are created, so a Restart & Run All is reproducible.
SEED = 0
tf.set_random_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Share one TF1 session between raw session.run() calls and the Keras backend,
# so both operate on the same variables.
session = tf.Session()
kb.set_session(session)

env = gym.make('MountainCarContinuous-v0')
# Use the bare environment without the wrappers gym.make adds.
# NOTE(review): presumably done to drop the episode step limit -- confirm.
env = env.unwrapped
# Record the run into the given directory; force=True overwrites earlier recordings.
env = gym.wrappers.Monitor(env, 'MountainCarContinuous-v0-DDPG', force=True)
# The environment has its own RNG (e.g. for start states); seed it as well.
env.seed(SEED)

num_episodes = 100

In [None]:
# Agent hyper-parameters; the observation/action shapes and action bounds are
# taken from the environment.
ddpg = DDPG(
    session = session,
    num_states = env.observation_space.shape,
    num_actions = env.action_space.shape,
    actor_optim_lr = 1e-4,
    critic_optim_lr = 1e-3,
    update_freq = 2,
    replay_size = 10000,
    batch_size = 64,
    tau = 1e-3,
    gamma = 0.99,
    theta = 0.15,
    mu = 0,
    sigma = 0.2,
    action_high = env.action_space.high,
    action_low = env.action_space.low)

# try/finally guarantees the environment (and its Monitor recorder / render
# window) is closed even if training raises or the kernel is interrupted.
try:
    for episode in range(num_episodes):
        obs = env.reset()
        steps = 0

        # An episode only ends when the environment reports termination;
        # there is no step cap in this loop.
        while True:
            env.render()
            action = ddpg.noised_action(obs)
            # noised_action returns shape (1, action_dim); flatten for env.step.
            action = action.reshape((-1,))
            # The environment receives the action scaled by the upper bound,
            # while the agent stores the unscaled action.
            obs_next, reward, terminate, _ = env.step(action * env.action_space.high)
            ddpg.train_and_remember(obs, action, reward, obs_next, terminate)
            steps += 1
            if terminate:
                break
            obs = obs_next

        print("Episode {} completed in {} steps".format(episode + 1, steps))

    # Keep rendering the final frame for about five seconds before closing.
    start = time.time()
    while True:
        env.render()
        if (time.time()-start)>=5:
            break
finally:
    env.close()