# DDPG

The interaction between these two models is key to the functioning of DDPG:

1. The Actor generates an action given the current state.

2. This action, along with the state, is evaluated by the Critic to compute the value.

3. The Critic’s output (the value) is used to update both the Critic and the Actor.
   The Critic updates its weights to better predict future rewards, while the Actor
   uses the value gradient (provided by the Critic) to update its policy to generate
   more rewarding actions in the future.

In [2]:
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import deque

In [3]:
class Environment:
    def __init__(self):
        self.state = self.reset()
        self.goal = np.zeros((256,))  # The goal is a zero vector of size 256

    def reset(self):
        # Initialize a state with 256 continuous values between -1 and 1
        self.state = np.random.uniform(-1, 1, size=(256,))
        return self.state

    def step(self, action):
        action = np.clip(action, -1, 1)  # Ensure each action component is between -1 and 1
        self.state += action  # Update the state by taking the action.
        self.state = np.clip(self.state, -1, 1)  # Ensure state remains within the desired range
        reward = -np.linalg.norm(self.state - self.goal)  # Use norm as the reward function
        done = np.linalg.norm(self.state - self.goal) < 0.1  # Termination condition based on distance
        return self.state, reward, done

    def sample_action(self):
        return np.random.uniform(-1, 1, size=(256,))

In [4]:
class OUNoise:
    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(action_dimension) * mu

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

In [5]:
class Actor:
    """
    The Actor model, also known as the policy network, is responsible for directly determining the action
    to take given the current state of the environment.
     
    Its main objectives are:

    1. Action Selection:

       The Actor takes the current state of the environment as input and outputs the best
       perceived action to take.

    2. Policy Approximation:
    
       The Actor model represents the policy function of the agent. It approximates
       the optimal policy mapping states to actions that maximize the long-term reward.

    3. Exploration and Exploitation:
    
       During training, the Actor's policy is often augmented with noise (e.g., Ornstein-Uhlenbeck process)
       to encourage exploration of the state space. This helps in discovering more about the environment
       and avoids the pitfall of getting trapped in local optima.
    """
    def __init__(self):
        self.model = self.create_model()

    def create_model(self):
        inputs = layers.Input(shape=(256,))
        out = layers.Dense(512, activation="relu")(inputs)
        out = layers.Dense(512, activation="relu")(out)
        outputs = layers.Dense(256, activation="tanh")(out)  # Output a 256-dimensional action vector
        model = tf.keras.Model(inputs, outputs)
        return model

In [6]:
class Critic:
    """
    The Critic model, also known as the value network, evaluates the action taken by the Actor by computing
    the value function. The main functions of the Critic in DDPG are:

    1. Value Estimation:
    
       The Critic estimates the value of the current state-action pair. This value is a
       measure of the expected future rewards that can be obtained from that state-action pair. It helps
       in evaluating how good the action taken by the Actor is.

    2. Training the Actor:
    
       The Critic assists in training the Actor by providing gradients of the value
       function with respect to the actions taken. This feedback helps the Actor adjust its parameters
       to produce better actions that can lead to higher rewards.

    3. Temporal Difference Learning:
    
       The Critic uses a technique called Temporal Difference (TD) learning
       to update its own weights. It compares the predicted value of the current state-action pair with
       the reward received from the environment plus the predicted value of the next state-action pair,
       allowing it to adjust its value predictions to be more accurate.
    """
    def __init__(self):
        self.model = self.create_model()

    def create_model(self):
        state_input = layers.Input(shape=(256,))
        state_out = layers.Dense(512, activation="relu")(state_input)
        action_input = layers.Input(shape=(256,))
        action_out = layers.Dense(512, activation="relu")(action_input)
        concat = layers.Concatenate()([state_out, action_out])
        out = layers.Dense(512, activation="relu")(concat)
        outputs = layers.Dense(1)(out)  # Output a single value representing the Q-value
        model = tf.keras.Model([state_input, action_input], outputs)
        return model

In [7]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        """
        Ensure that each experience added is long enough
        """
        self.buffer.append(experience)

    def sample(self, batch_size):
        """
        Sample only those entries that have a complete sequence
        """
        return random.sample(self.buffer, batch_size)

In [8]:
actor = Actor()
critic = Critic()

actor_optimizer = tf.keras.optimizers.Adam(0.001)
critic_optimizer = tf.keras.optimizers.Adam(0.002)

env = Environment()
noise_process = OUNoise(action_dimension=256)
buffer = ReplayBuffer(10000)

TOTAL_EPISODES = 100
DISCOUNT_FACTOR = 0.97
STEPS_FORWARD = 10
TRAIN_BUFFER_SIZE = 10
MAX_STEPS_PER_EPISODE = 100

In [42]:
def train_actor_and_critic(batch):
    print(f"Training with {len(batch)} episode sequences.")
    for sequence in batch:
        states, actions, rewards, next_states, dones = zip(*sequence)
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        next_states = np.array(next_states)
        dones = np.array(dones)
        update_networks(states, actions, rewards, next_states, dones)

In [43]:
def update_networks(states, actions, rewards, next_states, dones):
    """
    Updates the weights of the Actor and Critic networks using a batch of experience tuples.
    
    This function performs two main tasks:

    1. Updates the Critic network by minimizing the Mean Squared Error between the Critic's predicted value 
       of the current state-action pairs and the target values. The target values are calculated using the 
       Bellman equation, which incorporates rewards obtained and the discounted future values estimated by 
       the target Critic network for the next state-action pairs.
       
    2. Updates the Actor network by maximizing the expected return as estimated by the Critic network. 
       This is achieved by using the gradient of the Critic's output with respect to the actions, which are 
       generated by the Actor network. The Actor's parameters are updated in the direction that improves 
       its policy, leading to higher predicted values from the Critic.
    """
    max_step = min(len(rewards), STEPS_FORWARD)
    with tf.GradientTape() as tape:

        # Calculate the Critic model's value of the given state-action pairs.
        critic_value = critic.model([states, actions], training=True)
        assert critic_value.shape[0] == max_step

        # Estimate the best action for the given next states
        target_actions = actor.model(next_states.reshape(-1, 256), training=True)
        assert target_actions.shape[0] == max_step

        # Estimate the value of the optimal future rewards that can be obtained from a given state-action pair
        future_rewards = critic.model([next_states.reshape(-1, 256), target_actions], training=True).numpy().flatten()
        assert future_rewards.shape[0] == max_step

        # Estimate the value of the optimal future rewards that can be obtained from a given state-action pair,
        # using n-step TD target calculation
        n_step_rewards = np.array([
            DISCOUNT_FACTOR**i * rewards[i] * (1 - dones[i])
            for i in range(max_step - 1)
        ])
        assert n_step_rewards.shape[0] == max_step - 1

        # Calculate the target values using the n-step rewards and future rewards
        y = n_step_rewards + (DISCOUNT_FACTOR**max_step) * future_rewards[-1] * (1 - dones[-1])

        # Calculate the loss between the predicted value and the target value.
        # The goal is to minimize the difference between the estimated value of the current state
        # (as predicted by the Critic) and the expected optimal future value (also estimated by
        # the Critic but based on the next state and the action proposed by the Actor).
        critic_loss = tf.reduce_mean(tf.square(y - critic_value))
        print(f"Critic loss: {critic_loss}")

    # Calculate the gradients of the Critic model with respect to the loss.
    critic_grad = tape.gradient(critic_loss, critic.model.trainable_variables)

    # Update the Critic model's weights using the gradients. 
    critic_optimizer.apply_gradients(zip(critic_grad, critic.model.trainable_variables))

    with tf.GradientTape() as tape:
        # Calculate the loss for the Actor model. The goal is to maximize the expected return.
        actions = actor.model(states)

        # The Critic's value of the state-action pairs is used as the loss for the Actor.
        critic_value = critic.model([states, actions])

        # The Actor's loss is the negative of the Critic's value of the state-action pairs.
        actor_loss = -tf.math.reduce_mean(critic_value)
        print(f"Actor loss: {actor_loss}")

    # Calculate the gradients of the Actor model with respect to the loss.
    actor_grad = tape.gradient(actor_loss, actor.model.trainable_variables)

    # Update the Actor model's weights using the gradients.
    actor_optimizer.apply_gradients(zip(actor_grad, actor.model.trainable_variables))

In [None]:
for episode_number in range(TOTAL_EPISODES):
    print(f"Episode: {episode_number}")
    state = env.reset()  # Initial state.
    episode_reward = 0
    episode = []  # This will store sequences of experiences
    for step_number in range(MAX_STEPS_PER_EPISODE):
        print(f"Episode: {episode_number}, Step: {step_number}")

        # Select an action using the Actor model. Add noise for exploration.
        action = actor.model(state.reshape(1, -1))[0].numpy() + noise_process.noise()
        print(f"Next Action: {action}"[:20])
        
        # Take the action and observe the next state and reward.
        next_state, reward, done = env.step(action)
        print(f"Step reward: {reward}, Next State: {next_state}"[:50])

        # Append the transition to the episode
        episode.append((state, action, reward, next_state, done))

        # Check if the episode has accumulated enough steps
        if len(episode) >= STEPS_FORWARD:
            print(f"Adding episode sequence, size: {len(episode)}")
            buffer.add(episode[:STEPS_FORWARD])  # Add only the first STEPS_FORWARD transitions
            episode.pop(0)  # Remove the oldest transition to maintain a moving window

        state = next_state
        episode_reward += reward
        print(f"Total episode reward: {episode_reward}")

        # If the episode is done, break.
        if done:
            print("End of Episode!")
            print(f"Episode: {episode_number + 1}, Step: {step_number + 1}, Reward: {episode_reward}")
            break

    # Periodically train the model.
    if len(buffer.buffer) >= TRAIN_BUFFER_SIZE:
        batch = buffer.sample(TRAIN_BUFFER_SIZE)
        train_actor_and_critic(batch)

Episode: 0
Episode: 0, Step: 0
Next Action: [-1.819
Step reward: -11.913473802709847, Next State: [-0.
Total episode reward: -11.913473802709847
Episode: 0, Step: 1
Next Action: [-1.729
Step reward: -14.38648371206444, Next State: [-1. 
Total episode reward: -26.299957514774285
Episode: 0, Step: 2
Next Action: [-1.397
Step reward: -15.09034817951579, Next State: [-1. 
Total episode reward: -41.39030569429008
Episode: 0, Step: 3
Next Action: [-1.485
Step reward: -15.406369484842314, Next State: [-1.
Total episode reward: -56.79667517913239
Episode: 0, Step: 4
Next Action: [-1.246
Step reward: -15.618709195725415, Next State: [-1.
Total episode reward: -72.41538437485781
Episode: 0, Step: 5
Next Action: [-1.131
Step reward: -15.730933105177495, Next State: [-1.
Total episode reward: -88.14631748003531
Episode: 0, Step: 6
Next Action: [-0.960
Step reward: -15.76522431843969, Next State: [-1. 
Total episode reward: -103.911541798475
Episode: 0, Step: 7
Next Action: [-1.121
Step reward: -15