In [None]:
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt
from datetime import datetime
from sys import exit


# **Model Generator Function**

In the original DDPG paper (Lillicrap et al., 2015, co-authored by David Silver), the action enters the critic network at a middle layer rather than at the input. This is done only to improve performance/stability.

However, for us to learn from scratch, the action and state input will enter the critic network from the beginning. We write a function that generates both the actor and critic.


In [None]:
# simple NN Generator

def ANN2(input_shape,layer_sizes, hidden_activation='relu', output_activation=None):
    """Build a fully connected ``tf.keras.Sequential`` network.

    Args:
        input_shape: Shape forwarded to ``tf.keras.layers.Input``.
        layer_sizes: Sequence of unit counts. Every entry except the last
            becomes a hidden ``Dense`` layer; the last entry is the width of
            the output layer.
        hidden_activation: Activation used by every hidden layer.
        output_activation: Activation used by the output layer
            (``None`` is passed through to ``Dense`` unchanged).

    Returns:
        The assembled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=input_shape))
    for units in layer_sizes[:-1]:
        # Use the caller-supplied activation; it was previously hard-coded to
        # 'relu', which made the `hidden_activation` argument a no-op.
        model.add(tf.keras.layers.Dense(units=units, activation=hidden_activation))
    model.add(tf.keras.layers.Dense(units=layer_sizes[-1], activation=output_activation))
    return model


The function `ANN2` generates both the critic and actor networks from the `input_shape` and `layer_sizes` parameters. The hidden layers of both networks use ‘relu’ activations. The output layer of the actor uses ‘tanh’ (to map the continuous action into the range -1 to 1), and the output layer of the critic uses no activation (`None`), as it outputs the Q-value. The output of the actor network can be scaled by a factor to make the action correspond to the environment's action range.

# **Model Initialization**

We initialize four networks: the main actor and critic, and the target actor and critic.

In [None]:
# Network parameters
# Keras layers expect shape *tuples*. `(n)` is just the int `n`, so the
# trailing comma is required to build a 1-tuple.
X_shape = (num_states,)
QA_shape = (num_states + num_actions,)
hidden_sizes_1=(1000,500,200)
hidden_sizes_2=(400,200)

# Main network outputs
# mu: actor, input width num_states, num_actions outputs squashed by tanh.
mu = ANN2(X_shape,list(hidden_sizes_1)+[num_actions], hidden_activation='relu', output_activation='tanh')
# q_mu: critic, input width num_states + num_actions, single output with no activation.
q_mu = ANN2(QA_shape, list(hidden_sizes_2)+[1], hidden_activation='relu')

# Target networks
# Built with the same architecture as the main networks, but as separate models.
# NOTE(review): nothing here copies the main networks' weights into the
# targets -- confirm they are synchronised elsewhere before training starts.
mu_target = ANN2(X_shape,list(hidden_sizes_1)+[num_actions], hidden_activation='relu', output_activation='tanh')
q_mu_target = ANN2(QA_shape, list(hidden_sizes_2)+[1], hidden_activation='relu')

# **Replay Buffer**

As with other deep reinforcement learning techniques, DDPG relies on a replay buffer for stability. The replay buffer needs to maintain a balance of old and new experiences.

***Definition:***
The replay buffer contains a collection of experience tuples (S, A, R, S′). The tuples are gradually added to the buffer as we are interacting with the Environment. The simplest implementation is a buffer of fixed size, with new data added to the end of the buffer so that it pushes the oldest experience out of it.

***Purpose:***
A buffer of past experiences is used to stabilize training by decorrelating the training examples in each batch used to update the neural network. This buffer records past states, the actions taken at those states, the reward received and the next state that was observed.

In [None]:
class BasicBuffer:
    """Fixed-capacity replay buffer backed by preallocated float32 NumPy arrays.

    Transitions are written at a rolling position; once the buffer is full,
    each new transition overwrites the oldest stored one.
    """

    def __init__(self, size, obs_dim, act_dim):
        """Allocate storage for up to ``size`` transitions.

        Args:
            size: Maximum number of transitions kept.
            obs_dim: Length of one observation vector.
            act_dim: Length of one action vector.
        """
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros([size], dtype=np.float32)
        self.done_buf = np.zeros([size], dtype=np.float32)
        # ptr: next slot to write; size: number of valid entries; max_size: capacity.
        self.ptr, self.size, self.max_size = 0, 0, size

    def push(self, obs, act, rew, next_obs, done):
        """Store one transition, overwriting the oldest entry when full.

        Args:
            obs: Observation before the action.
            act: Action taken.
            rew: Reward received; a scalar or any array-like holding exactly
                one element.
            next_obs: Observation after the action.
            done: Episode-termination flag (stored as float32).

        Raises:
            ValueError: If ``rew`` does not hold exactly one element.
        """
        slot = self.ptr
        self.obs1_buf[slot] = obs
        self.obs2_buf[slot] = next_obs
        self.acts_buf[slot] = act
        # Extract a true scalar: assigning a size-1 array with ndim > 0 to a
        # scalar slot relies on a conversion NumPy deprecated in 1.25.
        self.rews_buf[slot] = np.asarray(rew).item()
        self.done_buf[slot] = done
        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size=32):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(s, a, r, s2, d)``. ``r`` is reshaped to a column of shape
            ``(batch_size, 1)``, whereas ``d`` keeps shape ``(batch_size,)``;
            callers combining the two must account for the different ranks.

        Raises:
            ValueError: If the buffer holds no transitions yet.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        idxs = np.random.randint(0, self.size, size=batch_size)
        return (self.obs1_buf[idxs],
                self.acts_buf[idxs],
                self.rews_buf[idxs].reshape(-1, 1),
                self.obs2_buf[idxs],
                self.done_buf[idxs])



In [None]:
# Buffer Import
# Makes the external `buffer` module importable and pulls two buffer classes
# from it.

import sys
# Prepend the directory so `import buffer` is resolved there first.
# NOTE(review): hard-coded absolute path (presumably a mounted Google Drive
# folder in Colab) -- the cell fails unless that directory exists and contains
# the `buffer` module; consider making the location configurable.
sys.path.insert(0,'/content/drive/MyDrive/Colab_Data/DDPG')

import buffer
from buffer import BasicBuffer_a,BasicBuffer_b