In [87]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

In [88]:
problem = "Pendulum-v0"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))

upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  3
Size of Action Space ->  1
Max Value of Action ->  2.0
Min Value of Action ->  -2.0


In [89]:
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [90]:
class Buffer:
    def __init__(self, buffer_capacity=5000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size

        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))


    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    def update(self, state_batch, action_batch, reward_batch, next_state_batch,):
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            y = reward_batch + gamma * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    def learn(self):

        record_range = min(self.buffer_counter, self.buffer_capacity)

        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)

In [91]:
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))

In [92]:
def get_actor():
    last_init=tf.random_uniform_initializer(minval=-0.003,maxval=0.003)
    i=layers.Input(shape=(num_states))
    x=layers.Dense(256,activation='relu')(i)
    x=layers.Dense(256,activation='relu')(x)
    x=layers.Dense(1,activation='tanh')(x)
    x=x*upper_bound
    model=tf.keras.Model(i,x)
    return model

In [93]:
actor=get_actor()

In [94]:
actor.summary()

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        [(None, 3)]               0         
_________________________________________________________________
dense_62 (Dense)             (None, 256)               1024      
_________________________________________________________________
dense_63 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_64 (Dense)             (None, 1)                 257       
_________________________________________________________________
tf_op_layer_mul_3 (TensorFlo [(None, 1)]               0         
Total params: 67,073
Trainable params: 67,073
Non-trainable params: 0
_________________________________________________________________


In [95]:
def get_critic():
    state_input=layers.Input(shape=(num_states))
    state_output=layers.Dense(16,activation='relu')(state_input)
    state_output=layers.Dense(32,activation='relu')(state_output)
    
    action_input=layers.Input(shape=(num_actions))
    action_output=layers.Dense(32,activation='relu')(action_input)

    concat=layers.Concatenate()([state_output,action_output])
    x=layers.Dense(256,activation='relu')(concat)
    
    x=layers.Dense(256,activation='relu')(x)
    x=layers.Dense(1,activation='linear')(x)
    model=tf.keras.Model([state_input,action_input],x)
    return model

In [96]:
critic=get_critic()

In [97]:
critic.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           [(None, 3)]          0                                            
__________________________________________________________________________________________________
dense_65 (Dense)                (None, 16)           64          input_23[0][0]                   
__________________________________________________________________________________________________
input_24 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
dense_66 (Dense)                (None, 32)           544         dense_65[0][0]                   
___________________________________________________________________________________________

In [139]:
def policy(state, noise_object):
    sampled_actions = tf.squeeze(actor_model(state))
    noise = noise_object()
    sampled_actions = sampled_actions.numpy() + noise

    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)

    return [np.squeeze(legal_action)]

In [140]:
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

actor_model = get_actor()
critic_model = get_critic()

target_actor = get_actor()
target_critic = get_critic()


target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())


critic_lr = 0.002
actor_lr = 0.001

critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 100

gamma = 0.99

tau = 0.005

buffer = Buffer(50000, 64)

In [142]:
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

# Takes about 4 min to train
for ep in range(total_episodes):

    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Uncomment this to see the Actor in action
        # But not in a python notebook.
        # env.render()

        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

        action = policy(tf_prev_state, ou_noise)
        # Recieve state and reward from environment.
        state, reward, done, info = env.step(action)

        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward

        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)

        # End this episode when `done` is True
        if done:
            break

        prev_state = state

    ep_reward_list.append(episodic_reward)

    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)

# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()

Episode * 0 * Avg Reward is ==> -1565.7035505426832
Episode * 1 * Avg Reward is ==> -1485.6692649394622
Episode * 2 * Avg Reward is ==> -1542.3306734398273
Episode * 3 * Avg Reward is ==> -1553.8672642345462
Episode * 4 * Avg Reward is ==> -1566.0977092588896
Episode * 5 * Avg Reward is ==> -1580.4891356156204
Episode * 6 * Avg Reward is ==> -1546.159994980426
Episode * 7 * Avg Reward is ==> -1507.142735205743


KeyboardInterrupt: 

array([0.0144628])

array(-0.16432893, dtype=float32)

In [185]:
class ActorNetwork(Model):
    def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=n_actions, name='actor'):
        super(ActorNetwork, self).__init__()
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        self.model_name = name

        self.fc1 = Dense(self.fc1_dims, activation='relu')
        self.fc2 = Dense(self.fc2_dims, activation='relu')
        self.mu = Dense(self.n_actions, activation='tanh')

    def call(self, state):
        prob = self.fc1(state)
   
        prob = self.fc2(prob)
    
 
        mu = self.mu(prob)
        return mu


In [186]:
actor = ActorNetwork(n_actions, name='actor')


In [241]:
states=[]

In [242]:
for i in range(0,100):
    observation=env.reset()
    states.append(observation)

In [232]:
observation=env.reset()

In [233]:
states.append(observation)

In [243]:
states_ = tf.convert_to_tensor(states, dtype=tf.float32)

In [244]:
states_.shape

TensorShape([100, 3])

In [245]:
actions = actor(states_)

In [246]:
actions

<tf.Tensor: shape=(100, 1), dtype=float32, numpy=
array([[-0.04399632],
       [-0.02760663],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [-0.0041277 ],
       [-0.06393441],
       [ 0.        ],
       [-0.00192662],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [-0.0054175 ],
       [ 0.        ],
       [ 0.        ],
       [-0.06271657],
       [-0.00980605],
       [-0.0130644 ],
       [-0.04276937],
       [ 0.        ],
       [-0.00200453],
       [ 0.        ],
       [ 0.        ],
       [-0.04787141],
       [ 0.        ],
       [-0.00209415],
       [ 0.        ],
       [-0.00884993],
       [-0.00219852],
       [-0.01731527],
       [-0.04653164],
       [-0.07462489],
       [ 0.        ],
       [-0.04444505],
       [-0.07893737],
       [ 0.        ],
       [-0.05314264],
       [-0.06636559],
       [-0.05527316],
       [-0.0155631 ],
       [-0.02323784],
       [ 0.        ],
    

In [247]:
actions = tf.clip_by_value(actions, min_action, max_action)

In [248]:
actions

<tf.Tensor: shape=(100, 1), dtype=float32, numpy=
array([[-0.04399632],
       [-0.02760663],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [-0.0041277 ],
       [-0.06393441],
       [ 0.        ],
       [-0.00192662],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [-0.0054175 ],
       [ 0.        ],
       [ 0.        ],
       [-0.06271657],
       [-0.00980605],
       [-0.0130644 ],
       [-0.04276937],
       [ 0.        ],
       [-0.00200453],
       [ 0.        ],
       [ 0.        ],
       [-0.04787141],
       [ 0.        ],
       [-0.00209415],
       [ 0.        ],
       [-0.00884993],
       [-0.00219852],
       [-0.01731527],
       [-0.04653164],
       [-0.07462489],
       [ 0.        ],
       [-0.04444505],
       [-0.07893737],
       [ 0.        ],
       [-0.05314264],
       [-0.06636559],
       [-0.05527316],
       [-0.0155631 ],
       [-0.02323784],
       [ 0.        ],
    

In [114]:
state=env.reset()

In [63]:
state=tf.convert_to_tensor(state, dtype=tf.float32)

In [115]:
state.shape

(3,)

In [34]:
actor.compile(optimizer=Adam(learning_rate=0.01))

In [116]:
class ActionValueNetwork:
    def __init__(self, step_size=0.01):
        self.step_size=step_size
    def create_model(self):
        i = Input(shape=state.shape)
        x = Dense(256, activation='relu')(i)
        x = Dense(128, activation='relu')(x)
        x = Dense(2, activation='tanh')(x)
        model = Model(i, x)
        return model

In [117]:
action=ActionValueNetwork()

In [118]:
model=action.create_model()

In [119]:
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
dense_46 (Dense)             (None, 256)               1024      
_________________________________________________________________
dense_47 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_48 (Dense)             (None, 2)                 258       
Total params: 34,178
Trainable params: 34,178
Non-trainable params: 0
_________________________________________________________________


In [71]:
model.predict(states_).shape

(2, 1)