In [66]:
import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Input, LayerNormalization
from tensorflow.keras.optimizers import Adam
import math as m
import numpy as np
import matplotlib.pyplot as plt
import gym

"""
Reportedly vanilla policy gradient is useful in reactive scenarios but not so great where planning and 
searching to proactively find a good strategy! Will move on to TRPO and DDPG
"""

'\nReportedly vanilla policy gradient is useful in reactive scenarios but not so great where planning and \nsearching to proactively find a good strategy! Will move on to TRPO and DDPG\n'

In [67]:
"""
Initialize environment
"""

# Build the environment and cache the shapes / scalar bounds of its spaces.
env = gym.make("Pendulum-v0")
action_shape = env.action_space.shape
state_shape = env.observation_space.shape
# Upper bound must come from `.high` (it was previously read from `.low`,
# which made both bounds identical).
action_shape_high = env.action_space.high[0]
action_space_low = env.action_space.low[0]

In [125]:
class Continuous_PG:
    """Vanilla (REINFORCE-style) policy gradient with a Gaussian policy.

    The network output is multiplied by ``action_scale`` to obtain the mean of
    a Gaussian over the action. The standard deviation ``sig`` is not learned;
    the caller passes it to each method that needs it.
    """

    def __init__(self, model, env, gamma=0.99, action_scale=2.0):
        """Store the policy network and hyper-parameters.

        Args:
            model: callable network mapping a batch of states to one output
                per state.
            env: environment the agent is trained on (stored only).
            gamma: discount factor used for the reward-to-go.
            action_scale: factor applied to the network output to get the
                mean action. Defaults to 2.0, the value that used to be
                hard-coded.
        """
        self.model = model
        self.env = env
        self.gamma = gamma
        self.action_scale = action_scale
        self.pi = tf.constant(m.pi)

    def discount_and_normalize_rewards(self, episode_rewards):
        """Return the normalized discounted reward-to-go of one episode.

        Args:
            episode_rewards: sequence of per-step rewards.

        Returns:
            float32 array of the same length holding the discounted
            reward-to-go, shifted to zero mean and scaled by its standard
            deviation. An empty input yields an empty array.
        """
        discounted_rewards = np.zeros_like(episode_rewards, dtype=np.float32)
        if discounted_rewards.size == 0:
            # Nothing to normalize; avoids mean/std of an empty array (NaN).
            return discounted_rewards

        # Accumulate the discounted sum from the last step backwards.
        reward_to_go = 0.0
        for i in reversed(range(len(episode_rewards))):
            reward_to_go = reward_to_go * self.gamma + episode_rewards[i]
            discounted_rewards[i] = reward_to_go

        discounted_rewards -= np.mean(discounted_rewards)  # baseline
        # Variance reduction. The epsilon keeps the result finite when every
        # reward-to-go is identical (std == 0), e.g. a one-step episode.
        discounted_rewards /= np.std(discounted_rewards) + np.float32(1e-8)

        return discounted_rewards

    def get_dist_params(self, net_outs):
        """Scale the raw network output into the mean of the action Gaussian."""
        mu = net_outs * self.action_scale
        return mu

    def gaussian_policy(self, mu, actions, sig):
        """Gaussian density of ``actions`` under mean ``mu`` and std ``sig``."""
        x = (1 / (sig * tf.math.sqrt(2 * self.pi))) * tf.math.exp(-1 * (1 / 2) * ((actions - mu) / sig) ** 2)
        return x

    def gaussian_log_policy(self, mu, actions, sig):
        """Log of the Gaussian density, computed without forming the density.

        Taking ``log`` of ``gaussian_policy`` underflows to ``log(0) = -inf``
        for actions far from ``mu``; the closed form below stays finite.
        """
        z = (actions - mu) / sig
        return -0.5 * z ** 2 - tf.math.log(sig) - 0.5 * m.log(2.0 * m.pi)

    def loss(self, states, actions, discounted_rewards, sig):
        """Policy-gradient loss: sum over steps of ``-log pi(a|s) * return``.

        Args:
            states: batch of states visited in the episode.
            actions: actions taken, one per state.
            discounted_rewards: normalized reward-to-go, one per state.
            sig: standard deviation of the Gaussian policy.

        Returns:
            Scalar tensor to be minimized.
        """
        states_tf = tf.convert_to_tensor(states, dtype=tf.float32)
        returns_tf = tf.reshape(
            tf.convert_to_tensor(discounted_rewards, dtype=tf.float32), [-1]
        )
        # Mean of the action distribution, via the same scaling used when acting.
        mu = self.get_dist_params(self.model(states_tf))
        # Force float32 and the same shape as mu so the subtraction is
        # element-wise rather than an accidental broadcast.
        actions_tf = tf.reshape(
            tf.convert_to_tensor(np.asarray(actions, dtype=np.float32)), tf.shape(mu)
        )
        sig_tf = tf.cast(sig, tf.float32)
        log_probability = self.gaussian_log_policy(mu=mu, actions=actions_tf, sig=sig_tf)
        loss = tf.reduce_sum(-tf.reshape(log_probability, [-1]) * returns_tf)
        return loss

    def train_step(self, states, actions, discounted_rewards, sig, optimizer):
        """Apply one gradient update on a single episode and return its loss."""
        with tf.GradientTape() as tape:
            loss = self.loss(states, actions, discounted_rewards, sig)
        grads = tape.gradient(loss, self.model.trainable_variables)
        optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss



In [130]:
"""
Initiate model
"""

# Policy network: state -> two 64-unit ReLU layers -> one tanh output in [-1, 1].
# NOTE: a LayerNormalization layer used to be created here but its output was
# never fed into the network, so it had no effect on the model; it is omitted.
input_layer = Input(shape=state_shape)
layer_1 = Dense(64, activation='relu')(input_layer)
layer_2 = Dense(64, activation='relu')(layer_1)
output_layer = Dense(1, activation='tanh')(layer_2)

model = Model(inputs=input_layer, outputs=output_layer)


In [131]:
# Wrap the policy network and the environment in the policy-gradient agent
# (gamma is left at the constructor default).
CPG = Continuous_PG(model, env)

In [132]:
def single_episode(model, env, sig, agent=None, action_clip=1.999):
    """Roll out one episode with a Gaussian policy around the network output.

    Args:
        model: callable policy network; called on a ``(1, n_obs)`` state batch.
        env: environment with the classic gym API (``reset()`` returns the
            state, ``step()`` returns ``(state, reward, done, info)``).
        sig: standard deviation of the exploration noise.
        agent: object providing ``get_dist_params`` and
            ``discount_and_normalize_rewards``. Defaults to the notebook's
            global ``CPG`` so existing three-argument calls keep working.
        action_clip: sampled actions are clipped to ``[-action_clip, action_clip]``.

    Returns:
        Tuple ``(episode_states, episode_actions, rewards_to_go, Return)``:
        the visited states, the actions taken, the processed reward-to-go
        from ``agent``, and the undiscounted sum of rewards.
    """
    if agent is None:
        agent = CPG

    episode_states, episode_actions, episode_rewards = [], [], []
    n_obs = env.observation_space.shape[0]
    done = False
    Return = 0
    state = env.reset()
    while not done:
        episode_states.append(list(state))
        # Use the model that was passed in, not whatever the global agent holds.
        net_out = model(state.reshape((1, n_obs)))
        mu = agent.get_dist_params(net_outs=net_out)
        sample = np.random.normal(loc=np.asarray(mu), scale=sig)
        # Drop the batch axis once and reuse the same array for both the
        # stored action and the environment step.
        action = np.clip(sample, -action_clip, action_clip)[0]
        episode_actions.append(action)
        state, reward, done, _ = env.step(action)
        Return += reward
        episode_rewards.append(reward)
    rewards_to_go = agent.discount_and_normalize_rewards(episode_rewards)
    return episode_states, episode_actions, rewards_to_go, Return

In [None]:
num_iterations = 10000
return_array = np.zeros(num_iterations)
returns_benchmark=-5000
sig=0.3
decay=0.9995
min_sig=0.05

# Build the optimizer once: creating a new Adam every iteration throws away
# its running moment estimates before they can be used.
optimizer = Adam(learning_rate=0.001)

# `sigma` is the current exploration std; `sig` keeps the initial value.
sigma = sig
# Weights of the best-scoring policy so far (a copy, not a reference).
best_weights = CPG.model.get_weights()

for i in range(num_iterations):
    # Decay cumulatively, and use the same std for sampling and for the loss.
    sigma = max(sigma*decay, min_sig)
    episode_states, episode_actions, rewards_to_go, Return= single_episode(CPG.model, env, sigma)
    return_array[i]=Return
    # Snapshot before the update: `Return` was produced by the current weights.
    if Return > returns_benchmark:
        returns_benchmark = Return
        best_weights = CPG.model.get_weights()
    CPG.train_step(episode_states, episode_actions, rewards_to_go, sigma, optimizer=optimizer)
    if i%25==0:
        print("Iteration:{}, Return:{}".format(i, Return))
#         CPG.model.save("CPGAgent-v0-episode{}".format(i))

# Rebuild the best policy in a separate model so the trained one is untouched.
best_model = tf.keras.models.clone_model(CPG.model)
best_model.set_weights(best_weights)
best_model.save("CPGAgent-v0-best")

Iteration:0, Return:-1520.4954748546463
Iteration:25, Return:-1571.6567987125802
Iteration:50, Return:-1140.9215176142407
Iteration:75, Return:-1637.7448655121968
Iteration:100, Return:-1036.1686256357902
Iteration:125, Return:-1543.2439757826637
Iteration:150, Return:-1636.0142931813575
Iteration:175, Return:-1626.8055822362767
Iteration:200, Return:-946.3694036327017
Iteration:225, Return:-1166.0383442437014
Iteration:250, Return:-1417.8113866984213
Iteration:275, Return:-1559.0014484046433
Iteration:300, Return:-1521.6987207673537
Iteration:325, Return:-907.8618395751629
Iteration:350, Return:-1526.784841263133
Iteration:375, Return:-1511.372840144932
Iteration:400, Return:-1523.9947338487007
Iteration:425, Return:-1530.9345001294753
Iteration:450, Return:-1538.5248437997325
Iteration:475, Return:-1444.1549811481511
Iteration:500, Return:-1637.767766178241
Iteration:525, Return:-1524.4148786575456
Iteration:550, Return:-949.9786150061947
Iteration:575, Return:-1641.973563421586
Iter

In [112]:
# Inspect the observation shape (the same value stored in `state_shape` and
# used as the network's input shape).
env.observation_space.shape

(3,)