In [1]:
import gymnasium as gym
import tensorflow as tf
import numpy as np
import random

2023-08-19 21:06:13.981105: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define the models
def create_policy_model(obs_dim=4, num_actions=2, hidden_units=(32, 32)):
    """Build the policy network pi(a | s) as a Keras functional model.

    The network is a plain MLP: ReLU hidden layers followed by a softmax
    head, so each output row is a probability distribution over the
    discrete actions.

    Args:
        obs_dim: Length of the flat observation vector. The default of 4
            matches the input shape this notebook uses for CartPole-v1.
        num_actions: Number of discrete actions, i.e. the width of the
            softmax head. The default of 2 matches CartPole-v1.
        hidden_units: Iterable with the width of every hidden Dense layer,
            in order. The default reproduces the original two 32-unit layers.

    Returns:
        A ``tf.keras.Model`` named "pi" that maps a ``(batch, obs_dim)``
        input to ``(batch, num_actions)`` action probabilities.
    """
    inputs = tf.keras.Input(shape=(obs_dim,))
    x = inputs
    for units in hidden_units:
        x = tf.keras.layers.Dense(units, activation="relu")(x)
    # Softmax head: outputs are probabilities, not raw logits.
    outputs = tf.keras.layers.Dense(num_actions, activation="softmax")(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs, name="pi")

def create_value_function_model(obs_dim=4, hidden_units=(32, 32)):
    """Build the state-value network V(s) as a Keras functional model.

    The network is a plain MLP: ReLU hidden layers followed by a single
    linear unit, so the value estimate is unbounded in both directions.

    Args:
        obs_dim: Length of the flat observation vector. The default of 4
            matches the input shape this notebook uses for CartPole-v1.
        hidden_units: Iterable with the width of every hidden Dense layer,
            in order. The default reproduces the original two 32-unit layers.

    Returns:
        A ``tf.keras.Model`` named "V" that maps a ``(batch, obs_dim)``
        input to a ``(batch, 1)`` tensor of value estimates. Callers that
        need a flat ``(batch,)`` vector must reshape/squeeze the output.
    """
    inputs = tf.keras.Input(shape=(obs_dim,))
    x = inputs
    for units in hidden_units:
        x = tf.keras.layers.Dense(units, activation="relu")(x)
    # Linear head: a value estimate must be able to take any real value.
    outputs = tf.keras.layers.Dense(1, activation="linear")(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs, name="V")

# One Adam instance per network: the critic (V) and the actor (pi) are
# updated from different losses, each through its own optimizer.
value_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
pi_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)


2023-08-19 21:06:15.297910: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [3]:
class Environment:
    """Wrapper around a gymnasium vector environment for rollout collection.

    The wrapper remembers the most recent observation batch between calls,
    so consecutive ``sample`` / ``collect_trajectories`` calls continue from
    where the previous one stopped instead of resetting.

    Instance attributes (all set in ``__init__``):
        num_envs: number of parallel sub-environments.
        envs: the vectorized gymnasium environment.
        current_state: latest observation batch from ``reset()`` / ``step()``.
    """

    def __init__(self, NUM_ENVS):
        """Create ``NUM_ENVS`` parallel CartPole-v1 environments and reset them."""
        self.num_envs = NUM_ENVS
        # Swap the id for 'LunarLander-v2' to train on that task instead.
        self.envs = gym.vector.make('CartPole-v1', num_envs=NUM_ENVS)
        self.current_state, _ = self.envs.reset()

    def sample(self, model, epsilon=0.2):
        """Advance every sub-environment by one step using ``model``.

        Actions are *sampled* from the per-row distribution returned by
        ``model`` (PPO's probability ratio assumes the data was generated by
        the policy itself, which a greedy argmax does not satisfy). On top
        of that, each action is replaced by a uniformly random one with
        probability ``epsilon``; pass ``epsilon=0`` for purely on-policy data.

        Args:
            model: Callable mapping the observation batch to an array-like
                of shape ``(num_envs, num_actions)`` holding non-negative
                action weights (e.g. softmax output). Rows are re-normalised
                here, so tiny float32 rounding errors are harmless.
            epsilon: Probability of overriding the sampled action with
                ``single_action_space.sample()``.

        Returns:
            Tuple ``(old_observation, action, reward, new_observation, done)``
            where ``action`` is a list with one entry per sub-environment
            and ``done`` is True where the episode ended for *any* reason
            (terminated or truncated).
        """
        old_observation = self.current_state

        probs = np.asarray(model(old_observation), dtype=np.float64)
        probs = probs / probs.sum(axis=1, keepdims=True)
        # Inverse-CDF sampling, vectorised over environments: the action is
        # the number of cumulative-probability edges that the draw has passed.
        cdf = np.cumsum(probs, axis=1)
        draws = np.random.random_sample((probs.shape[0], 1))
        sampled = np.minimum((draws >= cdf).sum(axis=1), probs.shape[1] - 1)

        action = [
            self.envs.single_action_space.sample() if random.random() < epsilon else int(a)
            for a in sampled
        ]
        new_observation, reward, terminated, truncated, _ = self.envs.step(action)

        # Vector environments restart a sub-environment after termination
        # *and* after truncation (time limit), so both must mark an episode
        # boundary; otherwise returns leak from one episode into the next.
        done = np.logical_or(terminated, truncated)

        self.current_state = new_observation  # continue from here next call
        return (old_observation, action, reward, new_observation, done)

    def collect_trajectories(self, model, length, epsilon = 0.2):
        """Roll all sub-environments forward and return the stacked data.

        Args:
            model: Policy callable, forwarded to ``sample``.
            length: Number of environment steps to collect. Values below 1
                are treated as 1 (a single step is always taken).
            epsilon: Random-action probability, forwarded to ``sample``.

        Returns:
            Dict of NumPy arrays indexed as ``[environment][timestep]``:
                "observations": ``(num_envs, steps, *obs_shape)``
                "actions":      ``(num_envs, steps)``
                "rewards":      ``(num_envs, steps)``
                "terminateds":  ``(num_envs, steps)`` episode-end flags
                                (terminated or truncated) for each step.
            The shapes are the same for every ``length``, including 1.
        """
        steps = max(1, length)
        obs_steps, act_steps, rew_steps, done_steps = [], [], [], []
        for _ in range(steps):
            old_obs, act, rew, _new_obs, done = self.sample(model, epsilon=epsilon)
            obs_steps.append(old_obs)
            act_steps.append(act)
            rew_steps.append(rew)
            done_steps.append(done)

        # Stack once at the end instead of re-copying the arrays every step.
        return {"observations": np.stack(obs_steps, axis=1),
                "actions": np.stack(act_steps, axis=1),
                "rewards": np.stack(rew_steps, axis=1),
                "terminateds": np.stack(done_steps, axis=1)}



In [4]:
# --- Outer training loop ---
TRAIN_EPOCHS = 2_000        # collect-then-update iterations

# --- Rollout collection ---
NUM_ENVS = 5                # parallel environments stepped together
STEPS_PER_TRAJECTORY = 50   # steps gathered per environment and iteration

# --- PPO update ---
CLIP_RATIO = 0.2            # ratio is clipped to [1 - CLIP_RATIO, 1 + CLIP_RATIO]
NUM_UPDATE_EPOCHS = 1       # passes over each collected batch
MINIBATCH_SIZE = 50         # samples per gradient step

In [5]:
# PPO-Clip training loop. The numbered comments follow the steps of the
# PPO-Clip pseudocode (collect -> rewards-to-go -> advantages -> policy
# update -> value regression).
env = Environment(NUM_ENVS)

# 1: Input: initial policy parameters θ_0, initial value function parameters Φ_0
pi = create_policy_model() # initial policy
V = create_value_function_model() # initial value function

# 2: for k = 0, 1, 2, ... do
for k in range(TRAIN_EPOCHS):
    print("epoch: ", k)

    # 3: Collect set of trajectories D_k = {τ_i} by running policy π_k = pi(θ_k) in the environment.
    print("Collection")
    D = env.collect_trajectories(pi, STEPS_PER_TRAJECTORY)
    #D = {"observations": [environment][timestep],
    #     "actions": [environment][timestep],
    #     "rewards": [environment][timestep],
    #     "terminateds": [environment][timestep]] }

    # 4: Compute rewards-to-go R̂_t (undiscounted, within the collected window)
    rewards = np.asarray(D["rewards"], dtype=np.float32)
    # 1 where the episode continues after the step, 0 where it ended there.
    continues = 1.0 - np.asarray(D["terminateds"], dtype=np.float32)
    rewards_to_go = np.zeros_like(rewards)
    # The reward of the final step was earned even if that step ended the
    # episode, so it must not be masked out.
    rewards_to_go[:, -1] = rewards[:, -1]
    for t in reversed(range(rewards.shape[1] - 1)):
        # Future rewards only count if the episode did not end at step t;
        # otherwise step t+1 already belongs to the next episode.
        rewards_to_go[:, t] = rewards[:, t] + rewards_to_go[:, t + 1] * continues[:, t]

    # flatten data: merge the [environment][timestep] axes into one batch axis
    returns = tf.constant(np.reshape(rewards_to_go, -1), dtype=tf.float32)
    for key, val in D.items():
        D[key] = np.reshape(val, (-1, *val.shape[2:]))

    # 5: Compute advantage estimates, Â_t (using any method of advantage estimation) based on the current value function V_Φ_k
    #TODO use better advantage formula
    values = tf.reshape(V(D["observations"]), [-1])
    advantages = returns - values
    # zero center advantages
    advantages = advantages - tf.reduce_mean(advantages)

    # Probabilities of the taken actions under the pre-update policy. They are
    # computed outside any GradientTape, so they act as constants in the ratio.
    # (pi ends in a softmax, so these are probabilities, not logits.)
    old_probs = tf.gather(pi(D["observations"]), D["actions"], batch_dims=1)

    print("Tapework")

    # We minibatch for increasing the efficiency of the gradient ascent (PPO-implementation details nr. 6)
    batch_size = D["observations"].shape[0]
    batch_inds = np.arange(batch_size)
    for update_epoch in range(NUM_UPDATE_EPOCHS):
        np.random.shuffle(batch_inds)
        for minibatch_start in range(0, batch_size, MINIBATCH_SIZE):
            # Slicing clamps at the end of the array, so the last (possibly
            # shorter) minibatch still contains the final sample.
            minibatch_inds = batch_inds[minibatch_start:minibatch_start + MINIBATCH_SIZE]

            mb_obs = D["observations"][minibatch_inds]
            mb_acts = D["actions"][minibatch_inds]
            mb_old_probs = tf.gather(old_probs, minibatch_inds)
            mb_advantages = tf.gather(advantages, minibatch_inds)
            mb_returns = tf.gather(returns, minibatch_inds)

            # 6: Update the policy by maximizing the PPO-Clip objective
            with tf.GradientTape() as pi_tape:
                new_probs = tf.gather(pi(mb_obs), mb_acts, batch_dims=1)
                # Floor the denominator so an underflowed probability cannot
                # produce inf/NaN ratios.
                ratios = new_probs / tf.maximum(mb_old_probs, 1e-8)
                surrogate_objective1 = ratios * mb_advantages
                surrogate_objective2 = tf.clip_by_value(ratios, 1 - CLIP_RATIO, 1 + CLIP_RATIO) * mb_advantages
                # Negated because the optimizer minimises while PPO maximises.
                pi_loss = -tf.reduce_mean(tf.minimum(surrogate_objective1, surrogate_objective2))
            pi_gradients = pi_tape.gradient(pi_loss, pi.trainable_variables) #get
            pi_optimizer.apply_gradients(zip(pi_gradients, pi.trainable_variables)) #and apply gradients

            # 7: Fit value function by regression on mean-squared error:
            with tf.GradientTape() as val_tape:
                # V returns (batch, 1); flatten it so the subtraction is
                # element-wise instead of broadcasting to (batch, batch).
                mb_values = tf.reshape(V(mb_obs), [-1])
                value_loss = tf.reduce_mean(tf.square(mb_values - mb_returns))
            value_gradients = val_tape.gradient(value_loss, V.trainable_variables) #get
            value_optimizer.apply_gradients(zip(value_gradients, V.trainable_variables)) #and apply gradients


    



epoch:  0
Collection
Tapework
epoch:  1
Collection
Tapework
epoch:  2
Collection
Tapework
epoch:  3
Collection
Tapework
epoch:  4
Collection
Tapework
epoch:  5
Collection
Tapework
epoch:  6
Collection
Tapework
epoch:  7
Collection
Tapework
epoch:  8
Collection
Tapework
epoch:  9
Collection
Tapework
epoch:  10
Collection
Tapework
epoch:  11
Collection
Tapework
epoch:  12
Collection
Tapework
epoch:  13
Collection
Tapework
epoch:  14
Collection
Tapework
epoch:  15
Collection
Tapework
epoch:  16
Collection
Tapework
epoch:  17
Collection
Tapework
epoch:  18
Collection
Tapework
epoch:  19
Collection
Tapework
epoch:  20
Collection
Tapework
epoch:  21
Collection
Tapework
epoch:  22
Collection
Tapework
epoch:  23
Collection
Tapework
epoch:  24
Collection
Tapework
epoch:  25
Collection
Tapework
epoch:  26
Collection
Tapework
epoch:  27
Collection
Tapework
epoch:  28
Collection
Tapework
epoch:  29
Collection
Tapework
epoch:  30
Collection
Tapework
epoch:  31
Collection
Tapework
epoch:  32
Collect

In [6]:
# Single (non-vectorized) environment with on-screen rendering, used only to
# watch the trained policy. Use 'LunarLander-v2' as the id for that task.
test_env = gym.make(
    'CartPole-v1',
    render_mode='human',
)
# `obs` is the starting observation consumed by the rollout cell below.
obs, inf = test_env.reset()

In [8]:
# Watch the trained policy act greedily for 1000 environment steps.
# `obs` carries over from the cell that created `test_env`.
for i in range(1000):
    # The policy expects a batch, so add a leading batch axis of size 1.
    action_probs = pi(tf.expand_dims(obs, 0))
    act = int(np.argmax(action_probs))  # greedy: most probable action
    obs, _, terminated, truncated, _ = test_env.step(act)
    # An episode can end by failure (terminated) or by hitting the time
    # limit (truncated); the environment must be reset in both cases
    # before it is stepped again.
    if terminated or truncated:
        obs, _ = test_env.reset()