In [1]:
import gym
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np



In [2]:
env_name = 'CartPole-v0'
render = True
lr = 0.001

In [32]:
def mlp(sizes, activation=tf.keras.activations.tanh, output_activation=tf.identity):
    # Build a feedforward neural network.
    layers = []
    layers.append(tf.keras.layers.Input(sizes[0]))
    for size in sizes[1:-1]:
        layers.append(tf.keras.layers.Dense(units=size, activation=activation))
    layers.append(tf.keras.layers.Dense(units=sizes[-1], activation=output_activation))
    return tf.keras.Sequential(layers)

In [31]:
# make function to compute action distribution
def get_policy(obs, logits_net):
    logits = logits_net(obs)
    return tfp.distributions.Categorical(logits=logits)

In [30]:
# make action selection function (outputs int actions, sampled from policy)
def get_action(obs, logits_net):
    return get_policy(obs, logits_net).sample().item()

In [29]:
# make loss function whose gradient, for the right data, is policy gradient
def compute_loss(obs, act, weights, logits_net):
    logp = get_policy(obs, logits_net).log_prob(act)
    return -(logp * weights).mean()

In [28]:
def train_epoch(env, batch_size, optimizer, logits_net):
    # make some empty lists for logging.
    batch_obs = []          # for observations
    batch_acts = []         # for actions
    batch_weights = []      # for R(tau) weighting in policy gradient
    batch_rets = []         # for measuring episode returns
    batch_lens = []         # for measuring episode lengths

    # reset episode-specific variables
    obs = env.reset() # first obs comes from starting distribution
    donde = False     # signal from environment that episode is over  
    ep_rews = []      # list for rewards accrued throughout ep

    # render first episode of each epoch
    finished_rendering_this_epoch = False

    # collect experience by acting in the environment with current policy
    while True:
        if (not finished_rendering_this_epoch):
            env.render()

        # Save the observation
        batch_obs.append(batch_obs.copy())

        # act in the environment
        act = get_action(tf.constant(obs[0], dtype=tf.dtypes.float32), logits_net)
        obs, rew, done, _ = env.step(act)

        # save action, reward
        batch_acts.append(act)
        ep_rews.append(rew)

        if done:
            # if episode is over, record info about episode
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)

            # the weight for each logprob(a|s) is R(tau)
            batch_weights += [ep_ret] * ep_len

            # Reset episode-specific variables
            obs, done, ep_rews = env.reset(), False, []

            # won't render again this epoch
            finished_rendering_this_epoch = True

            # end experience loop if we have enough of it
            if len(batch_obs) > batch_size:
                break

    # Reset the optimizer
    for var in optimizer.variables():
        var.assign(tf.zeros_like(var))

    batch_loss = compute_loss(
        batch_obs = tf.constant(batch_obs, dtype=tf.dtypes.float32),
        batch_acts = tf.constant(batch_acts, dtype=tf.dtypes.float32),
        batch_weights = tf.constant(batch_weights, dtype=tf.dtypes.float32),
        logits_net=logits_net
    )

    batch_loss.backward()
    optimizer.step()
    return batch_loss, batch_rets, batch_lens

In [34]:
def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=5000, render=False):
    env = gym.make(env_name)

    assert isinstance(env.observation_space, gym.spaces.Box), \
        "This example only works for envs with continuous state spaces."
    assert isinstance(env.action_space, gym.spaces.Discrete), \
        "This example only works for envs with discrete action spaces."

    obs_dim = env.observation_space.shape[0]
    print(f'Observation dimention: {obs_dim}')
    n_acts = env.action_space.n
    print(f'Number of possible actions: {n_acts}')

    # make core of policy network
    logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts])

    # make optimizer
    optimizer = tf.keras.optimizers.Adam(lr)

    # Train loop
    for i in range(epochs):
        batch_loss, batch_rets, batch_lens = train_epoch(env, batch_size, optimizer, logits_net)
        print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
            (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))

In [35]:
train(env_name=env_name, render=render, lr=lr)



Observation dimention: 4
Number of possible actions: 2


  gym.logger.warn(


ValueError: Exception encountered when calling layer 'sequential_10' (type Sequential).

Input 0 of layer "dense_29" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (4,)

Call arguments received by layer 'sequential_10' (type Sequential):
  • inputs=tf.Tensor(shape=(4,), dtype=float32)
  • training=None
  • mask=None