In [1]:
import gym
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import tensorflow as tf
from tensorflow import keras
import pyvirtualdisplay
mpl.rc('animation', html='jshtml')

print(tf.__version__)

2.5.0


In [2]:
# Instantiate the CartPole-v1 environment through gym's factory function.
env = gym.make('CartPole-v1')

In [3]:
# Seed the environment, then reset it to obtain the first observation.
env.seed(42)
obs = env.reset()

In [4]:
obs

array([-0.01258566, -0.00156614,  0.04207708, -0.00180545])

In [5]:
# Start a virtual display when pyvirtualdisplay is importable; otherwise carry on without one.
# NOTE(review): pyvirtualdisplay is also imported unconditionally in the first cell, so a
# missing package would already fail there before this guard is reached.
# NOTE(review): the name `display` may shadow IPython's `display` helper — confirm intent.
try:
    import pyvirtualdisplay
    display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()
except ImportError:
    pass

In [6]:
#env.render()

In [7]:
#img = env.render(mode="rgb_array")
#img.shape

In [8]:
'''
def plot_environment(env, figsize=(5,4)):
    plt.figure(figsize=figsize)
    img = env.render(mode="rgb_array")
    plt.imshow(img)
    plt.axis("off")
    return img
'''

'\ndef plot_environment(env, figsize=(5,4)):\n    plt.figure(figsize=figsize)\n    img = env.render(mode="rgb_array")\n    plt.imshow(img)\n    plt.axis("off")\n    return img\n'

In [9]:
'''
plot_environment(env)
plt.show()
'''

'\nplot_environment(env)\nplt.show()\n'

In [10]:
# Re-seed the environment before running the hard-coded baseline policy below.
env.seed(42)

def basic_policy(obs):
    """Hard-coded baseline policy driven only by the third observation entry.

    Args:
        obs: Indexable observation; only ``obs[2]`` (named ``angle`` by the
            original code) is read.

    Returns:
        0 when ``obs[2]`` is negative, otherwise 1.
    """
    pole_angle = obs[2]
    if pole_angle < 0:
        return 0
    return 1

# Evaluate the hard-coded policy: 50 episodes, each capped at 50 steps.
# `totals` collects the summed reward of every episode.
totals = []
for episode in range(50):
    episode_rewards = 0
    obs = env.reset()
    for step in range(50):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        # Stop early once the environment reports the episode is over.
        if done:
            break
    totals.append(episode_rewards)

In [11]:
# Roll out one episode (at most 200 steps) with the hard-coded policy.
# Frame capture is disabled (commented out), so this cell only steps the environment.
# Note that `obs`, `reward`, `done` and `info` remain defined at module scope afterwards.
env.seed(4)

#frames = []

obs = env.reset()
for step in range(200):
    #img = env.render(mode="rgb_array")
    #frames.append(img)
    action = basic_policy(obs)

    obs, reward, done, info = env.step(action)
    if done:
        break

In [12]:
'''
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.close()
    return anim
'''

"\ndef update_scene(num, frames, patch):\n    patch.set_data(frames[num])\n    return patch,\n\ndef plot_animation(frames, repeat=False, interval=40):\n    fig = plt.figure()\n    patch = plt.imshow(frames[0])\n    plt.axis('off')\n    anim = animation.FuncAnimation(\n        fig, update_scene, fargs=(frames, patch),\n        frames=len(frames), repeat=repeat, interval=interval)\n    plt.close()\n    return anim\n"

In [13]:
#plot_animation(frames)

In [14]:
'''Now we are going to create the neural network that takes an observation as input and outputs the probability, p, of going left (0); the probability of going right (1) is then 1 - p.'''

'Now we are going to create the neural network that takes an observation as input and outputs the probability, p, of going left (0); the probability of going right (1) is then 1 - p.'

In [15]:
# Clear Keras' global state and seed TensorFlow and NumPy before building the model.
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

n_inputs = 4  # == env.observation_space.shape[0]

# Small policy network: one hidden ELU layer of 5 units and a single sigmoid output.
# The output is used downstream as the probability of choosing action 0 (left).
model = keras.models.Sequential([
    keras.layers.Dense(5, activation='elu', input_shape=[n_inputs]),
    keras.layers.Dense(1, activation='sigmoid'),
])


In [16]:
def play_one_step(env, obs, model, loss_fn):
    """Sample one action from the policy, apply it, and record its gradients.

    The model output is treated as the probability of action 0 (left). An
    action is drawn at random from that distribution, and the loss is computed
    as if the drawn action were the correct one; the resulting gradients are
    returned so the caller can weight them later.

    Args:
        env: Environment whose ``step(action)`` returns the 4-tuple
            ``(obs, reward, done, info)``.
        obs: Current observation (a NumPy array; a batch axis is added here).
        model: Callable policy network exposing ``trainable_variables``.
        loss_fn: Loss called as ``loss_fn(y_target, left_proba)``.

    Returns:
        Tuple ``(obs, reward, done, grads)``: the next observation, the reward
        returned by this very step, the episode-finished flag, and the
        gradients of the loss with respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        left_proba = model(obs[np.newaxis])
        # True (-> action 1, right) when the uniform draw exceeds P(left).
        action = tf.random.uniform([1, 1]) > left_proba
        # Target probability of "left": 1 if action 0 was drawn, 0 otherwise.
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    grads = tape.gradient(loss, model.trainable_variables)
    # Bind the step's reward to the same name that is returned. The previous
    # code unpacked into `rewards` but returned `reward`, which silently
    # resolved to a stale module-level variable (or raised NameError).
    obs, reward, done, info = env.step(int(action[0, 0].numpy()))
    return obs, reward, done, grads

In [17]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Play several episodes and collect per-step rewards and gradients.

    Args:
        env: Environment providing ``reset()``; stepping is delegated to
            ``play_one_step``.
        n_episodes: Number of episodes to play.
        n_max_steps: Upper bound on the number of steps per episode.
        model: Policy network forwarded to ``play_one_step``.
        loss_fn: Loss function forwarded to ``play_one_step``.

    Returns:
        Tuple ``(all_rewards, all_grads)``. Each is a list with one entry per
        episode, and each entry is a list with one item per step played.
    """
    reward_history, grad_history = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()
        steps_taken = 0
        while steps_taken < n_max_steps:
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(grads)
            steps_taken += 1
            # The episode ends as soon as the environment says so.
            if done:
                break
        reward_history.append(episode_rewards)
        grad_history.append(episode_grads)
    return reward_history, grad_history

In [18]:
def discount_rewards(rewards, discount_factor):
    """Compute the discounted return for every step of one episode.

    Entry ``t`` of the result is
    ``rewards[t] + discount_factor * rewards[t+1] + discount_factor**2 * rewards[t+2] + ...``.

    Args:
        rewards: Sequence of per-step rewards for a single episode.
        discount_factor: Multiplier applied to each later step's return.

    Returns:
        A new float NumPy array with the same length as ``rewards``. The input
        sequence is not modified.
    """
    # Force a float dtype: with integer rewards the in-place "+=" below would
    # otherwise write into an integer array and truncate fractional returns.
    discounted = np.array(rewards, dtype=float)
    # Walk backwards so each step can reuse the already-discounted next step.
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_factor
    return discounted

In [19]:
def discount_and_normalize_rewards(all_rewards, discount_factor):
    """Discount every episode's rewards and standardise them jointly.

    Each episode is discounted with ``discount_rewards``; the mean and
    standard deviation are then computed over all steps of all episodes
    pooled together and used to normalise every episode.

    Args:
        all_rewards: List of episodes, each a sequence of per-step rewards.
        discount_factor: Discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one NumPy array per episode holding the normalised
        discounted returns. If every discounted return is identical (standard
        deviation of zero) the result is all zeros rather than NaN.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_factor)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # All returns are equal, so the centred values are already zero.
        # Dividing by 1 keeps them zero instead of producing 0/0 = NaN,
        # which would otherwise propagate into the gradients.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [20]:
# Sanity check: the first entry should be 10 + 0.8 * 0 + 0.8**2 * (-50) = -22.
discount_rewards([10, 0, -50], discount_factor=0.8)

array([-22, -40, -50])

In [21]:
# Sanity check on two episodes of different lengths; both are normalised with
# the mean and standard deviation pooled over all five steps.
discount_and_normalize_rewards([[10, 0, -50], [10, 20]],
                              discount_factor=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

In [22]:
# Training hyperparameters for the policy-gradient loop.
n_iterations = 150           # number of gradient updates
n_episodes_per_update = 10   # episodes played before each update
n_max_steps = 200            # per-episode step cap
discount_factor = 0.95       # weight applied to each later step's return

# Optimizer and loss passed to the rollout and update code.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.binary_crossentropy

In [23]:
# Policy-gradient training loop: play a batch of episodes, weight every step's
# gradients by its normalised discounted return, average them, and apply one update.
for iteration in range(n_iterations):
    # Collect per-step rewards and gradients for this batch of episodes.
    all_rewards, all_grads = play_multiple_episodes(
    env, n_episodes_per_update, n_max_steps, model, loss_fn)
    
    # Progress report: mean undiscounted reward per episode, rewritten in place via "\r".
    total_rewards = sum(map(sum, all_rewards))                     # Not shown in the book
    print("\rIteration: {}, mean rewards: {:.1f}".format(          # Not shown
        iteration, total_rewards / n_episodes_per_update), end="") # Not shown
    
    # One weight per step: the discounted return, normalised across the whole batch.
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                      discount_factor)
    all_mean_grads = []
    # For each trainable variable, average the weighted gradients over every
    # step of every episode (axis=0 reduces over the list of steps).
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
        [final_reward * all_grads[episode_index][step][var_index]
         for episode_index, final_rewards in enumerate(all_final_rewards)
         for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    # Apply the averaged gradients, paired with their variables, in one optimizer step.
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

Iteration: 149, mean rewards: 190.8