8. Reinf learning. Exercise: Use policy gradients to solve OpenAI Gym's LunarLander-v2 environment. You will need to install the Box2D dependencies (%pip install -U gym[box2d]).

In [4]:
import numpy as np
import gym
env = gym.make("LunarLander-v2")
obs = env.reset()
obs

array([ 0.00422506,  1.4077823 ,  0.4279313 , -0.13946356, -0.00488894,
       -0.09693269,  0.        ,  0.        ], dtype=float32)

In [2]:
import matplotlib.pyplot as plt
# img = env.render(mode="rgb_array")
# env.close()
# plt.imshow(img)
print(env.action_space)
print(env.observation_space)

Discrete(4)
Box(-inf, inf, (8,), float32)


In [8]:
# random policy

total_rewards = []
for i_episode in range(200):
    rewards = 0
    obs = env.reset()
    for t in range(200):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        rewards += reward
        if done:
            break
    total_rewards.append(rewards)
print(np.mean(total_rewards), np.std(total_rewards), np.min(total_rewards), np.max(total_rewards))

-179.8682271598708 113.57826569586982 -536.6091716341286 11.068357820341546


In [28]:
# model

import tensorflow as tf
from tensorflow import keras

print(env.reset()[np.newaxis, :])
print(env.action_space.n)
model = keras.models.Sequential()
model.add(keras.layers.InputLayer(input_shape=(env.observation_space.shape[0],)))
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(env.action_space.n, activation='softmax'))

model(env.reset()[np.newaxis, :])

[[ 0.00584564  1.4179595   0.592075    0.3128464  -0.00676677 -0.13411388
   0.          0.        ]]
4


<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.47304472, 0.15830469, 0.09631486, 0.27233568]], dtype=float32)>

In [107]:
def play_one_step(env, obs, model, loss_fn):
    with tf.GradientTape() as tape:
        action_proba = model(obs[np.newaxis, :])[0]
        rand = tf.random.uniform([env.action_space.n])
        actions = action_proba - rand
        max_action_value = tf.reduce_max(actions)
        action = int(tf.argmax(actions))
        y_target = tf.cast(actions == max_action_value, tf.float32)
        # print(action_proba, y_target)
        loss = tf.reduce_mean(loss_fn(y_target, action_proba))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, _ = env.step(action)
    return obs, reward, done, grads
play_one_step(env, env.reset(), model, None)

2


TypeError: 'NoneType' object is not callable

In [121]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_episodes):
        current_rewards = []
        current_grads = []
        obs = env.reset()
        for step in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads

def discount_rewards(rewards, discount_factor=0.97):
    discounted = np.array(rewards)
    for n_reward in range(len(discounted) - 2, -1, -1):
        discounted[n_reward] += discount_factor * discounted[n_reward + 1]
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor=0.97):
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_all_discounted_rewards = np.concatenate(all_discounted_rewards)
    flat_mean = np.mean(flat_all_discounted_rewards)
    flat_std = np.std(flat_all_discounted_rewards)
    return [(discounted_rewards - flat_mean) / flat_std for discounted_rewards in all_discounted_rewards]

print(discount_rewards([10, 0, -50], 0.8))
print(discount_and_normalize_rewards([[10, 0, -50], [10, 20]], 0.8))

[-22 -40 -50]
[array([-0.28435071, -0.86597718, -1.18910299]), array([1.26665318, 1.0727777 ])]


In [113]:

for n in range(len(np.array([0, 1, 2])), 0, -1):
    print()

3
2
1
