8. Reinforcement learning. Exercise: Use policy gradients to solve OpenAI Gym's LunarLander-v2 environment. You will need to install the Box2D dependencies (%pip install -U gym[box2d]). Source: Hands-On Machine Learning book, exercises, page 623.

In [4]:
import numpy as np
import gym
# Create the LunarLander-v2 environment; `env` is reused by the later cells.
env = gym.make("LunarLander-v2")
# Start a new episode. The value returned by reset() is displayed below
# (in the recorded output it is a float32 array of 8 values).
obs = env.reset()
obs

array([ 0.00422506,  1.4077823 ,  0.4279313 , -0.13946356, -0.00488894,
       -0.09693269,  0.        ,  0.        ], dtype=float32)

In [2]:
import matplotlib.pyplot as plt
# Optional visual check of the environment, left disabled:
# img = env.render(mode="rgb_array")
# env.close()
# plt.imshow(img)
# Show the action and observation spaces (recorded output: Discrete(4) and a
# float32 Box of shape (8,)).
print(env.action_space)
print(env.observation_space)

Discrete(4)
Box(-inf, inf, (8,), float32)


In [151]:
# Baseline: pick actions uniformly at random for 200 episodes, each capped at
# 200 steps, then print mean / std / min / max of the per-episode reward sums.
# An earlier run of this cell printed:
# -179.8682271598708 113.57826569586982 -536.6091716341286 11.068357820341546

total_rewards = []
for i_episode in range(200):
    obs, rewards, done, t = env.reset(), 0, False, 0
    # Stop at the step cap or as soon as the environment reports `done`.
    while t < 200 and not done:
        obs, reward, done, info = env.step(env.action_space.sample())
        rewards += reward
        t += 1
    total_rewards.append(rewards)
print(np.mean(total_rewards), np.std(total_rewards), np.min(total_rewards), np.max(total_rewards))

-160.12920791414706 99.56875747347738 -478.625099914632 113.81167088199918


In [152]:
# model

import tensorflow as tf
from tensorflow import keras

# Show one batched observation and the number of discrete actions.
print(env.reset()[np.newaxis, :])
print(env.action_space.n)

# Policy network: observation vector -> Dense(16) -> Dense(8) -> softmax over actions.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(env.observation_space.shape[0],)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(env.action_space.n, activation='softmax'),
])

# Call the model once on a fresh observation, then list its trainable variables.
model(env.reset()[np.newaxis, :])
print(model.trainable_variables)

[[-0.00437126  1.4179332  -0.4427755   0.3116899   0.00507198  0.1002952
   0.          0.        ]]
4
[<tf.Variable 'dense_36/kernel:0' shape=(8, 16) dtype=float32, numpy=
array([[ 0.301795  , -0.38039255, -0.24801195,  0.04388976, -0.34565997,
         0.1735723 , -0.47314346,  0.41481686,  0.05990446,  0.10238111,
        -0.48438263,  0.2602874 , -0.09898937, -0.31536007, -0.1656872 ,
         0.27430248],
       [ 0.41590798,  0.32985508, -0.39172673,  0.02755785, -0.49637842,
         0.4681188 ,  0.08902979,  0.34461904, -0.21435642, -0.07684267,
        -0.4240328 , -0.07903743,  0.430732  ,  0.45916307,  0.38133073,
         0.41708183],
       [-0.18825614, -0.12405837, -0.45531416,  0.17599726,  0.06348991,
        -0.03635478, -0.43160188, -0.15807903,  0.12305021,  0.3191123 ,
        -0.40355122,  0.25191665, -0.07294977,  0.37522745, -0.16398835,
        -0.28805494],
       [-0.15455902, -0.3789966 , -0.35286403, -0.22206593, -0.36340916,
        -0.09669185, -0.3449095

In [153]:
# training
def play_one_step(env, obs, model, loss_fn):
    """Play a single environment step and compute the gradients for it.

    The model output for `obs` is treated as a probability distribution over
    the discrete actions. One action is sampled from that distribution, and
    the gradients of the loss between the one-hot encoding of the sampled
    action and the predicted probabilities are computed. The gradients are
    returned, not applied; the caller decides how to weight them.

    Args:
        env: environment exposing `action_space.n` and `step(action)`
            returning `(obs, reward, done, info)`.
        obs: current observation; a leading batch axis is added before it is
            passed to `model`.
        model: callable mapping a batch of observations to per-action
            probabilities, exposing `trainable_variables`.
        loss_fn: callable `loss_fn(y_true, y_pred)` taking a one-hot target,
            e.g. `keras.losses.categorical_crossentropy`.

    Returns:
        Tuple `(obs, reward, done, grads)`: the next observation, the reward
        and done flag returned by `env.step`, and one gradient per entry of
        `model.trainable_variables`.
    """
    with tf.GradientTape() as tape:
        action_proba = model(obs[np.newaxis, :])  # shape (1, n_actions)
        # The action must be drawn from the policy's own distribution so that
        # action i is chosen with probability action_proba[i]. Subtracting
        # independent uniform noise and taking the argmax does not have that
        # property. tf.random.categorical expects log-probabilities; epsilon
        # keeps log() finite if a probability underflows to 0.
        logits = tf.math.log(action_proba + tf.keras.backend.epsilon())
        action = int(tf.random.categorical(logits, num_samples=1)[0, 0])
        # One-hot target for the sampled action.
        y_target = tf.one_hot(action, depth=env.action_space.n)
        loss = tf.reduce_mean(loss_fn(y_target, action_proba[0]))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, _ = env.step(action)
    return obs, reward, done, grads
# Smoke test: play one step from a fresh episode and display the returned tuple.
play_one_step(env, env.reset(), model, keras.losses.categorical_crossentropy)

(array([-0.01519747,  1.4324677 , -0.77315265,  0.46594155,  0.0188807 ,
         0.20210364,  0.        ,  0.        ], dtype=float32),
 -1.2968667428616516,
 False,
 [<tf.Tensor: shape=(8, 16), dtype=float32, numpy=
  array([[ 6.58586796e-04,  3.87097098e-04,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00, -1.05614099e-03,
          -1.10561890e-03,  2.35164911e-03,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          -5.45751362e-04,  5.77779138e-04, -1.60321000e-03,
           1.66092440e-03],
         [-1.23741761e-01, -7.27316067e-02,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  1.98438153e-01,
           2.07734540e-01, -4.41850990e-01,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           1.02541134e-01, -1.08558834e-01,  3.01226884e-01,
          -3.12070817e-01],
         [ 6.67092800e-02,  3.92096676e-02,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00, -1.06978171

In [154]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Play several episodes, recording the reward and gradients of every step.

    Each episode starts with `env.reset()` and ends when `play_one_step`
    reports `done` or after `n_max_steps` steps, whichever comes first.

    Args:
        env: environment passed through to `play_one_step`.
        n_episodes: number of episodes to play.
        n_max_steps: maximum number of steps per episode.
        model: policy model passed through to `play_one_step`.
        loss_fn: loss function passed through to `play_one_step`.

    Returns:
        Tuple `(all_rewards, all_grads)`: two lists with one entry per
        episode, each entry being the list of per-step rewards (respectively
        per-step gradients) of that episode.
    """
    all_rewards, all_grads = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs, done, steps_taken = env.reset(), False, 0
        while steps_taken < n_max_steps and not done:
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(grads)
            steps_taken += 1
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)
    return all_rewards, all_grads

def discount_rewards(rewards, discount_factor=0.97):
    """Return, for every step, the discounted sum of that and all later rewards.

    Element `i` of the result is
    `rewards[i] + discount_factor * rewards[i + 1] + discount_factor**2 * rewards[i + 2] + ...`.

    Args:
        rewards: sequence of per-step rewards of one episode.
        discount_factor: multiplier applied once per step of delay.

    Returns:
        A new float NumPy array with the same length as `rewards`; the input
        is not modified.
    """
    # Force a float array: with all-integer rewards NumPy would otherwise pick
    # an integer dtype, and the in-place updates below would silently truncate
    # every fractional discounted value.
    discounted = np.array(rewards, dtype=float)
    # Walk backwards so each step can reuse the already-discounted next step.
    for n_reward in range(len(discounted) - 2, -1, -1):
        discounted[n_reward] += discount_factor * discounted[n_reward + 1]
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor=0.97):
    """Discount the rewards of every episode, then standardize them jointly.

    Each episode is discounted with `discount_rewards`. The mean and standard
    deviation are computed over all discounted values of all episodes
    together, and every value is shifted and scaled by them.

    Args:
        all_rewards: list with one sequence of per-step rewards per episode.
        discount_factor: forwarded to `discount_rewards`.

    Returns:
        List with one NumPy array per episode holding the normalized
        discounted rewards. If all discounted values are identical, the
        arrays contain zeros.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_all_discounted_rewards = np.concatenate(all_discounted_rewards)
    flat_mean = np.mean(flat_all_discounted_rewards)
    flat_std = np.std(flat_all_discounted_rewards)
    if flat_std == 0:
        # Every centered value is 0 here; dividing by 1 returns zeros instead
        # of the NaNs that 0 / 0 would produce.
        flat_std = 1.0
    return [(discounted_rewards - flat_mean) / flat_std for discounted_rewards in all_discounted_rewards]

# Sanity checks: the trailing comment on each line gives the expected values.
print(discount_rewards([10, 0, -50], 0.8)) # -22 -40 -50
print(discount_and_normalize_rewards([[10, 0, -50], [10, 20]], 0.8)) # -0.28 .... 1.26, 1.07

[-22 -40 -50]
[array([-0.28435071, -0.86597718, -1.18910299]), array([1.26665318, 1.0727777 ])]


In [147]:
# training algorithm: play several episodes without applying grads, then apply mean grad according to a reward
from tqdm import tqdm

# Number of training iterations; train() applies one optimizer update per iteration.
n_iterations = 150
# Episodes played (without applying gradients) before each update.
n_episodes_per_update = 10
# Upper bound on the number of steps in one episode.
n_max_steps = 200
# Discount factor passed to discount_and_normalize_rewards.
discount_factor = 0.97

# Adam optimizer with a learning rate of 0.01.
optimizer = keras.optimizers.Adam(0.01)
# Loss used by play_one_step to compare the chosen action with the predicted probabilities.
loss_fn = keras.losses.categorical_crossentropy

def train():
    """Train the notebook-level `model` with the policy-gradient loop.

    Each of the `n_iterations` iterations plays `n_episodes_per_update`
    episodes without touching the weights, turns their rewards into
    discounted, normalized per-step weights, and applies a single optimizer
    update. For every trainable variable that update uses the mean, over all
    recorded steps, of the step's gradient multiplied by the step's weight.

    Reads the globals `env`, `model`, `optimizer`, `loss_fn`, `n_iterations`,
    `n_episodes_per_update`, `n_max_steps` and `discount_factor`.
    """
    for _ in tqdm(range(n_iterations)):
        all_rewards, all_grads = play_multiple_episodes(
            env, n_episodes_per_update, n_max_steps, model, loss_fn)
        all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

        # One reward-weighted mean gradient per trainable variable.
        mean_grads = [
            tf.reduce_mean(
                [
                    all_grads[episode][step][var_index] * final_reward
                    for episode, final_rewards in enumerate(all_final_rewards)
                    for step, final_reward in enumerate(final_rewards)
                ],
                axis=0,
            )
            for var_index in range(len(model.trainable_variables))
        ]
        optimizer.apply_gradients(zip(mean_grads, model.trainable_variables))
# Run the training loop (the recorded run below took about 12 minutes for 150 iterations).
train()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [12:22<00:00,  4.95s/it]


In [150]:
# Evaluate the trained model: 200 episodes capped at 200 steps each, always
# taking the action with the highest predicted probability.

total_rewards = []
for i_episode in tqdm(range(200)):
    obs, rewards, done, t = env.reset(), 0, False, 0
    # Stop at the step cap or as soon as the environment reports `done`.
    while t < 200 and not done:
        action = int(tf.argmax(model(obs[np.newaxis, :])[0]))
        obs, reward, done, info = env.step(action)
        rewards += reward
        t += 1
    total_rewards.append(rewards)
# Recorded result (mean, std, min, max):
# -39.04493488942172 19.198360777169057 -81.51906253760649 3.046460930017937
print(np.mean(total_rewards), np.std(total_rewards), np.min(total_rewards), np.max(total_rewards))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:41<00:00,  4.78it/s]

-39.04493488942172 19.198360777169057 -81.51906253760649 3.046460930017937



