In [2]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import matplotlib.animation as animation

In [3]:
# Create the CartPole-v1 environment used throughout the notebook.
env = gym.make("CartPole-v1")
# Reset the environment to obtain the first observation of an episode.
obs = env.reset()
print(obs) # observation layout: position, velocity, angular position, angular velocity

[ 0.04540147 -0.01949354  0.01132249 -0.0464231 ]


  logger.warn(


In [4]:
# Draw the current state of the environment (rendering goes through pygame).
env.render()

True

In [12]:
# Display the environment's action space; its repr shows the number of possible actions.
env.action_space

Discrete(2)

In [13]:
# Apply action 1 for a single step. step() returns the next observation,
# the reward for this step, a done flag, and an info object.
action = 1
obs, reward, done, info = env.step(action)

In [14]:
# Show each of the four values returned by the env.step() call above.
print(obs)
print(reward)
print(done)
print(info)

[-0.03714275  0.21301685  0.03923215 -0.29308167]
1.0
False
{}


In [5]:
def simple_policy(obs):
    """Hard-coded baseline policy driven only by the angle, obs[2].

    Returns action 0 when the angle is negative and action 1 otherwise
    (an angle of exactly zero therefore maps to action 1).
    """
    return 0 if obs[2] < 0 else 1

In [6]:
# Baseline experiment: play 500 episodes (capped at 200 steps each) with the
# hand-written policy, rendering every step, and keep each episode's total reward.
totals = []
for episode in range(500):
    obs, episode_rewards = env.reset(), 0
    for step in range(200):
        action = simple_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards = episode_rewards + reward
        env.render()
        if done:
            # The environment reported the end of the episode.
            break
    totals.append(episode_rewards)

KeyboardInterrupt: 

In [10]:
# Summary statistics of the per-episode reward totals collected by the baseline loop.
print('mean : ', np.mean(totals), 'std : ', np.std(totals), 'min : ', np.min(totals), 'max : ', np.max(totals))

mean :  41.716 std :  9.034342477457892 min :  24.0 max :  66.0


In [46]:
# Policy network: n_inputs features -> 5 ELU units -> 1 sigmoid unit.
# play_one_step() interprets the single sigmoid output as the probability
# of choosing action 0 (it names that value `left_proba`).
n_inputs = 4
model = keras.models.Sequential([
    keras.layers.Dense(5, activation="elu", input_shape=[n_inputs]),
    keras.layers.Dense(1, activation="sigmoid"),
])

In [37]:
def play_one_step(env, obs, model, loss_fn):
    """Sample one action from the policy, apply it, and return its gradients.

    The model output is treated as the probability of action 0 ("left").
    An action is drawn at random according to that probability, and the loss
    is computed as if the drawn action had been the correct one: the target
    probability of "left" is 1 when action 0 was drawn and 0 otherwise.

    Args:
        env: environment exposing ``step(action)``.
        obs: current observation (indexed with ``np.newaxis`` to add a batch axis).
        model: callable policy with ``trainable_variables``.
        loss_fn: loss called as ``loss_fn(y_target, prediction)``.

    Returns:
        Tuple ``(next_obs, reward, done, grads)`` where ``grads`` are the
        gradients of the loss w.r.t. ``model.trainable_variables``.
    """
    batch = obs[np.newaxis]
    with tf.GradientTape() as tape:
        p_left = model(batch)
        # True (i.e. action 1) with probability 1 - p_left.
        chose_right = tf.random.uniform([1, 1]) > p_left
        target = tf.constant([[1.]]) - tf.cast(chose_right, tf.float32)
        loss = tf.reduce_mean(loss_fn(target, p_left))
    grads = tape.gradient(loss, model.trainable_variables)
    sampled_action = int(chose_right[0, 0].numpy())
    next_obs, reward, done, _info = env.step(sampled_action)
    return next_obs, reward, done, grads

In [38]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Play several episodes and collect per-step rewards and gradients.

    Each episode starts from ``env.reset()`` and runs until the environment
    reports ``done`` or ``n_max_steps`` steps have been played.

    Returns:
        ``(all_rewards, all_grads)``: two lists with one entry per episode.
        Each entry is itself a list with one item per step played (the step
        reward, and the gradients returned by ``play_one_step``).
    """
    all_rewards, all_grads = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()
        for _ in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(grads)
            if done:
                break
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)
    return all_rewards, all_grads

In [39]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted sum of future rewards for every step of an episode.

    Element ``i`` of the result is
    ``rewards[i] + discount_rate * rewards[i+1] + discount_rate**2 * rewards[i+2] + ...``.

    Args:
        rewards: sequence of per-step rewards for one episode (may be empty).
        discount_rate: factor applied to each step further into the future.

    Returns:
        A float NumPy array with the same length as ``rewards``.
    """
    # Force a float array: with integer rewards np.array() would build an
    # integer array and the assignment below would silently truncate any
    # fractional discounted value.
    discounted = np.array(rewards, dtype=float)
    # Walk backwards so each step can reuse the already-discounted next step.
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

In [40]:
def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount every episode's rewards, then standardize them across the batch.

    Each episode is discounted independently with ``discount_rewards``; the
    mean and standard deviation are then computed over all steps of all
    episodes pooled together and used to normalize every episode.

    Args:
        all_rewards: list of episodes, each a sequence of per-step rewards.
        discount_rate: discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one normalized NumPy array per episode; an empty list
        when ``all_rewards`` is empty.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate() raises on an empty list; nothing to normalize.
        return []
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # All discounted rewards are identical, so the centered values are all
        # zero; dividing by 1 keeps them at 0 instead of producing NaN.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [42]:
# Sanity check: the last step keeps -50, the middle step is 0 + 0.8 * (-50) = -40,
# and the first step is 10 + 0.8 * (-40) = -22.
discount_rewards([10, 0, -50], discount_rate=0.8)

array([-22, -40, -50])

In [43]:
# Sanity check with two episodes of different lengths; the mean and standard
# deviation used for normalization are pooled over the steps of both episodes.
discount_and_normalize_rewards([[10,0,-50],[10, 20]], discount_rate=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

In [74]:
# Hyperparameters consumed by the training loop below.
n_iterations = 100  # number of gradient updates
n_episodes_per_update = 10  # episodes played before each gradient update
n_max_steps = 400  # upper bound on the number of steps per episode
discount_rate = 0.95  # discount factor applied to future rewards

In [75]:
# Adam applies the averaged gradients in the training loop; binary cross-entropy
# is the loss that play_one_step() differentiates for each sampled action.
optimizer = keras.optimizers.Adam(learning_rate=0.007)
loss_fn = keras.losses.binary_crossentropy

In [76]:
# Policy-gradient training loop. Each iteration plays a batch of episodes,
# weights every step's gradients by that step's normalized discounted reward,
# averages them per trainable variable, and applies a single optimizer update.
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn)

    # Progress line; the leading "\r" makes it overwrite the previous one.
    total_rewards = sum(sum(step_rewards) for step_rewards in all_rewards)
    print("\rIteration: {}, mean rewards: {:.1f}".format(
        iteration, total_rewards / n_episodes_per_update), end="")

    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                       discount_rate)

    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        # Gradient of this variable at every step of every episode, each scaled
        # by the normalized discounted reward of that step.
        weighted_grads = []
        for episode_index, final_rewards in enumerate(all_final_rewards):
            for step, final_reward in enumerate(final_rewards):
                weighted_grads.append(
                    final_reward * all_grads[episode_index][step][var_index])
        mean_grads = tf.reduce_mean(weighted_grads, axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

Iteration: 99, mean rewards: 381.0

In [81]:
# Persist the trained policy network under the relative path models/1st_learned_model.
model.save('models/1st_learned_model')

INFO:tensorflow:Assets written to: models/1st_learned_model/assets


2022-07-08 16:57:03.729589: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


In [77]:
def test_trained_model(n_max_steps=400):
    """Play one rendered CartPole episode using the notebook-level ``model``.

    A fresh environment is created, actions are sampled from the model's
    predicted probability of action 0, and every step is rendered. The episode
    stops when the environment reports ``done`` or after ``n_max_steps`` steps.

    Args:
        n_max_steps: maximum number of steps to play.
    """
    env = gym.make("CartPole-v1")
    try:
        obs = env.reset()
        for step in range(n_max_steps):
            # predict() returns a (1, 1) array; take the scalar explicitly so
            # int() below does not depend on array-to-scalar conversion, which
            # NumPy has deprecated. verbose=0 keeps newer Keras versions from
            # printing a progress bar on every step.
            left_proba = model.predict(obs.reshape(1, -1), verbose=0)[0, 0]
            action = int(np.random.rand() > left_proba)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                break
    finally:
        # Always release the environment (and its render window), even if the
        # loop raises or the cell is interrupted.
        env.close()

In [80]:
# Play one rendered episode with the trained model (default cap of 400 steps).
test_trained_model()

In [35]:
# Inspect the averaged gradients left over from the last executed training
# iteration: one tensor per trainable variable of the model.
all_mean_grads

[<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
 array([[ 1.9074937e-04, -3.2367632e-05, -4.1303618e-04, -4.2168500e-05,
         -5.3802072e-05],
        [ 1.3819337e-03, -2.3294112e-04, -2.0992579e-03, -2.3773212e-03,
         -2.1802902e-03],
        [-1.0039198e-04,  1.9278592e-05,  1.1088847e-05,  7.3342392e-04,
          6.4214121e-04],
        [-1.7743259e-03,  3.0406620e-04,  1.9464429e-03,  5.2335402e-03,
          4.6678316e-03]], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=
 array([ 0.00121662, -0.00022681, -0.00372062,  0.00425246,  0.00362416],
       dtype=float32)>,
 <tf.Tensor: shape=(5, 1), dtype=float32, numpy=
 array([[ 0.00529886],
        [ 0.00145386],
        [-0.00105939],
        [-0.00275677],
        [-0.00192604]], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.01368127], dtype=float32)>]

In [36]:
# Inspect the normalized discounted rewards left over from the last executed
# training iteration: one array per episode, one value per step.
all_final_rewards

[array([ 2.1791304 ,  2.02380075,  1.86847111,  1.71314147,  1.55781182,
         1.40248218,  1.24715254,  1.0918229 ,  0.93649325,  0.78116361,
         0.62583397,  0.47050432,  0.31517468,  0.15984504,  0.0045154 ,
        -0.15081425, -0.30614389, -0.46147353, -0.61680317, -0.77213282,
        -0.92746246, -1.0827921 , -1.23812175, -1.39345139]),
 array([ 0.31517468,  0.15984504,  0.0045154 , -0.15081425, -0.30614389,
        -0.46147353, -0.61680317, -0.77213282, -0.92746246, -1.0827921 ,
        -1.23812175, -1.39345139]),
 array([ 0.78116361,  0.62583397,  0.47050432,  0.31517468,  0.15984504,
         0.0045154 , -0.15081425, -0.30614389, -0.46147353, -0.61680317,
        -0.77213282, -0.92746246, -1.0827921 , -1.23812175, -1.39345139]),
 array([ 0.0045154 , -0.15081425, -0.30614389, -0.46147353, -0.61680317,
        -0.77213282, -0.92746246, -1.0827921 , -1.23812175, -1.39345139]),
 array([ 0.78116361,  0.62583397,  0.47050432,  0.31517468,  0.15984504,
         0.0045154 , -