In [None]:
#| default_exp cartpole_cross_entropy

In [None]:
# |export
import gym
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

Our model's core is a one-hidden-layer NN, with rectified linear unit (ReLU) and 128 hidden neurons (which is absolutely
arbitrary).

We define three hyperparameters: the count of neurons in the hidden layer (128), the count of episodes we play on
every iteration (16), and the percentile of episodes' total rewards that we use for "elite" episode filtering. We
take the 70th percentile, which means that we keep the top 30% of episodes
sorted by reward.

In [None]:
# |export
# Number of neurons in the hidden layer (passed to Net as hidden_size).
HIDDEN_SIZE = 128
# Number of episodes collected per training iteration (passed to iterate_batches as batch_size).
BATCH_SIZE = 16
# Percentile of episode rewards used as the "elite" cut-off (passed to filter_batch).
PERCENTILE = 70

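The neural network class takes a single observation from the
environment as an input vector and outputs a number for every action we can
perform. The output from the NN should be interpreted as a probability distribution over actions, so
a straightforward way to proceed would be to include a softmax nonlinearity after
the last layer. However, `Net` below deliberately returns the raw scores (logits): `nn.CrossEntropyLoss`, which we use for training, applies the softmax internally in a more numerically stable way, so we apply softmax ourselves only when sampling actions.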

In addition, we define two helper classes:

- `EpisodeStep`: represents one single step that our agent made in the episode. It stores the observation from the environment and the action the agent took. We will use episode steps from "elite" episodes as training data.

- `Episode`: a single episode, stored as its total undiscounted reward and a collection of `EpisodeStep` objects.

In [None]:
# |export

class Net(nn.Module):
    """Feed-forward policy network with a single ReLU hidden layer.

    Maps an observation vector of length ``obs_size`` to ``n_actions`` raw
    scores. No softmax is applied here; the output is whatever the final
    linear layer produces.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # consumes the random generator in the same sequence as before.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the per-action scores for the input batch ``x``."""
        scores = self.net(x)
        return scores

In [None]:
# |export
# One agent step: the observation it acted on and the action it chose.
EpisodeStep = namedtuple("EpisodeStep", ["observation", "action"])
# One finished episode: its total reward and the ordered EpisodeStep records.
Episode = namedtuple("Episode", ["reward", "steps"])

In [None]:
# |export
def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Accepts both reset conventions: a bare observation, or an
    ``(observation, info_dict)`` pair.
    """
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` once and return ``(next_obs, reward, is_done)``.

    Accepts both step conventions: the 4-tuple
    ``(obs, reward, done, info)`` and the 5-tuple
    ``(obs, reward, terminated, truncated, info)``.
    """
    result = env.step(action)
    if len(result) == 5:
        next_obs, reward, terminated, truncated, _ = result
        return next_obs, reward, bool(terminated or truncated)
    next_obs, reward, is_done, _ = result
    return next_obs, reward, is_done


def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    This is an endless generator: it keeps stepping ``env`` with actions
    sampled from ``softmax(net(obs))`` and yields a list of ``batch_size``
    ``Episode`` objects every time that many episodes have finished.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        net: callable mapping a ``(1, obs_size)`` float tensor to per-action scores.
        batch_size: number of finished episodes per yielded batch (>= 1).

    Yields:
        list of ``Episode``; each holds the episode's summed reward and its
        ``EpisodeStep`` records (the observation acted on and the action taken).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (the loop would
            otherwise run forever without yielding anything).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = _reset_env(env)
    sm = nn.Softmax(dim=1)
    while True:
        # Go through one float32 array: building a tensor from a list of
        # ndarrays is a slow path in PyTorch.
        obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)
        # Sampling needs no gradients. The context is closed before any
        # `yield`, so the caller's autograd mode is never affected.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done = _step_env(env, action)
        episode_reward += reward
        # Store the observation the action was chosen for, not the resulting one.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = _reset_env(env)
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

The `iterate_batches` generator above accepts the environment (the Env class instance from the Gym library), our NN, and the count of episodes it should generate on every iteration.

Inside it, we also declare a reward counter for the current episode and its
list of steps (the EpisodeStep objects). Then we reset our environment to obtain the
first observation and create a softmax layer, which will be used to convert the NN's
output to a probability distribution of actions.



The `filter_batch` function below is at the core of the cross-entropy method: from the given batch
of episodes and percentile value, it calculates a boundary reward, which is used
to filter "elite" episodes to train on. To obtain the boundary reward, we use
NumPy's percentile function, which, from the list of values and the desired
percentile, calculates the percentile's value. Then, we calculate the mean
reward, which is used only for monitoring.

In [None]:
# |export

def filter_batch(batch, percentile):
    """Keep the "elite" episodes of a batch and turn them into training tensors.

    Args:
        batch: sequence of episodes; each exposes ``reward`` and unpacks as
            ``(reward, steps)``, where every step has ``observation`` and
            ``action`` attributes.
        percentile: percentile (0-100) of the episode rewards used as cut-off.

    Returns:
        Tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float32 tensor of the elite observations, an int64 tensor of the
        matching actions, the reward cut-off computed by ``np.percentile``,
        and the mean reward of the whole batch as a Python float.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for reward, steps in batch:
        # Episodes exactly on the boundary are kept; only strictly worse ones are dropped.
        if reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in steps)
        train_act.extend(step.action for step in steps)

    # Stack into one float32 array first: building a tensor directly from a
    # list of ndarrays is a slow path in PyTorch.
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean

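Main script: create the environment, the network, the loss, and the optimizer, then train on the elite episodes of every batch until the mean reward of a batch exceeds 199.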

In [None]:
# |export

if __name__ == "__main__":
    # Script entry point: train Net on CartPole with the cross-entropy method.
    # Each iteration plays BATCH_SIZE episodes, keeps the elite ones selected
    # by filter_batch, and takes one supervised step that pushes the policy
    # towards the actions chosen in those episodes.
    LEARNING_RATE = 0.01
    SOLVED_REWARD = 199  # stop once a batch's mean reward exceeds this

    env = gym.make("CartPole-v0")
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    try:
        obs_size = env.observation_space.shape[0]
        n_actions = env.action_space.n

        net = Net(obs_size, HIDDEN_SIZE, n_actions)
        # CrossEntropyLoss consumes the raw network scores, so Net's output
        # is passed to it directly.
        objective = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)

        for iter_no, batch in enumerate(iterate_batches(
                env, net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = \
                filter_batch(batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
                iter_no, loss_v.item(), reward_m, reward_b))
            if reward_m > SOLVED_REWARD:
                print("Solved!")
                break
    finally:
        # Close the environment even if training is interrupted or fails.
        env.close()

0: loss=0.687, reward_mean=22.7, rw_bound=26.5
1: loss=0.660, reward_mean=24.8, rw_bound=29.0
2: loss=0.663, reward_mean=32.2, rw_bound=36.5
3: loss=0.641, reward_mean=44.8, rw_bound=45.0
4: loss=0.648, reward_mean=37.6, rw_bound=42.0
5: loss=0.619, reward_mean=42.1, rw_bound=49.5
6: loss=0.634, reward_mean=50.6, rw_bound=60.5
7: loss=0.614, reward_mean=48.5, rw_bound=56.5
8: loss=0.616, reward_mean=52.9, rw_bound=71.0
9: loss=0.606, reward_mean=57.2, rw_bound=62.0
10: loss=0.601, reward_mean=57.7, rw_bound=64.0
11: loss=0.594, reward_mean=66.1, rw_bound=62.0
12: loss=0.595, reward_mean=68.1, rw_bound=90.0
13: loss=0.576, reward_mean=55.4, rw_bound=71.0
14: loss=0.595, reward_mean=69.5, rw_bound=72.0
15: loss=0.591, reward_mean=66.2, rw_bound=76.0
16: loss=0.598, reward_mean=74.2, rw_bound=70.0
17: loss=0.560, reward_mean=94.4, rw_bound=103.0
18: loss=0.568, reward_mean=87.2, rw_bound=100.5
19: loss=0.566, reward_mean=98.0, rw_bound=131.5
20: loss=0.567, reward_mean=123.2, rw_bound=154