1. Use our current model and environment to play N episodes.
2. Calculate the total reward of each episode and choose a reward boundary. We use a percentile of the total rewards as the boundary, typically the 50th to 70th.
3. Drop all episodes whose total reward is below the reward boundary.
4. Train on the remaining episodes, using the observations as input and the actions that were taken as the desired output.
5. Return to step 1 until we are satisfied with the output.

In [1]:
import gym
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Hyperparameters of this example (the original note says the remaining settings are not customized).
# HIDDEN_SIZE: number of neurons in the network's hidden layer.
# BATCH_SIZE: number of episodes played per training iteration.
# PERCENTILE: reward percentile used as the cut-off; episodes below it are dropped,
#             so roughly the best 30% of each batch are kept for training.
HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70

In [3]:
class Net(nn.Module):
    """One-hidden-layer perceptron that scores every action for an observation.

    The output is one raw (unnormalized) score per action; no softmax is
    applied inside the network.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are created in input-to-output order and wrapped in a
        # Sequential stored under the attribute name ``net``.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw action scores for the observation batch ``x``."""
        scores = self.net(x)
        return scores

In [4]:
# Two lightweight record types used while collecting experience.
# EpisodeStep: a single step of an episode -- the observation the agent saw and the action it took.
# Episode: one complete episode -- its total (undiscounted) reward and its list of EpisodeStep records.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])
Episode = namedtuple('Episode', ['reward', 'steps'])

In [5]:
def iterate_batches(env, net, batch_size):
    """Play episodes endlessly with the current policy and yield them in batches.

    Args:
        env: Environment exposing the classic Gym API used below: ``reset()``
            returns an observation and ``step(action)`` returns
            ``(observation, reward, done, info)``.
        net: Network mapping a batch of observations to raw action scores.
        batch_size: Number of finished episodes collected before a batch is
            yielded.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples. Each holds the episode's
        total undiscounted reward and its ``EpisodeStep`` records
        (observation seen *before* the action, and the action taken).
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    # The network emits raw scores, so softmax is applied here to obtain
    # action probabilities.
    sm = nn.Softmax(dim=1)

    while True:
        # Go through a single float32 NumPy array (with a leading batch
        # dimension) instead of handing PyTorch a Python list of ndarrays,
        # which is a slow conversion path.
        obs_v = torch.as_tensor(np.asarray([obs], dtype=np.float32))

        # Choosing an action needs no gradients; skipping autograd tracking
        # avoids building a graph on every environment step. The ``yield``
        # below is deliberately outside this context so the caller's
        # training step still runs with gradients enabled.
        with torch.no_grad():
            act_probs_v = sm(net(obs_v))
        # Row 0 is the only row: the batch contains a single observation.
        act_probs = act_probs_v.numpy()[0]

        # Sample an action according to the policy's probabilities and apply
        # it to the environment.
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)

        # Record the observation that led to the action, not the one that
        # resulted from it.
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            # Episode finished: store it, reset the accumulators and start a
            # new episode.
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                # Hand the batch to the caller; the caller typically trains
                # ``net`` before the generator resumes.
                yield batch
                batch = []

        obs = next_obs

In [6]:
#elite-episode filtering used by the training loop
def filter_batch(batch, percentile):
    """Select the best episodes of a batch and convert them to training tensors.

    Args:
        batch: Sequence of episode records, each with a ``reward`` attribute
            and a ``steps`` sequence whose items have ``observation`` and
            ``action`` attributes.
        percentile: Percentile (0-100) of the batch's rewards used as the
            cut-off; episodes with a reward below it are discarded.

    Returns:
        A tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        float32 tensor of the kept observations, int64 tensor of the matching
        actions, the reward cut-off, and the mean reward of the whole batch.
        The last two are only used for monitoring.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for episode in batch:
        # Episodes exactly at the boundary are kept.
        if episode.reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in episode.steps)
        train_act.extend(step.action for step in episode.steps)

    # Stack into one NumPy array first: building a tensor directly from a
    # Python list of ndarrays is a slow path in PyTorch.
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.as_tensor(np.asarray(train_act, dtype=np.int64))
    return train_obs_v, train_act_v, reward_bound, reward_mean

In [7]:
if __name__ == "__main__":
    # Create everything the training run needs: environment, network, loss
    # function, optimizer and TensorBoard writer.
    env = gym.make("CartPole-v0")
    # The Monitor wrapper records the agent's episodes into the "mon" directory.
    env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # CrossEntropyLoss takes the raw network scores, which is why Net has no
    # softmax layer of its own.
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter()

    try:
        # Each iteration: play a batch of episodes, keep only the best ones,
        # and train the network to reproduce the actions taken in them.
        for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            # The loss measures how far the network's scores are from the
            # actions the agent actually chose in the kept episodes.
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

            # Monitoring: iteration number, loss, mean batch reward and the
            # reward boundary go to stdout and to TensorBoard.
            loss = loss_v.item()
            print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" %(iter_no, loss, reward_m, reward_b))
            writer.add_scalar("loss", loss, iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)

            # Gym considers CartPole solved at a mean reward above 195 over
            # 100 episodes. The environment's time limit ends every episode
            # after 200 steps, so a batch mean above 199 is used here as the
            # stopping criterion.
            if reward_m > 199:
                print("Solved!")
                break
    finally:
        # Release resources even if training fails or is interrupted, so the
        # TensorBoard log is flushed and the Monitor can finish its output.
        writer.close()
        env.close()

0: loss=0.698, reward_mean=15.7, reward_bound=16.0
1: loss=0.684, reward_mean=20.5, reward_bound=23.0
2: loss=0.667, reward_mean=30.4, reward_bound=33.5
3: loss=0.652, reward_mean=36.4, reward_bound=44.0
4: loss=0.637, reward_mean=34.4, reward_bound=41.0
5: loss=0.627, reward_mean=36.9, reward_bound=44.5
6: loss=0.629, reward_mean=42.1, reward_bound=52.0
7: loss=0.593, reward_mean=44.5, reward_bound=49.0
8: loss=0.608, reward_mean=53.8, reward_bound=60.5
9: loss=0.589, reward_mean=44.8, reward_bound=48.5
10: loss=0.581, reward_mean=57.0, reward_bound=64.5
11: loss=0.571, reward_mean=56.6, reward_bound=52.0
12: loss=0.552, reward_mean=59.7, reward_bound=71.0
13: loss=0.546, reward_mean=64.9, reward_bound=77.0
14: loss=0.552, reward_mean=63.1, reward_bound=69.5
15: loss=0.551, reward_mean=70.9, reward_bound=86.0
16: loss=0.521, reward_mean=70.8, reward_bound=76.0
17: loss=0.524, reward_mean=62.1, reward_bound=68.0
18: loss=0.533, reward_mean=54.1, reward_bound=58.5
19: loss=0.530, reward