1. The batch must be large enough. (In CartPole, 16 episodes per batch are enough, but in FrozenLake we need at least 100 episodes per batch to get some successful cases.)
2. Apply a discount factor to the reward (we use 0.9 or 0.95), so that a longer episode gets a smaller reward than a shorter one.
3. Keep elite episodes for longer: in CartPole, we sample episodes, pick the elite ones, train on them and then drop them. In FrozenLake, successful episodes are rare, so we must keep them for more training iterations.
4. Decrease the learning rate, which gives the network time to average over more training samples.
5. Train for longer: we need at least 5000 training iterations to reach a 50% success rate.

In [1]:
import gym, gym.spaces
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Hyperparameters tuned for FrozenLake; everything else is left at its default.

# Discount factor applied to an episode's reward when ranking episodes.
GAMMA = 0.9
# Episodes whose discounted reward is above this percentile of the batch are
# kept as elite episodes (i.e. roughly the best 30%).
PERCENTILE = 70
# Number of episodes played per training iteration (100 instead of 16).
BATCH_SIZE = 100
# Number of neurons in the network's hidden layer.
HIDDEN_SIZE = 128

In [3]:
class Net(nn.Module):
    """Policy network: one hidden ReLU layer followed by a linear output layer.

    The output is a vector of raw action scores (logits); no softmax is applied
    inside the network.

    Args:
        obs_size: Length of the observation vector fed to the network.
        hidden_size: Number of neurons in the hidden layer.
        n_actions: Number of action scores produced per observation.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw action scores for the batch of observations ``x``."""
        logits = self.net(x)
        return logits

In [4]:
# Two lightweight record types used while collecting experience:
# - EpisodeStep is a single step of an episode: the observation the agent saw
#   and the action it then performed.
# - Episode is one whole episode: its total (non-discounted) reward and the
#   list of EpisodeStep records it consists of.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])
Episode = namedtuple('Episode', ['reward', 'steps'])

In [5]:
def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    This is an endless generator: it keeps stepping ``env`` with actions
    sampled from the network's action distribution and yields a list of
    ``batch_size`` finished ``Episode`` records each time that many episodes
    have completed.

    Args:
        env: Environment exposing ``reset()`` returning an observation and
            ``step(action)`` returning ``(obs, reward, done, info)``.
        net: Network mapping a batch of observations to raw action scores.
        batch_size: Number of finished episodes per yielded batch; must be
            at least 1.

    Yields:
        A list of ``Episode`` tuples, each holding the episode's total
        (non-discounted) reward and its ``EpisodeStep`` records.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is less than 1
            (otherwise the loop would never yield and grow without bound).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    # The network outputs raw scores; softmax turns them into probabilities.
    sm = nn.Softmax(dim=1)

    while True:
        # Stack the single observation into one ndarray first: building a
        # tensor straight from a list of ndarrays takes PyTorch's slow path.
        obs_v = torch.FloatTensor(np.array([obs], dtype=np.float32))
        # Rollouts are only used to collect data, so no autograd graph is
        # needed here; the training step recomputes the forward pass.
        with torch.no_grad():
            # The network works on batches; [0] picks our single row.
            act_probs = sm(net(obs_v)).numpy()[0]

        # Sample an action according to the policy's probabilities and apply it.
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)

        # Record the observation the action was chosen FROM, not the one it
        # led to.
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            # Episode finished: store it, reset the accumulators and start a
            # fresh episode.
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) >= batch_size:
                # Hand the batch to the caller; the network may be trained
                # before the generator is resumed.
                yield batch
                batch = []

        obs = next_obs

In [6]:
def filter_batch(batch, percentile, gamma=None):
    """Select the elite episodes of a batch by discounted reward.

    Each episode's reward is scaled by ``gamma ** len(episode.steps)``, so a
    shorter episode ranks above a longer one with the same reward. Episodes
    whose discounted reward is strictly greater than the given percentile of
    all discounted rewards are kept.

    Args:
        batch: Sequence of episodes, each with ``reward`` and ``steps``
            attributes; every step has ``observation`` and ``action``.
        percentile: Percentile (0-100) of discounted rewards used as the
            elite boundary.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.

    Returns:
        A tuple ``(elite_batch, train_obs, train_act, reward_bound)``: the
        elite episodes, the flattened observations and actions of their
        steps, and the reward boundary. For an empty ``batch`` this is
        ``([], [], [], 0.0)``.
    """
    # An empty batch has no percentile; report "nothing elite" explicitly.
    if not batch:
        return [], [], [], 0.0
    if gamma is None:
        gamma = GAMMA

    disc_rewards = [ep.reward * (gamma ** len(ep.steps)) for ep in batch]
    reward_bound = np.percentile(disc_rewards, percentile)

    train_obs = []
    train_act = []
    elite_batch = []
    for episode, disc_reward in zip(batch, disc_rewards):
        # Strict comparison: when every episode scores the same (e.g. all
        # failed with reward 0), nothing is selected.
        if disc_reward > reward_bound:
            train_obs.extend(step.observation for step in episode.steps)
            train_act.extend(step.action for step in episode.steps)
            elite_batch.append(episode)

    return elite_batch, train_obs, train_act, reward_bound

In [7]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that one-hot encodes a discrete observation.

    The wrapped environment must have a ``Discrete`` observation space with
    ``n`` values. The wrapper advertises a ``Box`` space of ``n`` float32
    values in [0, 1] and turns each integer observation into a vector that is
    1.0 at that index and 0.0 elsewhere.
    """

    def __init__(self, env):
        super().__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, shape=(source_space.n,), dtype=np.float32
        )

    def observation(self, observation):
        """Return the one-hot vector for the discrete ``observation``."""
        # The Box lower bound is an all-zero vector of the right shape/dtype.
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

# The reward cannot converge because FrozenLake's reward mechanism is very different from CartPole's: an episode's reward is only success (1) or failure (0), and a random walk has very little chance of reaching the goal. So if we take the episodes above the 50-70th percentile as elite episodes for training, many of them are probably failed episodes.


# To conclude, the cross-entropy method works better when episodes are shorter. The total reward should be able to separate good episodes from bad ones. Also, before the goal is reached there is no indication of whether an episode is going to turn out good or bad.

In [8]:
if __name__ == "__main__":
    # Create everything training needs: environment, network, loss function,
    # optimizer and TensorBoard writer.
    # The wrapper one-hot encodes FrozenLake's discrete state so that it can be
    # fed to the network's first linear layer.
    env = DiscreteOneHotWrapper(gym.make("FrozenLake-v1"))
    # The line below creates a monitor that saves the agent's actions as video.
    env = gym.wrappers.Monitor(env, directory="mon3", force=True)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # CrossEntropyLoss takes the raw action scores, which is why the network
    # has no softmax layer of its own.
    objective = nn.CrossEntropyLoss()
    # Learning rate is 0.001 instead of 0.01 (1/10 of the CartPole value).
    optimizer = optim.Adam(params=net.parameters(), lr=0.001)
    writer = SummaryWriter(comment="-frozenlake-tweaked")

    # Mean batch reward above which training stops. A FrozenLake episode is
    # rewarded 1 on success and 0 otherwise, so the mean reward is the success
    # rate and can never exceed 1 (the former CartPole threshold of 199 was
    # unreachable, so the loop never terminated).
    solved_reward = 0.8

    # Elite episodes carried over from previous iterations.
    full_batch = []
    try:
        for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
            reward_m = float(np.mean([episode.reward for episode in batch]))
            # Re-rank the retained elite episodes together with the new ones.
            full_batch, obs, acts, reward_b = filter_batch(full_batch + batch, PERCENTILE)
            if not full_batch:
                # No episode beat the boundary (e.g. all failed): nothing to
                # train on in this iteration.
                continue
            # Stack into one ndarray first; building a tensor from a list of
            # ndarrays takes PyTorch's slow path.
            obs_v = torch.FloatTensor(np.array(obs, dtype=np.float32))
            acts_v = torch.LongTensor(acts)
            # Cap how many elite episodes are carried to the next iteration.
            full_batch = full_batch[-500:]

            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

            # For monitoring: iteration number, loss, mean batch reward and
            # reward boundary; the same values are written to TensorBoard.
            # Three decimals are printed because both reward values lie in
            # [0, 1] and would mostly round to 0.0 with a single decimal.
            print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f" % (
                iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)

            if reward_m > solved_reward:
                print("Solved!")
                break
    finally:
        # Also runs when training is interrupted, so the TensorBoard log and
        # the monitor output are finalized.
        writer.close()
        env.close()

NameError: name 'reward_b' is not defined