- The REINFORCE method needs short episodes, as in CartPole. In Pong, where episodes are long, it converges very slowly, and even after sampling a lot we end up with only a few successful samples.
- To estimate the Q-value at each step we need Q(s,a) and V(s), but a policy gradient method only gives us the action distribution, not these values, so we have to estimate them with the Actor-Critic method. We can also unroll the estimate over n steps; steps further in the future contribute little, because their discount factor γ^n becomes very small.
- In CartPole, a successful game that holds the pole for 100 steps gets a reward of 100. Compared with a reward of 5 for a 5-step game, the total rewards differ enormously, so one lucky episode dominates the final gradient. This hurts training because the variance is large, so we subtract a baseline value from Q. The baseline can be the mean discounted reward, a moving average of the discounted reward, or the state value V(s).
- We introduce an entropy bonus here. Entropy measures how uncertain the agent is about its action, and is defined as H(π) = -Σ_a π(a|s) log π(a|s). Entropy is always greater than or equal to 0. It reaches its maximum when all actions have the same probability, which means the agent is unsure which action to choose. Conversely, if one action has probability 1 and all the others have 0, the agent is completely certain about what it is doing, and the entropy drops to its minimum. To avoid getting stuck in a local minimum, we subtract the (scaled) entropy from the loss, which punishes the agent for being over-confident about what it should do.
- In DQN we use a replay buffer and sample from it so that the i.i.d. requirement holds. Here we cannot use a replay buffer, because policy gradient is an on-policy method: we cannot store data generated by an old policy in a buffer and sample it later, since we need fresh data produced by the current policy. Therefore we use parallel environments, which means we interact with several environments at once, instead of just one, and sample the training data from all of them.

In [1]:
import gym
import ptan
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Discount factor handed to the experience source.
GAMMA = 0.99
# Learning rate of the Adam optimizer.
LEARNING_RATE = 1e-3
# Scaling coefficient applied to the entropy bonus in the total loss.
ENTROPY_BETA = 1e-2
# Number of transitions collected before every optimization step.
BATCH_SIZE = 8
# Number of steps over which the Bellman equation is unrolled; it is used
# to calculate the discounted total reward.
REWARD_STEPS = 10

In [3]:
class PGN(nn.Module):
    """Small fully connected policy network.

    A single hidden layer of 128 ReLU units followed by a linear output
    layer with one unit per action. No softmax is applied here; the
    forward pass returns the raw output of the last linear layer.
    """

    def __init__(self, input_size, n_actions):
        super().__init__()

        # Same layout as a literal nn.Sequential(...): Linear -> ReLU -> Linear.
        layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the network and return one raw score per action."""
        scores = self.net(x)
        return scores

In [4]:
if __name__ == "__main__":
    # Policy-gradient training on CartPole with a running-mean baseline and an
    # entropy bonus. Training statistics are written to TensorBoard.
    env = gym.make("CartPole-v0")
    writer = SummaryWriter(comment="-cartpole-pg")

    net = PGN(env.observation_space.shape[0], env.action_space.n)
    print(net)

    agent = ptan.agent.PolicyAgent(net, preprocessor=ptan.agent.float32_preprocessor,
                                   apply_softmax=True)
    # The experience source unrolls the Bellman equation for REWARD_STEPS steps,
    # so exp.reward below is taken as the discounted multi-step reward.
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    step_idx = 0
    done_episodes = 0
    reward_sum = 0.0

    batch_states, batch_actions, batch_scales = [], [], []

    # try/finally guarantees the TensorBoard writer is closed (and its event
    # file flushed) even if training is interrupted or raises.
    try:
        # Accumulate the rewards seen so far; their running mean is the
        # baseline subtracted from each reward in the policy gradient.
        for step_idx, exp in enumerate(exp_source):
            reward_sum += exp.reward
            baseline = reward_sum / (step_idx + 1)
            writer.add_scalar("baseline", baseline, step_idx)
            batch_states.append(exp.state)
            batch_actions.append(int(exp.action))
            batch_scales.append(exp.reward - baseline)

            # handle new rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                done_episodes += 1
                reward = new_rewards[0]
                total_rewards.append(reward)
                mean_rewards = float(np.mean(total_rewards[-100:]))
                print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d" % (
                    step_idx, reward, mean_rewards, done_episodes))
                writer.add_scalar("reward", reward, step_idx)
                writer.add_scalar("reward_100", mean_rewards, step_idx)
                writer.add_scalar("episodes", done_episodes, step_idx)
                if mean_rewards > 195:
                    print("Solved in %d steps and %d episodes!" % (step_idx, done_episodes))
                    break

            if len(batch_states) < BATCH_SIZE:
                continue

            # Stack the observations into one ndarray first: building a tensor
            # directly from a list of arrays is slow and warns in recent PyTorch.
            states_v = torch.as_tensor(np.asarray(batch_states, dtype=np.float32))
            batch_actions_t = torch.LongTensor(batch_actions)
            batch_scale_v = torch.FloatTensor(batch_scales)

            # Policy loss: negative mean of (reward - baseline) * log pi(a|s).
            optimizer.zero_grad()
            logits_v = net(states_v)
            log_prob_v = F.log_softmax(logits_v, dim=1)
            # Index by the actual batch length rather than the BATCH_SIZE constant.
            log_prob_actions_v = batch_scale_v * log_prob_v[range(len(batch_actions)), batch_actions_t]
            loss_policy_v = -log_prob_actions_v.mean()

            # Entropy bonus: subtracting the scaled batch entropy from the loss
            # penalizes over-confident action distributions.
            prob_v = F.softmax(logits_v, dim=1)
            entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = -ENTROPY_BETA * entropy_v
            loss_v = loss_policy_v + entropy_loss_v

            loss_v.backward()
            optimizer.step()

            # Kullback-Leibler divergence between the policy before and after
            # the update. A high value means the policy changed a lot in one
            # step, which is bad: the weights are moving sharply and training
            # will hardly converge.
            # This is a diagnostic only, so no autograd graph is built, and it
            # is computed from log-probabilities to avoid log(0) / division by 0.
            with torch.no_grad():
                new_log_prob_v = F.log_softmax(net(states_v), dim=1)
                old_log_prob_v = log_prob_v.detach()
                kl_div_v = (prob_v.detach() * (old_log_prob_v - new_log_prob_v)).sum(dim=1).mean()
            writer.add_scalar("kl", kl_div_v.item(), step_idx)

            # Gradient statistics for this training step: the largest absolute
            # gradient entry, and the root-mean-square of each parameter's
            # gradient averaged over all parameters.
            grad_max = 0.0
            grad_means = 0.0
            grad_count = 0
            for p in net.parameters():
                grad_max = max(grad_max, p.grad.abs().max().item())
                grad_means += (p.grad ** 2).mean().sqrt().item()
                grad_count += 1

            # Save everything to TensorBoard ("baseline" was already logged above).
            writer.add_scalar("entropy", entropy_v.item(), step_idx)
            writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
            writer.add_scalar("loss_entropy", entropy_loss_v.item(), step_idx)
            writer.add_scalar("loss_policy", loss_policy_v.item(), step_idx)
            writer.add_scalar("loss_total", loss_v.item(), step_idx)
            writer.add_scalar("grad_l2", grad_means / grad_count, step_idx)
            writer.add_scalar("grad_max", grad_max, step_idx)

            batch_states.clear()
            batch_actions.clear()
            batch_scales.clear()
    finally:
        writer.close()

PGN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)
16: reward:  15.00, mean_100:  15.00, episodes: 1
28: reward:  11.00, mean_100:  13.00, episodes: 2
48: reward:  19.00, mean_100:  15.00, episodes: 3
57: reward:   9.00, mean_100:  13.50, episodes: 4
72: reward:  14.00, mean_100:  13.60, episodes: 5
95: reward:  22.00, mean_100:  15.00, episodes: 6
147: reward:  51.00, mean_100:  20.14, episodes: 7
159: reward:  11.00, mean_100:  19.00, episodes: 8
235: reward:  75.00, mean_100:  25.22, episodes: 9
282: reward:  46.00, mean_100:  27.30, episodes: 10
311: reward:  28.00, mean_100:  27.36, episodes: 11
342: reward:  30.00, mean_100:  27.58, episodes: 12
361: reward:  18.00, mean_100:  26.85, episodes: 13
394: reward:  32.00, mean_100:  27.21, episodes: 14
419: reward:  24.00, mean_100:  27.00, episodes: 15
433: reward:  13.00, mean_100:  26.12, episodes: 16
468: reward:  3

13084: reward: 200.00, mean_100: 111.13, episodes: 155
13285: reward: 200.00, mean_100: 112.73, episodes: 156
13486: reward: 200.00, mean_100: 114.08, episodes: 157
13571: reward:  84.00, mean_100: 114.56, episodes: 158
13760: reward: 188.00, mean_100: 115.94, episodes: 159
13922: reward: 161.00, mean_100: 117.44, episodes: 160
13966: reward:  43.00, mean_100: 117.68, episodes: 161
14167: reward: 200.00, mean_100: 119.30, episodes: 162
14368: reward: 200.00, mean_100: 121.12, episodes: 163
14536: reward: 167.00, mean_100: 122.45, episodes: 164
14737: reward: 200.00, mean_100: 124.27, episodes: 165
14883: reward: 145.00, mean_100: 125.16, episodes: 166
15059: reward: 175.00, mean_100: 125.82, episodes: 167
15174: reward: 114.00, mean_100: 126.21, episodes: 168
15327: reward: 152.00, mean_100: 127.35, episodes: 169
15528: reward: 200.00, mean_100: 128.54, episodes: 170
15652: reward: 123.00, mean_100: 129.39, episodes: 171
15853: reward: 200.00, mean_100: 130.73, episodes: 172
16054: rew

40539: reward: 134.00, mean_100: 186.85, episodes: 306
40740: reward: 200.00, mean_100: 187.05, episodes: 307
40941: reward: 200.00, mean_100: 187.05, episodes: 308
41142: reward: 200.00, mean_100: 187.82, episodes: 309
41313: reward: 170.00, mean_100: 188.27, episodes: 310
41514: reward: 200.00, mean_100: 188.27, episodes: 311
41571: reward:  56.00, mean_100: 187.55, episodes: 312
41772: reward: 200.00, mean_100: 187.55, episodes: 313
41973: reward: 200.00, mean_100: 187.94, episodes: 314
42174: reward: 200.00, mean_100: 187.94, episodes: 315
42339: reward: 164.00, mean_100: 187.58, episodes: 316
42498: reward: 158.00, mean_100: 187.16, episodes: 317
42699: reward: 200.00, mean_100: 187.16, episodes: 318
42804: reward: 104.00, mean_100: 186.92, episodes: 319
42907: reward: 102.00, mean_100: 185.94, episodes: 320
43043: reward: 135.00, mean_100: 185.29, episodes: 321
43244: reward: 200.00, mean_100: 185.29, episodes: 322
43445: reward: 200.00, mean_100: 185.29, episodes: 323
43646: rew