In [1]:
import gym
import ptan
import argparse
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Discount factor handed to the experience source (gamma).
GAMMA = 0.99
# Step size for the Adam optimizer.
LEARNING_RATE = 1e-3
# Scale of the entropy bonus: the batch entropy is multiplied by this value
# before it is subtracted from the loss.
ENTROPY_BETA = 1e-2
# Number of collected transitions that triggers one optimization step.
BATCH_SIZE = 8
# Number of steps the Bellman equation is unrolled for when the experience
# source computes the discounted total reward of a transition.
REWARD_STEPS = 10

In [3]:
class PGN(nn.Module):
    """Small fully connected policy network.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> n_actions). The output is one raw score (logit)
    per action; no softmax is applied inside the module.

    Args:
        input_size: number of features in one observation.
        n_actions: number of discrete actions (size of the output layer).
        hidden_size: width of the single hidden layer. Defaults to 128,
            which reproduces the original fixed architecture.
    """

    def __init__(self, input_size, n_actions, hidden_size=128):
        super(PGN, self).__init__()

        # Kept as ``self.net`` (a Sequential with indices 0..2) so parameter
        # names such as ``net.0.weight`` are unchanged.
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        """Return the unnormalized action scores for the batch ``x``."""
        return self.net(x)

In [4]:
if __name__ == "__main__":
    # Train a policy-gradient agent on CartPole-v0 and log diagnostics
    # (rewards, losses, entropy, KL divergence, gradient statistics) to
    # TensorBoard. With --baseline, the running mean of the observed rewards
    # is subtracted from every reward before it scales the gradient.
    parser = argparse.ArgumentParser()
    parser.add_argument("--baseline", default=False, action='store_true', help="Enable mean baseline")
    # parse_known_args() ignores unrecognized argv entries instead of failing
    # (presumably so the cell also runs inside a notebook kernel).
    args, unknown = parser.parse_known_args()

    env = gym.make("CartPole-v0")
    writer = SummaryWriter(comment="-cartpole-pg" + "-baseline=%s" % args.baseline)

    # try/finally guarantees the writer and the environment are released even
    # when training is interrupted or raises.
    try:
        net = PGN(env.observation_space.shape[0], env.action_space.n)
        print(net)

        # The network outputs raw scores; the agent applies softmax itself.
        agent = ptan.agent.PolicyAgent(net, preprocessor=ptan.agent.float32_preprocessor,
                                       apply_softmax=True)
        # The experience source unrolls the Bellman equation for REWARD_STEPS
        # steps, so exp.reward is the discounted reward over those steps.
        exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

        total_rewards = []
        done_episodes = 0
        reward_sum = 0.0

        batch_states, batch_actions, batch_scales = [], [], []

        for step_idx, exp in enumerate(exp_source):
            # Running mean of all discounted rewards seen so far; used as the
            # baseline of the policy gradient.
            reward_sum += exp.reward
            baseline = reward_sum / (step_idx + 1)
            writer.add_scalar("baseline", baseline, step_idx)
            batch_states.append(exp.state)
            batch_actions.append(int(exp.action))
            if args.baseline:
                batch_scales.append(exp.reward - baseline)
            else:
                batch_scales.append(exp.reward)

            # A non-empty result is treated as "an episode has just finished";
            # its first entry is used as that episode's total reward.
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                done_episodes += 1
                reward = new_rewards[0]
                total_rewards.append(reward)
                mean_rewards = float(np.mean(total_rewards[-100:]))
                print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d" % (
                    step_idx, reward, mean_rewards, done_episodes))
                writer.add_scalar("reward", reward, step_idx)
                writer.add_scalar("reward_100", mean_rewards, step_idx)
                writer.add_scalar("episodes", done_episodes, step_idx)
                if mean_rewards > 195:
                    print("Solved in %d steps and %d episodes!" % (step_idx, done_episodes))
                    break

            # Keep collecting until a full batch is available.
            if len(batch_states) < BATCH_SIZE:
                continue

            # Stack the states with numpy first: building a tensor directly
            # from a Python list of arrays is much slower.
            states_v = torch.as_tensor(np.asarray(batch_states), dtype=torch.float32)
            batch_actions_t = torch.LongTensor(batch_actions)
            batch_scale_v = torch.FloatTensor(batch_scales)

            # Policy loss: negative mean of (scale * log-probability of the
            # action that was actually taken).
            optimizer.zero_grad()
            logits_v = net(states_v)
            log_prob_v = F.log_softmax(logits_v, dim=1)
            # Index rows by the real batch length rather than a constant.
            taken_log_prob_v = log_prob_v[range(len(batch_actions)), batch_actions_t]
            log_prob_actions_v = batch_scale_v * taken_log_prob_v
            loss_policy_v = -log_prob_actions_v.mean()

            # retain_graph=True keeps the autograd graph alive after this
            # backward pass, because the entropy loss below backpropagates
            # through the same logits a second time.
            loss_policy_v.backward(retain_graph=True)

            # Snapshot of the policy-only gradients for the statistics logged
            # below. flatten() copies, so the later entropy backward pass
            # (which accumulates into p.grad) does not alter this snapshot.
            grads = np.concatenate([p.grad.data.numpy().flatten()
                                    for p in net.parameters()
                                    if p.grad is not None])

            # Entropy bonus: subtracting ENTROPY_BETA * entropy from the loss
            # rewards higher-entropy action distributions. Its gradients are
            # accumulated on top of the policy gradients before the step.
            prob_v = F.softmax(logits_v, dim=1)
            entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = -ENTROPY_BETA * entropy_v
            entropy_loss_v.backward()
            optimizer.step()

            # Total loss is computed only for logging; both parts were
            # already backpropagated separately.
            loss_v = loss_policy_v + entropy_loss_v

            # KL divergence between the policy before (prob_v) and after the
            # update, on the same batch. A large value means the update
            # changed the policy a lot. This is a pure diagnostic, so no
            # autograd graph is built for it.
            with torch.no_grad():
                new_logits_v = net(states_v)
                new_prob_v = F.softmax(new_logits_v, dim=1)
                kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
            writer.add_scalar("kl", kl_div_v.item(), step_idx)

            # Per-batch diagnostics. "baseline" is already logged for every
            # step at the top of the loop, so it is not logged again here.
            writer.add_scalar("entropy", entropy_v.item(), step_idx)
            writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
            writer.add_scalar("loss_entropy", entropy_loss_v.item(), step_idx)
            writer.add_scalar("loss_policy", loss_policy_v.item(), step_idx)
            writer.add_scalar("loss_total", loss_v.item(), step_idx)

            writer.add_scalar("grad_l2", np.sqrt(np.mean(np.square(grads))), step_idx)
            writer.add_scalar("grad_max", np.max(np.abs(grads)), step_idx)
            writer.add_scalar("grad_var", np.var(grads), step_idx)

            batch_states.clear()
            batch_actions.clear()
            batch_scales.clear()
    finally:
        writer.close()
        env.close()

PGN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)
27: reward:  26.00, mean_100:  26.00, episodes: 1
46: reward:  18.00, mean_100:  22.00, episodes: 2
64: reward:  17.00, mean_100:  20.33, episodes: 3
77: reward:  12.00, mean_100:  18.25, episodes: 4
89: reward:  11.00, mean_100:  16.80, episodes: 5
125: reward:  35.00, mean_100:  19.83, episodes: 6
139: reward:  13.00, mean_100:  18.86, episodes: 7
185: reward:  45.00, mean_100:  22.12, episodes: 8
203: reward:  17.00, mean_100:  21.56, episodes: 9
220: reward:  16.00, mean_100:  21.00, episodes: 10
263: reward:  42.00, mean_100:  22.91, episodes: 11
295: reward:  31.00, mean_100:  23.58, episodes: 12
311: reward:  15.00, mean_100:  22.92, episodes: 13
349: reward:  37.00, mean_100:  23.93, episodes: 14
395: reward:  45.00, mean_100:  25.33, episodes: 15
417: reward:  21.00, mean_100:  25.06, episodes: 16
431: reward:  

5578: reward:  33.00, mean_100:  38.46, episodes: 153
5634: reward:  55.00, mean_100:  38.52, episodes: 154
5691: reward:  56.00, mean_100:  38.91, episodes: 155
5771: reward:  79.00, mean_100:  39.58, episodes: 156
5816: reward:  44.00, mean_100:  39.60, episodes: 157
5877: reward:  60.00, mean_100:  40.06, episodes: 158
5932: reward:  54.00, mean_100:  40.44, episodes: 159
5982: reward:  49.00, mean_100:  40.67, episodes: 160
6019: reward:  36.00, mean_100:  40.59, episodes: 161
6043: reward:  23.00, mean_100:  40.56, episodes: 162
6127: reward:  83.00, mean_100:  41.22, episodes: 163
6187: reward:  59.00, mean_100:  41.58, episodes: 164
6266: reward:  78.00, mean_100:  42.06, episodes: 165
6309: reward:  42.00, mean_100:  42.27, episodes: 166
6354: reward:  44.00, mean_100:  42.59, episodes: 167
6504: reward: 149.00, mean_100:  43.50, episodes: 168
6540: reward:  35.00, mean_100:  43.63, episodes: 169
6640: reward:  99.00, mean_100:  43.81, episodes: 170
6688: reward:  47.00, mean_1

16740: reward: 158.00, mean_100:  72.89, episodes: 304
16806: reward:  65.00, mean_100:  73.25, episodes: 305
16942: reward: 135.00, mean_100:  73.74, episodes: 306
17094: reward: 151.00, mean_100:  74.32, episodes: 307
17215: reward: 120.00, mean_100:  74.92, episodes: 308
17245: reward:  29.00, mean_100:  74.40, episodes: 309
17350: reward: 104.00, mean_100:  73.44, episodes: 310
17492: reward: 141.00, mean_100:  73.33, episodes: 311
17681: reward: 188.00, mean_100:  73.65, episodes: 312
17882: reward: 200.00, mean_100:  74.84, episodes: 313
17993: reward: 110.00, mean_100:  74.91, episodes: 314
18137: reward: 143.00, mean_100:  76.06, episodes: 315
18284: reward: 146.00, mean_100:  76.77, episodes: 316
18409: reward: 124.00, mean_100:  77.10, episodes: 317
18541: reward: 131.00, mean_100:  77.49, episodes: 318
18742: reward: 200.00, mean_100:  78.79, episodes: 319
18839: reward:  96.00, mean_100:  78.54, episodes: 320
18953: reward: 113.00, mean_100:  78.51, episodes: 321
19030: rew

32680: reward: 100.00, mean_100: 101.67, episodes: 458
32716: reward:  35.00, mean_100: 101.13, episodes: 459
32741: reward:  24.00, mean_100: 100.27, episodes: 460
32803: reward:  61.00, mean_100:  99.84, episodes: 461
32885: reward:  81.00, mean_100:  99.48, episodes: 462
32910: reward:  24.00, mean_100:  98.50, episodes: 463
32944: reward:  33.00, mean_100:  97.58, episodes: 464
32980: reward:  35.00, mean_100:  95.97, episodes: 465
33025: reward:  44.00, mean_100:  94.56, episodes: 466
33112: reward:  86.00, mean_100:  93.42, episodes: 467
33198: reward:  85.00, mean_100:  92.27, episodes: 468
33270: reward:  71.00, mean_100:  91.33, episodes: 469
33326: reward:  55.00, mean_100:  89.88, episodes: 470
33366: reward:  39.00, mean_100:  88.27, episodes: 471
33393: reward:  26.00, mean_100:  86.92, episodes: 472
33465: reward:  71.00, mean_100:  86.36, episodes: 473
33525: reward:  59.00, mean_100:  84.95, episodes: 474
33580: reward:  54.00, mean_100:  83.78, episodes: 475
33644: rew

45567: reward: 102.00, mean_100:  88.61, episodes: 608
45668: reward: 100.00, mean_100:  89.35, episodes: 609
45776: reward: 107.00, mean_100:  89.30, episodes: 610
45868: reward:  91.00, mean_100:  89.56, episodes: 611
45960: reward:  91.00, mean_100:  89.72, episodes: 612
46042: reward:  81.00, mean_100:  89.54, episodes: 613
46103: reward:  60.00, mean_100:  89.63, episodes: 614
46159: reward:  55.00, mean_100:  89.72, episodes: 615
46234: reward:  74.00, mean_100:  90.24, episodes: 616
46276: reward:  41.00, mean_100:  89.89, episodes: 617
46297: reward:  20.00, mean_100:  89.21, episodes: 618
46381: reward:  83.00, mean_100:  88.88, episodes: 619
46442: reward:  60.00, mean_100:  88.36, episodes: 620
46469: reward:  26.00, mean_100:  88.23, episodes: 621
46561: reward:  91.00, mean_100:  87.99, episodes: 622
46656: reward:  94.00, mean_100:  87.96, episodes: 623
46735: reward:  78.00, mean_100:  87.62, episodes: 624
46763: reward:  27.00, mean_100:  87.36, episodes: 625
46851: rew

60008: reward: 100.00, mean_100:  88.17, episodes: 761
60112: reward: 103.00, mean_100:  88.08, episodes: 762
60139: reward:  26.00, mean_100:  88.06, episodes: 763
60251: reward: 111.00, mean_100:  88.08, episodes: 764
60356: reward: 104.00, mean_100:  87.94, episodes: 765
60373: reward:  16.00, mean_100:  86.82, episodes: 766
60469: reward:  95.00, mean_100:  86.40, episodes: 767
60495: reward:  25.00, mean_100:  85.24, episodes: 768
60602: reward: 106.00, mean_100:  85.03, episodes: 769
60702: reward:  99.00, mean_100:  84.50, episodes: 770
60810: reward: 107.00, mean_100:  83.77, episodes: 771
60913: reward: 102.00, mean_100:  83.07, episodes: 772
61016: reward: 102.00, mean_100:  82.49, episodes: 773
61129: reward: 112.00, mean_100:  82.34, episodes: 774
61238: reward: 108.00, mean_100:  82.12, episodes: 775
61266: reward:  27.00, mean_100:  80.82, episodes: 776
61290: reward:  23.00, mean_100:  79.77, episodes: 777
61403: reward: 112.00, mean_100:  79.59, episodes: 778
61441: rew

74178: reward: 105.00, mean_100:  97.79, episodes: 910
74225: reward:  46.00, mean_100:  97.15, episodes: 911
74337: reward: 111.00, mean_100:  97.25, episodes: 912
74443: reward: 105.00, mean_100:  98.02, episodes: 913
74554: reward: 110.00, mean_100:  98.11, episodes: 914
74656: reward: 101.00, mean_100:  98.06, episodes: 915
74752: reward:  95.00, mean_100:  98.68, episodes: 916
74865: reward: 112.00, mean_100:  98.74, episodes: 917
74895: reward:  29.00, mean_100:  98.02, episodes: 918
74926: reward:  30.00, mean_100:  97.29, episodes: 919
75031: reward: 104.00, mean_100:  97.41, episodes: 920
75131: reward:  99.00, mean_100:  97.99, episodes: 921
75230: reward:  98.00, mean_100:  97.93, episodes: 922
75336: reward: 105.00, mean_100:  97.97, episodes: 923
75374: reward:  37.00, mean_100:  97.97, episodes: 924
75479: reward: 104.00, mean_100:  97.97, episodes: 925
75576: reward:  96.00, mean_100:  98.19, episodes: 926
75593: reward:  16.00, mean_100:  98.10, episodes: 927
75632: rew

93874: reward: 113.00, mean_100: 145.21, episodes: 1061
93997: reward: 122.00, mean_100: 144.43, episodes: 1062
94107: reward: 109.00, mean_100: 143.96, episodes: 1063
94204: reward:  96.00, mean_100: 143.29, episodes: 1064
94302: reward:  97.00, mean_100: 142.97, episodes: 1065
94417: reward: 114.00, mean_100: 142.60, episodes: 1066
94494: reward:  76.00, mean_100: 141.61, episodes: 1067
94603: reward: 108.00, mean_100: 141.10, episodes: 1068
94707: reward: 103.00, mean_100: 140.25, episodes: 1069
94820: reward: 112.00, mean_100: 139.69, episodes: 1070
94945: reward: 124.00, mean_100: 139.50, episodes: 1071
95067: reward: 121.00, mean_100: 139.16, episodes: 1072
95190: reward: 122.00, mean_100: 138.75, episodes: 1073
95290: reward:  99.00, mean_100: 137.74, episodes: 1074
95428: reward: 137.00, mean_100: 137.11, episodes: 1075
95550: reward: 121.00, mean_100: 136.46, episodes: 1076
95666: reward: 115.00, mean_100: 135.81, episodes: 1077
95795: reward: 128.00, mean_100: 135.13, episode

113920: reward: 116.00, mean_100: 131.70, episodes: 1209
114064: reward: 143.00, mean_100: 131.68, episodes: 1210
114186: reward: 121.00, mean_100: 131.43, episodes: 1211
114309: reward: 122.00, mean_100: 131.12, episodes: 1212
114451: reward: 141.00, mean_100: 131.17, episodes: 1213
114568: reward: 116.00, mean_100: 130.76, episodes: 1214
114694: reward: 125.00, mean_100: 130.76, episodes: 1215
114810: reward: 115.00, mean_100: 130.60, episodes: 1216
114949: reward: 138.00, mean_100: 130.49, episodes: 1217
115086: reward: 136.00, mean_100: 130.48, episodes: 1218
115195: reward: 108.00, mean_100: 130.22, episodes: 1219
115310: reward: 114.00, mean_100: 129.95, episodes: 1220
115447: reward: 136.00, mean_100: 129.98, episodes: 1221
115587: reward: 139.00, mean_100: 129.85, episodes: 1222
115722: reward: 134.00, mean_100: 129.90, episodes: 1223
115868: reward: 145.00, mean_100: 130.07, episodes: 1224
116002: reward: 133.00, mean_100: 130.13, episodes: 1225
116121: reward: 118.00, mean_10

137591: reward: 125.00, mean_100: 177.13, episodes: 1353
137723: reward: 131.00, mean_100: 176.86, episodes: 1354
137905: reward: 181.00, mean_100: 176.67, episodes: 1355
138106: reward: 200.00, mean_100: 176.67, episodes: 1356
138307: reward: 200.00, mean_100: 177.21, episodes: 1357
138508: reward: 200.00, mean_100: 177.36, episodes: 1358
138709: reward: 200.00, mean_100: 177.96, episodes: 1359
138910: reward: 200.00, mean_100: 178.08, episodes: 1360
139111: reward: 200.00, mean_100: 178.65, episodes: 1361
139312: reward: 200.00, mean_100: 178.95, episodes: 1362
139513: reward: 200.00, mean_100: 178.95, episodes: 1363
139714: reward: 200.00, mean_100: 179.33, episodes: 1364
139915: reward: 200.00, mean_100: 179.99, episodes: 1365
140116: reward: 200.00, mean_100: 180.46, episodes: 1366
140317: reward: 200.00, mean_100: 181.23, episodes: 1367
140518: reward: 200.00, mean_100: 181.65, episodes: 1368
140719: reward: 200.00, mean_100: 182.33, episodes: 1369
140920: reward: 200.00, mean_10