# This algorithm converges in relatively few steps on simple environments such as CartPole, but not on Pong. The book presents this method only as an illustrative example, and the book's own run does not converge either: after about 20 million training steps the reward shows almost no change, and neither does the entropy, with the reward staying close to the lowest possible value of -21.

In [1]:
#!/usr/bin/env python3
import gym
import ptan
import numpy as np
import argparse
import collections
from tensorboardX import SummaryWriter

import torch
import torch.nn.functional as F
import torch.nn.utils as nn_utils
import torch.optim as optim

from lib import common

In [2]:
# Discount factor handed to the experience source as `gamma`.
GAMMA = 0.99
# Learning rate of the Adam optimizer.
LEARNING_RATE = 0.0001
# Weight of the entropy term in the total loss.
ENTROPY_BETA = 0.01
# Number of transitions accumulated before each optimization step.
BATCH_SIZE = 128

# `steps_count` handed to the experience source.
REWARD_STEPS = 10
# Capacity of the moving-average buffer used to compute the reward baseline.
BASELINE_STEPS = 1000000
# Maximum gradient norm passed to clip_grad_norm_.
GRAD_L2_CLIP = 0.1

# Number of environments created and handed to the experience source.
ENV_COUNT = 32

# Default value of the --name argument (used in the SummaryWriter comment).
DEFAULT_RUN = "target"

In [3]:
def make_env():
    """Create a "PongNoFrameskip-v4" gym environment wrapped by ptan's wrap_dqn."""
    pong_env = gym.make("PongNoFrameskip-v4")
    return ptan.common.wrappers.wrap_dqn(pong_env)

# To compute the moving average quickly, we use a deque-backed buffer that maintains a running sum.
class MeanBuffer:
    """Sliding window of the most recent values with an incrementally kept sum.

    The window holds at most ``capacity`` values; the sum is adjusted on every
    insertion, so the mean never requires re-summing the whole window.
    """

    def __init__(self, capacity):
        # Maximum number of values kept in the window.
        self.capacity = capacity
        # The deque discards its oldest element by itself once maxlen is hit.
        self.deque = collections.deque(maxlen=capacity)
        # Running sum of the values currently stored in the window.
        self.sum = 0.0

    def add(self, val):
        """Append ``val``, dropping the oldest value if the window is full."""
        window = self.deque
        window_is_full = len(window) == self.capacity
        if window_is_full:
            # The append below evicts window[0], so take it out of the sum first.
            self.sum -= window[0]
        window.append(val)
        self.sum += val

    def mean(self):
        """Return the mean of the stored values, or 0.0 when nothing is stored."""
        count = len(self.deque)
        return self.sum / count if count else 0.0

In [4]:
if __name__ == "__main__":
    # Script entry point: trains a policy-gradient agent on Pong using several
    # environments, a moving-average reward baseline, an entropy bonus and
    # gradient clipping, and logs diagnostics to TensorBoard.
    parser = argparse.ArgumentParser()
    # NOTE: with default=True and action="store_true" this flag is always True.
    parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda")
    parser.add_argument("-n", '--name', default=DEFAULT_RUN, required=False, help="Name of the run")
    # parse_known_args ignores unrecognized arguments (presumably the ones a
    # notebook kernel adds to the command line).
    args, unknown = parser.parse_known_args()
    # Fall back to the CPU when CUDA is requested but not available, instead of
    # failing later when the network is moved to the device.
    use_cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Several environments are created and handed to the experience source,
    # which samples from them in turn; this yields less correlated, more
    # independent samples.
    envs = [make_env() for _ in range(ENV_COUNT)]
    writer = SummaryWriter(comment="-pong-pg-" + args.name)

    # try/finally guarantees the writer is closed even when training is
    # interrupted (e.g. by KeyboardInterrupt) or fails with an exception.
    try:
        net = common.AtariPGN(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
        print(net)

        agent = ptan.agent.PolicyAgent(net, apply_softmax=True, device=device)
        exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

        # Moving average of recent rewards, used as the baseline.
        baseline_buf = MeanBuffer(BASELINE_STEPS)

        batch_states, batch_actions, batch_scales = [], [], []

        with common.RewardTracker(writer, stop_reward=18) as tracker:
            for step_idx, exp in enumerate(exp_source):
                baseline_buf.add(exp.reward)
                baseline = baseline_buf.mean()
                # np.asarray avoids a copy when possible; np.array(copy=False)
                # raises under NumPy 2.x whenever a copy is required.
                batch_states.append(np.asarray(exp.state))
                batch_actions.append(int(exp.action))
                # The policy gradient is scaled by reward minus baseline.
                batch_scales.append(exp.reward - baseline)

                # Report every episode reward popped from the source (not only
                # the first one); stop as soon as the tracker signals success.
                solved = False
                for episode_reward in exp_source.pop_total_rewards():
                    if tracker.reward(episode_reward, step_idx):
                        solved = True
                        break
                if solved:
                    break

                # Keep collecting until a full batch is available.
                if len(batch_states) < BATCH_SIZE:
                    continue

                # Stack into one ndarray first: building a tensor from a list
                # of ndarrays is very slow.
                states_v = torch.FloatTensor(np.array(batch_states)).to(device)
                batch_actions_t = torch.LongTensor(batch_actions).to(device)

                scale_std = np.std(batch_scales)
                batch_scale_v = torch.FloatTensor(batch_scales).to(device)

                optimizer.zero_grad()
                logits_v = net(states_v)
                log_prob_v = F.log_softmax(logits_v, dim=1)
                # Log-probability of each taken action, weighted by its scale.
                log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE), batch_actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
                # Negative sign: minimizing the loss maximizes the entropy.
                entropy_loss_v = -ENTROPY_BETA * entropy_v
                loss_v = loss_policy_v + entropy_loss_v
                loss_v.backward()
                # Gradient clipping by norm via torch.nn.utils.clip_grad_norm_.
                nn_utils.clip_grad_norm_(net.parameters(), GRAD_L2_CLIP)
                optimizer.step()

                # KL divergence between the policy before and after the update.
                # It is a diagnostic only, so no autograd graph is built, and
                # it is computed from log-probabilities so that a probability
                # underflowing to zero cannot cause a division by zero.
                with torch.no_grad():
                    new_log_prob_v = F.log_softmax(net(states_v), dim=1)
                    kl_div_v = (prob_v * (log_prob_v - new_log_prob_v)).sum(dim=1).mean()
                writer.add_scalar("kl", kl_div_v.item(), step_idx)

                # Gradient statistics over all parameters: the largest absolute
                # value and the average per-parameter RMS.
                grad_max = 0.0
                grad_means = 0.0
                grad_count = 0
                for p in net.parameters():
                    grad_max = max(grad_max, p.grad.abs().max().item())
                    grad_means += (p.grad ** 2).mean().sqrt().item()
                    grad_count += 1

                writer.add_scalar("baseline", baseline, step_idx)
                writer.add_scalar("entropy", entropy_v.item(), step_idx)
                writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
                writer.add_scalar("batch_scales_std", scale_std, step_idx)
                writer.add_scalar("loss_entropy", entropy_loss_v.item(), step_idx)
                writer.add_scalar("loss_policy", loss_policy_v.item(), step_idx)
                writer.add_scalar("loss_total", loss_v.item(), step_idx)
                writer.add_scalar("grad_l2", grad_means / grad_count, step_idx)
                writer.add_scalar("grad_max", grad_max, step_idx)

                # Start collecting the next batch.
                batch_states.clear()
                batch_actions.clear()
                batch_scales.clear()
    finally:
        writer.close()

AtariPGN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)
24428: done 1 games, mean reward -21.000, speed 119.33 f/s
25834: done 2 games, mean reward -21.000, speed 126.93 f/s
25895: done 3 games, mean reward -21.000, speed 85.02 f/s
25980: done 4 games, mean reward -21.000, speed 194.14 f/s
26364: done 5 games, mean reward -21.000, speed 120.91 f/s
26381: done 6 games, mean reward -20.833, speed 28.13 f/s
26755: done 7 games, mean reward -20.857, speed 122.59 f/s
26805: done 8 games, mean reward -20.875, speed 275.46 f/s
26890: done 9 games, mean reward -20.778, speed 89.90 f/s
27120: done 10 games, mean reward -20.700, spee

134570: done 130 games, mean reward -20.160, speed 122.13 f/s
134674: done 131 games, mean reward -20.180, speed 111.65 f/s
138829: done 132 games, mean reward -20.190, speed 121.16 f/s
139108: done 133 games, mean reward -20.190, speed 119.35 f/s
139145: done 134 games, mean reward -20.190, speed 53.83 f/s
140069: done 135 games, mean reward -20.200, speed 116.27 f/s
140509: done 136 games, mean reward -20.200, speed 123.59 f/s
141263: done 137 games, mean reward -20.210, speed 118.81 f/s
141310: done 138 games, mean reward -20.220, speed 184.08 f/s
141944: done 139 games, mean reward -20.230, speed 93.50 f/s
142365: done 140 games, mean reward -20.240, speed 94.12 f/s
143001: done 141 games, mean reward -20.240, speed 94.73 f/s
143498: done 142 games, mean reward -20.240, speed 108.45 f/s
146228: done 143 games, mean reward -20.250, speed 116.26 f/s
146261: done 144 games, mean reward -20.260, speed 192.37 f/s
148435: done 145 games, mean reward -20.250, speed 112.04 f/s
148537: done

256202: done 263 games, mean reward -20.270, speed 256.61 f/s
258859: done 264 games, mean reward -20.280, speed 120.95 f/s
260165: done 265 games, mean reward -20.300, speed 120.44 f/s
260331: done 266 games, mean reward -20.310, speed 132.07 f/s
261973: done 267 games, mean reward -20.320, speed 122.33 f/s
262227: done 268 games, mean reward -20.330, speed 123.37 f/s
262774: done 269 games, mean reward -20.320, speed 129.16 f/s
264112: done 270 games, mean reward -20.320, speed 122.38 f/s
264535: done 271 games, mean reward -20.300, speed 127.48 f/s
264911: done 272 games, mean reward -20.300, speed 120.30 f/s
265414: done 273 games, mean reward -20.290, speed 104.83 f/s
266462: done 274 games, mean reward -20.290, speed 114.56 f/s
267128: done 275 games, mean reward -20.300, speed 114.31 f/s
267426: done 276 games, mean reward -20.320, speed 102.54 f/s
267576: done 277 games, mean reward -20.310, speed 112.66 f/s
268086: done 278 games, mean reward -20.310, speed 113.18 f/s
268433: 

381056: done 396 games, mean reward -20.220, speed 113.07 f/s
382613: done 397 games, mean reward -20.210, speed 125.25 f/s
384077: done 398 games, mean reward -20.200, speed 119.22 f/s
384184: done 399 games, mean reward -20.200, speed 105.18 f/s
385338: done 400 games, mean reward -20.190, speed 120.09 f/s
386039: done 401 games, mean reward -20.170, speed 128.28 f/s
386150: done 402 games, mean reward -20.150, speed 105.00 f/s
386433: done 403 games, mean reward -20.150, speed 100.91 f/s
386773: done 404 games, mean reward -20.150, speed 137.47 f/s
387336: done 405 games, mean reward -20.130, speed 113.67 f/s
387805: done 406 games, mean reward -20.140, speed 128.00 f/s
389452: done 407 games, mean reward -20.150, speed 122.62 f/s
389572: done 408 games, mean reward -20.150, speed 117.27 f/s
390500: done 409 games, mean reward -20.160, speed 122.88 f/s
390563: done 410 games, mean reward -20.170, speed 87.01 f/s
393673: done 411 games, mean reward -20.180, speed 125.45 f/s
395386: d

504825: done 529 games, mean reward -20.250, speed 123.60 f/s
505759: done 530 games, mean reward -20.250, speed 115.29 f/s
507869: done 531 games, mean reward -20.250, speed 120.18 f/s
508534: done 532 games, mean reward -20.230, speed 125.66 f/s
508610: done 533 games, mean reward -20.210, speed 96.09 f/s
509391: done 534 games, mean reward -20.220, speed 121.50 f/s
510741: done 535 games, mean reward -20.210, speed 119.13 f/s
512230: done 536 games, mean reward -20.200, speed 127.58 f/s
512829: done 537 games, mean reward -20.210, speed 119.26 f/s
513341: done 538 games, mean reward -20.220, speed 123.79 f/s
514010: done 539 games, mean reward -20.190, speed 128.14 f/s
514033: done 540 games, mean reward -20.190, speed 217.56 f/s
514741: done 541 games, mean reward -20.200, speed 112.46 f/s
514905: done 542 games, mean reward -20.190, speed 144.24 f/s
518108: done 543 games, mean reward -20.190, speed 120.49 f/s
518458: done 544 games, mean reward -20.220, speed 117.29 f/s
519507: d

632083: done 662 games, mean reward -20.230, speed 113.97 f/s
632901: done 663 games, mean reward -20.230, speed 91.45 f/s
633341: done 664 games, mean reward -20.220, speed 128.72 f/s
634136: done 665 games, mean reward -20.210, speed 106.11 f/s
634668: done 666 games, mean reward -20.210, speed 106.24 f/s
635248: done 667 games, mean reward -20.220, speed 122.45 f/s
635303: done 668 games, mean reward -20.210, speed 77.99 f/s
635574: done 669 games, mean reward -20.220, speed 108.34 f/s
636335: done 670 games, mean reward -20.220, speed 115.90 f/s
636634: done 671 games, mean reward -20.230, speed 129.90 f/s
637220: done 672 games, mean reward -20.220, speed 115.85 f/s
638338: done 673 games, mean reward -20.220, speed 121.21 f/s
638448: done 674 games, mean reward -20.210, speed 232.70 f/s
640134: done 675 games, mean reward -20.190, speed 119.56 f/s
642976: done 676 games, mean reward -20.200, speed 122.48 f/s
643536: done 677 games, mean reward -20.220, speed 123.32 f/s
644422: do

753016: done 795 games, mean reward -20.300, speed 130.85 f/s
753742: done 796 games, mean reward -20.290, speed 120.50 f/s
755045: done 797 games, mean reward -20.290, speed 123.92 f/s
755587: done 798 games, mean reward -20.280, speed 112.21 f/s
756481: done 799 games, mean reward -20.270, speed 122.44 f/s
759025: done 800 games, mean reward -20.280, speed 125.97 f/s
760221: done 801 games, mean reward -20.280, speed 116.24 f/s
761778: done 802 games, mean reward -20.300, speed 120.76 f/s
762066: done 803 games, mean reward -20.320, speed 130.61 f/s
762106: done 804 games, mean reward -20.320, speed 193.75 f/s
764497: done 805 games, mean reward -20.330, speed 124.54 f/s
764999: done 806 games, mean reward -20.340, speed 113.70 f/s
765505: done 807 games, mean reward -20.340, speed 124.63 f/s
766646: done 808 games, mean reward -20.330, speed 103.73 f/s


KeyboardInterrupt: 