In [1]:
#!/usr/bin/env python3
import gym
import ptan
import numpy as np
import argparse
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim

from lib import common

In [2]:
# Discount factor: passed to the experience source and used when bootstrapping
# the value of the last state.
GAMMA = 0.99
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Entropy beta: the scale factor applied to the entropy bonus in the loss.
ENTROPY_BETA = 0.01
# Number of experience items accumulated before each optimization step.
BATCH_SIZE = 128
# Number of environments created to collect experience.
NUM_ENVS = 50

# Number of steps over which the experience source accumulates the discounted
# reward for each action. The value of the state reached afterwards is taken
# from the network's value head, so a small number of steps is sufficient.
REWARD_STEPS = 4
# Maximum gradient norm used for gradient clipping; clipping keeps a single
# oversized gradient from pushing the policy too far in one optimization step.
CLIP_GRAD = 0.1

In [3]:
class AtariA2C(nn.Module):
    """Actor-critic network with a shared convolutional trunk and two heads.

    ``policy`` produces one unnormalised score (logit) per action and
    ``value`` produces a single scalar per input, used as the approximation
    of the state value.
    """

    def __init__(self, input_shape, n_actions):
        """Create the convolutional trunk and both fully connected heads.

        :param input_shape: observation shape as (channels, height, width)
        :param n_actions: number of outputs of the policy head
        """
        super().__init__()

        in_channels = input_shape[0]
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # Both heads consume the flattened trunk output, whose width depends
        # on the input resolution and is therefore measured, not hard-coded.
        flat_features = self._get_conv_out(input_shape)
        self.policy = self._make_head(flat_features, n_actions)
        self.value = self._make_head(flat_features, 1)

    @staticmethod
    def _make_head(in_features, out_features):
        """Return a two-layer perceptron with a 512-unit hidden layer."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Linear(512, out_features),
        )

    def _get_conv_out(self, shape):
        """Return the number of features the trunk emits for one input of ``shape``."""
        probe = torch.zeros(1, *shape)
        return self.conv(probe).numel()

    def forward(self, x):
        """Return ``(policy_logits, state_values)`` for a batch of observations.

        The input is cast to float and divided by 256 before the convolutions.
        """
        scaled = x.float() / 256
        batch_size = scaled.size()[0]
        features = self.conv(scaled).view(batch_size, -1)
        policy_logits = self.policy(features)
        state_values = self.value(features)
        return policy_logits, state_values

In [4]:
def unpack_batch(batch, net, device='cpu', gamma=None, reward_steps=None):
    """Convert a batch of experience items into training tensors.

    Three tensors are produced: the batch of states, the batch of actions and
    the batch of reference values. The reference values are the regression
    target for the value head (MSE loss) and are also used by the caller to
    compute the action advantage.

    Every item of ``batch`` must expose ``state``, ``action``, ``reward`` and
    ``last_state``. ``reward`` is taken as-is; when ``last_state`` is not
    ``None`` (the episode did not end), the value the network predicts for
    ``last_state``, scaled by ``gamma ** reward_steps``, is added to it.

    :param batch: iterable of experience items
    :param net: callable returning a pair whose second element holds the
        predicted values with shape (batch, 1)
    :param device: device the returned tensors (and the bootstrap forward
        pass) are placed on
    :param gamma: discount factor; ``None`` selects the module-level ``GAMMA``
    :param reward_steps: number of steps already folded into ``reward``;
        ``None`` selects the module-level ``REWARD_STEPS``
    :return: states tensor (float), actions tensor (long), reference values
        tensor (float)
    """
    if gamma is None:
        gamma = GAMMA
    if reward_steps is None:
        reward_steps = REWARD_STEPS

    states = []
    actions = []
    rewards = []

    # Positions (within the batch) of items whose episode has not finished,
    # together with the state each of them ended up in.
    not_done_idx = []
    last_states = []
    for idx, exp in enumerate(batch):
        # np.asarray avoids a copy when possible; np.array(..., copy=False)
        # raises ValueError on NumPy >= 2.0 whenever a copy is unavoidable.
        states.append(np.asarray(exp.state))
        actions.append(int(exp.action))
        rewards.append(exp.reward)
        if exp.last_state is not None:
            not_done_idx.append(idx)
            last_states.append(np.asarray(exp.last_state))

    # Stack into a single ndarray first: building a tensor from a Python list
    # of arrays is much slower than building it from one array.
    states_v = torch.FloatTensor(np.asarray(states)).to(device)
    actions_t = torch.LongTensor(actions).to(device)

    # Add the discounted value of the last state to the accumulated reward.
    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        last_states_v = torch.FloatTensor(np.asarray(last_states)).to(device)
        # The predicted values are used as constants, so no autograd graph is
        # needed for this forward pass.
        with torch.no_grad():
            last_vals_v = net(last_states_v)[1]
        last_vals_np = last_vals_v.detach().cpu().numpy()[:, 0]
        rewards_np[not_done_idx] += gamma ** reward_steps * last_vals_np

    ref_vals_v = torch.FloatTensor(rewards_np).to(device)
    return states_v, actions_t, ref_vals_v

In [5]:
if __name__ == "__main__":
    # Train an A2C agent on Pong. Experience is collected from NUM_ENVS
    # environments instead of a single one.
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda")
    parser.add_argument("-n", "--name", default="t2", required=False, help="Name of the run")
    # Unrecognised arguments are collected in `unknown` instead of aborting.
    args, unknown = parser.parse_known_args()
    # --cuda defaults to True and therefore cannot be switched off from the
    # command line; fall back to the CPU when no CUDA device is available
    # instead of failing on the first .to(device).
    use_cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    make_env = lambda: ptan.common.wrappers.wrap_dqn(gym.make("PongNoFrameskip-v4"))
    envs = [make_env() for _ in range(NUM_ENVS)]
    try:
        writer = SummaryWriter(comment="-pong-a2c_" + args.name)

        net = AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
        print(net)

        # The agent only needs the policy head; softmax is applied to its logits.
        agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], apply_softmax=True, device=device)
        exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

        # Adam's epsilon is usually a very small number (1e-8, 1e-10) that only
        # guards against division by zero. Author's observation: with such a
        # small value the gradient becomes too big and training does not
        # converge, hence the much larger 1e-3.
        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

        batch = []

        # Two helpers wrap the loop. common.RewardTracker receives each finished
        # episode's reward and returns a truthy value once training should stop
        # (per the original notes: when the mean reward of the last 100 episodes
        # exceeds stop_reward). ptan's TBMeanTracker writes a smoothed value to
        # TensorBoard (per the original notes: the mean of every 10 tracked
        # values) instead of every single step.
        with common.RewardTracker(writer, stop_reward=18) as tracker:
            with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
                for step_idx, exp in enumerate(exp_source):
                    batch.append(exp)

                    # Handle newly finished episodes.
                    # NOTE(review): only the first popped reward is reported; if
                    # several episodes can finish between two iterations the
                    # others are dropped - confirm against ptan's experience source.
                    new_rewards = exp_source.pop_total_rewards()
                    if new_rewards:
                        if tracker.reward(new_rewards[0], step_idx):
                            break

                    if len(batch) < BATCH_SIZE:
                        continue

                    # Unpack the batch into tensors, then run the network to get
                    # the policy logits and the value estimates.
                    states_v, actions_t, vals_ref_v = unpack_batch(batch, net, device=device)
                    batch.clear()

                    optimizer.zero_grad()
                    logits_v, value_v = net(states_v)

                    # Value loss: MSE between the predicted value and the
                    # REWARD_STEPS-step Bellman approximation.
                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    # Policy loss: the advantage A(s, a) = Q(s, a) - V(s) scales the
                    # log-probability of the action actually taken. V(s) is
                    # detached so the policy gradient does not flow into the value
                    # head. The mean is negated because the objective is to be
                    # maximised while the optimizer minimises.
                    log_prob_v = F.log_softmax(logits_v, dim=1)
                    adv_v = vals_ref_v - value_v.squeeze(-1).detach()
                    # Index rows by the actual batch length rather than the global
                    # BATCH_SIZE so the two cannot drift apart.
                    row_idx = range(len(actions_t))
                    log_prob_actions_v = adv_v * log_prob_v[row_idx, actions_t]
                    loss_policy_v = -log_prob_actions_v.mean()

                    # Entropy loss: beta * sum(p * log p) is the negated, scaled
                    # entropy, so minimising it keeps the policy from collapsing.
                    prob_v = F.softmax(logits_v, dim=1)
                    entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

                    # Back-propagate the policy loss on its own first so that the
                    # gradients captured for monitoring (max, variance, L2) come
                    # from the policy loss alone. The graph is retained because
                    # the remaining losses share it.
                    loss_policy_v.backward(retain_graph=True)
                    grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
                                            for p in net.parameters()
                                            if p.grad is not None])

                    # Back-propagate the entropy and value losses (accumulating on
                    # top of the policy gradients), clip, and update the network.
                    loss_v = entropy_loss_v + loss_value_v
                    loss_v.backward()
                    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                    optimizer.step()
                    # Total loss is only logged; a new tensor avoids modifying a
                    # graph tensor in place.
                    loss_total_v = loss_v + loss_policy_v

                    tb_tracker.track("advantage", adv_v.cpu().numpy(), step_idx)
                    tb_tracker.track("values", value_v.cpu().detach().numpy(), step_idx)
                    tb_tracker.track("batch_rewards", vals_ref_v.cpu().numpy(), step_idx)
                    tb_tracker.track("loss_entropy",entropy_loss_v.cpu().detach().numpy(), step_idx)
                    tb_tracker.track("loss_policy",loss_policy_v.cpu().detach().numpy(), step_idx)
                    tb_tracker.track("loss_value", loss_value_v.cpu().detach().numpy(), step_idx)
                    tb_tracker.track("loss_total", loss_total_v.cpu().detach().numpy(), step_idx)
                    tb_tracker.track("grad_l2", np.sqrt(np.mean(np.square(grads))), step_idx)
                    tb_tracker.track("grad_max", np.max(np.abs(grads)), step_idx)
                    tb_tracker.track("grad_var", np.var(grads), step_idx)
    finally:
        # Release the emulator instances whether training finished or failed.
        for env in envs:
            env.close()

AtariA2C(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (policy): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
  (value): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=1, bias=True)
  )
)
37660: done 1 games, mean reward -21.000, speed 107.00 f/s
37868: done 2 games, mean reward -21.000, speed 123.54 f/s
39037: done 3 games, mean reward -21.000, speed 103.49 f/s
39469: done 4 games, mean reward -21.000, speed 93.07 f/s
39903: done 5 games, mean reward -21.000, speed 112.26 f/s
40051: done 6 games, mean reward -21.000, speed 114.75 f/s
40607: done 7 games, mean reward -21.000, speed 98.6

136791: done 128 games, mean reward -20.320, speed 111.55 f/s
137401: done 129 games, mean reward -20.310, speed 109.32 f/s
137679: done 130 games, mean reward -20.320, speed 116.17 f/s
138328: done 131 games, mean reward -20.310, speed 110.71 f/s
139169: done 132 games, mean reward -20.300, speed 105.07 f/s
139401: done 133 games, mean reward -20.300, speed 101.79 f/s
140436: done 134 games, mean reward -20.310, speed 110.15 f/s
140534: done 135 games, mean reward -20.310, speed 224.44 f/s
140657: done 136 games, mean reward -20.290, speed 110.75 f/s
142376: done 137 games, mean reward -20.290, speed 106.70 f/s
143544: done 138 games, mean reward -20.300, speed 108.71 f/s
144256: done 139 games, mean reward -20.320, speed 106.20 f/s
144318: done 140 games, mean reward -20.320, speed 215.28 f/s
144339: done 141 games, mean reward -20.320, speed 192.60 f/s
145256: done 142 games, mean reward -20.340, speed 112.02 f/s
145753: done 143 games, mean reward -20.350, speed 94.76 f/s
147029: d

266744: done 261 games, mean reward -20.150, speed 116.30 f/s
266761: done 262 games, mean reward -20.170, speed 24.53 f/s
266876: done 263 games, mean reward -20.180, speed 218.72 f/s
267116: done 264 games, mean reward -20.180, speed 105.48 f/s
267351: done 265 games, mean reward -20.180, speed 103.66 f/s
267803: done 266 games, mean reward -20.180, speed 96.50 f/s
268533: done 267 games, mean reward -20.180, speed 107.52 f/s
271369: done 268 games, mean reward -20.150, speed 106.48 f/s
271569: done 269 games, mean reward -20.150, speed 138.54 f/s
271744: done 270 games, mean reward -20.140, speed 85.26 f/s
272317: done 271 games, mean reward -20.140, speed 113.72 f/s
272353: done 272 games, mean reward -20.130, speed 177.33 f/s
272795: done 273 games, mean reward -20.150, speed 100.82 f/s
273221: done 274 games, mean reward -20.160, speed 115.94 f/s
275046: done 275 games, mean reward -20.140, speed 110.37 f/s
276067: done 276 games, mean reward -20.130, speed 112.93 f/s
276390: don

386883: done 394 games, mean reward -20.180, speed 111.54 f/s
387457: done 395 games, mean reward -20.180, speed 105.34 f/s
390253: done 396 games, mean reward -20.170, speed 112.36 f/s
391334: done 397 games, mean reward -20.180, speed 107.68 f/s
392032: done 398 games, mean reward -20.180, speed 115.27 f/s
392229: done 399 games, mean reward -20.180, speed 84.77 f/s
395761: done 400 games, mean reward -20.180, speed 110.46 f/s
396100: done 401 games, mean reward -20.210, speed 103.39 f/s
398383: done 402 games, mean reward -20.200, speed 108.68 f/s
399062: done 403 games, mean reward -20.180, speed 112.78 f/s
399237: done 404 games, mean reward -20.190, speed 85.70 f/s
400295: done 405 games, mean reward -20.180, speed 108.04 f/s
401289: done 406 games, mean reward -20.170, speed 109.03 f/s
402211: done 407 games, mean reward -20.170, speed 111.24 f/s
404266: done 408 games, mean reward -20.180, speed 109.02 f/s
404777: done 409 games, mean reward -20.180, speed 105.62 f/s
405280: do

509096: done 527 games, mean reward -20.380, speed 96.11 f/s
510313: done 528 games, mean reward -20.380, speed 103.13 f/s
510719: done 529 games, mean reward -20.370, speed 112.72 f/s
511235: done 530 games, mean reward -20.390, speed 96.30 f/s
511606: done 531 games, mean reward -20.380, speed 132.67 f/s
512988: done 532 games, mean reward -20.380, speed 105.34 f/s
513371: done 533 games, mean reward -20.380, speed 103.04 f/s
515027: done 534 games, mean reward -20.350, speed 109.49 f/s
517935: done 535 games, mean reward -20.360, speed 104.59 f/s
518178: done 536 games, mean reward -20.350, speed 105.37 f/s
518715: done 537 games, mean reward -20.340, speed 108.07 f/s
519202: done 538 games, mean reward -20.340, speed 102.40 f/s
522923: done 539 games, mean reward -20.340, speed 110.53 f/s
523331: done 540 games, mean reward -20.320, speed 108.31 f/s
524212: done 541 games, mean reward -20.320, speed 107.91 f/s
525202: done 542 games, mean reward -20.320, speed 105.39 f/s
525982: do

635664: done 660 games, mean reward -20.190, speed 109.80 f/s
635677: done 661 games, mean reward -20.200, speed 125.33 f/s
635892: done 662 games, mean reward -20.210, speed 141.99 f/s
637054: done 663 games, mean reward -20.230, speed 110.38 f/s
637069: done 664 games, mean reward -20.230, speed 20.57 f/s
637274: done 665 games, mean reward -20.230, speed 133.11 f/s
638763: done 666 games, mean reward -20.210, speed 106.76 f/s
639505: done 667 games, mean reward -20.210, speed 104.17 f/s
639954: done 668 games, mean reward -20.210, speed 119.47 f/s
640265: done 669 games, mean reward -20.220, speed 96.22 f/s
640466: done 670 games, mean reward -20.210, speed 136.73 f/s
640727: done 671 games, mean reward -20.210, speed 102.74 f/s
642651: done 672 games, mean reward -20.210, speed 109.29 f/s
643443: done 673 games, mean reward -20.210, speed 111.62 f/s
644022: done 674 games, mean reward -20.210, speed 102.83 f/s
647769: done 675 games, mean reward -20.230, speed 108.34 f/s
648151: do

759283: done 793 games, mean reward -20.230, speed 112.47 f/s
760706: done 794 games, mean reward -20.220, speed 104.22 f/s
761358: done 795 games, mean reward -20.240, speed 104.54 f/s
761548: done 796 games, mean reward -20.230, speed 120.07 f/s
762670: done 797 games, mean reward -20.230, speed 102.73 f/s
762885: done 798 games, mean reward -20.220, speed 98.39 f/s
764463: done 799 games, mean reward -20.240, speed 107.13 f/s
765202: done 800 games, mean reward -20.240, speed 104.36 f/s
765234: done 801 games, mean reward -20.240, speed 203.82 f/s
766362: done 802 games, mean reward -20.230, speed 110.28 f/s
766922: done 803 games, mean reward -20.250, speed 113.89 f/s
769080: done 804 games, mean reward -20.250, speed 109.88 f/s
772003: done 805 games, mean reward -20.250, speed 108.24 f/s
772403: done 806 games, mean reward -20.260, speed 106.04 f/s
773381: done 807 games, mean reward -20.260, speed 106.46 f/s
773688: done 808 games, mean reward -20.260, speed 120.96 f/s
773707: d

885888: done 926 games, mean reward -20.180, speed 107.32 f/s
885916: done 927 games, mean reward -20.170, speed 231.20 f/s
887149: done 928 games, mean reward -20.170, speed 110.68 f/s
887796: done 929 games, mean reward -20.170, speed 110.02 f/s
888341: done 930 games, mean reward -20.160, speed 98.45 f/s
889046: done 931 games, mean reward -20.150, speed 115.21 f/s
889615: done 932 games, mean reward -20.140, speed 102.48 f/s
890683: done 933 games, mean reward -20.150, speed 104.74 f/s
890944: done 934 games, mean reward -20.150, speed 105.69 f/s
891111: done 935 games, mean reward -20.140, speed 110.94 f/s
892931: done 936 games, mean reward -20.150, speed 105.48 f/s
895769: done 937 games, mean reward -20.150, speed 109.13 f/s
897028: done 938 games, mean reward -20.150, speed 109.70 f/s
897096: done 939 games, mean reward -20.160, speed 194.00 f/s
897224: done 940 games, mean reward -20.170, speed 102.43 f/s
899407: done 941 games, mean reward -20.170, speed 107.56 f/s
899566: d

1008298: done 1058 games, mean reward -20.200, speed 89.50 f/s
1009086: done 1059 games, mean reward -20.200, speed 110.53 f/s
1009536: done 1060 games, mean reward -20.210, speed 101.58 f/s
1010855: done 1061 games, mean reward -20.230, speed 111.54 f/s
1011490: done 1062 games, mean reward -20.240, speed 107.92 f/s
1013197: done 1063 games, mean reward -20.230, speed 112.48 f/s
1015600: done 1064 games, mean reward -20.230, speed 108.41 f/s
1018470: done 1065 games, mean reward -20.240, speed 111.26 f/s
1019524: done 1066 games, mean reward -20.280, speed 101.63 f/s
1020008: done 1067 games, mean reward -20.310, speed 123.05 f/s
1020716: done 1068 games, mean reward -20.310, speed 105.03 f/s
1021511: done 1069 games, mean reward -20.320, speed 107.85 f/s
1021557: done 1070 games, mean reward -20.320, speed 171.44 f/s
1022181: done 1071 games, mean reward -20.320, speed 103.12 f/s
1023825: done 1072 games, mean reward -20.320, speed 110.24 f/s
1023847: done 1073 games, mean reward -20

1127703: done 1187 games, mean reward -20.190, speed 119.10 f/s
1129167: done 1188 games, mean reward -20.170, speed 121.13 f/s
1129235: done 1189 games, mean reward -20.190, speed 75.34 f/s
1129435: done 1190 games, mean reward -20.200, speed 136.90 f/s
1130431: done 1191 games, mean reward -20.230, speed 111.09 f/s
1131601: done 1192 games, mean reward -20.220, speed 116.14 f/s
1132828: done 1193 games, mean reward -20.230, speed 109.57 f/s
1133893: done 1194 games, mean reward -20.240, speed 114.36 f/s
1135351: done 1195 games, mean reward -20.240, speed 114.75 f/s
1135455: done 1196 games, mean reward -20.240, speed 104.34 f/s
1139443: done 1197 games, mean reward -20.270, speed 112.72 f/s
1139452: done 1198 games, mean reward -20.270, speed 130.32 f/s
1140361: done 1199 games, mean reward -20.270, speed 102.31 f/s
1140717: done 1200 games, mean reward -20.250, speed 130.56 f/s
1142682: done 1201 games, mean reward -20.250, speed 110.65 f/s
1144670: done 1202 games, mean reward -20

1251299: done 1316 games, mean reward -20.120, speed 117.11 f/s
1255960: done 1317 games, mean reward -20.120, speed 110.94 f/s
1257939: done 1318 games, mean reward -20.120, speed 107.28 f/s
1258270: done 1319 games, mean reward -20.140, speed 94.89 f/s
1258410: done 1320 games, mean reward -20.140, speed 112.72 f/s
1258799: done 1321 games, mean reward -20.130, speed 115.40 f/s
1259454: done 1322 games, mean reward -20.140, speed 111.06 f/s
1259562: done 1323 games, mean reward -20.150, speed 98.80 f/s
1261668: done 1324 games, mean reward -20.150, speed 111.80 f/s
1262791: done 1325 games, mean reward -20.150, speed 106.34 f/s
1263012: done 1326 games, mean reward -20.160, speed 99.64 f/s
1263722: done 1327 games, mean reward -20.190, speed 115.65 f/s
1263833: done 1328 games, mean reward -20.170, speed 104.22 f/s
1264492: done 1329 games, mean reward -20.170, speed 110.85 f/s
1265205: done 1330 games, mean reward -20.170, speed 104.84 f/s
1265766: done 1331 games, mean reward -20.1

1372301: done 1445 games, mean reward -20.200, speed 105.90 f/s
1373148: done 1446 games, mean reward -20.210, speed 107.66 f/s
1373638: done 1447 games, mean reward -20.190, speed 106.46 f/s
1373815: done 1448 games, mean reward -20.180, speed 125.35 f/s
1374303: done 1449 games, mean reward -20.180, speed 100.02 f/s
1375099: done 1450 games, mean reward -20.180, speed 102.43 f/s
1376090: done 1451 games, mean reward -20.170, speed 99.66 f/s
1377983: done 1452 games, mean reward -20.170, speed 107.35 f/s
1382421: done 1453 games, mean reward -20.170, speed 109.19 f/s
1384668: done 1454 games, mean reward -20.170, speed 105.87 f/s
1384727: done 1455 games, mean reward -20.160, speed 66.03 f/s
1386428: done 1456 games, mean reward -20.160, speed 107.59 f/s
1388577: done 1457 games, mean reward -20.170, speed 106.59 f/s
1388986: done 1458 games, mean reward -20.180, speed 113.64 f/s
1391355: done 1459 games, mean reward -20.200, speed 111.02 f/s
1391644: done 1460 games, mean reward -20.

1499607: done 1574 games, mean reward -20.030, speed 106.22 f/s
1500282: done 1575 games, mean reward -20.040, speed 110.34 f/s
1503658: done 1576 games, mean reward -20.040, speed 104.67 f/s
1504236: done 1577 games, mean reward -20.040, speed 113.73 f/s
1505945: done 1578 games, mean reward -20.040, speed 103.51 f/s
1507119: done 1579 games, mean reward -20.020, speed 107.68 f/s
1508716: done 1580 games, mean reward -20.010, speed 111.25 f/s
1509139: done 1581 games, mean reward -19.990, speed 96.69 f/s
1509224: done 1582 games, mean reward -20.000, speed 179.80 f/s
1509743: done 1583 games, mean reward -20.010, speed 106.17 f/s
1510138: done 1584 games, mean reward -20.010, speed 105.23 f/s
1510806: done 1585 games, mean reward -20.020, speed 96.56 f/s
1513015: done 1586 games, mean reward -20.030, speed 98.00 f/s
1513284: done 1587 games, mean reward -20.030, speed 104.57 f/s
1514760: done 1588 games, mean reward -20.040, speed 93.43 f/s
1514987: done 1589 games, mean reward -20.04

1625631: done 1703 games, mean reward -20.080, speed 106.60 f/s
1626000: done 1704 games, mean reward -20.090, speed 98.35 f/s
1626310: done 1705 games, mean reward -20.100, speed 120.19 f/s
1627679: done 1706 games, mean reward -20.090, speed 100.42 f/s
1627804: done 1707 games, mean reward -20.110, speed 88.91 f/s
1629383: done 1708 games, mean reward -20.110, speed 104.37 f/s
1629496: done 1709 games, mean reward -20.110, speed 73.98 f/s
1630825: done 1710 games, mean reward -20.110, speed 97.53 f/s
1631235: done 1711 games, mean reward -20.110, speed 95.50 f/s
1632142: done 1712 games, mean reward -20.100, speed 105.82 f/s
1632727: done 1713 games, mean reward -20.110, speed 113.16 f/s
1632761: done 1714 games, mean reward -20.110, speed 236.72 f/s
1633616: done 1715 games, mean reward -20.160, speed 90.01 f/s
1634493: done 1716 games, mean reward -20.150, speed 101.17 f/s
1639461: done 1717 games, mean reward -20.150, speed 98.28 f/s
1640008: done 1718 games, mean reward -20.170, 

1751134: done 1832 games, mean reward -20.130, speed 212.44 f/s
1751150: done 1833 games, mean reward -20.140, speed 139.55 f/s
1751203: done 1834 games, mean reward -20.120, speed 64.44 f/s
1751861: done 1835 games, mean reward -20.150, speed 111.71 f/s
1752353: done 1836 games, mean reward -20.140, speed 110.52 f/s
1755003: done 1837 games, mean reward -20.140, speed 116.76 f/s
1755616: done 1838 games, mean reward -20.130, speed 114.48 f/s
1757386: done 1839 games, mean reward -20.140, speed 114.59 f/s
1758277: done 1840 games, mean reward -20.150, speed 113.34 f/s
1759624: done 1841 games, mean reward -20.140, speed 113.83 f/s
1759813: done 1842 games, mean reward -20.130, speed 134.69 f/s
1760650: done 1843 games, mean reward -20.120, speed 107.43 f/s
1762333: done 1844 games, mean reward -20.110, speed 114.99 f/s
1762831: done 1845 games, mean reward -20.110, speed 111.99 f/s
1764168: done 1846 games, mean reward -20.110, speed 115.34 f/s
1764746: done 1847 games, mean reward -20

1882071: done 1961 games, mean reward -19.800, speed 154.70 f/s
1882126: done 1962 games, mean reward -19.810, speed 63.52 f/s
1882190: done 1963 games, mean reward -19.780, speed 209.80 f/s
1883919: done 1964 games, mean reward -19.790, speed 108.60 f/s
1885900: done 1965 games, mean reward -19.790, speed 110.10 f/s
1886493: done 1966 games, mean reward -19.790, speed 104.84 f/s
1890093: done 1967 games, mean reward -19.780, speed 108.28 f/s
1892850: done 1968 games, mean reward -19.780, speed 108.54 f/s
1894329: done 1969 games, mean reward -19.780, speed 103.52 f/s
1895554: done 1970 games, mean reward -19.760, speed 102.26 f/s
1895965: done 1971 games, mean reward -19.790, speed 108.74 f/s
1899733: done 1972 games, mean reward -19.770, speed 107.54 f/s
1900502: done 1973 games, mean reward -19.780, speed 107.15 f/s
1901389: done 1974 games, mean reward -19.790, speed 105.39 f/s
1902096: done 1975 games, mean reward -19.770, speed 97.70 f/s
1902232: done 1976 games, mean reward -19.

2023026: done 2090 games, mean reward -19.610, speed 112.01 f/s
2024415: done 2091 games, mean reward -19.600, speed 106.84 f/s
2024857: done 2092 games, mean reward -19.560, speed 101.03 f/s
2025865: done 2093 games, mean reward -19.520, speed 108.41 f/s
2026725: done 2094 games, mean reward -19.520, speed 116.32 f/s
2027292: done 2095 games, mean reward -19.490, speed 102.01 f/s
2027776: done 2096 games, mean reward -19.520, speed 105.05 f/s
2028820: done 2097 games, mean reward -19.490, speed 110.65 f/s
2028920: done 2098 games, mean reward -19.490, speed 241.63 f/s
2031141: done 2099 games, mean reward -19.480, speed 109.01 f/s
2031352: done 2100 games, mean reward -19.490, speed 135.40 f/s
2031891: done 2101 games, mean reward -19.480, speed 99.47 f/s
2032635: done 2102 games, mean reward -19.470, speed 118.48 f/s
2035465: done 2103 games, mean reward -19.450, speed 107.38 f/s
2036400: done 2104 games, mean reward -19.420, speed 112.06 f/s
2037664: done 2105 games, mean reward -19

2166635: done 2219 games, mean reward -19.070, speed 106.00 f/s
2166710: done 2220 games, mean reward -19.060, speed 68.23 f/s
2167046: done 2221 games, mean reward -19.060, speed 89.45 f/s
2167175: done 2222 games, mean reward -19.040, speed 108.18 f/s
2168786: done 2223 games, mean reward -19.040, speed 109.51 f/s
2168844: done 2224 games, mean reward -19.040, speed 66.25 f/s
2172077: done 2225 games, mean reward -19.050, speed 110.05 f/s
2172321: done 2226 games, mean reward -19.050, speed 105.76 f/s
2175880: done 2227 games, mean reward -19.050, speed 107.67 f/s
2176055: done 2228 games, mean reward -19.030, speed 127.99 f/s
2177444: done 2229 games, mean reward -19.020, speed 109.83 f/s
2180639: done 2230 games, mean reward -19.030, speed 111.63 f/s
2180818: done 2231 games, mean reward -19.020, speed 117.12 f/s
2182816: done 2232 games, mean reward -19.010, speed 107.04 f/s
2183132: done 2233 games, mean reward -19.020, speed 122.79 f/s
2183716: done 2234 games, mean reward -19.0

2333942: done 2348 games, mean reward -18.580, speed 197.63 f/s
2335457: done 2349 games, mean reward -18.570, speed 103.81 f/s
2336607: done 2350 games, mean reward -18.570, speed 105.44 f/s
2338673: done 2351 games, mean reward -18.550, speed 105.39 f/s
2338716: done 2352 games, mean reward -18.570, speed 50.65 f/s
2340165: done 2353 games, mean reward -18.570, speed 111.07 f/s
2342277: done 2354 games, mean reward -18.590, speed 104.75 f/s
2342544: done 2355 games, mean reward -18.550, speed 98.54 f/s
2345637: done 2356 games, mean reward -18.550, speed 104.64 f/s
2347353: done 2357 games, mean reward -18.520, speed 104.40 f/s
2348356: done 2358 games, mean reward -18.500, speed 108.42 f/s
2350196: done 2359 games, mean reward -18.500, speed 108.42 f/s
2353225: done 2360 games, mean reward -18.470, speed 107.62 f/s
2355094: done 2361 games, mean reward -18.470, speed 104.95 f/s
2355413: done 2362 games, mean reward -18.470, speed 122.39 f/s
2356282: done 2363 games, mean reward -18.

2535703: done 2477 games, mean reward -17.970, speed 70.54 f/s
2536502: done 2478 games, mean reward -17.970, speed 75.10 f/s
2539588: done 2479 games, mean reward -17.940, speed 72.05 f/s
2539707: done 2480 games, mean reward -17.910, speed 69.63 f/s
2541013: done 2481 games, mean reward -17.910, speed 72.96 f/s
2541204: done 2482 games, mean reward -17.940, speed 59.60 f/s
2541425: done 2483 games, mean reward -17.920, speed 89.94 f/s
2544970: done 2484 games, mean reward -17.880, speed 72.16 f/s
2546602: done 2485 games, mean reward -17.900, speed 73.03 f/s
2547281: done 2486 games, mean reward -17.910, speed 74.54 f/s
2547866: done 2487 games, mean reward -17.880, speed 70.19 f/s
2549906: done 2488 games, mean reward -17.880, speed 74.56 f/s
2551266: done 2489 games, mean reward -17.880, speed 77.06 f/s
2554060: done 2490 games, mean reward -17.860, speed 71.70 f/s
2557191: done 2491 games, mean reward -17.880, speed 73.01 f/s
2562244: done 2492 games, mean reward -17.850, speed 74

2775212: done 2607 games, mean reward -17.870, speed 97.18 f/s
2775571: done 2608 games, mean reward -17.850, speed 101.81 f/s
2776140: done 2609 games, mean reward -17.840, speed 112.70 f/s
2776170: done 2610 games, mean reward -17.870, speed 158.93 f/s
2776576: done 2611 games, mean reward -17.870, speed 89.01 f/s
2780910: done 2612 games, mean reward -17.810, speed 102.81 f/s
2781501: done 2613 games, mean reward -17.800, speed 95.50 f/s
2782467: done 2614 games, mean reward -17.750, speed 99.60 f/s
2784129: done 2615 games, mean reward -17.770, speed 103.45 f/s
2787100: done 2616 games, mean reward -17.750, speed 101.22 f/s
2788258: done 2617 games, mean reward -17.710, speed 101.45 f/s
2792584: done 2618 games, mean reward -17.700, speed 99.63 f/s
2793456: done 2619 games, mean reward -17.710, speed 109.25 f/s
2793694: done 2620 games, mean reward -17.720, speed 105.54 f/s
2794522: done 2621 games, mean reward -17.680, speed 98.83 f/s
2796626: done 2622 games, mean reward -17.710,

3023856: done 2736 games, mean reward -17.400, speed 95.21 f/s
3024441: done 2737 games, mean reward -17.430, speed 98.74 f/s
3025450: done 2738 games, mean reward -17.380, speed 93.51 f/s
3025613: done 2739 games, mean reward -17.320, speed 96.72 f/s
3026086: done 2740 games, mean reward -17.370, speed 91.44 f/s
3029010: done 2741 games, mean reward -17.370, speed 95.01 f/s
3029351: done 2742 games, mean reward -17.280, speed 96.73 f/s
3034614: done 2743 games, mean reward -17.310, speed 99.81 f/s
3037794: done 2744 games, mean reward -17.240, speed 94.93 f/s
3039830: done 2745 games, mean reward -17.270, speed 94.98 f/s
3045501: done 2746 games, mean reward -17.260, speed 94.65 f/s
3046453: done 2747 games, mean reward -17.290, speed 87.56 f/s
3049458: done 2748 games, mean reward -17.370, speed 100.34 f/s
3051751: done 2749 games, mean reward -17.390, speed 92.11 f/s
3053531: done 2750 games, mean reward -17.330, speed 87.97 f/s
3054031: done 2751 games, mean reward -17.260, speed 9

3291945: done 2866 games, mean reward -14.590, speed 94.48 f/s
3292014: done 2867 games, mean reward -14.640, speed 195.97 f/s
3292781: done 2868 games, mean reward -14.580, speed 100.94 f/s
3296410: done 2869 games, mean reward -14.570, speed 96.90 f/s
3296934: done 2870 games, mean reward -14.640, speed 106.17 f/s
3296995: done 2871 games, mean reward -14.580, speed 197.28 f/s
3299896: done 2872 games, mean reward -14.700, speed 94.87 f/s
3302072: done 2873 games, mean reward -14.590, speed 98.70 f/s
3306022: done 2874 games, mean reward -14.610, speed 97.43 f/s
3307308: done 2875 games, mean reward -14.460, speed 101.90 f/s
3309064: done 2876 games, mean reward -14.320, speed 92.02 f/s
3315389: done 2877 games, mean reward -14.370, speed 105.90 f/s
3315625: done 2878 games, mean reward -14.200, speed 96.91 f/s
3316336: done 2879 games, mean reward -14.180, speed 111.84 f/s
3319184: done 2880 games, mean reward -14.260, speed 105.87 f/s
3323647: done 2881 games, mean reward -14.270, 

3582065: done 2995 games, mean reward -13.590, speed 109.68 f/s
3582804: done 2996 games, mean reward -13.570, speed 97.42 f/s
3585393: done 2997 games, mean reward -13.540, speed 102.44 f/s
3585790: done 2998 games, mean reward -13.510, speed 99.68 f/s
3590628: done 2999 games, mean reward -13.550, speed 105.23 f/s
3591004: done 3000 games, mean reward -13.480, speed 100.45 f/s
3591676: done 3001 games, mean reward -13.300, speed 107.64 f/s
3591984: done 3002 games, mean reward -13.260, speed 93.91 f/s
3600021: done 3003 games, mean reward -13.370, speed 107.83 f/s
3604214: done 3004 games, mean reward -13.360, speed 108.84 f/s
3604441: done 3005 games, mean reward -13.340, speed 96.20 f/s
3605905: done 3006 games, mean reward -13.420, speed 103.59 f/s
3608481: done 3007 games, mean reward -13.470, speed 108.12 f/s
3610593: done 3008 games, mean reward -13.260, speed 109.06 f/s
3610665: done 3009 games, mean reward -13.130, speed 80.81 f/s
3614171: done 3010 games, mean reward -13.040

3883266: done 3124 games, mean reward -10.740, speed 103.23 f/s
3885473: done 3125 games, mean reward -10.710, speed 108.62 f/s
3890320: done 3126 games, mean reward -10.580, speed 112.79 f/s
3894528: done 3127 games, mean reward -10.680, speed 112.77 f/s
3896215: done 3128 games, mean reward -10.820, speed 124.21 f/s
3899440: done 3129 games, mean reward -10.710, speed 122.07 f/s
3899748: done 3130 games, mean reward -10.560, speed 128.41 f/s
3899775: done 3131 games, mean reward -10.520, speed 235.34 f/s
3903314: done 3132 games, mean reward -10.510, speed 120.44 f/s
3905145: done 3133 games, mean reward -10.510, speed 124.32 f/s
3910369: done 3134 games, mean reward -10.380, speed 120.42 f/s
3910795: done 3135 games, mean reward -10.350, speed 104.13 f/s
3914591: done 3136 games, mean reward -10.380, speed 124.14 f/s
3916534: done 3137 games, mean reward -10.210, speed 118.13 f/s
3916712: done 3138 games, mean reward -10.360, speed 100.76 f/s
3918313: done 3139 games, mean reward -1

4244301: done 3254 games, mean reward -7.420, speed 182.53 f/s
4246071: done 3255 games, mean reward -7.390, speed 112.22 f/s
4249581: done 3256 games, mean reward -7.370, speed 113.72 f/s
4251758: done 3257 games, mean reward -7.310, speed 118.57 f/s
4253673: done 3258 games, mean reward -7.380, speed 118.97 f/s
4254011: done 3259 games, mean reward -7.360, speed 106.57 f/s
4255023: done 3260 games, mean reward -7.100, speed 118.49 f/s
4258197: done 3261 games, mean reward -7.040, speed 117.73 f/s
4261758: done 3262 games, mean reward -7.010, speed 122.60 f/s
4261770: done 3263 games, mean reward -6.860, speed 17.23 f/s
4266259: done 3264 games, mean reward -6.670, speed 121.31 f/s
4268555: done 3265 games, mean reward -6.600, speed 119.35 f/s
4275289: done 3266 games, mean reward -6.560, speed 122.35 f/s
4281176: done 3267 games, mean reward -6.590, speed 122.62 f/s
4281548: done 3268 games, mean reward -6.580, speed 119.61 f/s
4284001: done 3269 games, mean reward -6.780, speed 123.

4640807: done 3385 games, mean reward -2.530, speed 121.26 f/s
4641724: done 3386 games, mean reward -2.650, speed 120.79 f/s
4645607: done 3387 games, mean reward -2.760, speed 125.29 f/s
4648612: done 3388 games, mean reward -2.800, speed 119.37 f/s
4649026: done 3389 games, mean reward -2.810, speed 121.01 f/s
4650075: done 3390 games, mean reward -2.710, speed 123.44 f/s
4674036: done 3391 games, mean reward -2.580, speed 18.09 f/s
4677144: done 3392 games, mean reward -2.510, speed 113.39 f/s
4679114: done 3393 games, mean reward -2.310, speed 117.30 f/s
4684638: done 3394 games, mean reward -2.150, speed 116.11 f/s
4685971: done 3395 games, mean reward -2.190, speed 108.37 f/s
4688052: done 3396 games, mean reward -2.190, speed 115.49 f/s
4694245: done 3397 games, mean reward -2.060, speed 118.18 f/s
4695860: done 3398 games, mean reward -2.010, speed 116.43 f/s
4699726: done 3399 games, mean reward -1.970, speed 119.81 f/s
4699898: done 3400 games, mean reward -1.780, speed 136.

5029348: done 3517 games, mean reward 3.890, speed 123.33 f/s
5029836: done 3518 games, mean reward 3.970, speed 120.12 f/s
5040148: done 3519 games, mean reward 4.060, speed 123.77 f/s
5043903: done 3520 games, mean reward 4.100, speed 123.22 f/s
5046187: done 3521 games, mean reward 4.110, speed 120.40 f/s
5048213: done 3522 games, mean reward 4.340, speed 122.81 f/s
5048495: done 3523 games, mean reward 4.420, speed 132.46 f/s
5049713: done 3524 games, mean reward 4.440, speed 126.56 f/s
5052305: done 3525 games, mean reward 4.210, speed 118.42 f/s
5056872: done 3526 games, mean reward 4.120, speed 124.69 f/s
5061131: done 3527 games, mean reward 4.110, speed 121.03 f/s
5065493: done 3528 games, mean reward 4.160, speed 122.73 f/s
5092871: done 3529 games, mean reward 4.020, speed 123.64 f/s
5093491: done 3530 games, mean reward 4.110, speed 134.42 f/s
5096121: done 3531 games, mean reward 4.090, speed 117.48 f/s
5098367: done 3532 games, mean reward 4.240, speed 122.98 f/s
5098633:

5433540: done 3650 games, mean reward 6.110, speed 120.83 f/s
5442072: done 3651 games, mean reward 6.110, speed 116.06 f/s
5444454: done 3652 games, mean reward 6.120, speed 116.20 f/s
5446089: done 3653 games, mean reward 5.840, speed 114.13 f/s
5450948: done 3654 games, mean reward 6.080, speed 117.19 f/s
5456483: done 3655 games, mean reward 6.080, speed 119.07 f/s
5460502: done 3656 games, mean reward 6.290, speed 116.30 f/s
5462391: done 3657 games, mean reward 6.100, speed 121.49 f/s
5463482: done 3658 games, mean reward 6.120, speed 113.03 f/s
5464497: done 3659 games, mean reward 5.910, speed 118.73 f/s
5465509: done 3660 games, mean reward 5.740, speed 113.21 f/s
5466515: done 3661 games, mean reward 5.540, speed 111.58 f/s
5468431: done 3662 games, mean reward 5.350, speed 115.83 f/s
5479194: done 3663 games, mean reward 5.330, speed 117.55 f/s
5480981: done 3664 games, mean reward 5.200, speed 117.08 f/s
5482808: done 3665 games, mean reward 5.460, speed 119.96 f/s
5484521:

5823048: done 3783 games, mean reward 10.350, speed 118.18 f/s
5823717: done 3784 games, mean reward 10.360, speed 121.78 f/s
5828820: done 3785 games, mean reward 10.360, speed 117.12 f/s
5828974: done 3786 games, mean reward 10.390, speed 136.05 f/s
5830113: done 3787 games, mean reward 10.430, speed 116.27 f/s
5832265: done 3788 games, mean reward 10.450, speed 118.53 f/s
5834521: done 3789 games, mean reward 10.440, speed 117.31 f/s
5837612: done 3790 games, mean reward 10.420, speed 118.72 f/s
5838589: done 3791 games, mean reward 10.540, speed 120.11 f/s
5845498: done 3792 games, mean reward 10.720, speed 118.16 f/s
5850814: done 3793 games, mean reward 10.920, speed 117.83 f/s
5851651: done 3794 games, mean reward 10.990, speed 109.01 f/s
5853496: done 3795 games, mean reward 11.030, speed 122.28 f/s
5856796: done 3796 games, mean reward 11.150, speed 118.05 f/s
5859020: done 3797 games, mean reward 10.950, speed 117.72 f/s
5859602: done 3798 games, mean reward 10.940, speed 111

6140072: done 3914 games, mean reward 13.260, speed 118.60 f/s
6142986: done 3915 games, mean reward 13.160, speed 115.11 f/s
6143100: done 3916 games, mean reward 13.000, speed 260.53 f/s
6150233: done 3917 games, mean reward 13.110, speed 114.46 f/s
6150309: done 3918 games, mean reward 13.110, speed 80.40 f/s
6151404: done 3919 games, mean reward 13.210, speed 112.90 f/s
6152745: done 3920 games, mean reward 13.260, speed 106.64 f/s
6153054: done 3921 games, mean reward 13.370, speed 125.73 f/s
6156070: done 3922 games, mean reward 13.530, speed 113.21 f/s
6157180: done 3923 games, mean reward 13.690, speed 122.89 f/s
6159913: done 3924 games, mean reward 13.700, speed 113.68 f/s
6163385: done 3925 games, mean reward 13.920, speed 119.20 f/s
6167706: done 3926 games, mean reward 13.930, speed 113.63 f/s
6170621: done 3927 games, mean reward 14.010, speed 118.23 f/s
6172780: done 3928 games, mean reward 13.970, speed 113.49 f/s
6175103: done 3929 games, mean reward 14.110, speed 116.

6397259: done 4045 games, mean reward 16.840, speed 120.18 f/s
6399162: done 4046 games, mean reward 16.880, speed 122.30 f/s
6407868: done 4047 games, mean reward 16.960, speed 122.09 f/s
6411731: done 4048 games, mean reward 16.980, speed 124.52 f/s
6412691: done 4049 games, mean reward 17.050, speed 119.85 f/s
6414841: done 4050 games, mean reward 17.060, speed 125.84 f/s
6415473: done 4051 games, mean reward 17.040, speed 122.45 f/s
6415898: done 4052 games, mean reward 16.990, speed 114.74 f/s
6416072: done 4053 games, mean reward 17.100, speed 142.35 f/s
6416797: done 4054 games, mean reward 17.170, speed 120.98 f/s
6416930: done 4055 games, mean reward 17.160, speed 110.81 f/s
6419532: done 4056 games, mean reward 17.180, speed 124.47 f/s
6422737: done 4057 games, mean reward 17.300, speed 124.04 f/s
6424647: done 4058 games, mean reward 17.310, speed 124.65 f/s
6424660: done 4059 games, mean reward 17.330, speed 155.67 f/s
6426741: done 4060 games, mean reward 17.330, speed 126