In [1]:
import gym
from scipy.special import expit as sigmoid
import numpy as np
import numpy.random as npr
import time
from itertools import count
from collections import deque, namedtuple
import matplotlib.pyplot as plt
from a2c_ppo_acktr import utils
import random
from tqdm import tqdm, trange
from a2c_ppo_acktr.model import DebugAttentionBase

from heap import Heap

In [2]:
class args:
    """Static hyperparameter namespace handed to ``Heap`` and read by the notebook cells.

    Attributes are grouped by where this notebook uses them. Settings that are
    only consumed inside ``Heap`` are not interpreted here.
    """

    # --- Read directly by the notebook cells ---------------------------------
    fps = 3               # forwarded as ``n`` on every ``heap(...)`` call
    num_steps = 32        # collection steps per training iteration
    parallel_envs = 32    # number of training environments
    eval_interval = 10    # evaluate every N iterations; also the per-env reward history length
    eval_num = 250        # episodes per evaluation run

    # --- Consumed by Heap (not read in the visible cells) --------------------
    # NOTE(review): meaning of the following is defined by Heap — confirm there.
    bandwidth = 3
    postwidth = 3
    threshold = 0.5
    signal_split = 2
    att_num_heads = 4
    quantile_truncate = 0.25
    discount_mode = 'parallel'

    # --- Optimisation settings (names resemble PPO options; presumably used
    #     by Heap's update step — TODO confirm) --------------------------------
    lr = 1e-4
    eps = 1e-5
    gamma = 0.99
    ppo_epoch = 4
    clip_param = 0.2
    max_grad_norm = 0.2
    value_loss_coef = 0.5
    entropy_coef = 0.01
    direction_loss_coef = 0

In [3]:
# Build the agent that every later cell drives through the module-level name `heap`.
# NOTE(review): the positional arguments 4, 8, 1 are defined by Heap's constructor;
# presumably 4 matches CartPole's 4-dimensional observation and 1 the single action
# output read in iterate_env — confirm against heap.py.
heap = Heap(4, 8, 1, args=args, post_process=None)



In [4]:
# Copy the heap's postcodes into a NumPy array so they can be inspected numerically.
pc = np.array(heap.postcodes)

In [5]:
# Mean Euclidean distance over all ordered pairs of entries along pc's first axis.
# Each entry is also paired with itself (distance 0), so the zero diagonal is part
# of the average. Assumes pc is 2-D (one postcode per row) — TODO confirm.
np.mean(np.sqrt(np.sum((pc[:, np.newaxis] - pc[np.newaxis, :])**2, axis=-1)))

0.6134421960202786

In [6]:
# Immutable per-environment record. iterate_env never mutates the tuple itself; it
# returns an updated copy via _replace (only the `rewards` deque is mutated in place).
EnvTuple = namedtuple('EnvTuple', 'id env state prev_state rewards')

# Training environments: one CartPole instance each, keyed by an integer id. The
# reward history keeps only the most recent `eval_interval` episode totals.
envs = [
    EnvTuple(
        id=env_id,
        env=gym.make('CartPole-v0'),
        state=None,
        prev_state=None,
        rewards=deque(maxlen=args.eval_interval),
    )
    for env_id in range(args.parallel_envs)
]

# Separate evaluation environment; its reward history holds a full evaluation run.
eval_env = EnvTuple(
    id='eval',
    env=gym.make('CartPole-v0'),
    state=None,
    prev_state=None,
    rewards=deque(maxlen=args.eval_num),
)

# Rolling window of the second value returned by each heap(...) call.
fpses = deque(maxlen=500)
def iterate_env(env):
    """Advance one environment record by a single agent/environment interaction.

    Uses the module-level ``heap``, ``args`` and ``fpses``.

    Args:
        env: an ``EnvTuple``. ``state is None`` means no episode is running, in
            which case a new one is started first (environment reset, a fresh
            ``0`` entry appended to ``env.rewards``, ``heap.reset`` called).

    Returns:
        A new ``EnvTuple``. ``state`` is ``None`` when the episode ended during
        this call (the heap produced no action, or the environment reported
        ``done``); otherwise it is the next observation. ``prev_state`` holds
        the observation the returned record moved on from, or the final
        observation when the environment reported ``done``.
    """
    env_id = env.id

    if env.state is None:
        # No running episode: start one and open a new reward accumulator.
        env = env._replace(state=env.env.reset(), prev_state=env.state)
        env.rewards.append(0)
        heap.reset(env=env_id)

    out_action, fps = heap(env.state, n=args.fps, env=env_id)
    fpses.append(fps)

    if not out_action:
        # The heap returned nothing to act on: close the episode without stepping.
        heap.done(env=env_id)
        return env._replace(state=None, prev_state=env.state)

    # Threshold the first output component into a binary action.
    action = int(out_action[0] > 0.5)
    state, reward, done, _ = env.env.step(action)
    heap.reward(reward, env=env_id)
    env.rewards[-1] += reward

    if not done:
        return env._replace(state=state, prev_state=env.state)

    # Episode finished. heap.done is only signalled when the episode total stayed
    # below 200.
    if env.rewards[-1] < 200:
        heap.done(env=env_id)
    return env._replace(state=None, prev_state=state)

def eval(env, j=0):
    """Run a block of evaluation episodes and print a training/evaluation report.

    Note: this name shadows the builtin ``eval`` for the rest of the notebook; it
    is kept because other cells call it by this name.

    Uses the module-level ``args``, ``iterate_env``, ``envs`` and ``fpses``.

    Args:
        env: ``EnvTuple`` to evaluate with. Its ``rewards`` deque is cleared and
            then filled in place, one entry per episode. The loop stops once the
            deque holds ``args.eval_num`` entries, so the deque must be able to
            hold that many or the loop never terminates.
        j: iteration number; only used in the printed report.

    Returns:
        None. Results are printed.
    """
    env.rewards.clear()
    with tqdm(total=args.eval_num) as pbar:
        while True:
            env = iterate_env(env)
            if env.state is None:
                # iterate_env returns state=None exactly when an episode just ended.
                pbar.update(1)
                pbar.set_description('Eval | Avg. reward: %0.2f' % (sum(env.rewards)/len(env.rewards)))
                if len(env.rewards) == args.eval_num:
                    break
    avg_eval_reward = sum(env.rewards)/len(env.rewards)
    max_eval_reward = max(env.rewards)
    min_eval_reward = min(env.rewards)
    try:
        avg_reward = sum(sum(e.rewards) for e in envs)/sum(len(e.rewards) for e in envs)
        max_reward = max(max(e.rewards) for e in envs)
        min_reward = min(min(e.rewards) for e in envs)
    except (ZeroDivisionError, ValueError):
        # No training episodes recorded yet (e.g. the evaluation that runs before
        # the first collection phase): report NaN. Only the "empty history" errors
        # are caught, so interrupts and real bugs still surface.
        avg_reward = max_reward = min_reward = float('nan')
    # fpses is filled by iterate_env; guard the division in case it is empty.
    avg_fps = sum(fpses)/len(fpses) if fpses else float('nan')
    print('Iter: %d, Avg/Max/Min. reward: %0.1f/%0.1f/%0.1f, Avg FPS: %0.2f\nEval Avg/Max/Min. reward: %0.1f/%0.1f/%0.1f' % (j, avg_reward, max_reward, min_reward, avg_fps, avg_eval_reward, max_eval_reward, min_eval_reward))

In [None]:
# Evaluate once before this session's training iterations (reported as "Iter: 0").
eval(eval_env, 0)

# Index the iteration counter starts from.
# NOTE(review): presumably continues the numbering of an earlier session — confirm,
# and set to 0 when training from scratch.
START_ITER = 559

# Runs until interrupted: collect experience, update the heap, evaluate periodically.
for j in count(START_ITER):
    heap.clear_memory()
    with tqdm(total=args.num_steps) as pbar:
        for _ in range(args.num_steps):
            # iterate_env returns a new EnvTuple, so write each result back into envs.
            for i, env in enumerate(envs):
                envs[i] = iterate_env(env)
            pbar.update(1)
            # Mean episode total over every reward entry currently held by the training envs.
            avg_reward = sum([sum(env.rewards) for env in envs])/sum([len(env.rewards) for env in envs])
            pbar.set_description('Collect | Avg. reward: %0.2f' % avg_reward)
    # An env whose episode just ended has state=None; pass its last recorded
    # observation (prev_state) instead.
    next_states = {env.id : env.prev_state if env.state is None else env.state for env in envs}
    heap.update(next_states)
    if j % args.eval_interval == 0:
        eval(eval_env, j)

Eval | Avg. reward: 1.08: 100%|██████████| 250/250 [00:21<00:00, 11.83it/s]
  0%|          | 0/32 [00:00<?, ?it/s]

Iter: 0, Avg/Max/Min. reward: nan/nan/nan, Avg FPS: 1.88
Eval Avg/Max/Min. reward: 1.1/13.0/0.0


Collect | Avg. reward: 1.19: 100%|██████████| 32/32 [00:39<00:00,  1.23s/it]
Updating: 100%|██████████| 13/13 [05:34<00:00, 25.72s/it]
Collect | Avg. reward: 3.73: 100%|██████████| 32/32 [00:43<00:00,  1.35s/it]
Updating: 100%|██████████| 13/13 [04:39<00:00, 21.48s/it]
Eval | Avg. reward: 18.32: 100%|██████████| 250/250 [03:05<00:00,  1.35it/s]


Iter: 560, Avg/Max/Min. reward: 3.7/34.0/0.0, Avg FPS: 0.18
Eval Avg/Max/Min. reward: 18.3/82.0/0.0


Collect | Avg. reward: 6.76: 100%|██████████| 32/32 [00:40<00:00,  1.26s/it]
Updating: 100%|██████████| 13/13 [04:25<00:00, 20.42s/it]
Collect | Avg. reward: 9.69: 100%|██████████| 32/32 [00:40<00:00,  1.28s/it]
Updating: 100%|██████████| 13/13 [04:29<00:00, 20.71s/it]
Collect | Avg. reward: 12.52: 100%|██████████| 32/32 [00:39<00:00,  1.23s/it]
Updating: 100%|██████████| 13/13 [04:20<00:00, 20.01s/it]
Collect | Avg. reward: 15.09: 100%|██████████| 32/32 [00:39<00:00,  1.25s/it]
Updating: 100%|██████████| 13/13 [04:21<00:00, 20.11s/it]
Collect | Avg. reward: 16.84: 100%|██████████| 32/32 [00:40<00:00,  1.25s/it]
Updating: 100%|██████████| 13/13 [04:22<00:00, 20.23s/it]
Collect | Avg. reward: 18.18: 100%|██████████| 32/32 [00:40<00:00,  1.28s/it]
Updating: 100%|██████████| 13/13 [04:28<00:00, 20.62s/it]
Collect | Avg. reward: 18.14: 100%|██████████| 32/32 [00:40<00:00,  1.27s/it]
Updating: 100%|██████████| 13/13 [04:25<00:00, 20.45s/it]
Collect | Avg. reward: 18.00: 100%|██████████| 32/

Iter: 570, Avg/Max/Min. reward: 18.3/76.0/0.0, Avg FPS: 0.19
Eval Avg/Max/Min. reward: 20.7/70.0/0.0


Collect | Avg. reward: 18.58: 100%|██████████| 32/32 [00:39<00:00,  1.25s/it]
Updating: 100%|██████████| 13/13 [04:21<00:00, 20.15s/it]
Collect | Avg. reward: 18.11: 100%|██████████| 32/32 [00:40<00:00,  1.25s/it]
Updating: 100%|██████████| 13/13 [04:21<00:00, 20.08s/it]
Collect | Avg. reward: 18.47: 100%|██████████| 32/32 [00:40<00:00,  1.27s/it]
Updating: 100%|██████████| 13/13 [04:27<00:00, 20.61s/it]
Collect | Avg. reward: 18.76: 100%|██████████| 32/32 [00:39<00:00,  1.23s/it]
Updating: 100%|██████████| 13/13 [04:18<00:00, 19.90s/it]
Collect | Avg. reward: 19.02: 100%|██████████| 32/32 [00:40<00:00,  1.27s/it]
Updating: 100%|██████████| 13/13 [04:25<00:00, 20.41s/it]
Collect | Avg. reward: 19.35: 100%|██████████| 32/32 [00:40<00:00,  1.28s/it]
Updating: 100%|██████████| 13/13 [04:27<00:00, 20.58s/it]
Collect | Avg. reward: 19.13: 100%|██████████| 32/32 [00:41<00:00,  1.29s/it]
Updating: 100%|██████████| 13/13 [04:30<00:00, 20.84s/it]
Collect | Avg. reward: 18.78: 100%|██████████| 3

Iter: 580, Avg/Max/Min. reward: 18.5/81.0/0.0, Avg FPS: 0.21
Eval Avg/Max/Min. reward: 17.4/79.0/0.0


Collect | Avg. reward: 18.75: 100%|██████████| 32/32 [00:41<00:00,  1.29s/it]
Updating: 100%|██████████| 13/13 [04:30<00:00, 20.81s/it]
Collect | Avg. reward: 18.74: 100%|██████████| 32/32 [00:40<00:00,  1.27s/it]
Updating: 100%|██████████| 13/13 [04:27<00:00, 20.55s/it]
Collect | Avg. reward: 19.33: 100%|██████████| 32/32 [00:40<00:00,  1.26s/it]
Updating: 100%|██████████| 13/13 [04:23<00:00, 20.24s/it]
Collect | Avg. reward: 19.21: 100%|██████████| 32/32 [00:40<00:00,  1.26s/it]
Updating: 100%|██████████| 13/13 [04:25<00:00, 20.39s/it]
Collect | Avg. reward: 19.94: 100%|██████████| 32/32 [00:39<00:00,  1.24s/it]
Updating: 100%|██████████| 13/13 [04:21<00:00, 20.12s/it]
Collect | Avg. reward: 20.31: 100%|██████████| 32/32 [00:40<00:00,  1.26s/it]
Updating: 100%|██████████| 13/13 [04:23<00:00, 20.28s/it]
Collect | Avg. reward: 20.06: 100%|██████████| 32/32 [00:39<00:00,  1.25s/it]
Updating: 100%|██████████| 13/13 [04:21<00:00, 20.11s/it]
Collect | Avg. reward: 19.05: 100%|██████████| 3

Iter: 590, Avg/Max/Min. reward: 18.1/70.0/0.0, Avg FPS: 0.15
Eval Avg/Max/Min. reward: 19.3/57.0/0.0


Collect | Avg. reward: 17.92: 100%|██████████| 32/32 [00:40<00:00,  1.25s/it]
Updating: 100%|██████████| 13/13 [04:22<00:00, 20.19s/it]
Collect | Avg. reward: 18.13: 100%|██████████| 32/32 [00:39<00:00,  1.23s/it]
Updating: 100%|██████████| 13/13 [04:17<00:00, 19.81s/it]
Collect | Avg. reward: 18.32: 100%|██████████| 32/32 [00:39<00:00,  1.24s/it]
Updating: 100%|██████████| 13/13 [04:19<00:00, 19.97s/it]
Collect | Avg. reward: 18.48: 100%|██████████| 32/32 [00:40<00:00,  1.26s/it]
Updating: 100%|██████████| 13/13 [04:23<00:00, 20.24s/it]
Collect | Avg. reward: 18.73: 100%|██████████| 32/32 [00:39<00:00,  1.24s/it]
Updating: 100%|██████████| 13/13 [04:20<00:00, 20.02s/it]
Collect | Avg. reward: 19.20: 100%|██████████| 32/32 [00:39<00:00,  1.24s/it]
Updating: 100%|██████████| 13/13 [04:19<00:00, 19.97s/it]
Collect | Avg. reward: 18.06: 100%|██████████| 32/32 [00:39<00:00,  1.24s/it]
Updating: 100%|██████████| 13/13 [04:20<00:00, 20.06s/it]
Collect | Avg. reward: 17.96: 100%|██████████| 3

Iter: 600, Avg/Max/Min. reward: 19.1/80.0/0.0, Avg FPS: 0.16
Eval Avg/Max/Min. reward: 19.9/93.0/0.0


Collect | Avg. reward: 19.09: 100%|██████████| 32/32 [00:38<00:00,  1.21s/it]
Updating: 100%|██████████| 13/13 [04:14<00:00, 19.54s/it]
Collect | Avg. reward: 18.63: 100%|██████████| 32/32 [00:38<00:00,  1.21s/it]
Updating: 100%|██████████| 13/13 [04:13<00:00, 19.49s/it]
Collect | Avg. reward: 18.36: 100%|██████████| 32/32 [00:38<00:00,  1.21s/it]
Updating: 100%|██████████| 13/13 [04:12<00:00, 19.45s/it]
Collect | Avg. reward: 18.64: 100%|██████████| 32/32 [00:38<00:00,  1.22s/it]
Updating: 100%|██████████| 13/13 [04:14<00:00, 19.61s/it]
Collect | Avg. reward: 18.47: 100%|██████████| 32/32 [00:38<00:00,  1.21s/it]
Updating: 100%|██████████| 13/13 [04:13<00:00, 19.50s/it]
Collect | Avg. reward: 18.61: 100%|██████████| 32/32 [00:38<00:00,  1.19s/it]
Updating: 100%|██████████| 13/13 [04:10<00:00, 19.26s/it]
Collect | Avg. reward: 18.63: 100%|██████████| 32/32 [00:38<00:00,  1.20s/it]
Updating: 100%|██████████| 13/13 [04:11<00:00, 19.38s/it]
Collect | Avg. reward: 18.86: 100%|██████████| 3

Iter: 610, Avg/Max/Min. reward: 19.0/78.0/0.0, Avg FPS: 0.12
Eval Avg/Max/Min. reward: 19.0/103.0/0.0


Collect | Avg. reward: 19.41: 100%|██████████| 32/32 [00:38<00:00,  1.19s/it]
Updating: 100%|██████████| 13/13 [04:08<00:00, 19.10s/it]
Collect | Avg. reward: 19.87: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:08<00:00, 19.10s/it]
Collect | Avg. reward: 19.88: 100%|██████████| 32/32 [00:38<00:00,  1.21s/it]
Updating: 100%|██████████| 13/13 [04:12<00:00, 19.44s/it]
Collect | Avg. reward: 20.11: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:05<00:00, 18.92s/it]
Collect | Avg. reward: 19.55: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:06<00:00, 19.00s/it]
Collect | Avg. reward: 19.25: 100%|██████████| 32/32 [00:37<00:00,  1.19s/it]
Updating: 100%|██████████| 13/13 [04:08<00:00, 19.09s/it]
Collect | Avg. reward: 19.78: 100%|██████████| 32/32 [00:38<00:00,  1.20s/it]
Updating: 100%|██████████| 13/13 [04:11<00:00, 19.36s/it]
Collect | Avg. reward: 19.73: 100%|██████████| 3

Iter: 630, Avg/Max/Min. reward: 19.9/78.0/0.0, Avg FPS: 0.13
Eval Avg/Max/Min. reward: 20.4/67.0/0.0


Collect | Avg. reward: 19.96: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:05<00:00, 18.87s/it]
Collect | Avg. reward: 19.68: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.76s/it]
Collect | Avg. reward: 20.12: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:07<00:00, 19.04s/it]
Collect | Avg. reward: 20.06: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:04<00:00, 18.77s/it]
Collect | Avg. reward: 20.14: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:06<00:00, 18.95s/it]
Collect | Avg. reward: 20.42: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:05<00:00, 18.89s/it]
Collect | Avg. reward: 20.27: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:05<00:00, 18.88s/it]
Collect | Avg. reward: 20.16: 100%|██████████| 3

Iter: 640, Avg/Max/Min. reward: 19.5/108.0/0.0, Avg FPS: 0.08
Eval Avg/Max/Min. reward: 21.1/71.0/0.0


Collect | Avg. reward: 19.61: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:05<00:00, 18.86s/it]
Collect | Avg. reward: 19.52: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.75s/it]
Collect | Avg. reward: 19.72: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.70s/it]
Collect | Avg. reward: 19.90: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:04<00:00, 18.83s/it]
Collect | Avg. reward: 19.71: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:05<00:00, 18.88s/it]
Collect | Avg. reward: 19.78:  41%|████      | 13/32 [00:15<00:22,  1.18s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp

Iter: 670, Avg/Max/Min. reward: 21.6/136.0/0.0, Avg FPS: 0.09
Eval Avg/Max/Min. reward: 21.3/68.0/0.0


Collect | Avg. reward: 20.86: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:06<00:00, 18.99s/it]
Collect | Avg. reward: 20.96: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:04<00:00, 18.81s/it]
Collect | Avg. reward: 21.38: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:06<00:00, 18.96s/it]
Collect | Avg. reward: 21.09: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:02<00:00, 18.64s/it]
Collect | Avg. reward: 20.49: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
Updating: 100%|██████████| 13/13 [04:06<00:00, 18.96s/it]
Collect | Avg. reward: 20.31: 100%|██████████| 32/32 [00:36<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:01<00:00, 18.56s/it]
Collect | Avg. reward: 20.45: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:04<00:00, 18.84s/it]
Collect | Avg. reward: 19.75: 100%|██████████| 3

Iter: 680, Avg/Max/Min. reward: 20.3/58.0/0.0, Avg FPS: 0.08
Eval Avg/Max/Min. reward: 21.9/91.0/0.0


Collect | Avg. reward: 20.43: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [03:59<00:00, 18.43s/it]
Collect | Avg. reward: 20.00: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:02<00:00, 18.62s/it]
Collect | Avg. reward: 19.77: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.75s/it]
Collect | Avg. reward: 19.82: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.72s/it]
Collect | Avg. reward: 19.69: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.72s/it]
Collect | Avg. reward: 20.23: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:02<00:00, 18.63s/it]
Collect | Avg. reward: 19.56: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.73s/it]
Collect | Avg. reward: 19.70: 100%|██████████| 3

Iter: 710, Avg/Max/Min. reward: 20.1/57.0/0.0, Avg FPS: 0.06
Eval Avg/Max/Min. reward: 20.3/63.0/0.0


Collect | Avg. reward: 19.90: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.38s/it]
Collect | Avg. reward: 20.38: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:04<00:00, 18.81s/it]
Collect | Avg. reward: 20.65: 100%|██████████| 32/32 [00:36<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:02<00:00, 18.66s/it]
Collect | Avg. reward: 20.51: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:01<00:00, 18.56s/it]
Collect | Avg. reward: 20.28: 100%|██████████| 32/32 [00:37<00:00,  1.17s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.77s/it]
Collect | Avg. reward: 19.99: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:03<00:00, 18.71s/it]
Collect | Avg. reward: 20.12: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:02<00:00, 18.68s/it]
Collect | Avg. reward: 20.06: 100%|██████████| 3

Iter: 720, Avg/Max/Min. reward: 20.5/97.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 22.7/94.0/0.0


Collect | Avg. reward: 20.68: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:00<00:00, 18.53s/it]
Collect | Avg. reward: 20.96: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:01<00:00, 18.59s/it]
Collect | Avg. reward: 20.90: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:01<00:00, 18.58s/it]
Collect | Avg. reward: 21.29: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:01<00:00, 18.60s/it]
Collect | Avg. reward: 21.24: 100%|██████████| 32/32 [00:37<00:00,  1.16s/it]
Updating: 100%|██████████| 13/13 [04:02<00:00, 18.66s/it]
Collect | Avg. reward: 22.12: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [04:00<00:00, 18.49s/it]
Collect | Avg. reward: 22.30: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:00<00:00, 18.54s/it]
Collect | Avg. reward: 21.92: 100%|██████████| 3

Iter: 740, Avg/Max/Min. reward: 21.2/73.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 23.1/88.0/0.0


Collect | Avg. reward: 21.33: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 21.29: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.05s/it]
Collect | Avg. reward: 21.33: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.07s/it]
Collect | Avg. reward: 22.18: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.03s/it]
Collect | Avg. reward: 22.05: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.98s/it]
Collect | Avg. reward: 22.34: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 22.09: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.23s/it]
Collect | Avg. reward: 21.87: 100%|██████████| 3

Iter: 750, Avg/Max/Min. reward: 22.1/77.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 23.2/85.0/0.0


Collect | Avg. reward: 21.85: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.09s/it]
Collect | Avg. reward: 22.20: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 18.00s/it]
Collect | Avg. reward: 21.93: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.99s/it]
Collect | Avg. reward: 21.86: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.91s/it]
Collect | Avg. reward: 22.11: 100%|██████████| 32/32 [00:31<00:00,  1.01it/s]
Updating: 100%|██████████| 13/13 [03:50<00:00, 17.74s/it]
Collect | Avg. reward: 22.12: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.96s/it]
Collect | Avg. reward: 22.11: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.99s/it]
Collect | Avg. reward: 22.56: 100%|██████████| 3

Iter: 760, Avg/Max/Min. reward: 22.2/106.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 22.0/133.0/8.0


Collect | Avg. reward: 21.77: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.00s/it]
Collect | Avg. reward: 22.05: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.92s/it]
Collect | Avg. reward: 22.13: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.05s/it]
Collect | Avg. reward: 21.75: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.07s/it]
Collect | Avg. reward: 21.85: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.04s/it]
Collect | Avg. reward: 22.11: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.01s/it]
Collect | Avg. reward: 21.98: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 22.10: 100%|██████████| 3

Iter: 770, Avg/Max/Min. reward: 23.2/85.0/0.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 23.2/111.0/8.0


Collect | Avg. reward: 23.16: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.08s/it]
Collect | Avg. reward: 22.84: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.89s/it]
Collect | Avg. reward: 22.48: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.84s/it]
Collect | Avg. reward: 22.28: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.86s/it]
Collect | Avg. reward: 22.84: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.90s/it]
Collect | Avg. reward: 22.59: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.01s/it]
Collect | Avg. reward: 22.16: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.80s/it]
Collect | Avg. reward: 22.26: 100%|██████████| 3

Iter: 780, Avg/Max/Min. reward: 22.0/90.0/3.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 23.1/77.0/0.0


Collect | Avg. reward: 21.90: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.99s/it]
Collect | Avg. reward: 21.63: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.06s/it]
Collect | Avg. reward: 22.00: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.87s/it]
Collect | Avg. reward: 21.92: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.01s/it]
Collect | Avg. reward: 22.20: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.85s/it]
Collect | Avg. reward: 22.52: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.93s/it]
Collect | Avg. reward: 22.22: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.97s/it]
Collect | Avg. reward: 22.85: 100%|██████████| 3

Iter: 790, Avg/Max/Min. reward: 21.9/106.0/0.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 21.9/80.0/0.0


Collect | Avg. reward: 21.64: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.09s/it]
Collect | Avg. reward: 22.10: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.91s/it]
Collect | Avg. reward: 20.96: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.83s/it]
Collect | Avg. reward: 20.64: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:50<00:00, 17.73s/it]
Collect | Avg. reward: 20.84: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:50<00:00, 17.75s/it]
Collect | Avg. reward: 20.74: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.78s/it]
Collect | Avg. reward: 19.95: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.80s/it]
Collect | Avg. reward: 19.63: 100%|██████████| 3

Iter: 800, Avg/Max/Min. reward: 20.3/72.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 21.2/70.0/8.0


Collect | Avg. reward: 20.72: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.88s/it]
Collect | Avg. reward: 20.77: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:50<00:00, 17.71s/it]
Collect | Avg. reward: 21.27: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.94s/it]
Collect | Avg. reward: 21.94: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.90s/it]
Collect | Avg. reward: 22.73: 100%|██████████| 32/32 [00:34<00:00,  1.09s/it]
Updating: 100%|██████████| 13/13 [03:50<00:00, 17.71s/it]
Collect | Avg. reward: 22.35: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.79s/it]
Collect | Avg. reward: 21.73: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:49<00:00, 17.68s/it]
Collect | Avg. reward: 21.65: 100%|██████████| 3

Iter: 810, Avg/Max/Min. reward: 21.1/87.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 23.8/138.0/0.0


Collect | Avg. reward: 22.12: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.94s/it]
Collect | Avg. reward: 21.97: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.16s/it]
Collect | Avg. reward: 22.24: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.93s/it]
Collect | Avg. reward: 22.12: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.92s/it]
Collect | Avg. reward: 22.75: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.81s/it]
Collect | Avg. reward: 22.54: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.05s/it]
Collect | Avg. reward: 22.55: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.00s/it]
Collect | Avg. reward: 22.57: 100%|██████████| 3

Iter: 820, Avg/Max/Min. reward: 21.7/73.0/0.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 22.2/66.0/0.0


Collect | Avg. reward: 21.97: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.92s/it]
Collect | Avg. reward: 22.05: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.98s/it]
Collect | Avg. reward: 21.85: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.07s/it]
Collect | Avg. reward: 22.36: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.06s/it]
Collect | Avg. reward: 22.62: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.97s/it]
Collect | Avg. reward: 22.87: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.86s/it]
Collect | Avg. reward: 22.43: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.97s/it]
Collect | Avg. reward: 22.98: 100%|██████████| 3

Iter: 830, Avg/Max/Min. reward: 22.9/131.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 23.0/86.0/0.0


Collect | Avg. reward: 22.24: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 21.98: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.99s/it]
Collect | Avg. reward: 22.48: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.92s/it]
Collect | Avg. reward: 22.13: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.06s/it]
Collect | Avg. reward: 22.42: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.07s/it]
Collect | Avg. reward: 22.90: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.98s/it]
Collect | Avg. reward: 23.15: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.82s/it]
Collect | Avg. reward: 23.84: 100%|██████████| 3

Iter: 840, Avg/Max/Min. reward: 24.8/93.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 24.7/98.0/9.0


Collect | Avg. reward: 24.81: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.86s/it]
Collect | Avg. reward: 24.20: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.97s/it]
Collect | Avg. reward: 24.50: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.95s/it]
Collect | Avg. reward: 24.72: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.00s/it]
Collect | Avg. reward: 24.97: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.99s/it]
Collect | Avg. reward: 24.82: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.89s/it]
Collect | Avg. reward: 24.11: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.95s/it]
Collect | Avg. reward: 24.36: 100%|██████████| 3

Iter: 850, Avg/Max/Min. reward: 25.5/116.0/0.0, Avg FPS: 0.02
Eval Avg/Max/Min. reward: 23.0/76.0/0.0


Collect | Avg. reward: 25.20: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.86s/it]
Collect | Avg. reward: 24.94: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.96s/it]
Collect | Avg. reward: 25.01: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.82s/it]
Collect | Avg. reward: 24.90: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.85s/it]
Collect | Avg. reward: 24.44: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.91s/it]
Collect | Avg. reward: 24.68: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.95s/it]
Collect | Avg. reward: 25.00: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.77s/it]
Collect | Avg. reward: 25.11: 100%|██████████| 3

Iter: 880, Avg/Max/Min. reward: 28.2/94.0/2.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 24.0/87.0/0.0


Collect | Avg. reward: 28.15: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.87s/it]
Collect | Avg. reward: 27.28: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.87s/it]
Collect | Avg. reward: 26.35: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.77s/it]
Collect | Avg. reward: 25.68: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.85s/it]
Collect | Avg. reward: 26.17: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.84s/it]
Collect | Avg. reward: 26.78: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:51<00:00, 17.84s/it]
Collect | Avg. reward: 27.19: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.93s/it]
Collect | Avg. reward: 28.19: 100%|██████████| 3

Iter: 890, Avg/Max/Min. reward: 27.7/126.0/4.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 31.1/110.0/0.0


Collect | Avg. reward: 28.62: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.97s/it]
Collect | Avg. reward: 30.09: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.86s/it]
Collect | Avg. reward: 30.71: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.98s/it]
Collect | Avg. reward: 31.18: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.92s/it]
Collect | Avg. reward: 31.64: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:50<00:00, 17.72s/it]
Collect | Avg. reward: 31.85: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.93s/it]
Collect | Avg. reward: 32.02: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.94s/it]
Collect | Avg. reward: 32.03: 100%|██████████| 3

Iter: 910, Avg/Max/Min. reward: 27.9/98.0/0.0, Avg FPS: 0.02
Eval Avg/Max/Min. reward: 31.0/132.0/0.0


Collect | Avg. reward: 28.10: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:52<00:00, 17.88s/it]
Collect | Avg. reward: 28.16: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.04s/it]
Collect | Avg. reward: 28.20: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.25s/it]
Collect | Avg. reward: 28.62: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.02s/it]
Collect | Avg. reward: 28.85: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.04s/it]
Collect | Avg. reward: 29.72: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.19s/it]
Collect | Avg. reward: 30.07: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.01s/it]
Collect | Avg. reward: 31.08: 100%|██████████| 3

Iter: 920, Avg/Max/Min. reward: 32.1/112.0/0.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 35.2/108.0/0.0


Collect | Avg. reward: 32.77: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.19s/it]
Collect | Avg. reward: 32.42: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.10s/it]
Collect | Avg. reward: 32.28: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.07s/it]
Collect | Avg. reward: 32.83: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.02s/it]
Collect | Avg. reward: 33.11: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 33.13: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.21s/it]
Collect | Avg. reward: 32.55: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.08s/it]
Collect | Avg. reward: 32.16: 100%|██████████| 3

Iter: 950, Avg/Max/Min. reward: 37.3/140.0/0.0, Avg FPS: 0.08
Eval Avg/Max/Min. reward: 39.3/135.0/0.0


Collect | Avg. reward: 37.22: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:59<00:00, 18.45s/it]
Collect | Avg. reward: 37.26: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.26s/it]
Collect | Avg. reward: 37.74: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.37s/it]
Collect | Avg. reward: 38.90: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.38s/it]
Collect | Avg. reward: 39.78: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.34s/it]
Collect | Avg. reward: 38.38: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.25s/it]
Collect | Avg. reward: 38.19: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.16s/it]
Collect | Avg. reward: 38.40: 100%|██████████| 3

Iter: 960, Avg/Max/Min. reward: 38.1/166.0/0.0, Avg FPS: 0.06
Eval Avg/Max/Min. reward: 35.9/126.0/0.0


Collect | Avg. reward: 38.10: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.16s/it]
Collect | Avg. reward: 38.50: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.35s/it]
Collect | Avg. reward: 37.58: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:47<00:00, 17.51s/it]
Collect | Avg. reward: 37.18: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 36.60: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.14s/it]
Collect | Avg. reward: 37.01: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.20s/it]
Collect | Avg. reward: 35.93: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 17.96s/it]
Collect | Avg. reward: 35.08: 100%|██████████| 3

Iter: 990, Avg/Max/Min. reward: 40.1/194.0/0.0, Avg FPS: 0.04
Eval Avg/Max/Min. reward: 36.3/123.0/0.0


Collect | Avg. reward: 40.23: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.32s/it]
Collect | Avg. reward: 39.49: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 38.73: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.02s/it]
Collect | Avg. reward: 38.48: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.22s/it]
Collect | Avg. reward: 38.22: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.32s/it]
Collect | Avg. reward: 39.07: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.20s/it]
Collect | Avg. reward: 39.11: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.07s/it]
Collect | Avg. reward: 37.90: 100%|██████████| 3

Iter: 1000, Avg/Max/Min. reward: 38.9/167.0/0.0, Avg FPS: 0.05
Eval Avg/Max/Min. reward: 33.6/124.0/0.0


Collect | Avg. reward: 36.03: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.37s/it]
Collect | Avg. reward: 35.62: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.24s/it]
Collect | Avg. reward: 35.52: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.05s/it]
Collect | Avg. reward: 34.80: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.21s/it]
Collect | Avg. reward: 33.63: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.18s/it]
Collect | Avg. reward: 33.68: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.30s/it]
Collect | Avg. reward: 33.57: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.18s/it]
Collect | Avg. reward: 33.51: 100%|██████████| 3

Iter: 1020, Avg/Max/Min. reward: 39.4/140.0/0.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 40.9/123.0/0.0


Collect | Avg. reward: 39.33: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.28s/it]
Collect | Avg. reward: 39.91: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:54<00:00, 18.07s/it]
Collect | Avg. reward: 40.16: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [03:59<00:00, 18.39s/it]
Collect | Avg. reward: 41.39: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.14s/it]
Collect | Avg. reward: 42.00: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.24s/it]
Collect | Avg. reward: 41.41: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.18s/it]
Collect | Avg. reward: 40.05: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.24s/it]
Collect | Avg. reward: 41.18: 100%|██████████| 3

Iter: 1040, Avg/Max/Min. reward: 43.5/169.0/0.0, Avg FPS: 0.02
Eval Avg/Max/Min. reward: 43.2/190.0/0.0


Collect | Avg. reward: 43.83: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.26s/it]
Collect | Avg. reward: 43.16: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.14s/it]
Collect | Avg. reward: 43.85: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.13s/it]
Collect | Avg. reward: 44.27: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.23s/it]
Collect | Avg. reward: 43.59: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.27s/it]
Collect | Avg. reward: 44.51: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.33s/it]
Collect | Avg. reward: 45.04: 100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
Updating: 100%|██████████| 13/13 [04:01<00:00, 18.59s/it]
Collect | Avg. reward: 44.99: 100%|██████████| 3

Iter: 1050, Avg/Max/Min. reward: 45.2/172.0/0.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 47.0/149.0/1.0


Collect | Avg. reward: 44.56: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.25s/it]
Collect | Avg. reward: 43.79: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.38s/it]
Collect | Avg. reward: 42.92: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.31s/it]
Collect | Avg. reward: 41.87: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.18s/it]
Collect | Avg. reward: 42.51: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.27s/it]
Collect | Avg. reward: 41.27: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.16s/it]
Collect | Avg. reward: 41.33: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 41.64: 100%|██████████| 3

Iter: 1070, Avg/Max/Min. reward: 43.6/200.0/0.0, Avg FPS: 0.03
Eval Avg/Max/Min. reward: 51.2/170.0/0.0


Collect | Avg. reward: 43.79: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]
Updating: 100%|██████████| 13/13 [03:53<00:00, 18.00s/it]
Collect | Avg. reward: 44.26: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [03:58<00:00, 18.36s/it]
Collect | Avg. reward: 45.19: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]
Updating: 100%|██████████| 13/13 [03:57<00:00, 18.25s/it]
Collect | Avg. reward: 45.15: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.11s/it]
Collect | Avg. reward: 45.01: 100%|██████████| 32/32 [00:36<00:00,  1.13s/it]
Updating: 100%|██████████| 13/13 [03:55<00:00, 18.13s/it]
Collect | Avg. reward: 44.94: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]
Updating: 100%|██████████| 13/13 [03:56<00:00, 18.19s/it]
Collect | Avg. reward: 45.54: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]


In [None]:
# Ask the heap to clear whatever memory it keeps for the evaluation
# environment (keyed by eval_env.id) before running the eval cells below.
# NOTE(review): presumably this drops stored rollout data only — confirm in Heap.clear_memory.
heap.clear_memory(eval_env.id)

In [None]:
# NOTE(review): `eval` here must be the notebook's own evaluation helper defined
# in an earlier cell (the builtin eval() would reject these arguments); the name
# shadows the builtin. The meaning of the second argument (-1) is not visible
# from this cell — confirm against the helper's definition.
eval(eval_env, -1)

In [None]:
# Value forwarded to Heap.plot_stats as its `n` keyword.
# NOTE(review): presumably a window / sample size for the plotted statistics — confirm.
n = 100
heap.plot_stats(n=n)

In [None]:
import torch

# Start from an empty memory for the evaluation environment so that the
# buffer inspected below only contains the rollout produced by this cell.
heap.clear_memory(eval_env.id)

# Step the evaluation environment until iterate_env hands back an EnvTuple
# whose state is None, which is how it signals that the episode is over.
while True:
    eval_env = iterate_env(eval_env)
    if eval_env.state is None:
        break

# Query the heap once more on the last real state, without tracking gradients.
# NOTE(review): with trace_value=True the call presumably returns the value
# estimate used to bootstrap the returns — confirm in Heap.__call__.
with torch.no_grad():
    next_value = heap(eval_env.prev_state, trace_value=True, env=eval_env.id)

# The second element of the inner state is the memory buffer for this env.
_, memory = heap.get_inner_state(eval_env.id)

# Use the configured discount factor instead of a duplicated literal so this
# diagnostic stays consistent with training if args.gamma is changed.
memory.compute_returns(torch.tensor([next_value]), args.gamma)

In [None]:
# Display the evaluation environment's reward deque (created with
# maxlen=args.eval_num); iterate_env appends a new entry when an episode
# starts and accumulates that episode's rewards into the last entry.
eval_env.rewards

In [None]:
# Print one row per recorded step of `memory` up to and including the first
# done mark: unit id, computed return, predicted value, reward, and whether
# the step index is a discount mark / a done mark.
#
# The loop variable is named `unit_id` rather than `id`: a top-level loop in a
# notebook cell binds its variables in the kernel namespace, so `id` would
# replace the builtin id() for the rest of the session.
max_rows = 10000  # upper bound on printed rows, in case no done mark is present
for i, unit_id, ret, value_pred, rew in zip(range(max_rows), memory.unit_ids, memory.returns,
                                            memory.value_preds, memory.rewards):
    is_done = i in memory.done_marks
    print(unit_id, ret, value_pred, rew, i in memory.discount_marks, is_done)
    if is_done:
        break

In [None]:
# State for the trace cell below, which stores an observation of the tracked
# unit in `last_obs` while `cur_pos <= req_pos` and bumps `cur_pos` each time.
# Re-run this cell to reset the capture before re-running the trace.
req_pos = cur_pos = 0
last_obs = None

In [None]:
# Unit whose observations are captured into `last_obs`.
# NOTE(review): 12 is presumably the last unit of the heap (the debugging cell
# further down loads heap.units[-1]) — confirm.
TRACKED_UNIT_ID = 12
# How many leading entries of heap.postcodes are matched exactly before
# falling back to the nearest-neighbour index.
# NOTE(review): presumably these are the env-facing input units — confirm.
NUM_EXACT_POSTCODES = 4


def _nearest_other_unit(vec, own_id):
    """Return the id nearest to ``vec`` in ``heap.index`` that is not ``own_id``.

    Two neighbours are requested so that, when the closest hit is the unit
    itself, the runner-up can be returned instead.
    """
    nearest = heap.index.get_nns_by_vector(vec, n=2)
    return nearest[0] if not nearest[0] == own_id else nearest[1]


def _sender_of(vec, own_id):
    """Resolve the postcode ``vec`` found in an observation row to a unit id.

    The first NUM_EXACT_POSTCODES postcodes are compared directly (squared
    distance below 1e-5); anything else is resolved through the index.
    """
    for candidate in range(NUM_EXACT_POSTCODES):
        if np.sum((vec - heap.postcodes[candidate])**2) < 1e-5:
            return candidate
    return _nearest_other_unit(vec, own_id)


# Walk the recorded steps and print, for each one,
#     [senders] -> unit -> [receiver+/-]
# where '+' / '-' tells whether the sigmoid of the action's first component
# exceeded args.threshold. The loop variable is `unit_id`, not `id`, so the
# builtin id() is not overwritten in the kernel namespace.
for i, (unit_id, act, obs) in enumerate(zip(memory.unit_ids, memory.actions, memory.obs)):
    # Keep an observation of the tracked unit for the debugging cell; capture
    # stops once cur_pos has moved past req_pos.
    if unit_id == TRACKED_UNIT_ID and cur_pos <= req_pos:
        last_obs = obs
        cur_pos += 1

    # A 1-D observation is reported as coming from the environment; otherwise
    # the first 3 components of every row are treated as the sender's postcode.
    if len(obs.shape) == 1:
        from_ids = ['env']
    else:
        from_ids = [_sender_of(_obs[:3].numpy(), unit_id) for _obs in obs]

    # A single-row action is reported as going to the environment; otherwise
    # components 1..3 of every row are treated as the target postcode.
    if act.shape[0] == 1:
        to_ids = ['env']
        did_act = ['-' if sigmoid(act[0, 0]) <= args.threshold else '+']
        print(act)
    else:
        to_ids, did_act = [], []
        for _act in act:
            to_ids.append(_nearest_other_unit(_act[1:4].numpy(), unit_id))
            did_act.append('-' if sigmoid(_act[0]) <= args.threshold else '+')

    # Visual separators for steps flagged in the memory's mark collections.
    if i in memory.discount_marks:
        print('-'*80)
    if i in memory.done_marks:
        print('x'*80)
    print(from_ids, '->', unit_id, '->', ['%s%s' % (str(to_id), flag) for to_id, flag in zip(to_ids, did_act)])

In [None]:
# Build a DebugAttentionBase with args.att_num_heads heads.
# NOTE(review): 7 and 1 are presumably the input / output sizes of the trained
# base being copied — confirm against a2c_ppo_acktr.model.DebugAttentionBase.
tst = DebugAttentionBase(7, 1, args.att_num_heads)
# Copy the parameters of the last heap unit's actor-critic base into it.
tst.load_state_dict(heap.units[-1].actor_critic.base.state_dict())
# Last expression of the cell: display what the debug module returns for the
# observation captured by the trace cell (last_obs is None if nothing matched).
tst(last_obs)