
# TIANSHOU

In [3]:
import datetime
import os
import pprint

import numpy as np
import torch
import glob

from torch.utils.tensorboard import SummaryWriter
from tianshou.data import Collector, CollectStats, ReplayBuffer, VectorReplayBuffer
from tianshou.exploration import GaussianNoise
from tianshou.highlevel.logger import LoggerFactoryDefault
from tianshou.policy import TD3Policy
from tianshou.policy.base import BasePolicy
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import Actor, Critic
from tianshou.env import DummyVectorEnv
from tianshou.utils import TensorboardLogger

import gymnasium as gym

from gymnasium.wrappers import RecordVideo


In [4]:
env = gym.make('LunarLanderContinuous-v2', render_mode="human")

train_envs = DummyVectorEnv([lambda: gym.make('LunarLanderContinuous-v2') for _ in range(10)])
test_envs = DummyVectorEnv([lambda: gym.make('LunarLanderContinuous-v2') for _ in range(10)])

state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n
max_action = env.action_space.high[0]

print("Observations shape:", state_shape)
print("Actions shape:", action_shape)
print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high))

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

Observations shape: (8,)
Actions shape: (2,)
Action range: -1.0 1.0


<torch._C.Generator at 0x17904a9c310>

In [5]:
#Model
hidden_sizes = ()
device = "cuda" if torch.cuda.is_available() else "cpu"
net_a = Net(state_shape=state_shape, hidden_sizes=hidden_sizes, device=device)
actor = Actor(net_a, action_shape, max_action=max_action, device=device).to(
    device,
)
actor_optim = torch.optim.Adam(actor.parameters(), lr=1e-3)
net_c1 = Net(
    state_shape=state_shape,
    action_shape=action_shape,
    hidden_sizes=hidden_sizes,
    concat=True,
    device=device,
)
net_c2 = Net(
    state_shape=state_shape,
    action_shape=action_shape,
    hidden_sizes=hidden_sizes,
    concat=True,
    device=device,
)
critic1 = Critic(net_c1, device=device).to(device)
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=1e-3)
critic2 = Critic(net_c2, device=device).to(device)
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=1e-3)

policy: TD3Policy = TD3Policy(
    actor=actor,
    actor_optim=actor_optim,
    critic=critic1,
    critic_optim=critic1_optim,
    critic2=critic2,
    critic2_optim=critic2_optim,
    tau=0.005,
    gamma=0.99,
    exploration_noise=GaussianNoise(sigma=0.1),
    policy_noise=0.2,
    update_actor_freq=2,
    noise_clip=0.5,
    estimation_step=1,
    action_space=env.action_space,
    action_scaling=True
)

# Collector
buffer = VectorReplayBuffer(20000, 10)
train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs)
train_collector.reset()
train_collector.collect(n_step=1000, random=True)

# logger
writer = SummaryWriter(log_dir='log/td3_lunar_lander')
logger = TensorboardLogger(writer)

def save_best_fn(policy: BasePolicy) -> None:
    torch.save(policy.state_dict(), 'best_policy.pth')

result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=100,
    step_per_epoch=10000,
    step_per_collect=1000,
    episode_per_test=1000,
    batch_size=64,
    save_best_fn=save_best_fn,
    logger=logger,
    update_per_step=0.1,
    test_in_train=False,
    ).run()
pprint.pprint(result)


Epoch #1: 10001it [00:09, 1013.82it/s, env_step=10000, gradient_step=1000, len=68, n/ep=13, n/st=1000, rew=-191.16]                           


Epoch #1: test_reward: -238.972669 ± 86.793901, best_reward: -238.972669 ± 86.793901 in #1


Epoch #2: 10001it [00:08, 1140.13it/s, env_step=20000, gradient_step=2000, len=64, n/ep=13, n/st=1000, rew=-548.36]                           


Epoch #2: test_reward: -543.503579 ± 144.869109, best_reward: -238.972669 ± 86.793901 in #1


Epoch #3: 10001it [00:08, 1130.33it/s, env_step=30000, gradient_step=3000, len=93, n/ep=12, n/st=1000, rew=-313.95]                           


Epoch #3: test_reward: -195.112965 ± 125.359493, best_reward: -195.112965 ± 125.359493 in #3


Epoch #4: 10001it [00:08, 1120.27it/s, env_step=40000, gradient_step=4000, len=112, n/ep=7, n/st=1000, rew=-356.87]                            


Epoch #4: test_reward: -411.617043 ± 59.331250, best_reward: -195.112965 ± 125.359493 in #3


Epoch #5: 10001it [00:09, 1094.85it/s, env_step=50000, gradient_step=5000, len=87, n/ep=11, n/st=1000, rew=-1012.96]                           


Epoch #5: test_reward: -1080.483022 ± 321.548864, best_reward: -195.112965 ± 125.359493 in #3


Epoch #6: 10001it [00:09, 1037.11it/s, env_step=60000, gradient_step=6000, len=86, n/ep=10, n/st=1000, rew=-1024.60]                           


Epoch #6: test_reward: -1074.764309 ± 329.892324, best_reward: -195.112965 ± 125.359493 in #3


Epoch #7: 10001it [00:10, 936.61it/s, env_step=70000, gradient_step=7000, len=82, n/ep=14, n/st=1000, rew=-304.53]                           


Epoch #7: test_reward: -209.050242 ± 70.570707, best_reward: -195.112965 ± 125.359493 in #3


Epoch #8: 10001it [00:09, 1057.17it/s, env_step=80000, gradient_step=8000, len=77, n/ep=10, n/st=1000, rew=-113.32]                           


Epoch #8: test_reward: -223.782759 ± 124.708645, best_reward: -195.112965 ± 125.359493 in #3


Epoch #9: 10001it [00:08, 1136.61it/s, env_step=90000, gradient_step=9000, len=66, n/ep=14, n/st=1000, rew=-494.36]                           


Epoch #9: test_reward: -488.491227 ± 90.590726, best_reward: -195.112965 ± 125.359493 in #3


Epoch #10: 10001it [00:08, 1117.16it/s, env_step=100000, gradient_step=10000, len=64, n/ep=14, n/st=1000, rew=-564.33]                           


Epoch #10: test_reward: -560.691727 ± 148.579699, best_reward: -195.112965 ± 125.359493 in #3


Epoch #11: 10001it [00:09, 1072.97it/s, env_step=110000, gradient_step=11000, len=69, n/ep=14, n/st=1000, rew=-588.07]                          


Epoch #11: test_reward: -562.692704 ± 158.797692, best_reward: -195.112965 ± 125.359493 in #3


Epoch #12: 10001it [00:10, 987.64it/s, env_step=120000, gradient_step=12000, len=71, n/ep=12, n/st=1000, rew=-540.21]                           


Epoch #12: test_reward: -482.919571 ± 119.890445, best_reward: -195.112965 ± 125.359493 in #3


Epoch #13: 10001it [00:09, 1097.42it/s, env_step=130000, gradient_step=13000, len=259, n/ep=6, n/st=1000, rew=-656.83]                           


Epoch #13: test_reward: -495.966697 ± 254.018104, best_reward: -195.112965 ± 125.359493 in #3


Epoch #14: 10001it [00:14, 702.30it/s, env_step=140000, gradient_step=14000, len=96, n/ep=10, n/st=1000, rew=-570.00]                           


Epoch #14: test_reward: -631.459338 ± 91.993224, best_reward: -195.112965 ± 125.359493 in #3


Epoch #15: 10001it [00:14, 691.59it/s, env_step=150000, gradient_step=15000, len=95, n/ep=11, n/st=1000, rew=-828.13]                           


Epoch #15: test_reward: -892.530072 ± 168.408481, best_reward: -195.112965 ± 125.359493 in #3


Epoch #16: 10001it [00:13, 714.70it/s, env_step=160000, gradient_step=16000, len=102, n/ep=10, n/st=1000, rew=-824.83]                           


Epoch #16: test_reward: -679.390362 ± 178.825267, best_reward: -195.112965 ± 125.359493 in #3


Epoch #17: 10001it [00:14, 688.73it/s, env_step=170000, gradient_step=17000, len=88, n/ep=9, n/st=1000, rew=-185.42]                            


Epoch #17: test_reward: -272.090846 ± 119.287227, best_reward: -195.112965 ± 125.359493 in #3


Epoch #18: 10001it [00:14, 698.99it/s, env_step=180000, gradient_step=18000, len=68, n/ep=14, n/st=1000, rew=-142.68]                           


Epoch #18: test_reward: -167.740256 ± 57.419840, best_reward: -167.740256 ± 57.419840 in #18


Epoch #19: 10001it [00:13, 726.71it/s, env_step=190000, gradient_step=19000, len=71, n/ep=17, n/st=1000, rew=-394.09]                           


Epoch #19: test_reward: -395.437565 ± 109.624681, best_reward: -167.740256 ± 57.419840 in #18


Epoch #20: 10001it [00:14, 708.44it/s, env_step=200000, gradient_step=20000, len=69, n/ep=14, n/st=1000, rew=-480.35]                           


Epoch #20: test_reward: -484.627786 ± 89.965778, best_reward: -167.740256 ± 57.419840 in #18


Epoch #21: 10001it [00:14, 712.01it/s, env_step=210000, gradient_step=21000, len=65, n/ep=15, n/st=1000, rew=-478.63]                           


Epoch #21: test_reward: -503.297869 ± 105.827702, best_reward: -167.740256 ± 57.419840 in #18


Epoch #22: 10001it [00:13, 746.86it/s, env_step=220000, gradient_step=22000, len=72, n/ep=13, n/st=1000, rew=-254.70]                           


Epoch #22: test_reward: -176.777678 ± 93.154860, best_reward: -167.740256 ± 57.419840 in #18


Epoch #23: 10001it [00:13, 751.48it/s, env_step=230000, gradient_step=23000, len=142, n/ep=8, n/st=1000, rew=-610.75]                           


Epoch #23: test_reward: -648.421403 ± 176.029540, best_reward: -167.740256 ± 57.419840 in #18


Epoch #24: 10001it [00:14, 686.79it/s, env_step=240000, gradient_step=24000, len=95, n/ep=10, n/st=1000, rew=-697.71]                            


Epoch #24: test_reward: -682.920535 ± 93.570046, best_reward: -167.740256 ± 57.419840 in #18


Epoch #25: 10001it [00:14, 683.86it/s, env_step=250000, gradient_step=25000, len=122, n/ep=9, n/st=1000, rew=-279.03]                           


Epoch #25: test_reward: -494.350100 ± 207.670662, best_reward: -167.740256 ± 57.419840 in #18


Epoch #26: 10001it [00:13, 741.19it/s, env_step=260000, gradient_step=26000, len=69, n/ep=12, n/st=1000, rew=-137.09]                           


Epoch #26: test_reward: -138.816623 ± 36.995180, best_reward: -138.816623 ± 36.995180 in #26


Epoch #27: 10001it [00:13, 744.77it/s, env_step=270000, gradient_step=27000, len=72, n/ep=15, n/st=1000, rew=-269.95]                           


Epoch #27: test_reward: -265.100696 ± 55.143662, best_reward: -138.816623 ± 36.995180 in #26


Epoch #28: 10001it [00:14, 692.59it/s, env_step=280000, gradient_step=28000, len=71, n/ep=14, n/st=1000, rew=-433.98]                           


Epoch #28: test_reward: -455.659932 ± 77.090961, best_reward: -138.816623 ± 36.995180 in #26


Epoch #29: 10001it [00:14, 666.76it/s, env_step=290000, gradient_step=29000, len=64, n/ep=16, n/st=1000, rew=-553.57]                           


Epoch #29: test_reward: -540.103831 ± 130.863707, best_reward: -138.816623 ± 36.995180 in #26


Epoch #30: 10001it [00:09, 1096.67it/s, env_step=300000, gradient_step=30000, len=65, n/ep=14, n/st=1000, rew=-474.91]                           


Epoch #30: test_reward: -461.049105 ± 77.028881, best_reward: -138.816623 ± 36.995180 in #26


Epoch #31: 10001it [00:08, 1140.98it/s, env_step=310000, gradient_step=31000, len=74, n/ep=14, n/st=1000, rew=-211.75]                           


Epoch #31: test_reward: -175.180399 ± 56.033529, best_reward: -138.816623 ± 36.995180 in #26


Epoch #32: 10001it [00:12, 810.23it/s, env_step=320000, gradient_step=32000, len=106, n/ep=13, n/st=1000, rew=-155.56]                           


Epoch #32: test_reward: -222.133613 ± 115.524728, best_reward: -138.816623 ± 36.995180 in #26


Epoch #33: 10001it [00:14, 688.85it/s, env_step=330000, gradient_step=33000, len=104, n/ep=12, n/st=1000, rew=-450.21]                           


Epoch #33: test_reward: -550.909633 ± 157.440090, best_reward: -138.816623 ± 36.995180 in #26


Epoch #34: 10001it [00:14, 686.82it/s, env_step=340000, gradient_step=34000, len=87, n/ep=11, n/st=1000, rew=-695.83]                           


Epoch #34: test_reward: -745.566875 ± 75.020807, best_reward: -138.816623 ± 36.995180 in #26


Epoch #35: 10001it [00:14, 667.64it/s, env_step=350000, gradient_step=35000, len=103, n/ep=10, n/st=1000, rew=-486.76]                           


Epoch #35: test_reward: -477.081909 ± 175.319745, best_reward: -138.816623 ± 36.995180 in #26


Epoch #36: 10001it [00:09, 1030.94it/s, env_step=360000, gradient_step=36000, len=106, n/ep=8, n/st=1000, rew=-147.04]                           


Epoch #36: test_reward: -326.695117 ± 287.559728, best_reward: -138.816623 ± 36.995180 in #26


Epoch #37: 10001it [00:10, 980.82it/s, env_step=370000, gradient_step=37000, len=79, n/ep=12, n/st=1000, rew=-123.92]                            


Epoch #37: test_reward: -142.752193 ± 40.668871, best_reward: -138.816623 ± 36.995180 in #26


Epoch #38: 10001it [00:14, 712.67it/s, env_step=380000, gradient_step=38000, len=75, n/ep=13, n/st=1000, rew=-256.16]                           


Epoch #38: test_reward: -257.776484 ± 46.248529, best_reward: -138.816623 ± 36.995180 in #26


Epoch #39: 10001it [00:14, 698.94it/s, env_step=390000, gradient_step=39000, len=66, n/ep=17, n/st=1000, rew=-405.97]                           


Epoch #39: test_reward: -418.106800 ± 67.863741, best_reward: -138.816623 ± 36.995180 in #26


Epoch #40: 10001it [00:13, 736.36it/s, env_step=400000, gradient_step=40000, len=67, n/ep=14, n/st=1000, rew=-525.84]                           


Epoch #40: test_reward: -522.486864 ± 114.225719, best_reward: -138.816623 ± 36.995180 in #26


Epoch #41: 10001it [00:13, 740.67it/s, env_step=410000, gradient_step=41000, len=65, n/ep=16, n/st=1000, rew=-447.51]                           


Epoch #41: test_reward: -409.276553 ± 63.073332, best_reward: -138.816623 ± 36.995180 in #26


Epoch #42: 10001it [00:09, 1041.24it/s, env_step=420000, gradient_step=42000, len=77, n/ep=13, n/st=1000, rew=-200.52]                           


Epoch #42: test_reward: -163.774200 ± 48.417277, best_reward: -138.816623 ± 36.995180 in #26


Epoch #43: 10001it [00:15, 654.40it/s, env_step=430000, gradient_step=43000, len=87, n/ep=11, n/st=1000, rew=-140.48]                           


Epoch #43: test_reward: -152.587913 ± 50.248042, best_reward: -138.816623 ± 36.995180 in #26


Epoch #44: 10001it [00:09, 1054.37it/s, env_step=440000, gradient_step=44000, len=116, n/ep=11, n/st=1000, rew=-231.10]                           


Epoch #44: test_reward: -271.524240 ± 74.672617, best_reward: -138.816623 ± 36.995180 in #26


Epoch #45: 10001it [00:09, 1045.34it/s, env_step=450000, gradient_step=45000, len=106, n/ep=10, n/st=1000, rew=-420.88]                           


Epoch #45: test_reward: -473.182796 ± 115.814620, best_reward: -138.816623 ± 36.995180 in #26


Epoch #46: 10001it [00:09, 1033.35it/s, env_step=460000, gradient_step=46000, len=84, n/ep=12, n/st=1000, rew=-787.52]                           


Epoch #46: test_reward: -810.311843 ± 138.807987, best_reward: -138.816623 ± 36.995180 in #26


Epoch #47: 10001it [00:13, 735.09it/s, env_step=470000, gradient_step=47000, len=77, n/ep=14, n/st=1000, rew=-721.38]                           


Epoch #47: test_reward: -688.160345 ± 86.479045, best_reward: -138.816623 ± 36.995180 in #26


Epoch #48: 10001it [00:13, 750.93it/s, env_step=480000, gradient_step=48000, len=135, n/ep=8, n/st=1000, rew=-367.23]                           


Epoch #48: test_reward: -347.831753 ± 276.118595, best_reward: -138.816623 ± 36.995180 in #26


Epoch #49: 10001it [00:13, 764.66it/s, env_step=490000, gradient_step=49000, len=77, n/ep=11, n/st=1000, rew=-79.48]                            


Epoch #49: test_reward: -135.714776 ± 84.277236, best_reward: -135.714776 ± 84.277236 in #49


Epoch #50: 10001it [00:12, 770.95it/s, env_step=500000, gradient_step=50000, len=70, n/ep=16, n/st=1000, rew=-165.16]                           


Epoch #50: test_reward: -176.920797 ± 89.242121, best_reward: -135.714776 ± 84.277236 in #49


Epoch #51: 10001it [00:14, 694.18it/s, env_step=510000, gradient_step=51000, len=72, n/ep=15, n/st=1000, rew=-271.05]                           


Epoch #51: test_reward: -253.597031 ± 51.796246, best_reward: -135.714776 ± 84.277236 in #49


Epoch #52: 10001it [00:09, 1004.86it/s, env_step=520000, gradient_step=52000, len=65, n/ep=16, n/st=1000, rew=-328.47]                          


Epoch #52: test_reward: -356.499317 ± 51.070587, best_reward: -135.714776 ± 84.277236 in #49


Epoch #53: 10001it [00:09, 1042.52it/s, env_step=530000, gradient_step=53000, len=62, n/ep=16, n/st=1000, rew=-435.06]                           


Epoch #53: test_reward: -470.442834 ± 81.194534, best_reward: -135.714776 ± 84.277236 in #49


Epoch #54: 10001it [00:15, 664.28it/s, env_step=540000, gradient_step=54000, len=69, n/ep=14, n/st=1000, rew=-324.35]                           


Epoch #54: test_reward: -268.555613 ± 44.034758, best_reward: -135.714776 ± 84.277236 in #49


Epoch #55: 10001it [00:09, 1078.27it/s, env_step=550000, gradient_step=55000, len=68, n/ep=14, n/st=1000, rew=-148.37]                           


Epoch #55: test_reward: -129.448861 ± 17.043291, best_reward: -129.448861 ± 17.043291 in #55


Epoch #56: 10001it [00:13, 729.17it/s, env_step=560000, gradient_step=56000, len=72, n/ep=16, n/st=1000, rew=-145.50]                           


Epoch #56: test_reward: -149.244559 ± 57.480064, best_reward: -129.448861 ± 17.043291 in #55


Epoch #57: 10001it [00:14, 711.29it/s, env_step=570000, gradient_step=57000, len=72, n/ep=15, n/st=1000, rew=-221.81]                           


Epoch #57: test_reward: -234.217783 ± 49.496588, best_reward: -129.448861 ± 17.043291 in #55


Epoch #58: 10001it [00:13, 720.97it/s, env_step=580000, gradient_step=58000, len=66, n/ep=16, n/st=1000, rew=-350.79]                           


Epoch #58: test_reward: -374.715888 ± 64.865145, best_reward: -129.448861 ± 17.043291 in #55


Epoch #59: 10001it [00:13, 738.48it/s, env_step=590000, gradient_step=59000, len=78, n/ep=14, n/st=1000, rew=-582.41]                           


Epoch #59: test_reward: -584.147435 ± 154.944903, best_reward: -129.448861 ± 17.043291 in #55


Epoch #60: 10001it [00:14, 673.53it/s, env_step=600000, gradient_step=60000, len=88, n/ep=9, n/st=1000, rew=-785.00]                            


Epoch #60: test_reward: -1207.009420 ± 452.702000, best_reward: -129.448861 ± 17.043291 in #55


Epoch #61: 10001it [00:13, 727.82it/s, env_step=610000, gradient_step=61000, len=78, n/ep=14, n/st=1000, rew=-702.39]                            


Epoch #61: test_reward: -694.622988 ± 91.294058, best_reward: -129.448861 ± 17.043291 in #55


Epoch #62: 10001it [00:13, 724.39it/s, env_step=620000, gradient_step=62000, len=242, n/ep=5, n/st=1000, rew=-1654.97]                           


Epoch #62: test_reward: -852.192337 ± 71.500713, best_reward: -129.448861 ± 17.043291 in #55


Epoch #63: 10001it [00:13, 724.90it/s, env_step=630000, gradient_step=63000, len=96, n/ep=10, n/st=1000, rew=-728.83]                           


Epoch #63: test_reward: -713.501005 ± 69.911819, best_reward: -129.448861 ± 17.043291 in #55


Epoch #64: 10001it [00:09, 1093.41it/s, env_step=640000, gradient_step=64000, len=85, n/ep=11, n/st=1000, rew=-748.91]                           


Epoch #64: test_reward: -743.382070 ± 53.712588, best_reward: -129.448861 ± 17.043291 in #55


Epoch #65: 10001it [00:13, 752.31it/s, env_step=650000, gradient_step=65000, len=92, n/ep=11, n/st=1000, rew=-852.02]                           


Epoch #65: test_reward: -821.113516 ± 166.560702, best_reward: -129.448861 ± 17.043291 in #55


Epoch #66: 10001it [00:13, 752.87it/s, env_step=660000, gradient_step=66000, len=117, n/ep=8, n/st=1000, rew=-501.05]                           


Epoch #66: test_reward: -271.179893 ± 55.637803, best_reward: -129.448861 ± 17.043291 in #55


Epoch #67: 10001it [00:13, 767.46it/s, env_step=670000, gradient_step=67000, len=124, n/ep=11, n/st=1000, rew=-581.77]                           


Epoch #67: test_reward: -516.607250 ± 96.174743, best_reward: -129.448861 ± 17.043291 in #55


Epoch #68: 10001it [00:13, 738.46it/s, env_step=680000, gradient_step=68000, len=66, n/ep=12, n/st=1000, rew=-582.67]                           


Epoch #68: test_reward: -559.529976 ± 110.791750, best_reward: -129.448861 ± 17.043291 in #55


Epoch #69: 10001it [00:13, 723.03it/s, env_step=690000, gradient_step=69000, len=68, n/ep=16, n/st=1000, rew=-511.81]                           


Epoch #69: test_reward: -542.904497 ± 149.517515, best_reward: -129.448861 ± 17.043291 in #55


Epoch #70: 10001it [00:14, 712.43it/s, env_step=700000, gradient_step=70000, len=65, n/ep=15, n/st=1000, rew=-509.40]                           


Epoch #70: test_reward: -534.139301 ± 139.913405, best_reward: -129.448861 ± 17.043291 in #55


Epoch #71: 10001it [00:10, 950.53it/s, env_step=710000, gradient_step=71000, len=70, n/ep=15, n/st=1000, rew=-288.37]                           


Epoch #71: test_reward: -231.831904 ± 51.518694, best_reward: -129.448861 ± 17.043291 in #55


Epoch #72: 10001it [00:14, 689.96it/s, env_step=720000, gradient_step=72000, len=68, n/ep=16, n/st=1000, rew=-119.95]                           


Epoch #72: test_reward: -120.072964 ± 17.170160, best_reward: -120.072964 ± 17.170160 in #72


Epoch #73: 10001it [00:14, 706.68it/s, env_step=730000, gradient_step=73000, len=65, n/ep=15, n/st=1000, rew=-172.33]                           


Epoch #73: test_reward: -187.022679 ± 75.133525, best_reward: -120.072964 ± 17.170160 in #72


Epoch #74: 10001it [00:14, 699.90it/s, env_step=740000, gradient_step=74000, len=74, n/ep=15, n/st=1000, rew=-267.94]                           


Epoch #74: test_reward: -268.980084 ± 59.804128, best_reward: -120.072964 ± 17.170160 in #72


Epoch #75: 10001it [00:14, 699.00it/s, env_step=750000, gradient_step=75000, len=68, n/ep=16, n/st=1000, rew=-380.40]                           


Epoch #75: test_reward: -396.871724 ± 91.787827, best_reward: -120.072964 ± 17.170160 in #72


Epoch #76: 10001it [00:14, 702.61it/s, env_step=760000, gradient_step=76000, len=64, n/ep=17, n/st=1000, rew=-456.90]                           


Epoch #76: test_reward: -507.508006 ± 127.288518, best_reward: -120.072964 ± 17.170160 in #72


Epoch #77: 10001it [00:12, 816.45it/s, env_step=770000, gradient_step=77000, len=70, n/ep=15, n/st=1000, rew=-488.33]                           


Epoch #77: test_reward: -434.221259 ± 95.929808, best_reward: -120.072964 ± 17.170160 in #72


Epoch #78: 10001it [00:12, 832.82it/s, env_step=780000, gradient_step=78000, len=63, n/ep=15, n/st=1000, rew=-182.29]                           


Epoch #78: test_reward: -168.123902 ± 64.774830, best_reward: -120.072964 ± 17.170160 in #72


Epoch #79: 10001it [00:11, 849.59it/s, env_step=790000, gradient_step=79000, len=72, n/ep=12, n/st=1000, rew=-119.35]                           


Epoch #79: test_reward: -118.798268 ± 16.884793, best_reward: -118.798268 ± 16.884793 in #79


Epoch #80: 10001it [00:12, 815.59it/s, env_step=800000, gradient_step=80000, len=70, n/ep=14, n/st=1000, rew=-161.24]                           


Epoch #80: test_reward: -165.387607 ± 75.529851, best_reward: -118.798268 ± 16.884793 in #79


Epoch #81: 10001it [00:11, 840.67it/s, env_step=810000, gradient_step=81000, len=71, n/ep=13, n/st=1000, rew=-309.89]                           


Epoch #81: test_reward: -309.080728 ± 55.016837, best_reward: -118.798268 ± 16.884793 in #79


Epoch #82: 10001it [00:12, 831.99it/s, env_step=820000, gradient_step=82000, len=67, n/ep=16, n/st=1000, rew=-455.85]                           


Epoch #82: test_reward: -489.670585 ± 134.135186, best_reward: -118.798268 ± 16.884793 in #79


Epoch #83: 10001it [00:13, 743.08it/s, env_step=830000, gradient_step=83000, len=64, n/ep=17, n/st=1000, rew=-525.45]                           


Epoch #83: test_reward: -571.436022 ± 178.610288, best_reward: -118.798268 ± 16.884793 in #79


Epoch #84: 10001it [00:12, 799.26it/s, env_step=840000, gradient_step=84000, len=68, n/ep=16, n/st=1000, rew=-641.13]                           


Epoch #84: test_reward: -684.864819 ± 159.127123, best_reward: -118.798268 ± 16.884793 in #79


Epoch #85: 10001it [00:09, 1041.75it/s, env_step=850000, gradient_step=85000, len=75, n/ep=13, n/st=1000, rew=-621.42]                           


Epoch #85: test_reward: -631.584896 ± 72.800589, best_reward: -118.798268 ± 16.884793 in #79


Epoch #86: 10001it [00:08, 1112.34it/s, env_step=860000, gradient_step=86000, len=195, n/ep=3, n/st=1000, rew=-464.90]                            


Epoch #86: test_reward: -191.441314 ± 124.778999, best_reward: -118.798268 ± 16.884793 in #79


Epoch #87: 10001it [00:08, 1230.72it/s, env_step=870000, gradient_step=87000, len=76, n/ep=12, n/st=1000, rew=-131.08]                           


Epoch #87: test_reward: -143.107666 ± 50.554627, best_reward: -118.798268 ± 16.884793 in #79


Epoch #88: 10001it [00:07, 1260.60it/s, env_step=880000, gradient_step=88000, len=72, n/ep=11, n/st=1000, rew=-231.11]                           


Epoch #88: test_reward: -258.622812 ± 68.737914, best_reward: -118.798268 ± 16.884793 in #79


Epoch #89: 10001it [00:08, 1229.73it/s, env_step=890000, gradient_step=89000, len=64, n/ep=16, n/st=1000, rew=-359.77]                           


Epoch #89: test_reward: -400.866759 ± 95.212781, best_reward: -118.798268 ± 16.884793 in #79


Epoch #90: 10001it [00:08, 1247.34it/s, env_step=900000, gradient_step=90000, len=68, n/ep=16, n/st=1000, rew=-529.86]                           


Epoch #90: test_reward: -508.619571 ± 129.746106, best_reward: -118.798268 ± 16.884793 in #79


Epoch #91: 10001it [00:08, 1231.03it/s, env_step=910000, gradient_step=91000, len=65, n/ep=14, n/st=1000, rew=-498.24]                           


Epoch #91: test_reward: -476.448044 ± 117.407120, best_reward: -118.798268 ± 16.884793 in #79


Epoch #92: 10001it [00:08, 1203.25it/s, env_step=920000, gradient_step=92000, len=74, n/ep=15, n/st=1000, rew=-249.34]                           


Epoch #92: test_reward: -186.478127 ± 78.428558, best_reward: -118.798268 ± 16.884793 in #79


Epoch #93: 10001it [00:08, 1124.35it/s, env_step=930000, gradient_step=93000, len=84, n/ep=13, n/st=1000, rew=-62.82]                            


Epoch #93: test_reward: -166.685981 ± 127.797938, best_reward: -118.798268 ± 16.884793 in #79


Epoch #94: 10001it [00:07, 1260.10it/s, env_step=940000, gradient_step=94000, len=111, n/ep=7, n/st=1000, rew=-386.76]                           


Epoch #94: test_reward: -552.843305 ± 167.196216, best_reward: -118.798268 ± 16.884793 in #79


Epoch #95: 10001it [00:08, 1220.33it/s, env_step=950000, gradient_step=95000, len=85, n/ep=14, n/st=1000, rew=-672.29]                           


Epoch #95: test_reward: -706.392522 ± 93.015375, best_reward: -118.798268 ± 16.884793 in #79


Epoch #96: 10001it [00:07, 1313.14it/s, env_step=960000, gradient_step=96000, len=84, n/ep=10, n/st=1000, rew=-424.36]                           


Epoch #96: test_reward: -358.262808 ± 66.732007, best_reward: -118.798268 ± 16.884793 in #79


Epoch #97: 10001it [00:07, 1333.77it/s, env_step=970000, gradient_step=97000, len=81, n/ep=14, n/st=1000, rew=-108.47]                            


Epoch #97: test_reward: -112.318632 ± 53.741940, best_reward: -112.318632 ± 53.741940 in #97


Epoch #98: 10001it [00:07, 1310.59it/s, env_step=980000, gradient_step=98000, len=70, n/ep=14, n/st=1000, rew=-105.72]                           


Epoch #98: test_reward: -111.815993 ± 24.104302, best_reward: -111.815993 ± 24.104302 in #98


Epoch #99: 10001it [00:12, 805.96it/s, env_step=990000, gradient_step=99000, len=64, n/ep=13, n/st=1000, rew=-137.65]                           


Epoch #99: test_reward: -136.127628 ± 31.930091, best_reward: -111.815993 ± 24.104302 in #98


Epoch #100: 10001it [00:12, 772.17it/s, env_step=1000000, gradient_step=100000, len=67, n/ep=15, n/st=1000, rew=-136.41]                           


Epoch #100: test_reward: -157.729117 ± 50.077350, best_reward: -111.815993 ± 24.104302 in #98


Epoch #1: 10001it [00:12, 794.85it/s, env_step=10000, gradient_step=101000, len=70, n/ep=13, n/st=1000, rew=-260.72]                           


Epoch #1: test_reward: -259.808064 ± 46.883007, best_reward: -156.070041 ± 56.429260 in #0


Epoch #2: 10001it [00:13, 756.27it/s, env_step=20000, gradient_step=102000, len=71, n/ep=16, n/st=1000, rew=-435.90]                           


Epoch #2: test_reward: -433.971428 ± 72.852414, best_reward: -156.070041 ± 56.429260 in #0


Epoch #3: 10001it [00:10, 980.63it/s, env_step=30000, gradient_step=103000, len=67, n/ep=16, n/st=1000, rew=-552.90]                            


Epoch #3: test_reward: -545.433531 ± 142.759140, best_reward: -156.070041 ± 56.429260 in #0


Epoch #4: 10001it [00:08, 1148.34it/s, env_step=40000, gradient_step=104000, len=65, n/ep=14, n/st=1000, rew=-556.94]                           


Epoch #4: test_reward: -560.534572 ± 156.974158, best_reward: -156.070041 ± 56.429260 in #0


Epoch #5: 10001it [00:08, 1223.35it/s, env_step=50000, gradient_step=105000, len=70, n/ep=12, n/st=1000, rew=-337.88]                           


Epoch #5: test_reward: -272.767571 ± 45.407868, best_reward: -156.070041 ± 56.429260 in #0


Epoch #6: 10001it [00:08, 1197.35it/s, env_step=60000, gradient_step=106000, len=64, n/ep=15, n/st=1000, rew=-118.55]                           


Epoch #6: test_reward: -113.776907 ± 15.634680, best_reward: -113.776907 ± 15.634680 in #6


Epoch #7: 10001it [00:07, 1315.15it/s, env_step=70000, gradient_step=107000, len=76, n/ep=15, n/st=1000, rew=-196.48]                           


Epoch #7: test_reward: -181.835220 ± 71.150773, best_reward: -113.776907 ± 15.634680 in #6


Epoch #8: 10001it [00:07, 1330.94it/s, env_step=80000, gradient_step=108000, len=65, n/ep=13, n/st=1000, rew=-277.53]                           


Epoch #8: test_reward: -295.127159 ± 67.390173, best_reward: -113.776907 ± 15.634680 in #6


Epoch #9: 10001it [00:07, 1338.49it/s, env_step=90000, gradient_step=109000, len=61, n/ep=17, n/st=1000, rew=-343.78]                           


Epoch #9: test_reward: -426.580805 ± 98.523187, best_reward: -113.776907 ± 15.634680 in #6


Epoch #10: 10001it [00:07, 1275.89it/s, env_step=100000, gradient_step=110000, len=67, n/ep=15, n/st=1000, rew=-510.50]                           


Epoch #10: test_reward: -514.314158 ± 135.274559, best_reward: -113.776907 ± 15.634680 in #6


Epoch #11: 10001it [00:08, 1200.97it/s, env_step=110000, gradient_step=111000, len=66, n/ep=14, n/st=1000, rew=-443.41]                           


Epoch #11: test_reward: -386.830452 ± 89.119872, best_reward: -113.776907 ± 15.634680 in #6


Epoch #12: 10001it [00:07, 1297.39it/s, env_step=120000, gradient_step=112000, len=77, n/ep=15, n/st=1000, rew=-233.15]                           


Epoch #12: test_reward: -152.466268 ± 39.343743, best_reward: -113.776907 ± 15.634680 in #6


Epoch #13: 10001it [00:08, 1249.11it/s, env_step=130000, gradient_step=113000, len=72, n/ep=15, n/st=1000, rew=-134.49]                           


Epoch #13: test_reward: -130.034623 ± 27.026389, best_reward: -113.776907 ± 15.634680 in #6


Epoch #14: 10001it [00:07, 1308.42it/s, env_step=140000, gradient_step=114000, len=78, n/ep=14, n/st=1000, rew=-182.67]                           


Epoch #14: test_reward: -190.908355 ± 90.289407, best_reward: -113.776907 ± 15.634680 in #6


Epoch #15: 10001it [00:07, 1290.69it/s, env_step=150000, gradient_step=115000, len=64, n/ep=15, n/st=1000, rew=-294.71]                           


Epoch #15: test_reward: -333.335245 ± 57.194072, best_reward: -113.776907 ± 15.634680 in #6


Epoch #16: 10001it [00:07, 1266.98it/s, env_step=160000, gradient_step=116000, len=70, n/ep=17, n/st=1000, rew=-491.15]                           


Epoch #16: test_reward: -498.376508 ± 113.959923, best_reward: -113.776907 ± 15.634680 in #6


Epoch #17: 10001it [00:07, 1253.11it/s, env_step=170000, gradient_step=117000, len=66, n/ep=14, n/st=1000, rew=-525.85]                           


Epoch #17: test_reward: -556.777544 ± 158.594507, best_reward: -113.776907 ± 15.634680 in #6


Epoch #18: 10001it [00:07, 1287.41it/s, env_step=180000, gradient_step=118000, len=106, n/ep=9, n/st=1000, rew=-1203.34]                           


Epoch #18: test_reward: -1097.191998 ± 318.650817, best_reward: -113.776907 ± 15.634680 in #6


Epoch #19: 10001it [00:08, 1244.60it/s, env_step=190000, gradient_step=119000, len=91, n/ep=12, n/st=1000, rew=-651.13]                            


Epoch #19: test_reward: -614.289217 ± 139.133850, best_reward: -113.776907 ± 15.634680 in #6


Epoch #20: 10001it [00:08, 1189.37it/s, env_step=200000, gradient_step=120000, len=154, n/ep=9, n/st=1000, rew=-895.20]                           


Epoch #20: test_reward: -784.687161 ± 93.468308, best_reward: -113.776907 ± 15.634680 in #6


Epoch #21: 10001it [00:08, 1210.05it/s, env_step=210000, gradient_step=121000, len=86, n/ep=11, n/st=1000, rew=-733.40]                           


Epoch #21: test_reward: -708.545637 ± 84.463095, best_reward: -113.776907 ± 15.634680 in #6


Epoch #22: 10001it [00:08, 1244.23it/s, env_step=220000, gradient_step=122000, len=96, n/ep=12, n/st=1000, rew=-267.00]                            


Epoch #22: test_reward: -211.658436 ± 99.918507, best_reward: -113.776907 ± 15.634680 in #6


Epoch #23: 10001it [00:08, 1208.33it/s, env_step=230000, gradient_step=123000, len=77, n/ep=11, n/st=1000, rew=-67.68]                            


Epoch #23: test_reward: -115.244757 ± 71.621033, best_reward: -113.776907 ± 15.634680 in #6


Epoch #24: 10001it [00:08, 1190.27it/s, env_step=240000, gradient_step=124000, len=70, n/ep=13, n/st=1000, rew=-309.85]                           


Epoch #24: test_reward: -313.787125 ± 83.263729, best_reward: -113.776907 ± 15.634680 in #6


Epoch #25: 10001it [00:08, 1220.02it/s, env_step=250000, gradient_step=125000, len=67, n/ep=14, n/st=1000, rew=-423.63]                           


Epoch #25: test_reward: -434.061576 ± 50.618736, best_reward: -113.776907 ± 15.634680 in #6


Epoch #26: 10001it [00:07, 1254.99it/s, env_step=260000, gradient_step=126000, len=65, n/ep=13, n/st=1000, rew=-477.36]                           


Epoch #26: test_reward: -499.060204 ± 79.904624, best_reward: -113.776907 ± 15.634680 in #6


Epoch #27: 10001it [00:08, 1231.63it/s, env_step=270000, gradient_step=127000, len=65, n/ep=14, n/st=1000, rew=-485.85]                           


Epoch #27: test_reward: -496.924663 ± 73.372524, best_reward: -113.776907 ± 15.634680 in #6


Epoch #28: 10001it [00:08, 1204.01it/s, env_step=280000, gradient_step=128000, len=70, n/ep=17, n/st=1000, rew=-239.60]                           


Epoch #28: test_reward: -210.467365 ± 75.306828, best_reward: -113.776907 ± 15.634680 in #6


Epoch #29: 10001it [00:08, 1211.18it/s, env_step=290000, gradient_step=129000, len=69, n/ep=15, n/st=1000, rew=-133.48]                           


Epoch #29: test_reward: -132.773460 ± 40.998431, best_reward: -113.776907 ± 15.634680 in #6


Epoch #30: 10001it [00:08, 1240.19it/s, env_step=300000, gradient_step=130000, len=72, n/ep=14, n/st=1000, rew=-215.88]                           


Epoch #30: test_reward: -218.674834 ± 85.282047, best_reward: -113.776907 ± 15.634680 in #6


Epoch #31: 10001it [00:08, 1225.80it/s, env_step=310000, gradient_step=131000, len=70, n/ep=14, n/st=1000, rew=-295.52]                           


Epoch #31: test_reward: -288.308064 ± 88.390451, best_reward: -113.776907 ± 15.634680 in #6


Epoch #32: 10001it [00:07, 1271.94it/s, env_step=320000, gradient_step=132000, len=75, n/ep=14, n/st=1000, rew=-330.62]                           


Epoch #32: test_reward: -278.124196 ± 90.011083, best_reward: -113.776907 ± 15.634680 in #6


Epoch #33: 10001it [00:08, 1145.86it/s, env_step=330000, gradient_step=133000, len=70, n/ep=12, n/st=1000, rew=-259.35]                           


Epoch #33: test_reward: -253.120660 ± 93.148524, best_reward: -113.776907 ± 15.634680 in #6


Epoch #34: 10001it [00:08, 1190.96it/s, env_step=340000, gradient_step=134000, len=67, n/ep=17, n/st=1000, rew=-209.88]                           


Epoch #34: test_reward: -234.078444 ± 84.012077, best_reward: -113.776907 ± 15.634680 in #6


Epoch #35: 10001it [00:07, 1254.31it/s, env_step=350000, gradient_step=135000, len=68, n/ep=14, n/st=1000, rew=-179.31]                           


Epoch #35: test_reward: -168.571875 ± 62.302570, best_reward: -113.776907 ± 15.634680 in #6


Epoch #36: 10001it [00:07, 1252.42it/s, env_step=360000, gradient_step=136000, len=69, n/ep=17, n/st=1000, rew=-127.98]                           


Epoch #36: test_reward: -129.124879 ± 16.528867, best_reward: -113.776907 ± 15.634680 in #6


Epoch #37: 10001it [00:07, 1262.92it/s, env_step=370000, gradient_step=137000, len=75, n/ep=14, n/st=1000, rew=-264.80]                           


Epoch #37: test_reward: -226.318045 ± 58.231938, best_reward: -113.776907 ± 15.634680 in #6


Epoch #38: 10001it [00:07, 1305.30it/s, env_step=380000, gradient_step=138000, len=68, n/ep=16, n/st=1000, rew=-380.91]                           


Epoch #38: test_reward: -411.291490 ± 49.557879, best_reward: -113.776907 ± 15.634680 in #6


Epoch #39: 10001it [00:07, 1297.44it/s, env_step=390000, gradient_step=139000, len=68, n/ep=16, n/st=1000, rew=-522.98]                           


Epoch #39: test_reward: -519.931797 ± 107.120682, best_reward: -113.776907 ± 15.634680 in #6


Epoch #40: 10001it [00:07, 1272.92it/s, env_step=400000, gradient_step=140000, len=71, n/ep=13, n/st=1000, rew=-594.46]                           


Epoch #40: test_reward: -557.086403 ± 137.242171, best_reward: -113.776907 ± 15.634680 in #6


Epoch #41: 10001it [00:07, 1281.46it/s, env_step=410000, gradient_step=141000, len=75, n/ep=13, n/st=1000, rew=-603.42]                           


Epoch #41: test_reward: -501.901095 ± 93.840925, best_reward: -113.776907 ± 15.634680 in #6


Epoch #42: 10001it [00:07, 1265.96it/s, env_step=420000, gradient_step=142000, len=70, n/ep=12, n/st=1000, rew=-233.27]                           


Epoch #42: test_reward: -225.308894 ± 68.047567, best_reward: -113.776907 ± 15.634680 in #6


Epoch #43: 10001it [00:07, 1288.45it/s, env_step=430000, gradient_step=143000, len=73, n/ep=13, n/st=1000, rew=-131.96]                           


Epoch #43: test_reward: -126.432882 ± 23.867824, best_reward: -113.776907 ± 15.634680 in #6


Epoch #44: 10001it [00:07, 1270.33it/s, env_step=440000, gradient_step=144000, len=75, n/ep=16, n/st=1000, rew=-212.27]                           


Epoch #44: test_reward: -190.938409 ± 74.070406, best_reward: -113.776907 ± 15.634680 in #6


Epoch #45: 10001it [00:07, 1299.12it/s, env_step=450000, gradient_step=145000, len=77, n/ep=10, n/st=1000, rew=-290.31]                           


Epoch #45: test_reward: -262.921129 ± 75.872211, best_reward: -113.776907 ± 15.634680 in #6


Epoch #46: 10001it [00:07, 1290.37it/s, env_step=460000, gradient_step=146000, len=64, n/ep=15, n/st=1000, rew=-297.37]                           


Epoch #46: test_reward: -332.164773 ± 86.931721, best_reward: -113.776907 ± 15.634680 in #6


Epoch #47: 10001it [00:07, 1292.28it/s, env_step=470000, gradient_step=147000, len=69, n/ep=15, n/st=1000, rew=-446.57]                           


Epoch #47: test_reward: -435.597599 ± 118.150974, best_reward: -113.776907 ± 15.634680 in #6


Epoch #48: 10001it [00:08, 1175.79it/s, env_step=480000, gradient_step=148000, len=67, n/ep=16, n/st=1000, rew=-527.53]                           


Epoch #48: test_reward: -523.564042 ± 158.113286, best_reward: -113.776907 ± 15.634680 in #6


Epoch #49: 10001it [00:07, 1283.04it/s, env_step=490000, gradient_step=149000, len=65, n/ep=14, n/st=1000, rew=-398.84]                           


Epoch #49: test_reward: -371.354020 ± 103.815436, best_reward: -113.776907 ± 15.634680 in #6


Epoch #50: 10001it [00:08, 1115.40it/s, env_step=500000, gradient_step=150000, len=70, n/ep=15, n/st=1000, rew=-152.25]                           


Epoch #50: test_reward: -164.233875 ± 62.364989, best_reward: -113.776907 ± 15.634680 in #6


Epoch #51: 10001it [00:13, 768.63it/s, env_step=510000, gradient_step=151000, len=68, n/ep=17, n/st=1000, rew=-134.83]                           


Epoch #51: test_reward: -141.576620 ± 23.422579, best_reward: -113.776907 ± 15.634680 in #6


Epoch #52: 10001it [00:12, 824.25it/s, env_step=520000, gradient_step=152000, len=72, n/ep=15, n/st=1000, rew=-246.38]                           


Epoch #52: test_reward: -252.339241 ± 49.218224, best_reward: -113.776907 ± 15.634680 in #6


Epoch #53: 10001it [00:12, 783.45it/s, env_step=530000, gradient_step=153000, len=70, n/ep=15, n/st=1000, rew=-424.68]                           


Epoch #53: test_reward: -437.895734 ± 66.749610, best_reward: -113.776907 ± 15.634680 in #6


Epoch #54: 10001it [00:12, 786.36it/s, env_step=540000, gradient_step=154000, len=72, n/ep=17, n/st=1000, rew=-589.50]                           


Epoch #54: test_reward: -536.450622 ± 116.779636, best_reward: -113.776907 ± 15.634680 in #6


Epoch #55: 10001it [00:12, 833.07it/s, env_step=550000, gradient_step=155000, len=66, n/ep=16, n/st=1000, rew=-502.92]                           


Epoch #55: test_reward: -567.902206 ± 144.394822, best_reward: -113.776907 ± 15.634680 in #6


Epoch #56: 10001it [00:12, 817.73it/s, env_step=560000, gradient_step=156000, len=68, n/ep=16, n/st=1000, rew=-556.72]                           


Epoch #56: test_reward: -565.338540 ± 155.356795, best_reward: -113.776907 ± 15.634680 in #6


Epoch #57: 10001it [00:12, 782.09it/s, env_step=570000, gradient_step=157000, len=65, n/ep=13, n/st=1000, rew=-544.48]                           


Epoch #57: test_reward: -546.822373 ± 113.462712, best_reward: -113.776907 ± 15.634680 in #6


Epoch #58: 10001it [00:12, 817.73it/s, env_step=580000, gradient_step=158000, len=141, n/ep=6, n/st=1000, rew=-473.48]                           


Epoch #58: test_reward: -485.438289 ± 218.280182, best_reward: -113.776907 ± 15.634680 in #6


Epoch #59: 10001it [00:12, 792.66it/s, env_step=590000, gradient_step=159000, len=153, n/ep=7, n/st=1000, rew=-420.25]                           


Epoch #59: test_reward: -362.544215 ± 173.028898, best_reward: -113.776907 ± 15.634680 in #6


Epoch #60: 10001it [00:12, 786.01it/s, env_step=600000, gradient_step=160000, len=122, n/ep=11, n/st=1000, rew=-707.25]                           


Epoch #60: test_reward: -570.704740 ± 184.193696, best_reward: -113.776907 ± 15.634680 in #6


Epoch #61: 10001it [00:12, 817.11it/s, env_step=610000, gradient_step=161000, len=106, n/ep=8, n/st=1000, rew=-531.96]                            


Epoch #61: test_reward: -669.873763 ± 156.632517, best_reward: -113.776907 ± 15.634680 in #6


Epoch #62: 10001it [00:12, 804.10it/s, env_step=620000, gradient_step=162000, len=266, n/ep=2, n/st=1000, rew=-387.06]                           


Epoch #62: test_reward: -635.950639 ± 309.767063, best_reward: -113.776907 ± 15.634680 in #6


Epoch #63: 10001it [00:11, 839.42it/s, env_step=630000, gradient_step=163000, len=82, n/ep=11, n/st=1000, rew=-499.17]                           


Epoch #63: test_reward: -547.100826 ± 82.200571, best_reward: -113.776907 ± 15.634680 in #6


Epoch #64: 10001it [00:12, 826.54it/s, env_step=640000, gradient_step=164000, len=67, n/ep=18, n/st=1000, rew=-560.80]                           


Epoch #64: test_reward: -561.179060 ± 113.715526, best_reward: -113.776907 ± 15.634680 in #6


Epoch #65: 10001it [00:12, 811.98it/s, env_step=650000, gradient_step=165000, len=63, n/ep=16, n/st=1000, rew=-492.43]                           


Epoch #65: test_reward: -550.797232 ± 141.060610, best_reward: -113.776907 ± 15.634680 in #6


Epoch #66: 10001it [00:10, 997.57it/s, env_step=660000, gradient_step=166000, len=68, n/ep=15, n/st=1000, rew=-604.96]                            


Epoch #66: test_reward: -563.428862 ± 156.008943, best_reward: -113.776907 ± 15.634680 in #6


Epoch #67: 10001it [00:07, 1263.03it/s, env_step=670000, gradient_step=167000, len=76, n/ep=13, n/st=1000, rew=-644.42]                           


Epoch #67: test_reward: -554.298400 ± 146.658848, best_reward: -113.776907 ± 15.634680 in #6


Epoch #68: 10001it [00:07, 1259.88it/s, env_step=680000, gradient_step=168000, len=74, n/ep=16, n/st=1000, rew=-328.86]                           


Epoch #68: test_reward: -265.998109 ± 62.205699, best_reward: -113.776907 ± 15.634680 in #6


Epoch #69: 10001it [00:07, 1294.95it/s, env_step=690000, gradient_step=169000, len=76, n/ep=14, n/st=1000, rew=-80.00]                            


Epoch #69: test_reward: -87.610979 ± 28.699940, best_reward: -87.610979 ± 28.699940 in #69


Epoch #70: 10001it [00:07, 1296.51it/s, env_step=700000, gradient_step=170000, len=105, n/ep=11, n/st=1000, rew=-147.92]                           


Epoch #70: test_reward: -157.769583 ± 79.457772, best_reward: -87.610979 ± 28.699940 in #69


Epoch #71: 10001it [00:09, 1027.11it/s, env_step=710000, gradient_step=171000, len=93, n/ep=11, n/st=1000, rew=-304.47]                           


Epoch #71: test_reward: -320.387884 ± 43.556985, best_reward: -87.610979 ± 28.699940 in #69


Epoch #72: 10001it [00:11, 871.82it/s, env_step=720000, gradient_step=172000, len=88, n/ep=10, n/st=1000, rew=-763.16]                            


Epoch #72: test_reward: -781.297520 ± 133.856859, best_reward: -87.610979 ± 28.699940 in #69


Epoch #73: 10001it [00:13, 755.57it/s, env_step=730000, gradient_step=173000, len=84, n/ep=14, n/st=1000, rew=-901.90]                           


Epoch #73: test_reward: -926.800981 ± 208.997997, best_reward: -87.610979 ± 28.699940 in #69


Epoch #74: 10001it [00:13, 762.19it/s, env_step=740000, gradient_step=174000, len=76, n/ep=13, n/st=1000, rew=-535.72]                           


Epoch #74: test_reward: -463.319760 ± 70.026364, best_reward: -87.610979 ± 28.699940 in #69


Epoch #75: 10001it [00:12, 769.72it/s, env_step=750000, gradient_step=175000, len=78, n/ep=12, n/st=1000, rew=-240.80]                           


Epoch #75: test_reward: -161.221186 ± 86.104275, best_reward: -87.610979 ± 28.699940 in #69


Epoch #76: 10001it [00:11, 846.62it/s, env_step=760000, gradient_step=176000, len=153, n/ep=9, n/st=1000, rew=-306.33]                           


Epoch #76: test_reward: -477.513777 ± 286.607052, best_reward: -87.610979 ± 28.699940 in #69


Epoch #77: 10001it [00:12, 823.49it/s, env_step=770000, gradient_step=177000, len=101, n/ep=9, n/st=1000, rew=-693.69]                            


Epoch #77: test_reward: -742.765187 ± 182.396314, best_reward: -87.610979 ± 28.699940 in #69


Epoch #78: 10001it [00:12, 832.49it/s, env_step=780000, gradient_step=178000, len=105, n/ep=8, n/st=1000, rew=-650.74]                            


Epoch #78: test_reward: -798.491592 ± 257.612514, best_reward: -87.610979 ± 28.699940 in #69


Epoch #79: 10001it [00:11, 852.08it/s, env_step=790000, gradient_step=179000, len=138, n/ep=5, n/st=1000, rew=-445.50]                            


Epoch #79: test_reward: -333.574487 ± 153.646339, best_reward: -87.610979 ± 28.699940 in #69


Epoch #80: 10001it [00:11, 841.16it/s, env_step=800000, gradient_step=180000, len=140, n/ep=10, n/st=1000, rew=-81.33]                           


Epoch #80: test_reward: -174.520551 ± 132.398622, best_reward: -87.610979 ± 28.699940 in #69


Epoch #81: 10001it [00:11, 835.26it/s, env_step=810000, gradient_step=181000, len=74, n/ep=14, n/st=1000, rew=-190.17]                           


Epoch #81: test_reward: -212.617907 ± 92.877344, best_reward: -87.610979 ± 28.699940 in #69


Epoch #82: 10001it [00:12, 774.75it/s, env_step=820000, gradient_step=182000, len=64, n/ep=13, n/st=1000, rew=-333.06]                           


Epoch #82: test_reward: -383.895156 ± 103.853364, best_reward: -87.610979 ± 28.699940 in #69


Epoch #83: 10001it [00:12, 801.78it/s, env_step=830000, gradient_step=183000, len=69, n/ep=14, n/st=1000, rew=-482.71]                           


Epoch #83: test_reward: -525.174905 ± 153.807775, best_reward: -87.610979 ± 28.699940 in #69


Epoch #84: 10001it [00:12, 819.12it/s, env_step=840000, gradient_step=184000, len=69, n/ep=15, n/st=1000, rew=-580.88]                           


Epoch #84: test_reward: -560.671578 ± 163.807657, best_reward: -87.610979 ± 28.699940 in #69


Epoch #85: 10001it [00:12, 832.22it/s, env_step=850000, gradient_step=185000, len=75, n/ep=12, n/st=1000, rew=-441.62]                           


Epoch #85: test_reward: -328.584619 ± 86.520897, best_reward: -87.610979 ± 28.699940 in #69


Epoch #86: 10001it [00:09, 1023.88it/s, env_step=860000, gradient_step=186000, len=114, n/ep=8, n/st=1000, rew=-512.36]                           


Epoch #86: test_reward: -543.567908 ± 159.488897, best_reward: -87.610979 ± 28.699940 in #69


Epoch #87: 10001it [00:09, 1015.07it/s, env_step=870000, gradient_step=187000, len=197, n/ep=3, n/st=1000, rew=-921.67]                            


Epoch #87: test_reward: -637.968115 ± 424.614283, best_reward: -87.610979 ± 28.699940 in #69


Epoch #88: 10001it [00:09, 1021.70it/s, env_step=880000, gradient_step=188000, len=101, n/ep=10, n/st=1000, rew=-679.17]                           


Epoch #88: test_reward: -662.207947 ± 127.046618, best_reward: -87.610979 ± 28.699940 in #69


Epoch #89: 10001it [00:14, 713.02it/s, env_step=890000, gradient_step=189000, len=97, n/ep=10, n/st=1000, rew=-754.17]                           


Epoch #89: test_reward: -698.236077 ± 109.944226, best_reward: -87.610979 ± 28.699940 in #69


Epoch #90: 10001it [00:13, 750.16it/s, env_step=900000, gradient_step=190000, len=161, n/ep=4, n/st=1000, rew=-991.36]                           


Epoch #90: test_reward: -1609.083304 ± 581.518183, best_reward: -87.610979 ± 28.699940 in #69


Epoch #91: 10001it [00:07, 1365.31it/s, env_step=910000, gradient_step=191000, len=106, n/ep=10, n/st=1000, rew=-612.18]                           


Epoch #91: test_reward: -570.866366 ± 84.936819, best_reward: -87.610979 ± 28.699940 in #69


Epoch #92: 10001it [00:09, 1051.26it/s, env_step=920000, gradient_step=192000, len=93, n/ep=9, n/st=1000, rew=-595.38]                            


Epoch #92: test_reward: -559.148220 ± 71.097582, best_reward: -87.610979 ± 28.699940 in #69


Epoch #93: 10001it [00:09, 1076.87it/s, env_step=930000, gradient_step=193000, len=70, n/ep=13, n/st=1000, rew=-295.37]                           


Epoch #93: test_reward: -359.657917 ± 111.161002, best_reward: -87.610979 ± 28.699940 in #69


Epoch #94: 10001it [00:08, 1204.11it/s, env_step=940000, gradient_step=194000, len=64, n/ep=14, n/st=1000, rew=-434.57]                           


Epoch #94: test_reward: -486.206618 ± 147.949566, best_reward: -87.610979 ± 28.699940 in #69


Epoch #95: 10001it [00:09, 1077.49it/s, env_step=950000, gradient_step=195000, len=64, n/ep=15, n/st=1000, rew=-474.78]                           


Epoch #95: test_reward: -459.481640 ± 141.383301, best_reward: -87.610979 ± 28.699940 in #69


Epoch #96: 10001it [00:08, 1129.49it/s, env_step=960000, gradient_step=196000, len=74, n/ep=16, n/st=1000, rew=-131.25]                           


Epoch #96: test_reward: -109.243336 ± 62.525915, best_reward: -87.610979 ± 28.699940 in #69


Epoch #97: 10001it [00:09, 1108.73it/s, env_step=970000, gradient_step=197000, len=107, n/ep=9, n/st=1000, rew=-319.26]                           


Epoch #97: test_reward: -392.051681 ± 185.782982, best_reward: -87.610979 ± 28.699940 in #69


Epoch #98: 10001it [00:08, 1242.80it/s, env_step=980000, gradient_step=198000, len=76, n/ep=15, n/st=1000, rew=-591.53]                           


Epoch #98: test_reward: -571.217536 ± 139.103707, best_reward: -87.610979 ± 28.699940 in #69


Epoch #99: 10001it [00:09, 1110.62it/s, env_step=990000, gradient_step=199000, len=66, n/ep=14, n/st=1000, rew=-447.39]                           


Epoch #99: test_reward: -447.906069 ± 96.146511, best_reward: -87.610979 ± 28.699940 in #69


Epoch #100: 10001it [00:08, 1115.18it/s, env_step=1000000, gradient_step=200000, len=74, n/ep=14, n/st=1000, rew=-323.56]                           


Epoch #100: test_reward: -278.433285 ± 76.535378, best_reward: -87.610979 ± 28.699940 in #69


KeyboardInterrupt: 

In [1]:
# Let's watch its performance!
test_envs.seed(seed)
test_collector.reset()
collector_stats = test_collector.collect(n_episode = 10, render = True)
print(collector_stats)

NameError: name 'test_envs' is not defined

In [7]:
# lunvh tensorboard
from bbrl_utils.notebook import setup_tensorboard

setup_tensorboard("./log/")

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 28284), started 4:40:06 ago. (Use '!kill 28284' to kill it.)