
# TIANSHOU

In [3]:
import datetime
import os
import pprint

import numpy as np
import torch
import glob

from torch.utils.tensorboard import SummaryWriter
from tianshou.data import Collector, CollectStats, ReplayBuffer, VectorReplayBuffer
from tianshou.exploration import GaussianNoise
from tianshou.highlevel.logger import LoggerFactoryDefault
from tianshou.policy import TD3Policy
from tianshou.policy.base import BasePolicy
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import Actor, Critic
from tianshou.env import DummyVectorEnv
from tianshou.utils import TensorboardLogger

import gymnasium as gym

from gymnasium.wrappers import RecordVideo


In [4]:
env = gym.make('LunarLanderContinuous-v2', render_mode="human")

train_envs = DummyVectorEnv([lambda: gym.make('LunarLanderContinuous-v2') for _ in range(10)])
test_envs = DummyVectorEnv([lambda: gym.make('LunarLanderContinuous-v2') for _ in range(10)])

state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n
max_action = env.action_space.high[0]

print("Observations shape:", state_shape)
print("Actions shape:", action_shape)
print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high))

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

Observations shape: (8,)
Actions shape: (2,)
Action range: -1.0 1.0


<torch._C.Generator at 0x17904a9c310>

In [8]:
#Model
hidden_sizes = ()
device = "cuda" if torch.cuda.is_available() else "cpu"
net_a = Net(state_shape=state_shape, hidden_sizes=hidden_sizes, device=device)
actor = Actor(net_a, action_shape, max_action=max_action, device=device).to(
    device,
)
actor_optim = torch.optim.Adam(actor.parameters(), lr=1e-3)
net_c1 = Net(
    state_shape=state_shape,
    action_shape=action_shape,
    hidden_sizes=hidden_sizes,
    concat=True,
    device=device,
)
net_c2 = Net(
    state_shape=state_shape,
    action_shape=action_shape,
    hidden_sizes=hidden_sizes,
    concat=True,
    device=device,
)
critic1 = Critic(net_c1, device=device).to(device)
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=1e-3)
critic2 = Critic(net_c2, device=device).to(device)
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=1e-3)

policy: TD3Policy = TD3Policy(
    actor=actor,
    actor_optim=actor_optim,
    critic=critic1,
    critic_optim=critic1_optim,
    critic2=critic2,
    critic2_optim=critic2_optim,
    tau=0.005,
    gamma=0.99,
    exploration_noise=GaussianNoise(sigma=0.1),
    policy_noise=0.2,
    update_actor_freq=2,
    noise_clip=0.5,
    estimation_step=1,
    action_space=env.action_space,
    action_scaling=True
)

# Collector
buffer = VectorReplayBuffer(20000, 10)
train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs)
train_collector.reset()
train_collector.collect(n_step=1000, random=True)

# logger
writer = SummaryWriter(log_dir='log/td3_lunar_lander')
logger = TensorboardLogger(writer)

def save_best_fn(policy: BasePolicy) -> None:
    torch.save(policy.state_dict(), 'best_policy.pth')

result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=100,
    step_per_epoch=10000,
    step_per_collect=1000,
    episode_per_test=1000,
    batch_size=64,
    save_best_fn=save_best_fn,
    logger=logger,
    update_per_step=0.1,
    test_in_train=False,
    ).run()
pprint.pprint(result)


Epoch #1: 10001it [00:08, 1232.29it/s, env_step=10000, gradient_step=1000, len=80, n/ep=12, n/st=1000, rew=-415.69]                           


Epoch #1: test_reward: -619.778763 ± 145.154765, best_reward: -591.427869 ± 298.438357 in #0


Epoch #2: 10001it [00:08, 1218.10it/s, env_step=20000, gradient_step=2000, len=68, n/ep=17, n/st=1000, rew=-265.62]                           


Epoch #2: test_reward: -143.263332 ± 35.298072, best_reward: -143.263332 ± 35.298072 in #2


Epoch #3: 10001it [00:07, 1254.25it/s, env_step=30000, gradient_step=3000, len=74, n/ep=17, n/st=1000, rew=-340.74]                           


Epoch #3: test_reward: -392.451196 ± 72.359643, best_reward: -143.263332 ± 35.298072 in #2


Epoch #4: 10001it [00:08, 1172.78it/s, env_step=40000, gradient_step=4000, len=66, n/ep=14, n/st=1000, rew=-549.49]                           


Epoch #4: test_reward: -574.409478 ± 162.094828, best_reward: -143.263332 ± 35.298072 in #2


Epoch #5: 10001it [00:08, 1153.46it/s, env_step=50000, gradient_step=5000, len=70, n/ep=12, n/st=1000, rew=-796.44]                           


Epoch #5: test_reward: -894.517647 ± 223.614917, best_reward: -143.263332 ± 35.298072 in #2


Epoch #6: 10001it [00:08, 1163.13it/s, env_step=60000, gradient_step=6000, len=72, n/ep=15, n/st=1000, rew=-677.75]                           


Epoch #6: test_reward: -573.953891 ± 78.930477, best_reward: -143.263332 ± 35.298072 in #2


Epoch #7: 10001it [00:08, 1211.64it/s, env_step=70000, gradient_step=7000, len=131, n/ep=11, n/st=1000, rew=-139.91]                           


Epoch #7: test_reward: -128.559823 ± 68.859143, best_reward: -128.559823 ± 68.859143 in #7


Epoch #8: 10001it [00:08, 1218.45it/s, env_step=80000, gradient_step=8000, len=69, n/ep=15, n/st=1000, rew=-484.01]                           


Epoch #8: test_reward: -501.537718 ± 102.301941, best_reward: -128.559823 ± 68.859143 in #7


Epoch #9: 10001it [00:08, 1209.65it/s, env_step=90000, gradient_step=9000, len=69, n/ep=13, n/st=1000, rew=-531.11]                           


Epoch #9: test_reward: -579.073286 ± 168.578308, best_reward: -128.559823 ± 68.859143 in #7


Epoch #10: 10001it [00:08, 1195.09it/s, env_step=100000, gradient_step=10000, len=76, n/ep=13, n/st=1000, rew=-643.33]                           


Epoch #10: test_reward: -573.348999 ± 162.403915, best_reward: -128.559823 ± 68.859143 in #7


Epoch #11: 10001it [00:08, 1194.58it/s, env_step=110000, gradient_step=11000, len=67, n/ep=12, n/st=1000, rew=-538.64]                           


Epoch #11: test_reward: -661.320354 ± 129.161166, best_reward: -128.559823 ± 68.859143 in #7


Epoch #12: 10001it [00:08, 1188.33it/s, env_step=120000, gradient_step=12000, len=87, n/ep=11, n/st=1000, rew=-518.47]                           


Epoch #12: test_reward: -538.634267 ± 115.877225, best_reward: -128.559823 ± 68.859143 in #7


Epoch #13: 10001it [00:08, 1133.38it/s, env_step=130000, gradient_step=13000, len=107, n/ep=9, n/st=1000, rew=-525.20]                            


Epoch #13: test_reward: -523.495652 ± 92.883426, best_reward: -128.559823 ± 68.859143 in #7


Epoch #14: 10001it [00:08, 1219.31it/s, env_step=140000, gradient_step=14000, len=72, n/ep=10, n/st=1000, rew=-729.88]                           


Epoch #14: test_reward: -736.727196 ± 60.239849, best_reward: -128.559823 ± 68.859143 in #7


Epoch #15: 10001it [00:08, 1242.91it/s, env_step=150000, gradient_step=15000, len=94, n/ep=11, n/st=1000, rew=-1159.14]                           


Epoch #15: test_reward: -1114.918844 ± 326.609706, best_reward: -128.559823 ± 68.859143 in #7


Epoch #16: 10001it [00:08, 1206.40it/s, env_step=160000, gradient_step=16000, len=89, n/ep=11, n/st=1000, rew=-876.10]                            


Epoch #16: test_reward: -627.867646 ± 160.359961, best_reward: -128.559823 ± 68.859143 in #7


Epoch #17: 10001it [00:08, 1224.94it/s, env_step=170000, gradient_step=17000, len=87, n/ep=9, n/st=1000, rew=-113.27]                            


Epoch #17: test_reward: -65.217313 ± 36.256643, best_reward: -65.217313 ± 36.256643 in #17


Epoch #18: 10001it [00:08, 1224.58it/s, env_step=180000, gradient_step=18000, len=122, n/ep=7, n/st=1000, rew=-580.00]                           


Epoch #18: test_reward: -676.862059 ± 268.580270, best_reward: -65.217313 ± 36.256643 in #17


Epoch #19: 10001it [00:08, 1216.17it/s, env_step=190000, gradient_step=19000, len=76, n/ep=13, n/st=1000, rew=-797.36]                           


Epoch #19: test_reward: -827.016998 ± 95.919370, best_reward: -65.217313 ± 36.256643 in #17


Epoch #20: 10001it [00:08, 1207.00it/s, env_step=200000, gradient_step=20000, len=75, n/ep=15, n/st=1000, rew=-800.16]                           


Epoch #20: test_reward: -688.272353 ± 103.631551, best_reward: -65.217313 ± 36.256643 in #17


Epoch #21: 10001it [00:09, 1100.58it/s, env_step=210000, gradient_step=21000, len=66, n/ep=13, n/st=1000, rew=-412.16]                           


Epoch #21: test_reward: -372.222592 ± 91.496137, best_reward: -65.217313 ± 36.256643 in #17


Epoch #22: 10001it [00:09, 1040.07it/s, env_step=220000, gradient_step=22000, len=66, n/ep=13, n/st=1000, rew=-149.64]                           


Epoch #22: test_reward: -121.239365 ± 32.339303, best_reward: -65.217313 ± 36.256643 in #17


Epoch #23: 10001it [00:08, 1160.01it/s, env_step=230000, gradient_step=23000, len=69, n/ep=14, n/st=1000, rew=-166.33]                           


Epoch #23: test_reward: -192.425573 ± 97.259622, best_reward: -65.217313 ± 36.256643 in #17


Epoch #24: 10001it [00:07, 1271.67it/s, env_step=240000, gradient_step=24000, len=72, n/ep=15, n/st=1000, rew=-401.85]                           


Epoch #24: test_reward: -388.985946 ± 71.568434, best_reward: -65.217313 ± 36.256643 in #17


Epoch #25: 10001it [00:07, 1298.30it/s, env_step=250000, gradient_step=25000, len=67, n/ep=14, n/st=1000, rew=-505.28]                           


Epoch #25: test_reward: -527.255053 ± 127.866054, best_reward: -65.217313 ± 36.256643 in #17


Epoch #26: 10001it [00:07, 1264.36it/s, env_step=260000, gradient_step=26000, len=68, n/ep=15, n/st=1000, rew=-531.67]                           


Epoch #26: test_reward: -549.564901 ± 138.015762, best_reward: -65.217313 ± 36.256643 in #17


Epoch #27: 10001it [00:07, 1284.54it/s, env_step=270000, gradient_step=27000, len=70, n/ep=13, n/st=1000, rew=-276.51]                           


Epoch #27: test_reward: -196.420073 ± 131.233306, best_reward: -65.217313 ± 36.256643 in #17


Epoch #28: 10001it [00:07, 1277.98it/s, env_step=280000, gradient_step=28000, len=90, n/ep=9, n/st=1000, rew=-106.03]                            


Epoch #28: test_reward: -123.651789 ± 74.938336, best_reward: -65.217313 ± 36.256643 in #17


Epoch #29: 10001it [00:07, 1268.46it/s, env_step=290000, gradient_step=29000, len=101, n/ep=11, n/st=1000, rew=-478.83]                           


Epoch #29: test_reward: -623.660501 ± 67.819136, best_reward: -65.217313 ± 36.256643 in #17


Epoch #30: 10001it [00:07, 1253.16it/s, env_step=300000, gradient_step=30000, len=80, n/ep=14, n/st=1000, rew=-795.65]                           


Epoch #30: test_reward: -836.861298 ± 137.021908, best_reward: -65.217313 ± 36.256643 in #17


Epoch #31: 10001it [00:07, 1271.67it/s, env_step=310000, gradient_step=31000, len=88, n/ep=12, n/st=1000, rew=-787.22]                           


Epoch #31: test_reward: -742.035573 ± 67.965564, best_reward: -65.217313 ± 36.256643 in #17


Epoch #32: 10001it [00:07, 1277.95it/s, env_step=320000, gradient_step=32000, len=101, n/ep=10, n/st=1000, rew=-265.82]                           


Epoch #32: test_reward: -196.438481 ± 81.309133, best_reward: -65.217313 ± 36.256643 in #17


Epoch #33: 10001it [00:07, 1264.83it/s, env_step=330000, gradient_step=33000, len=90, n/ep=14, n/st=1000, rew=-84.38]                            


Epoch #33: test_reward: -119.906680 ± 90.949199, best_reward: -65.217313 ± 36.256643 in #17


Epoch #34: 10001it [00:07, 1252.66it/s, env_step=340000, gradient_step=34000, len=69, n/ep=17, n/st=1000, rew=-337.88]                           


Epoch #34: test_reward: -314.714443 ± 100.952045, best_reward: -65.217313 ± 36.256643 in #17


Epoch #35: 10001it [00:07, 1284.94it/s, env_step=350000, gradient_step=35000, len=65, n/ep=16, n/st=1000, rew=-444.72]                           


Epoch #35: test_reward: -439.705058 ± 86.639994, best_reward: -65.217313 ± 36.256643 in #17


Epoch #36: 10001it [00:07, 1294.97it/s, env_step=360000, gradient_step=36000, len=77, n/ep=12, n/st=1000, rew=-521.18]                           


Epoch #36: test_reward: -527.863015 ± 145.082144, best_reward: -65.217313 ± 36.256643 in #17


Epoch #37: 10001it [00:07, 1266.49it/s, env_step=370000, gradient_step=37000, len=70, n/ep=16, n/st=1000, rew=-490.16]                           


Epoch #37: test_reward: -394.617318 ± 77.079962, best_reward: -65.217313 ± 36.256643 in #17


Epoch #38: 10001it [00:07, 1290.01it/s, env_step=380000, gradient_step=38000, len=76, n/ep=14, n/st=1000, rew=-162.85]                           


Epoch #38: test_reward: -123.746566 ± 73.031049, best_reward: -65.217313 ± 36.256643 in #17


Epoch #39: 10001it [00:07, 1310.93it/s, env_step=390000, gradient_step=39000, len=119, n/ep=9, n/st=1000, rew=-313.85]                           


Epoch #39: test_reward: -334.019825 ± 147.992137, best_reward: -65.217313 ± 36.256643 in #17


Epoch #40: 10001it [00:08, 1141.75it/s, env_step=400000, gradient_step=40000, len=88, n/ep=10, n/st=1000, rew=-571.60]                           


Epoch #40: test_reward: -671.120265 ± 74.000879, best_reward: -65.217313 ± 36.256643 in #17


Epoch #41: 10001it [00:07, 1279.92it/s, env_step=410000, gradient_step=41000, len=89, n/ep=12, n/st=1000, rew=-681.17]                           


Epoch #41: test_reward: -644.968095 ± 89.695405, best_reward: -65.217313 ± 36.256643 in #17


Epoch #42: 10001it [00:07, 1332.89it/s, env_step=420000, gradient_step=42000, len=144, n/ep=8, n/st=1000, rew=-45.65]                            


Epoch #42: test_reward: -247.801307 ± 355.065712, best_reward: -65.217313 ± 36.256643 in #17


Epoch #43: 10001it [00:07, 1314.86it/s, env_step=430000, gradient_step=43000, len=78, n/ep=12, n/st=1000, rew=-130.53]                           


Epoch #43: test_reward: -145.469422 ± 42.926991, best_reward: -65.217313 ± 36.256643 in #17


Epoch #44: 10001it [00:07, 1267.77it/s, env_step=440000, gradient_step=44000, len=68, n/ep=15, n/st=1000, rew=-253.01]                           


Epoch #44: test_reward: -258.097963 ± 55.389094, best_reward: -65.217313 ± 36.256643 in #17


Epoch #45: 10001it [00:07, 1278.56it/s, env_step=450000, gradient_step=45000, len=62, n/ep=15, n/st=1000, rew=-377.86]                           


Epoch #45: test_reward: -410.414625 ± 72.288137, best_reward: -65.217313 ± 36.256643 in #17


Epoch #46: 10001it [00:07, 1284.87it/s, env_step=460000, gradient_step=46000, len=66, n/ep=14, n/st=1000, rew=-505.44]                           


Epoch #46: test_reward: -508.779765 ± 115.484050, best_reward: -65.217313 ± 36.256643 in #17


Epoch #47: 10001it [00:08, 1248.76it/s, env_step=470000, gradient_step=47000, len=63, n/ep=16, n/st=1000, rew=-393.36]                           


Epoch #47: test_reward: -368.133574 ± 60.782394, best_reward: -65.217313 ± 36.256643 in #17


Epoch #48: 10001it [00:07, 1261.93it/s, env_step=480000, gradient_step=48000, len=70, n/ep=13, n/st=1000, rew=-156.14]                           


Epoch #48: test_reward: -204.439132 ± 148.527091, best_reward: -65.217313 ± 36.256643 in #17


Epoch #49: 10001it [00:07, 1278.61it/s, env_step=490000, gradient_step=49000, len=109, n/ep=4, n/st=1000, rew=-161.21]                            


Epoch #49: test_reward: -218.384863 ± 118.486777, best_reward: -65.217313 ± 36.256643 in #17


Epoch #50: 10001it [00:08, 1122.30it/s, env_step=500000, gradient_step=50000, len=102, n/ep=10, n/st=1000, rew=-509.36]                           


Epoch #50: test_reward: -537.478842 ± 154.145671, best_reward: -65.217313 ± 36.256643 in #17


Epoch #51: 10001it [00:08, 1158.50it/s, env_step=510000, gradient_step=51000, len=79, n/ep=14, n/st=1000, rew=-658.50]                           


Epoch #51: test_reward: -667.653118 ± 83.858325, best_reward: -65.217313 ± 36.256643 in #17


Epoch #52: 10001it [00:08, 1205.85it/s, env_step=520000, gradient_step=52000, len=100, n/ep=8, n/st=1000, rew=-159.11]                           


Epoch #52: test_reward: -168.410634 ± 84.671669, best_reward: -65.217313 ± 36.256643 in #17


Epoch #53: 10001it [00:08, 1191.03it/s, env_step=530000, gradient_step=53000, len=78, n/ep=12, n/st=1000, rew=-132.54]                           


Epoch #53: test_reward: -151.071998 ± 41.577995, best_reward: -65.217313 ± 36.256643 in #17


Epoch #54: 10001it [00:08, 1224.14it/s, env_step=540000, gradient_step=54000, len=72, n/ep=14, n/st=1000, rew=-152.90]                           


Epoch #54: test_reward: -194.444961 ± 104.302865, best_reward: -65.217313 ± 36.256643 in #17


Epoch #55: 10001it [00:07, 1253.19it/s, env_step=550000, gradient_step=55000, len=71, n/ep=14, n/st=1000, rew=-306.30]                           


Epoch #55: test_reward: -304.460558 ± 38.170610, best_reward: -65.217313 ± 36.256643 in #17


Epoch #56: 10001it [00:08, 1124.04it/s, env_step=560000, gradient_step=56000, len=69, n/ep=15, n/st=1000, rew=-452.13]                           


Epoch #56: test_reward: -457.114598 ± 71.315174, best_reward: -65.217313 ± 36.256643 in #17


Epoch #57: 10001it [00:08, 1213.65it/s, env_step=570000, gradient_step=57000, len=70, n/ep=14, n/st=1000, rew=-510.87]                           


Epoch #57: test_reward: -494.049862 ± 89.853866, best_reward: -65.217313 ± 36.256643 in #17


Epoch #58: 10001it [00:08, 1187.92it/s, env_step=580000, gradient_step=58000, len=71, n/ep=15, n/st=1000, rew=-266.67]                           


Epoch #58: test_reward: -233.116166 ± 59.421912, best_reward: -65.217313 ± 36.256643 in #17


Epoch #59: 10001it [00:08, 1198.18it/s, env_step=590000, gradient_step=59000, len=70, n/ep=13, n/st=1000, rew=-133.76]                           


Epoch #59: test_reward: -135.511828 ± 37.505865, best_reward: -65.217313 ± 36.256643 in #17


Epoch #60: 10001it [00:08, 1196.39it/s, env_step=600000, gradient_step=60000, len=104, n/ep=11, n/st=1000, rew=-184.19]                           


Epoch #60: test_reward: -183.751990 ± 88.434536, best_reward: -65.217313 ± 36.256643 in #17


Epoch #61: 10001it [00:08, 1159.78it/s, env_step=610000, gradient_step=61000, len=103, n/ep=8, n/st=1000, rew=-266.72]                           


Epoch #61: test_reward: -306.181012 ± 73.752043, best_reward: -65.217313 ± 36.256643 in #17


Epoch #62: 10001it [00:08, 1200.15it/s, env_step=620000, gradient_step=62000, len=97, n/ep=11, n/st=1000, rew=-426.67]                           


Epoch #62: test_reward: -474.293960 ± 103.718725, best_reward: -65.217313 ± 36.256643 in #17


Epoch #63: 10001it [00:08, 1239.65it/s, env_step=630000, gradient_step=63000, len=90, n/ep=11, n/st=1000, rew=-916.57]                           


Epoch #63: test_reward: -850.159845 ± 150.801519, best_reward: -65.217313 ± 36.256643 in #17


Epoch #64: 10001it [00:08, 1195.13it/s, env_step=640000, gradient_step=64000, len=80, n/ep=14, n/st=1000, rew=-720.81]                           


Epoch #64: test_reward: -696.590309 ± 90.409364, best_reward: -65.217313 ± 36.256643 in #17


Epoch #65: 10001it [00:08, 1190.90it/s, env_step=650000, gradient_step=65000, len=211, n/ep=2, n/st=1000, rew=-1463.49]                           


Epoch #65: test_reward: -1265.160690 ± 316.202581, best_reward: -65.217313 ± 36.256643 in #17


Epoch #66: 10001it [00:08, 1127.10it/s, env_step=660000, gradient_step=66000, len=145, n/ep=6, n/st=1000, rew=-384.11]                           


Epoch #66: test_reward: -301.785425 ± 170.205287, best_reward: -65.217313 ± 36.256643 in #17


Epoch #67: 10001it [00:08, 1152.44it/s, env_step=670000, gradient_step=67000, len=92, n/ep=12, n/st=1000, rew=-380.36]                           


Epoch #67: test_reward: -408.150153 ± 156.751139, best_reward: -65.217313 ± 36.256643 in #17


Epoch #68: 10001it [00:09, 1084.77it/s, env_step=680000, gradient_step=68000, len=68, n/ep=16, n/st=1000, rew=-348.53]                           


Epoch #68: test_reward: -369.051667 ± 114.660821, best_reward: -65.217313 ± 36.256643 in #17


Epoch #69: 10001it [00:08, 1182.42it/s, env_step=690000, gradient_step=69000, len=65, n/ep=17, n/st=1000, rew=-516.64]                           


Epoch #69: test_reward: -535.580100 ± 165.383695, best_reward: -65.217313 ± 36.256643 in #17


Epoch #70: 10001it [00:09, 1098.79it/s, env_step=700000, gradient_step=70000, len=70, n/ep=15, n/st=1000, rew=-567.23]                           


Epoch #70: test_reward: -501.419211 ± 152.263085, best_reward: -65.217313 ± 36.256643 in #17


Epoch #71: 10001it [00:09, 1083.34it/s, env_step=710000, gradient_step=71000, len=70, n/ep=15, n/st=1000, rew=-212.91]                           


Epoch #71: test_reward: -203.651494 ± 81.385847, best_reward: -65.217313 ± 36.256643 in #17


Epoch #72: 10001it [00:09, 1056.30it/s, env_step=720000, gradient_step=72000, len=76, n/ep=14, n/st=1000, rew=-119.55]                           


Epoch #72: test_reward: -141.532884 ± 86.548769, best_reward: -65.217313 ± 36.256643 in #17


Epoch #73: 10001it [00:09, 1031.78it/s, env_step=730000, gradient_step=73000, len=71, n/ep=15, n/st=1000, rew=-386.45]                           


Epoch #73: test_reward: -370.357239 ± 122.017216, best_reward: -65.217313 ± 36.256643 in #17


Epoch #74: 10001it [00:09, 1046.19it/s, env_step=740000, gradient_step=74000, len=76, n/ep=15, n/st=1000, rew=-518.24]                           


Epoch #74: test_reward: -548.031707 ± 110.525405, best_reward: -65.217313 ± 36.256643 in #17


Epoch #75: 10001it [00:09, 1029.13it/s, env_step=750000, gradient_step=75000, len=91, n/ep=11, n/st=1000, rew=-808.19]                           


Epoch #75: test_reward: -828.343293 ± 166.489207, best_reward: -65.217313 ± 36.256643 in #17


Epoch #76: 10001it [00:09, 1038.14it/s, env_step=760000, gradient_step=76000, len=73, n/ep=16, n/st=1000, rew=-587.53]                           


Epoch #76: test_reward: -598.245002 ± 127.742102, best_reward: -65.217313 ± 36.256643 in #17


Epoch #77: 10001it [00:09, 1033.55it/s, env_step=770000, gradient_step=77000, len=64, n/ep=16, n/st=1000, rew=-338.49]                           


Epoch #77: test_reward: -304.709352 ± 48.404254, best_reward: -65.217313 ± 36.256643 in #17


Epoch #78: 10001it [00:09, 1037.29it/s, env_step=780000, gradient_step=78000, len=101, n/ep=10, n/st=1000, rew=-57.52]                           


Epoch #78: test_reward: -152.940843 ± 116.106946, best_reward: -65.217313 ± 36.256643 in #17


Epoch #79: 10001it [00:09, 1054.63it/s, env_step=790000, gradient_step=79000, len=83, n/ep=13, n/st=1000, rew=-621.64]                            


Epoch #79: test_reward: -676.368213 ± 62.090207, best_reward: -65.217313 ± 36.256643 in #17


Epoch #80: 10001it [00:09, 1069.41it/s, env_step=800000, gradient_step=80000, len=82, n/ep=11, n/st=1000, rew=-679.76]                           


Epoch #80: test_reward: -642.337966 ± 58.066154, best_reward: -65.217313 ± 36.256643 in #17


Epoch #81: 10001it [00:09, 1071.76it/s, env_step=810000, gradient_step=81000, len=70, n/ep=16, n/st=1000, rew=-276.97]                           


Epoch #81: test_reward: -251.992997 ± 106.839413, best_reward: -65.217313 ± 36.256643 in #17


Epoch #82: 10001it [00:09, 1059.11it/s, env_step=820000, gradient_step=82000, len=70, n/ep=13, n/st=1000, rew=-151.41]                           


Epoch #82: test_reward: -154.407515 ± 58.692348, best_reward: -65.217313 ± 36.256643 in #17


Epoch #83: 10001it [00:09, 1070.60it/s, env_step=830000, gradient_step=83000, len=69, n/ep=13, n/st=1000, rew=-126.02]                           


Epoch #83: test_reward: -128.845742 ± 35.224586, best_reward: -65.217313 ± 36.256643 in #17


Epoch #84: 10001it [00:09, 1054.97it/s, env_step=840000, gradient_step=84000, len=75, n/ep=14, n/st=1000, rew=-204.80]                           


Epoch #84: test_reward: -221.853995 ± 78.356140, best_reward: -65.217313 ± 36.256643 in #17


Epoch #85: 10001it [00:09, 1059.38it/s, env_step=850000, gradient_step=85000, len=74, n/ep=15, n/st=1000, rew=-385.10]                           


Epoch #85: test_reward: -389.649926 ± 70.230737, best_reward: -65.217313 ± 36.256643 in #17


Epoch #86: 10001it [00:09, 1054.49it/s, env_step=860000, gradient_step=86000, len=67, n/ep=15, n/st=1000, rew=-486.08]                           


Epoch #86: test_reward: -539.008535 ± 144.560259, best_reward: -65.217313 ± 36.256643 in #17


Epoch #87: 10001it [00:09, 1068.46it/s, env_step=870000, gradient_step=87000, len=68, n/ep=16, n/st=1000, rew=-582.22]                           


Epoch #87: test_reward: -563.786373 ± 157.117740, best_reward: -65.217313 ± 36.256643 in #17


Epoch #88: 10001it [00:09, 1056.68it/s, env_step=880000, gradient_step=88000, len=71, n/ep=17, n/st=1000, rew=-597.62]                           


Epoch #88: test_reward: -558.907547 ± 153.406500, best_reward: -65.217313 ± 36.256643 in #17


Epoch #89: 10001it [00:09, 1063.62it/s, env_step=890000, gradient_step=89000, len=65, n/ep=16, n/st=1000, rew=-311.81]                           


Epoch #89: test_reward: -259.443266 ± 43.272568, best_reward: -65.217313 ± 36.256643 in #17


Epoch #90: 10001it [00:09, 1044.38it/s, env_step=900000, gradient_step=90000, len=74, n/ep=15, n/st=1000, rew=-104.87]                           


Epoch #90: test_reward: -102.350552 ± 28.547113, best_reward: -65.217313 ± 36.256643 in #17


Epoch #91: 10001it [00:09, 1063.30it/s, env_step=910000, gradient_step=91000, len=88, n/ep=11, n/st=1000, rew=-271.68]                           


Epoch #91: test_reward: -253.214619 ± 72.859472, best_reward: -65.217313 ± 36.256643 in #17


Epoch #92: 10001it [00:09, 1042.79it/s, env_step=920000, gradient_step=92000, len=88, n/ep=11, n/st=1000, rew=-395.49]                           


Epoch #92: test_reward: -432.639068 ± 91.140093, best_reward: -65.217313 ± 36.256643 in #17


Epoch #93: 10001it [00:09, 1017.91it/s, env_step=930000, gradient_step=93000, len=116, n/ep=8, n/st=1000, rew=-892.00]                           


Epoch #93: test_reward: -800.659565 ± 167.388424, best_reward: -65.217313 ± 36.256643 in #17


Epoch #94: 10001it [00:08, 1143.19it/s, env_step=940000, gradient_step=94000, len=86, n/ep=13, n/st=1000, rew=-717.76]                           


Epoch #94: test_reward: -659.671709 ± 99.198431, best_reward: -65.217313 ± 36.256643 in #17


Epoch #95: 10001it [00:08, 1209.07it/s, env_step=950000, gradient_step=95000, len=142, n/ep=6, n/st=1000, rew=-241.81]                           


Epoch #95: test_reward: -214.341062 ± 136.248725, best_reward: -65.217313 ± 36.256643 in #17


Epoch #96: 10001it [00:09, 1073.81it/s, env_step=960000, gradient_step=96000, len=201, n/ep=8, n/st=1000, rew=-244.73]                           


Epoch #96: test_reward: -243.602194 ± 97.003406, best_reward: -65.217313 ± 36.256643 in #17


Epoch #97: 10001it [00:09, 1075.76it/s, env_step=970000, gradient_step=97000, len=78, n/ep=14, n/st=1000, rew=-412.89]                           


Epoch #97: test_reward: -413.610124 ± 126.027161, best_reward: -65.217313 ± 36.256643 in #17


Epoch #98: 10001it [00:09, 1056.99it/s, env_step=980000, gradient_step=98000, len=67, n/ep=19, n/st=1000, rew=-407.70]                           


Epoch #98: test_reward: -422.675252 ± 81.099067, best_reward: -65.217313 ± 36.256643 in #17


Epoch #99: 10001it [00:08, 1120.75it/s, env_step=990000, gradient_step=99000, len=65, n/ep=16, n/st=1000, rew=-497.18]                           


Epoch #99: test_reward: -522.332434 ± 121.082275, best_reward: -65.217313 ± 36.256643 in #17


Epoch #100: 10001it [00:09, 1025.99it/s, env_step=1000000, gradient_step=100000, len=63, n/ep=13, n/st=1000, rew=-548.16]                          


Epoch #100: test_reward: -557.403129 ± 131.678031, best_reward: -65.217313 ± 36.256643 in #17
InfoStats(gradient_step=100000,
          best_reward=-65.21731272085137,
          best_reward_std=36.256643477845046,
          train_step=1000000,
          train_episode=12401,
          test_step=8607013,
          test_episode=101000,
          timing=TimingStats(total_time=2518.535719871521,
                             train_time=861.0854756832123,
                             train_time_collect=196.2006607055664,
                             train_time_update=634.9471399784088,
                             test_time=1657.4502441883087,
                             update_speed=1161.3248954252406))


In [9]:
# Let's watch its performance!
test_envs.seed(seed)
test_collector.reset()
collector_stats = test_collector.collect(n_episode = 10, render = True)
print(collector_stats)

CollectStats(n_collected_episodes=10, n_collected_steps=653, collect_time=101.52879357337952, collect_speed=6.431672996567688, returns=array([ -447.82449881,  -406.19764223,  -530.63178111,  -474.35582903,
        -387.41465344,  -535.55734933,  -529.98647276,  -613.17937989,
        -549.11003256, -1086.11987812]), returns_stat=SequenceSummaryStats(mean=-556.037751726602, std=188.49712234988263, max=-387.41465343624105, min=-1086.1198781225087), lens=array([ 51,  57,  58,  58,  58,  59,  64,  72,  75, 101]), lens_stat=SequenceSummaryStats(mean=65.3, std=13.740815114104404, max=101.0, min=51.0))


In [10]:
# lunvh tensorboard
from bbrl_utils.notebook import setup_tensorboard

setup_tensorboard("./log/")

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 28284), started 5:29:49 ago. (Use '!kill 28284' to kill it.)