In [1]:
import gymnasium as gym
import torch
from torch.utils.tensorboard import SummaryWriter
import baird
import tianshou as ts
from tianshou.data import CollectStats
from tianshou.utils.space_info import SpaceInfo
import tianshou.policy.modelfree.dqn as customlogs
%load_ext autoreload
%autoreload 2

In [2]:
task = "BairdsCounterexample-v0"
lr, epoch, batch_size = 1e-3, 10, 64
train_num, test_num = 10, 100
gamma, n_step, target_freq = 0.9, 3, 320
buffer_size = 20000
eps_train, eps_test = 0.1, 0.05
step_per_epoch, step_per_collect = 10000, 10
# You can also try SubprocVectorEnv, which will use parallelization
train_envs = ts.env.DummyVectorEnv([lambda: gym.make(task) for _ in range(train_num)])
test_envs = ts.env.DummyVectorEnv([lambda: gym.make(task) for _ in range(test_num)])
from tianshou.utils.net.common import Net
logger = ts.utils.TensorboardLogger(SummaryWriter("log/dqn",filename_suffix="hi"))  # TensorBoard is supported!

# Note: You can easily define other networks.
# See https://tianshou.readthedocs.io/en/master/01_tutorials/00_dqn.html#build-the-network
env = gym.make(task, max_episode_steps=100)
assert isinstance(env.action_space, gym.spaces.Discrete)
space_info = SpaceInfo.from_env(env)
state_shape = space_info.observation_info.obs_shape
action_shape = space_info.action_info.action_shape

net = Net(state_shape=(state_shape,), action_shape=action_shape, hidden_sizes=[128, 128, 128])
optim = torch.optim.Adam(net.parameters(), lr=lr)

In [3]:
policy: ts.policy.DQNPolicy = ts.policy.DQNPolicy(
    model=net,
    optim=optim,
    discount_factor=gamma,
    action_space=env.action_space,
    estimation_step=n_step,
    target_update_freq=target_freq,
)
train_collector = ts.data.Collector[CollectStats](
    policy,
    train_envs,
    ts.data.VectorReplayBuffer(buffer_size, train_num),
    exploration_noise=True,
)
test_collector = ts.data.Collector[CollectStats](
    policy,
    test_envs,
    exploration_noise=True,
)  # because DQN uses epsilon-greedy method
def stop_fn(mean_rewards: float) -> bool:
    if env.spec:
        if not env.spec.reward_threshold:
            return False
        else:
            return mean_rewards >= env.spec.reward_threshold
    return False

In [4]:

result = ts.trainer.OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=2,
        step_per_epoch=step_per_epoch,
        step_per_collect=step_per_collect,
        episode_per_test=test_num,
        batch_size=batch_size,
        update_per_step=1 / step_per_collect,
        train_fn=lambda epoch, env_step: policy.set_eps(eps_train),
        test_fn=lambda epoch, env_step: policy.set_eps(eps_test),
        stop_fn=stop_fn,        
).run()
print(f"Finished training in {result.timing.total_time} seconds")

KeyboardInterrupt: 

In [None]:
tianshou.policy.modelfree.dqn.custom_logs

[tensor([ 1.0009,  1.0009,  1.0009,  1.0009,  1.0009, -0.0045, -0.0045, -0.0045,
         -0.0045, -0.0045, -0.0045, -0.0045, -0.0045,  1.0018,  1.0018,  1.0018,
          1.0018,  1.0018,  1.0018,  1.0018,  1.0018,  0.9974,  0.9974,  0.9974,
          0.9974,  0.9974,  0.9974,  1.0018,  1.0018,  1.0018,  1.0018,  1.0018,
          1.0018,  1.0018,  1.0018, -0.0265, -0.0265, -0.0265, -0.0265, -0.0265,
         -0.0054, -0.0054, -0.0054, -0.0054, -0.0054, -0.0054, -0.0054,  0.9773,
          0.9773,  0.9773,  0.9773,  0.9773,  0.9773,  0.9773,  0.9773, -0.0199,
         -0.0199, -0.0199, -0.0199, -0.0199, -0.0199, -0.0199, -0.0199, -0.0199],
        grad_fn=<SubBackward0>),
 tensor([0.9487, 0.9487, 0.9487, 1.8539, 0.9487, 1.8539, 0.9487, 0.9281, 0.8839,
         0.8839, 0.8839, 0.8839, 0.8839, 0.8839, 0.8839, 0.8839, 1.8559, 0.9487,
         0.9487, 1.8559, 1.8463, 0.9487, 1.8463, 0.9487, 0.9487, 0.9487, 1.8463,
         1.8463, 0.9487, 1.8463, 0.9487, 1.8559, 1.8559, 1.8559, 1.8559, 0.

In [None]:
print(len(tianshou.policy.modelfree.dqn.custom_logs))

4000


In [None]:
from pprint import pprint
pprint(policy.model.model.model[0].weight)
pprint(policy.model_old.model.model[0].weight)

Parameter containing:
tensor([[-9.0100e-02, -3.3213e-01, -3.2691e-01,  2.0021e-01, -1.7987e-03,
         -8.8314e-02,  2.8549e-01],
        [-9.1772e-02, -3.7463e-02, -2.9622e-01, -2.8611e-01,  6.7334e-03,
          7.0876e-02, -4.5138e-02],
        [-1.1153e-01,  1.3935e-01, -2.3483e-01,  2.6344e-01, -2.4119e-01,
         -1.4259e-01, -3.2883e-01],
        [ 6.5022e-02, -2.6737e-01, -1.3544e-01, -4.8117e-02, -2.5582e-01,
          4.2197e-02,  1.3000e-01],
        [-2.7163e-02, -5.6940e-02, -9.3418e-02,  2.3663e-01, -3.7087e-01,
         -2.2895e-01,  2.0581e-01],
        [-3.5962e-01, -2.6773e-01, -1.5376e-01, -2.5040e-01, -2.1259e-01,
         -9.1208e-03, -1.3672e-01],
        [-1.0679e-01,  9.9381e-02, -2.0502e-01,  2.2630e-01, -1.2168e-01,
         -8.2128e-02,  9.1501e-02],
        [ 8.7631e-02, -9.4786e-03, -3.7020e-01, -3.3685e-01, -2.3851e-01,
          2.3932e-01,  4.0703e-02],
        [ 1.7384e-01,  2.0078e-01, -2.7685e-01,  9.1853e-02,  3.4711e-01,
          2.2536e-01, -1

In [None]:
# watch performance
policy.set_eps(eps_test)
collector = ts.data.Collector[CollectStats](policy, env, exploration_noise=True, reset_before_collect=True)
collector.collect(n_episode=100, render=1 / 35)

TypeError: Collector.__init__() got an unexpected keyword argument 'reset_before_collect'