In [17]:
import argparse
import logging

# Every command-line option of the experiment as one (flag, type, default) row.
# Rows are registered in order, so the generated --help lists them top to bottom.
_OPTION_TABLE = (
    ('--env',               str,   'MountainCar-v0'),
    ('--seed',              int,   0),
    ('--gpu',               int,   0),
    ('--outdir',            str,   'results'),
    ('--beta',              float, 1e-4),
    ('--batchsize',         int,   128),
    ('--steps',             int,   1000),
    ('--lr',                float, 1e-3),
    ('--logger-level',      int,   logging.DEBUG),
    ('--n-hidden-channels', int,   100),
    ('--n-hidden-layers',   int,   2),
    ('--eval-interval',     int,   10 ** 4),
    ('--eval-n-runs',       int,   1000),
)

parser = argparse.ArgumentParser()
for _flag, _kind, _fallback in _OPTION_TABLE:
    parser.add_argument(_flag, type=_kind, default=_fallback)

# parse_known_args() hands back unrecognised argv entries in `unknown` instead
# of exiting -- presumably needed because the notebook kernel is launched with
# flags of its own (TODO confirm).
args, unknown = parser.parse_known_args()

# logging.basicConfig(level=args.logger_level)

## Environment

In [18]:
# ENVIRONMENT
import gym
import chainerrl

# Build the environment in layers: the raw Gym env, then a float32 cast of the
# observations (the model uses float32), then the rendering wrapper on top.
base_env = gym.make(args.env)
float_env = chainerrl.wrappers.CastObservationToFloat32(base_env)
env = chainerrl.wrappers.Render(float_env)

# Seed ChainerRL's random sources (for the selected GPU as well) and then the
# environment, both from the same --seed value.
run_seed = args.seed
chainerrl.misc.set_random_seed(run_seed, gpus=(args.gpu,))

# Kept as the final expression so the cell shows what env.seed() returns.
env.seed(run_seed)

[0]

In [19]:
import chainerrl
from chainerrl.agents.dqn import DQN
from chainerrl import q_functions, explorers, replay_buffer
from chainer import optimizers
# Problem dimensions, read from the wrapped environment.
timestep_limit = env.spec.max_episode_steps
obs_space = env.observation_space
obs_size = obs_space.low.size
action_space = env.action_space
n_actions = action_space.n

# Fully-connected Q-function over discrete actions; width and depth come from
# --n-hidden-channels / --n-hidden-layers.
q_func = q_functions.FCStateQFunctionWithDiscreteAction(
    obs_size, n_actions,
    n_hidden_channels=args.n_hidden_channels,
    n_hidden_layers=args.n_hidden_layers)

# Use epsilon-greedy for exploration: epsilon goes linearly from 1.0 down to
# 0.1 over 1000 steps, with random actions drawn via action_space.sample.
explorer = explorers.LinearDecayEpsilonGreedy(1.0, 0.1, 1000, action_space.sample)

# Honour --lr. The optimizer used to be built as Adam(), which silently ignored
# the option; Adam's step size is called `alpha`, and the --lr default (1e-3)
# equals Chainer's own default, so default runs are unaffected.
opt = optimizers.Adam(alpha=args.lr)
opt.setup(q_func)

rbuf_capacity = 500000
rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

# DQN agent. Learning starts once the replay buffer holds 1000 transitions, the
# model is updated every step, and the target network is hard-copied every 100
# steps. NOTE(review): soft_update_tau should only matter for
# target_update_method='soft'; it is kept so the call is otherwise unchanged.
agent = DQN(q_func, opt, rbuf, gpu=args.gpu, gamma=0.999,
            explorer=explorer, replay_start_size=1000,
            target_update_interval=100,
            update_interval=1,
            minibatch_size=args.batchsize,
            target_update_method='hard',
            soft_update_tau=0.01,
            )

In [20]:
from chainerrl import experiments

# Total number of environment steps to train for in this run.
train_steps = 8000

# Train the DQN agent, evaluating on the same (shared) environment.
# The evaluation interval was hard-coded to 10000, which is longer than the
# 8000-step run, so no evaluation was ever triggered. It now follows
# --eval-interval (same default, 10 ** 4) but is clamped to the run length so
# that an evaluation runs when training finishes.
# Results are written under --outdir (default 'results', as before).
experiments.train_agent_with_evaluation(
    agent=agent, env=env, steps=train_steps,
    eval_n_steps=None,
    eval_n_episodes=100,
    eval_interval=min(args.eval_interval, train_steps),
    outdir=args.outdir, eval_env=env,
    train_max_episode_len=1000)

DEBUG:chainerrl.explorers.epsilon_greedy:t:0 a:0 non-greedy
DEBUG:chainerrl.agents.dqn:t:1 q:0.314961314201355 action_value:DiscreteActionValue greedy_actions:[0] q_values:[[0.3149613  0.24498275 0.09423555]]
DEBUG:chainerrl.agents.dqn:t:1 r:0 a:0
DEBUG:chainerrl.explorers.epsilon_greedy:t:1 a:0 non-greedy
DEBUG:chainerrl.agents.dqn:t:2 q:0.31510815024375916 action_value:DiscreteActionValue greedy_actions:[0] q_values:[[0.31510815 0.24504659 0.09443767]]
DEBUG:chainerrl.agents.dqn:t:2 r:-1.0 a:0
DEBUG:chainerrl.explorers.epsilon_greedy:t:2 a:1 non-greedy
DEBUG:chainerrl.agents.dqn:t:3 q:0.31552746891975403 action_value:DiscreteActionValue greedy_actions:[0] q_values:[[0.31552747 0.24532276 0.0947202 ]]
DEBUG:chainerrl.agents.dqn:t:3 r:-1.0 a:1
DEBUG:chainerrl.explorers.epsilon_greedy:t:3 a:0 non-greedy
DEBUG:chainerrl.agents.dqn:t:4 q:0.3159292936325073 action_value:DiscreteActionValue greedy_actions:[0] q_values:[[0.3159293  0.24568439 0.09468596]]
DEBUG:chainerrl.agents.dqn:t:4 r:-1.