In [1]:
import argparse
import logging

# Experiment settings are declared as command-line style options so the same
# code can be driven from a script. Each entry is (flag, type, default).
parser = argparse.ArgumentParser()
for _flag, _type, _default in (
    ('--env',          str,   'MountainCar-v0'),
    ('--seed',         int,   0),
    ('--gpu',          int,   0),
    ('--outdir',       str,   'results'),
    ('--beta',         float, 1e-4),
    ('--batchsize',    int,   10),
    ('--steps',        int,   1000),
    ('--lr',           float, 1e-3),
    ('--logger-level', int,   logging.DEBUG),
):
    parser.add_argument(_flag, type=_type, default=_default)

# parse_known_args() hands back unrecognised argv entries in `unknown` instead
# of aborting on them (presumably needed because the notebook kernel is
# launched with its own arguments -- confirm).
args, unknown = parser.parse_known_args()

# Root logger verbosity comes from --logger-level (defaults to DEBUG).
logging.basicConfig(level=args.logger_level)

## Make environment

In [2]:
import gym
import chainerrl

# Build the Gym environment named by --env, then stack the ChainerRL wrappers
# on top of it in order.
env = gym.make(args.env)
for _wrapper in (
    chainerrl.wrappers.CastObservationToFloat32,  # model uses float32, so observations are cast to float32
    chainerrl.wrappers.Render,                    # NOTE(review): presumably renders the env while stepping -- confirm
):
    env = _wrapper(env)

# Seed through ChainerRL's helper (it is also given the GPU id), then seed the
# environment itself. The env.seed(...) call is left as the final expression,
# so its return value is what the cell displays.
chainerrl.misc.set_random_seed(args.seed, gpus=(args.gpu,))
env.seed(args.seed)


--------------------------------------------------------------------------------
CuPy (cupy-cuda116) version 10.5.0 may not be compatible with this version of Chainer.
Please consider installing the supported version by running:
  $ pip install 'cupy-cuda116>=7.7.0,<8.0.0'

See the following page for more details:
  https://docs.cupy.dev/en/latest/install.html
--------------------------------------------------------------------------------



[0]

## Neural network policy and optimizer

In [3]:
import chainer

# Spaces of the (wrapped) environment; they determine the network's input and
# output sizes.
obs_space = env.observation_space
action_space = env.action_space

# Fully connected softmax policy: input width is the number of elements in the
# observation lower-bound array, output width is the number of discrete actions.
n_observation_dims = obs_space.low.size
n_discrete_actions = action_space.n
model = chainerrl.policies.FCSoftmaxPolicy(
    n_observation_dims,
    n_discrete_actions,
    n_hidden_layers=2,
    n_hidden_channels=200,
    nonlinearity=chainer.functions.leaky_relu,
)

# A negative --gpu value skips the GPU branch; otherwise select that device and
# move the model's parameters onto it.
run_on_gpu = args.gpu >= 0
if run_on_gpu:
    gpu_device = chainer.cuda.get_device_from_id(args.gpu)
    gpu_device.use()
    model.to_gpu(args.gpu)

# Adam with step size --lr. The optimizer is bound to the model first and the
# gradient-clipping hook (threshold 1) is attached afterwards.
gradient_clipping = chainer.optimizer.GradientClipping(1)
opt = chainer.optimizers.Adam(alpha=args.lr)
opt.setup(model)
opt.add_hook(gradient_clipping)

## Agent

In [4]:
# REINFORCE agent built from the policy and its optimizer; the tunable options
# come from the parsed arguments.
# NOTE(review): beta is presumably the entropy-regularisation weight and
# batchsize the number of episodes per update -- confirm against ChainerRL docs.
reinforce_options = {
    'beta': args.beta,
    'batchsize': args.batchsize,
}
agent = chainerrl.agents.REINFORCE(model, opt, **reinforce_options)

## Training

In [5]:
# prepare_output_dir receives the parsed arguments and the --outdir value; the
# value it returns is used as the training output directory.
output_dir = chainerrl.experiments.prepare_output_dir(args, args.outdir)

# Per-episode step cap taken from the environment's registered spec.
episode_step_limit = env.spec.max_episode_steps

# Run training for --steps environment steps in total.
chainerrl.experiments.train_agent(
    agent=agent,
    env=env,
    steps=args.steps,
    outdir=output_dir,
    max_episode_len=episode_step_limit,
)

DEBUG:chainerrl.agents.reinforce:t:1 r:0 a:0 action_distrib:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[0.23584689 0.07190295 0.20392884]] probs:[[0.35494024 0.3012696  0.34379014]] entropy:[1.0961702]
DEBUG:chainerrl.agents.reinforce:t:2 r:-1.0 a:0 action_distrib:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[0.23658404 0.07204361 0.20410742]] probs:[[0.3550722  0.30120188 0.34372592]] entropy:[1.0961571]
DEBUG:chainerrl.agents.reinforce:t:3 r:-1.0 a:1 action_distrib:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[0.23803629 0.07239079 0.20485845]] probs:[[0.35527602 0.3010419  0.3436821 ]] entropy:[1.0961292]
DEBUG:chainerrl.agents.reinforce:t:4 r:-1.0 a:2 action_distrib:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[0.23960859 0.07284409 0.20608255]] probs:[[0.35543817 0.3008425  0.34371927]] entropy:[1.0960975]
DEBUG:chainerrl.agents.reinforce:t:5 r:-1.0 a:0 action_distrib:SoftmaxDistribution(beta=1.0, min_prob=0.0) logits:[[0.24077293 0.07324953 0.2073511

: 