# One Leg training using PPO2 from StableBaselines

## Training with PPO2

These hyperparameters originate from the Reacher hyperparameters in the Stable Baselines Zoo.

In [4]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import PPO2

# TODO: multiprocessing issue
env = make_vec_env(
    'gym_kraby:OneLegBulletEnv-v0',
    n_envs=8,
)

# Training curves can be inspected with:
#   tensorboard --logdir notebooks/stablebaselines/tensorboard_log/one_leg
model = PPO2(
    policy=MlpPolicy,
    env=env,

    # --- Rollout collection ---
    n_steps=2048,  # steps per environment per update (batch size = n_steps * n_envs)
    gamma=0.99,  # discount factor
    lam=0.95,  # Generalized Advantage Estimator bias/variance trade-off factor

    # --- Optimisation ---
    learning_rate=2.5e-4,
    nminibatches=32,  # training minibatches per update; with a recurrent policy the
                      # number of parallel envs should be a multiple of this value
    noptepochs=10,  # epochs spent optimising the surrogate on each batch
    cliprange=0.2,  # clipping parameter (depends on the reward scaling)
    ent_coef=0.0,  # entropy coefficient in the loss

    # --- Logging ---
    verbose=False,
    tensorboard_log="./tensorboard_log/one_leg/",

    # --- For tests ---
    seed=0,  # fixed seed
    n_cpu_tf_sess=1,  # force deterministic results
)
model.learn(total_timesteps=10 ** 6)

<stable_baselines.ppo2.ppo2.PPO2 at 0x7f0a64d9f470>

In [5]:
# Saving model
import os

# Make sure the output directory exists before saving, so that a missing
# folder cannot make the save fail after the long training run above.
os.makedirs("trained_models", exist_ok=True)
model.save("trained_models/one_leg")