In [None]:
pip install stable-baselines3[extra]

In [2]:
%load_ext tensorboard

import gym
import torch as th
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize, VecMonitor
from typing import Callable

In [3]:
def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """Create a schedule that scales ``initial_value`` by the remaining progress.

    :param initial_value: the value the schedule yields when
        ``progress_remaining`` equals 1.
    :return: a function taking ``progress_remaining`` and returning
        ``progress_remaining * initial_value``.
    """
    def scaled_by_progress(progress_remaining: float) -> float:
        # The result shrinks linearly with progress_remaining and is 0 when it is 0.
        current_value = progress_remaining * initial_value
        return current_value

    return scaled_by_progress

In [None]:
log_name = "A2C_CartPole-v1_TensorBoard"
env = make_vec_env("CartPole-v1",n_envs=8)
policy_kwargs = dict(ortho_init=False, activation_fn=th.nn.ReLU,
                     net_arch=[dict(pi=[128, 128], vf=[128, 128])])

model = A2C("MlpPolicy", env, policy_kwargs=policy_kwargs, gamma=0.999, normalize_advantage=True, max_grad_norm=0.6, use_rms_prop=True, gae_lambda=0.98, n_steps=64, learning_rate=linear_schedule(0.003479714650024201),
            ent_coef=1.2071544125414278e-06, vf_coef=0.15287763485632272, verbose=1, tensorboard_log=f"./{log_name}/")
model.learn(total_timesteps=500000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
%tensorboard --logdir /content/A2C_CartPole-v1_TensorBoard

In [None]:
log_name = "A2C_MountainCar-v0_TensorBoard"
env = make_vec_env("MountainCar-v0",n_envs=16)
env = VecNormalize(env,norm_reward=True,norm_obs=True)
policy_kwargs = dict(ortho_init=False, activation_fn=th.nn.Tanh,
                     net_arch=[dict(pi=[64, 64], vf=[64, 64])])

venv = VecMonitor(venv=env)
model = A2C("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1, gamma=0.999, gae_lambda=0.8, n_steps=32, learning_rate=linear_schedule(0.009446314774919641),
            ent_coef=1.76756270433399e-07, vf_coef=0.12457053994527265, max_grad_norm=0.6, normalize_advantage=False, use_rms_prop=True, tensorboard_log=f"./{log_name}/")
model.learn(total_timesteps=1000000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
%tensorboard --logdir /content/A2C_MountainCar-v0_TensorBoard

In [None]:
log_name = "A2C_MountainCarContinuous-v0_TensorBoard"
env = make_vec_env("MountainCarContinuous-v0",n_envs=4)
env = VecNormalize(env,norm_reward=True,norm_obs=True)
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=[dict(pi=[64, 64], vf=[64, 64])])

venv = VecMonitor(venv=env)
model = A2C("MlpPolicy", env, policy_kwargs=policy_kwargs,use_rms_prop=True, verbose=1, tensorboard_log=f"./{log_name}/")
model.learn(total_timesteps=150000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
%tensorboard --logdir /content/A2C_MountainCarContinuous-v0_TensorBoard

In [None]:
log_name = "A2C_Acrobot-v1_TensorBoard"
env = make_vec_env("Acrobot-v1",n_envs=16)
env = VecNormalize(env,norm_reward=True,norm_obs=True)
policy_kwargs = dict(ortho_init=True,activation_fn=th.nn.Tanh,
                     net_arch=[dict(pi=[128, 128], vf=[128, 128])])

venv = VecMonitor(venv=env)
model = A2C("MlpPolicy", env, policy_kwargs=policy_kwargs, use_rms_prop=True, n_steps=16, max_grad_norm=2, verbose=1, tensorboard_log=f"./{log_name}/")
model.learn(total_timesteps=500000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
%tensorboard --logdir /content/A2C_Acrobot-v1_TensorBoard