Minimal notebook to train an instance of meta-learning Soft Actor-Critic (SAC)

In [1]:
import gym

from stable_baselines3 import mSAC
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy, evaluate_meta_policy

In [2]:
def make_env(env_id, rank, seed=0):
    """
    Build a zero-argument environment factory for a multiprocessed env.

    ``set_random_seed(seed)`` is called once, at the time this function is
    called. The environment itself is only created, and seeded with
    ``seed + rank``, when the returned callable is invoked.

    :param env_id: (str) the environment ID passed to ``gym.make``
    :param rank: (int) index of the subprocess; added to ``seed`` so that
        distinct ranks yield distinct environment seeds
    :param seed: (int) the initial seed for RNG
    :return: (callable) function that creates, seeds and returns the env
    """
    set_random_seed(seed)

    def _init():
        environment = gym.make(env_id)
        # Offset the base seed by the worker index.
        environment.seed(seed + rank)
        return environment

    return _init

In [3]:
# Number of evaluation episodes (scenarios) the model is evaluated on.
N_EVAL = 30
# Number of training epochs.
N_EPOCHS = 3
# Number of timesteps (= gradient steps) per epoch.
N_TIMESTEPS = 2000

In [4]:
# Train and evaluate an mSAC model on a vectorized LunarLanderContinuous env.
#
# Flow: build the vectorized env, create the model, evaluate it once before
# any training, then alternate N_EPOCHS times between training for
# N_TIMESTEPS and evaluating on N_EVAL episodes.

num_cpu = 1  # number of worker subprocesses (one environment each)
env_id = 'LunarLanderContinuous-v2'
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

# Evaluation history: index 0 is the untrained model, index k is the result
# after training epoch k.
meta_reward = []
meta_std = []

# SubprocVecEnv owns worker processes. Everything that can fail runs inside
# try/finally so the workers are shut down even when model construction,
# training or evaluation raises; otherwise they would be left running.
try:
    # learning_rate is not passed, so the mSAC default is used.
    meta_model = mSAC(
        'MlpPolicy',
        env,
        verbose=1,
        policy_kwargs=dict(
            net_arch=[300, 300, 300],
            latent_dim=5,
            hidden_sizes=[200, 200, 200],
        ),
    )

    # Baseline evaluation before any training.
    meta_model_mean_reward_before, meta_model_std_reward_before = evaluate_meta_policy(
        meta_model, env, n_eval_episodes=N_EVAL
    )
    meta_reward.append(meta_model_mean_reward_before)
    meta_std.append(meta_model_std_reward_before)

    for i in range(N_EPOCHS):
        meta_model.learn(total_timesteps=N_TIMESTEPS)
        meta_model_mean_reward, meta_model_std_reward = evaluate_meta_policy(
            meta_model, env, n_eval_episodes=N_EVAL
        )

        meta_reward.append(meta_model_mean_reward)
        meta_std.append(meta_model_std_reward)

        # Print the full history so far after every epoch.
        print('epoch:', i+1)
        print('meta_reward = ', meta_reward)
        print('meta_std = ', meta_std)
finally:
    env.close()

Using cpu device
-------------------------------------
| time/              |              |
|    episodes        | 4            |
|    fps             | 31           |
|    time_elapsed    | 18           |
|    total timesteps | 592          |
| train/             |              |
|    KL_loss         | 1.1450677    |
|    actor_loss      | 2.9          |
|    avg. z          | 0.0067927362 |
|    avg. z var      | 0.33214712   |
|    critic_loss     | 16.5         |
|    ent_coef        | 0.84         |
|    ent_coef_loss   | -0.571       |
|    learning_rate   | 0.0003       |
|    n_updates       | 591          |
-------------------------------------
------------------------------------
| time/              |             |
|    episodes        | 8           |
|    fps             | 30          |
|    time_elapsed    | 34          |
|    total timesteps | 1073        |
| train/             |             |
|    KL_loss         | 1.2840643   |
|    actor_loss      | 10          |
|   

BrokenPipeError: [Errno 32] Broken pipe