In [1]:
from copy import deepcopy
import itertools
import numpy as np
import torch
from torch.optim import Adam
import pybulletgym
import gym
import time
import spinup.algos.pytorch.lstm_sac.core as core
from spinup.utils.logx import EpochLogger


In [2]:
class POMDPWrapper(gym.ObservationWrapper):
    def __init__(self, env_name):
        super().__init__(gym.make(env_name))
        
        # Remove velocity info
        # OpenAIGym
        #  1. MuJoCo
        if env_name == "HalfCheetah-v3" or env_name == "HalfCheetah-v2":
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == "Ant-v3" or env_name == "Ant-v2":
            self.remain_obs_idx = list(np.arange(0, 13)) + list(np.arange(27, 111))
        elif env_name == 'Walker2d-v3' or env_name == "Walker2d-v2":
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == 'Hopper-v3' or env_name == "Hopper-v2":
            self.remain_obs_idx = np.arange(0, 5)
        elif env_name == "InvertedPendulum-v2":
            self.remain_obs_idx = np.arange(0, 2)
        elif env_name == "InvertedDoublePendulum-v2":
            self.remain_obs_idx = list(np.arange(0, 5)) + list(np.arange(8, 11))
        elif env_name == "Swimmer-v3" or env_name == "Swimmer-v2":
            self.remain_obs_idx = np.arange(0, 3)
        elif env_name == "Thrower-v2":
            self.remain_obs_idx = list(np.arange(0, 7)) + list(np.arange(14, 23))
        elif env_name == "Striker-v2":
            self.remain_obs_idx = list(np.arange(0, 7)) + list(np.arange(14, 23))
        elif env_name == "Pusher-v2":
            self.remain_obs_idx = list(np.arange(0, 7)) + list(np.arange(14, 23))
        elif env_name == "Reacher-v2":
            self.remain_obs_idx = list(np.arange(0, 6)) + list(np.arange(8, 11))
        elif env_name == 'Humanoid-v3' or env_name == "Humanoid-v2":
            self.remain_obs_idx = list(np.arange(0, 22)) + list(np.arange(45, 185)) + list(np.arange(269, 376))
        elif env_name == 'HumanoidStandup-v2':
            self.remain_obs_idx = list(np.arange(0, 22)) + list(np.arange(45, 185)) + list(np.arange(269, 376))
        # PyBulletGym
        #  1. MuJoCo
        elif env_name == 'HalfCheetahMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == 'AntMuJoCoEnv-v0':
            self.remain_obs_idx = list(np.arange(0, 13)) + list(np.arange(27, 111))
        elif env_name == 'Walker2DMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == 'HopperMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 7)
        elif env_name == 'InvertedPendulumMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 3)
        elif env_name == 'InvertedDoublePendulumMuJoCoEnv-v0':
            self.remain_obs_idx = list(np.arange(0, 5)) + list(np.arange(8, 11))
        #  2. Roboschool
        elif env_name == 'HalfCheetahPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,26)) - set(np.arange(3,6)))
        elif env_name ==  'AntPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,28)) - set(np.arange(3,6)))
        elif env_name == 'Walker2DPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,22)) - set(np.arange(3,6)))
        elif env_name == 'HopperPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,15)) - set(np.arange(3,6)))
        elif env_name == 'InvertedPendulumPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,5)) - set([1,4]))
        elif env_name == 'InvertedDoublePendulumPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,9)) - set([1,5,8]))
        elif env_name == 'ReacherPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,9)) - set([6,8]))
        else:
            raise ValueError('POMDP for {} is not defined!'.format(env_name))
            
        # Redefine observation_space
        obs_low = np.array([-np.inf for i in range(len(self.remain_obs_idx))], dtype="float32")
        obs_high = np.array([np.inf for i in range(len(self.remain_obs_idx))], dtype="float32")
        self.observation_space = gym.spaces.Box(obs_low, obs_high)
        
    def observation(self, obs):
        return obs.flatten()[self.remain_obs_idx]
    

In [3]:
class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for SAC agents.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}



In [7]:
def sac(env_name, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 
        update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, 
        partially_observable=False,
        logger_kwargs=dict(), save_freq=1):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Wrapper environment if using POMDP
    if partially_observable == True:
        env, test_env = POMDPWrapper(env_name), POMDPWrapper(env_name)
    else:
        env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]   
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False
        
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o,a)
        q2 = ac.q2(o,a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(),
                      Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort 
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), 
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy. 
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()



In [None]:
args = {'env': 'Ant-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'test_pre_feature_extraction_gated_lstm_DDPG_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

In [8]:

args = {'env': 'Ant-v2', 'hid': 256, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': True,
        'exp_name': 'sac_POMDP'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_POMDP\sac_POMDP_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"Ant-v2",
    "epochs":	50,
    "exp_name":	"sac_POMDP",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4E01E31C8>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_POMDP",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_POMDP\\sac_POMDP_s5",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\lingheng\\\\google drive\\\\git_repos\\\\spinningup-new\\\\data\\\\sac_POMDP\\\\sac_POMDP_s5\\\\progress.txt' mode='w' e



[32;1m
Number of parameters: 	 pi: 94992, 	 q1: 93185, 	 q2: 93185
[0m
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |           -52.9 |
|          StdEpRet |            98.1 |
|          MaxEpRet |            42.4 |
|          MinEpRet |            -373 |
|  AverageTestEpRet |             686 |
|      StdTestEpRet |             119 |
|      MaxTestEpRet |             922 |
|      MinTestEpRet |             581 |
|             EpLen |             156 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |            6.45 |
|         StdQ1Vals |            3.93 |
|         MaxQ1Vals |              19 |
|         MinQ1Vals |           -2.48 |
|     AverageQ2Vals |            6.45 |
|         StdQ2Vals |            3.93 |
|         MaxQ2Vals |            18.8 |
|         MinQ2Vals |           -2.43 |
|      AverageLogPi |           -3.49 |
|          StdLogPi |            1.55 |
|      

---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |             227 |
|          StdEpRet |             153 |
|          MaxEpRet |             401 |
|          MinEpRet |            33.4 |
|  AverageTestEpRet |             941 |
|      StdTestEpRet |            32.1 |
|      MaxTestEpRet |             964 |
|      MinTestEpRet |             850 |
|             EpLen |             623 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.2e+04 |
|     AverageQ1Vals |            70.1 |
|         StdQ1Vals |            6.41 |
|         MaxQ1Vals |            86.2 |
|         MinQ1Vals |           -49.7 |
|     AverageQ2Vals |            70.1 |
|         StdQ2Vals |            6.41 |
|         MaxQ2Vals |              86 |
|         MinQ2Vals |           -37.3 |
|      AverageLogPi |           -3.28 |
|          StdLogPi |            1.64 |
|          MaxLogPi |            21.6 |
|          MinLogPi |           -15.6 |


---------------------------------------
|             Epoch |              16 |
|      AverageEpRet |             221 |
|          StdEpRet |             149 |
|          MaxEpRet |             466 |
|          MinEpRet |              15 |
|  AverageTestEpRet |             834 |
|      StdTestEpRet |            10.1 |
|      MaxTestEpRet |             853 |
|      MinTestEpRet |             818 |
|             EpLen |             601 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.4e+04 |
|     AverageQ1Vals |              85 |
|         StdQ1Vals |            6.46 |
|         MaxQ1Vals |            99.4 |
|         MinQ1Vals |           -51.9 |
|     AverageQ2Vals |              85 |
|         StdQ2Vals |            6.44 |
|         MaxQ2Vals |            98.8 |
|         MinQ2Vals |           -44.2 |
|      AverageLogPi |           -3.37 |
|          StdLogPi |            1.62 |
|          MaxLogPi |            22.1 |
|          MinLogPi |           -14.4 |


KeyboardInterrupt: 

In [9]:

args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': True,
        'exp_name': 'sac_POMDP_HalfCheetah'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_POMDP_HalfCheetah\sac_POMDP_HalfCheetah_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"sac_POMDP_HalfCheetah",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4DE3A2108>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_POMDP_HalfCheetah",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_POMDP_HalfCheetah\\sac_POMDP_HalfCheetah_s5",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\lingheng\\\\google drive\\\\git_repos\\\



---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -271 |
|          StdEpRet |              78 |
|          MaxEpRet |            -191 |
|          MinEpRet |            -383 |
|  AverageTestEpRet |           -7.04 |
|      StdTestEpRet |            1.09 |
|      MaxTestEpRet |           -5.71 |
|      MinTestEpRet |            -9.1 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |            4.64 |
|         StdQ1Vals |            2.54 |
|         MaxQ1Vals |            15.3 |
|         MinQ1Vals |          -0.911 |
|     AverageQ2Vals |            4.64 |
|         StdQ2Vals |            2.54 |
|         MaxQ2Vals |            15.4 |
|         MinQ2Vals |          -0.853 |
|      AverageLogPi |            -3.5 |
|          StdLogPi |            1.18 |
|          MaxLogPi |            7.23 |
|          MinLogPi |           -11.5 |


---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |            -162 |
|          StdEpRet |            61.3 |
|          MaxEpRet |           -61.3 |
|          MinEpRet |            -222 |
|  AverageTestEpRet |           -13.3 |
|      StdTestEpRet |            1.27 |
|      MaxTestEpRet |           -10.4 |
|      MinTestEpRet |           -14.9 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |              45 |
|         StdQ1Vals |            3.96 |
|         MaxQ1Vals |              53 |
|         MinQ1Vals |            36.1 |
|     AverageQ2Vals |              45 |
|         StdQ2Vals |            3.96 |
|         MaxQ2Vals |              53 |
|         MinQ2Vals |            36.1 |
|      AverageLogPi |           -3.51 |
|          StdLogPi |            1.12 |
|          MaxLogPi |            10.9 |
|          MinLogPi |           -12.4 |


---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |            -179 |
|          StdEpRet |            36.1 |
|          MaxEpRet |            -146 |
|          MinEpRet |            -238 |
|  AverageTestEpRet |           -8.79 |
|      StdTestEpRet |            1.24 |
|      MaxTestEpRet |           -6.92 |
|      MinTestEpRet |           -10.8 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |            52.8 |
|         StdQ1Vals |            5.57 |
|         MaxQ1Vals |            63.1 |
|         MinQ1Vals |            43.8 |
|     AverageQ2Vals |            52.8 |
|         StdQ2Vals |            5.57 |
|         MaxQ2Vals |            62.9 |
|         MinQ2Vals |            43.9 |
|      AverageLogPi |           -3.63 |
|          StdLogPi |               1 |
|          MaxLogPi |            6.47 |
|          MinLogPi |           -13.5 |


---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |            -157 |
|          StdEpRet |            79.8 |
|          MaxEpRet |           -21.2 |
|          MinEpRet |            -222 |
|  AverageTestEpRet |             -14 |
|      StdTestEpRet |           0.941 |
|      MaxTestEpRet |           -12.9 |
|      MinTestEpRet |           -16.2 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |            54.4 |
|         StdQ1Vals |            6.73 |
|         MaxQ1Vals |            66.6 |
|         MinQ1Vals |            45.3 |
|     AverageQ2Vals |            54.4 |
|         StdQ2Vals |            6.73 |
|         MaxQ2Vals |            66.7 |
|         MinQ2Vals |              45 |
|      AverageLogPi |           -3.67 |
|          StdLogPi |           0.954 |
|          MaxLogPi |            6.59 |
|          MinLogPi |           -11.3 |


---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |            -156 |
|          StdEpRet |            69.3 |
|          MaxEpRet |           -44.2 |
|          MinEpRet |            -224 |
|  AverageTestEpRet |           -24.9 |
|      StdTestEpRet |            1.32 |
|      MaxTestEpRet |           -22.8 |
|      MinTestEpRet |           -26.7 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |            54.8 |
|         StdQ1Vals |            7.25 |
|         MaxQ1Vals |            68.1 |
|         MinQ1Vals |            45.2 |
|     AverageQ2Vals |            54.8 |
|         StdQ2Vals |            7.25 |
|         MaxQ2Vals |            68.3 |
|         MinQ2Vals |            45.4 |
|      AverageLogPi |           -3.68 |
|          StdLogPi |           0.947 |
|          MaxLogPi |             5.8 |
|          MinLogPi |           -11.6 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |            -145 |
|          StdEpRet |            37.2 |
|          MaxEpRet |            -100 |
|          MinEpRet |            -200 |
|  AverageTestEpRet |           -7.59 |
|      StdTestEpRet |           0.862 |
|      MaxTestEpRet |            -6.1 |
|      MinTestEpRet |           -8.97 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.64e+05 |
|     AverageQ1Vals |              55 |
|         StdQ1Vals |            7.37 |
|         MaxQ1Vals |            68.2 |
|         MinQ1Vals |            45.6 |
|     AverageQ2Vals |              55 |
|         StdQ2Vals |            7.37 |
|         MaxQ2Vals |            68.2 |
|         MinQ2Vals |            45.6 |
|      AverageLogPi |           -3.71 |
|          StdLogPi |           0.899 |
|          MaxLogPi |            5.82 |
|          MinLogPi |           -11.4 |


---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |            -163 |
|          StdEpRet |            44.8 |
|          MaxEpRet |            -100 |
|          MinEpRet |            -210 |
|  AverageTestEpRet |           -14.3 |
|      StdTestEpRet |            0.82 |
|      MaxTestEpRet |           -13.1 |
|      MinTestEpRet |           -15.4 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |            54.9 |
|         StdQ1Vals |            7.45 |
|         MaxQ1Vals |            69.5 |
|         MinQ1Vals |            46.1 |
|     AverageQ2Vals |            54.9 |
|         StdQ2Vals |            7.45 |
|         MaxQ2Vals |            69.1 |
|         MinQ2Vals |            45.9 |
|      AverageLogPi |           -3.72 |
|          StdLogPi |           0.886 |
|          MaxLogPi |            6.77 |
|          MinLogPi |             -12 |


In [10]:

args = {'env': 'HalfCheetah-v2', 'hid': 128, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': True,
        'exp_name': 'sac_POMDP_HalfCheetah_2L128'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_POMDP_HalfCheetah_2L128\sac_POMDP_HalfCheetah_2L128_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            128,
            128
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"sac_POMDP_HalfCheetah_2L128",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4DE361F08>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_POMDP_HalfCheetah_2L128",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_POMDP_HalfCheetah_2L128\\sac_POMDP_HalfCheetah_2L128_s5",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\ling



---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -278 |
|          StdEpRet |            43.9 |
|          MaxEpRet |            -219 |
|          MinEpRet |            -342 |
|  AverageTestEpRet |           -53.6 |
|      StdTestEpRet |           0.743 |
|      MaxTestEpRet |           -52.3 |
|      MinTestEpRet |           -54.8 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |             4.5 |
|         StdQ1Vals |            2.35 |
|         MaxQ1Vals |            14.3 |
|         MinQ1Vals |          -0.688 |
|     AverageQ2Vals |             4.5 |
|         StdQ2Vals |            2.35 |
|         MaxQ2Vals |            14.3 |
|         MinQ2Vals |          -0.505 |
|      AverageLogPi |           -3.59 |
|          StdLogPi |           0.975 |
|          MaxLogPi |            1.86 |
|          MinLogPi |           -11.5 |


---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |            -136 |
|          StdEpRet |            71.1 |
|          MaxEpRet |           -14.4 |
|          MinEpRet |            -194 |
|  AverageTestEpRet |           -7.98 |
|      StdTestEpRet |           0.945 |
|      MaxTestEpRet |            -6.4 |
|      MinTestEpRet |           -9.97 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |            45.5 |
|         StdQ1Vals |            6.03 |
|         MaxQ1Vals |            57.6 |
|         MinQ1Vals |              36 |
|     AverageQ2Vals |            45.5 |
|         StdQ2Vals |            6.03 |
|         MaxQ2Vals |            57.4 |
|         MinQ2Vals |            36.1 |
|      AverageLogPi |           -3.57 |
|          StdLogPi |            1.04 |
|          MaxLogPi |            3.45 |
|          MinLogPi |             -12 |


---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |           -94.4 |
|          StdEpRet |            72.3 |
|          MaxEpRet |           -16.7 |
|          MinEpRet |            -180 |
|  AverageTestEpRet |           -6.61 |
|      StdTestEpRet |           0.749 |
|      MaxTestEpRet |           -5.45 |
|      MinTestEpRet |           -7.51 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |            54.7 |
|         StdQ1Vals |             7.8 |
|         MaxQ1Vals |            68.2 |
|         MinQ1Vals |            44.7 |
|     AverageQ2Vals |            54.7 |
|         StdQ2Vals |             7.8 |
|         MaxQ2Vals |              68 |
|         MinQ2Vals |            44.7 |
|      AverageLogPi |           -3.71 |
|          StdLogPi |           0.871 |
|          MaxLogPi |            4.99 |
|          MinLogPi |           -10.8 |


---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |            -114 |
|          StdEpRet |            51.4 |
|          MaxEpRet |           -58.3 |
|          MinEpRet |            -194 |
|  AverageTestEpRet |           -10.3 |
|      StdTestEpRet |            1.21 |
|      MaxTestEpRet |           -7.73 |
|      MinTestEpRet |           -12.2 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |            56.4 |
|         StdQ1Vals |            8.44 |
|         MaxQ1Vals |            70.4 |
|         MinQ1Vals |            46.3 |
|     AverageQ2Vals |            56.4 |
|         StdQ2Vals |            8.44 |
|         MaxQ2Vals |            70.1 |
|         MinQ2Vals |            46.2 |
|      AverageLogPi |           -3.72 |
|          StdLogPi |           0.856 |
|          MaxLogPi |            5.93 |
|          MinLogPi |           -11.9 |


---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |            -194 |
|          StdEpRet |              44 |
|          MaxEpRet |            -120 |
|          MinEpRet |            -235 |
|  AverageTestEpRet |           -12.4 |
|      StdTestEpRet |            1.24 |
|      MaxTestEpRet |           -9.89 |
|      MinTestEpRet |             -14 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |            56.6 |
|         StdQ1Vals |            8.64 |
|         MaxQ1Vals |            71.1 |
|         MinQ1Vals |            46.4 |
|     AverageQ2Vals |            56.6 |
|         StdQ2Vals |            8.64 |
|         MaxQ2Vals |            70.8 |
|         MinQ2Vals |            46.5 |
|      AverageLogPi |           -3.75 |
|          StdLogPi |           0.826 |
|          MaxLogPi |            4.05 |
|          MinLogPi |           -10.1 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |            -105 |
|          StdEpRet |            56.8 |
|          MaxEpRet |           -59.1 |
|          MinEpRet |            -201 |
|  AverageTestEpRet |           -12.1 |
|      StdTestEpRet |             1.2 |
|      MaxTestEpRet |           -9.95 |
|      MinTestEpRet |           -13.7 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.64e+05 |
|     AverageQ1Vals |              57 |
|         StdQ1Vals |             8.9 |
|         MaxQ1Vals |            71.1 |
|         MinQ1Vals |            46.5 |
|     AverageQ2Vals |              57 |
|         StdQ2Vals |             8.9 |
|         MaxQ2Vals |            71.1 |
|         MinQ2Vals |            46.5 |
|      AverageLogPi |           -3.76 |
|          StdLogPi |           0.804 |
|          MaxLogPi |            3.26 |
|          MinLogPi |           -12.2 |


---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |            -138 |
|          StdEpRet |            94.6 |
|          MaxEpRet |           -40.6 |
|          MinEpRet |            -234 |
|  AverageTestEpRet |             -10 |
|      StdTestEpRet |           0.775 |
|      MaxTestEpRet |           -8.58 |
|      MinTestEpRet |           -10.9 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |            56.9 |
|         StdQ1Vals |             8.9 |
|         MaxQ1Vals |            71.7 |
|         MinQ1Vals |            46.7 |
|     AverageQ2Vals |            56.9 |
|         StdQ2Vals |             8.9 |
|         MaxQ2Vals |            71.2 |
|         MinQ2Vals |            46.5 |
|      AverageLogPi |           -3.77 |
|          StdLogPi |           0.788 |
|          MaxLogPi |            5.39 |
|          MinLogPi |           -10.4 |


In [13]:

args = {'env': 'HalfCheetah-v2', 'hid': 128, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': False,
        'exp_name': 'sac_MDP_HalfCheetah_2L128'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_MDP_HalfCheetah_2L128\sac_MDP_HalfCheetah_2L128_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            128,
            128
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"sac_MDP_HalfCheetah_2L128",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4E0436948>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_MDP_HalfCheetah_2L128",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_MDP_HalfCheetah_2L128\\sac_MDP_HalfCheetah_2L128_s5",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\lingheng\\\\goog



---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -302 |
|          StdEpRet |            25.4 |
|          MaxEpRet |            -265 |
|          MinEpRet |            -335 |
|  AverageTestEpRet |          -0.608 |
|      StdTestEpRet |            2.71 |
|      MaxTestEpRet |            6.02 |
|      MinTestEpRet |           -5.09 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |            4.47 |
|         StdQ1Vals |            2.99 |
|         MaxQ1Vals |              18 |
|         MinQ1Vals |           -3.23 |
|     AverageQ2Vals |            4.47 |
|         StdQ2Vals |            2.98 |
|         MaxQ2Vals |            18.5 |
|         MinQ2Vals |           -3.92 |
|      AverageLogPi |            -3.1 |
|          StdLogPi |            1.47 |
|          MaxLogPi |            7.71 |
|          MinLogPi |           -11.6 |


---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |        3.22e+03 |
|          StdEpRet |             136 |
|          MaxEpRet |        3.43e+03 |
|          MinEpRet |        3.07e+03 |
|  AverageTestEpRet |        3.68e+03 |
|      StdTestEpRet |            76.1 |
|      MaxTestEpRet |        3.81e+03 |
|      MinTestEpRet |        3.56e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |             108 |
|         StdQ1Vals |            39.7 |
|         MaxQ1Vals |             194 |
|         MinQ1Vals |            10.2 |
|     AverageQ2Vals |             108 |
|         StdQ2Vals |            39.7 |
|         MaxQ2Vals |             192 |
|         MinQ2Vals |            12.8 |
|      AverageLogPi |          0.0453 |
|          StdLogPi |            3.12 |
|          MaxLogPi |            24.8 |
|          MinLogPi |           -14.9 |


---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |        4.55e+03 |
|          StdEpRet |            39.3 |
|          MaxEpRet |        4.61e+03 |
|          MinEpRet |        4.51e+03 |
|  AverageTestEpRet |        4.08e+03 |
|      StdTestEpRet |        1.51e+03 |
|      MaxTestEpRet |         5.2e+03 |
|      MinTestEpRet |        1.38e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |             254 |
|         StdQ1Vals |            72.7 |
|         MaxQ1Vals |             351 |
|         MinQ1Vals |           -10.7 |
|     AverageQ2Vals |             254 |
|         StdQ2Vals |            72.7 |
|         MaxQ2Vals |             350 |
|         MinQ2Vals |           -14.4 |
|      AverageLogPi |            1.57 |
|          StdLogPi |            3.71 |
|          MaxLogPi |            22.9 |
|          MinLogPi |           -13.7 |


---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |        5.03e+03 |
|          StdEpRet |            87.8 |
|          MaxEpRet |        5.14e+03 |
|          MinEpRet |         4.9e+03 |
|  AverageTestEpRet |        5.25e+03 |
|      StdTestEpRet |             758 |
|      MaxTestEpRet |        5.63e+03 |
|      MinTestEpRet |        2.99e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |             333 |
|         StdQ1Vals |            77.8 |
|         MaxQ1Vals |             424 |
|         MinQ1Vals |            6.37 |
|     AverageQ2Vals |             333 |
|         StdQ2Vals |            77.8 |
|         MaxQ2Vals |             427 |
|         MinQ2Vals |           -25.4 |
|      AverageLogPi |            2.21 |
|          StdLogPi |            3.94 |
|          MaxLogPi |            42.6 |
|          MinLogPi |             -15 |


---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        5.49e+03 |
|          StdEpRet |            85.2 |
|          MaxEpRet |        5.63e+03 |
|          MinEpRet |         5.4e+03 |
|  AverageTestEpRet |        6.27e+03 |
|      StdTestEpRet |             104 |
|      MaxTestEpRet |        6.44e+03 |
|      MinTestEpRet |        6.11e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |             374 |
|         StdQ1Vals |            87.5 |
|         MaxQ1Vals |             473 |
|         MinQ1Vals |             -67 |
|     AverageQ2Vals |             374 |
|         StdQ2Vals |            87.5 |
|         MaxQ2Vals |             472 |
|         MinQ2Vals |           -77.9 |
|      AverageLogPi |            2.69 |
|          StdLogPi |            4.08 |
|          MaxLogPi |            41.4 |
|          MinLogPi |           -13.2 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |        6.18e+03 |
|          StdEpRet |             138 |
|          MaxEpRet |         6.3e+03 |
|          MinEpRet |        5.95e+03 |
|  AverageTestEpRet |        6.91e+03 |
|      StdTestEpRet |             105 |
|      MaxTestEpRet |        7.04e+03 |
|      MinTestEpRet |        6.71e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.64e+05 |
|     AverageQ1Vals |             408 |
|         StdQ1Vals |            93.5 |
|         MaxQ1Vals |             518 |
|         MinQ1Vals |           -13.5 |
|     AverageQ2Vals |             408 |
|         StdQ2Vals |            93.5 |
|         MaxQ2Vals |             520 |
|         MinQ2Vals |           -22.6 |
|      AverageLogPi |            3.21 |
|          StdLogPi |            4.25 |
|          MaxLogPi |            60.4 |
|          MinLogPi |           -14.2 |


---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |        6.28e+03 |
|          StdEpRet |        1.05e+03 |
|          MaxEpRet |        7.01e+03 |
|          MinEpRet |        4.46e+03 |
|  AverageTestEpRet |        7.31e+03 |
|      StdTestEpRet |            75.8 |
|      MaxTestEpRet |        7.43e+03 |
|      MinTestEpRet |        7.19e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |             452 |
|         StdQ1Vals |            98.2 |
|         MaxQ1Vals |             579 |
|         MinQ1Vals |           -20.5 |
|     AverageQ2Vals |             452 |
|         StdQ2Vals |            98.2 |
|         MaxQ2Vals |             579 |
|         MinQ2Vals |           -30.6 |
|      AverageLogPi |            3.64 |
|          StdLogPi |             4.2 |
|          MaxLogPi |              51 |
|          MinLogPi |             -12 |


In [11]:

args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 128, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': True,
        'exp_name': 'sac_POMDP_HalfCheetahMuJoCoEnv_2L128'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_POMDP_HalfCheetahMuJoCoEnv_2L128\sac_POMDP_HalfCheetahMuJoCoEnv_2L128_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            128,
            128
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"sac_POMDP_HalfCheetahMuJoCoEnv_2L128",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4DE364E88>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_POMDP_HalfCheetahMuJoCoEnv_2L128",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_POMDP_HalfCheetahMuJoCoEnv_2L128\\sac_POMDP_HalfCheetahMuJoCoEnv_2L128_s5",
            "output_file":	



options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -223 |
|          StdEpRet |            51.8 |
|          MaxEpRet |            -159 |
|          MinEpRet |            -303 |
|  AverageTestEpRet |           -33.1 |
|      StdTestEpRet |              31 |
|      MaxTestEpRet |           -2.44 |
|      MinTestEpRet |           -88.8 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |             4.5 |
|         StdQ1Vals |             2.7 |
|         MaxQ1Vals |            21.1 |
|         MinQ1Vals |          -0.415 |
|     AverageQ2Vals |             4.5 |
|         StdQ2Vals |             2.7 |
|         MaxQ2Vals |            20.6 |
|         MinQ2Vals |          -0.162 |
|      AverageLogPi |           -3.39 |
|          StdLogPi |            1.34 |
|          MaxLogPi |            7.92 |
|          MinLogPi |         

---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |           -83.1 |
|          StdEpRet |            33.5 |
|          MaxEpRet |           -47.8 |
|          MinEpRet |            -122 |
|  AverageTestEpRet |            7.74 |
|      StdTestEpRet |             7.1 |
|      MaxTestEpRet |            25.1 |
|      MinTestEpRet |           -3.07 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |            47.4 |
|         StdQ1Vals |            6.02 |
|         MaxQ1Vals |            63.5 |
|         MinQ1Vals |            10.9 |
|     AverageQ2Vals |            47.4 |
|         StdQ2Vals |            6.02 |
|         MaxQ2Vals |            62.4 |
|         MinQ2Vals |            12.1 |
|      AverageLogPi |           -3.75 |
|          StdLogPi |           0.851 |
|          MaxLogPi |            5.43 |
|          MinLogPi |           -11.8 |


---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |            -100 |
|          StdEpRet |            27.1 |
|          MaxEpRet |           -64.5 |
|          MinEpRet |            -132 |
|  AverageTestEpRet |            11.1 |
|      StdTestEpRet |            4.31 |
|      MaxTestEpRet |            18.1 |
|      MinTestEpRet |            2.66 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |            57.5 |
|         StdQ1Vals |            4.39 |
|         MaxQ1Vals |            81.7 |
|         MinQ1Vals |            22.6 |
|     AverageQ2Vals |            57.5 |
|         StdQ2Vals |            4.39 |
|         MaxQ2Vals |              82 |
|         MinQ2Vals |              22 |
|      AverageLogPi |           -3.79 |
|          StdLogPi |           0.873 |
|          MaxLogPi |            6.87 |
|          MinLogPi |             -12 |


---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |           -51.5 |
|          StdEpRet |            22.4 |
|          MaxEpRet |           -20.2 |
|          MinEpRet |           -82.4 |
|  AverageTestEpRet |           -7.13 |
|      StdTestEpRet |            41.7 |
|      MaxTestEpRet |            18.4 |
|      MinTestEpRet |            -132 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |            61.7 |
|         StdQ1Vals |            4.37 |
|         MaxQ1Vals |            90.9 |
|         MinQ1Vals |            26.2 |
|     AverageQ2Vals |            61.7 |
|         StdQ2Vals |            4.37 |
|         MaxQ2Vals |            93.2 |
|         MinQ2Vals |            25.4 |
|      AverageLogPi |           -3.83 |
|          StdLogPi |           0.833 |
|          MaxLogPi |            6.01 |
|          MinLogPi |             -11 |


---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |           -85.3 |
|          StdEpRet |            7.12 |
|          MaxEpRet |           -79.2 |
|          MinEpRet |           -96.6 |
|  AverageTestEpRet |            10.5 |
|      StdTestEpRet |            3.92 |
|      MaxTestEpRet |            15.2 |
|      MinTestEpRet |            1.84 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |            63.1 |
|         StdQ1Vals |            5.75 |
|         MaxQ1Vals |            91.5 |
|         MinQ1Vals |            14.1 |
|     AverageQ2Vals |            63.1 |
|         StdQ2Vals |            5.75 |
|         MaxQ2Vals |            90.3 |
|         MinQ2Vals |            15.4 |
|      AverageLogPi |           -3.84 |
|          StdLogPi |           0.823 |
|          MaxLogPi |            7.46 |
|          MinLogPi |           -10.1 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |            -131 |
|          StdEpRet |             124 |
|          MaxEpRet |             -42 |
|          MinEpRet |            -345 |
|  AverageTestEpRet |           -1.46 |
|      StdTestEpRet |            35.6 |
|      MaxTestEpRet |            17.4 |
|      MinTestEpRet |            -100 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.64e+05 |
|     AverageQ1Vals |            63.8 |
|         StdQ1Vals |             5.6 |
|         MaxQ1Vals |             102 |
|         MinQ1Vals |            4.75 |
|     AverageQ2Vals |            63.8 |
|         StdQ2Vals |             5.6 |
|         MaxQ2Vals |             103 |
|         MinQ2Vals |            7.83 |
|      AverageLogPi |           -3.85 |
|          StdLogPi |           0.811 |
|          MaxLogPi |            7.39 |
|          MinLogPi |           -13.5 |


---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |           -42.1 |
|          StdEpRet |            13.9 |
|          MaxEpRet |             -21 |
|          MinEpRet |           -59.8 |
|  AverageTestEpRet |           -4.84 |
|      StdTestEpRet |            29.5 |
|      MaxTestEpRet |            19.9 |
|      MinTestEpRet |           -60.4 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |            64.6 |
|         StdQ1Vals |            5.72 |
|         MaxQ1Vals |             104 |
|         MinQ1Vals |            4.35 |
|     AverageQ2Vals |            64.6 |
|         StdQ2Vals |            5.72 |
|         MaxQ2Vals |             103 |
|         MinQ2Vals |            4.11 |
|      AverageLogPi |           -3.86 |
|          StdLogPi |           0.809 |
|          MaxLogPi |            8.43 |
|          MinLogPi |           -12.3 |


In [12]:

args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 128, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': False,
        'exp_name': 'sac_POMDP_HalfCheetahMuJoCoEnv_2L128'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_POMDP_HalfCheetahMuJoCoEnv_2L128\sac_POMDP_HalfCheetahMuJoCoEnv_2L128_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            128,
            128
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"sac_POMDP_HalfCheetahMuJoCoEnv_2L128",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4A81462C8>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_POMDP_HalfCheetahMuJoCoEnv_2L128",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_POMDP_HalfCheetahMuJoCoEnv_2L128\\sac_POMDP_HalfCheetahMuJoCoEnv_2L128_s5",
            "output_file":	



options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -227 |
|          StdEpRet |            29.4 |
|          MaxEpRet |            -182 |
|          MinEpRet |            -259 |
|  AverageTestEpRet |            -397 |
|      StdTestEpRet |               6 |
|      MaxTestEpRet |            -387 |
|      MinTestEpRet |            -406 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |            11.8 |
|         StdQ1Vals |             9.4 |
|         MaxQ1Vals |            45.1 |
|         MinQ1Vals |           -4.03 |
|     AverageQ2Vals |            11.8 |
|         StdQ2Vals |             9.4 |
|         MaxQ2Vals |            47.4 |
|         MinQ2Vals |            -4.4 |
|      AverageLogPi |          0.0972 |
|          StdLogPi |            2.82 |
|          MaxLogPi |            13.3 |
|          MinLogPi |         

---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             512 |
|          StdEpRet |            28.5 |
|          MaxEpRet |             553 |
|          MinEpRet |             473 |
|  AverageTestEpRet |             858 |
|      StdTestEpRet |            10.1 |
|      MaxTestEpRet |             880 |
|      MinTestEpRet |             843 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |            70.3 |
|         StdQ1Vals |            6.48 |
|         MaxQ1Vals |            95.8 |
|         MinQ1Vals |            40.8 |
|     AverageQ2Vals |            70.3 |
|         StdQ2Vals |            6.48 |
|         MaxQ2Vals |            95.4 |
|         MinQ2Vals |            42.5 |
|      AverageLogPi |           -2.85 |
|          StdLogPi |            1.45 |
|          MaxLogPi |            6.86 |
|          MinLogPi |           -14.5 |


---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |        1.37e+03 |
|          StdEpRet |            40.2 |
|          MaxEpRet |        1.42e+03 |
|          MinEpRet |        1.33e+03 |
|  AverageTestEpRet |        1.46e+03 |
|      StdTestEpRet |             506 |
|      MaxTestEpRet |        1.66e+03 |
|      MinTestEpRet |           -53.2 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |             111 |
|         StdQ1Vals |            13.1 |
|         MaxQ1Vals |             141 |
|         MinQ1Vals |              43 |
|     AverageQ2Vals |             111 |
|         StdQ2Vals |            13.1 |
|         MaxQ2Vals |             140 |
|         MinQ2Vals |            43.6 |
|      AverageLogPi |           -1.37 |
|          StdLogPi |            2.05 |
|          MaxLogPi |            12.8 |
|          MinLogPi |           -14.3 |


---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |        1.41e+03 |
|          StdEpRet |            13.8 |
|          MaxEpRet |        1.43e+03 |
|          MinEpRet |        1.39e+03 |
|  AverageTestEpRet |        1.74e+03 |
|      StdTestEpRet |            68.6 |
|      MaxTestEpRet |        1.79e+03 |
|      MinTestEpRet |        1.54e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |             133 |
|         StdQ1Vals |            23.8 |
|         MaxQ1Vals |             163 |
|         MinQ1Vals |            5.89 |
|     AverageQ2Vals |             133 |
|         StdQ2Vals |            23.8 |
|         MaxQ2Vals |             163 |
|         MinQ2Vals |            5.66 |
|      AverageLogPi |           -1.15 |
|          StdLogPi |            2.09 |
|          MaxLogPi |            15.1 |
|          MinLogPi |             -14 |


---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        1.54e+03 |
|          StdEpRet |            15.1 |
|          MaxEpRet |        1.56e+03 |
|          MinEpRet |        1.52e+03 |
|  AverageTestEpRet |        1.87e+03 |
|      StdTestEpRet |            10.4 |
|      MaxTestEpRet |        1.89e+03 |
|      MinTestEpRet |        1.85e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |             143 |
|         StdQ1Vals |            29.2 |
|         MaxQ1Vals |             173 |
|         MinQ1Vals |           -4.47 |
|     AverageQ2Vals |             143 |
|         StdQ2Vals |            29.2 |
|         MaxQ2Vals |             172 |
|         MinQ2Vals |           -6.05 |
|      AverageLogPi |           -1.06 |
|          StdLogPi |            2.12 |
|          MaxLogPi |            15.2 |
|          MinLogPi |           -14.1 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |        1.61e+03 |
|          StdEpRet |            10.1 |
|          MaxEpRet |        1.62e+03 |
|          MinEpRet |         1.6e+03 |
|  AverageTestEpRet |        1.69e+03 |
|      StdTestEpRet |             589 |
|      MaxTestEpRet |        1.91e+03 |
|      MinTestEpRet |           -76.2 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.64e+05 |
|     AverageQ1Vals |             152 |
|         StdQ1Vals |            27.9 |
|         MaxQ1Vals |             179 |
|         MinQ1Vals |          -0.816 |
|     AverageQ2Vals |             152 |
|         StdQ2Vals |            27.9 |
|         MaxQ2Vals |             178 |
|         MinQ2Vals |           -3.86 |
|      AverageLogPi |           -0.95 |
|          StdLogPi |            2.12 |
|          MaxLogPi |            12.4 |
|          MinLogPi |           -13.3 |


---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |        1.67e+03 |
|          StdEpRet |            28.1 |
|          MaxEpRet |         1.7e+03 |
|          MinEpRet |        1.64e+03 |
|  AverageTestEpRet |        2.03e+03 |
|      StdTestEpRet |              37 |
|      MaxTestEpRet |        2.06e+03 |
|      MinTestEpRet |        1.93e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |             158 |
|         StdQ1Vals |            27.2 |
|         MaxQ1Vals |             184 |
|         MinQ1Vals |           -6.82 |
|     AverageQ2Vals |             158 |
|         StdQ2Vals |            27.2 |
|         MaxQ2Vals |             184 |
|         MinQ2Vals |           -7.22 |
|      AverageLogPi |          -0.806 |
|          StdLogPi |            2.17 |
|          MaxLogPi |            17.3 |
|          MinLogPi |           -13.8 |


In [14]:

args = {'env': 'AntMuJoCoEnv-v0', 'hid': 128, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': False,
        'exp_name': 'sac_MDP_AntMuJoCoEnv_2L128'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_MDP_AntMuJoCoEnv_2L128\sac_MDP_AntMuJoCoEnv_2L128_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            128,
            128
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"AntMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"sac_MDP_AntMuJoCoEnv_2L128",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4DE34AB88>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_MDP_AntMuJoCoEnv_2L128",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_MDP_AntMuJoCoEnv_2L128\\sac_MDP_AntMuJoCoEnv_2L128_s5",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\lingheng\



options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |             571 |
|          StdEpRet |            64.1 |
|          MaxEpRet |             638 |
|          MinEpRet |             465 |
|  AverageTestEpRet |             655 |
|      StdTestEpRet |            61.8 |
|      MaxTestEpRet |             718 |
|      MinTestEpRet |             525 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |            11.7 |
|         StdQ1Vals |            6.33 |
|         MaxQ1Vals |            29.9 |
|         MinQ1Vals |           0.088 |
|     AverageQ2Vals |            11.7 |
|         StdQ2Vals |            6.33 |
|         MaxQ2Vals |            29.6 |
|         MinQ2Vals |         -0.0494 |
|      AverageLogPi |           -5.03 |
|          StdLogPi |             1.2 |
|          MaxLogPi |            10.3 |
|          MinLogPi |         

---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             694 |
|          StdEpRet |            43.4 |
|          MaxEpRet |             761 |
|          MinEpRet |             645 |
|  AverageTestEpRet |             639 |
|      StdTestEpRet |            95.4 |
|      MaxTestEpRet |             722 |
|      MinTestEpRet |             449 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |             140 |
|         StdQ1Vals |            7.97 |
|         MaxQ1Vals |             163 |
|         MinQ1Vals |             105 |
|     AverageQ2Vals |             140 |
|         StdQ2Vals |            7.97 |
|         MaxQ2Vals |             163 |
|         MinQ2Vals |             105 |
|      AverageLogPi |           -5.22 |
|          StdLogPi |           0.812 |
|          MaxLogPi |            3.49 |
|          MinLogPi |           -12.9 |


---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             881 |
|          StdEpRet |             127 |
|          MaxEpRet |             994 |
|          MinEpRet |             668 |
|  AverageTestEpRet |             800 |
|      StdTestEpRet |            99.2 |
|      MaxTestEpRet |             939 |
|      MinTestEpRet |             629 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |             165 |
|         StdQ1Vals |            8.87 |
|         MaxQ1Vals |             188 |
|         MinQ1Vals |             126 |
|     AverageQ2Vals |             165 |
|         StdQ2Vals |            8.87 |
|         MaxQ2Vals |             189 |
|         MinQ2Vals |             126 |
|      AverageLogPi |           -5.09 |
|          StdLogPi |           0.958 |
|          MaxLogPi |            4.24 |
|          MinLogPi |           -12.2 |


---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |        1.04e+03 |
|          StdEpRet |            75.4 |
|          MaxEpRet |         1.1e+03 |
|          MinEpRet |             914 |
|  AverageTestEpRet |             980 |
|      StdTestEpRet |             113 |
|      MaxTestEpRet |        1.16e+03 |
|      MinTestEpRet |             763 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |             175 |
|         StdQ1Vals |            11.3 |
|         MaxQ1Vals |             202 |
|         MinQ1Vals |             130 |
|     AverageQ2Vals |             175 |
|         StdQ2Vals |            11.3 |
|         MaxQ2Vals |             200 |
|         MinQ2Vals |             130 |
|      AverageLogPi |           -4.94 |
|          StdLogPi |             1.1 |
|          MaxLogPi |            5.71 |
|          MinLogPi |           -12.4 |


---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        1.19e+03 |
|          StdEpRet |            21.1 |
|          MaxEpRet |        1.21e+03 |
|          MinEpRet |        1.16e+03 |
|  AverageTestEpRet |        1.13e+03 |
|      StdTestEpRet |             113 |
|      MaxTestEpRet |        1.26e+03 |
|      MinTestEpRet |             915 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |             182 |
|         StdQ1Vals |            13.4 |
|         MaxQ1Vals |             205 |
|         MinQ1Vals |             132 |
|     AverageQ2Vals |             182 |
|         StdQ2Vals |            13.4 |
|         MaxQ2Vals |             205 |
|         MinQ2Vals |             132 |
|      AverageLogPi |           -4.73 |
|          StdLogPi |            1.28 |
|          MaxLogPi |            4.37 |
|          MinLogPi |           -12.4 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |         1.2e+03 |
|          StdEpRet |            60.8 |
|          MaxEpRet |        1.24e+03 |
|          MinEpRet |         1.1e+03 |
|  AverageTestEpRet |        1.15e+03 |
|      StdTestEpRet |             152 |
|      MaxTestEpRet |        1.47e+03 |
|      MinTestEpRet |             939 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.64e+05 |
|     AverageQ1Vals |             187 |
|         StdQ1Vals |            14.4 |
|         MaxQ1Vals |             212 |
|         MinQ1Vals |             133 |
|     AverageQ2Vals |             187 |
|         StdQ2Vals |            14.4 |
|         MaxQ2Vals |             212 |
|         MinQ2Vals |             132 |
|      AverageLogPi |           -4.52 |
|          StdLogPi |            1.41 |
|          MaxLogPi |            6.32 |
|          MinLogPi |           -12.5 |


---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |        1.34e+03 |
|          StdEpRet |            19.5 |
|          MaxEpRet |        1.36e+03 |
|          MinEpRet |        1.32e+03 |
|  AverageTestEpRet |        1.49e+03 |
|      StdTestEpRet |            61.9 |
|      MaxTestEpRet |        1.55e+03 |
|      MinTestEpRet |        1.32e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |             192 |
|         StdQ1Vals |            14.9 |
|         MaxQ1Vals |             216 |
|         MinQ1Vals |             138 |
|     AverageQ2Vals |             192 |
|         StdQ2Vals |            14.9 |
|         MaxQ2Vals |             216 |
|         MinQ2Vals |             139 |
|      AverageLogPi |           -4.36 |
|          StdLogPi |            1.51 |
|          MaxLogPi |            6.62 |
|          MinLogPi |           -13.4 |


In [15]:

args = {'env': 'AntMuJoCoEnv-v0', 'hid': 128, 'l': 2, 
        'gamma': 0.99, 'seed':5, 'epochs':50, 
        'partially_observable': True,
        'exp_name': 'sac_POMDP_AntMuJoCoEnv_2L128'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

torch.set_num_threads(torch.get_num_threads())

sac(env_name=args['env'], actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
    gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    partially_observable=args['partially_observable'],
    logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\sac_POMDP_AntMuJoCoEnv_2L128\sac_POMDP_AntMuJoCoEnv_2L128_s5\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            128,
            128
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_name":	"AntMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"sac_POMDP_AntMuJoCoEnv_2L128",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001D4A08EA7C8>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_POMDP_AntMuJoCoEnv_2L128",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\sac_POMDP_AntMuJoCoEnv_2L128\\sac_POMDP_AntMuJoCoEnv_2L128_s5",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\



options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |             589 |
|          StdEpRet |            17.7 |
|          MaxEpRet |             607 |
|          MinEpRet |             568 |
|  AverageTestEpRet |             600 |
|      StdTestEpRet |          0.0893 |
|      MaxTestEpRet |             600 |
|      MinTestEpRet |             600 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |            11.7 |
|         StdQ1Vals |            6.25 |
|         MaxQ1Vals |              26 |
|         MinQ1Vals |          0.0347 |
|     AverageQ2Vals |            11.7 |
|         StdQ2Vals |            6.24 |
|         MaxQ2Vals |              26 |
|         MinQ2Vals |         -0.0249 |
|      AverageLogPi |           -5.45 |
|          StdLogPi |             0.4 |
|          MaxLogPi |           -2.87 |
|          MinLogPi |         

---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             684 |
|          StdEpRet |            72.5 |
|          MaxEpRet |             793 |
|          MinEpRet |             601 |
|  AverageTestEpRet |             621 |
|      StdTestEpRet |              53 |
|      MaxTestEpRet |             739 |
|      MinTestEpRet |             568 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |             134 |
|         StdQ1Vals |             6.4 |
|         MaxQ1Vals |             160 |
|         MinQ1Vals |             108 |
|     AverageQ2Vals |             134 |
|         StdQ2Vals |             6.4 |
|         MaxQ2Vals |             160 |
|         MinQ2Vals |             108 |
|      AverageLogPi |           -5.38 |
|          StdLogPi |           0.552 |
|          MaxLogPi |           0.263 |
|          MinLogPi |           -11.2 |


---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             634 |
|          StdEpRet |            75.4 |
|          MaxEpRet |             700 |
|          MinEpRet |             511 |
|  AverageTestEpRet |             676 |
|      StdTestEpRet |            99.8 |
|      MaxTestEpRet |             908 |
|      MinTestEpRet |             602 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |             160 |
|         StdQ1Vals |            7.41 |
|         MaxQ1Vals |             187 |
|         MinQ1Vals |             128 |
|     AverageQ2Vals |             160 |
|         StdQ2Vals |            7.41 |
|         MaxQ2Vals |             187 |
|         MinQ2Vals |             128 |
|      AverageLogPi |           -5.37 |
|          StdLogPi |            0.56 |
|          MaxLogPi |            2.38 |
|          MinLogPi |           -11.2 |


---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |             629 |
|          StdEpRet |            42.8 |
|          MaxEpRet |             698 |
|          MinEpRet |             582 |
|  AverageTestEpRet |             688 |
|      StdTestEpRet |            56.5 |
|      MaxTestEpRet |             792 |
|      MinTestEpRet |             611 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |             165 |
|         StdQ1Vals |            7.99 |
|         MaxQ1Vals |             192 |
|         MinQ1Vals |             131 |
|     AverageQ2Vals |             165 |
|         StdQ2Vals |            7.99 |
|         MaxQ2Vals |             192 |
|         MinQ2Vals |             131 |
|      AverageLogPi |           -5.38 |
|          StdLogPi |            0.55 |
|          MaxLogPi |           -1.19 |
|          MinLogPi |           -12.6 |


---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |             658 |
|          StdEpRet |            36.6 |
|          MaxEpRet |             717 |
|          MinEpRet |             616 |
|  AverageTestEpRet |             784 |
|      StdTestEpRet |            75.7 |
|      MaxTestEpRet |             981 |
|      MinTestEpRet |             715 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |             166 |
|         StdQ1Vals |            7.94 |
|         MaxQ1Vals |             193 |
|         MinQ1Vals |             132 |
|     AverageQ2Vals |             166 |
|         StdQ2Vals |            7.94 |
|         MaxQ2Vals |             193 |
|         MinQ2Vals |             132 |
|      AverageLogPi |           -5.37 |
|          StdLogPi |           0.556 |
|          MaxLogPi |           -1.65 |
|          MinLogPi |           -11.1 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |             673 |
|          StdEpRet |            20.8 |
|          MaxEpRet |             695 |
|          MinEpRet |             639 |
|  AverageTestEpRet |             610 |
|      StdTestEpRet |              47 |
|      MaxTestEpRet |             686 |
|      MinTestEpRet |             499 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.64e+05 |
|     AverageQ1Vals |             166 |
|         StdQ1Vals |             8.3 |
|         MaxQ1Vals |             193 |
|         MinQ1Vals |             133 |
|     AverageQ2Vals |             166 |
|         StdQ2Vals |             8.3 |
|         MaxQ2Vals |             193 |
|         MinQ2Vals |             133 |
|      AverageLogPi |           -5.35 |
|          StdLogPi |           0.588 |
|          MaxLogPi |          -0.339 |
|          MinLogPi |           -11.8 |


---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |             650 |
|          StdEpRet |            12.2 |
|          MaxEpRet |             667 |
|          MinEpRet |             634 |
|  AverageTestEpRet |             729 |
|      StdTestEpRet |            72.8 |
|      MaxTestEpRet |             853 |
|      MinTestEpRet |             617 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |             167 |
|         StdQ1Vals |            8.34 |
|         MaxQ1Vals |             193 |
|         MinQ1Vals |             136 |
|     AverageQ2Vals |             167 |
|         StdQ2Vals |            8.34 |
|         MaxQ2Vals |             193 |
|         MinQ2Vals |             136 |
|      AverageLogPi |           -5.36 |
|          StdLogPi |           0.574 |
|          MaxLogPi |          -0.673 |
|          MinLogPi |           -11.9 |
